diff --git a/.github/workflows/AT2.yml b/.github/workflows/AT2.yml index c085620db33a..c72e311ad456 100644 --- a/.github/workflows/AT2.yml +++ b/.github/workflows/AT2.yml @@ -92,7 +92,7 @@ jobs: printf "\n\n\n" - echo "image: ${AT2_IMAGE:-unknown}" + echo "image: ${AT2_IMAGE_FULLPATH:-${AT2_IMAGE:-unknown}}" python3 ${GITHUB_WORKSPACE}/packages/framework/pr_tools/PullRequestLinuxDriverTest.py \ --target-branch-name ${{ github.event.pull_request.base.ref }} \ @@ -103,18 +103,20 @@ jobs: --workspace-dir /home/runner/_work/Trilinos \ --source-dir ${GITHUB_WORKSPACE} \ --build-dir /home/Trilinos/build \ - --dashboard-build-name `cat /etc/hostname` \ + --dashboard-build-name=PR-${{ github.event.pull_request.number }}_${AT2_IMAGE}_debug_shared \ --ctest-driver /home/runner/_work/Trilinos/Trilinos/cmake/SimpleTesting/cmake/ctest-driver.cmake \ --ctest-drop-site sems-cdash-son.sandia.gov/cdash \ --filename-subprojects ./package_subproject_list.cmake \ - --filename-packageenables ./packageEnables.cmake + --filename-packageenables ./packageEnables.cmake \ + --max-cores-allowed=29 \ + --num-concurrent-tests=16 - name: Summary if: ${{ !cancelled() }} shell: bash -l {0} working-directory: /home/Trilinos/build run: | echo "## Image" >> $GITHUB_STEP_SUMMARY - echo "image: ${AT2_IMAGE:-unknown}" >> $GITHUB_STEP_SUMMARY + echo "image: ${AT2_IMAGE_FULLPATH:-${AT2_IMAGE:-unknown}}" >> $GITHUB_STEP_SUMMARY echo "## CDash Links" >> $GITHUB_STEP_SUMMARY echo "### Current Build" >> $GITHUB_STEP_SUMMARY AT2_URL=$(> $GITHUB_STEP_SUMMARY - echo "image: ${AT2_IMAGE:-unknown}" >> $GITHUB_STEP_SUMMARY + echo "image: ${AT2_IMAGE_FULLPATH:-${AT2_IMAGE:-unknown}}" >> $GITHUB_STEP_SUMMARY echo "## CDash Links" >> $GITHUB_STEP_SUMMARY echo "### Current Build" >> $GITHUB_STEP_SUMMARY AT2_URL=$(> $GITHUB_STEP_SUMMARY echo "https://gitlab-ex.sandia.gov/trilinos-project/trilinos-containers/-/wikis/Containers-at-Sandia" >> $GITHUB_STEP_SUMMARY - cuda11-uvm-EXPERIMENTAL: + cuda11-EXPERIMENTAL: needs: pre-checks runs-on: [self-hosted, cuda-11.4.2_gcc-10.3.0_openmpi-4.1.6] if: ${{ needs.pre-checks.outputs.should_skip != 'true' && (github.event.action == 'synchronize' || github.event.action == 'opened' || github.event.review.state == 'APPROVED') }} @@ -273,31 +277,32 @@ jobs: printf "\n\n\n" - echo "image: ${AT2_IMAGE:-unknown}" + echo "image: ${AT2_IMAGE_FULLPATH:-${AT2_IMAGE:-unknown}}" type python python3 ${GITHUB_WORKSPACE}/packages/framework/pr_tools/PullRequestLinuxDriverTest.py \ --target-branch-name ${{ github.event.pull_request.base.ref }} \ - --genconfig-build-name rhel8_cuda-gcc-openmpi_release_static_Ampere80_no-asan_complex_no-fpic_mpi_pt_no-rdc_uvm_deprecated-on_no-package-enables \ + --genconfig-build-name rhel8_cuda-11-gcc-openmpi_release_static_Ampere80_no-asan_complex_no-fpic_mpi_pt_no-rdc_no-uvm_deprecated-on_no-package-enables \ --pullrequest-number ${{ github.event.pull_request.number }} \ --pullrequest-env-config-file ${GITHUB_WORKSPACE}/packages/framework/pr_tools/trilinos_pr.ini \ --pullrequest-gen-config-file ${GITHUB_WORKSPACE}/packages/framework/GenConfig/src/gen-config.ini \ --workspace-dir /home/runner/_work/Trilinos \ --source-dir ${GITHUB_WORKSPACE} \ --build-dir /home/Trilinos/build \ - --dashboard-build-name `cat /etc/hostname` \ + --dashboard-build-name=PR-${{ github.event.pull_request.number }}_${AT2_IMAGE}_release_static_uvm \ --ctest-driver /home/runner/_work/Trilinos/Trilinos/cmake/SimpleTesting/cmake/ctest-driver.cmake \ --ctest-drop-site sems-cdash-son.sandia.gov/cdash \ --filename-subprojects ./package_subproject_list.cmake \ --filename-packageenables ./packageEnables.cmake \ - --max-cores-allowed=96 \ - --num-concurrent-tests=96 + --max-cores-allowed=56 \ + --num-concurrent-tests=112 \ + --slots-per-gpu=8 - name: Summary if: ${{ !cancelled() }} shell: bash -l {0} working-directory: /home/Trilinos/build run: | echo "## Image" >> $GITHUB_STEP_SUMMARY - echo "image: ${AT2_IMAGE:-unknown}" >> $GITHUB_STEP_SUMMARY + echo "image: ${AT2_IMAGE_FULLPATH:-${AT2_IMAGE:-unknown}}" >> $GITHUB_STEP_SUMMARY echo "## CDash Links" >> $GITHUB_STEP_SUMMARY echo "### Current Build" >> $GITHUB_STEP_SUMMARY AT2_URL=$(> $GITHUB_STEP_SUMMARY - echo "image: ${AT2_IMAGE:-unknown}" >> $GITHUB_STEP_SUMMARY + echo "image: ${AT2_IMAGE_FULLPATH:-${AT2_IMAGE:-unknown}}" >> $GITHUB_STEP_SUMMARY echo "## CDash Links" >> $GITHUB_STEP_SUMMARY echo "### Current Build" >> $GITHUB_STEP_SUMMARY AT2_URL=$(> $GITHUB_STEP_SUMMARY echo "https://github.com/trilinos/Trilinos/wiki/Containers" >> $GITHUB_STEP_SUMMARY echo "https://gitlab-ex.sandia.gov/trilinos-project/trilinos-containers/-/wikis/Containers-at-Sandia" >> $GITHUB_STEP_SUMMARY - diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index f280dd632cef..19c03bf2714f 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -45,7 +45,7 @@ jobs: uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Initialize CodeQL - uses: github/codeql-action/init@662472033e021d55d94146f66f6058822b0b39fd # v3.27.0 + uses: github/codeql-action/init@f09c1c0a94de965c15400f5634aa42fac8fb8f88 # v3.27.5 with: languages: ${{ matrix.language }} build-mode: ${{ matrix.build-mode }} @@ -108,6 +108,6 @@ jobs: ninja -j 16 - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@662472033e021d55d94146f66f6058822b0b39fd # v3.27.0 + uses: github/codeql-action/analyze@f09c1c0a94de965c15400f5634aa42fac8fb8f88 # v3.27.5 with: category: "/language:${{matrix.language}}" diff --git a/.github/workflows/dependency-review.yml b/.github/workflows/dependency-review.yml index 955b3b3fb2d0..21a469b132cb 100644 --- a/.github/workflows/dependency-review.yml +++ b/.github/workflows/dependency-review.yml @@ -17,11 +17,11 @@ jobs: runs-on: ubuntu-latest steps: - name: Harden Runner - uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1 + uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2 with: egress-policy: audit - name: 'Checkout Repository' uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: 'Dependency Review' - uses: actions/dependency-review-action@4081bf99e2866ebe428fc0477b69eb4fcda7220a # v4.4.0 + uses: actions/dependency-review-action@3b139cfc5fae8b618d3eae3675e383bb1769c019 # v4.5.0 diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml index 1ac917d3af8a..1cbaf2b3c6e4 100644 --- a/.github/workflows/scorecards.yml +++ b/.github/workflows/scorecards.yml @@ -66,6 +66,6 @@ jobs: # Upload the results to GitHub's code scanning dashboard. - name: "Upload to code-scanning" - uses: github/codeql-action/upload-sarif@662472033e021d55d94146f66f6058822b0b39fd # v3.27.0 + uses: github/codeql-action/upload-sarif@f09c1c0a94de965c15400f5634aa42fac8fb8f88 # v3.27.5 with: sarif_file: results.sarif diff --git a/.gitignore b/.gitignore index 66ded49f02d6..ddfa4d79f29f 100644 --- a/.gitignore +++ b/.gitignore @@ -163,3 +163,7 @@ rsync.* # pycharm ide .idea + +# clangd server stuff +.cache/ +compile_commands.json diff --git a/cmake/ProjectCompilerPostConfig.cmake b/cmake/ProjectCompilerPostConfig.cmake index b365047eeef4..2f47f7104de1 100644 --- a/cmake/ProjectCompilerPostConfig.cmake +++ b/cmake/ProjectCompilerPostConfig.cmake @@ -43,7 +43,7 @@ IF (KokkosEnable) ENDIF() set(upcoming_warnings shadow ${Trilinos_ADDITIONAL_WARNINGS}) -set(promoted_warnings parentheses sign-compare unused-variable) +set(promoted_warnings parentheses sign-compare unused-variable reorder) if("${Trilinos_WARNINGS_MODE}" STREQUAL "WARN") enable_warnings("${upcoming_warnings}") diff --git a/cmake/TPLs/FindTPLCUSOLVER.cmake b/cmake/TPLs/FindTPLCUSOLVER.cmake index ee90edaf352d..ea7d3ee02f02 100644 --- a/cmake/TPLs/FindTPLCUSOLVER.cmake +++ b/cmake/TPLs/FindTPLCUSOLVER.cmake @@ -1,17 +1,6 @@ -IF (NOT TPL_ENABLE_CUDA) - MESSAGE(FATAL_ERROR "\nCUSOLVER: This TPL requires CUDA") -ELSE() - find_library(CUDA_cusolver_LIBRARY - cusolver - HINTS ${CUDA_TOOLKIT_ROOT_DIR}/lib - ) - IF(CUDA_cusolver_LIBRARY STREQUAL "CUDA_cusolver_LIBRARY-NOTFOUND") - MESSAGE(FATAL_ERROR "\nCUSOLVER: could not find cusolver library.") - ENDIF() - SET(TPL_CUSOLVER_LIBRARIES ${CUDA_cusolver_LIBRARY}) -ENDIF() - -tribits_tpl_find_include_dirs_and_libraries(CUSOLVER REQUIRED_LIBS_NAMES cusparse) - -unset(TPL_CUSOLVER_LIBRARIES) +tribits_extpkg_create_imported_all_libs_target_and_config_file( CUSOLVER + INNER_FIND_PACKAGE_NAME CUDAToolkit + IMPORTED_TARGETS_FOR_ALL_LIBS CUDA::cusolver ) +# Above, the CUDA TPL should have already found CUDAToolkit so we just need to +# grab the target from it to form the CUSPARSE::all_libs target. diff --git a/cmake/TPLs/FindTPLCUSOLVERDependencies.cmake b/cmake/TPLs/FindTPLCUSOLVERDependencies.cmake new file mode 100644 index 000000000000..6a89bc45fa2c --- /dev/null +++ b/cmake/TPLs/FindTPLCUSOLVERDependencies.cmake @@ -0,0 +1,2 @@ +tribits_extpkg_define_dependencies( CUSOLVER + DEPENDENCIES CUDA) diff --git a/packages/amesos2/src/Amesos2_EpetraCrsMatrix_MatrixAdapter_def.hpp b/packages/amesos2/src/Amesos2_EpetraCrsMatrix_MatrixAdapter_def.hpp index d29a00991eb3..eb197a4222a6 100644 --- a/packages/amesos2/src/Amesos2_EpetraCrsMatrix_MatrixAdapter_def.hpp +++ b/packages/amesos2/src/Amesos2_EpetraCrsMatrix_MatrixAdapter_def.hpp @@ -36,7 +36,8 @@ namespace Amesos2 { o_map = rcpFromRef(this->mat_->RowMap()); t_map = Util::tpetra_map_to_epetra_map(*map); - RCP t_mat = rcp(new Epetra_CrsMatrix(Copy, *t_map, this->getMaxRowNNZ())); + int maxRowNNZ = 0; // this->getMaxRowNNZ(); + RCP t_mat = rcp(new Epetra_CrsMatrix(Copy, *t_map, maxRowNNZ)); Epetra_Import importer(*t_map, *o_map); t_mat->Import(*(this->mat_), importer, Insert); diff --git a/packages/amesos2/src/Amesos2_ShyLUBasker_def.hpp b/packages/amesos2/src/Amesos2_ShyLUBasker_def.hpp index a4a5718e7498..6a7195f6b412 100644 --- a/packages/amesos2/src/Amesos2_ShyLUBasker_def.hpp +++ b/packages/amesos2/src/Amesos2_ShyLUBasker_def.hpp @@ -557,7 +557,7 @@ bool ShyLUBasker::loadA_impl(EPhase current_phase) { using Teuchos::as; - if(current_phase == SOLVE) return (false); + if(current_phase == SOLVE || current_phase == PREORDERING ) return( false ); #ifdef HAVE_AMESOS2_TIMERS Teuchos::TimeMonitor convTimer(this->timers_.mtxConvTime_); @@ -573,7 +573,8 @@ ShyLUBasker::loadA_impl(EPhase current_phase) { // Only the root image needs storage allocated - if( this->root_ ){ + if( this->root_ && current_phase == SYMBFACT ) + { Kokkos::resize(nzvals_view_, this->globalNumNonZeros_); Kokkos::resize(rowind_view_, this->globalNumNonZeros_); Kokkos::resize(colptr_view_, this->globalNumCols_ + 1); //this will be wrong for case of gapped col ids, e.g. 0,2,4,9; num_cols = 10 ([0,10)) but num GIDs = 4... diff --git a/packages/epetraext/src/transform/EpetraExt_AMD_CrsGraph.cpp b/packages/epetraext/src/transform/EpetraExt_AMD_CrsGraph.cpp index f6fe50193bfa..fd693aab1d29 100644 --- a/packages/epetraext/src/transform/EpetraExt_AMD_CrsGraph.cpp +++ b/packages/epetraext/src/transform/EpetraExt_AMD_CrsGraph.cpp @@ -123,7 +123,7 @@ operator()( OriginalTypeRef orig ) std::vector perm(n); std::vector info(AMD_INFO); - amd_order( n, &iat[0], &jat[0], &perm[0], NULL, &info[0] ); + amd_order( n, iat.data(), jat.data(), perm.data(), NULL, &info[0] ); if( info[AMD_STATUS] == AMD_INVALID ) std::cout << "AMD ORDERING: Invalid!!!!\n"; @@ -150,13 +150,30 @@ operator()( OriginalTypeRef orig ) //Generate New Domain and Range Maps //for now, assume they start out as identical const Epetra_BlockMap & OldMap = orig.RowMap(); - int nG = orig.NumGlobalRows(); - std::vector newElements( n ); - for( int i = 0; i < n; ++i ) - newElements[i] = OldMap.GID( perm[i] ); +#ifndef EPETRA_NO_32BIT_GLOBAL_INDICES + if(OldMap.GlobalIndicesInt()) { + int nG = orig.NumGlobalRows(); + std::vector newElements( n ); + for( int i = 0; i < n; ++i ) + newElements[i] = OldMap.GID( perm[i] ); - NewMap_ = new Epetra_Map( nG, n, &newElements[0], OldMap.IndexBase(), OldMap.Comm() ); + NewMap_ = new Epetra_Map( nG, n, newElements.data(), OldMap.IndexBase(), OldMap.Comm() ); + } + else +#endif +#ifndef EPETRA_NO_64BIT_GLOBAL_INDICES + if(OldMap.GlobalIndicesLongLong()) { + long long nG = orig.NumGlobalRows64(); + std::vector newElements( nG ); + for( int i = 0; i < n; ++i ) + newElements[i] = OldMap.GID64( perm[i] ); + + NewMap_ = new Epetra_Map( nG, n, newElements.data(), OldMap.IndexBase64(), OldMap.Comm() ); + } + else +#endif + throw "CrsGraph_AMD::operator(): GlobalIndices type unknown for OldMap"; if( verbose_ ) { diff --git a/packages/framework/ini-files/config-specs.ini b/packages/framework/ini-files/config-specs.ini index de052bca3530..cf190a554ffb 100644 --- a/packages/framework/ini-files/config-specs.ini +++ b/packages/framework/ini-files/config-specs.ini @@ -1069,6 +1069,8 @@ opt-set-cmake-var Kokkos_CoreUnitTest_CudaInterOpInit_MPI_1_SET_RUN_SERIAL BOOL [ATS2-TEST-DISABLES] opt-set-cmake-var Adelus_vector_random_MPI_1_DISABLE BOOL : ON opt-set-cmake-var Kokkos_CoreUnitTest_CudaTimingBased_MPI_1_DISABLE BOOL : ON +opt-set-cmake-var Intrepid2_unit-test_Discretization_Basis_HGRAD_PYR_C1_FEM_test_02_CUDA_DFAD_DFAD_MPI_1_DISABLE BOOL : ON +opt-set-cmake-var Intrepid2_unit-test_Discretization_Basis_HGRAD_TRI_Cn_FEM_test_02_CUDA_DOUBLE_DOUBLE_MPI_1_DISABLE BOOL : ON opt-set-cmake-var MueLu_ParameterListInterpreterTpetraHeavy_MPI_1_DISABLE BOOL : ON opt-set-cmake-var MueLu_ParameterListInterpreterTpetra_MPI_1_DISABLE BOOL : ON opt-set-cmake-var MueLu_UnitTestsIntrepid2Tpetra_MPI_4_DISABLE BOOL : ON @@ -1187,7 +1189,7 @@ opt-set-cmake-var Trilinos_WARNINGS_MODE STRING : WARN [COMPILER|INTEL] opt-set-cmake-var MPI_EXEC FILEPATH : mpirun -[SEMS_COMMON_CUDA_11] +[SEMS_COMMON_CUDA] # TPL ENABLE/DISABLE settings opt-set-cmake-var TPL_ENABLE_BLAS BOOL FORCE : ON opt-set-cmake-var TPL_ENABLE_BinUtils BOOL FORCE : OFF @@ -1441,7 +1443,7 @@ use USE-UVM|NO use USE-DEPRECATED|YES use PACKAGE-ENABLES|NO-PACKAGE-ENABLES use COMMON_SPACK_TPLS -use SEMS_COMMON_CUDA_11 +use SEMS_COMMON_CUDA # TPL ENABLE/DISABLE settings opt-set-cmake-var TPL_ENABLE_BLAS BOOL FORCE : ON @@ -1518,6 +1520,11 @@ opt-set-cmake-var ROL_test_elementwise_TpetraMultiVector_MPI_4_DISABLE BOOL : ON opt-set-cmake-var STKUnit_tests_stk_mesh_unit_tests_MPI_4_DISABLE BOOL : ON # This was failing fairly reliably and Nate said it should be okay to disable (https://github.com/trilinos/Trilinos/issues/11678) opt-set-cmake-var Kokkos_CoreUnitTest_CudaTimingBased_MPI_1_DISABLE BOOL : ON +# nvcc compiler errors unless compiled with -j1 +opt-set-cmake-var Intrepid2_unit-test_Discretization_Basis_HGRAD_PYR_C1_FEM_test_02_CUDA_DFAD_DFAD_MPI_1_DISABLE BOOL : ON +# temporary disable for reevaluation +opt-set-cmake-var Intrepid2_unit-test_Discretization_Basis_HGRAD_TRI_Cn_FEM_test_02_CUDA_DOUBLE_DOUBLE_MPI_1_DISABLE BOOL : ON + # Unstable opt-set-cmake-var Adelus_vector_random_npr3_rhs1_MPI_3_DISABLE BOOL : ON @@ -1553,7 +1560,7 @@ use USE-DEPRECATED|YES use PACKAGE-ENABLES|NO-PACKAGE-ENABLES use PACKAGE-ENABLES|NO-EPETRA use COMMON_SPACK_TPLS -use SEMS_COMMON_CUDA_11 +use SEMS_COMMON_CUDA use CUDA11-RUN-SERIAL-TESTS @@ -1583,7 +1590,7 @@ use USE-DEPRECATED|YES use PACKAGE-ENABLES|NO-PACKAGE-ENABLES use COMMON_SPACK_TPLS -use SEMS_COMMON_CUDA_11 +use SEMS_COMMON_CUDA # TPL ENABLE/DISABLE settings opt-set-cmake-var TPL_ENABLE_BLAS BOOL FORCE : ON @@ -1660,6 +1667,10 @@ opt-set-cmake-var ROL_test_elementwise_TpetraMultiVector_MPI_4_DISABLE BOOL : ON opt-set-cmake-var STKUnit_tests_stk_mesh_unit_tests_MPI_4_DISABLE BOOL : ON # This was failing fairly reliably and Nate said it should be okay to disable (https://github.com/trilinos/Trilinos/issues/11678) opt-set-cmake-var Kokkos_CoreUnitTest_CudaTimingBased_MPI_1_DISABLE BOOL : ON +# nvcc compiler errors unless compiled with -j1 +opt-set-cmake-var Intrepid2_unit-test_Discretization_Basis_HGRAD_PYR_C1_FEM_test_02_CUDA_DFAD_DFAD_MPI_1_DISABLE BOOL : ON +# temporary disable for reevaluation +opt-set-cmake-var Intrepid2_unit-test_Discretization_Basis_HGRAD_TRI_Cn_FEM_test_02_CUDA_DOUBLE_DOUBLE_MPI_1_DISABLE BOOL : ON # Unstable opt-set-cmake-var Adelus_vector_random_npr3_rhs1_MPI_3_DISABLE BOOL : ON @@ -2101,7 +2112,16 @@ use USE-RDC|NO use USE-UVM|NO use USE-DEPRECATED|YES use PACKAGE-ENABLES|NO-EPETRA -use SEMS_COMMON_CUDA_11 +use SEMS_COMMON_CUDA +opt-set-cmake-var Trilinos_ENABLE_TESTS BOOL : ON +opt-set-cmake-var TPL_ENABLE_X11 BOOL : OFF +opt-set-cmake-var MPI_EXEC_PRE_NUMPROCS_FLAGS STRING FORCE : --bind-to;none --mca btl ^smcuda +opt-set-cmake-var Kokkos_ENABLE_IMPL_CUDA_MALLOC_ASYNC BOOL : OFF + +[rhel8_cuda-11-gcc-openmpi_release_static_Ampere80_no-asan_complex_no-fpic_mpi_pt_no-rdc_no-uvm_deprecated-on_no-package-enables] +use rhel8_cuda-gcc-openmpi_release_static_Ampere80_no-asan_complex_no-fpic_mpi_pt_no-rdc_no-uvm_deprecated-on_no-package-enables +use CUDA11-RUN-SERIAL-TESTS +opt-set-cmake-var ROL_test_elementwise_TpetraMultiVector_MPI_4_DISABLE BOOL : ON [rhel8_cuda-gcc-openmpi_release_static_Ampere80_no-asan_complex_no-fpic_mpi_pt_no-rdc_uvm_deprecated-on_no-package-enables] use NODE-TYPE|CUDA @@ -2118,7 +2138,8 @@ use USE-RDC|NO use USE-UVM|YES use USE-DEPRECATED|YES use PACKAGE-ENABLES|NO-EPETRA -use SEMS_COMMON_CUDA_11 +use SEMS_COMMON_CUDA +use CUDA11-RUN-SERIAL-TESTS opt-set-cmake-var Trilinos_ENABLE_TESTS BOOL FORCE : OFF opt-set-cmake-var Kokkos_ENABLE_TESTS BOOL FORCE : ON diff --git a/packages/framework/ini-files/environment-specs.ini b/packages/framework/ini-files/environment-specs.ini index f75cda728307..30bf8f1c42e6 100644 --- a/packages/framework/ini-files/environment-specs.ini +++ b/packages/framework/ini-files/environment-specs.ini @@ -160,6 +160,9 @@ envvar-find-in-path MPIF90 : mpif90 envvar-set-if-empty TRILINOS_DIR : ${path_to_src} envvar-set OMPI_CXX: ${TRILINOS_DIR}/packages/kokkos/bin/nvcc_wrapper +[rhel8_cuda-11-gcc-openmpi] +use rhel8_cuda-gcc-openmpi + [rhel8_gcc-openmpi] [rhel8_cxx-20-gcc-openmpi] diff --git a/packages/framework/ini-files/supported-envs.ini b/packages/framework/ini-files/supported-envs.ini index cd472fb0b188..3e83b3ff1dd9 100644 --- a/packages/framework/ini-files/supported-envs.ini +++ b/packages/framework/ini-files/supported-envs.ini @@ -177,6 +177,7 @@ aue-gnu-12.1.0-anaconda3-serial python oneapi-intelmpi cuda-gcc-openmpi +cuda-11-gcc-openmpi gcc-openmpi gcc-serial aue-gcc-openmpi diff --git a/packages/framework/pr_tools/PullRequestLinuxDriver.sh b/packages/framework/pr_tools/PullRequestLinuxDriver.sh index ab4b1fdf0fc9..d491f2dfd7bc 100755 --- a/packages/framework/pr_tools/PullRequestLinuxDriver.sh +++ b/packages/framework/pr_tools/PullRequestLinuxDriver.sh @@ -52,6 +52,8 @@ function bootstrap_modules() { module unload sems-python module load sems-git/2.37.0 module load sems-python/3.9.0 + execute_command_checked "module load sems-ccache" + configure_ccache module list else @@ -255,12 +257,16 @@ test_cmd_options=( --build-dir=${TRILINOS_BUILD_DIR:?} --ctest-driver=${WORKSPACE:?}/Trilinos/cmake/SimpleTesting/cmake/ctest-driver.cmake --ctest-drop-site=${TRILINOS_CTEST_DROP_SITE:?} - --dashboard-build-name=${DASHBOARD_BUILD_NAME} ) +if [[ ${DASHBOARD_BUILD_NAME:-} ]] +then + test_cmd_options+=( "--dashboard-build-name=${DASHBOARD_BUILD_NAME} ") +fi + if [[ ${extra_configure_args} ]] then - test_cmd_options+=( "--extra-configure-args=\"${extra_configure_args}\"") + test_cmd_options+=( "--extra-configure-args=\"${extra_configure_args}\" ") fi if [[ ${GENCONFIG_BUILD_NAME} == *"gnu"* ]] diff --git a/packages/framework/pr_tools/PullRequestLinuxDriverTest.py b/packages/framework/pr_tools/PullRequestLinuxDriverTest.py index 73caa76a1c62..9bcefc2c5691 100755 --- a/packages/framework/pr_tools/PullRequestLinuxDriverTest.py +++ b/packages/framework/pr_tools/PullRequestLinuxDriverTest.py @@ -70,14 +70,12 @@ def parse_args(): required.add_argument('--source-repo-url', dest="source_repo_url", action='store', - default="UNKNOWN", help='Repo with the new changes', required=False) required.add_argument('--target-repo-url', dest="target_repo_url", action='store', - default="UNKNOWN", help='Repo to merge into', required=False) @@ -90,7 +88,6 @@ def parse_args(): required.add_argument('--pullrequest-build-name', dest="pullrequest_build_name", action='store', - default="UNKNOWN", help='The Jenkins job base name', required=False) @@ -109,28 +106,24 @@ def parse_args(): required.add_argument('--jenkins-job-number', dest="jenkins_job_number", action='store', - default="UNKNOWN", help='The Jenkins build number', required=False) optional.add_argument('--dashboard-build-name', dest="dashboard_build_name", action='store', - default="UNKNOWN", help='The build name posted by ctest to a dashboard', required=False) optional.add_argument('--source-dir', dest="source_dir", action='store', - default="UNKNOWN", help="Directory containing the source code to compile/test.", required=False) optional.add_argument('--build-dir', dest="build_dir", action='store', - default="UNKNOWN", help="Path to the build directory.", required=False) @@ -144,7 +137,6 @@ def parse_args(): optional.add_argument('--ctest-driver', dest="ctest_driver", action='store', - default="UNKNOWN", help="Location of the CTest driver script to load via `-S`.", required=False) @@ -231,12 +223,21 @@ def parse_args(): dest='num_concurrent_tests', action='store', default=-1, - help="Set the number of concurrent tests allowd in CTest. " + \ + help="Set the number of concurrent tests allowed in CTest. " + \ "This is equivalent to `ctest -j `. " "If > 0 then this value is used, otherwise the value is calculated " + \ "based on number_of_available_cores / max_test_parallelism" + \ " Default = %(default)s") + optional.add_argument('--slots-per-gpu', + dest='slots_per_gpu', + action='store', + default=2, + help="If the machine has GPUs, this value will be the number " + \ + "of resource slots allowed per GPU (e.g. 4 would allow 4 MPI " + \ + "ranks to talk to each GPU)" + \ + " Default = %(default)s") + optional.add_argument("--enable-ccache", dest="ccache_enable", action="store_true", diff --git a/packages/framework/pr_tools/trilinosprhelpers/TrilinosPRConfigurationBase.py b/packages/framework/pr_tools/trilinosprhelpers/TrilinosPRConfigurationBase.py index 5ca1499c2dfa..9c1f8567fe5e 100644 --- a/packages/framework/pr_tools/trilinosprhelpers/TrilinosPRConfigurationBase.py +++ b/packages/framework/pr_tools/trilinosprhelpers/TrilinosPRConfigurationBase.py @@ -82,6 +82,18 @@ def __init__(self, args): # A R G U M E N T S # -------------------- + @property + def arg_slots_per_gpu(self): + """ + This attribute stores the number of resource slots to be used per GPU + inside of the CTest resource file (e.g. 4 slots per GPU allows for 4 + MPI ranks to talk to the each GPU). + """ + if hasattr(self.args, "slots_per_gpu"): + return self.args.slots_per_gpu + else: + return 2 + @property def arg_extra_configure_args(self): """ @@ -98,11 +110,10 @@ def arg_extra_configure_args(self): if not self._arg_extra_configure_args: if gpu_utils.has_nvidia_gpus(): self.message("-- REMARK: I see that I am running on a machine that has NVidia GPUs; I will feed TriBITS some data enabling GPU resource management") - slots_per_gpu = 2 gpu_indices = gpu_utils.list_nvidia_gpus() - self.message(f"-- REMARK: Using {slots_per_gpu} slots per GPU") + self.message(f"-- REMARK: Using {self.arg_slots_per_gpu} slots per GPU") self.message(f"-- REMARK: Using GPUs {gpu_indices}") - self._arg_extra_configure_args = f"-DTrilinos_AUTOGENERATE_TEST_RESOURCE_FILE:BOOL=ON;-DTrilinos_CUDA_NUM_GPUS:STRING={len(gpu_indices)};-DTrilinos_CUDA_SLOTS_PER_GPU:STRING={slots_per_gpu}" + (";" + self.args.extra_configure_args if self.args.extra_configure_args else "") + self._arg_extra_configure_args = f"-DTrilinos_AUTOGENERATE_TEST_RESOURCE_FILE:BOOL=ON;-DTrilinos_CUDA_NUM_GPUS:STRING={len(gpu_indices)};-DTrilinos_CUDA_SLOTS_PER_GPU:STRING={self.arg_slots_per_gpu}" + (";" + self.args.extra_configure_args if self.args.extra_configure_args else "") else: self._arg_extra_configure_args = self.args.extra_configure_args return self._arg_extra_configure_args @@ -494,14 +505,14 @@ def pullrequest_build_name(self): PR--test-- """ - if "Pull Request" in self.arg_pullrequest_cdash_track: + if self.arg_dashboard_build_name: + output = self.arg_dashboard_build_name + elif "Pull Request" in self.arg_pullrequest_cdash_track: output = f"PR-{self.arg_pullrequest_number}-test-{self.arg_pr_genconfig_job_name}" - if not self.arg_jenkins_job_number or "UNKNOWN" not in str(self.arg_jenkins_job_number): + if self.arg_jenkins_job_number: output = f"{output}-{self.arg_jenkins_job_number}" - elif self.arg_dashboard_build_name != "__UNKNOWN__": - output = self.arg_dashboard_build_name else: - output = self.arg_pr_genconfig_job_name + output = self.arg_pr_genconfig_job_name return output diff --git a/packages/framework/pr_tools/trilinosprhelpers/unittests/test_TrilinosPRConfigurationBase.py b/packages/framework/pr_tools/trilinosprhelpers/unittests/test_TrilinosPRConfigurationBase.py index b323abfe1943..0d48f8d5b190 100755 --- a/packages/framework/pr_tools/trilinosprhelpers/unittests/test_TrilinosPRConfigurationBase.py +++ b/packages/framework/pr_tools/trilinosprhelpers/unittests/test_TrilinosPRConfigurationBase.py @@ -212,7 +212,7 @@ def dummy_args(self): target_branch_name="develop", pullrequest_build_name="Trilinos-pullrequest-gcc", genconfig_build_name="rhel8_sems-gnu-openmpi_release_static_no-kokkos-arch_no-asan_no-complex_no-fpic_mpi_no-pt_no-rdc_no-package-enables", - dashboard_build_name="gnu-openmpi_release_static", + dashboard_build_name=None, jenkins_job_number=99, pullrequest_number='0000', pullrequest_cdash_track="Pull Request", @@ -331,56 +331,41 @@ def test_TrilinosPRConfigurationCDashTrack(self): self.assertEqual(cdash_track, "Pull Request") - def test_TrilinosPRConfigurationBuildNamePython2(self): - args = self.dummy_args_python3() - pr_config = trilinosprhelpers.TrilinosPRConfigurationBase(args) - build_name = pr_config.pullrequest_build_name - print("--- build_name = {}".format(build_name)) - expected_build_name = "PR-{}-test-{}-{}".format(args.pullrequest_number, args.genconfig_build_name, args.jenkins_job_number) - self.assertEqual(build_name, expected_build_name) - - def test_TrilinosPRConfigurationBaseBuildNameGCC720(self): args = self.dummy_args_gcc_720() pr_config = trilinosprhelpers.TrilinosPRConfigurationBase(args) build_name = pr_config.pullrequest_build_name - print("--- build_name = {}".format(build_name)) - expected_build_name = "PR-{}-test-{}-{}".format(args.pullrequest_number, args.genconfig_build_name, args.jenkins_job_number) + expected_build_name = f"PR-{args.pullrequest_number}-test-{args.genconfig_build_name}-{args.jenkins_job_number}" self.assertEqual(build_name, expected_build_name) - def test_TrilinosPRConfigurationBaseBuildNameContainsPullRequest(self): + + def test_TrilinosPRConfigurationBaseBuildGroupContainsPullRequest(self): """Test that a group containing 'Pull Request' causes the build name to reflect a PR build.""" args = self.dummy_args_gcc_720() args.pullrequest_cdash_track = "Pull Request (Non-blocking)" pr_config = trilinosprhelpers.TrilinosPRConfigurationBase(args) build_name = pr_config.pullrequest_build_name - print("--- build_name = {}".format(build_name)) - expected_build_name = "PR-{}-test-{}-{}".format(args.pullrequest_number, args.genconfig_build_name, args.jenkins_job_number) + expected_build_name = f"PR-{args.pullrequest_number}-test-{args.genconfig_build_name}-{args.jenkins_job_number}" self.assertEqual(build_name, expected_build_name) + def test_TrilinosPRConfigurationBaseBuildNameNonPRTrack(self): + """Test that the default (non-PR) dashboard name is the GenConfig build ID.""" args = self.dummy_args_non_pr_track() - pr_config = trilinosprhelpers.TrilinosPRConfigurationBase(args) - build_name = pr_config.pullrequest_build_name - expected_build_name = args.dashboard_build_name + expected_build_name = args.genconfig_build_name self.assertEqual(build_name, expected_build_name) - def test_TrilinosPRConfigurationBaseBuildNameDefaultDashboardName(self): - """ - Test the build name output when dashboard_build_name contains - the default Jenkins parameter value, '__UNKNOWN__'. - """ - args = self.dummy_args_non_pr_track() - args.dashboard_build_name = "__UNKNOWN__" - + def test_TrilinosPRConfigurationBaseBuildNamePassed(self): + """Test that a passed build name is used.""" + args = self.dummy_args() + args.dashboard_build_name = "some-dashboard-build-name" pr_config = trilinosprhelpers.TrilinosPRConfigurationBase(args) - - result_build_name = pr_config.pullrequest_build_name - expected_build_name = args.genconfig_build_name - self.assertEqual(expected_build_name, result_build_name) + build_name = pr_config.pullrequest_build_name + expected_build_name = args.dashboard_build_name + self.assertEqual(build_name, expected_build_name) def test_TrilinosPRConfigurationBaseDashboardModelPRTrack(self): diff --git a/packages/framework/pr_tools/unittests/test_PullRequestLinuxDriverTest.py b/packages/framework/pr_tools/unittests/test_PullRequestLinuxDriverTest.py index 91e8e7068d71..8d596681d910 100755 --- a/packages/framework/pr_tools/unittests/test_PullRequestLinuxDriverTest.py +++ b/packages/framework/pr_tools/unittests/test_PullRequestLinuxDriverTest.py @@ -67,12 +67,12 @@ def setUp(self): target_branch_name='real_trash', pullrequest_build_name='Some_odd_compiler', genconfig_build_name='Some_odd_compiler_and_options', - dashboard_build_name='UNKNOWN', + dashboard_build_name=None, pullrequest_number='4242', jenkins_job_number='2424', - source_dir='UNKNOWN', - build_dir='UNKNOWN', - ctest_driver='UNKNOWN', + source_dir=None, + build_dir=None, + ctest_driver=None, ctest_drop_site='testing.sandia.gov', pullrequest_cdash_track='Pull Request', pullrequest_env_config_file='/dev/null/Trilinos_clone/pr_config/pullrequest.ini', @@ -97,9 +97,9 @@ def setUp(self): | - [R] genconfig-build-name : Some_odd_compiler_and_options | - [R] pullrequest-number : 4242 | - [R] jenkins-job-number : 2424 - | - [R] source-dir : UNKNOWN - | - [R] build-dir : UNKNOWN - | - [R] ctest-driver : UNKNOWN + | - [R] source-dir : None + | - [R] build-dir : None + | - [R] ctest-driver : None | - [R] ctest-drop-site : testing.sandia.gov | | - [O] dry-run : False @@ -157,7 +157,6 @@ def test_parse_args_returns_defaults(self): ''' No inputs ''' - import difflib with self.m_argv, self.stdoutRedirect as m_stdout: returned_default = PullRequestLinuxDriverTest.parse_args() diff --git a/packages/ifpack/src/Ifpack_CrsRiluk.cpp b/packages/ifpack/src/Ifpack_CrsRiluk.cpp index 3628d7adfc8d..afb597ae5d6b 100644 --- a/packages/ifpack/src/Ifpack_CrsRiluk.cpp +++ b/packages/ifpack/src/Ifpack_CrsRiluk.cpp @@ -379,7 +379,7 @@ int Ifpack_CrsRiluk::Factor() { int NumIn, NumL, NumU; // Get Maximun Row length - int MaxNumEntries = L_->MaxNumEntries() + U_->MaxNumEntries() + 1; + int MaxNumEntries = L_->MaxNumEntries() + U_->MaxNumEntries() + 2; std::vector InI(MaxNumEntries); // Allocate temp space std::vector InV(MaxNumEntries); diff --git a/packages/intrepid2/src/Cell/Intrepid2_CellData.cpp b/packages/intrepid2/src/Cell/Intrepid2_CellData.cpp index 7fd508035a2f..23e184e60907 100644 --- a/packages/intrepid2/src/Cell/Intrepid2_CellData.cpp +++ b/packages/intrepid2/src/Cell/Intrepid2_CellData.cpp @@ -21,6 +21,9 @@ const CellTopologyData* Intrepid2::getCellTopologyData(const unsigned& cellTopologyKey){ const CellTopologyData* cellTopologyData; switch (cellTopologyKey) { + case shards::Node::key: + cellTopologyData = shards::getCellTopologyData(); + break; case shards::Line<2>::key: cellTopologyData = shards::getCellTopologyData>(); break; diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_TET_Cn_FEMDef.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_TET_Cn_FEMDef.hpp index 1c8715525bc0..44f09ce456eb 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_TET_Cn_FEMDef.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_HGRAD_TET_Cn_FEMDef.hpp @@ -71,9 +71,12 @@ getValues( OutputViewType output, Impl::Basis_HGRAD_TET_Cn_FEM_ORTH:: Serial::getValues(phis, input, workView, order); - for (ordinal_type i=0;i + bugprone-reserved-identifier, + -clang-analyzer* + +WarningsAsErrors: '*' +FormatStyle: 'file' +HeaderFilterRegex: '.*Kokkos.*' diff --git a/packages/kokkos-kernels/CHANGELOG.md b/packages/kokkos-kernels/CHANGELOG.md index 343f815ed721..58695228e4ef 100644 --- a/packages/kokkos-kernels/CHANGELOG.md +++ b/packages/kokkos-kernels/CHANGELOG.md @@ -1,5 +1,83 @@ # Change Log +## [4.5.00](https://github.com/kokkos/kokkos-kernels/tree/4.5.00) +[Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/4.4.01...4.5.00) + +### New Features + +#### Batched updates +- Implement batched serial laswp [\#2395](https://github.com/kokkos/kokkos-kernels/pull/2395) +- implement batched serial iamax [\#2399](https://github.com/kokkos/kokkos-kernels/pull/2399) +- Implement batched serial pbtrs [\#2330](https://github.com/kokkos/kokkos-kernels/pull/2330) +- Implement batched serial pbtrf [\#2322](https://github.com/kokkos/kokkos-kernels/pull/2322) +- Implement batched serial pttrs [\#2277](https://github.com/kokkos/kokkos-kernels/pull/2277) + +#### BLAS +- gemm perf_test: print matrix sizes [\#2362](https://github.com/kokkos/kokkos-kernels/pull/2362) + +#### LAPACK +- Modify validity checks for output views sizes in svd [\#2350](https://github.com/kokkos/kokkos-kernels/pull/2350) + +#### ODE +- Improved convergence and robustness of Runge-Kutta integrators [\#2229](https://github.com/kokkos/kokkos-kernels/pull/2229) + +#### Sparse +- Don't use bulk sort in KokkosSparse::sort_crs_matrix sometimes [\#2353](https://github.com/kokkos/kokkos-kernels/pull/2353) +- `OpenMPSmartStatic_SPMV.hpp`: throw if posix_memalign fails [\#2368](https://github.com/kokkos/kokkos-kernels/pull/2368) + +### Enhancements: +- Eti extern marking [\#2292](https://github.com/kokkos/kokkos-kernels/pull/2292) + +#### Common utilities +- Add KokkosKernels::eager_initialize() to common [\#2317](https://github.com/kokkos/kokkos-kernels/pull/2317) +- Put default types in KokkosKernels namespace [\#2341](https://github.com/kokkos/kokkos-kernels/pull/2341) + +#### TPL support +- Add MAGMA TPL support for GESV on HIP backend [\#2326](https://github.com/kokkos/kokkos-kernels/pull/2326) +- BLAS - gemv: using fallback when mode is 't' or 'c' and onemkl is used [\#2272](https://github.com/kokkos/kokkos-kernels/pull/2272) + +### Bug Fixes: +- SerialInverseLU: fix overflow in integer multiplication [\#2410](https://github.com/kokkos/kokkos-kernels/pull/2410) +- Fix potential overflow issue in spiluk [\#2409](https://github.com/kokkos/kokkos-kernels/pull/2409) +- Mult result conversion [\#2405](https://github.com/kokkos/kokkos-kernels/pull/2405) +- Blas1 asum: workaround for openblas error with short vectors [\#2384](https://github.com/kokkos/kokkos-kernels/pull/2384) +- Set `KokkosKernels_ENABLE_COMPONENT` variables to value instead of variable name [\#2380](https://github.com/kokkos/kokkos-kernels/pull/2380) +- Block Sptrsv fixes [\#2376](https://github.com/kokkos/kokkos-kernels/pull/2376) +- Fix set-but-unused in Test_ODE_BDF [\#2355](https://github.com/kokkos/kokkos-kernels/pull/2355) +- sparse_sort_crs: fix column shuffle indices [\#2346](https://github.com/kokkos/kokkos-kernels/pull/2346) +- Fix #2344: SVD hanging [\#2345](https://github.com/kokkos/kokkos-kernels/pull/2345) +- Some compilers throw shadow warnings in static functions [\#2297](https://github.com/kokkos/kokkos-kernels/pull/2297) +- A couple platforms do not correctly handle static complexes [\#2285](https://github.com/kokkos/kokkos-kernels/pull/2285) +- Help gcc/8.3 with ctad issue [\#2265](https://github.com/kokkos/kokkos-kernels/pull/2265) + +### Deprecations and Cleanup: +- Clean and replace forbidden names for macros and symbols (see [identifiers](https://en.cppreference.com/w/cpp/language/identifiers)) + - Rename reserved identifiers [\#2373](https://github.com/kokkos/kokkos-kernels/pull/2373) + - search/replace KOKKOS_-prefixed macros [\#2372](https://github.com/kokkos/kokkos-kernels/pull/2372) + - Deprecate `__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__` [\#2406](https://github.com/kokkos/kokkos-kernels/pull/2406) + - Deprecate `__KOKKOSBATCHED_ENABLE_INTEL_MKL__` [\#2403](https://github.com/kokkos/kokkos-kernels/pull/2403) + - Deprecate `__KOKKOSBATCHED_PROMOTION__` [\#2392](https://github.com/kokkos/kokkos-kernels/pull/2392) +- Update atomic function usage ahead of Kokkos deprecation and removal + - Prefer `expected == atomic_compare_exchange(ptr, expected, desired)` [\#2387](https://github.com/kokkos/kokkos-kernels/pull/2387) + - Prefer `atomic_assign(ptr, val) -> atomic_store(ptr, val)` [\#2383](https://github.com/kokkos/kokkos-kernels/pull/2383) + - Replace atomic_{inc, dec}[rement] [\#2386](https://github.com/kokkos/kokkos-kernels/pull/2386) + - Do not specify template argument when using Kokkos atomics [\#2382](https://github.com/kokkos/kokkos-kernels/pull/2382) +- Deprecate redundant team-level sort functions [\#2306](https://github.com/kokkos/kokkos-kernels/pull/2306) +- Free allocated `MatrixPrec` [\#2407](https://github.com/kokkos/kokkos-kernels/pull/2407) +- Reduce duplicated code in trsv [\#2388](https://github.com/kokkos/kokkos-kernels/pull/2388) +- perf_tests: remove false dependence on google test [\#2385](https://github.com/kokkos/kokkos-kernels/pull/2385) +- `kk_is_gpu_exec_space()` -> `is_gpu_exec_space_v` [\#2354](https://github.com/kokkos/kokkos-kernels/pull/2354) +- remove unneeded volatile qualifier for Kokkos::Single [\#2333](https://github.com/kokkos/kokkos-kernels/pull/2333) + +### Documentation and Testing: + +- CI: `address` sanitizer and most of `undefined` sanitizer [\#2408](https://github.com/kokkos/kokkos-kernels/pull/2408) +- Workflow volta70 [\#2356](https://github.com/kokkos/kokkos-kernels/pull/2356) +- AT-2: adding non-TPL build for HIP backend [\#2329](https://github.com/kokkos/kokkos-kernels/pull/2329) +- Workflows: Add remaining spr and bdw checks [\#2321](https://github.com/kokkos/kokkos-kernels/pull/2321) +- Remove review trigger and group github-{BDW,H100,MI201} under github-AT2 [\#2320](https://github.com/kokkos/kokkos-kernels/pull/2320) +- Don't error out if graph unit tests disabled [\#2305](https://github.com/kokkos/kokkos-kernels/pull/2305) + ## [4.4.01](https://github.com/kokkos/kokkos-kernels/tree/4.4.01) [Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/4.4.00...4.4.01) diff --git a/packages/kokkos-kernels/CMakeLists.txt b/packages/kokkos-kernels/CMakeLists.txt index fd3515e0c44a..c766cdf18713 100644 --- a/packages/kokkos-kernels/CMakeLists.txt +++ b/packages/kokkos-kernels/CMakeLists.txt @@ -10,8 +10,8 @@ SET(KOKKOSKERNELS_TOP_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}) SET(KOKKOSKERNELS_TOP_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) SET(KokkosKernels_VERSION_MAJOR 4) -SET(KokkosKernels_VERSION_MINOR 4) -SET(KokkosKernels_VERSION_PATCH 1) +SET(KokkosKernels_VERSION_MINOR 5) +SET(KokkosKernels_VERSION_PATCH 0) SET(KokkosKernels_VERSION "${KokkosKernels_VERSION_MAJOR}.${KokkosKernels_VERSION_MINOR}.${KokkosKernels_VERSION_PATCH}") #Set variables for config file @@ -136,13 +136,13 @@ ELSE() SET(CMAKE_HIP_ARCHITECTURES ${Kokkos_HIP_ARCHITECTURES}) ENDIF() ENDIF() - IF(${Kokkos_VERSION} VERSION_GREATER_EQUAL "4.3.01") + IF(${Kokkos_VERSION} VERSION_GREATER_EQUAL "4.4.01") MESSAGE(STATUS "Found Kokkos version ${Kokkos_VERSION} at ${Kokkos_DIR}") - IF((${Kokkos_VERSION} VERSION_GREATER "4.4.99")) + IF((${Kokkos_VERSION} VERSION_GREATER "4.5.99")) MESSAGE(WARNING "Configuring with Kokkos ${Kokkos_VERSION} which is newer than the expected develop branch - version check may need update") ENDIF() ELSE() - MESSAGE(FATAL_ERROR "Kokkos Kernels ${KokkosKernels_VERSION} requires Kokkos 4.3.01 or greater (found ${Kokkos_VERSION})") + MESSAGE(FATAL_ERROR "Kokkos Kernels ${KokkosKernels_VERSION} requires Kokkos 4.4.01 or greater (found ${Kokkos_VERSION})") ENDIF() ENDIF() diff --git a/packages/kokkos-kernels/batched/KokkosBatched_Util.hpp b/packages/kokkos-kernels/batched/KokkosBatched_Util.hpp index 8a1cb0e01b73..520427e8c6ec 100644 --- a/packages/kokkos-kernels/batched/KokkosBatched_Util.hpp +++ b/packages/kokkos-kernels/batched/KokkosBatched_Util.hpp @@ -13,13 +13,27 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_UTIL_HPP__ -#define __KOKKOSBATCHED_UTIL_HPP__ +#ifndef KOKKOSBATCHED_UTIL_HPP +#define KOKKOSBATCHED_UTIL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) // no experimental name space guard for trilinos -#define __KOKKOSBATCHED_PROMOTION__ 1 + +#if defined(KOKKOS_COMPILER_MSVC) +#define KOKKOSBATCHED_IMPL_PROMOTION \ + (__pragma(message("warning: __KOKKOSBATCHED_PROMOTION__ is deprecated and will be removed in a future version")) 1) +#elif defined(KOKKOS_COMPILER_GNU) || defined(KOKKOS_COMPILER_CLANG) +#define KOKKOSBATCHED_IMPL_PROMOTION \ + (__extension__({ \ + _Pragma("GCC warning \"__KOKKOSBATCHED_PROMOTION__ is deprecated and will be removed in a future version\""); \ + 1; \ + })) +#else +#define KOKKOSBATCHED_IMPL_PROMOTION 1 // no good way to deprecate? +#endif + +#define __KOKKOSBATCHED_PROMOTION__ KOKKOSBATCHED_IMPL_PROMOTION #include #include @@ -42,10 +56,41 @@ // TPL macros #if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) -#define __KOKKOSBATCHED_ENABLE_INTEL_MKL__ 1 + +#if defined(KOKKOS_COMPILER_MSVC) +#define KOKKOSBATCHED_IMPL_ENABLE_INTEL_MKL \ + (__pragma( \ + message("warning: __KOKKOSBATCHED_ENABLE_INTEL_MKL__ is deprecated and will be removed in a future version")) 1) +#elif defined(KOKKOS_COMPILER_GNU) || defined(KOKKOS_COMPILER_CLANG) +#define KOKKOSBATCHED_IMPL_ENABLE_INTEL_MKL \ + (__extension__({ \ + _Pragma("warning: __KOKKOSBATCHED_ENABLE_INTEL_MKL__ is deprecated and will be removed in a future version"); \ + 1; \ + })) +#else +#define KOKKOSBATCHED_IMPL_ENABLE_INTEL_MKL 1 // no good way to deprecate? +#endif +#define __KOKKOSBATCHED_ENABLE_INTEL_MKL__ KOKKOSBATCHED_IMPL_ENABLE_INTEL_MKL + #include "mkl_version.h" #if __INTEL_MKL__ >= 2018 -#define __KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__ 1 + +#if defined(KOKKOS_COMPILER_MSVC) +#define KOKKOSBATCHED_IMPL_ENABLE_INTEL_MKL_BATCHED \ + (__pragma(message( \ + "warning: __KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__ is deprecated and will be removed in a future version")) 1) +#elif defined(KOKKOS_COMPILER_GNU) || defined(KOKKOS_COMPILER_CLANG) +#define KOKKOSBATCHED_IMPL_ENABLE_INTEL_MKL_BATCHED \ + (__extension__({ \ + _Pragma( \ + "warning: __KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__ is deprecated and will be removed in a future version"); \ + 1; \ + })) +#else +#define KOKKOSBATCHED_IMPL_ENABLE_INTEL_MKL_BATCHED 1 // no good way to deprecate? +#endif +#define __KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__ KOKKOSBATCHED_IMPL_ENABLE_INTEL_MKL_BATCHED + #define __KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__ 1 #include "mkl.h" // #include "mkl_types.h" @@ -671,4 +716,4 @@ KOKKOS_INLINE_FUNCTION void fma_bounds_check(ViewType v, SizeType m, SizeType n, } } // namespace KokkosBatched -#endif // __KOKKOSBATCHED_UTIL_HPP__ +#endif // KOKKOSBATCHED_UTIL_HPP diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_AddRadial_Impl.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_AddRadial_Impl.hpp index d89a82ae2cb8..bc8c0ab77298 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_AddRadial_Impl.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_AddRadial_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_ADD_RADIAL_IMPL_HPP__ -#define __KOKKOSBATCHED_ADD_RADIAL_IMPL_HPP__ +#ifndef KOKKOSBATCHED_ADD_RADIAL_IMPL_HPP +#define KOKKOSBATCHED_ADD_RADIAL_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_AddRadial_Internal.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_AddRadial_Internal.hpp index 634879530e5f..7b8220dcfea2 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_AddRadial_Internal.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_AddRadial_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_ADD_RADIAL_INTERNAL_HPP__ -#define __KOKKOSBATCHED_ADD_RADIAL_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_ADD_RADIAL_INTERNAL_HPP +#define KOKKOSBATCHED_ADD_RADIAL_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_ApplyGivens_Serial_Internal.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_ApplyGivens_Serial_Internal.hpp index 2d3d2af915f1..095488ac1670 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_ApplyGivens_Serial_Internal.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_ApplyGivens_Serial_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_APPLY_GIVENS_SERIAL_INTERNAL_HPP__ -#define __KOKKOSBATCHED_APPLY_GIVENS_SERIAL_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_APPLY_GIVENS_SERIAL_INTERNAL_HPP +#define KOKKOSBATCHED_APPLY_GIVENS_SERIAL_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_ApplyHouseholder_Serial_Impl.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_ApplyHouseholder_Serial_Impl.hpp index db85d96680a8..b39fa3e5ad9f 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_ApplyHouseholder_Serial_Impl.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_ApplyHouseholder_Serial_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_APPLY_HOUSEHOLDER_SERIAL_IMPL_HPP__ -#define __KOKKOSBATCHED_APPLY_HOUSEHOLDER_SERIAL_IMPL_HPP__ +#ifndef KOKKOSBATCHED_APPLY_HOUSEHOLDER_SERIAL_IMPL_HPP +#define KOKKOSBATCHED_APPLY_HOUSEHOLDER_SERIAL_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_ApplyHouseholder_Serial_Internal.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_ApplyHouseholder_Serial_Internal.hpp index e129fef5a572..574280c82b09 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_ApplyHouseholder_Serial_Internal.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_ApplyHouseholder_Serial_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_APPLY_HOUSEHOLDER_SERIAL_INTERNAL_HPP__ -#define __KOKKOSBATCHED_APPLY_HOUSEHOLDER_SERIAL_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_APPLY_HOUSEHOLDER_SERIAL_INTERNAL_HPP +#define KOKKOSBATCHED_APPLY_HOUSEHOLDER_SERIAL_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_ApplyHouseholder_TeamVector_Impl.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_ApplyHouseholder_TeamVector_Impl.hpp index b322574ad00f..dc5cb104dd57 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_ApplyHouseholder_TeamVector_Impl.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_ApplyHouseholder_TeamVector_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_APPLY_HOUSEHOLDER_TEAMVECTOR_IMPL_HPP__ -#define __KOKKOSBATCHED_APPLY_HOUSEHOLDER_TEAMVECTOR_IMPL_HPP__ +#ifndef KOKKOSBATCHED_APPLY_HOUSEHOLDER_TEAMVECTOR_IMPL_HPP +#define KOKKOSBATCHED_APPLY_HOUSEHOLDER_TEAMVECTOR_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_ApplyHouseholder_TeamVector_Internal.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_ApplyHouseholder_TeamVector_Internal.hpp index 2474a10fe3ee..b3cfc8b6debf 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_ApplyHouseholder_TeamVector_Internal.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_ApplyHouseholder_TeamVector_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_APPLY_HOUSEHOLDER_TEAMVECTOR_INTERNAL_HPP__ -#define __KOKKOSBATCHED_APPLY_HOUSEHOLDER_TEAMVECTOR_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_APPLY_HOUSEHOLDER_TEAMVECTOR_INTERNAL_HPP +#define KOKKOSBATCHED_APPLY_HOUSEHOLDER_TEAMVECTOR_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_ApplyPivot_Impl.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_ApplyPivot_Impl.hpp index 10455f65b600..adbce23d9af3 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_ApplyPivot_Impl.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_ApplyPivot_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_APPLY_PIVOT_IMPL_HPP__ -#define __KOKKOSBATCHED_APPLY_PIVOT_IMPL_HPP__ +#ifndef KOKKOSBATCHED_APPLY_PIVOT_IMPL_HPP +#define KOKKOSBATCHED_APPLY_PIVOT_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_ApplyPivot_Internal.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_ApplyPivot_Internal.hpp index a30138210841..e01325225547 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_ApplyPivot_Internal.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_ApplyPivot_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_APPLY_PIVOT_INTERNAL_HPP__ -#define __KOKKOSBATCHED_APPLY_PIVOT_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_APPLY_PIVOT_INTERNAL_HPP +#define KOKKOSBATCHED_APPLY_PIVOT_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_ApplyQ_Serial_Impl.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_ApplyQ_Serial_Impl.hpp index ba9d85350ffe..f1f2d2908900 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_ApplyQ_Serial_Impl.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_ApplyQ_Serial_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_APPLY_Q_SERIAL_IMPL_HPP__ -#define __KOKKOSBATCHED_APPLY_Q_SERIAL_IMPL_HPP__ +#ifndef KOKKOSBATCHED_APPLY_Q_SERIAL_IMPL_HPP +#define KOKKOSBATCHED_APPLY_Q_SERIAL_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_ApplyQ_Serial_Internal.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_ApplyQ_Serial_Internal.hpp index dbb11df74706..d91ab8c6f63f 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_ApplyQ_Serial_Internal.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_ApplyQ_Serial_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_APPLY_Q_SERIAL_INTERNAL_HPP__ -#define __KOKKOSBATCHED_APPLY_Q_SERIAL_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_APPLY_Q_SERIAL_INTERNAL_HPP +#define KOKKOSBATCHED_APPLY_Q_SERIAL_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_ApplyQ_TeamVector_Impl.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_ApplyQ_TeamVector_Impl.hpp index d6abd61a784e..943706276519 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_ApplyQ_TeamVector_Impl.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_ApplyQ_TeamVector_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_APPLY_Q_TEAMVECTOR_IMPL_HPP__ -#define __KOKKOSBATCHED_APPLY_Q_TEAMVECTOR_IMPL_HPP__ +#ifndef KOKKOSBATCHED_APPLY_Q_TEAMVECTOR_IMPL_HPP +#define KOKKOSBATCHED_APPLY_Q_TEAMVECTOR_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_ApplyQ_TeamVector_Internal.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_ApplyQ_TeamVector_Internal.hpp index 8fc6c8a78af6..5a2c074b54fa 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_ApplyQ_TeamVector_Internal.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_ApplyQ_TeamVector_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_APPLY_Q_TEAMVECTOR_INTERNAL_HPP__ -#define __KOKKOSBATCHED_APPLY_Q_TEAMVECTOR_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_APPLY_Q_TEAMVECTOR_INTERNAL_HPP +#define KOKKOSBATCHED_APPLY_Q_TEAMVECTOR_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Axpy_Impl.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Axpy_Impl.hpp index 6d65ebc294e4..381a53e0a4ae 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Axpy_Impl.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Axpy_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_AXPY_IMPL_HPP__ -#define __KOKKOSBATCHED_AXPY_IMPL_HPP__ +#ifndef KOKKOSBATCHED_AXPY_IMPL_HPP +#define KOKKOSBATCHED_AXPY_IMPL_HPP /// \author Kim Liegeois (knliege@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Copy_Impl.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Copy_Impl.hpp index e11106cc24df..0096c91e664a 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Copy_Impl.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Copy_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_COPY_IMPL_HPP__ -#define __KOKKOSBATCHED_COPY_IMPL_HPP__ +#ifndef KOKKOSBATCHED_COPY_IMPL_HPP +#define KOKKOSBATCHED_COPY_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Copy_Internal.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Copy_Internal.hpp index 004c62646a41..c25215be6d61 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Copy_Internal.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Copy_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_COPY_INTERNAL_HPP__ -#define __KOKKOSBATCHED_COPY_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_COPY_INTERNAL_HPP +#define KOKKOSBATCHED_COPY_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Dot_Internal.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Dot_Internal.hpp index 48d1b1f1acd8..75f366829c18 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Dot_Internal.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Dot_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_DOT_INTERNAL_HPP__ -#define __KOKKOSBATCHED_DOT_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_DOT_INTERNAL_HPP +#define KOKKOSBATCHED_DOT_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Eigendecomposition_Serial_Impl.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Eigendecomposition_Serial_Impl.hpp index 8ca3b09e5977..b76ad99d7cc0 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Eigendecomposition_Serial_Impl.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Eigendecomposition_Serial_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_EIGENDECOMPOSITION_SERIAL_IMPL_HPP__ -#define __KOKKOSBATCHED_EIGENDECOMPOSITION_SERIAL_IMPL_HPP__ +#ifndef KOKKOSBATCHED_EIGENDECOMPOSITION_SERIAL_IMPL_HPP +#define KOKKOSBATCHED_EIGENDECOMPOSITION_SERIAL_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Eigendecomposition_Serial_Internal.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Eigendecomposition_Serial_Internal.hpp index b1cfb6ef253d..62852f7872d3 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Eigendecomposition_Serial_Internal.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Eigendecomposition_Serial_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_EIGENDECOMPOSITION_SERIAL_INTERNAL_HPP__ -#define __KOKKOSBATCHED_EIGENDECOMPOSITION_SERIAL_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_EIGENDECOMPOSITION_SERIAL_INTERNAL_HPP +#define KOKKOSBATCHED_EIGENDECOMPOSITION_SERIAL_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) @@ -338,7 +338,7 @@ struct SerialEigendecompositionInternal { inline static int host_invoke(const int m, RealType* A, const int as0, const int as1, RealType* er, const int ers, RealType* ei, const int eis, RealType* UL, const int uls0, const int uls1, RealType* UR, const int urs0, const int urs1, RealType* w, const int wlen) { -#if defined(__KOKKOSBATCHED_ENABLE_LAPACKE__) || defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) +#if defined(__KOKKOSBATCHED_ENABLE_LAPACKE__) || defined(KOKKOSBATCHED_IMPL_ENABLE_INTEL_MKL) int matrix_layout(0), lda(0), uls(0), urs(0); if (as0 == 1) { assert(uls0 == 1 && "UL is not column major"); diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Eigendecomposition_TeamVector_Impl.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Eigendecomposition_TeamVector_Impl.hpp index 97f68d63de0c..7ffc9f7bc0c2 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Eigendecomposition_TeamVector_Impl.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Eigendecomposition_TeamVector_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_EIGENDECOMPOSITION_TEAMVECTOR_IMPL_HPP__ -#define __KOKKOSBATCHED_EIGENDECOMPOSITION_TEAMVECTOR_IMPL_HPP__ +#ifndef KOKKOSBATCHED_EIGENDECOMPOSITION_TEAMVECTOR_IMPL_HPP +#define KOKKOSBATCHED_EIGENDECOMPOSITION_TEAMVECTOR_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Eigendecomposition_TeamVector_Internal.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Eigendecomposition_TeamVector_Internal.hpp index 567bbd3ad569..95e9d1149dc6 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Eigendecomposition_TeamVector_Internal.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Eigendecomposition_TeamVector_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_EIGENDECOMPOSITION_TEAMVECTOR_INTERNAL_HPP__ -#define __KOKKOSBATCHED_EIGENDECOMPOSITION_TEAMVECTOR_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_EIGENDECOMPOSITION_TEAMVECTOR_INTERNAL_HPP +#define KOKKOSBATCHED_EIGENDECOMPOSITION_TEAMVECTOR_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Eigenvalue_Serial_Internal.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Eigenvalue_Serial_Internal.hpp index 0ac8ed3859a8..408ba2f2ada3 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Eigenvalue_Serial_Internal.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Eigenvalue_Serial_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_EIGENVALUE_SERIAL_INTERNAL_HPP__ -#define __KOKKOSBATCHED_EIGENVALUE_SERIAL_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_EIGENVALUE_SERIAL_INTERNAL_HPP +#define KOKKOSBATCHED_EIGENVALUE_SERIAL_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_FindAmax_Internal.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_FindAmax_Internal.hpp index 42dc9480144d..50bea3fd1026 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_FindAmax_Internal.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_FindAmax_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_FIND_AMAX_INTERNAL_HPP__ -#define __KOKKOSBATCHED_FIND_AMAX_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_FIND_AMAX_INTERNAL_HPP +#define KOKKOSBATCHED_FIND_AMAX_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Francis_Serial_Internal.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Francis_Serial_Internal.hpp index e303cafd1fe1..9a3be4ab5622 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Francis_Serial_Internal.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Francis_Serial_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_FRANCIS_SERIAL_INTERNAL_HPP__ -#define __KOKKOSBATCHED_FRANCIS_SERIAL_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_FRANCIS_SERIAL_INTERNAL_HPP +#define KOKKOSBATCHED_FRANCIS_SERIAL_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Gemm_Serial_Impl.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Gemm_Serial_Impl.hpp index 82d6b1641b30..fae44c8f830d 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Gemm_Serial_Impl.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Gemm_Serial_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_GEMM_SERIAL_IMPL_HPP__ -#define __KOKKOSBATCHED_GEMM_SERIAL_IMPL_HPP__ +#ifndef KOKKOSBATCHED_GEMM_SERIAL_IMPL_HPP +#define KOKKOSBATCHED_GEMM_SERIAL_IMPL_HPP #include "KokkosBatched_Util.hpp" #include "KokkosBatched_Gemm_Serial_Internal.hpp" @@ -36,7 +36,7 @@ namespace KokkosBatched { /// NT/NT /// -#if defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) && defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__) && \ +#if defined(KOKKOSBATCHED_IMPL_ENABLE_INTEL_MKL) && defined(KOKKOSBATCHED_IMPL_ENABLE_INTEL_MKL_BATCHED) && \ defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__) template <> template @@ -95,7 +95,7 @@ KOKKOS_INLINE_FUNCTION int SerialGemm template @@ -154,7 +154,7 @@ KOKKOS_INLINE_FUNCTION int SerialGemm template @@ -213,7 +213,7 @@ KOKKOS_INLINE_FUNCTION int SerialGemm template diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Gemm_Serial_Internal.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Gemm_Serial_Internal.hpp index eaa5b67ffa15..1a83a27112d9 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Gemm_Serial_Internal.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Gemm_Serial_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_GEMM_SERIAL_INTERNAL_HPP__ -#define __KOKKOSBATCHED_GEMM_SERIAL_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_GEMM_SERIAL_INTERNAL_HPP +#define KOKKOSBATCHED_GEMM_SERIAL_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Gemm_TeamVector_Impl.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Gemm_TeamVector_Impl.hpp index 64e65d62d828..3818cc4258a3 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Gemm_TeamVector_Impl.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Gemm_TeamVector_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_GEMM_TEAMVECTOR_IMPL_HPP__ -#define __KOKKOSBATCHED_GEMM_TEAMVECTOR_IMPL_HPP__ +#ifndef KOKKOSBATCHED_GEMM_TEAMVECTOR_IMPL_HPP +#define KOKKOSBATCHED_GEMM_TEAMVECTOR_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Gemm_TeamVector_Internal.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Gemm_TeamVector_Internal.hpp index 8ad7d570df09..1b46270f5aac 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Gemm_TeamVector_Internal.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Gemm_TeamVector_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_GEMM_TEAMVECTOR_INTERNAL_HPP__ -#define __KOKKOSBATCHED_GEMM_TEAMVECTOR_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_GEMM_TEAMVECTOR_INTERNAL_HPP +#define KOKKOSBATCHED_GEMM_TEAMVECTOR_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Gemm_Team_Impl.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Gemm_Team_Impl.hpp index 0a9fb87b9e6d..32c28b456226 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Gemm_Team_Impl.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Gemm_Team_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_GEMM_TEAM_IMPL_HPP__ -#define __KOKKOSBATCHED_GEMM_TEAM_IMPL_HPP__ +#ifndef KOKKOSBATCHED_GEMM_TEAM_IMPL_HPP +#define KOKKOSBATCHED_GEMM_TEAM_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Gemm_Team_Internal.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Gemm_Team_Internal.hpp index 1b77a2599109..b8647f520548 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Gemm_Team_Internal.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Gemm_Team_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_GEMM_TEAM_INTERNAL_HPP__ -#define __KOKKOSBATCHED_GEMM_TEAM_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_GEMM_TEAM_INTERNAL_HPP +#define KOKKOSBATCHED_GEMM_TEAM_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) @@ -115,7 +115,7 @@ KOKKOS_INLINE_FUNCTION int TeamGemmInternal::invoke( Kokkos::parallel_for(Kokkos::TeamThreadRange(member, mq * nq), [&](const int &ij) { int i, j; // note: the condition is constexpr - if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { + if (KokkosKernels::Impl::is_gpu_exec_space_v) { i = ij % mq * mb; j = ij / mq * nb; } else { diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Impl.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Impl.hpp index 4f54bf7f31c0..924529511375 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Impl.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_GEMV_TEAMVECTOR_IMPL_HPP__ -#define __KOKKOSBATCHED_GEMV_TEAMVECTOR_IMPL_HPP__ +#ifndef KOKKOSBATCHED_GEMV_TEAMVECTOR_IMPL_HPP +#define KOKKOSBATCHED_GEMV_TEAMVECTOR_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Internal.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Internal.hpp index 8d9676b22360..a10bfbaae135 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Internal.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_GEMV_TEAMVECTOR_INTERNAL_HPP__ -#define __KOKKOSBATCHED_GEMV_TEAMVECTOR_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_GEMV_TEAMVECTOR_INTERNAL_HPP +#define KOKKOSBATCHED_GEMV_TEAMVECTOR_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Gemv_Team_Impl.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Gemv_Team_Impl.hpp index 16f12529d49d..028e52b859ee 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Gemv_Team_Impl.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Gemv_Team_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_GEMV_TEAM_IMPL_HPP__ -#define __KOKKOSBATCHED_GEMV_TEAM_IMPL_HPP__ +#ifndef KOKKOSBATCHED_GEMV_TEAM_IMPL_HPP +#define KOKKOSBATCHED_GEMV_TEAM_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Gemv_Team_Internal.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Gemv_Team_Internal.hpp index 8f63e24b27ec..8b534c9081b1 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Gemv_Team_Internal.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Gemv_Team_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_GEMV_TEAM_INTERNAL_HPP__ -#define __KOKKOSBATCHED_GEMV_TEAM_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_GEMV_TEAM_INTERNAL_HPP +#define KOKKOSBATCHED_GEMV_TEAM_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp index ba18cbafd7f8..399eeea12740 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_GESV_IMPL_HPP__ -#define __KOKKOSBATCHED_GESV_IMPL_HPP__ +#ifndef KOKKOSBATCHED_GESV_IMPL_HPP +#define KOKKOSBATCHED_GESV_IMPL_HPP /// \author Kim Liegeois (knliege@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Givens_Serial_Internal.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Givens_Serial_Internal.hpp index 963862661b16..2fb22d275d46 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Givens_Serial_Internal.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Givens_Serial_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_GIVENS_SERIAL_INTERNAL_HPP__ -#define __KOKKOSBATCHED_GIVENS_SERIAL_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_GIVENS_SERIAL_INTERNAL_HPP +#define KOKKOSBATCHED_GIVENS_SERIAL_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_HadamardProduct_Impl.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_HadamardProduct_Impl.hpp index 658acd6b605f..4cd3a27eebd1 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_HadamardProduct_Impl.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_HadamardProduct_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_HADAMARDPRODUCT_IMPL_HPP__ -#define __KOKKOSBATCHED_HADAMARDPRODUCT_IMPL_HPP__ +#ifndef KOKKOSBATCHED_HADAMARDPRODUCT_IMPL_HPP +#define KOKKOSBATCHED_HADAMARDPRODUCT_IMPL_HPP /// \author Kim Liegeois (knliege@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_HessenbergFormQ_Serial_Internal.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_HessenbergFormQ_Serial_Internal.hpp index 8db5d40a9801..a72c8c007799 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_HessenbergFormQ_Serial_Internal.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_HessenbergFormQ_Serial_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_HESSENBERG_FORM_Q_SERIAL_INTERNAL_HPP__ -#define __KOKKOSBATCHED_HESSENBERG_FORM_Q_SERIAL_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_HESSENBERG_FORM_Q_SERIAL_INTERNAL_HPP +#define KOKKOSBATCHED_HESSENBERG_FORM_Q_SERIAL_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_HessenbergQR_WithShift_Serial_Internal.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_HessenbergQR_WithShift_Serial_Internal.hpp index 3815a9e18e8f..92498c0ce6b4 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_HessenbergQR_WithShift_Serial_Internal.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_HessenbergQR_WithShift_Serial_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_HESSENBERG_QR_WITH_SHIFT_SERIAL_INTERNAL_HPP__ -#define __KOKKOSBATCHED_HESSENBERG_QR_WITH_SHIFT_SERIAL_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_HESSENBERG_QR_WITH_SHIFT_SERIAL_INTERNAL_HPP +#define KOKKOSBATCHED_HESSENBERG_QR_WITH_SHIFT_SERIAL_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Hessenberg_Serial_Internal.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Hessenberg_Serial_Internal.hpp index 44c5b44373c7..99ab0171b655 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Hessenberg_Serial_Internal.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Hessenberg_Serial_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_HESSENBERG_SERIAL_INTERNAL_HPP__ -#define __KOKKOSBATCHED_HESSENBERG_SERIAL_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_HESSENBERG_SERIAL_INTERNAL_HPP +#define KOKKOSBATCHED_HESSENBERG_SERIAL_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Armpl_Impl.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Armpl_Impl.hpp index 7e814646a206..8191ac299714 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Armpl_Impl.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Armpl_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_HOSTLEVEL_GEMM_ARMPL_IMPL_HPP__ -#define __KOKKOSBATCHED_HOSTLEVEL_GEMM_ARMPL_IMPL_HPP__ +#ifndef KOKKOSBATCHED_HOSTLEVEL_GEMM_ARMPL_IMPL_HPP +#define KOKKOSBATCHED_HOSTLEVEL_GEMM_ARMPL_IMPL_HPP #if defined(KOKKOSKERNELS_ENABLE_TPL_ARMPL) && ARMPL_BUILD >= 1058 #include "KokkosBatched_Util.hpp" #include "KokkosKernels_Error.hpp" diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_HostLevel_Gemm_DblBuf_Impl.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_HostLevel_Gemm_DblBuf_Impl.hpp index 6888de725d79..2e63da90e7c5 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_HostLevel_Gemm_DblBuf_Impl.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_HostLevel_Gemm_DblBuf_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_HOSTLEVEL_GEMM_DBLBUF_IMPL_HPP__ -#define __KOKKOSBATCHED_HOSTLEVEL_GEMM_DBLBUF_IMPL_HPP__ +#ifndef KOKKOSBATCHED_HOSTLEVEL_GEMM_DBLBUF_IMPL_HPP +#define KOKKOSBATCHED_HOSTLEVEL_GEMM_DBLBUF_IMPL_HPP #include "KokkosBatched_Util.hpp" #include "KokkosKernels_Error.hpp" @@ -249,8 +249,7 @@ class BatchedDblBufGemm { CViewType __C; ScalarType __alpha, __beta; int __k; - size_t __n_sub_tiles; - unsigned __tiles_per_col, __tiles_per_row; + size_t __n_sub_tiles, __tiles_per_col, __tiles_per_row; public: size_t get_n_sub_tiles() { return __n_sub_tiles; } @@ -283,8 +282,8 @@ class BatchedDblBufGemm { // with '!!'. This extra tile will hang off the edge of the 2-rank matrix. // For cases where tiles hang off the edge, we over-compute 0s within // registers via a conditional bounds check selected at compile-time. - __tiles_per_row = ei.__c_m / TILE_M + !!((unsigned)ei.__c_m % TILE_M); - __tiles_per_col = ei.__c_n / TILE_N + !!((unsigned)ei.__c_n % TILE_N); + __tiles_per_row = ei.__c_m / TILE_M + !!((size_t)ei.__c_m % TILE_M); + __tiles_per_col = ei.__c_n / TILE_N + !!((size_t)ei.__c_n % TILE_N); __n_sub_tiles = __tiles_per_row * __tiles_per_col; } diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp index 6216aeb099c8..a248f4e14c03 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_HOSTLEVEL_GEMM_IMPL_HPP__ -#define __KOKKOSBATCHED_HOSTLEVEL_GEMM_IMPL_HPP__ +#ifndef KOKKOSBATCHED_HOSTLEVEL_GEMM_IMPL_HPP +#define KOKKOSBATCHED_HOSTLEVEL_GEMM_IMPL_HPP #include #include // Trans, BatchLayout #include @@ -128,7 +128,7 @@ int BatchedGemmImpl(BatchedGemmHandleType *const handle, const ScalarType alpha, using layout_type = typename CViewType::array_layout; using exec_space = typename CViewType::execution_space; constexpr bool is_vector = KokkosBatched::is_vector::value; - constexpr bool on_gpu = KokkosKernels::Impl::kk_is_gpu_exec_space(); + constexpr bool on_gpu = KokkosKernels::Impl::is_gpu_exec_space_v; constexpr bool on_x86_64 = KokkosKernels::Impl::kk_is_x86_64_mem_space(); constexpr bool on_a64fx = KokkosKernels::Impl::kk_is_a64fx_mem_space(); bool out_of_range = false; @@ -277,4 +277,4 @@ int BatchedGemmImpl(BatchedGemmHandleType *const handle, const ScalarType alpha, } } // namespace Impl } // namespace KokkosBatched -#endif // __KOKKOSBATCHED_HOSTLEVEL_GEMM_IMPL_HPP__ +#endif // KOKKOSBATCHED_HOSTLEVEL_GEMM_IMPL_HPP diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Serial_Impl.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Serial_Impl.hpp index 8da3c7acd1ae..c0d48da2f3a7 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Serial_Impl.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Serial_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_HOSTLEVEL_GEMM_SERIAL_IMPL_HPP__ -#define __KOKKOSBATCHED_HOSTLEVEL_GEMM_SERIAL_IMPL_HPP__ +#ifndef KOKKOSBATCHED_HOSTLEVEL_GEMM_SERIAL_IMPL_HPP +#define KOKKOSBATCHED_HOSTLEVEL_GEMM_SERIAL_IMPL_HPP #include "KokkosBatched_Gemm_Decl.hpp" namespace KokkosBatched { diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Spec.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Spec.hpp index 6f06694f0971..3b312533aa89 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Spec.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Spec.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_HOSTLEVEL_GEMM_SPEC_HPP__ -#define __KOKKOSBATCHED_HOSTLEVEL_GEMM_SPEC_HPP__ +#ifndef KOKKOSBATCHED_HOSTLEVEL_GEMM_SPEC_HPP +#define KOKKOSBATCHED_HOSTLEVEL_GEMM_SPEC_HPP #include #include @@ -165,6 +165,72 @@ struct BatchedGemmSpec { } // namespace Impl } // namespace KokkosBatched +// ETI instantiation macros, consumed by *.cpp.in files +#define KOKKOSBATCHED_GEMM_ETI_SPEC_DECL_INNER(ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + extern template struct BatchedGemmSpec, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + false, true>; + +#if defined(KOKKOSKERNELS_DECL_LAYOUTRIGHT) +#define KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_DECL_INNER(ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) \ + KOKKOSBATCHED_GEMM_ETI_SPEC_DECL_INNER(ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, Kokkos::LayoutRight, \ + EXEC_SPACE, MEM_SPACE) +#else +#define KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_DECL_INNER(ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) +#endif + +#if defined(KOKKOSKERNELS_DECL_LAYOUTLEFT) +#define KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_DECL_INNER(ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) \ + KOKKOSBATCHED_GEMM_ETI_SPEC_DECL_INNER(ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, Kokkos::LayoutLeft, \ + EXEC_SPACE, MEM_SPACE) +#else +#define KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_DECL_INNER(ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) +#endif + +///////////////// BatchLayout::Left Permutations ///////////////// +#define KOKKOSBATCHED_GEMM_NT_NT_BLL_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_DECL_INNER(Trans::NoTranspose, Trans::NoTranspose, BatchLayout::Left, SCALAR, \ + LAYOUT, EXEC_SPACE, MEM_SPACE) + +#define KOKKOSBATCHED_GEMM_NT_T_BLL_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_DECL_INNER(Trans::NoTranspose, Trans::Transpose, BatchLayout::Left, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) + +#define KOKKOSBATCHED_GEMM_T_NT_BLL_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_DECL_INNER(Trans::Transpose, Trans::NoTranspose, BatchLayout::Left, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) + +#define KOKKOSBATCHED_GEMM_T_T_BLL_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_DECL_INNER(Trans::Transpose, Trans::Transpose, BatchLayout::Left, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) + +///////////////// BatchLayout::Right Permutations ///////////////// +#define KOKKOSBATCHED_GEMM_NT_NT_BLR_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_DECL_INNER(Trans::NoTranspose, Trans::NoTranspose, BatchLayout::Right, SCALAR, \ + LAYOUT, EXEC_SPACE, MEM_SPACE) + +#define KOKKOSBATCHED_GEMM_NT_T_BLR_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_DECL_INNER(Trans::NoTranspose, Trans::Transpose, BatchLayout::Right, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) + +#define KOKKOSBATCHED_GEMM_T_NT_BLR_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_DECL_INNER(Trans::Transpose, Trans::NoTranspose, BatchLayout::Right, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) + +#define KOKKOSBATCHED_GEMM_T_T_BLR_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_DECL_INNER(Trans::Transpose, Trans::Transpose, BatchLayout::Right, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) + // ETI instantiation macros, consumed by *.cpp.in files #define KOKKOSBATCHED_GEMM_ETI_SPEC_INST_INNER(ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, EXEC_SPACE, \ MEM_SPACE) \ @@ -230,4 +296,15 @@ struct BatchedGemmSpec { #define KOKKOSBATCHED_GEMM_T_T_BLR_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_INST_INNER(Trans::Transpose, Trans::Transpose, BatchLayout::Right, SCALAR, LAYOUT, \ EXEC_SPACE, MEM_SPACE) -#endif // __KOKKOSBATCHED_HOSTLEVEL_GEMM_SPEC_HPP__ + +#include +#include +#include +#include + +#include +#include +#include +#include + +#endif // KOKKOSBATCHED_HOSTLEVEL_GEMM_SPEC_HPP diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Householder_Serial_Impl.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Householder_Serial_Impl.hpp index c8f5c7a20eb0..e4274d335203 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Householder_Serial_Impl.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Householder_Serial_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_HOUSEHOLDER_SERIAL_IMPL_HPP__ -#define __KOKKOSBATCHED_HOUSEHOLDER_SERIAL_IMPL_HPP__ +#ifndef KOKKOSBATCHED_HOUSEHOLDER_SERIAL_IMPL_HPP +#define KOKKOSBATCHED_HOUSEHOLDER_SERIAL_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Householder_Serial_Internal.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Householder_Serial_Internal.hpp index 0257ff4d9b97..15e20a5e57d6 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Householder_Serial_Internal.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Householder_Serial_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_HOUSEHOLDER_SERIAL_INTERNAL_HPP__ -#define __KOKKOSBATCHED_HOUSEHOLDER_SERIAL_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_HOUSEHOLDER_SERIAL_INTERNAL_HPP +#define KOKKOSBATCHED_HOUSEHOLDER_SERIAL_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Householder_TeamVector_Impl.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Householder_TeamVector_Impl.hpp index bc55a646bc25..3bcde1964cde 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Householder_TeamVector_Impl.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Householder_TeamVector_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_HOUSEHOLDER_TEAMVECTOR_IMPL_HPP__ -#define __KOKKOSBATCHED_HOUSEHOLDER_TEAMVECTOR_IMPL_HPP__ +#ifndef KOKKOSBATCHED_HOUSEHOLDER_TEAMVECTOR_IMPL_HPP +#define KOKKOSBATCHED_HOUSEHOLDER_TEAMVECTOR_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Householder_TeamVector_Internal.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Householder_TeamVector_Internal.hpp index 1074dc4280c2..efd046e232ba 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Householder_TeamVector_Internal.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Householder_TeamVector_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_HOUSEHOLDER_TEAMVECTOR_INTERNAL_HPP__ -#define __KOKKOSBATCHED_HOUSEHOLDER_TEAMVECTOR_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_HOUSEHOLDER_TEAMVECTOR_INTERNAL_HPP +#define KOKKOSBATCHED_HOUSEHOLDER_TEAMVECTOR_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Iamax_Serial_Impl.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Iamax_Serial_Impl.hpp new file mode 100644 index 000000000000..9c0f99028b85 --- /dev/null +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Iamax_Serial_Impl.hpp @@ -0,0 +1,37 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBATCHED_IAMAX_SERIAL_IMPL_HPP_ +#define KOKKOSBATCHED_IAMAX_SERIAL_IMPL_HPP_ + +/// \author Yuuichi Asahi (yuuichi.asahi@cea.fr) + +#include "KokkosBatched_Iamax_Serial_Internal.hpp" + +namespace KokkosBatched { + +template +KOKKOS_INLINE_FUNCTION typename XViewType::size_type SerialIamax::invoke(const XViewType &x) { + static_assert(Kokkos::is_view_v, "KokkosBatched::iamax: XViewType is not a Kokkos::View."); + if (x.extent(0) <= 1) return 0; + using size_type = typename XViewType::size_type; + using value_type = typename XViewType::non_const_value_type; + return KokkosBatched::Impl::SerialIamaxInternal::invoke(x.extent(0), x.data(), x.stride(0)); +} + +} // namespace KokkosBatched + +#endif // KOKKOSBATCHED_IAMAX_SERIAL_IMPL_HPP_ diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Iamax_Serial_Internal.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Iamax_Serial_Internal.hpp new file mode 100644 index 000000000000..89aed299ae94 --- /dev/null +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Iamax_Serial_Internal.hpp @@ -0,0 +1,60 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBATCHED_IAMAX_SERIAL_INTERNAL_HPP_ +#define KOKKOSBATCHED_IAMAX_SERIAL_INTERNAL_HPP_ + +/// \author Yuuichi Asahi (yuuichi.asahi@cea.fr) + +#include +#include "KokkosBatched_Util.hpp" + +namespace KokkosBatched { +namespace Impl { + +/// +/// Serial Internal Impl +/// ======================== + +struct SerialIamaxInternal { + template + KOKKOS_INLINE_FUNCTION static IndexType invoke(const int n, const ValueType *KOKKOS_RESTRICT x, const int xs0); +}; + +template +KOKKOS_INLINE_FUNCTION IndexType SerialIamaxInternal::invoke(const int n, const ValueType *KOKKOS_RESTRICT x, + const int xs0) { + using ats = typename Kokkos::ArithTraits; + using RealType = typename ats::mag_type; + + RealType amax = Kokkos::abs(x[0 * xs0]); + IndexType imax = 0; + + for (IndexType i = 1; i < static_cast(n); ++i) { + const RealType abs_x_i = Kokkos::abs(x[i * xs0]); + if (abs_x_i > amax) { + amax = abs_x_i; + imax = i; + } + } + + return imax; +}; + +} // namespace Impl +} // namespace KokkosBatched + +#endif // KOKKOSBATCHED_IAMAX_SERIAL_INTERNAL_HPP_ diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_InnerGemmFixA_Serial_Impl.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_InnerGemmFixA_Serial_Impl.hpp index eb576f1dff8c..771cd8ade9fb 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_InnerGemmFixA_Serial_Impl.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_InnerGemmFixA_Serial_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_INNER_GEMM_FIX_A_SERIAL_IMPL_HPP__ -#define __KOKKOSBATCHED_INNER_GEMM_FIX_A_SERIAL_IMPL_HPP__ +#ifndef KOKKOSBATCHED_INNER_GEMM_FIX_A_SERIAL_IMPL_HPP +#define KOKKOSBATCHED_INNER_GEMM_FIX_A_SERIAL_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_InnerGemmFixB_Serial_Impl.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_InnerGemmFixB_Serial_Impl.hpp index 6912c285a60f..8a5f17eaea53 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_InnerGemmFixB_Serial_Impl.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_InnerGemmFixB_Serial_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_INNER_GEMM_FIX_B_SERIAL_IMPL_HPP__ -#define __KOKKOSBATCHED_INNER_GEMM_FIX_B_SERIAL_IMPL_HPP__ +#ifndef KOKKOSBATCHED_INNER_GEMM_FIX_B_SERIAL_IMPL_HPP +#define KOKKOSBATCHED_INNER_GEMM_FIX_B_SERIAL_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_InnerGemmFixC_Serial_Impl.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_InnerGemmFixC_Serial_Impl.hpp index 9ad08549cbf7..e090ce57bd36 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_InnerGemmFixC_Serial_Impl.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_InnerGemmFixC_Serial_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_INNER_GEMM_FIX_C_SERIAL_IMPL_HPP__ -#define __KOKKOSBATCHED_INNER_GEMM_FIX_C_SERIAL_IMPL_HPP__ +#ifndef KOKKOSBATCHED_INNER_GEMM_FIX_C_SERIAL_IMPL_HPP +#define KOKKOSBATCHED_INNER_GEMM_FIX_C_SERIAL_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_InnerGemmFixC_Team_Impl.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_InnerGemmFixC_Team_Impl.hpp index a3d6dece581f..ae552ddb9123 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_InnerGemmFixC_Team_Impl.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_InnerGemmFixC_Team_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_INNER_GEMM_FIX_C_TEAM_IMPL_HPP__ -#define __KOKKOSBATCHED_INNER_GEMM_FIX_C_TEAM_IMPL_HPP__ +#ifndef KOKKOSBATCHED_INNER_GEMM_FIX_C_TEAM_IMPL_HPP +#define KOKKOSBATCHED_INNER_GEMM_FIX_C_TEAM_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_InnerLU_Serial_Impl.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_InnerLU_Serial_Impl.hpp index 0d74598b2449..fb8409afabe5 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_InnerLU_Serial_Impl.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_InnerLU_Serial_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_INNER_LU_SERIAL_IMPL_HPP__ -#define __KOKKOSBATCHED_INNER_LU_SERIAL_IMPL_HPP__ +#ifndef KOKKOSBATCHED_INNER_LU_SERIAL_IMPL_HPP +#define KOKKOSBATCHED_INNER_LU_SERIAL_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_InnerTrsm_Serial_Impl.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_InnerTrsm_Serial_Impl.hpp index 04825ac61c03..35062c8db75e 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_InnerTrsm_Serial_Impl.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_InnerTrsm_Serial_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_INNER_TRSM_SERIAL_IMPL_HPP__ -#define __KOKKOSBATCHED_INNER_TRSM_SERIAL_IMPL_HPP__ +#ifndef KOKKOSBATCHED_INNER_TRSM_SERIAL_IMPL_HPP +#define KOKKOSBATCHED_INNER_TRSM_SERIAL_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_InverseLU_Serial_Impl.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_InverseLU_Serial_Impl.hpp index 215c62e9f258..216b5f96527d 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_InverseLU_Serial_Impl.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_InverseLU_Serial_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_INVERSELU_SERIAL_IMPL_HPP__ -#define __KOKKOSBATCHED_INVERSELU_SERIAL_IMPL_HPP__ +#ifndef KOKKOSBATCHED_INVERSELU_SERIAL_IMPL_HPP +#define KOKKOSBATCHED_INVERSELU_SERIAL_IMPL_HPP /// \author Vinh Dang (vqdang@sandia.gov) @@ -32,7 +32,7 @@ namespace KokkosBatched { /// InverseLU no piv /// -#if defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) && defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__) && \ +#if defined(KOKKOSBATCHED_IMPL_ENABLE_INTEL_MKL) && defined(KOKKOSBATCHED_IMPL_ENABLE_INTEL_MKL_BATCHED) && \ defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__) template <> template diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_LU_Serial_Impl.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_LU_Serial_Impl.hpp index e2acd012cb35..b25b9bbc2ead 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_LU_Serial_Impl.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_LU_Serial_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_LU_SERIAL_IMPL_HPP__ -#define __KOKKOSBATCHED_LU_SERIAL_IMPL_HPP__ +#ifndef KOKKOSBATCHED_LU_SERIAL_IMPL_HPP +#define KOKKOSBATCHED_LU_SERIAL_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) @@ -31,7 +31,7 @@ namespace KokkosBatched { /// SerialLU no piv /// -#if defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) && defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__) && \ +#if defined(KOKKOSBATCHED_IMPL_ENABLE_INTEL_MKL) && defined(KOKKOSBATCHED_IMPL_ENABLE_INTEL_MKL_BATCHED) && \ defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__) template <> template diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_LU_Serial_Internal.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_LU_Serial_Internal.hpp index 6555a16d93cc..52002ad473d4 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_LU_Serial_Internal.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_LU_Serial_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_LU_SERIAL_INTERNAL_HPP__ -#define __KOKKOSBATCHED_LU_SERIAL_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_LU_SERIAL_INTERNAL_HPP +#define KOKKOSBATCHED_LU_SERIAL_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_LU_Team_Impl.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_LU_Team_Impl.hpp index 9ed5e244d2d5..4bcfdbe29c2d 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_LU_Team_Impl.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_LU_Team_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_LU_TEAM_IMPL_HPP__ -#define __KOKKOSBATCHED_LU_TEAM_IMPL_HPP__ +#ifndef KOKKOSBATCHED_LU_TEAM_IMPL_HPP +#define KOKKOSBATCHED_LU_TEAM_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_LU_Team_Internal.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_LU_Team_Internal.hpp index dacfb02ed480..75516d6f6a01 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_LU_Team_Internal.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_LU_Team_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_LU_TEAM_INTERNAL_HPP__ -#define __KOKKOSBATCHED_LU_TEAM_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_LU_TEAM_INTERNAL_HPP +#define KOKKOSBATCHED_LU_TEAM_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Laswp_Serial_Impl.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Laswp_Serial_Impl.hpp new file mode 100644 index 000000000000..445251a6478a --- /dev/null +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Laswp_Serial_Impl.hpp @@ -0,0 +1,104 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBATCHED_LASWP_SERIAL_IMPL_HPP_ +#define KOKKOSBATCHED_LASWP_SERIAL_IMPL_HPP_ + +#include "KokkosBatched_Util.hpp" +#include "KokkosBatched_Laswp_Serial_Internal.hpp" + +/// \author Yuuichi Asahi (yuuichi.asahi@cea.fr) + +namespace KokkosBatched { +namespace Impl { + +template +KOKKOS_INLINE_FUNCTION static int checkLaswpInput(const PivViewType &piv, const AViewType &A) { + static_assert(Kokkos::is_view_v, "KokkosBatched::laswp: PivViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view_v, "KokkosBatched::laswp: AViewType is not a Kokkos::View."); + static_assert(AViewType::rank == 1 || AViewType::rank == 2, "KokkosBatched::laswp: AViewType must have rank 1 or 2."); + static_assert(PivViewType::rank == 1, "KokkosBatched::laswp: PivViewType must have rank 1."); + +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + const int npiv = piv.extent(0); + const int lda = A.extent(0); + if (npiv > lda) { + Kokkos::printf( + "KokkosBatched::laswp: the dimension of the ipiv array must " + "satisfy ipiv.extent(0) <= A.extent(0): ipiv: %d, A: " + "%d \n", + npiv, lda); + return 1; + } +#endif + return 0; +} +} // namespace Impl + +/// +/// Serial Internal Impl +/// ======================== + +/// +//// Forward pivot apply +/// + +template <> +struct SerialLaswp { + template + KOKKOS_INLINE_FUNCTION static int invoke(const PivViewType &piv, const AViewType &A) { + auto info = KokkosBatched::Impl::checkLaswpInput(piv, A); + if (info) return info; + + if constexpr (AViewType::rank == 1) { + const int plen = piv.extent(0), ps0 = piv.stride(0), as0 = A.stride(0); + return KokkosBatched::Impl::SerialLaswpVectorForwardInternal::invoke(plen, piv.data(), ps0, A.data(), as0); + } else if constexpr (AViewType::rank == 2) { + // row permutation + const int plen = piv.extent(0), ps0 = piv.stride(0), n = A.extent(1), as0 = A.stride(0), as1 = A.stride(1); + return KokkosBatched::Impl::SerialLaswpMatrixForwardInternal::invoke(n, plen, piv.data(), ps0, A.data(), as0, + as1); + } + return 0; + } +}; + +/// +/// Backward pivot apply +/// + +template <> +struct SerialLaswp { + template + KOKKOS_INLINE_FUNCTION static int invoke(const PivViewType piv, const AViewType &A) { + auto info = KokkosBatched::Impl::checkLaswpInput(piv, A); + if (info) return info; + + if constexpr (AViewType::rank == 1) { + const int plen = piv.extent(0), ps0 = piv.stride(0), as0 = A.stride(0); + return KokkosBatched::Impl::SerialLaswpVectorBackwardInternal::invoke(plen, piv.data(), ps0, A.data(), as0); + } else if constexpr (AViewType::rank == 2) { + // row permutation + const int plen = piv.extent(0), ps0 = piv.stride(0), n = A.extent(1), as0 = A.stride(0), as1 = A.stride(1); + return KokkosBatched::Impl::SerialLaswpMatrixBackwardInternal::invoke(n, plen, piv.data(), ps0, A.data(), as0, + as1); + } + return 0; + } +}; +} // namespace KokkosBatched + +#endif // KOKKOSBATCHED_LASWP_SERIAL_IMPL_HPP_ diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Laswp_Serial_Internal.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Laswp_Serial_Internal.hpp new file mode 100644 index 000000000000..dc49f367b1c1 --- /dev/null +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Laswp_Serial_Internal.hpp @@ -0,0 +1,150 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBATCHED_LASWP_SERIAL_INTERNAL_HPP_ +#define KOKKOSBATCHED_LASWP_SERIAL_INTERNAL_HPP_ + +/// \author Yuuichi Asahi (yuuichi.asahi@cea.fr) + +#include "KokkosBatched_Util.hpp" + +namespace KokkosBatched { +namespace Impl { + +/// +/// Serial Internal Impl +/// ======================== + +/// +//// Forward pivot apply +/// + +struct SerialLaswpVectorForwardInternal { + template + KOKKOS_INLINE_FUNCTION static int invoke(const int plen, const IntType *KOKKOS_RESTRICT p, const int ps0, + /* */ ValueType *KOKKOS_RESTRICT A, const int as0) { + for (int i = 0; i < plen; ++i) { + const int piv = p[i * ps0]; + if (piv != i) { + const int idx_i = i * as0, idx_p = piv * as0; + const ValueType tmp = A[idx_i]; + A[idx_i] = A[idx_p]; + A[idx_p] = tmp; + } + } + return 0; + } +}; + +struct SerialLaswpMatrixForwardInternal { + template + KOKKOS_INLINE_FUNCTION static int invoke(const int n, const int plen, const IntType *KOKKOS_RESTRICT p, const int ps0, + /* */ ValueType *KOKKOS_RESTRICT A, const int as0, const int as1) { + if (as0 <= as1) { + // LayoutLeft like + for (int j = 0; j < n; j++) { + ValueType *KOKKOS_RESTRICT A_at_j = A + j * as1; + for (int i = 0; i < plen; ++i) { + const int piv = p[i * ps0]; + if (piv != i) { + const int idx_i = i * as0, idx_p = piv * as0; + const ValueType tmp = A_at_j[idx_i]; + A_at_j[idx_i] = A_at_j[idx_p]; + A_at_j[idx_p] = tmp; + } + } + } + } else { + // LayoutRight like + for (int i = 0; i < plen; ++i) { + const int piv = p[i * ps0]; + if (piv != i) { + const int idx_i = i * as0, idx_p = piv * as0; + for (int j = 0; j < n; j++) { + ValueType *KOKKOS_RESTRICT A_at_j = A + j * as1; + const ValueType tmp = A_at_j[idx_i]; + A_at_j[idx_i] = A_at_j[idx_p]; + A_at_j[idx_p] = tmp; + } + } + } + } + return 0; + } +}; + +/// +/// Backward pivot apply +/// + +struct SerialLaswpVectorBackwardInternal { + template + KOKKOS_INLINE_FUNCTION static int invoke(const int plen, const IntType *KOKKOS_RESTRICT p, const int ps0, + /* */ ValueType *KOKKOS_RESTRICT A, const int as0) { + for (int i = (plen - 1); i >= 0; --i) { + const int piv = p[i * ps0]; + if (piv != i) { + const int idx_i = i * as0, idx_p = piv * as0; + const ValueType tmp = A[idx_i]; + A[idx_i] = A[idx_p]; + A[idx_p] = tmp; + } + } + return 0; + } +}; + +struct SerialLaswpMatrixBackwardInternal { + template + KOKKOS_INLINE_FUNCTION static int invoke(const int n, const int plen, const IntType *KOKKOS_RESTRICT p, const int ps0, + /* */ ValueType *KOKKOS_RESTRICT A, const int as0, const int as1) { + if (as0 <= as1) { + // LayoutLeft like + for (int j = 0; j < n; j++) { + ValueType *KOKKOS_RESTRICT A_at_j = A + j * as1; + for (int i = (plen - 1); i >= 0; --i) { + const int piv = p[i * ps0]; + if (piv != i) { + const int idx_i = i * as0, idx_p = piv * as0; + const ValueType tmp = A_at_j[idx_i]; + A_at_j[idx_i] = A_at_j[idx_p]; + A_at_j[idx_p] = tmp; + } + } + } + } else { + // LayoutRight like + for (int i = (plen - 1); i >= 0; --i) { + const int piv = p[i * ps0]; + if (piv != i) { + const int idx_i = i * as0, idx_p = piv * as0; + for (int j = 0; j < n; j++) { + ValueType *KOKKOS_RESTRICT A_at_j = A + j * as1; + const ValueType tmp = A_at_j[idx_i]; + A_at_j[idx_i] = A_at_j[idx_p]; + A_at_j[idx_p] = tmp; + } + } + } + } + return 0; + } +}; + +} // namespace Impl +} // namespace KokkosBatched + +#endif // KOKKOSBATCHED_LASWP_SERIAL_INTERNAL_HPP_ diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_LeftEigenvectorFromSchur_Serial_Internal.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_LeftEigenvectorFromSchur_Serial_Internal.hpp index c266d65c54e3..159317d438d5 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_LeftEigenvectorFromSchur_Serial_Internal.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_LeftEigenvectorFromSchur_Serial_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_LEFT_EIGENVECTOR_FROM_SCHUR_SERIAL_INTERNAL_HPP__ -#define __KOKKOSBATCHED_LEFT_EIGENVECTOR_FROM_SCHUR_SERIAL_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_LEFT_EIGENVECTOR_FROM_SCHUR_SERIAL_INTERNAL_HPP +#define KOKKOSBATCHED_LEFT_EIGENVECTOR_FROM_SCHUR_SERIAL_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Normalize_Internal.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Normalize_Internal.hpp index af6832940b5e..e1e19c01940c 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Normalize_Internal.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Normalize_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_NORMALIZE_INTERNAL_HPP__ -#define __KOKKOSBATCHED_NORMALIZE_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_NORMALIZE_INTERNAL_HPP +#define KOKKOSBATCHED_NORMALIZE_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Pbtrf_Serial_Impl.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Pbtrf_Serial_Impl.hpp new file mode 100644 index 000000000000..1687d82333b8 --- /dev/null +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Pbtrf_Serial_Impl.hpp @@ -0,0 +1,81 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef KOKKOSBATCHED_PBTRF_SERIAL_IMPL_HPP_ +#define KOKKOSBATCHED_PBTRF_SERIAL_IMPL_HPP_ + +#include +#include "KokkosBatched_Pbtrf_Serial_Internal.hpp" + +/// \author Yuuichi Asahi (yuuichi.asahi@cea.fr) + +namespace KokkosBatched { + +template +KOKKOS_INLINE_FUNCTION static int checkPbtrfInput([[maybe_unused]] const ABViewType &Ab) { + static_assert(Kokkos::is_view_v, "KokkosBatched::pbtrf: ABViewType is not a Kokkos::View."); + static_assert(ABViewType::rank == 2, "KokkosBatched::pbtrf: ABViewType must have rank 2."); + +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + const int kd = Ab.extent(0) - 1; + if (kd < 0) { + Kokkos::printf( + "KokkosBatched::pbtrf: input parameter kd must not be less than 0: kd " + "= " + "%d\n", + kd); + return 1; + } +#endif + return 0; +} + +//// Lower //// +template <> +struct SerialPbtrf { + template + KOKKOS_INLINE_FUNCTION static int invoke(const ABViewType &Ab) { + // Quick return if possible + const int n = Ab.extent(1); + if (n == 0) return 0; + + auto info = checkPbtrfInput(Ab); + if (info) return info; + + const int kd = Ab.extent(0) - 1; + return SerialPbtrfInternalLower::invoke(n, Ab.data(), Ab.stride_0(), Ab.stride_1(), kd); + } +}; + +//// Upper //// +template <> +struct SerialPbtrf { + template + KOKKOS_INLINE_FUNCTION static int invoke(const ABViewType &Ab) { + // Quick return if possible + const int n = Ab.extent(1); + if (n == 0) return 0; + + auto info = checkPbtrfInput(Ab); + if (info) return info; + + const int kd = Ab.extent(0) - 1; + return SerialPbtrfInternalUpper::invoke(n, Ab.data(), Ab.stride_0(), Ab.stride_1(), kd); + } +}; + +} // namespace KokkosBatched + +#endif // KOKKOSBATCHED_PBTRF_SERIAL_IMPL_HPP_ diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Pbtrf_Serial_Internal.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Pbtrf_Serial_Internal.hpp new file mode 100644 index 000000000000..0a4ed7d697cb --- /dev/null +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Pbtrf_Serial_Internal.hpp @@ -0,0 +1,272 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBATCHED_PBTRF_SERIAL_INTERNAL_HPP_ +#define KOKKOSBATCHED_PBTRF_SERIAL_INTERNAL_HPP_ + +#include "KokkosBatched_Util.hpp" +#include "KokkosBlas1_serial_scal_impl.hpp" + +namespace KokkosBatched { + +/// +/// Serial Internal Impl +/// ==================== + +/// +/// Lower +/// + +template +struct SerialPbtrfInternalLower { + template + KOKKOS_INLINE_FUNCTION static int invoke(const int an, + /**/ ValueType *KOKKOS_RESTRICT AB, const int as0, const int as1, + const int kd); + + template + KOKKOS_INLINE_FUNCTION static int invoke(const int an, + /**/ Kokkos::complex *KOKKOS_RESTRICT AB, const int as0, + const int as1, const int kd); +}; + +/// +/// Real matrix +/// + +template <> +template +KOKKOS_INLINE_FUNCTION int SerialPbtrfInternalLower::invoke(const int an, + /**/ ValueType *KOKKOS_RESTRICT AB, + const int as0, const int as1, + const int kd) { + // Compute the Cholesky factorization A = L*L'. + for (int j = 0; j < an; ++j) { + auto a_jj = AB[0 * as0 + j * as1]; + + // Check if L (j, j) is positive definite +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + if (a_jj <= 0) { + return j + 1; + } +#endif + + a_jj = Kokkos::sqrt(a_jj); + AB[0 * as0 + j * as1] = a_jj; + + // Compute elements J+1:J+KN of column J and update the + // trailing submatrix within the band. + int kn = Kokkos::min(an - j - 1, kd); + if (kn > 0) { + // scale to diagonal elements + const ValueType alpha = 1.0 / a_jj; + KokkosBlas::Impl::SerialScaleInternal::invoke(kn, alpha, &(AB[1 * as0 + j * as1]), 1); + + // syr (lower) with alpha = -1.0 to diagonal elements + for (int k = 0; k < kn; ++k) { + auto x_k = AB[(k + 1) * as0 + j * as1]; + if (x_k != 0) { + auto temp = -1.0 * x_k; + for (int i = k; i < kn; ++i) { + auto x_i = AB[(i + 1) * as0 + j * as1]; + AB[i * as0 + (j + 1 + k - i) * as1] += x_i * temp; + } + } + } + } + } + + return 0; +} + +/// +/// Complex matrix +/// +template <> +template +KOKKOS_INLINE_FUNCTION int SerialPbtrfInternalLower::invoke( + const int an, + /**/ Kokkos::complex *KOKKOS_RESTRICT AB, const int as0, const int as1, const int kd) { + // Compute the Cholesky factorization A = L*L**H + for (int j = 0; j < an; ++j) { + auto a_jj = AB[0 * as0 + j * as1].real(); + + // Check if L (j, j) is positive definite +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + if (a_jj <= 0) { + AB[0 * as0 + j * as1] = a_jj; + return j + 1; + } +#endif + + a_jj = Kokkos::sqrt(a_jj); + AB[0 * as0 + j * as1] = a_jj; + + // Compute elements J+1:J+KN of column J and update the + // trailing submatrix within the band. + int kn = Kokkos::min(kd, an - j - 1); + if (kn > 0) { + // scale to diagonal elements + const ValueType alpha = 1.0 / a_jj; + KokkosBlas::Impl::SerialScaleInternal::invoke(kn, alpha, &(AB[1 * as0 + j * as1]), 1); + + // zher (lower) with alpha = -1.0 to diagonal elements + for (int k = 0; k < kn; ++k) { + auto x_k = AB[(k + 1) * as0 + j * as1]; + if (x_k != 0) { + auto temp = -1.0 * Kokkos::conj(x_k); + AB[k * as0 + (j + 1) * as1] = AB[k * as0 + (j + 1) * as1].real() + (temp * x_k).real(); + for (int i = k + 1; i < kn; ++i) { + auto x_i = AB[(i + 1) * as0 + j * as1]; + AB[i * as0 + (j + 1 + k - i) * as1] += x_i * temp; + } + } else { + AB[k * as0 + (j + 1) * as1] = AB[k * as0 + (j + 1) * as1].real(); + } + } + } + } + + return 0; +} + +/// +/// Upper +/// + +template +struct SerialPbtrfInternalUpper { + template + KOKKOS_INLINE_FUNCTION static int invoke(const int an, + /**/ ValueType *KOKKOS_RESTRICT AB, const int as0, const int as1, + const int kd); + + template + KOKKOS_INLINE_FUNCTION static int invoke(const int an, + /**/ Kokkos::complex *KOKKOS_RESTRICT AB, const int as0, + const int as1, const int kd); +}; + +/// +/// Real matrix +/// +template <> +template +KOKKOS_INLINE_FUNCTION int SerialPbtrfInternalUpper::invoke(const int an, + /**/ ValueType *KOKKOS_RESTRICT AB, + const int as0, const int as1, + const int kd) { + // Compute the Cholesky factorization A = U'*U. + for (int j = 0; j < an; ++j) { + auto a_jj = AB[kd * as0 + j * as1]; + + // Check if U (j,j) is positive definite +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + if (a_jj <= 0) { + return j + 1; + } +#endif + a_jj = Kokkos::sqrt(a_jj); + AB[kd * as0 + j * as1] = a_jj; + + // Compute elements J+1:J+KN of row J and update the + // trailing submatrix within the band. + int kn = Kokkos::min(kd, an - j - 1); + int kld = Kokkos::max(1, as0 - 1); + if (kn > 0) { + // scale to diagonal elements + const ValueType alpha = 1.0 / a_jj; + KokkosBlas::Impl::SerialScaleInternal::invoke(kn, alpha, &(AB[(kd - 1) * as0 + (j + 1) * as1]), kld); + + // syr (upper) with alpha = -1.0 to diagonal elements + for (int k = 0; k < kn; ++k) { + auto x_k = AB[(k + kd - 1) * as0 + (j + 1 - k) * as1]; + if (x_k != 0) { + auto temp = -1.0 * x_k; + for (int i = 0; i < k + 1; ++i) { + auto x_i = AB[(i + kd - 1) * as0 + (j + 1 - i) * as1]; + AB[(kd + i) * as0 + (j + 1 + k - i) * as1] += x_i * temp; + } + } + } + } + } + + return 0; +} + +/// +/// Complex matrix +/// +template <> +template +KOKKOS_INLINE_FUNCTION int SerialPbtrfInternalUpper::invoke( + const int an, + /**/ Kokkos::complex *KOKKOS_RESTRICT AB, const int as0, const int as1, const int kd) { + // Compute the Cholesky factorization A = U**H * U. + for (int j = 0; j < an; ++j) { + auto a_jj = AB[kd * as0 + j * as1].real(); + + // Check if U (j,j) is positive definite +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + if (a_jj <= 0) { + AB[kd * as0 + j * as1] = a_jj; + return j + 1; + } +#endif + + a_jj = Kokkos::sqrt(a_jj); + AB[kd * as0 + j * as1] = a_jj; + + // Compute elements J+1:J+KN of row J and update the + // trailing submatrix within the band. + int kn = Kokkos::min(kd, an - j - 1); + int kld = Kokkos::max(1, as0 - 1); + if (kn > 0) { + // scale to diagonal elements + const ValueType alpha = 1.0 / a_jj; + KokkosBlas::Impl::SerialScaleInternal::invoke(kn, alpha, &(AB[(kd - 1) * as0 + (j + 1) * as1]), kld); + + // zlacgv to diagonal elements + for (int i = 0; i < kn; ++i) { + AB[(i + kd - 1) * as0 + (j + 1 - i) * as1] = Kokkos::conj(AB[(i + kd - 1) * as0 + (j + 1 - i) * as1]); + } + + // zher (upper) with alpha = -1.0 to diagonal elements + for (int k = 0; k < kn; ++k) { + auto x_k = AB[(k + kd - 1) * as0 + (j + 1 - k) * as1]; + if (x_k != 0) { + auto temp = -1.0 * Kokkos::conj(x_k); + for (int i = 0; i < k + 1; ++i) { + auto x_i = AB[(i + kd - 1) * as0 + (j + 1 - i) * as1]; + AB[(kd + i) * as0 + (j + 1 + k - i) * as1] += x_i * temp; + } + } + } + + // zlacgv to diagonal elements + for (int i = 0; i < kn; ++i) { + AB[(i + kd - 1) * as0 + (j + 1 - i) * as1] = Kokkos::conj(AB[(i + kd - 1) * as0 + (j + 1 - i) * as1]); + } + } + } + + return 0; +} + +} // namespace KokkosBatched + +#endif // KOKKOSBATCHED_PBTRF_SERIAL_INTERNAL_HPP_ diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Pbtrs_Serial_Impl.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Pbtrs_Serial_Impl.hpp new file mode 100644 index 000000000000..931e87805415 --- /dev/null +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Pbtrs_Serial_Impl.hpp @@ -0,0 +1,95 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef KOKKOSBATCHED_PBTRS_SERIAL_IMPL_HPP_ +#define KOKKOSBATCHED_PBTRS_SERIAL_IMPL_HPP_ + +#include +#include "KokkosBatched_Pbtrs_Serial_Internal.hpp" + +/// \author Yuuichi Asahi (yuuichi.asahi@cea.fr) + +namespace KokkosBatched { +namespace Impl { + +template +KOKKOS_INLINE_FUNCTION static int checkPbtrsInput([[maybe_unused]] const AViewType &A, + [[maybe_unused]] const XViewType &x) { + static_assert(Kokkos::is_view_v, "KokkosBatched::pbtrs: AViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view_v, "KokkosBatched::pbtrs: XViewType is not a Kokkos::View."); + static_assert(AViewType::rank == 2, "KokkosBatched::pbtrs: AViewType must have rank 2."); + static_assert(XViewType::rank == 1, "KokkosBatched::pbtrs: XViewType must have rank 1."); + +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + const int ldb = x.extent(0); + const int lda = A.extent(0), n = A.extent(1); + const int kd = lda - 1; + if (kd < 0) { + Kokkos::printf( + "KokkosBatched::pbtrs: leading dimension of A must not be less than 1: %d, A: " + "%d " + "x %d \n", + lda, n); + return 1; + } + if (ldb < Kokkos::max(1, n)) { + Kokkos::printf( + "KokkosBatched::pbtrs: Dimensions of x and A do not match: x: %d, A: " + "%d " + "x %d \n" + "x.extent(0) must be larger or equal to A.extent(1) \n", + ldb, lda, n); + return 1; + } +#endif + return 0; +} +} // namespace Impl + +//// Lower //// +template <> +struct SerialPbtrs { + template + KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, const XViewType &x) { + // Quick return if possible + if (A.extent(1) == 0) return 0; + auto info = KokkosBatched::Impl::checkPbtrsInput(A, x); + if (info) return info; + + const int kd = A.extent(0) - 1; + return KokkosBatched::Impl::SerialPbtrsInternalLower::invoke( + A.extent(1), A.data(), A.stride_0(), A.stride_1(), x.data(), x.stride_0(), kd); + } +}; + +//// Upper //// +template <> +struct SerialPbtrs { + template + KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, const XViewType &x) { + // Quick return if possible + if (A.extent(1) == 0) return 0; + auto info = KokkosBatched::Impl::checkPbtrsInput(A, x); + if (info) return info; + + const int kd = A.extent(0) - 1; + return KokkosBatched::Impl::SerialPbtrsInternalUpper::invoke( + A.extent(1), A.data(), A.stride_0(), A.stride_1(), x.data(), x.stride_0(), kd); + } +}; + +} // namespace KokkosBatched + +#endif // KOKKOSBATCHED_PBTRS_SERIAL_IMPL_HPP_ diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Pbtrs_Serial_Internal.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Pbtrs_Serial_Internal.hpp new file mode 100644 index 000000000000..f380525ae31e --- /dev/null +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Pbtrs_Serial_Internal.hpp @@ -0,0 +1,91 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBATCHED_PBTRS_SERIAL_INTERNAL_HPP_ +#define KOKKOSBATCHED_PBTRS_SERIAL_INTERNAL_HPP_ + +#include "KokkosBatched_Util.hpp" +#include "KokkosBatched_Tbsv_Serial_Internal.hpp" + +namespace KokkosBatched { +namespace Impl { + +/// +/// Serial Internal Impl +/// ==================== + +/// +/// Lower +/// + +template +struct SerialPbtrsInternalLower { + template + KOKKOS_INLINE_FUNCTION static int invoke(const int an, const ValueType *KOKKOS_RESTRICT A, const int as0, + const int as1, + /**/ ValueType *KOKKOS_RESTRICT x, const int xs0, const int kd); +}; + +template <> +template +KOKKOS_INLINE_FUNCTION int SerialPbtrsInternalLower::invoke(const int an, + const ValueType *KOKKOS_RESTRICT A, + const int as0, const int as1, + /**/ ValueType *KOKKOS_RESTRICT x, + const int xs0, const int kd) { + // Solve L*X = B, overwriting B with X. + SerialTbsvInternalLower::invoke(false, an, A, as0, as1, x, xs0, kd); + + // Solve L**T *X = B, overwriting B with X. + constexpr bool do_conj = Kokkos::ArithTraits::is_complex; + SerialTbsvInternalLowerTranspose::invoke(false, do_conj, an, A, as0, as1, x, xs0, kd); + + return 0; +} + +/// +/// Upper +/// + +template +struct SerialPbtrsInternalUpper { + template + KOKKOS_INLINE_FUNCTION static int invoke(const int an, const ValueType *KOKKOS_RESTRICT A, const int as0, + const int as1, + /**/ ValueType *KOKKOS_RESTRICT x, const int xs0, const int kd); +}; + +template <> +template +KOKKOS_INLINE_FUNCTION int SerialPbtrsInternalUpper::invoke(const int an, + const ValueType *KOKKOS_RESTRICT A, + const int as0, const int as1, + /**/ ValueType *KOKKOS_RESTRICT x, + const int xs0, const int kd) { + // Solve U**T *X = B, overwriting B with X. + constexpr bool do_conj = Kokkos::ArithTraits::is_complex; + SerialTbsvInternalUpperTranspose::invoke(false, do_conj, an, A, as0, as1, x, xs0, kd); + + // Solve U*X = B, overwriting B with X. + SerialTbsvInternalUpper::invoke(false, an, A, as0, as1, x, xs0, kd); + + return 0; +} + +} // namespace Impl +} // namespace KokkosBatched + +#endif // KOKKOSBATCHED_PBTRS_SERIAL_INTERNAL_HPP_ diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Pttrs_Serial_Impl.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Pttrs_Serial_Impl.hpp new file mode 100644 index 000000000000..3876e039180a --- /dev/null +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Pttrs_Serial_Impl.hpp @@ -0,0 +1,91 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef KOKKOSBATCHED_PTTRS_SERIAL_IMPL_HPP_ +#define KOKKOSBATCHED_PTTRS_SERIAL_IMPL_HPP_ + +#include +#include +#include "KokkosBatched_Pttrs_Serial_Internal.hpp" + +/// \author Yuuichi Asahi (yuuichi.asahi@cea.fr) + +namespace KokkosBatched { + +template +KOKKOS_INLINE_FUNCTION static int checkPttrsInput([[maybe_unused]] const DViewType &d, + [[maybe_unused]] const EViewType &e, + [[maybe_unused]] const BViewType &b) { + static_assert(Kokkos::is_view_v, "KokkosBatched::pttrs: DViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view_v, "KokkosBatched::pttrs: EViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view_v, "KokkosBatched::pttrs: BViewType is not a Kokkos::View."); + + static_assert(DViewType::rank == 1, "KokkosBatched::pttrs: DViewType must have rank 1."); + static_assert(EViewType::rank == 1, "KokkosBatched::pttrs: EViewType must have rank 1."); + static_assert(BViewType::rank == 1, "KokkosBatched::pttrs: BViewType must have rank 1."); + +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + const int nd = d.extent(0); + const int ne = e.extent(0); + const int ldb = b.extent(0); + + if (ne + 1 != nd) { + Kokkos::printf( + "KokkosBatched::pttrs: Dimensions of d and e do not match: d: %d, e: " + "%d \n" + "e.extent(0) must be equal to d.extent(0) - 1\n", + nd, ne); + return 1; + } + + if (ldb < Kokkos::max(1, nd)) { + Kokkos::printf( + "KokkosBatched::pttrs: Dimensions of d and b do not match: d: %d, b: " + "%d \n" + "b.extent(0) must be larger or equal to d.extent(0) \n", + ldb, nd); + return 1; + } +#endif + return 0; +} + +template +struct SerialPttrs { + template + KOKKOS_INLINE_FUNCTION static int invoke(const DViewType &d, const EViewType &e, const BViewType &b) { + // Quick return if possible + if (d.extent(0) == 0) return 0; + + auto info = checkPttrsInput(d, e, b); + if (info) return info; + + using ScalarType = typename DViewType::non_const_value_type; + int n = d.extent(0); + + if (n == 1) { + const ScalarType alpha = 1.0 / d(0); + return KokkosBlas::SerialScale::invoke(alpha, b); + } + + // Solve A * X = B using the factorization A = L*D*L**T, + // overwriting each right hand side vector with its solution. + return SerialPttrsInternal::invoke(n, d.data(), d.stride(0), e.data(), e.stride(0), + b.data(), b.stride(0)); + } +}; +} // namespace KokkosBatched + +#endif // KOKKOSBATCHED_PTTRS_SERIAL_IMPL_HPP_ diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Pttrs_Serial_Internal.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Pttrs_Serial_Internal.hpp new file mode 100644 index 000000000000..356722d1102c --- /dev/null +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Pttrs_Serial_Internal.hpp @@ -0,0 +1,88 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef KOKKOSBATCHED_PTTRS_SERIAL_INTERNAL_HPP_ +#define KOKKOSBATCHED_PTTRS_SERIAL_INTERNAL_HPP_ + +#include + +/// \author Yuuichi Asahi (yuuichi.asahi@cea.fr) + +namespace KokkosBatched { + +template +struct SerialPttrsInternal { + template + KOKKOS_INLINE_FUNCTION static int invoke(const int n, const ValueType *KOKKOS_RESTRICT d, const int ds0, + const ValueType *KOKKOS_RESTRICT e, const int es0, + ValueType *KOKKOS_RESTRICT b, const int bs0); + + template + KOKKOS_INLINE_FUNCTION static int invoke(const int n, const ValueType *KOKKOS_RESTRICT d, const int ds0, + const Kokkos::complex *KOKKOS_RESTRICT e, const int es0, + Kokkos::complex *KOKKOS_RESTRICT b, const int bs0); +}; + +/// +/// Real matrix +/// + +template +template +KOKKOS_INLINE_FUNCTION int SerialPttrsInternal::invoke( + const int n, const ValueType *KOKKOS_RESTRICT d, const int ds0, const ValueType *KOKKOS_RESTRICT e, const int es0, + ValueType *KOKKOS_RESTRICT b, const int bs0) { + // Solve A * X = B using the factorization L * D * L**T + for (int i = 1; i < n; i++) { + b[i * bs0] -= e[(i - 1) * es0] * b[(i - 1) * bs0]; + } + + b[(n - 1) * bs0] /= d[(n - 1) * ds0]; + + for (int i = n - 2; i >= 0; i--) { + b[i * bs0] = b[i * bs0] / d[i * ds0] - b[(i + 1) * bs0] * e[i * es0]; + } + + return 0; +} + +/// +/// Complex matrix +/// + +template +template +KOKKOS_INLINE_FUNCTION int SerialPttrsInternal::invoke( + const int n, const ValueType *KOKKOS_RESTRICT d, const int ds0, const Kokkos::complex *KOKKOS_RESTRICT e, + const int es0, Kokkos::complex *KOKKOS_RESTRICT b, const int bs0) { + // Solve A * X = B using the factorization L * D * L**H + for (int i = 1; i < n; i++) { + auto tmp_e = std::is_same_v ? Kokkos::conj(e[(i - 1) * es0]) : e[(i - 1) * es0]; + b[i * bs0] -= tmp_e * b[(i - 1) * bs0]; + } + + b[(n - 1) * bs0] /= d[(n - 1) * ds0]; + + for (int i = n - 2; i >= 0; i--) { + auto tmp_e = std::is_same_v ? Kokkos::conj(e[i * es0]) : e[i * es0]; + b[i * bs0] = b[i * bs0] / d[i * ds0] - b[(i + 1) * bs0] * tmp_e; + } + + return 0; +} + +} // namespace KokkosBatched + +#endif // KOKKOSBATCHED_PTTRS_SERIAL_INTERNAL_HPP_ diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_QR_FormQ_Serial_Internal.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_QR_FormQ_Serial_Internal.hpp index 7c717c2eed11..aaacb45ede00 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_QR_FormQ_Serial_Internal.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_QR_FormQ_Serial_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_QR_FORM_Q_SERIAL_INTERNAL_HPP__ -#define __KOKKOSBATCHED_QR_FORM_Q_SERIAL_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_QR_FORM_Q_SERIAL_INTERNAL_HPP +#define KOKKOSBATCHED_QR_FORM_Q_SERIAL_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_QR_FormQ_TeamVector_Internal.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_QR_FormQ_TeamVector_Internal.hpp index af7f458898b6..9f575d29534d 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_QR_FormQ_TeamVector_Internal.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_QR_FormQ_TeamVector_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_QR_FORM_Q_TEAMVECTOR_INTERNAL_HPP__ -#define __KOKKOSBATCHED_QR_FORM_Q_TEAMVECTOR_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_QR_FORM_Q_TEAMVECTOR_INTERNAL_HPP +#define KOKKOSBATCHED_QR_FORM_Q_TEAMVECTOR_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_QR_Serial_Impl.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_QR_Serial_Impl.hpp index 1083e6af2ab8..1da853562815 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_QR_Serial_Impl.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_QR_Serial_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_QR_SERIAL_IMPL_HPP__ -#define __KOKKOSBATCHED_QR_SERIAL_IMPL_HPP__ +#ifndef KOKKOSBATCHED_QR_SERIAL_IMPL_HPP +#define KOKKOSBATCHED_QR_SERIAL_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_QR_Serial_Internal.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_QR_Serial_Internal.hpp index 95ca1c4340d1..8aa4a6361c63 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_QR_Serial_Internal.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_QR_Serial_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_QR_SERIAL_INTERNAL_HPP__ -#define __KOKKOSBATCHED_QR_SERIAL_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_QR_SERIAL_INTERNAL_HPP +#define KOKKOSBATCHED_QR_SERIAL_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_QR_TeamVector_Impl.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_QR_TeamVector_Impl.hpp index 2497e5adf5d4..0f37f2d8dbc8 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_QR_TeamVector_Impl.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_QR_TeamVector_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_QR_TEAMVECTOR_IMPL_HPP__ -#define __KOKKOSBATCHED_QR_TEAMVECTOR_IMPL_HPP__ +#ifndef KOKKOSBATCHED_QR_TEAMVECTOR_IMPL_HPP +#define KOKKOSBATCHED_QR_TEAMVECTOR_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_QR_TeamVector_Internal.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_QR_TeamVector_Internal.hpp index e3dde679865a..9ed2ac3a224f 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_QR_TeamVector_Internal.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_QR_TeamVector_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_QR_TEAMVECTOR_INTERNAL_HPP__ -#define __KOKKOSBATCHED_QR_TEAMVECTOR_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_QR_TEAMVECTOR_INTERNAL_HPP +#define KOKKOSBATCHED_QR_TEAMVECTOR_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_QR_WithColumnPivoting_TeamVector_Impl.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_QR_WithColumnPivoting_TeamVector_Impl.hpp index ed9ccd8cce27..790afd31adb4 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_QR_WithColumnPivoting_TeamVector_Impl.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_QR_WithColumnPivoting_TeamVector_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_QR_WITH_COLUMNPIVOTING_TEAMVECTOR_IMPL_HPP__ -#define __KOKKOSBATCHED_QR_WITH_COLUMNPIVOTING_TEAMVECTOR_IMPL_HPP__ +#ifndef KOKKOSBATCHED_QR_WITH_COLUMNPIVOTING_TEAMVECTOR_IMPL_HPP +#define KOKKOSBATCHED_QR_WITH_COLUMNPIVOTING_TEAMVECTOR_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_QR_WithColumnPivoting_TeamVector_Internal.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_QR_WithColumnPivoting_TeamVector_Internal.hpp index 280bfa434b49..89e68e1757ae 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_QR_WithColumnPivoting_TeamVector_Internal.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_QR_WithColumnPivoting_TeamVector_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_QR_WITH_COLUMNPIVOTING_TEAMVECTOR_INTERNAL_HPP__ -#define __KOKKOSBATCHED_QR_WITH_COLUMNPIVOTING_TEAMVECTOR_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_QR_WITH_COLUMNPIVOTING_TEAMVECTOR_INTERNAL_HPP +#define KOKKOSBATCHED_QR_WITH_COLUMNPIVOTING_TEAMVECTOR_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_RightEigenvectorFromSchur_Serial_Internal.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_RightEigenvectorFromSchur_Serial_Internal.hpp index 029875f81034..a5c2a2074e24 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_RightEigenvectorFromSchur_Serial_Internal.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_RightEigenvectorFromSchur_Serial_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_RIGHT_EIGENVECTOR_FROM_SCHUR_SERIAL_INTERNAL_HPP__ -#define __KOKKOSBATCHED_RIGHT_EIGENVECTOR_FROM_SCHUR_SERIAL_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_RIGHT_EIGENVECTOR_FROM_SCHUR_SERIAL_INTERNAL_HPP +#define KOKKOSBATCHED_RIGHT_EIGENVECTOR_FROM_SCHUR_SERIAL_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_SVD_Serial_Impl.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_SVD_Serial_Impl.hpp index e0c25c2ce739..e1d024ef39fe 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_SVD_Serial_Impl.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_SVD_Serial_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_SVD_SERIAL_IMPL_HPP__ -#define __KOKKOSBATCHED_SVD_SERIAL_IMPL_HPP__ +#ifndef KOKKOSBATCHED_SVD_SERIAL_IMPL_HPP +#define KOKKOSBATCHED_SVD_SERIAL_IMPL_HPP /// \author Brian Kelley (bmkelle@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_SVD_Serial_Internal.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_SVD_Serial_Internal.hpp index 4e5c1e01dd73..56a5619e06d4 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_SVD_Serial_Internal.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_SVD_Serial_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_SVD_SERIAL_INTERNAL_HPP__ -#define __KOKKOSBATCHED_SVD_SERIAL_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_SVD_SERIAL_INTERNAL_HPP +#define KOKKOSBATCHED_SVD_SERIAL_INTERNAL_HPP /// \author Brian Kelley (bmkelle@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Schur2x2_Serial_Internal.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Schur2x2_Serial_Internal.hpp index 41e525d2ba4e..369153c3c582 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Schur2x2_Serial_Internal.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Schur2x2_Serial_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_SCHUR2X2_SERIAL_INTERNAL_HPP__ -#define __KOKKOSBATCHED_SCHUR2X2_SERIAL_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_SCHUR2X2_SERIAL_INTERNAL_HPP +#define KOKKOSBATCHED_SCHUR2X2_SERIAL_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Schur_Serial_Internal.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Schur_Serial_Internal.hpp index c6d55b301bf0..c619d5da26a2 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Schur_Serial_Internal.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Schur_Serial_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_SCHUR_SERIAL_INTERNAL_HPP__ -#define __KOKKOSBATCHED_SCHUR_SERIAL_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_SCHUR_SERIAL_INTERNAL_HPP +#define KOKKOSBATCHED_SCHUR_SERIAL_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_SetIdentity_Impl.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_SetIdentity_Impl.hpp index 9219f3a9ecf2..96fb20c77c89 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_SetIdentity_Impl.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_SetIdentity_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_SET_IDENTITY_IMPL_HPP__ -#define __KOKKOSBATCHED_SET_IDENTITY_IMPL_HPP__ +#ifndef KOKKOSBATCHED_SET_IDENTITY_IMPL_HPP +#define KOKKOSBATCHED_SET_IDENTITY_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_SetIdentity_Internal.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_SetIdentity_Internal.hpp index f5afb5c79cb2..f1432b923190 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_SetIdentity_Internal.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_SetIdentity_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_SET_IDENTITY_INTERNAL_HPP__ -#define __KOKKOSBATCHED_SET_IDENTITY_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_SET_IDENTITY_INTERNAL_HPP +#define KOKKOSBATCHED_SET_IDENTITY_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_SetTriangular_Internal.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_SetTriangular_Internal.hpp index 09e94ab5f3f4..2d31e14c8360 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_SetTriangular_Internal.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_SetTriangular_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_SET_TRIANGULAR_INTERNAL_HPP__ -#define __KOKKOSBATCHED_SET_TRIANGULAR_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_SET_TRIANGULAR_INTERNAL_HPP +#define KOKKOSBATCHED_SET_TRIANGULAR_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_ShiftedTrsv_Serial_Internal.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_ShiftedTrsv_Serial_Internal.hpp index c0963447c40f..8c6baecee1ab 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_ShiftedTrsv_Serial_Internal.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_ShiftedTrsv_Serial_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_SHIFTED_TRSV_SERIAL_INTERNAL_HPP__ -#define __KOKKOSBATCHED_SHIFTED_TRSV_SERIAL_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_SHIFTED_TRSV_SERIAL_INTERNAL_HPP +#define KOKKOSBATCHED_SHIFTED_TRSV_SERIAL_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_SolveUTV_TeamVector_Impl.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_SolveUTV_TeamVector_Impl.hpp index 3b85a26294b2..f707b03a31ce 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_SolveUTV_TeamVector_Impl.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_SolveUTV_TeamVector_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_SOLVE_UTV_TEAMVECTOR_IMPL_HPP__ -#define __KOKKOSBATCHED_SOLVE_UTV_TEAMVECTOR_IMPL_HPP__ +#ifndef KOKKOSBATCHED_SOLVE_UTV_TEAMVECTOR_IMPL_HPP +#define KOKKOSBATCHED_SOLVE_UTV_TEAMVECTOR_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_SolveUTV_TeamVector_Internal.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_SolveUTV_TeamVector_Internal.hpp index 18440745eb90..c1ca8dc7cc89 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_SolveUTV_TeamVector_Internal.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_SolveUTV_TeamVector_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_SOLVE_UTV_TEAMVECTOR_INTERNAL_HPP__ -#define __KOKKOSBATCHED_SOLVE_UTV_TEAMVECTOR_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_SOLVE_UTV_TEAMVECTOR_INTERNAL_HPP +#define KOKKOSBATCHED_SOLVE_UTV_TEAMVECTOR_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Trmm_Serial_Impl.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Trmm_Serial_Impl.hpp index 6313d817c6a6..e2624269b65d 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Trmm_Serial_Impl.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Trmm_Serial_Impl.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef __KOKKOSBATCHED_TRMM_SERIAL_IMPL_HPP__ -#define __KOKKOSBATCHED_TRMM_SERIAL_IMPL_HPP__ +#ifndef KOKKOSBATCHED_TRMM_SERIAL_IMPL_HPP +#define KOKKOSBATCHED_TRMM_SERIAL_IMPL_HPP #include "KokkosBatched_Util.hpp" #include "KokkosBatched_Trmm_Serial_Internal.hpp" @@ -137,4 +137,4 @@ struct SerialTrmm:: return 0; } } // namespace KokkosBatched -#endif // __KOKKOSBATCHED_TRMM_SERIAL_INTERNAL_HPP__ +#endif // KOKKOSBATCHED_TRMM_SERIAL_INTERNAL_HPP diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Trsm_Serial_Impl.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Trsm_Serial_Impl.hpp index 694ac36fa0d2..dc459d23d0f0 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Trsm_Serial_Impl.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Trsm_Serial_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_TRSM_SERIAL_IMPL_HPP__ -#define __KOKKOSBATCHED_TRSM_SERIAL_IMPL_HPP__ +#ifndef KOKKOSBATCHED_TRSM_SERIAL_IMPL_HPP +#define KOKKOSBATCHED_TRSM_SERIAL_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) @@ -29,7 +29,7 @@ namespace KokkosBatched { /// B := inv(tril(A)) (alpha*B) /// A(m x m), B(m x n) -#if defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) && defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__) && \ +#if defined(KOKKOSBATCHED_IMPL_ENABLE_INTEL_MKL) && defined(KOKKOSBATCHED_IMPL_ENABLE_INTEL_MKL_BATCHED) && \ defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__) template struct SerialTrsm { @@ -88,7 +88,7 @@ struct SerialTrsm struct SerialTrsm { @@ -167,7 +167,7 @@ struct SerialTrsm struct SerialTrsm { @@ -227,7 +227,7 @@ struct SerialTrsm struct SerialTrsm { @@ -285,7 +285,7 @@ struct SerialTrsm struct SerialTrsm { diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Trsm_Serial_Internal.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Trsm_Serial_Internal.hpp index 0e65d269f0c4..a797357f7ea7 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Trsm_Serial_Internal.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Trsm_Serial_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_TRSM_SERIAL_INTERNAL_HPP__ -#define __KOKKOSBATCHED_TRSM_SERIAL_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_TRSM_SERIAL_INTERNAL_HPP +#define KOKKOSBATCHED_TRSM_SERIAL_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Trsm_TeamVector_Impl.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Trsm_TeamVector_Impl.hpp index 145f8e0c2df5..e89b114c10e8 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Trsm_TeamVector_Impl.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Trsm_TeamVector_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_TRSM_TEAMVECTOR_IMPL_HPP__ -#define __KOKKOSBATCHED_TRSM_TEAMVECTOR_IMPL_HPP__ +#ifndef KOKKOSBATCHED_TRSM_TEAMVECTOR_IMPL_HPP +#define KOKKOSBATCHED_TRSM_TEAMVECTOR_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Trsm_TeamVector_Internal.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Trsm_TeamVector_Internal.hpp index c1781a001cc9..916fa5322563 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Trsm_TeamVector_Internal.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Trsm_TeamVector_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_TRSM_TEAMVECTOR_INTERNAL_HPP__ -#define __KOKKOSBATCHED_TRSM_TEAMVECTOR_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_TRSM_TEAMVECTOR_INTERNAL_HPP +#define KOKKOSBATCHED_TRSM_TEAMVECTOR_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Trsm_Team_Impl.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Trsm_Team_Impl.hpp index 371dbb483c6a..20426f088ef9 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Trsm_Team_Impl.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Trsm_Team_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_TRSM_TEAM_IMPL_HPP__ -#define __KOKKOSBATCHED_TRSM_TEAM_IMPL_HPP__ +#ifndef KOKKOSBATCHED_TRSM_TEAM_IMPL_HPP +#define KOKKOSBATCHED_TRSM_TEAM_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Trsm_Team_Internal.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Trsm_Team_Internal.hpp index a1a70628094c..1adedad5937e 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Trsm_Team_Internal.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Trsm_Team_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_TRSM_TEAM_INTERNAL_HPP__ -#define __KOKKOSBATCHED_TRSM_TEAM_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_TRSM_TEAM_INTERNAL_HPP +#define KOKKOSBATCHED_TRSM_TEAM_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) @@ -192,7 +192,7 @@ KOKKOS_INLINE_FUNCTION int TeamTrsmInternalLeftUpper::inv Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, iend * jend), [&](const int &ij) { int i, j; - if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { + if (KokkosKernels::Impl::is_gpu_exec_space_v) { i = ij % iend; j = ij / iend; } else { diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Trsv_Serial_Impl.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Trsv_Serial_Impl.hpp index 073970caa682..d7db47375abc 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Trsv_Serial_Impl.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Trsv_Serial_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_TRSV_SERIAL_IMPL_HPP__ -#define __KOKKOSBATCHED_TRSV_SERIAL_IMPL_HPP__ +#ifndef KOKKOSBATCHED_TRSV_SERIAL_IMPL_HPP +#define KOKKOSBATCHED_TRSV_SERIAL_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) @@ -38,7 +38,7 @@ namespace KokkosBatched { /// L/NT /// -#if defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) && defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__) && \ +#if defined(KOKKOSBATCHED_IMPL_ENABLE_INTEL_MKL) && defined(KOKKOSBATCHED_IMPL_ENABLE_INTEL_MKL_BATCHED) && \ defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__) template struct SerialTrsv { @@ -94,7 +94,7 @@ struct SerialTrsv /// L/T /// -#if defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) && defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__) && \ +#if defined(KOKKOSBATCHED_IMPL_ENABLE_INTEL_MKL) && defined(KOKKOSBATCHED_IMPL_ENABLE_INTEL_MKL_BATCHED) && \ defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__) template struct SerialTrsv { @@ -150,7 +150,7 @@ struct SerialTrsv { /// U/NT /// -#if defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) && defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__) && \ +#if defined(KOKKOSBATCHED_IMPL_ENABLE_INTEL_MKL) && defined(KOKKOSBATCHED_IMPL_ENABLE_INTEL_MKL_BATCHED) && \ defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__) template struct SerialTrsv { @@ -206,7 +206,7 @@ struct SerialTrsv /// U/T /// -#if defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) && defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__) && \ +#if defined(KOKKOSBATCHED_IMPL_ENABLE_INTEL_MKL) && defined(KOKKOSBATCHED_IMPL_ENABLE_INTEL_MKL_BATCHED) && \ defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__) template struct SerialTrsv { diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Trsv_Serial_Internal.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Trsv_Serial_Internal.hpp index 43d95377d438..861c72eec999 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Trsv_Serial_Internal.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Trsv_Serial_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_TRSV_SERIAL_INTERNAL_HPP__ -#define __KOKKOSBATCHED_TRSV_SERIAL_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_TRSV_SERIAL_INTERNAL_HPP +#define KOKKOSBATCHED_TRSV_SERIAL_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Trsv_TeamVector_Impl.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Trsv_TeamVector_Impl.hpp index 42c242414c51..6fb729469305 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Trsv_TeamVector_Impl.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Trsv_TeamVector_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_TRSV_TEAMVECTOR_IMPL_HPP__ -#define __KOKKOSBATCHED_TRSV_TEAMVECTOR_IMPL_HPP__ +#ifndef KOKKOSBATCHED_TRSV_TEAMVECTOR_IMPL_HPP +#define KOKKOSBATCHED_TRSV_TEAMVECTOR_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Trsv_TeamVector_Internal.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Trsv_TeamVector_Internal.hpp index 894e684ef2ee..6b2dac17b655 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Trsv_TeamVector_Internal.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Trsv_TeamVector_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_TRSV_TEAMVECTOR_INTERNAL_HPP__ -#define __KOKKOSBATCHED_TRSV_TEAMVECTOR_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_TRSV_TEAMVECTOR_INTERNAL_HPP +#define KOKKOSBATCHED_TRSV_TEAMVECTOR_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Trsv_Team_Impl.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Trsv_Team_Impl.hpp index c658080dc214..8b4be66cc45e 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Trsv_Team_Impl.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Trsv_Team_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_TRSV_TEAM_IMPL_HPP__ -#define __KOKKOSBATCHED_TRSV_TEAM_IMPL_HPP__ +#ifndef KOKKOSBATCHED_TRSV_TEAM_IMPL_HPP +#define KOKKOSBATCHED_TRSV_TEAM_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Trsv_Team_Internal.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Trsv_Team_Internal.hpp index ba3b2ff7b545..f0fe7ca64a31 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Trsv_Team_Internal.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Trsv_Team_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_TRSV_TEAM_INTERNAL_HPP__ -#define __KOKKOSBATCHED_TRSV_TEAM_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_TRSV_TEAM_INTERNAL_HPP +#define KOKKOSBATCHED_TRSV_TEAM_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Trtri_Serial_Impl.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Trtri_Serial_Impl.hpp index 1068bf9e544e..c234edc08e32 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Trtri_Serial_Impl.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Trtri_Serial_Impl.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef __KOKKOSBATCHED_TRTRI_SERIAL_IMPL_HPP__ -#define __KOKKOSBATCHED_TRTRI_SERIAL_IMPL_HPP__ +#ifndef KOKKOSBATCHED_TRTRI_SERIAL_IMPL_HPP +#define KOKKOSBATCHED_TRTRI_SERIAL_IMPL_HPP #include "KokkosBatched_Util.hpp" #include "KokkosBatched_Trtri_Serial_Internal.hpp" @@ -39,4 +39,4 @@ struct SerialTrtri { }; } // namespace KokkosBatched -#endif // __KOKKOSBATCHED_TRTRI_SERIAL_IMPL_HPP__ +#endif // KOKKOSBATCHED_TRTRI_SERIAL_IMPL_HPP diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Trtri_Serial_Internal.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Trtri_Serial_Internal.hpp index f6b0b4bf6d8b..e54852464828 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Trtri_Serial_Internal.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Trtri_Serial_Internal.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef __KOKKOSBATCHED_TRTRI_SERIAL_INTERNAL_HPP__ -#define __KOKKOSBATCHED_TRTRI_SERIAL_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_TRTRI_SERIAL_INTERNAL_HPP +#define KOKKOSBATCHED_TRTRI_SERIAL_INTERNAL_HPP #include "KokkosBatched_Util.hpp" #include "KokkosBatched_Trmm_Serial_Internal.hpp" @@ -130,4 +130,4 @@ KOKKOS_INLINE_FUNCTION int SerialTrtriInternalUpper::inv return 0; } } // namespace KokkosBatched -#endif // __KOKKOSBATCHED_TRTRI_SERIAL_INTERNAL_HPP__ +#endif // KOKKOSBATCHED_TRTRI_SERIAL_INTERNAL_HPP diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_UTV_TeamVector_Impl.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_UTV_TeamVector_Impl.hpp index de5ecebf94da..12856514a586 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_UTV_TeamVector_Impl.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_UTV_TeamVector_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_UTV_TEAMVECTOR_IMPL_HPP__ -#define __KOKKOSBATCHED_UTV_TEAMVECTOR_IMPL_HPP__ +#ifndef KOKKOSBATCHED_UTV_TEAMVECTOR_IMPL_HPP +#define KOKKOSBATCHED_UTV_TEAMVECTOR_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_UTV_TeamVector_Internal.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_UTV_TeamVector_Internal.hpp index e39dba9a40dc..575eba64c4b0 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_UTV_TeamVector_Internal.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_UTV_TeamVector_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_UTV_TEAMVECTOR_INTERNAL_HPP__ -#define __KOKKOSBATCHED_UTV_TEAMVECTOR_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_UTV_TEAMVECTOR_INTERNAL_HPP +#define KOKKOSBATCHED_UTV_TEAMVECTOR_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_UpdateGivens_Internal.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_UpdateGivens_Internal.hpp index 3f56e71422b4..dcc0da4cef78 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_UpdateGivens_Internal.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_UpdateGivens_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_UPDATE_GIVENS_INTERNAL_HPP__ -#define __KOKKOSBATCHED_UPDATE_GIVENS_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_UPDATE_GIVENS_INTERNAL_HPP +#define KOKKOSBATCHED_UPDATE_GIVENS_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Vector_SIMD_Arith.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Vector_SIMD_Arith.hpp index 08628729bc3d..ad10b39c268b 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Vector_SIMD_Arith.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Vector_SIMD_Arith.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_VECTOR_SIMD_ARITH_HPP__ -#define __KOKKOSBATCHED_VECTOR_SIMD_ARITH_HPP__ +#ifndef KOKKOSBATCHED_VECTOR_SIMD_ARITH_HPP +#define KOKKOSBATCHED_VECTOR_SIMD_ARITH_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Vector_SIMD_Logical.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Vector_SIMD_Logical.hpp index f289d5be095a..8da9ac7f154c 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Vector_SIMD_Logical.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Vector_SIMD_Logical.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_VECTOR_SIMD_LOGICAL_HPP__ -#define __KOKKOSBATCHED_VECTOR_SIMD_LOGICAL_HPP__ +#ifndef KOKKOSBATCHED_VECTOR_SIMD_LOGICAL_HPP +#define KOKKOSBATCHED_VECTOR_SIMD_LOGICAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Vector_SIMD_Math.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Vector_SIMD_Math.hpp index eefaf4ce0d7b..eec5c1092fe4 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Vector_SIMD_Math.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Vector_SIMD_Math.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_VECTOR_SIMD_MATH_HPP__ -#define __KOKKOSBATCHED_VECTOR_SIMD_MATH_HPP__ +#ifndef KOKKOSBATCHED_VECTOR_SIMD_MATH_HPP +#define KOKKOSBATCHED_VECTOR_SIMD_MATH_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Vector_SIMD_Misc.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Vector_SIMD_Misc.hpp index 02f717d4583f..be980b986da7 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Vector_SIMD_Misc.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Vector_SIMD_Misc.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_VECTOR_SIMD_MISC_HPP__ -#define __KOKKOSBATCHED_VECTOR_SIMD_MISC_HPP__ +#ifndef KOKKOSBATCHED_VECTOR_SIMD_MISC_HPP +#define KOKKOSBATCHED_VECTOR_SIMD_MISC_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Vector_SIMD_Relation.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Vector_SIMD_Relation.hpp index c95678019250..cce454349868 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Vector_SIMD_Relation.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Vector_SIMD_Relation.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_VECTOR_SIMD_RELATION_HPP__ -#define __KOKKOSBATCHED_VECTOR_SIMD_RELATION_HPP__ +#ifndef KOKKOSBATCHED_VECTOR_SIMD_RELATION_HPP +#define KOKKOSBATCHED_VECTOR_SIMD_RELATION_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Vector_SIMD_View.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Vector_SIMD_View.hpp index 60e5e43e5796..a9818493cd5c 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Vector_SIMD_View.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Vector_SIMD_View.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_VECTOR_SIMD_VIEW_HPP__ -#define __KOKKOSBATCHED_VECTOR_SIMD_VIEW_HPP__ +#ifndef KOKKOSBATCHED_VECTOR_SIMD_VIEW_HPP +#define KOKKOSBATCHED_VECTOR_SIMD_VIEW_HPP #include diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_WilkinsonShift_Serial_Internal.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_WilkinsonShift_Serial_Internal.hpp index a23a9ea4d03d..1f726baf534d 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_WilkinsonShift_Serial_Internal.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_WilkinsonShift_Serial_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_WILKINSON_SHIFT_SERIAL_INTERNAL_HPP__ -#define __KOKKOSBATCHED_WILKINSON_SHIFT_SERIAL_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_WILKINSON_SHIFT_SERIAL_INTERNAL_HPP +#define KOKKOSBATCHED_WILKINSON_SHIFT_SERIAL_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Xpay_Impl.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Xpay_Impl.hpp index 988bd30c931b..fe142f438960 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Xpay_Impl.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Xpay_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_XPAY_IMPL_HPP__ -#define __KOKKOSBATCHED_XPAY_IMPL_HPP__ +#ifndef KOKKOSBATCHED_XPAY_IMPL_HPP +#define KOKKOSBATCHED_XPAY_IMPL_HPP /// \author Kim Liegeois (knliege@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_AddRadial_Decl.hpp b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_AddRadial_Decl.hpp index 7eadc4326941..c906fac7afb0 100644 --- a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_AddRadial_Decl.hpp +++ b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_AddRadial_Decl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_ADD_RADIAL_DECL_HPP__ -#define __KOKKOSBATCHED_ADD_RADIAL_DECL_HPP__ +#ifndef KOKKOSBATCHED_ADD_RADIAL_DECL_HPP +#define KOKKOSBATCHED_ADD_RADIAL_DECL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_ApplyHouseholder_Decl.hpp b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_ApplyHouseholder_Decl.hpp index bee7d3a645d8..21a34963fe1c 100644 --- a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_ApplyHouseholder_Decl.hpp +++ b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_ApplyHouseholder_Decl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_APPLY_HOUSEHOLDER_DECL_HPP__ -#define __KOKKOSBATCHED_APPLY_HOUSEHOLDER_DECL_HPP__ +#ifndef KOKKOSBATCHED_APPLY_HOUSEHOLDER_DECL_HPP +#define KOKKOSBATCHED_APPLY_HOUSEHOLDER_DECL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_ApplyPivot_Decl.hpp b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_ApplyPivot_Decl.hpp index 2aa00bf8c2ca..f4795f863d21 100644 --- a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_ApplyPivot_Decl.hpp +++ b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_ApplyPivot_Decl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_APPLY_PIVOT_DECL_HPP__ -#define __KOKKOSBATCHED_APPLY_PIVOT_DECL_HPP__ +#ifndef KOKKOSBATCHED_APPLY_PIVOT_DECL_HPP +#define KOKKOSBATCHED_APPLY_PIVOT_DECL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_ApplyQ_Decl.hpp b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_ApplyQ_Decl.hpp index 7f78e317004a..9d17a8b43538 100644 --- a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_ApplyQ_Decl.hpp +++ b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_ApplyQ_Decl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_APPLY_Q_DECL_HPP__ -#define __KOKKOSBATCHED_APPLY_Q_DECL_HPP__ +#ifndef KOKKOSBATCHED_APPLY_Q_DECL_HPP +#define KOKKOSBATCHED_APPLY_Q_DECL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Axpy.hpp b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Axpy.hpp index 5b89c0862ec3..dcc02eda2204 100644 --- a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Axpy.hpp +++ b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Axpy.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_AXPY_HPP__ -#define __KOKKOSBATCHED_AXPY_HPP__ +#ifndef KOKKOSBATCHED_AXPY_HPP +#define KOKKOSBATCHED_AXPY_HPP /// \author Kim Liegeois (knliege@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Copy_Decl.hpp b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Copy_Decl.hpp index 0e2b24e91dda..88ed46d56442 100644 --- a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Copy_Decl.hpp +++ b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Copy_Decl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_COPY_DECL_HPP__ -#define __KOKKOSBATCHED_COPY_DECL_HPP__ +#ifndef KOKKOSBATCHED_COPY_DECL_HPP +#define KOKKOSBATCHED_COPY_DECL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Dot.hpp b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Dot.hpp index 545a4954ce83..50242bba8c81 100644 --- a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Dot.hpp +++ b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Dot.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_DOT_HPP__ -#define __KOKKOSBATCHED_DOT_HPP__ +#ifndef KOKKOSBATCHED_DOT_HPP +#define KOKKOSBATCHED_DOT_HPP /// \author Kim Liegeois (knliege@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Eigendecomposition_Decl.hpp b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Eigendecomposition_Decl.hpp index 39ead9e26cae..23c1add77b76 100644 --- a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Eigendecomposition_Decl.hpp +++ b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Eigendecomposition_Decl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_EIGENDECOMPOSITION_DECL_HPP__ -#define __KOKKOSBATCHED_EIGENDECOMPOSITION_DECL_HPP__ +#ifndef KOKKOSBATCHED_EIGENDECOMPOSITION_DECL_HPP +#define KOKKOSBATCHED_EIGENDECOMPOSITION_DECL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Gemm_Decl.hpp b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Gemm_Decl.hpp index 9f4b7455612e..eabd5c42c26b 100644 --- a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Gemm_Decl.hpp +++ b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Gemm_Decl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_GEMM_DECL_HPP__ -#define __KOKKOSBATCHED_GEMM_DECL_HPP__ +#ifndef KOKKOSBATCHED_GEMM_DECL_HPP +#define KOKKOSBATCHED_GEMM_DECL_HPP #include "KokkosBatched_Vector.hpp" @@ -75,4 +75,4 @@ struct Gemm { #include "KokkosBatched_Gemm_Team_Impl.hpp" #include "KokkosBatched_Gemm_TeamVector_Impl.hpp" -#endif // __KOKKOSBATCHED_GEMM_DECL_HPP__ +#endif // KOKKOSBATCHED_GEMM_DECL_HPP diff --git a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Gemv_Decl.hpp b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Gemv_Decl.hpp index 9ab86d9e0727..907f684a747d 100644 --- a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Gemv_Decl.hpp +++ b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Gemv_Decl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_GEMV_DECL_HPP__ -#define __KOKKOSBATCHED_GEMV_DECL_HPP__ +#ifndef KOKKOSBATCHED_GEMV_DECL_HPP +#define KOKKOSBATCHED_GEMV_DECL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Gesv.hpp b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Gesv.hpp index 77922e4da02c..c0cceb3819df 100644 --- a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Gesv.hpp +++ b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Gesv.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_GESV_HPP__ -#define __KOKKOSBATCHED_GESV_HPP__ +#ifndef KOKKOSBATCHED_GESV_HPP +#define KOKKOSBATCHED_GESV_HPP /// \author Kim Liegeois (knliege@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_HadamardProduct.hpp b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_HadamardProduct.hpp index f21aa8bae257..d8076dbe3c82 100644 --- a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_HadamardProduct.hpp +++ b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_HadamardProduct.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_HADAMARDPRODUCT_HPP__ -#define __KOKKOSBATCHED_HADAMARDPRODUCT_HPP__ +#ifndef KOKKOSBATCHED_HADAMARDPRODUCT_HPP +#define KOKKOSBATCHED_HADAMARDPRODUCT_HPP /// \author Kim Liegeois (knliege@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_HostLevel_Gemm.hpp b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_HostLevel_Gemm.hpp index 0741b5b41e8a..2f3cf273f437 100644 --- a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_HostLevel_Gemm.hpp +++ b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_HostLevel_Gemm.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_HOSTLEVEL_GEMM_DECL_HPP__ -#define __KOKKOSBATCHED_HOSTLEVEL_GEMM_DECL_HPP__ +#ifndef KOKKOSBATCHED_HOSTLEVEL_GEMM_DECL_HPP +#define KOKKOSBATCHED_HOSTLEVEL_GEMM_DECL_HPP // Include explicit specializations of BatchedGemm. // If ETI_ONLY is disabled, the primary template will @@ -101,4 +101,4 @@ inline int BatchedGemm(BatchedGemmHandleType *const handle, const ScalarType alp UnifiedBVT, UnifiedCVT>::run(handle, alpha, A, B, beta, C); } } // namespace KokkosBatched -#endif // __KOKKOSBATCHED_HOSTLEVEL_GEMM_DECL_HPP__ +#endif // KOKKOSBATCHED_HOSTLEVEL_GEMM_DECL_HPP diff --git a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_HostLevel_Gemm_Handle.hpp b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_HostLevel_Gemm_Handle.hpp index 2aa6f47cb0f5..87b605dc1e3e 100644 --- a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_HostLevel_Gemm_Handle.hpp +++ b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_HostLevel_Gemm_Handle.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef __KOKKOSBATCHED_HOSTLEVEL_GEMM_HANDLE_DECL_HPP__ -#define __KOKKOSBATCHED_HOSTLEVEL_GEMM_HANDLE_DECL_HPP__ +#ifndef KOKKOSBATCHED_HOSTLEVEL_GEMM_HANDLE_DECL_HPP +#define KOKKOSBATCHED_HOSTLEVEL_GEMM_HANDLE_DECL_HPP #include "KokkosBatched_Kernel_Handle.hpp" @@ -151,4 +151,4 @@ class BatchedGemmHandle : public BatchedKernelHandle { } // namespace KokkosBatched -#endif // __KOKKOSBATCHED_HOSTLEVEL_GEMM_HANDLE_DECL_HPP__ +#endif // KOKKOSBATCHED_HOSTLEVEL_GEMM_HANDLE_DECL_HPP diff --git a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Householder_Decl.hpp b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Householder_Decl.hpp index 0a4845755192..3e26d6982af6 100644 --- a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Householder_Decl.hpp +++ b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Householder_Decl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_HOUSEHOLDER_DECL_HPP__ -#define __KOKKOSBATCHED_HOUSEHOLDER_DECL_HPP__ +#ifndef KOKKOSBATCHED_HOUSEHOLDER_DECL_HPP +#define KOKKOSBATCHED_HOUSEHOLDER_DECL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Iamax.hpp b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Iamax.hpp new file mode 100644 index 000000000000..c388ca943f1f --- /dev/null +++ b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Iamax.hpp @@ -0,0 +1,43 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef KOKKOSBATCHED_IAMAX_HPP_ +#define KOKKOSBATCHED_IAMAX_HPP_ + +/// \author Yuuichi Asahi (yuuichi.asahi@cea.fr) + +namespace KokkosBatched { + +/// \brief Serial Batched Iamax: +/// Iamax finds the index of the first element having maximum absolute value. +/// +/// \tparam XViewType: Input view type, needs to be a 1D view +/// +/// \param X [in]: Input view type +/// +/// \return The index of the first element having maximum absolute value +/// As well as Blas, this returns 0 (0 in Fortran) for an empty vector +/// No nested parallel_for is used inside of the function. +/// + +struct SerialIamax { + template + KOKKOS_INLINE_FUNCTION static typename XViewType::size_type invoke(const XViewType &x); +}; +} // namespace KokkosBatched + +#include "KokkosBatched_Iamax_Serial_Impl.hpp" + +#endif // KOKKOSBATCHED_IAMAX_HPP_ diff --git a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_InnerGemmFixA_Decl.hpp b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_InnerGemmFixA_Decl.hpp index 757a92ca211b..9aa7ac614b9a 100644 --- a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_InnerGemmFixA_Decl.hpp +++ b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_InnerGemmFixA_Decl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_INNER_GEMM_FIX_A_DECL_HPP__ -#define __KOKKOSBATCHED_INNER_GEMM_FIX_A_DECL_HPP__ +#ifndef KOKKOSBATCHED_INNER_GEMM_FIX_A_DECL_HPP +#define KOKKOSBATCHED_INNER_GEMM_FIX_A_DECL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_InnerGemmFixB_Decl.hpp b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_InnerGemmFixB_Decl.hpp index b2f885970fb8..9ca74934dc1e 100644 --- a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_InnerGemmFixB_Decl.hpp +++ b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_InnerGemmFixB_Decl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_INNER_GEMM_FIX_B_DECL_HPP__ -#define __KOKKOSBATCHED_INNER_GEMM_FIX_B_DECL_HPP__ +#ifndef KOKKOSBATCHED_INNER_GEMM_FIX_B_DECL_HPP +#define KOKKOSBATCHED_INNER_GEMM_FIX_B_DECL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_InnerGemmFixC_Decl.hpp b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_InnerGemmFixC_Decl.hpp index c61d966f7701..31ba2a03d9b5 100644 --- a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_InnerGemmFixC_Decl.hpp +++ b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_InnerGemmFixC_Decl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_INNER_GEMM_FIX_C_DECL_HPP__ -#define __KOKKOSBATCHED_INNER_GEMM_FIX_C_DECL_HPP__ +#ifndef KOKKOSBATCHED_INNER_GEMM_FIX_C_DECL_HPP +#define KOKKOSBATCHED_INNER_GEMM_FIX_C_DECL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_InnerLU_Decl.hpp b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_InnerLU_Decl.hpp index c355185b742e..1d7c9879d309 100644 --- a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_InnerLU_Decl.hpp +++ b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_InnerLU_Decl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_INNER_LU_DECL_HPP__ -#define __KOKKOSBATCHED_INNER_LU_DECL_HPP__ +#ifndef KOKKOSBATCHED_INNER_LU_DECL_HPP +#define KOKKOSBATCHED_INNER_LU_DECL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_InnerTrsm_Decl.hpp b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_InnerTrsm_Decl.hpp index 5b5b9bb147ea..b9e62aa392f6 100644 --- a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_InnerTrsm_Decl.hpp +++ b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_InnerTrsm_Decl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_INNER_TRSM_DECL_HPP__ -#define __KOKKOSBATCHED_INNER_TRSM_DECL_HPP__ +#ifndef KOKKOSBATCHED_INNER_TRSM_DECL_HPP +#define KOKKOSBATCHED_INNER_TRSM_DECL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_InverseLU_Decl.hpp b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_InverseLU_Decl.hpp index 930bc790b033..141e2b972210 100644 --- a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_InverseLU_Decl.hpp +++ b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_InverseLU_Decl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_INVERSELU_DECL_HPP__ -#define __KOKKOSBATCHED_INVERSELU_DECL_HPP__ +#ifndef KOKKOSBATCHED_INVERSELU_DECL_HPP +#define KOKKOSBATCHED_INVERSELU_DECL_HPP /// \author Vinh Dang (vqdang@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_LU_Decl.hpp b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_LU_Decl.hpp index 363193c14784..b4cc5d35e18a 100644 --- a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_LU_Decl.hpp +++ b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_LU_Decl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_LU_DECL_HPP__ -#define __KOKKOSBATCHED_LU_DECL_HPP__ +#ifndef KOKKOSBATCHED_LU_DECL_HPP +#define KOKKOSBATCHED_LU_DECL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Laswp.hpp b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Laswp.hpp new file mode 100644 index 000000000000..1818d456a83c --- /dev/null +++ b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Laswp.hpp @@ -0,0 +1,52 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef KOKKOSBATCHED_LASWP_HPP_ +#define KOKKOSBATCHED_LASWP_HPP_ + +#include + +/// \author Yuuichi Asahi (yuuichi.asahi@cea.fr) + +namespace KokkosBatched { + +/// \brief Serial Batched Laswp: +/// +/// performs a series of row interchanges on the matrix A. +/// One row interchange is initiated for each of rows K1 through K2 of A. +/// +/// \tparam PivViewType: Input type for the a superdiagonal matrix, needs to +/// be a 1D view +/// \tparam AViewType: Input type for the vector or matrix, needs to be a 1D or +/// 2D view +/// +/// \param piv [in]: The pivot indices; for 0 <= i < N, row i of the +/// matrix was interchanged with row piv(i). +/// \param A [inout]: A is a lda by n matrix. The matrix of column dimension N +/// to which the row interchanges will be applied. +/// +/// No nested parallel_for is used inside of the function. +/// + +template +struct SerialLaswp { + template + KOKKOS_INLINE_FUNCTION static int invoke(const PivViewType &piv, const AViewType &A); +}; +} // namespace KokkosBatched + +#include "KokkosBatched_Laswp_Serial_Impl.hpp" + +#endif // KOKKOSBATCHED_LASWP_HPP_ diff --git a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Pbtrf.hpp b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Pbtrf.hpp new file mode 100644 index 000000000000..879dfc0db32d --- /dev/null +++ b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Pbtrf.hpp @@ -0,0 +1,54 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef KOKKOSBATCHED_PBTRF_HPP_ +#define KOKKOSBATCHED_PBTRF_HPP_ + +#include + +/// \author Yuuichi Asahi (yuuichi.asahi@cea.fr) + +namespace KokkosBatched { + +/// \brief Serial Batched Pbtrf: +/// Compute the Cholesky factorization U**H * U (or L * L**H) of a real +/// symmetric (or complex Hermitian) positive definite banded matrix A_l +/// for all l = 0, ..., +/// The factorization has the form +/// A = U**T * U , if ArgUplo = KokkosBatched::Uplo::Upper, or +/// A = L * L**T, if ArgUplo = KokkosBatched::Uplo::Lower, +/// where U is an upper triangular matrix, U**T is the transpose of U, and +/// L is lower triangular. +/// This is the unblocked version of the algorithm, calling Level 2 BLAS. +/// +/// \tparam ABViewType: Input type for a banded matrix, needs to be a 2D +/// view +/// +/// \param ab [inout]: ab is a ldab by n banded matrix, with ( kd + 1 ) diagonals +/// +/// No nested parallel_for is used inside of the function. +/// + +template +struct SerialPbtrf { + template + KOKKOS_INLINE_FUNCTION static int invoke(const ABViewType &ab); +}; + +} // namespace KokkosBatched + +#include "KokkosBatched_Pbtrf_Serial_Impl.hpp" + +#endif // KOKKOSBATCHED_PBTRF_HPP_ diff --git a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Pbtrs.hpp b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Pbtrs.hpp new file mode 100644 index 000000000000..cdacb52b00a1 --- /dev/null +++ b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Pbtrs.hpp @@ -0,0 +1,56 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef KOKKOSBATCHED_PBTRS_HPP_ +#define KOKKOSBATCHED_PBTRS_HPP_ + +#include + +/// \author Yuuichi Asahi (yuuichi.asahi@cea.fr) + +namespace KokkosBatched { + +/// \brief Serial Batched Pbtrs: +/// Solve Ab_l x_l = b_l for all l = 0, ..., N +/// using the Cholesky factorization A = U**H * U or A = L * L**H computed by +/// Pbtrf. +/// The matrix has the form +/// A = U**H * U , if ArgUplo = KokkosBatched::Uplo::Upper, or +/// A = L * L**H, if ArgUplo = KokkosBatched::Uplo::Lower, +/// where U is an upper triangular matrix, U**H is the transpose of U, and +/// L is lower triangular matrix, L**H is the transpose of L. +/// +/// \tparam ABViewType: Input type for a banded matrix, needs to be a 2D +/// view +/// \tparam BViewType: Input type for a right-hand side and the solution, +/// needs to be a 1D view +/// +/// \param ab [in]: ab is a ldab by n banded matrix, with ( kd + 1 ) diagonals +/// \param b [inout]: right-hand side and the solution, a rank 1 view +/// +/// No nested parallel_for is used inside of the function. +/// + +template +struct SerialPbtrs { + template + KOKKOS_INLINE_FUNCTION static int invoke(const ABViewType &ab, const BViewType &b); +}; + +} // namespace KokkosBatched + +#include "KokkosBatched_Pbtrs_Serial_Impl.hpp" + +#endif // KOKKOSBATCHED_PBTRS_HPP_ diff --git a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Pttrs.hpp b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Pttrs.hpp new file mode 100644 index 000000000000..17b7d45f96a1 --- /dev/null +++ b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Pttrs.hpp @@ -0,0 +1,54 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef KOKKOSBATCHED_PTTRS_HPP_ +#define KOKKOSBATCHED_PTTRS_HPP_ + +#include + +/// \author Yuuichi Asahi (yuuichi.asahi@cea.fr) + +namespace KokkosBatched { + +/// \brief Serial Batched Pttrs: +/// Solve Ab_l x_l = b_l for all l = 0, ..., N +/// using the factorization A = U**H * D * U or A = L * D * L**H computed by +/// Pttrf. +/// +/// \tparam DViewType: Input type for the a diagonal matrix, needs to be a 1D +/// view +/// \tparam EViewType: Input type for the a upper/lower diagonal matrix, +/// needs to be a 1D view +/// \tparam BViewType: Input type for the right-hand side and the solution, +/// needs to be a 1D view +/// +/// \param d [in]: n diagonal elements of the diagonal matrix D +/// \param e [in]: n-1 upper/lower diagonal elements of the diagonal matrix E +/// \param b [inout]: right-hand side and the solution, a rank 1 view +/// +/// No nested parallel_for is used inside of the function. +/// + +template +struct SerialPttrs { + template + KOKKOS_INLINE_FUNCTION static int invoke(const DViewType &d, const EViewType &e, const BViewType &b); +}; + +} // namespace KokkosBatched + +#include "KokkosBatched_Pttrs_Serial_Impl.hpp" + +#endif // KOKKOSBATCHED_PTTRS_HPP_ diff --git a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_QR_Decl.hpp b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_QR_Decl.hpp index 78bdcd4d4b12..4f76d6e70254 100644 --- a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_QR_Decl.hpp +++ b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_QR_Decl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_QR_DECL_HPP__ -#define __KOKKOSBATCHED_QR_DECL_HPP__ +#ifndef KOKKOSBATCHED_QR_DECL_HPP +#define KOKKOSBATCHED_QR_DECL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_QR_WithColumnPivoting_Decl.hpp b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_QR_WithColumnPivoting_Decl.hpp index b08e5277a0a5..3c663f237e98 100644 --- a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_QR_WithColumnPivoting_Decl.hpp +++ b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_QR_WithColumnPivoting_Decl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_QR_WITH_COLUMNPIVOTING_DECL_HPP__ -#define __KOKKOSBATCHED_QR_WITH_COLUMNPIVOTING_DECL_HPP__ +#ifndef KOKKOSBATCHED_QR_WITH_COLUMNPIVOTING_DECL_HPP +#define KOKKOSBATCHED_QR_WITH_COLUMNPIVOTING_DECL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_SVD_Decl.hpp b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_SVD_Decl.hpp index efade8029bb7..a022d826cc85 100644 --- a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_SVD_Decl.hpp +++ b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_SVD_Decl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_SVD_DECL_HPP__ -#define __KOKKOSBATCHED_SVD_DECL_HPP__ +#ifndef KOKKOSBATCHED_SVD_DECL_HPP +#define KOKKOSBATCHED_SVD_DECL_HPP /// \author Brian Kelley (bmkelle@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Scale_Decl.hpp b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Scale_Decl.hpp index 94453a5ede49..188c1374afac 100644 --- a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Scale_Decl.hpp +++ b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Scale_Decl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_SCALE_DECL_HPP__ -#define __KOKKOSBATCHED_SCALE_DECL_HPP__ +#ifndef KOKKOSBATCHED_SCALE_DECL_HPP +#define KOKKOSBATCHED_SCALE_DECL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_SetIdentity_Decl.hpp b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_SetIdentity_Decl.hpp index 27c2b22ed7fc..26d8ec39326e 100644 --- a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_SetIdentity_Decl.hpp +++ b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_SetIdentity_Decl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_SET_IDENTITY_DECL_HPP__ -#define __KOKKOSBATCHED_SET_IDENTITY_DECL_HPP__ +#ifndef KOKKOSBATCHED_SET_IDENTITY_DECL_HPP +#define KOKKOSBATCHED_SET_IDENTITY_DECL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Set_Decl.hpp b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Set_Decl.hpp index d33d186275c0..21dbd8b91abd 100644 --- a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Set_Decl.hpp +++ b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Set_Decl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_SET_DECL_HPP__ -#define __KOKKOSBATCHED_SET_DECL_HPP__ +#ifndef KOKKOSBATCHED_SET_DECL_HPP +#define KOKKOSBATCHED_SET_DECL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_SolveLU_Decl.hpp b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_SolveLU_Decl.hpp index 119f5c691648..1d0798001600 100644 --- a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_SolveLU_Decl.hpp +++ b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_SolveLU_Decl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_SOLVELU_DECL_HPP__ -#define __KOKKOSBATCHED_SOLVELU_DECL_HPP__ +#ifndef KOKKOSBATCHED_SOLVELU_DECL_HPP +#define KOKKOSBATCHED_SOLVELU_DECL_HPP /// \author Vinh Dang (vqdang@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_SolveUTV_Decl.hpp b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_SolveUTV_Decl.hpp index c881a0b0f7d2..344755bbf88b 100644 --- a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_SolveUTV_Decl.hpp +++ b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_SolveUTV_Decl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_SOLVE_UTV_DECL_HPP__ -#define __KOKKOSBATCHED_SOLVE_UTV_DECL_HPP__ +#ifndef KOKKOSBATCHED_SOLVE_UTV_DECL_HPP +#define KOKKOSBATCHED_SOLVE_UTV_DECL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Trmm_Decl.hpp b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Trmm_Decl.hpp index c284ed63b259..1444dad486e1 100644 --- a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Trmm_Decl.hpp +++ b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Trmm_Decl.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef __KOKKOSBATCHED_TRMM_DECL_HPP__ -#define __KOKKOSBATCHED_TRMM_DECL_HPP__ +#ifndef KOKKOSBATCHED_TRMM_DECL_HPP +#define KOKKOSBATCHED_TRMM_DECL_HPP #include "KokkosBatched_Util.hpp" #include "KokkosBatched_Vector.hpp" @@ -28,4 +28,4 @@ struct SerialTrmm { KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const BViewType &B); }; } // namespace KokkosBatched -#endif // __KOKKOSBATCHED_TRMM_DECL_HPP__ +#endif // KOKKOSBATCHED_TRMM_DECL_HPP diff --git a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Trsm_Decl.hpp b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Trsm_Decl.hpp index d2220953cce4..f06cb6245e39 100644 --- a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Trsm_Decl.hpp +++ b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Trsm_Decl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_TRSM_DECL_HPP__ -#define __KOKKOSBATCHED_TRSM_DECL_HPP__ +#ifndef KOKKOSBATCHED_TRSM_DECL_HPP +#define KOKKOSBATCHED_TRSM_DECL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Trsv_Decl.hpp b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Trsv_Decl.hpp index e3da43a95d32..d711085c9bdb 100644 --- a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Trsv_Decl.hpp +++ b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Trsv_Decl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_TRSV_DECL_HPP__ -#define __KOKKOSBATCHED_TRSV_DECL_HPP__ +#ifndef KOKKOSBATCHED_TRSV_DECL_HPP +#define KOKKOSBATCHED_TRSV_DECL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Trtri_Decl.hpp b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Trtri_Decl.hpp index 8c8e6121a3ad..f650fd1cce77 100644 --- a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Trtri_Decl.hpp +++ b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Trtri_Decl.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef __KOKKOSBATCHED_TRTRI_DECL_HPP__ -#define __KOKKOSBATCHED_TRTRI_DECL_HPP__ +#ifndef KOKKOSBATCHED_TRTRI_DECL_HPP +#define KOKKOSBATCHED_TRTRI_DECL_HPP #include "KokkosBatched_Util.hpp" #include "KokkosBatched_Vector.hpp" @@ -28,4 +28,4 @@ struct SerialTrtri { KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A); }; } // namespace KokkosBatched -#endif // __KOKKOSBATCHED_TRTRI_DECL_HPP__ +#endif // KOKKOSBATCHED_TRTRI_DECL_HPP diff --git a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_UTV_Decl.hpp b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_UTV_Decl.hpp index bae2780e1089..318f5ddc98fd 100644 --- a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_UTV_Decl.hpp +++ b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_UTV_Decl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_UTV_DECL_HPP__ -#define __KOKKOSBATCHED_UTV_DECL_HPP__ +#ifndef KOKKOSBATCHED_UTV_DECL_HPP +#define KOKKOSBATCHED_UTV_DECL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Vector.hpp b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Vector.hpp index e44af7bc04fc..1eafbfc9ad5c 100644 --- a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Vector.hpp +++ b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Vector.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_VECTOR_HPP__ -#define __KOKKOSBATCHED_VECTOR_HPP__ +#ifndef KOKKOSBATCHED_VECTOR_HPP +#define KOKKOSBATCHED_VECTOR_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Vector_SIMD.hpp b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Vector_SIMD.hpp index 52a73deda4f6..82c6e72cb608 100644 --- a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Vector_SIMD.hpp +++ b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Vector_SIMD.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_VECTOR_SIMD_HPP__ -#define __KOKKOSBATCHED_VECTOR_SIMD_HPP__ +#ifndef KOKKOSBATCHED_VECTOR_SIMD_HPP +#define KOKKOSBATCHED_VECTOR_SIMD_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Xpay.hpp b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Xpay.hpp index 51418fd81a87..c8d7c374f321 100644 --- a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Xpay.hpp +++ b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Xpay.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_XPAY_HPP__ -#define __KOKKOSBATCHED_XPAY_HPP__ +#ifndef KOKKOSBATCHED_XPAY_HPP +#define KOKKOSBATCHED_XPAY_HPP /// \author Kim Liegeois (knliege@sandia.gov) diff --git a/packages/kokkos-kernels/batched/dense/unit_test/Test_Batched_Dense.hpp b/packages/kokkos-kernels/batched/dense/unit_test/Test_Batched_Dense.hpp index 76215b58f836..2378e5ff0115 100644 --- a/packages/kokkos-kernels/batched/dense/unit_test/Test_Batched_Dense.hpp +++ b/packages/kokkos-kernels/batched/dense/unit_test/Test_Batched_Dense.hpp @@ -52,6 +52,17 @@ #include "Test_Batched_SerialPttrf.hpp" #include "Test_Batched_SerialPttrf_Real.hpp" #include "Test_Batched_SerialPttrf_Complex.hpp" +#include "Test_Batched_SerialPttrs.hpp" +#include "Test_Batched_SerialPttrs_Real.hpp" +#include "Test_Batched_SerialPttrs_Complex.hpp" +#include "Test_Batched_SerialPbtrf.hpp" +#include "Test_Batched_SerialPbtrf_Real.hpp" +#include "Test_Batched_SerialPbtrf_Complex.hpp" +#include "Test_Batched_SerialPbtrs.hpp" +#include "Test_Batched_SerialPbtrs_Real.hpp" +#include "Test_Batched_SerialPbtrs_Complex.hpp" +#include "Test_Batched_SerialLaswp.hpp" +#include "Test_Batched_SerialIamax.hpp" // Team Kernels #include "Test_Batched_TeamAxpy.hpp" diff --git a/packages/kokkos-kernels/batched/dense/unit_test/Test_Batched_DenseUtils.hpp b/packages/kokkos-kernels/batched/dense/unit_test/Test_Batched_DenseUtils.hpp index f536f220d3a5..eccdba50d105 100644 --- a/packages/kokkos-kernels/batched/dense/unit_test/Test_Batched_DenseUtils.hpp +++ b/packages/kokkos-kernels/batched/dense/unit_test/Test_Batched_DenseUtils.hpp @@ -145,6 +145,170 @@ void create_diagonal_matrix(InViewType& in, OutViewType& out, int k = 0) { Kokkos::deep_copy(out, h_out); } +/// \brief Creates a positive definite symmetric (PDS) matrix. +/// Takes a full random matrix and converts it to a full pds matrix. +/// +/// \tparam InViewType: Input type for the matrix, needs to be a 3D view +/// \tparam OutViewType: Output type for the matrix, needs to be a 3D view +/// +/// \param in [in]: Input batched banded matrix, a rank 3 view +/// \param out [out]: Output batched full matrix, a rank 3 view +/// +template +void random_to_pds(InViewType& in, OutViewType& out) { + auto h_in = Kokkos::create_mirror_view(in); + auto h_out = Kokkos::create_mirror_view(out); + const int N = in.extent(0), BlkSize = in.extent(1); + using value_type = typename InViewType::non_const_value_type; + + for (std::size_t i = 0; i < InViewType::rank(); i++) { + assert(out.extent(i) == in.extent(i)); + } + + Kokkos::deep_copy(h_in, in); + Kokkos::deep_copy(h_out, 0.0); + + for (int i0 = 0; i0 < N; i0++) { + // Make a hermitian matrix + for (int i1 = 0; i1 < BlkSize; i1++) { + for (int i2 = i1; i2 < BlkSize; i2++) { + if (i1 == i2) { + // Diagonal elements must be real + h_out(i0, i1, i2) = Kokkos::ArithTraits::real(h_in(i0, i1, i2)); + } else { + // Off-diagonal elements are complex and Hermitian + h_out(i0, i1, i2) = h_in(i0, i1, i2); + h_out(i0, i2, i1) = Kokkos::ArithTraits::conj(h_in(i0, i1, i2)); + } + } + } + + // Make matrix diagonal dominant + for (int i1 = 0; i1 < BlkSize; i1++) { + value_type sum = 0; + for (int i2 = 0; i2 < BlkSize; i2++) { + if (i1 != i2) { + sum += Kokkos::abs(h_out(i0, i1, i2)); + } + } + h_out(i0, i1, i1) = sum + 1.0; + } + } + Kokkos::deep_copy(out, h_out); +} + +/// \brief Creates a banded positive definite symmetric (PDS) matrix. +/// Takes a full diagonal dominant matrix and converts it to a banded pds matrix either +/// in banded or full storage. +/// +/// \tparam InViewType: Input type for the matrix, needs to be a 3D view +/// \tparam OutViewType: Output type for the matrix, needs to be a 3D view +/// \tparam UploType: Type indicating whether the matrix is upper or lower triangular +/// +/// \param in [in]: Input batched banded matrix, a rank 3 view +/// \param out [out]: Output batched full matrix, a rank 3 view +/// \param k [in]: Number of sub/super-diagonals for lower/upper triangular (default is 1) +/// \param band_storage [in]: Boolean flag indicating whether the output should be in banded storage format (default is +/// true) +template +void create_banded_pds_matrix(InViewType& in, OutViewType& out, int k = 1, bool band_storage = true) { + auto h_in = Kokkos::create_mirror_view(in); + auto h_out = Kokkos::create_mirror_view(out); + using value_type = typename InViewType::non_const_value_type; + const int N = in.extent(0), BlkSize = in.extent(1); + + Kokkos::deep_copy(h_in, in); + + if (band_storage) { + assert(out.extent(0) == in.extent(0)); + assert(out.extent(1) == static_cast(k + 1)); + assert(out.extent(2) == in.extent(2)); + if constexpr (std::is_same_v) { + for (int i0 = 0; i0 < N; i0++) { + for (int i1 = 0; i1 < k + 1; i1++) { + for (int i2 = i1; i2 < BlkSize; i2++) { + h_out(i0, k - i1, i2) = h_in(i0, i2 - i1, i2); + } + } + } + } else { + for (int i0 = 0; i0 < N; i0++) { + for (int i1 = 0; i1 < k + 1; i1++) { + for (int i2 = 0; i2 < BlkSize - i1; i2++) { + h_out(i0, i1, i2) = h_in(i0, i2 + i1, i2); + } + } + } + } + } else { + for (std::size_t i = 0; i < InViewType::rank(); i++) { + assert(out.extent(i) == in.extent(i)); + } + + if constexpr (std::is_same_v) { + for (int i0 = 0; i0 < N; i0++) { + for (int i1 = 0; i1 < BlkSize; i1++) { + for (int i2 = i1; i2 < Kokkos::min(i1 + k + 1, BlkSize); i2++) { + h_out(i0, i1, i2) = h_in(i0, i1, i2); + h_out(i0, i2, i1) = Kokkos::ArithTraits::conj(h_in(i0, i1, i2)); + } + } + } + } else { + for (int i0 = 0; i0 < N; i0++) { + for (int i1 = 0; i1 < BlkSize; i1++) { + for (int i2 = Kokkos::max(0, i1 - k); i2 <= i1; i2++) { + h_out(i0, i1, i2) = h_in(i0, i1, i2); + h_out(i0, i2, i1) = Kokkos::ArithTraits::conj(h_in(i0, i1, i2)); + } + } + } + } + } + Kokkos::deep_copy(out, h_out); +} + +/// \brief Converts a banded matrix to a full matrix. +/// Takes a banded matrix in banded storage and converts it to a full matrix. +/// +/// \tparam InViewType: Input type for the matrix, needs to be a 3D view +/// \tparam OutViewType: Output type for the matrix, needs to be a 3D view +/// \tparam UploType: Type indicating whether the matrix is upper or lower triangular +/// +/// \param in [in]: Input batched banded matrix, a rank 3 view +/// \param out [out]: Output batched full matrix, a rank 3 view +/// \param k [in]: Number of sub/super-diagonals for lower/upper triangular (default is 1) +/// +template +void banded_to_full(InViewType& in, OutViewType& out, int k = 1) { + auto h_in = Kokkos::create_mirror_view(in); + auto h_out = Kokkos::create_mirror_view(out); + const int N = in.extent(0), BlkSize = in.extent(2); + + Kokkos::deep_copy(h_in, in); + assert(in.extent(0) == out.extent(0)); + assert(in.extent(1) == static_cast(k + 1)); + assert(in.extent(2) == out.extent(2)); + if constexpr (std::is_same_v) { + for (int i0 = 0; i0 < N; i0++) { + for (int i1 = 0; i1 < k + 1; i1++) { + for (int i2 = i1; i2 < BlkSize; i2++) { + h_out(i0, i2 - i1, i2) = h_in(i0, k - i1, i2); + } + } + } + } else { + for (int i0 = 0; i0 < N; i0++) { + for (int i1 = 0; i1 < k + 1; i1++) { + for (int i2 = 0; i2 < BlkSize - i1; i2++) { + h_out(i0, i2 + i1, i2) = h_in(i0, i1, i2); + } + } + } + } + Kokkos::deep_copy(out, h_out); +} + } // namespace KokkosBatched #endif // TEST_BATCHED_DENSE_HELPER_HPP diff --git a/packages/kokkos-kernels/batched/dense/unit_test/Test_Batched_SerialGemm.hpp b/packages/kokkos-kernels/batched/dense/unit_test/Test_Batched_SerialGemm.hpp index 144bb2251e78..0b2ed4a1623b 100644 --- a/packages/kokkos-kernels/batched/dense/unit_test/Test_Batched_SerialGemm.hpp +++ b/packages/kokkos-kernels/batched/dense/unit_test/Test_Batched_SerialGemm.hpp @@ -25,6 +25,7 @@ #include "KokkosBatched_Gemm_Serial_Impl.hpp" #include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_TestVanilla.hpp" using namespace KokkosBatched; diff --git a/packages/kokkos-kernels/batched/dense/unit_test/Test_Batched_SerialIamax.hpp b/packages/kokkos-kernels/batched/dense/unit_test/Test_Batched_SerialIamax.hpp new file mode 100644 index 000000000000..38e9e78e04d4 --- /dev/null +++ b/packages/kokkos-kernels/batched/dense/unit_test/Test_Batched_SerialIamax.hpp @@ -0,0 +1,277 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +/// \author Yuuichi Asahi (yuuichi.asahi@cea.fr) +#include +#include +#include + +#include "KokkosBatched_Util.hpp" +#include "KokkosBatched_Iamax.hpp" + +using namespace KokkosBatched; + +namespace Test { +namespace Iamax { + +template +struct Functor_BatchedSerialIamax { + using execution_space = typename DeviceType::execution_space; + XViewType m_x; + RViewType m_r; + + KOKKOS_INLINE_FUNCTION + Functor_BatchedSerialIamax(const XViewType &x, const RViewType &r) : m_x(x), m_r(r) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int k) const { + auto sub_x = Kokkos::subview(m_x, k, Kokkos::ALL()); + auto iamax = KokkosBatched::SerialIamax::invoke(sub_x); + m_r(k) = static_cast(iamax); + } + + inline void run() { + using value_type = typename XViewType::non_const_value_type; + std::string name_region("KokkosBatched::Test::SerialIamax"); + std::string name_value_type = Test::value_type_name(); + std::string name = name_region + name_value_type; + Kokkos::Profiling::pushRegion(name.c_str()); + Kokkos::RangePolicy policy(0, m_x.extent(0)); + Kokkos::parallel_for(name.c_str(), policy, *this); + Kokkos::Profiling::popRegion(); + } +}; + +/// \brief Implementation details of batched iamax analytical test +/// A0: [1, 2, 0] -> 1 +/// A1: [-5, 4, 3] -> 0 +/// A2: [0, 0, 0] -> 0 +/// A3: [0, -1, -1] -> 1 +/// +/// \param N [in] Batch size of A +template +void impl_test_batched_iamax_analytical(const std::size_t N) { + using View2DType = Kokkos::View; + using StridedView2DType = Kokkos::View; + using MaxView1DType = Kokkos::View; + + View2DType A0("A0", N, 3), A1("A1", N, 3), A2("A2", N, 3), A3("A3", N, 3); + MaxView1DType iamax0("iamax0", N), iamax_ref0("iamax_ref0", N), iamax1("iamax1", N), iamax_ref1("iamax_ref1", N), + iamax2("iamax2", N), iamax_ref2("iamax_ref2", N), iamax3("iamax3", N), iamax_ref3("iamax_ref3", N); + + // Testing incx argument with strided views + constexpr std::size_t incx = 2; + Kokkos::LayoutStride layout{N, incx, 3, N * incx}; + StridedView2DType A0_s("A0_s", layout), A1_s("A1_s", layout), A2_s("A2_s", layout), A3_s("A3_s", layout); + MaxView1DType iamax_s0("iamax_s0", N), iamax_s1("iamax_s1", N), iamax_s2("iamax_s2", N), iamax_s3("iamax_s3", N); + + // Initialize A0, A1, A2, A3 + auto h_A0 = Kokkos::create_mirror_view(A0); + auto h_A1 = Kokkos::create_mirror_view(A1); + auto h_A2 = Kokkos::create_mirror_view(A2); + auto h_A3 = Kokkos::create_mirror_view(A3); + + auto h_iamax_ref0 = Kokkos::create_mirror_view(iamax_ref0); + auto h_iamax_ref1 = Kokkos::create_mirror_view(iamax_ref1); + auto h_iamax_ref2 = Kokkos::create_mirror_view(iamax_ref2); + auto h_iamax_ref3 = Kokkos::create_mirror_view(iamax_ref3); + for (std::size_t k = 0; k < N; k++) { + h_A0(k, 0) = 1; + h_A0(k, 1) = 2; + h_A0(k, 2) = 0; + + h_A1(k, 0) = -5; + h_A1(k, 1) = 4; + h_A1(k, 2) = 3; + + h_A2(k, 0) = 0; + h_A2(k, 1) = 0; + h_A2(k, 2) = 0; + + h_A3(k, 0) = 0; + h_A3(k, 1) = -1; + h_A3(k, 2) = -1; + + h_iamax_ref0(k) = 1; + h_iamax_ref1(k) = 0; + h_iamax_ref2(k) = 0; + h_iamax_ref3(k) = 1; + } + Kokkos::deep_copy(A0, h_A0); + Kokkos::deep_copy(A1, h_A1); + Kokkos::deep_copy(A2, h_A2); + Kokkos::deep_copy(A3, h_A3); + + // Strided view can be copied only on the same device + Kokkos::deep_copy(A0_s, A0); + Kokkos::deep_copy(A1_s, A1); + Kokkos::deep_copy(A2_s, A2); + Kokkos::deep_copy(A3_s, A3); + + Functor_BatchedSerialIamax(A0, iamax0).run(); + Functor_BatchedSerialIamax(A1, iamax1).run(); + Functor_BatchedSerialIamax(A2, iamax2).run(); + Functor_BatchedSerialIamax(A3, iamax3).run(); + + // For strided views + Functor_BatchedSerialIamax(A0_s, iamax_s0).run(); + Functor_BatchedSerialIamax(A1_s, iamax_s1).run(); + Functor_BatchedSerialIamax(A2_s, iamax_s2).run(); + Functor_BatchedSerialIamax(A3_s, iamax_s3).run(); + + Kokkos::fence(); + + // Copy to host for comparison + auto h_iamax0 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), iamax0); + auto h_iamax1 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), iamax1); + auto h_iamax2 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), iamax2); + auto h_iamax3 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), iamax3); + auto h_iamax_s0 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), iamax_s0); + auto h_iamax_s1 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), iamax_s1); + auto h_iamax_s2 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), iamax_s2); + auto h_iamax_s3 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), iamax_s3); + + // Check if max index is correct + for (std::size_t k = 0; k < N; k++) { + EXPECT_EQ(h_iamax0(k), h_iamax_ref0(k)); + EXPECT_EQ(h_iamax1(k), h_iamax_ref1(k)); + EXPECT_EQ(h_iamax2(k), h_iamax_ref2(k)); + EXPECT_EQ(h_iamax3(k), h_iamax_ref3(k)); + EXPECT_EQ(h_iamax_s0(k), h_iamax_ref0(k)); + EXPECT_EQ(h_iamax_s1(k), h_iamax_ref1(k)); + EXPECT_EQ(h_iamax_s2(k), h_iamax_ref2(k)); + EXPECT_EQ(h_iamax_s3(k), h_iamax_ref3(k)); + } +} + +/// \brief Implementation details of batched pbtrs test +/// +/// \param N [in] Batch size of RHS (banded matrix can also be batched matrix) +/// \param BlkSize [in] Block size of matrix A +template +void impl_test_batched_iamax(const std::size_t N, const std::size_t BlkSize) { + using ats = typename Kokkos::ArithTraits; + using RealType = typename ats::mag_type; + using View2DType = Kokkos::View; + using StridedView2DType = Kokkos::View; + using MaxView1DType = Kokkos::View; + + View2DType A("A", N, BlkSize); + MaxView1DType iamax("iamax", N), iamax_ref("iamax_ref", N); + + // Testing incx argument with strided views + constexpr std::size_t incx = 2; + Kokkos::LayoutStride layout{N, incx, BlkSize, N * incx}; + StridedView2DType A_s("A_s", layout); + MaxView1DType iamax_s("iamax_s", N); + + // Initialize A with random values + using execution_space = typename DeviceType::execution_space; + Kokkos::Random_XorShift64_Pool rand_pool(13718); + ScalarType randStart, randEnd; + + KokkosKernels::Impl::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(A, rand_pool, randStart, randEnd); + + // Strided view can be copied only on the same device + Kokkos::deep_copy(A_s, A); + + Functor_BatchedSerialIamax(A, iamax).run(); + + // For strided views + Functor_BatchedSerialIamax(A_s, iamax_s).run(); + + Kokkos::fence(); + + // Reference + auto h_iamax_ref = Kokkos::create_mirror_view(iamax_ref); + if (BlkSize == 0) { + // As well as blas, we store 0 (0 in Fortran) for empty matrix + for (std::size_t k = 0; k < N; k++) { + h_iamax_ref(k) = 0; + } + } else { + auto h_A = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A); + for (std::size_t k = 0; k < N; k++) { + RealType amax = Kokkos::abs(h_A(k, 0)); + int iamax_tmp = 0; + for (std::size_t i = 1; i < BlkSize; i++) { + const RealType abs_A_i = Kokkos::abs(h_A(k, i)); + if (abs_A_i > amax) { + amax = abs_A_i; + iamax_tmp = static_cast(i); + } + } + h_iamax_ref(k) = iamax_tmp; + } + } + + // Copy to host for comparison + auto h_iamax = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), iamax); + auto h_iamax_s = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), iamax_s); + + // Check if max index is correct + for (std::size_t k = 0; k < N; k++) { + EXPECT_EQ(h_iamax(k), h_iamax_ref(k)); + EXPECT_EQ(h_iamax_s(k), h_iamax_ref(k)); + } +} + +} // namespace Iamax +} // namespace Test + +template +int test_batched_iamax() { +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) + { + using LayoutType = Kokkos::LayoutLeft; + Test::Iamax::impl_test_batched_iamax_analytical(1); + Test::Iamax::impl_test_batched_iamax_analytical(2); + for (std::size_t i = 0; i < 10; i++) { + Test::Iamax::impl_test_batched_iamax(1, i); + Test::Iamax::impl_test_batched_iamax(2, i); + } + } +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) + { + using LayoutType = Kokkos::LayoutRight; + Test::Iamax::impl_test_batched_iamax_analytical(1); + Test::Iamax::impl_test_batched_iamax_analytical(2); + for (std::size_t i = 0; i < 10; i++) { + Test::Iamax::impl_test_batched_iamax(1, i); + Test::Iamax::impl_test_batched_iamax(2, i); + } + } +#endif + + return 0; +} + +#if defined(KOKKOSKERNELS_INST_FLOAT) +TEST_F(TestCategory, test_batched_iamax_float) { test_batched_iamax(); } +#endif + +#if defined(KOKKOSKERNELS_INST_DOUBLE) +TEST_F(TestCategory, test_batched_iamax_double) { test_batched_iamax(); } +#endif + +#if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) +TEST_F(TestCategory, test_batched_iamax_fcomplex) { test_batched_iamax>(); } +#endif + +#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) +TEST_F(TestCategory, test_batched_iamax_dcomplex) { test_batched_iamax>(); } +#endif diff --git a/packages/kokkos-kernels/batched/dense/unit_test/Test_Batched_SerialInverseLU.hpp b/packages/kokkos-kernels/batched/dense/unit_test/Test_Batched_SerialInverseLU.hpp index 6f1115447129..fd0ca336bff0 100644 --- a/packages/kokkos-kernels/batched/dense/unit_test/Test_Batched_SerialInverseLU.hpp +++ b/packages/kokkos-kernels/batched/dense/unit_test/Test_Batched_SerialInverseLU.hpp @@ -143,7 +143,7 @@ void impl_test_batched_inverselu(const int N, const int BlkSize) { /// randomized input testing views AViewType a0("a0", N, BlkSize, BlkSize); AViewType a1("a1", N, BlkSize, BlkSize); - WViewType w("w", N, BlkSize * BlkSize); + WViewType w("w", N, BlkSize * static_cast(BlkSize)); AViewType c0("c0", N, BlkSize, BlkSize); Kokkos::Random_XorShift64_Pool random(13718); diff --git a/packages/kokkos-kernels/batched/dense/unit_test/Test_Batched_SerialLaswp.hpp b/packages/kokkos-kernels/batched/dense/unit_test/Test_Batched_SerialLaswp.hpp new file mode 100644 index 000000000000..c515ad408878 --- /dev/null +++ b/packages/kokkos-kernels/batched/dense/unit_test/Test_Batched_SerialLaswp.hpp @@ -0,0 +1,638 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +/// \author Yuuichi Asahi (yuuichi.asahi@cea.fr) +#include +#include +#include + +#include "KokkosBatched_Util.hpp" +#include "KokkosBatched_Laswp.hpp" + +using namespace KokkosBatched; + +namespace Test { +namespace Laswp { + +template +struct Functor_BatchedSerialLaswp { + using execution_space = typename DeviceType::execution_space; + PivViewType m_ipiv; + AViewType m_a; + + KOKKOS_INLINE_FUNCTION + Functor_BatchedSerialLaswp(const PivViewType &ipiv, const AViewType &a) : m_ipiv(ipiv), m_a(a) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const ArgDirect &, const int k, int &info) const { + auto sub_ipiv = Kokkos::subview(m_ipiv, k, Kokkos::ALL); + if constexpr (AViewType::rank == 3) { + auto sub_a = Kokkos::subview(m_a, k, Kokkos::ALL, Kokkos::ALL); + info += KokkosBatched::SerialLaswp::invoke(sub_ipiv, sub_a); + } else { + auto sub_a = Kokkos::subview(m_a, k, Kokkos::ALL); + info += KokkosBatched::SerialLaswp::invoke(sub_ipiv, sub_a); + } + } + + inline int run() { + using value_type = typename AViewType::non_const_value_type; + std::string name_region("KokkosBatched::Test::SerialLaswp"); + const std::string name_value_type = Test::value_type_name(); + std::string name = name_region + name_value_type; + int info_sum = 0; + Kokkos::Profiling::pushRegion(name.c_str()); + Kokkos::RangePolicy policy(0, m_a.extent(0)); + Kokkos::parallel_reduce(name.c_str(), policy, *this, info_sum); + Kokkos::Profiling::popRegion(); + return info_sum; + } +}; + +/// \brief Implementation details of batched laswp analytical test +/// Confirm A = Ref (permuted), where +/// A0: [[4], +/// [1], +/// [2]] +/// p0: [1, 2, 0] +/// Initial 0<->1 1<->2 2<->0 +/// Forward: [4,1,2] -> [1,4,2] -> [1,2,4] -> [4,2,1] +/// Initial 2<->0 1<->2 0<->1 +/// Backward: [4,1,2] -> [2,1,4] -> [2,4,1] -> [4,2,1] +/// +/// A1: [[4, 1, 5], +/// [2, 3, 7], +/// [6, 0, 8]] +/// p1: [1, 2, 0] +/// Initial 0<->1 1<->2 2<->0 +/// Forward: [[4, 1, 5], -> [[2, 3, 7], -> [[2, 3, 7], -> [[4, 1, 5], +/// [2, 3, 7], [4, 1, 5], [6, 0, 8], [6, 0, 8], +/// [6, 0, 8]] [6, 0, 8]] [4, 1, 5]] [2, 3, 7]] +/// Initial 2<->0 1<->2 0<->1 +/// Backward: [[4, 1, 5], -> [[6, 0, 8], -> [[6, 0, 8], -> [[4, 1, 5], +/// [2, 3, 7], [2, 3, 7], 4, 1, 5], [6, 0, 8], +/// [6, 0, 8]] [4, 1, 5]] [2, 3, 7]] [2, 3, 7]] +/// +/// A2: [[5, 1], +/// [2, 4], +/// [3, 0]] +/// p2: [2, 0, 1] +/// Initial 0<->2 1<->0 2<->1 +/// Forward: [[5, 1], -> [[3, 0], -> [[2, 4], -> [[2, 4], +/// [2, 4], [2, 4] [3, 0], [5, 1], +/// [3, 0]] [5, 1]] [5, 1]] [3, 0]] +/// Initial 2<->1 1<->0 0<->2 +/// Backward: [[5, 1], -> [[5, 1], -> [[3, 0], -> [[2, 4], +/// [2, 4], [3, 0], [5, 1], [5, 1], +/// [3, 0]] [2, 4]] [2, 4]] [3, 0]] +/// +/// \param N [in] Batch size of matrices +template +void impl_test_batched_laswp_analytical(const std::size_t N) { + using ats = typename Kokkos::ArithTraits; + using RealType = typename ats::mag_type; + using View2DType = Kokkos::View; + using View3DType = Kokkos::View; + using StridedView2DType = Kokkos::View; + using StridedView3DType = Kokkos::View; + using PivView2DType = Kokkos::View; + + View2DType A0("A0", N, 3), Ref0("Ref0", N, 3), A0_identity("A0_identity", N, 3), Ref0_identity("Ref0_identity", N, 3); + View3DType A1("A1", N, 3, 3), Ref1("Ref1", N, 3, 3), A1_identity("A1_identity", N, 3, 3), + Ref1_identity("Ref1_identity", N, 3, 3); + View3DType A2("A2", N, 3, 2), Ref2("Ref2", N, 3, 2), A2_identity("A2_identity", N, 3, 2), + Ref2_identity("Ref2_identity", N, 3, 2); + PivView2DType ipiv0("ipiv0", N, 3), ipiv1("ipiv1", N, 3), ipiv2("ipiv2", N, 3); + + // Testing incx argument with strided views + constexpr std::size_t incx = 2; + Kokkos::LayoutStride layout0{N, incx, 3, N * incx}; + StridedView2DType A0_s("A0_s", layout0), A0_s_identity("A0_s_identity", layout0); + + Kokkos::LayoutStride layout1{N, incx, 3, N * incx, 3, N * incx * 3}; + StridedView3DType A1_s("A1_s", layout1), A1_s_identity("A1_s_identity", layout1); + + Kokkos::LayoutStride layout2{N, incx, 3, N * incx, 2, N * incx * 3}; + StridedView3DType A2_s("A2_s", layout2), A2_s_identity("A2_s_identity", layout2); + + // Initialize A0, A1, and A2 with random numbers + auto h_A0 = Kokkos::create_mirror_view(A0); + auto h_A1 = Kokkos::create_mirror_view(A1); + auto h_A2 = Kokkos::create_mirror_view(A2); + auto h_Ref0 = Kokkos::create_mirror_view(Ref0); + auto h_Ref1 = Kokkos::create_mirror_view(Ref1); + auto h_Ref2 = Kokkos::create_mirror_view(Ref2); + + for (std::size_t ib = 0; ib < N; ib++) { + h_A0(ib, 0) = 4.0; + h_A0(ib, 1) = 1.0; + h_A0(ib, 2) = 2.0; + + h_Ref0(ib, 0) = 4.0; + h_Ref0(ib, 1) = 2.0; + h_Ref0(ib, 2) = 1.0; + + h_A1(ib, 0, 0) = 4.0; + h_A1(ib, 0, 1) = 1.0; + h_A1(ib, 0, 2) = 5.0; + h_A1(ib, 1, 0) = 2.0; + h_A1(ib, 1, 1) = 3.0; + h_A1(ib, 1, 2) = 7.0; + h_A1(ib, 2, 0) = 6.0; + h_A1(ib, 2, 1) = 0.0; + h_A1(ib, 2, 2) = 8.0; + + h_Ref1(ib, 0, 0) = 4.0; + h_Ref1(ib, 0, 1) = 1.0; + h_Ref1(ib, 0, 2) = 5.0; + h_Ref1(ib, 1, 0) = 6.0; + h_Ref1(ib, 1, 1) = 0.0; + h_Ref1(ib, 1, 2) = 8.0; + h_Ref1(ib, 2, 0) = 2.0; + h_Ref1(ib, 2, 1) = 3.0; + h_Ref1(ib, 2, 2) = 7.0; + + h_A2(ib, 0, 0) = 5.0; + h_A2(ib, 0, 1) = 1.0; + h_A2(ib, 1, 0) = 2.0; + h_A2(ib, 1, 1) = 4.0; + h_A2(ib, 2, 0) = 3.0; + h_A2(ib, 2, 1) = 0.0; + + h_Ref2(ib, 0, 0) = 2.0; + h_Ref2(ib, 0, 1) = 4.0; + h_Ref2(ib, 1, 0) = 5.0; + h_Ref2(ib, 1, 1) = 1.0; + h_Ref2(ib, 2, 0) = 3.0; + h_Ref2(ib, 2, 1) = 0.0; + } + Kokkos::deep_copy(A0, h_A0); + Kokkos::deep_copy(A1, h_A1); + Kokkos::deep_copy(A2, h_A2); + + // Strided view can be copied only on the same device + Kokkos::deep_copy(A0_s, A0); + Kokkos::deep_copy(A1_s, A1); + Kokkos::deep_copy(A2_s, A2); + + // Copy A to Ref_identity + Kokkos::deep_copy(Ref0_identity, A0); + Kokkos::deep_copy(Ref1_identity, A1); + Kokkos::deep_copy(Ref2_identity, A2); + + // Permute ipiv + auto h_ipiv0 = Kokkos::create_mirror_view(ipiv0); + auto h_ipiv1 = Kokkos::create_mirror_view(ipiv1); + auto h_ipiv2 = Kokkos::create_mirror_view(ipiv2); + + for (std::size_t ib = 0; ib < N; ib++) { + h_ipiv0(ib, 0) = 1; + h_ipiv0(ib, 1) = 2; + h_ipiv0(ib, 2) = 0; + + h_ipiv1(ib, 0) = 1; + h_ipiv1(ib, 1) = 2; + h_ipiv1(ib, 2) = 0; + + h_ipiv2(ib, 0) = 2; + h_ipiv2(ib, 1) = 0; + h_ipiv2(ib, 2) = 1; + } + + Kokkos::deep_copy(ipiv0, h_ipiv0); + Kokkos::deep_copy(ipiv1, h_ipiv1); + Kokkos::deep_copy(ipiv2, h_ipiv2); + + auto info0 = Functor_BatchedSerialLaswp(ipiv0, A0).run(); + auto info1 = Functor_BatchedSerialLaswp(ipiv1, A1).run(); + auto info2 = Functor_BatchedSerialLaswp(ipiv2, A2).run(); + + Kokkos::fence(); + EXPECT_EQ(info0, 0); + EXPECT_EQ(info1, 0); + EXPECT_EQ(info2, 0); + + // For strided views + info0 = Functor_BatchedSerialLaswp(ipiv0, A0_s).run(); + info1 = Functor_BatchedSerialLaswp(ipiv1, A1_s).run(); + info2 = Functor_BatchedSerialLaswp(ipiv2, A2_s).run(); + + Kokkos::fence(); + EXPECT_EQ(info0, 0); + EXPECT_EQ(info1, 0); + EXPECT_EQ(info2, 0); + + // Copy permuted A to A_identity which is permuted back to original A + Kokkos::deep_copy(A0_identity, A0); + Kokkos::deep_copy(A1_identity, A1); + Kokkos::deep_copy(A2_identity, A2); + + Kokkos::deep_copy(A0_s_identity, A0_s); + Kokkos::deep_copy(A1_s_identity, A1_s); + Kokkos::deep_copy(A2_s_identity, A2_s); + + using InvDirect = + typename std::conditional_t, Direct::Backward, Direct::Forward>; + + // Permute A_identity in inverse direction to get original A + info0 = Functor_BatchedSerialLaswp(ipiv0, A0_identity).run(); + info1 = Functor_BatchedSerialLaswp(ipiv1, A1_identity).run(); + info2 = Functor_BatchedSerialLaswp(ipiv2, A2_identity).run(); + + Kokkos::fence(); + EXPECT_EQ(info0, 0); + EXPECT_EQ(info1, 0); + EXPECT_EQ(info2, 0); + + // For strided views + info0 = + Functor_BatchedSerialLaswp(ipiv0, A0_s_identity).run(); + info1 = + Functor_BatchedSerialLaswp(ipiv1, A1_s_identity).run(); + info2 = + Functor_BatchedSerialLaswp(ipiv2, A2_s_identity).run(); + + Kokkos::fence(); + EXPECT_EQ(info0, 0); + EXPECT_EQ(info1, 0); + EXPECT_EQ(info2, 0); + + RealType eps = 1.0e1 * ats::epsilon(); + + Kokkos::deep_copy(h_A0, A0); + Kokkos::deep_copy(h_A1, A1); + Kokkos::deep_copy(h_A2, A2); + auto h_A0_identity = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A0_identity); + auto h_A1_identity = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A1_identity); + auto h_A2_identity = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A2_identity); + + auto h_Ref0_identity = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), Ref0_identity); + auto h_Ref1_identity = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), Ref1_identity); + auto h_Ref2_identity = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), Ref2_identity); + // Check if A is permuted correctly and A_identity is restored + for (std::size_t ib = 0; ib < N; ib++) { + for (std::size_t i = 0; i < 3; i++) { + EXPECT_NEAR_KK_REL(h_A0(ib, i), h_Ref0(ib, i), eps); + EXPECT_NEAR_KK_REL(h_A0_identity(ib, i), h_Ref0_identity(ib, i), eps); + for (std::size_t j = 0; j < 2; j++) { + EXPECT_NEAR_KK_REL(h_A2(ib, i, j), h_Ref2(ib, i, j), eps); + EXPECT_NEAR_KK_REL(h_A2_identity(ib, i, j), h_Ref2_identity(ib, i, j), eps); + } + for (std::size_t j = 0; j < 3; j++) { + EXPECT_NEAR_KK_REL(h_A1(ib, i, j), h_Ref1(ib, i, j), eps); + EXPECT_NEAR_KK_REL(h_A1_identity(ib, i, j), h_Ref1_identity(ib, i, j), eps); + } + } + } + + // Testing for strided views, reusing A0, A1, A2, A0_identity, A1_identity, A2_identity + Kokkos::deep_copy(A0, A0_s); + Kokkos::deep_copy(h_A0, A0); + Kokkos::deep_copy(A1, A1_s); + Kokkos::deep_copy(h_A1, A1); + Kokkos::deep_copy(A2, A2_s); + Kokkos::deep_copy(h_A2, A2); + Kokkos::deep_copy(A0_identity, A0_s_identity); + Kokkos::deep_copy(h_A0_identity, A0_identity); + Kokkos::deep_copy(A1_identity, A1_s_identity); + Kokkos::deep_copy(h_A1_identity, A1_identity); + Kokkos::deep_copy(A2_identity, A2_s_identity); + Kokkos::deep_copy(h_A2_identity, A2_identity); + for (std::size_t ib = 0; ib < N; ib++) { + for (std::size_t i = 0; i < 3; i++) { + EXPECT_NEAR_KK_REL(h_A0(ib, i), h_Ref0(ib, i), eps); + EXPECT_NEAR_KK_REL(h_A0_identity(ib, i), h_Ref0_identity(ib, i), eps); + for (std::size_t j = 0; j < 2; j++) { + EXPECT_NEAR_KK_REL(h_A2(ib, i, j), h_Ref2(ib, i, j), eps); + EXPECT_NEAR_KK_REL(h_A2_identity(ib, i, j), h_Ref2_identity(ib, i, j), eps); + } + for (std::size_t j = 0; j < 3; j++) { + EXPECT_NEAR_KK_REL(h_A1(ib, i, j), h_Ref1(ib, i, j), eps); + EXPECT_NEAR_KK_REL(h_A1_identity(ib, i, j), h_Ref1_identity(ib, i, j), eps); + } + } + } +} + +/// \brief Implementation details of batched laswp test on vectors +/// Apply pivot to vector +/// +/// \param N [in] Batch size of vectors +/// \param BlkSize [in] Length of vector b +template +void impl_test_batched_laswp_vector(const std::size_t N, const std::size_t BlkSize) { + using ats = typename Kokkos::ArithTraits; + using RealType = typename ats::mag_type; + using View2DType = Kokkos::View; + using StridedView2DType = Kokkos::View; + using PivView2DType = Kokkos::View; + + View2DType b("b", N, BlkSize), Ref("Ref", N, BlkSize), b_identity("b_identity", N, BlkSize), + Ref_identity("Ref_identity", N, BlkSize); + PivView2DType ipiv("ipiv", N, BlkSize); + + // Testing incx argument with strided views + constexpr std::size_t incx = 2; + Kokkos::LayoutStride layout{N, incx, BlkSize, N * incx}; + StridedView2DType b_s("b_s", layout), b_s_identity("b_s_identity", layout); + + // Initialize b with random numbers + using execution_space = typename DeviceType::execution_space; + Kokkos::Random_XorShift64_Pool rand_pool(13718); + ScalarType randStart, randEnd; + + KokkosKernels::Impl::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(b, rand_pool, randStart, randEnd); + Kokkos::deep_copy(Ref, b); // This Ref is used to store permuted b + Kokkos::deep_copy(Ref_identity, b); + Kokkos::deep_copy(b_s, b); + + // Permute ipiv + auto h_ipiv = Kokkos::create_mirror_view(ipiv); + std::vector ipiv_vec(BlkSize); + for (int i = 0; i < static_cast(BlkSize); i++) { + ipiv_vec[i] = i; + } + auto rng = std::default_random_engine{}; + std::shuffle(ipiv_vec.begin(), ipiv_vec.end(), rng); + for (std::size_t ib = 0; ib < N; ib++) { + for (std::size_t i = 0; i < BlkSize; i++) { + h_ipiv(ib, i) = ipiv_vec[i]; + } + } + Kokkos::deep_copy(ipiv, h_ipiv); + + auto info = Functor_BatchedSerialLaswp(ipiv, b).run(); + + Kokkos::fence(); + EXPECT_EQ(info, 0); + + // For strided views + info = Functor_BatchedSerialLaswp(ipiv, b_s).run(); + + Kokkos::fence(); + EXPECT_EQ(info, 0); + + // Copy permuted b to b_identity which is permuted back to original b + Kokkos::deep_copy(b_identity, b); + Kokkos::deep_copy(b_s_identity, b_s); + + // Permute b_identity in inverse direction to get original b + using InvDirect = + typename std::conditional_t, Direct::Backward, Direct::Forward>; + Functor_BatchedSerialLaswp(ipiv, b_identity).run(); + Functor_BatchedSerialLaswp(ipiv, b_s_identity).run(); + + // Make a reference + auto h_Ref = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), Ref); + for (std::size_t ib = 0; ib < N; ib++) { + if constexpr (std::is_same_v) { + // Permute Ref by forward pivoting + for (int i = 0; i < static_cast(BlkSize); i++) { + if (h_ipiv(ib, i) != i) { + Kokkos::kokkos_swap(h_Ref(ib, h_ipiv(ib, i)), h_Ref(ib, i)); + } + } + } else { + // Permute Ref by backward pivoting + for (int i = (static_cast(BlkSize) - 1); i >= 0; --i) { + if (h_ipiv(ib, i) != i) { + Kokkos::kokkos_swap(h_Ref(ib, h_ipiv(ib, i)), h_Ref(ib, i)); + } + } + } + } + + RealType eps = 1.0e1 * ats::epsilon(); + + auto h_b = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), b); + auto h_b_identity = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), b_identity); + auto h_Ref_identity = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), Ref_identity); + // Check b is permuted correctly and b_identity is restored + for (std::size_t ib = 0; ib < N; ib++) { + for (std::size_t i = 0; i < BlkSize; i++) { + EXPECT_NEAR_KK_REL(h_b(ib, i), h_Ref(ib, i), eps); + EXPECT_NEAR_KK_REL(h_b_identity(ib, i), h_Ref_identity(ib, i), eps); + } + } + + // Testing for strided views, reusing b and b_identity + Kokkos::deep_copy(b, b_s); + Kokkos::deep_copy(h_b, b); + Kokkos::deep_copy(b_identity, b_s_identity); + Kokkos::deep_copy(h_b_identity, b_identity); + for (std::size_t ib = 0; ib < N; ib++) { + for (std::size_t i = 0; i < BlkSize; i++) { + EXPECT_NEAR_KK_REL(h_b(ib, i), h_Ref(ib, i), eps); + EXPECT_NEAR_KK_REL(h_b_identity(ib, i), h_Ref_identity(ib, i), eps); + } + } +} + +/// \brief Implementation details of batched laswp test on matrices +/// Apply pivot to matrix +/// +/// \param N [in] Batch size of vectors +/// \param BlkSize [in] Row size of matrix A +template +void impl_test_batched_laswp_matrix(const std::size_t N, const std::size_t BlkSize) { + using ats = typename Kokkos::ArithTraits; + using RealType = typename ats::mag_type; + using View3DType = Kokkos::View; + using StridedView3DType = Kokkos::View; + using PivView2DType = Kokkos::View; + + // In order for the tests on non-square matrices, we fix the column size to 5 + // and scan with the row size + constexpr std::size_t M = 5; + View3DType A("A", N, BlkSize, M), Ref("Ref", N, BlkSize, M), A_identity("A_identity", N, BlkSize, M), + Ref_identity("Ref_identity", N, BlkSize, M); + PivView2DType ipiv("ipiv", N, BlkSize); + + // Testing incx argument with strided views + constexpr std::size_t incx = 2; + Kokkos::LayoutStride layout{N, incx, BlkSize, N * incx, M, N * incx * BlkSize}; + StridedView3DType A_s("A_s", layout), A_s_identity("A_s_identity", layout); + + // Initialize A with random numbers + using execution_space = typename DeviceType::execution_space; + Kokkos::Random_XorShift64_Pool rand_pool(13718); + ScalarType randStart, randEnd; + + KokkosKernels::Impl::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(A, rand_pool, randStart, randEnd); + Kokkos::deep_copy(Ref, A); // This Ref is used to store permuted A + Kokkos::deep_copy(Ref_identity, A); + Kokkos::deep_copy(A_s, A); + + // Permute ipiv + auto h_ipiv = Kokkos::create_mirror_view(ipiv); + std::vector ipiv_vec(BlkSize); + for (int i = 0; i < static_cast(BlkSize); i++) { + ipiv_vec[i] = i; + } + auto rng = std::default_random_engine{}; + std::shuffle(ipiv_vec.begin(), ipiv_vec.end(), rng); + for (std::size_t ib = 0; ib < N; ib++) { + for (std::size_t i = 0; i < BlkSize; i++) { + h_ipiv(ib, i) = ipiv_vec[i]; + } + } + Kokkos::deep_copy(ipiv, h_ipiv); + + auto info = Functor_BatchedSerialLaswp(ipiv, A).run(); + + Kokkos::fence(); + EXPECT_EQ(info, 0); + + // For strided views + info = Functor_BatchedSerialLaswp(ipiv, A_s).run(); + + Kokkos::fence(); + EXPECT_EQ(info, 0); + + // Copy permuted A to A_identity which is permuted back to original A + Kokkos::deep_copy(A_identity, A); + Kokkos::deep_copy(A_s_identity, A_s); + + // Permute A_identity in inverse direction to get original A + using InvDirect = + typename std::conditional_t, Direct::Backward, Direct::Forward>; + info = Functor_BatchedSerialLaswp(ipiv, A_identity).run(); + + Kokkos::fence(); + EXPECT_EQ(info, 0); + + info = Functor_BatchedSerialLaswp(ipiv, A_s_identity).run(); + + Kokkos::fence(); + EXPECT_EQ(info, 0); + + // permute Ref by ipiv + auto h_Ref = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), Ref); + for (std::size_t ib = 0; ib < N; ib++) { + if constexpr (std::is_same_v) { + // Permute Ref by forward pivoting + for (int i = 0; i < static_cast(BlkSize); i++) { + if (h_ipiv(ib, i) != i) { + for (int j = 0; j < static_cast(M); j++) { + Kokkos::kokkos_swap(h_Ref(ib, h_ipiv(ib, i), j), h_Ref(ib, i, j)); + } + } + } + } else { + // Permute Ref by backward pivoting + for (int i = (static_cast(BlkSize) - 1); i >= 0; --i) { + if (h_ipiv(ib, i) != i) { + for (int j = 0; j < static_cast(M); j++) { + Kokkos::kokkos_swap(h_Ref(ib, h_ipiv(ib, i), j), h_Ref(ib, i, j)); + } + } + } + } + } + + RealType eps = 1.0e1 * ats::epsilon(); + + auto h_A = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A); + auto h_A_identity = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A_identity); + auto h_Ref_identity = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), Ref_identity); + // Check A is permuted correctly and A_identity is restored + for (std::size_t ib = 0; ib < N; ib++) { + for (std::size_t i = 0; i < BlkSize; i++) { + for (std::size_t j = 0; j < M; j++) { + EXPECT_NEAR_KK_REL(h_A(ib, i, j), h_Ref(ib, i, j), eps); + EXPECT_NEAR_KK_REL(h_A_identity(ib, i, j), h_Ref_identity(ib, i, j), eps); + } + } + } + + // Testing for strided views, reusing A and A_identity + Kokkos::deep_copy(A, A_s); + Kokkos::deep_copy(h_A, A); + Kokkos::deep_copy(A_identity, A_s_identity); + Kokkos::deep_copy(h_A_identity, A_identity); + for (std::size_t ib = 0; ib < N; ib++) { + for (std::size_t i = 0; i < BlkSize; i++) { + for (std::size_t j = 0; j < M; j++) { + EXPECT_NEAR_KK_REL(h_A(ib, i, j), h_Ref(ib, i, j), eps); + EXPECT_NEAR_KK_REL(h_A_identity(ib, i, j), h_Ref_identity(ib, i, j), eps); + } + } + } +} + +} // namespace Laswp +} // namespace Test + +template +int test_batched_laswp() { +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) + { + using LayoutType = Kokkos::LayoutLeft; + Test::Laswp::impl_test_batched_laswp_analytical(1); + Test::Laswp::impl_test_batched_laswp_analytical(2); + for (int i = 0; i < 10; i++) { + Test::Laswp::impl_test_batched_laswp_vector(1, i); + Test::Laswp::impl_test_batched_laswp_vector(2, i); + Test::Laswp::impl_test_batched_laswp_matrix(1, i); + Test::Laswp::impl_test_batched_laswp_matrix(2, i); + } + } +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) + { + using LayoutType = Kokkos::LayoutRight; + Test::Laswp::impl_test_batched_laswp_analytical(1); + Test::Laswp::impl_test_batched_laswp_analytical(2); + for (int i = 0; i < 10; i++) { + Test::Laswp::impl_test_batched_laswp_vector(1, i); + Test::Laswp::impl_test_batched_laswp_vector(2, i); + Test::Laswp::impl_test_batched_laswp_matrix(1, i); + Test::Laswp::impl_test_batched_laswp_matrix(2, i); + } + } +#endif + + return 0; +} + +#if defined(KOKKOSKERNELS_INST_FLOAT) +TEST_F(TestCategory, test_batched_laswp_f_float) { test_batched_laswp(); } +TEST_F(TestCategory, test_batched_laswp_b_float) { test_batched_laswp(); } +#endif + +#if defined(KOKKOSKERNELS_INST_DOUBLE) +TEST_F(TestCategory, test_batched_laswp_f_double) { test_batched_laswp(); } +TEST_F(TestCategory, test_batched_laswp_b_double) { test_batched_laswp(); } +#endif + +#if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) +TEST_F(TestCategory, test_batched_laswp_f_fcomplex) { + test_batched_laswp, Direct::Forward>(); +} +TEST_F(TestCategory, test_batched_laswp_b_fcomplex) { + test_batched_laswp, Direct::Backward>(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) +TEST_F(TestCategory, test_batched_laswp_f_dcomplex) { + test_batched_laswp, Direct::Forward>(); +} +TEST_F(TestCategory, test_batched_laswp_b_dcomplex) { + test_batched_laswp, Direct::Backward>(); +} +#endif diff --git a/packages/kokkos-kernels/batched/dense/unit_test/Test_Batched_SerialPbtrf.hpp b/packages/kokkos-kernels/batched/dense/unit_test/Test_Batched_SerialPbtrf.hpp new file mode 100644 index 000000000000..0b16fab24242 --- /dev/null +++ b/packages/kokkos-kernels/batched/dense/unit_test/Test_Batched_SerialPbtrf.hpp @@ -0,0 +1,322 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +/// \author Yuuichi Asahi (yuuichi.asahi@cea.fr) +#include +#include +#include + +#include "KokkosBatched_Util.hpp" +#include "KokkosBatched_Pbtrf.hpp" +#include "Test_Batched_DenseUtils.hpp" + +using namespace KokkosBatched; + +namespace Test { +namespace Pbtrf { + +template +struct ParamTag { + using uplo = U; +}; + +template +struct Functor_BatchedSerialPbtrf { + using execution_space = typename DeviceType::execution_space; + ABViewType _ab; + + KOKKOS_INLINE_FUNCTION + Functor_BatchedSerialPbtrf(const ABViewType &ab) : _ab(ab) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const ParamTagType &, const int k, int &info) const { + auto sub_ab = Kokkos::subview(_ab, k, Kokkos::ALL(), Kokkos::ALL()); + + info += KokkosBatched::SerialPbtrf::invoke(sub_ab); + } + + inline int run() { + using value_type = typename ABViewType::non_const_value_type; + std::string name_region("KokkosBatched::Test::SerialPbtrf"); + const std::string name_value_type = Test::value_type_name(); + std::string name = name_region + name_value_type; + int info_sum = 0; + Kokkos::Profiling::pushRegion(name.c_str()); + Kokkos::RangePolicy policy(0, _ab.extent(0)); + Kokkos::parallel_reduce(name.c_str(), policy, *this, info_sum); + Kokkos::Profiling::popRegion(); + return info_sum; + } +}; + +template +struct Functor_BatchedSerialGemm { + using execution_space = typename DeviceType::execution_space; + AViewType _a; + BViewType _b; + CViewType _c; + ScalarType _alpha, _beta; + + KOKKOS_INLINE_FUNCTION + Functor_BatchedSerialGemm(const ScalarType alpha, const AViewType &a, const BViewType &b, const ScalarType beta, + const CViewType &c) + : _a(a), _b(b), _c(c), _alpha(alpha), _beta(beta) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int k) const { + auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(_b, k, Kokkos::ALL(), Kokkos::ALL()); + auto cc = Kokkos::subview(_c, k, Kokkos::ALL(), Kokkos::ALL()); + + KokkosBatched::SerialGemm::invoke(_alpha, aa, bb, _beta, cc); + } + + inline void run() { + using value_type = typename AViewType::non_const_value_type; + std::string name_region("KokkosBatched::Test::SerialPbtrf"); + const std::string name_value_type = Test::value_type_name(); + std::string name = name_region + name_value_type; + Kokkos::RangePolicy policy(0, _a.extent(0)); + Kokkos::parallel_for(name.c_str(), policy, *this); + } +}; + +template +/// \brief Implementation details of batched pbtrf test +/// Confirm A = U**H * U or L * L**H, where +/// For full storage, +/// A: [[4, 1], +/// [1, 4]] +/// L: [[sqrt(4), 0], +/// [1/sqrt(4), sqrt(4 - (1/sqrt(4))**2)] +/// U: [[sqrt(4), 1/sqrt(4)], +/// [0, sqrt(4 - (1/sqrt(4))**2)] +/// +/// For lower banded storage, Ab = Lb * Lb**H +/// Ab: [[4, 4], +/// [1, 0]] +/// Lb: [[sqrt(4), sqrt(4 - (1/sqrt(4))**2)], +/// [1/sqrt(4), 0]] +/// +/// For upper banded storage, Ab = Ub**H * Ub +/// Ab: [[0, 1], +/// [4, 4]] +/// Ub: [[0, 1/sqrt(4)], +/// [sqrt(4), sqrt(4 - (1/sqrt(4))**2)]] +/// \param N [in] Batch size of AB +void impl_test_batched_pbtrf_analytical(const int N) { + using ats = typename Kokkos::ArithTraits; + using RealType = typename ats::mag_type; + using View3DType = Kokkos::View; + + constexpr int BlkSize = 2, k = 1; + View3DType A("A", N, BlkSize, BlkSize), Ab("Ab", N, k + 1, BlkSize), + Ab_ref("Ab_ref", N, k + 1, BlkSize); // Banded matrix + + auto h_A = Kokkos::create_mirror_view(A); + for (int ib = 0; ib < N; ib++) { + for (int i = 0; i < BlkSize; i++) { + for (int j = 0; j < BlkSize; j++) { + h_A(ib, i, j) = i == j ? 4.0 : 1.0; + } + } + } + + Kokkos::deep_copy(A, h_A); + + // Create banded triangluar matrix in normal and banded storage + using ArgUplo = typename ParamTagType::uplo; + create_banded_triangular_matrix(A, Ab, k, true); + + // Make a reference using the naive Cholesky decomposition + // Cholesky decomposition for full storage + // l_kk = np.sqrt( a_kk - sum_{i=1}^{k-1}( l_ik^2 ) ) + // l_ik = 1/l_kk * ( a_ik - sum_{j=1}^{k-1}( l_ij * l_kj ) ) + auto h_Ab_ref = Kokkos::create_mirror_view(Ab_ref); + auto h_Ab = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), Ab); + if (std::is_same_v) { + // A = U**H * U + for (int ib = 0; ib < N; ib++) { + h_Ab_ref(ib, 1, 0) = Kokkos::sqrt(h_Ab(ib, 1, 0)); + h_Ab_ref(ib, 0, 1) = 1.0 / h_Ab_ref(ib, 1, 0); + h_Ab_ref(ib, 1, 1) = Kokkos::sqrt(h_Ab(ib, 1, 1) - h_Ab_ref(ib, 0, 1) * h_Ab_ref(ib, 0, 1)); + } + } else { + // A = L * L**H + for (int ib = 0; ib < N; ib++) { + h_Ab_ref(ib, 0, 0) = Kokkos::sqrt(h_Ab(ib, 0, 0)); + h_Ab_ref(ib, 1, 0) = 1.0 / h_Ab_ref(ib, 0, 0); + h_Ab_ref(ib, 0, 1) = Kokkos::sqrt(h_Ab(ib, 0, 1) - h_Ab_ref(ib, 1, 0) * h_Ab_ref(ib, 1, 0)); + } + } + + // Factorize with Pbtrf: A = U**H * U or A = L * L**H + auto info = Functor_BatchedSerialPbtrf(Ab).run(); + Kokkos::fence(); + EXPECT_EQ(info, 0); + + // this eps is about 10^-14 + RealType eps = 1.0e3 * ats::epsilon(); + + // Check if Ab == Ub or Lb + Kokkos::deep_copy(h_Ab, Ab); + for (int ib = 0; ib < N; ib++) { + for (int i = 0; i < k + 1; i++) { + for (int j = 0; j < BlkSize; j++) { + EXPECT_NEAR_KK(h_Ab(ib, i, j), h_Ab_ref(ib, i, j), eps); + } + } + } +} + +template +/// \brief Implementation details of batched pbtrs test +/// +/// \param N [in] Batch size of RHS (banded matrix can also be batched matrix) +/// \param k [in] Number of superdiagonals or subdiagonals of matrix A +/// \param BlkSize [in] Block size of matrix A +void impl_test_batched_pbtrf(const int N, const int k, const int BlkSize) { + using View3DType = Kokkos::View; + View3DType A("A", N, BlkSize, BlkSize), A_reconst("A_reconst", N, BlkSize, BlkSize), + Ab("Ab", N, k + 1, BlkSize); // Banded matrix + + using execution_space = typename DeviceType::execution_space; + Kokkos::Random_XorShift64_Pool rand_pool(13718); + ScalarType randStart, randEnd; + + // Initialize A_reconst with random matrix + KokkosKernels::Impl::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(A, rand_pool, randStart, randEnd); + + // Make the matrix Positive Definite Symmetric and Diagonal dominant + random_to_pds(A, A_reconst); + Kokkos::deep_copy(A, 0.0); + + // Create banded triangluar matrix in normal and banded storage + using ArgUplo = typename ParamTagType::uplo; + create_banded_pds_matrix(A_reconst, A, k, false); + + create_banded_triangular_matrix(A_reconst, Ab, k, true); + + // Clear matrix + Kokkos::deep_copy(A_reconst, 0.0); + + // Factorize with Pbtrf: A = U**H * U or A = L * L**H + auto info = Functor_BatchedSerialPbtrf(Ab).run(); + + Kokkos::fence(); + EXPECT_EQ(info, 0); + + if (std::is_same_v) { + // A = U**H * U + View3DType U("U", N, BlkSize, BlkSize), Uc("Uc", N, BlkSize, BlkSize); + banded_to_full(Ab, U, k); + + // Compute the complex conjugate of U + // U -> conj(U) + auto h_U = Kokkos::create_mirror_view(U); + auto h_Uc = Kokkos::create_mirror_view(Uc); + Kokkos::deep_copy(h_U, U); + Kokkos::deep_copy(h_Uc, Uc); + for (int ib = 0; ib < N; ib++) { + for (int i = 0; i < BlkSize; i++) { + for (int j = 0; j < BlkSize; j++) { + h_Uc(ib, i, j) = Kokkos::ArithTraits::conj(h_U(ib, i, j)); + } + } + } + Kokkos::deep_copy(Uc, h_Uc); + + // Create conjugate of U + Functor_BatchedSerialGemm(1.0, Uc, U, 0.0, A_reconst) + .run(); + } else { + // A = L * L**H + View3DType L("L", N, BlkSize, BlkSize), Lc("Lc", N, BlkSize, BlkSize); + banded_to_full(Ab, L, k); + + // Compute the complex conjugate of L + // L -> conj(L) + auto h_L = Kokkos::create_mirror_view(L); + auto h_Lc = Kokkos::create_mirror_view(Lc); + Kokkos::deep_copy(h_L, L); + Kokkos::deep_copy(h_Lc, Lc); + for (int ib = 0; ib < N; ib++) { + for (int i = 0; i < BlkSize; i++) { + for (int j = 0; j < BlkSize; j++) { + h_Lc(ib, i, j) = Kokkos::ArithTraits::conj(h_L(ib, i, j)); + } + } + } + Kokkos::deep_copy(Lc, h_Lc); + + // Create conjugate of L + Functor_BatchedSerialGemm(1.0, L, Lc, 0.0, A_reconst) + .run(); + } + + // this eps is about 10^-14 + using ats = typename Kokkos::ArithTraits; + using RealType = typename ats::mag_type; + RealType eps = 1.0e3 * ats::epsilon(); + + auto h_A = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A); + auto h_A_reconst = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A_reconst); + + // Check if A = U**H * U or A = L * L**H + for (int ib = 0; ib < N; ib++) { + for (int i = 0; i < BlkSize; i++) { + for (int j = 0; j < BlkSize; j++) { + EXPECT_NEAR_KK(h_A_reconst(ib, i, j), h_A(ib, i, j), eps); + } + } + } +} + +} // namespace Pbtrf +} // namespace Test + +template +int test_batched_pbtrf() { +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) + { + using LayoutType = Kokkos::LayoutLeft; + Test::Pbtrf::impl_test_batched_pbtrf_analytical(1); + Test::Pbtrf::impl_test_batched_pbtrf_analytical(2); + for (int i = 0; i < 10; i++) { + int k = 1; + Test::Pbtrf::impl_test_batched_pbtrf(1, k, i); + Test::Pbtrf::impl_test_batched_pbtrf(2, k, i); + } + } +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) + { + using LayoutType = Kokkos::LayoutRight; + Test::Pbtrf::impl_test_batched_pbtrf_analytical(1); + Test::Pbtrf::impl_test_batched_pbtrf_analytical(2); + for (int i = 0; i < 10; i++) { + int k = 1; + Test::Pbtrf::impl_test_batched_pbtrf(1, k, i); + Test::Pbtrf::impl_test_batched_pbtrf(2, k, i); + } + } +#endif + + return 0; +} diff --git a/packages/kokkos-kernels/batched/dense/unit_test/Test_Batched_SerialPbtrf_Complex.hpp b/packages/kokkos-kernels/batched/dense/unit_test/Test_Batched_SerialPbtrf_Complex.hpp new file mode 100644 index 000000000000..3aebb8ffa514 --- /dev/null +++ b/packages/kokkos-kernels/batched/dense/unit_test/Test_Batched_SerialPbtrf_Complex.hpp @@ -0,0 +1,45 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) +TEST_F(TestCategory, test_batched_pbtrf_l_fcomplex) { + using algo_tag_type = typename Algo::Pbtrf::Unblocked; + using param_tag_type = ::Test::Pbtrf::ParamTag; + + test_batched_pbtrf, param_tag_type, algo_tag_type>(); +} +TEST_F(TestCategory, test_batched_pbtrf_u_fcomplex) { + using algo_tag_type = typename Algo::Pbtrf::Unblocked; + using param_tag_type = ::Test::Pbtrf::ParamTag; + + test_batched_pbtrf, param_tag_type, algo_tag_type>(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) +TEST_F(TestCategory, test_batched_pbtrf_l_dcomplex) { + using algo_tag_type = typename Algo::Pbtrf::Unblocked; + using param_tag_type = ::Test::Pbtrf::ParamTag; + + test_batched_pbtrf, param_tag_type, algo_tag_type>(); +} +TEST_F(TestCategory, test_batched_pbtrf_u_dcomplex) { + using algo_tag_type = typename Algo::Pbtrf::Unblocked; + using param_tag_type = ::Test::Pbtrf::ParamTag; + + test_batched_pbtrf, param_tag_type, algo_tag_type>(); +} +#endif diff --git a/packages/kokkos-kernels/batched/dense/unit_test/Test_Batched_SerialPbtrf_Real.hpp b/packages/kokkos-kernels/batched/dense/unit_test/Test_Batched_SerialPbtrf_Real.hpp new file mode 100644 index 000000000000..e1b77416f5ea --- /dev/null +++ b/packages/kokkos-kernels/batched/dense/unit_test/Test_Batched_SerialPbtrf_Real.hpp @@ -0,0 +1,45 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#if defined(KOKKOSKERNELS_INST_FLOAT) +TEST_F(TestCategory, test_batched_pbtrf_l_float) { + using algo_tag_type = typename Algo::Pbtrf::Unblocked; + using param_tag_type = ::Test::Pbtrf::ParamTag; + + test_batched_pbtrf(); +} +TEST_F(TestCategory, test_batched_pbtrf_u_float) { + using algo_tag_type = typename Algo::Pbtrf::Unblocked; + using param_tag_type = ::Test::Pbtrf::ParamTag; + + test_batched_pbtrf(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_DOUBLE) +TEST_F(TestCategory, test_batched_pbtrf_l_double) { + using algo_tag_type = typename Algo::Pbtrf::Unblocked; + using param_tag_type = ::Test::Pbtrf::ParamTag; + + test_batched_pbtrf(); +} +TEST_F(TestCategory, test_batched_pbtrf_u_double) { + using algo_tag_type = typename Algo::Pbtrf::Unblocked; + using param_tag_type = ::Test::Pbtrf::ParamTag; + + test_batched_pbtrf(); +} +#endif diff --git a/packages/kokkos-kernels/batched/dense/unit_test/Test_Batched_SerialPbtrs.hpp b/packages/kokkos-kernels/batched/dense/unit_test/Test_Batched_SerialPbtrs.hpp new file mode 100644 index 000000000000..01654a4dd54f --- /dev/null +++ b/packages/kokkos-kernels/batched/dense/unit_test/Test_Batched_SerialPbtrs.hpp @@ -0,0 +1,295 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +/// \author Yuuichi Asahi (yuuichi.asahi@cea.fr) +#include +#include +#include +#include +#include "KokkosBatched_Util.hpp" +#include "KokkosBatched_Pbtrf.hpp" +#include "KokkosBatched_Pbtrs.hpp" +#include "Test_Batched_DenseUtils.hpp" + +using namespace KokkosBatched; + +namespace Test { +namespace Pbtrs { + +template +struct ParamTag { + using uplo = U; +}; + +template +struct Functor_BatchedSerialPbtrf { + using execution_space = typename DeviceType::execution_space; + ABViewType _ab; + + KOKKOS_INLINE_FUNCTION + Functor_BatchedSerialPbtrf(const ABViewType &ab) : _ab(ab) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const ParamTagType &, const int k) const { + auto sub_ab = Kokkos::subview(_ab, k, Kokkos::ALL(), Kokkos::ALL()); + + KokkosBatched::SerialPbtrf::invoke(sub_ab); + } + + inline void run() { + using value_type = typename ABViewType::non_const_value_type; + std::string name_region("KokkosBatched::Test::SerialPbtrs"); + const std::string name_value_type = Test::value_type_name(); + std::string name = name_region + name_value_type; + Kokkos::RangePolicy policy(0, _ab.extent(0)); + Kokkos::parallel_for(name.c_str(), policy, *this); + } +}; + +template +struct Functor_BatchedSerialPbtrs { + using execution_space = typename DeviceType::execution_space; + ABViewType _ab; + BViewType _b; + + KOKKOS_INLINE_FUNCTION + Functor_BatchedSerialPbtrs(const ABViewType &ab, const BViewType &b) : _ab(ab), _b(b) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const ParamTagType &, const int k, int &info) const { + auto sub_ab = Kokkos::subview(_ab, k, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(_b, k, Kokkos::ALL()); + + info += KokkosBatched::SerialPbtrs::invoke(sub_ab, bb); + } + + inline int run() { + using value_type = typename ABViewType::non_const_value_type; + std::string name_region("KokkosBatched::Test::SerialPbtrs"); + const std::string name_value_type = Test::value_type_name(); + std::string name = name_region + name_value_type; + int info_sum = 0; + Kokkos::Profiling::pushRegion(name.c_str()); + Kokkos::RangePolicy policy(0, _b.extent(0)); + Kokkos::parallel_reduce(name.c_str(), policy, *this, info_sum); + Kokkos::Profiling::popRegion(); + return info_sum; + } +}; + +template +struct Functor_BatchedSerialGemv { + using execution_space = typename DeviceType::execution_space; + AViewType _a; + xViewType _x; + yViewType _y; + ScalarType _alpha, _beta; + + KOKKOS_INLINE_FUNCTION + Functor_BatchedSerialGemv(const ScalarType alpha, const AViewType &a, const xViewType &x, const ScalarType beta, + const yViewType &y) + : _a(a), _x(x), _y(y), _alpha(alpha), _beta(beta) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int k) const { + auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); + auto xx = Kokkos::subview(_x, k, Kokkos::ALL()); + auto yy = Kokkos::subview(_y, k, Kokkos::ALL()); + + KokkosBlas::SerialGemv::invoke(_alpha, aa, xx, _beta, yy); + } + + inline void run() { + using value_type = typename AViewType::non_const_value_type; + std::string name_region("KokkosBatched::Test::SerialPbtrs"); + const std::string name_value_type = Test::value_type_name(); + std::string name = name_region + name_value_type; + Kokkos::RangePolicy policy(0, _x.extent(0)); + Kokkos::parallel_for(name.c_str(), policy, *this); + } +}; + +template +/// \brief Implementation details of batched pbtrs test +/// Confirm A * x = b, where +/// A: [[4, 1, 0], +/// [1, 4, 1], +/// [0, 1, 4]] +/// b: [1, 1, 1] +/// x: [3/14, 1/7, 3/14] +/// +/// This corresponds to the following system of equations: +/// 4 x0 + x1 = 1 +/// x0 + 4 x1 + x2 = 1 +/// x1 + 4 x2 = 1 +/// \param N [in] Batch size of RHS (banded matrix can also be batched matrix) +/// \param k [in] Number of superdiagonals or subdiagonals of matrix A +/// \param BlkSize [in] Block size of matrix A +void impl_test_batched_pbtrs_analytical(const int N) { + using ats = typename Kokkos::ArithTraits; + using RealType = typename ats::mag_type; + using View2DType = Kokkos::View; + using View3DType = Kokkos::View; + + constexpr int BlkSize = 3, k = 1; + View3DType A("A", N, BlkSize, BlkSize), A_reconst("A_reconst", N, BlkSize, BlkSize); + View3DType Ab("Ab", N, k + 1, BlkSize); // Banded matrix + View2DType x0("x0", N, BlkSize), x_ref("x_ref", N, BlkSize), y0("y0", N, BlkSize); // Solutions + + auto h_A_reconst = Kokkos::create_mirror_view(A_reconst); + auto h_x_ref = Kokkos::create_mirror_view(x_ref); + + for (int ib = 0; ib < N; ib++) { + for (int i = 0; i < BlkSize; i++) { + for (int j = 0; j < BlkSize; j++) { + h_A_reconst(ib, i, j) = i == j ? 4.0 : 1.0; + } + } + + h_x_ref(ib, 0) = 3.0 / 14.0; + h_x_ref(ib, 1) = 1.0 / 7.0; + h_x_ref(ib, 2) = 3.0 / 14.0; + } + + Kokkos::fence(); + + Kokkos::deep_copy(x0, ScalarType(1.0)); + Kokkos::deep_copy(A_reconst, h_A_reconst); + + // Create banded triangluar matrix in normal and banded storage + using ArgUplo = typename ParamTagType::uplo; + create_banded_pds_matrix(A_reconst, A, k, false); + create_banded_triangular_matrix(A_reconst, Ab, k, true); + + // Factorize with Pbtrf: A = U**H * U or A = L * L**H + Functor_BatchedSerialPbtrf(Ab).run(); + + // pbtrs (Note, Ab is a factorized matrix of A) + auto info = Functor_BatchedSerialPbtrs(Ab, x0).run(); + + Kokkos::fence(); + EXPECT_EQ(info, 0); + + // this eps is about 10^-14 + RealType eps = 1.0e3 * ats::epsilon(); + auto h_x0 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), x0); + + // Check x0 = x1 + for (int ib = 0; ib < N; ib++) { + for (int i = 0; i < BlkSize; i++) { + Test::EXPECT_NEAR_KK_REL(h_x0(ib, i), h_x_ref(ib, i), eps); + } + } +} + +template +/// \brief Implementation details of batched pbtrs test +/// Confirm A * x = b, where +/// +/// \param N [in] Batch size of RHS (banded matrix can also be batched matrix) +/// \param k [in] Number of superdiagonals or subdiagonals of matrix A +/// \param BlkSize [in] Block size of matrix A +void impl_test_batched_pbtrs(const int N, const int k, const int BlkSize) { + using ats = typename Kokkos::ArithTraits; + using RealType = typename ats::mag_type; + using View2DType = Kokkos::View; + using View3DType = Kokkos::View; + + View3DType A("A", N, BlkSize, BlkSize), A_reconst("A_reconst", N, BlkSize, BlkSize); + View3DType Ab("Ab", N, k + 1, BlkSize); // Banded matrix + View2DType x0("x0", N, BlkSize), x_ref("x_ref", N, BlkSize), y0("y0", N, BlkSize); // Solutions + + using execution_space = typename DeviceType::execution_space; + Kokkos::Random_XorShift64_Pool rand_pool(13718); + ScalarType randStart, randEnd; + + // Initialize A_reconst with random matrix + KokkosKernels::Impl::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(A, rand_pool, randStart, randEnd); + Kokkos::fill_random(x0, rand_pool, randStart, randEnd); + Kokkos::deep_copy(x_ref, x0); + + // Make the matrix Positive Definite Symmetric and Diagonal dominant + random_to_pds(A, A_reconst); + Kokkos::deep_copy(A, ScalarType(0.0)); + + // Create banded triangluar matrix in normal and banded storage + using ArgUplo = typename ParamTagType::uplo; + create_banded_pds_matrix(A_reconst, A, k, false); + + create_banded_triangular_matrix(A_reconst, Ab, k, true); + + Kokkos::fence(); + + // Factorize with Pbtrf: A = U**H * U or A = L * L**H + Functor_BatchedSerialPbtrf(Ab).run(); + + // pbtrs (Note, Ab is a factorized matrix of A) + auto info = Functor_BatchedSerialPbtrs(Ab, x0).run(); + + Kokkos::fence(); + EXPECT_EQ(info, 0); + + // Gemv to compute A*x0, this should be identical to x_ref + Functor_BatchedSerialGemv(1.0, A, x0, 0.0, y0).run(); + + Kokkos::fence(); + + // this eps is about 10^-14 + RealType eps = 1.0e3 * ats::epsilon(); + + auto h_y0 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), y0); + auto h_x_ref = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), x_ref); + + // Check A * x0 = x_ref + for (int ib = 0; ib < N; ib++) { + for (int i = 0; i < BlkSize; i++) { + Test::EXPECT_NEAR_KK_REL(h_y0(ib, i), h_x_ref(ib, i), eps); + } + } +} + +} // namespace Pbtrs +} // namespace Test + +template +int test_batched_pbtrs() { +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) + { + using LayoutType = Kokkos::LayoutLeft; + Test::Pbtrs::impl_test_batched_pbtrs_analytical(1); + Test::Pbtrs::impl_test_batched_pbtrs_analytical(2); + for (int i = 0; i < 10; i++) { + int k = 1; + Test::Pbtrs::impl_test_batched_pbtrs(1, k, i); + Test::Pbtrs::impl_test_batched_pbtrs(2, k, i); + } + } +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) + { + using LayoutType = Kokkos::LayoutRight; + Test::Pbtrs::impl_test_batched_pbtrs_analytical(1); + Test::Pbtrs::impl_test_batched_pbtrs_analytical(2); + for (int i = 0; i < 10; i++) { + int k = 1; + Test::Pbtrs::impl_test_batched_pbtrs(1, k, i); + Test::Pbtrs::impl_test_batched_pbtrs(2, k, i); + } + } +#endif + + return 0; +} diff --git a/packages/kokkos-kernels/batched/dense/unit_test/Test_Batched_SerialPbtrs_Complex.hpp b/packages/kokkos-kernels/batched/dense/unit_test/Test_Batched_SerialPbtrs_Complex.hpp new file mode 100644 index 000000000000..258805d9508c --- /dev/null +++ b/packages/kokkos-kernels/batched/dense/unit_test/Test_Batched_SerialPbtrs_Complex.hpp @@ -0,0 +1,45 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) +TEST_F(TestCategory, test_batched_pbtrs_l_fcomplex) { + using algo_tag_type = typename Algo::Pbtrs::Unblocked; + using param_tag_type = ::Test::Pbtrs::ParamTag; + + test_batched_pbtrs, param_tag_type, algo_tag_type>(); +} +TEST_F(TestCategory, test_batched_pbtrs_u_fcomplex) { + using algo_tag_type = typename Algo::Pbtrs::Unblocked; + using param_tag_type = ::Test::Pbtrs::ParamTag; + + test_batched_pbtrs, param_tag_type, algo_tag_type>(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) +TEST_F(TestCategory, test_batched_pbtrs_l_dcomplex) { + using algo_tag_type = typename Algo::Pbtrs::Unblocked; + using param_tag_type = ::Test::Pbtrs::ParamTag; + + test_batched_pbtrs, param_tag_type, algo_tag_type>(); +} +TEST_F(TestCategory, test_batched_pbtrs_u_dcomplex) { + using algo_tag_type = typename Algo::Pbtrs::Unblocked; + using param_tag_type = ::Test::Pbtrs::ParamTag; + + test_batched_pbtrs, param_tag_type, algo_tag_type>(); +} +#endif diff --git a/packages/kokkos-kernels/batched/dense/unit_test/Test_Batched_SerialPbtrs_Real.hpp b/packages/kokkos-kernels/batched/dense/unit_test/Test_Batched_SerialPbtrs_Real.hpp new file mode 100644 index 000000000000..fbad59bbc686 --- /dev/null +++ b/packages/kokkos-kernels/batched/dense/unit_test/Test_Batched_SerialPbtrs_Real.hpp @@ -0,0 +1,45 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#if defined(KOKKOSKERNELS_INST_FLOAT) +TEST_F(TestCategory, test_batched_pbtrs_l_float) { + using algo_tag_type = typename Algo::Pbtrs::Unblocked; + using param_tag_type = ::Test::Pbtrs::ParamTag; + + test_batched_pbtrs(); +} +TEST_F(TestCategory, test_batched_pbtrs_u_float) { + using algo_tag_type = typename Algo::Pbtrs::Unblocked; + using param_tag_type = ::Test::Pbtrs::ParamTag; + + test_batched_pbtrs(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_DOUBLE) +TEST_F(TestCategory, test_batched_pbtrs_l_double) { + using algo_tag_type = typename Algo::Pbtrs::Unblocked; + using param_tag_type = ::Test::Pbtrs::ParamTag; + + test_batched_pbtrs(); +} +TEST_F(TestCategory, test_batched_pbtrs_u_double) { + using algo_tag_type = typename Algo::Pbtrs::Unblocked; + using param_tag_type = ::Test::Pbtrs::ParamTag; + + test_batched_pbtrs(); +} +#endif diff --git a/packages/kokkos-kernels/batched/dense/unit_test/Test_Batched_SerialPttrs.hpp b/packages/kokkos-kernels/batched/dense/unit_test/Test_Batched_SerialPttrs.hpp new file mode 100644 index 000000000000..6e501b6d979c --- /dev/null +++ b/packages/kokkos-kernels/batched/dense/unit_test/Test_Batched_SerialPttrs.hpp @@ -0,0 +1,430 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +/// \author Yuuichi Asahi (yuuichi.asahi@cea.fr) +#include +#include +#include + +#include "KokkosBatched_Util.hpp" +#include "KokkosBatched_Pttrs.hpp" +#include "Test_Batched_DenseUtils.hpp" + +using namespace KokkosBatched; + +namespace Test { +namespace Pttrs { + +template +struct ParamTag { + using uplo = U; +}; + +template +struct Functor_BatchedSerialPttrf { + using execution_space = typename DeviceType::execution_space; + DViewType _d; + EViewType _e; + + KOKKOS_INLINE_FUNCTION + Functor_BatchedSerialPttrf(const DViewType &d, const EViewType &e) : _d(d), _e(e) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int k) const { + auto dd = Kokkos::subview(_d, k, Kokkos::ALL()); + auto ee = Kokkos::subview(_e, k, Kokkos::ALL()); + + KokkosBatched::SerialPttrf::invoke(dd, ee); + } + + inline void run() { + using value_type = typename EViewType::non_const_value_type; + std::string name_region("KokkosBatched::Test::SerialPttrs"); + const std::string name_value_type = Test::value_type_name(); + std::string name = name_region + name_value_type; + Kokkos::RangePolicy policy(0, _d.extent(0)); + Kokkos::parallel_for(name.c_str(), policy, *this); + } +}; + +template +struct Functor_BatchedSerialPttrs { + using execution_space = typename DeviceType::execution_space; + DViewType _d; + EViewType _e; + BViewType _b; + + KOKKOS_INLINE_FUNCTION + Functor_BatchedSerialPttrs(const DViewType &d, const EViewType &e, const BViewType &b) : _d(d), _e(e), _b(b) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const ParamTagType &, const int k, int &info) const { + auto dd = Kokkos::subview(_d, k, Kokkos::ALL()); + auto ee = Kokkos::subview(_e, k, Kokkos::ALL()); + auto bb = Kokkos::subview(_b, k, Kokkos::ALL()); + + info += KokkosBatched::SerialPttrs::invoke(dd, ee, bb); + } + + inline int run() { + using value_type = typename BViewType::non_const_value_type; + std::string name_region("KokkosBatched::Test::SerialPttrs"); + const std::string name_value_type = Test::value_type_name(); + std::string name = name_region + name_value_type; + int info_sum = 0; + Kokkos::Profiling::pushRegion(name.c_str()); + Kokkos::RangePolicy policy(0, _d.extent(0)); + Kokkos::parallel_reduce(name.c_str(), policy, *this, info_sum); + Kokkos::Profiling::popRegion(); + return info_sum; + } +}; + +template +struct Functor_BatchedSerialGemv { + using execution_space = typename DeviceType::execution_space; + AViewType _a; + xViewType _x; + yViewType _y; + ScalarType _alpha, _beta; + + KOKKOS_INLINE_FUNCTION + Functor_BatchedSerialGemv(const ScalarType alpha, const AViewType &a, const xViewType &x, const ScalarType beta, + const yViewType &y) + : _a(a), _x(x), _y(y), _alpha(alpha), _beta(beta) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int k) const { + auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); + auto xx = Kokkos::subview(_x, k, Kokkos::ALL()); + auto yy = Kokkos::subview(_y, k, Kokkos::ALL()); + + KokkosBlas::SerialGemv::invoke(_alpha, aa, xx, _beta, yy); + } + + inline void run() { + using value_type = typename AViewType::non_const_value_type; + std::string name_region("KokkosBatched::Test::SerialPttrs"); + const std::string name_value_type = Test::value_type_name(); + std::string name = name_region + name_value_type; + Kokkos::RangePolicy policy(0, _a.extent(0)); + Kokkos::parallel_for(name.c_str(), policy, *this); + } +}; + +template +struct Functor_BatchedSerialGemm { + using execution_space = typename DeviceType::execution_space; + AViewType _a; + BViewType _b; + CViewType _c; + ScalarType _alpha, _beta; + + KOKKOS_INLINE_FUNCTION + Functor_BatchedSerialGemm(const ScalarType alpha, const AViewType &a, const BViewType &b, const ScalarType beta, + const CViewType &c) + : _a(a), _b(b), _c(c), _alpha(alpha), _beta(beta) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int k) const { + auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(_b, k, Kokkos::ALL(), Kokkos::ALL()); + auto cc = Kokkos::subview(_c, k, Kokkos::ALL(), Kokkos::ALL()); + + KokkosBatched::SerialGemm::invoke(_alpha, aa, bb, _beta, cc); + } + + inline void run() { + using value_type = typename AViewType::non_const_value_type; + std::string name_region("KokkosBatched::Test::SerialPttrs"); + const std::string name_value_type = Test::value_type_name(); + std::string name = name_region + name_value_type; + Kokkos::RangePolicy policy(0, _a.extent(0)); + Kokkos::parallel_for(name.c_str(), policy, *this); + } +}; + +template +/// \brief Implementation details of batched pttrs test +/// Confirm A * x = b, where +/// A: [[4, 1], +/// [1, 4]] +/// b: [1, 1] +/// x: [1/5, 1/5] +/// +/// This corresponds to the following system of equations: +/// 4 x0 + x1 = 1 +/// x0 + 4 x1 = 1 +/// +/// \param N [in] Batch size of RHS (banded matrix can also be batched matrix) +/// \param k [in] Number of superdiagonals or subdiagonals of matrix A +/// \param BlkSize [in] Block size of matrix A +void impl_test_batched_pttrs_analytical(const int N) { + using ats = typename Kokkos::ArithTraits; + using RealType = typename ats::mag_type; + using RealView2DType = Kokkos::View; + using View2DType = Kokkos::View; + + constexpr int BlkSize = 2; + RealView2DType d(Kokkos::view_alloc("d", Kokkos::WithoutInitializing), N, + BlkSize); // Diagonal components + View2DType e(Kokkos::view_alloc("e", Kokkos::WithoutInitializing), N, + BlkSize - 1); // Upper and lower diagonal components (identical) + View2DType x(Kokkos::view_alloc("x", Kokkos::WithoutInitializing), N, + BlkSize); // Solutions + + Kokkos::deep_copy(d, RealType(4.0)); + Kokkos::deep_copy(e, ScalarType(1.0)); + Kokkos::deep_copy(x, ScalarType(1.0)); // This initialy stores b + + // Factorize matrix A -> L * D * L**H + // d and e are updated by pttrf + Functor_BatchedSerialPttrf(d, e).run(); + + // pttrs (Note, d and e must be factorized by pttrf) + auto info = + Functor_BatchedSerialPttrs(d, e, x) + .run(); + + Kokkos::fence(); + + EXPECT_EQ(info, 0); + + // this eps is about 10^-14 + RealType eps = 1.0e3 * ats::epsilon(); + + auto h_x = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), x); + + // Check x = [1/5, 1/5] + for (int ib = 0; ib < N; ib++) { + for (int i = 0; i < BlkSize; i++) { + EXPECT_NEAR_KK(h_x(ib, i), ScalarType(1.0 / 5.0), eps); + } + } +} + +template +/// \brief Implementation details of batched pttrs test +/// +/// \param N [in] Batch size of RHS (banded matrix can also be batched matrix) +/// \param k [in] Number of superdiagonals or subdiagonals of matrix A +/// \param BlkSize [in] Block size of matrix A +void impl_test_batched_pttrs(const int N, const int BlkSize) { + using ats = typename Kokkos::ArithTraits; + using RealType = typename ats::mag_type; + using RealView2DType = Kokkos::View; + using View2DType = Kokkos::View; + using View3DType = Kokkos::View; + + View3DType A("A", N, BlkSize, BlkSize), EL("EL", N, BlkSize, BlkSize), EU("EU", N, BlkSize, BlkSize), + D("D", N, BlkSize, BlkSize), I("I", N, BlkSize, BlkSize); + RealView2DType d(Kokkos::view_alloc("d", Kokkos::WithoutInitializing), N, + BlkSize), // Diagonal components + ones(Kokkos::view_alloc("ones", Kokkos::WithoutInitializing), N, BlkSize); + View2DType e_upper("e_upper", N, BlkSize - 1), e_lower("e_lower", N, + BlkSize - 1); // upper and lower diagonal components + View2DType x("x", N, BlkSize), b_ref("x_ref", N, BlkSize), b("b", N, BlkSize); // Solutions + + using execution_space = typename DeviceType::execution_space; + Kokkos::Random_XorShift64_Pool rand_pool(13718); + RealType realRandStart, realRandEnd; + ScalarType randStart, randEnd; + + KokkosKernels::Impl::getRandomBounds(1.0, realRandStart, realRandEnd); + KokkosKernels::Impl::getRandomBounds(1.0, randStart, randEnd); + + // Add BlkSize to ensure positive definiteness + Kokkos::fill_random(d, rand_pool, realRandStart + BlkSize, realRandEnd + BlkSize); + Kokkos::fill_random(e_upper, rand_pool, randStart, randEnd); + Kokkos::fill_random(x, rand_pool, randStart, randEnd); + + auto h_e_upper = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), e_upper); + auto h_e_lower = Kokkos::create_mirror_view(e_lower); + + for (int ib = 0; ib < N; ib++) { + for (int i = 0; i < BlkSize - 1; i++) { + // Fill the lower diagonal with conjugate of the upper diagonal + h_e_lower(ib, i) = Kokkos::ArithTraits::conj(h_e_upper(ib, i)); + } + } + + Kokkos::deep_copy(e_lower, h_e_lower); + Kokkos::deep_copy(b_ref, x); + Kokkos::deep_copy(ones, RealType(1.0)); + + // Reconstruct Tridiagonal matrix A + // A = D + EL + EU + create_diagonal_matrix(e_lower, EL, -1); + create_diagonal_matrix(e_upper, EU, 1); + create_diagonal_matrix(d, D); + create_diagonal_matrix(ones, I); + + // Matrix matrix addition by Gemm + // D + EU by D * I + EU (result stored in EU) + Functor_BatchedSerialGemm(1.0, D, I, + 1.0, EU) + .run(); + + // Copy EL to A + Kokkos::deep_copy(A, EL); + + // EU + EL by EU * I + A (result stored in A) + Functor_BatchedSerialGemm(1.0, EU, I, + 1.0, A) + .run(); + + Kokkos::fence(); + + int info = 0; + if (std::is_same_v) { + // Factorize matrix A -> U**H * D * U + // d and e are updated by pttrf + Functor_BatchedSerialPttrf(d, e_upper).run(); + + // pttrs (Note, d and e must be factorized by pttrf) + info = Functor_BatchedSerialPttrs( + d, e_upper, x) + .run(); + } else { + // Factorize matrix A -> L * D * L**H + // d and e are updated by pttrf + Functor_BatchedSerialPttrf(d, e_lower).run(); + + // pttrs (Note, d and e must be factorized by pttrf) + info = Functor_BatchedSerialPttrs( + d, e_lower, x) + .run(); + } + + Kokkos::fence(); + + EXPECT_EQ(info, 0); + + // Gemv to compute b = A * x, this should be identical to b_ref + Functor_BatchedSerialGemv(1.0, A, x, 0.0, b).run(); + + // this eps is about 10^-14 + RealType eps = 1.0e3 * ats::epsilon(); + + auto h_b = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), b); + auto h_b_ref = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), b_ref); + + // Check A * x = b_ref + for (int ib = 0; ib < N; ib++) { + for (int i = 0; i < BlkSize; i++) { + EXPECT_NEAR_KK(h_b(ib, i), h_b_ref(ib, i), eps); + } + } +} + +template +/// \brief Implementation details of batched pttrs test for early return +/// BlkSize must be 0 or 1 +/// +/// \param N [in] Batch size of RHS (banded matrix can also be batched matrix) +/// \param k [in] Number of superdiagonals or subdiagonals of matrix A +/// \param BlkSize [in] Block size of matrix A +void impl_test_batched_pttrs_quick_return(const int N, const int BlkSize) { + using ats = typename Kokkos::ArithTraits; + using RealType = typename ats::mag_type; + using RealView2DType = Kokkos::View; + using View2DType = Kokkos::View; + + if (BlkSize > 1) return; + + const int BlkSize_minus_1 = BlkSize > 0 ? BlkSize - 1 : 0; + + RealView2DType d(Kokkos::view_alloc("d", Kokkos::WithoutInitializing), N, + BlkSize); // Diagonal components + View2DType e(Kokkos::view_alloc("e", Kokkos::WithoutInitializing), N, + BlkSize_minus_1); // lower diagonal components + View2DType x("x", N, BlkSize); // Solutions + + const RealType reference_value = 4.0; + + Kokkos::deep_copy(d, reference_value); + Kokkos::deep_copy(e, ScalarType(1.0)); + Kokkos::deep_copy(x, ScalarType(1.0)); + + // Factorize matrix A -> U**H * D * U or L * D * L**H + // d and e are updated by pttrf + Functor_BatchedSerialPttrf(d, e).run(); + + // pttrs (Note, d and e must be factorized by pttrf) + auto info = + Functor_BatchedSerialPttrs(d, e, x) + .run(); + + Kokkos::fence(); + + EXPECT_EQ(info, 0); + + // this eps is about 10^-14 + RealType eps = 1.0e3 * ats::epsilon(); + + auto h_x = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), x); + + // Check x = x_ref + for (int ib = 0; ib < N; ib++) { + for (int i = 0; i < BlkSize; i++) { + EXPECT_NEAR_KK(h_x(ib, i), ScalarType(1.0 / reference_value), eps); + } + } +} + +} // namespace Pttrs +} // namespace Test + +template +int test_batched_pttrs() { +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) + { + using LayoutType = Kokkos::LayoutLeft; + Test::Pttrs::impl_test_batched_pttrs_analytical(1); + Test::Pttrs::impl_test_batched_pttrs_analytical(2); + for (int i = 0; i < 2; i++) { + Test::Pttrs::impl_test_batched_pttrs_quick_return( + 1, i); + Test::Pttrs::impl_test_batched_pttrs_quick_return( + 2, i); + } + for (int i = 2; i < 10; i++) { + Test::Pttrs::impl_test_batched_pttrs(1, i); + Test::Pttrs::impl_test_batched_pttrs(2, i); + } + } +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) + { + using LayoutType = Kokkos::LayoutRight; + Test::Pttrs::impl_test_batched_pttrs_analytical(1); + Test::Pttrs::impl_test_batched_pttrs_analytical(2); + for (int i = 0; i < 2; i++) { + Test::Pttrs::impl_test_batched_pttrs_quick_return( + 1, i); + Test::Pttrs::impl_test_batched_pttrs_quick_return( + 2, i); + } + for (int i = 2; i < 10; i++) { + Test::Pttrs::impl_test_batched_pttrs(1, i); + Test::Pttrs::impl_test_batched_pttrs(2, i); + } + } +#endif + + return 0; +} diff --git a/packages/kokkos-kernels/batched/dense/unit_test/Test_Batched_SerialPttrs_Complex.hpp b/packages/kokkos-kernels/batched/dense/unit_test/Test_Batched_SerialPttrs_Complex.hpp new file mode 100644 index 000000000000..78ca1b1f7cd6 --- /dev/null +++ b/packages/kokkos-kernels/batched/dense/unit_test/Test_Batched_SerialPttrs_Complex.hpp @@ -0,0 +1,43 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) +TEST_F(TestCategory, test_batched_pttrs_l_fcomplex) { + using algo_tag_type = typename Algo::Pttrs::Unblocked; + using param_tag_type = ::Test::Pttrs::ParamTag; + + test_batched_pttrs, param_tag_type, algo_tag_type>(); +} +TEST_F(TestCategory, test_batched_pttrs_u_fcomplex) { + using algo_tag_type = typename Algo::Pttrs::Unblocked; + using param_tag_type = ::Test::Pttrs::ParamTag; + test_batched_pttrs, param_tag_type, algo_tag_type>(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) +TEST_F(TestCategory, test_batched_pttrs_l_dcomplex) { + using algo_tag_type = typename Algo::Pttrs::Unblocked; + using param_tag_type = ::Test::Pttrs::ParamTag; + + test_batched_pttrs, param_tag_type, algo_tag_type>(); +} +TEST_F(TestCategory, test_batched_pttrs_u_dcomplex) { + using algo_tag_type = typename Algo::Pttrs::Unblocked; + using param_tag_type = ::Test::Pttrs::ParamTag; + test_batched_pttrs, param_tag_type, algo_tag_type>(); +} +#endif diff --git a/packages/kokkos-kernels/batched/dense/unit_test/Test_Batched_SerialPttrs_Real.hpp b/packages/kokkos-kernels/batched/dense/unit_test/Test_Batched_SerialPttrs_Real.hpp new file mode 100644 index 000000000000..6bf4fadb6d3d --- /dev/null +++ b/packages/kokkos-kernels/batched/dense/unit_test/Test_Batched_SerialPttrs_Real.hpp @@ -0,0 +1,43 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#if defined(KOKKOSKERNELS_INST_FLOAT) +TEST_F(TestCategory, test_batched_pttrs_l_float) { + using algo_tag_type = typename Algo::Pttrs::Unblocked; + using param_tag_type = ::Test::Pttrs::ParamTag; + + test_batched_pttrs(); +} +TEST_F(TestCategory, test_batched_pttrs_u_float) { + using algo_tag_type = typename Algo::Pttrs::Unblocked; + using param_tag_type = ::Test::Pttrs::ParamTag; + test_batched_pttrs(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_DOUBLE) +TEST_F(TestCategory, test_batched_pttrs_l_double) { + using algo_tag_type = typename Algo::Pttrs::Unblocked; + using param_tag_type = ::Test::Pttrs::ParamTag; + + test_batched_pttrs(); +} +TEST_F(TestCategory, test_batched_pttrs_u_double) { + using algo_tag_type = typename Algo::Pttrs::Unblocked; + using param_tag_type = ::Test::Pttrs::ParamTag; + test_batched_pttrs(); +} +#endif diff --git a/packages/kokkos-kernels/batched/dense/unit_test/Test_Batched_SerialSVD.hpp b/packages/kokkos-kernels/batched/dense/unit_test/Test_Batched_SerialSVD.hpp index a30c9dcc899e..586b69dc7fae 100644 --- a/packages/kokkos-kernels/batched/dense/unit_test/Test_Batched_SerialSVD.hpp +++ b/packages/kokkos-kernels/batched/dense/unit_test/Test_Batched_SerialSVD.hpp @@ -18,6 +18,8 @@ #include "KokkosBatched_SVD_Decl.hpp" //For testing overall kernel #include "KokkosBatched_SVD_Serial_Internal.hpp" //For unit testing individual components #include "KokkosBatched_SetIdentity_Decl.hpp" +#include "KokkosKernels_TestMatrixUtils.hpp" +#include "KokkosKernels_TestVanilla.hpp" namespace Test { template diff --git a/packages/kokkos-kernels/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_nt_bll_eti_spec_decl.hpp.in b/packages/kokkos-kernels/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_nt_bll_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..b1cef3592cdd --- /dev/null +++ b/packages/kokkos-kernels/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_nt_bll_eti_spec_decl.hpp.in @@ -0,0 +1,25 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBATCHED_GEMM_NT_NT_BLL_ETI_SPEC_DECL_HPP_ +#define KOKKOSBATCHED_GEMM_NT_NT_BLL_ETI_SPEC_DECL_HPP_ +namespace KokkosBatched { +namespace Impl { +using KokkosBlas::Trans; +@BATCHED_GEMM_NT_NT_BLL_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/packages/kokkos-kernels/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_nt_blr_eti_spec_decl.hpp.in b/packages/kokkos-kernels/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_nt_blr_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..9f163222d5ca --- /dev/null +++ b/packages/kokkos-kernels/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_nt_blr_eti_spec_decl.hpp.in @@ -0,0 +1,25 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBATCHED_GEMM_NT_NT_BLR_ETI_SPEC_DECL_HPP_ +#define KOKKOSBATCHED_GEMM_NT_NT_BLR_ETI_SPEC_DECL_HPP_ +namespace KokkosBatched { +namespace Impl { +using KokkosBlas::Trans; +@BATCHED_GEMM_NT_NT_BLR_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/packages/kokkos-kernels/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_t_bll_eti_spec_decl.hpp.in b/packages/kokkos-kernels/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_t_bll_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..eaaa44d6c000 --- /dev/null +++ b/packages/kokkos-kernels/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_t_bll_eti_spec_decl.hpp.in @@ -0,0 +1,25 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBATCHED_GEMM_NT_T_BLL_ETI_SPEC_DECL_HPP_ +#define KOKKOSBATCHED_GEMM_NT_T_BLL_ETI_SPEC_DECL_HPP_ +namespace KokkosBatched { +namespace Impl { +using KokkosBlas::Trans; +@BATCHED_GEMM_NT_T_BLL_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/packages/kokkos-kernels/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_t_blr_eti_spec_decl.hpp.in b/packages/kokkos-kernels/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_t_blr_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..616d4f56e11a --- /dev/null +++ b/packages/kokkos-kernels/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_t_blr_eti_spec_decl.hpp.in @@ -0,0 +1,25 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBATCHED_GEMM_NT_T_BLR_ETI_SPEC_DECL_HPP_ +#define KOKKOSBATCHED_GEMM_NT_T_BLR_ETI_SPEC_DECL_HPP_ +namespace KokkosBatched { +namespace Impl { +using KokkosBlas::Trans; +@BATCHED_GEMM_NT_T_BLR_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/packages/kokkos-kernels/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_nt_bll_eti_spec_decl.hpp.in b/packages/kokkos-kernels/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_nt_bll_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..ceeec3e8c129 --- /dev/null +++ b/packages/kokkos-kernels/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_nt_bll_eti_spec_decl.hpp.in @@ -0,0 +1,25 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBATCHED_GEMM_T_NT_BLL_ETI_SPEC_DECL_HPP_ +#define KOKKOSBATCHED_GEMM_T_NT_BLL_ETI_SPEC_DECL_HPP_ +namespace KokkosBatched { +namespace Impl { +using KokkosBlas::Trans; +@BATCHED_GEMM_T_NT_BLL_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/packages/kokkos-kernels/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_nt_blr_eti_spec_decl.hpp.in b/packages/kokkos-kernels/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_nt_blr_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..142720514ec5 --- /dev/null +++ b/packages/kokkos-kernels/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_nt_blr_eti_spec_decl.hpp.in @@ -0,0 +1,25 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBATCHED_GEMM_T_NT_BLR_ETI_SPEC_DECL_HPP_ +#define KOKKOSBATCHED_GEMM_T_NT_BLR_ETI_SPEC_DECL_HPP_ +namespace KokkosBatched { +namespace Impl { +using KokkosBlas::Trans; +@BATCHED_GEMM_T_NT_BLR_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/packages/kokkos-kernels/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_t_bll_eti_spec_decl.hpp.in b/packages/kokkos-kernels/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_t_bll_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..6c11f794007e --- /dev/null +++ b/packages/kokkos-kernels/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_t_bll_eti_spec_decl.hpp.in @@ -0,0 +1,25 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBATCHED_GEMM_T_T_BLL_ETI_SPEC_DECL_HPP_ +#define KOKKOSBATCHED_GEMM_T_T_BLL_ETI_SPEC_DECL_HPP_ +namespace KokkosBatched { +namespace Impl { +using KokkosBlas::Trans; +@BATCHED_GEMM_T_T_BLL_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/packages/kokkos-kernels/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_t_blr_eti_spec_decl.hpp.in b/packages/kokkos-kernels/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_t_blr_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..c84b3d0db660 --- /dev/null +++ b/packages/kokkos-kernels/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_t_blr_eti_spec_decl.hpp.in @@ -0,0 +1,25 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBATCHED_GEMM_T_T_BLR_ETI_SPEC_DECL_HPP_ +#define KOKKOSBATCHED_GEMM_T_T_BLR_ETI_SPEC_DECL_HPP_ +namespace KokkosBatched { +namespace Impl { +using KokkosBlas::Trans; +@BATCHED_GEMM_T_T_BLR_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/packages/kokkos-kernels/batched/sparse/impl/KokkosBatched_CG_TeamVector_Impl.hpp b/packages/kokkos-kernels/batched/sparse/impl/KokkosBatched_CG_TeamVector_Impl.hpp index 9aa4b95f2c1e..513e16b5e1be 100644 --- a/packages/kokkos-kernels/batched/sparse/impl/KokkosBatched_CG_TeamVector_Impl.hpp +++ b/packages/kokkos-kernels/batched/sparse/impl/KokkosBatched_CG_TeamVector_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_CG_TEAMVECTOR_IMPL_HPP__ -#define __KOKKOSBATCHED_CG_TEAMVECTOR_IMPL_HPP__ +#ifndef KOKKOSBATCHED_CG_TEAMVECTOR_IMPL_HPP +#define KOKKOSBATCHED_CG_TEAMVECTOR_IMPL_HPP /// \author Kim Liegeois (knliege@sandia.gov) diff --git a/packages/kokkos-kernels/batched/sparse/impl/KokkosBatched_CG_Team_Impl.hpp b/packages/kokkos-kernels/batched/sparse/impl/KokkosBatched_CG_Team_Impl.hpp index 82c62624c168..b77a0daf204a 100644 --- a/packages/kokkos-kernels/batched/sparse/impl/KokkosBatched_CG_Team_Impl.hpp +++ b/packages/kokkos-kernels/batched/sparse/impl/KokkosBatched_CG_Team_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_CG_TEAM_IMPL_HPP__ -#define __KOKKOSBATCHED_CG_TEAM_IMPL_HPP__ +#ifndef KOKKOSBATCHED_CG_TEAM_IMPL_HPP +#define KOKKOSBATCHED_CG_TEAM_IMPL_HPP /// \author Kim Liegeois (knliege@sandia.gov) diff --git a/packages/kokkos-kernels/batched/sparse/impl/KokkosBatched_GMRES_Serial_Impl.hpp b/packages/kokkos-kernels/batched/sparse/impl/KokkosBatched_GMRES_Serial_Impl.hpp index 2d8c0cae0086..4095090e4dc0 100644 --- a/packages/kokkos-kernels/batched/sparse/impl/KokkosBatched_GMRES_Serial_Impl.hpp +++ b/packages/kokkos-kernels/batched/sparse/impl/KokkosBatched_GMRES_Serial_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_GMRES_SERIAL_IMPL_HPP__ -#define __KOKKOSBATCHED_GMRES_SERIAL_IMPL_HPP__ +#ifndef KOKKOSBATCHED_GMRES_SERIAL_IMPL_HPP +#define KOKKOSBATCHED_GMRES_SERIAL_IMPL_HPP /// \author Kim Liegeois (knliege@sandia.gov) diff --git a/packages/kokkos-kernels/batched/sparse/impl/KokkosBatched_GMRES_TeamVector_Impl.hpp b/packages/kokkos-kernels/batched/sparse/impl/KokkosBatched_GMRES_TeamVector_Impl.hpp index 8d37b2ac5eb9..89f062dd7a84 100644 --- a/packages/kokkos-kernels/batched/sparse/impl/KokkosBatched_GMRES_TeamVector_Impl.hpp +++ b/packages/kokkos-kernels/batched/sparse/impl/KokkosBatched_GMRES_TeamVector_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_GMRES_TEAMVECTOR_IMPL_HPP__ -#define __KOKKOSBATCHED_GMRES_TEAMVECTOR_IMPL_HPP__ +#ifndef KOKKOSBATCHED_GMRES_TEAMVECTOR_IMPL_HPP +#define KOKKOSBATCHED_GMRES_TEAMVECTOR_IMPL_HPP /// \author Kim Liegeois (knliege@sandia.gov) diff --git a/packages/kokkos-kernels/batched/sparse/impl/KokkosBatched_GMRES_Team_Impl.hpp b/packages/kokkos-kernels/batched/sparse/impl/KokkosBatched_GMRES_Team_Impl.hpp index 9fd9e09bd90c..601a18c9977c 100644 --- a/packages/kokkos-kernels/batched/sparse/impl/KokkosBatched_GMRES_Team_Impl.hpp +++ b/packages/kokkos-kernels/batched/sparse/impl/KokkosBatched_GMRES_Team_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_GMRES_TEAM_IMPL_HPP__ -#define __KOKKOSBATCHED_GMRES_TEAM_IMPL_HPP__ +#ifndef KOKKOSBATCHED_GMRES_TEAM_IMPL_HPP +#define KOKKOSBATCHED_GMRES_TEAM_IMPL_HPP /// \author Kim Liegeois (knliege@sandia.gov) diff --git a/packages/kokkos-kernels/batched/sparse/impl/KokkosBatched_Spmv_Serial_Impl.hpp b/packages/kokkos-kernels/batched/sparse/impl/KokkosBatched_Spmv_Serial_Impl.hpp index 3f76ee3d9fbe..88dac8799e7e 100644 --- a/packages/kokkos-kernels/batched/sparse/impl/KokkosBatched_Spmv_Serial_Impl.hpp +++ b/packages/kokkos-kernels/batched/sparse/impl/KokkosBatched_Spmv_Serial_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_SPMV_SERIAL_IMPL_HPP__ -#define __KOKKOSBATCHED_SPMV_SERIAL_IMPL_HPP__ +#ifndef KOKKOSBATCHED_SPMV_SERIAL_IMPL_HPP +#define KOKKOSBATCHED_SPMV_SERIAL_IMPL_HPP /// \author Kim Liegeois (knliege@sandia.gov) diff --git a/packages/kokkos-kernels/batched/sparse/impl/KokkosBatched_Spmv_TeamVector_Impl.hpp b/packages/kokkos-kernels/batched/sparse/impl/KokkosBatched_Spmv_TeamVector_Impl.hpp index 4df4b95e2c2c..d28aba16b8bb 100644 --- a/packages/kokkos-kernels/batched/sparse/impl/KokkosBatched_Spmv_TeamVector_Impl.hpp +++ b/packages/kokkos-kernels/batched/sparse/impl/KokkosBatched_Spmv_TeamVector_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_SPMV_TEAMVECTOR_IMPL_HPP__ -#define __KOKKOSBATCHED_SPMV_TEAMVECTOR_IMPL_HPP__ +#ifndef KOKKOSBATCHED_SPMV_TEAMVECTOR_IMPL_HPP +#define KOKKOSBATCHED_SPMV_TEAMVECTOR_IMPL_HPP /// \author Kim Liegeois (knliege@sandia.gov) diff --git a/packages/kokkos-kernels/batched/sparse/impl/KokkosBatched_Spmv_Team_Impl.hpp b/packages/kokkos-kernels/batched/sparse/impl/KokkosBatched_Spmv_Team_Impl.hpp index 9e3286161223..274ee65203fa 100644 --- a/packages/kokkos-kernels/batched/sparse/impl/KokkosBatched_Spmv_Team_Impl.hpp +++ b/packages/kokkos-kernels/batched/sparse/impl/KokkosBatched_Spmv_Team_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_SPMV_TEAM_IMPL_HPP__ -#define __KOKKOSBATCHED_SPMV_TEAM_IMPL_HPP__ +#ifndef KOKKOSBATCHED_SPMV_TEAM_IMPL_HPP +#define KOKKOSBATCHED_SPMV_TEAM_IMPL_HPP /// \author Kim Liegeois (knliege@sandia.gov) diff --git a/packages/kokkos-kernels/batched/sparse/src/KokkosBatched_CG.hpp b/packages/kokkos-kernels/batched/sparse/src/KokkosBatched_CG.hpp index cabf2eae987a..0a15d328115b 100644 --- a/packages/kokkos-kernels/batched/sparse/src/KokkosBatched_CG.hpp +++ b/packages/kokkos-kernels/batched/sparse/src/KokkosBatched_CG.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_CG_HPP__ -#define __KOKKOSBATCHED_CG_HPP__ +#ifndef KOKKOSBATCHED_CG_HPP +#define KOKKOSBATCHED_CG_HPP /// \author Kim Liegeois (knliege@sandia.gov) diff --git a/packages/kokkos-kernels/batched/sparse/src/KokkosBatched_CrsMatrix.hpp b/packages/kokkos-kernels/batched/sparse/src/KokkosBatched_CrsMatrix.hpp index 0d880cd88011..b2477d7efc32 100644 --- a/packages/kokkos-kernels/batched/sparse/src/KokkosBatched_CrsMatrix.hpp +++ b/packages/kokkos-kernels/batched/sparse/src/KokkosBatched_CrsMatrix.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_CRSMATRIX_HPP__ -#define __KOKKOSBATCHED_CRSMATRIX_HPP__ +#ifndef KOKKOSBATCHED_CRSMATRIX_HPP +#define KOKKOSBATCHED_CRSMATRIX_HPP /// \author Kim Liegeois (knliege@sandia.gov) diff --git a/packages/kokkos-kernels/batched/sparse/src/KokkosBatched_GMRES.hpp b/packages/kokkos-kernels/batched/sparse/src/KokkosBatched_GMRES.hpp index a3f4eda8d362..7982fa30f2c6 100644 --- a/packages/kokkos-kernels/batched/sparse/src/KokkosBatched_GMRES.hpp +++ b/packages/kokkos-kernels/batched/sparse/src/KokkosBatched_GMRES.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_GMRES_HPP__ -#define __KOKKOSBATCHED_GMRES_HPP__ +#ifndef KOKKOSBATCHED_GMRES_HPP +#define KOKKOSBATCHED_GMRES_HPP /// \author Kim Liegeois (knliege@sandia.gov) diff --git a/packages/kokkos-kernels/batched/sparse/src/KokkosBatched_Identity.hpp b/packages/kokkos-kernels/batched/sparse/src/KokkosBatched_Identity.hpp index 311ec09d5c60..421ad2c210c6 100644 --- a/packages/kokkos-kernels/batched/sparse/src/KokkosBatched_Identity.hpp +++ b/packages/kokkos-kernels/batched/sparse/src/KokkosBatched_Identity.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_IDENTITY_HPP__ -#define __KOKKOSBATCHED_IDENTITY_HPP__ +#ifndef KOKKOSBATCHED_IDENTITY_HPP +#define KOKKOSBATCHED_IDENTITY_HPP /// \author Kim Liegeois (knliege@sandia.gov) diff --git a/packages/kokkos-kernels/batched/sparse/src/KokkosBatched_JacobiPrec.hpp b/packages/kokkos-kernels/batched/sparse/src/KokkosBatched_JacobiPrec.hpp index 580f85158bec..973794d68d1f 100644 --- a/packages/kokkos-kernels/batched/sparse/src/KokkosBatched_JacobiPrec.hpp +++ b/packages/kokkos-kernels/batched/sparse/src/KokkosBatched_JacobiPrec.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_JACOBIPREC_HPP__ -#define __KOKKOSBATCHED_JACOBIPREC_HPP__ +#ifndef KOKKOSBATCHED_JACOBIPREC_HPP +#define KOKKOSBATCHED_JACOBIPREC_HPP /// \author Kim Liegeois (knliege@sandia.gov) diff --git a/packages/kokkos-kernels/batched/sparse/src/KokkosBatched_Krylov_Handle.hpp b/packages/kokkos-kernels/batched/sparse/src/KokkosBatched_Krylov_Handle.hpp index c8e8392e1150..5cead801b850 100644 --- a/packages/kokkos-kernels/batched/sparse/src/KokkosBatched_Krylov_Handle.hpp +++ b/packages/kokkos-kernels/batched/sparse/src/KokkosBatched_Krylov_Handle.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef __KOKKOSBATCHED_KRYLOV_HANDLE_HPP__ -#define __KOKKOSBATCHED_KRYLOV_HANDLE_HPP__ +#ifndef KOKKOSBATCHED_KRYLOV_HANDLE_HPP +#define KOKKOSBATCHED_KRYLOV_HANDLE_HPP #include #include diff --git a/packages/kokkos-kernels/batched/sparse/src/KokkosBatched_Krylov_Solvers.hpp b/packages/kokkos-kernels/batched/sparse/src/KokkosBatched_Krylov_Solvers.hpp index b07ed2b973d7..2f25da35bfe2 100644 --- a/packages/kokkos-kernels/batched/sparse/src/KokkosBatched_Krylov_Solvers.hpp +++ b/packages/kokkos-kernels/batched/sparse/src/KokkosBatched_Krylov_Solvers.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef __KOKKOSBATCHED_KRYLOV_SOLVERS_HPP__ -#define __KOKKOSBATCHED_KRYLOV_SOLVERS_HPP__ +#ifndef KOKKOSBATCHED_KRYLOV_SOLVERS_HPP +#define KOKKOSBATCHED_KRYLOV_SOLVERS_HPP namespace KokkosBatched { diff --git a/packages/kokkos-kernels/batched/sparse/src/KokkosBatched_Spmv.hpp b/packages/kokkos-kernels/batched/sparse/src/KokkosBatched_Spmv.hpp index a93d0775beda..72b923eeb819 100644 --- a/packages/kokkos-kernels/batched/sparse/src/KokkosBatched_Spmv.hpp +++ b/packages/kokkos-kernels/batched/sparse/src/KokkosBatched_Spmv.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_SPMV_HPP__ -#define __KOKKOSBATCHED_SPMV_HPP__ +#ifndef KOKKOSBATCHED_SPMV_HPP +#define KOKKOSBATCHED_SPMV_HPP /// \author Kim Liegeois (knliege@sandia.gov) diff --git a/packages/kokkos-kernels/blas/CMakeLists.txt b/packages/kokkos-kernels/blas/CMakeLists.txt index 5bc7217cfdae..08263826abfe 100644 --- a/packages/kokkos-kernels/blas/CMakeLists.txt +++ b/packages/kokkos-kernels/blas/CMakeLists.txt @@ -36,7 +36,7 @@ IF (KOKKOSKERNELS_ENABLE_TPL_BLAS OR KOKKOSKERNELS_ENABLE_TPL_MKL OR KOKKOSKERNE ENDIF() # Include cuda blas TPL source file -IF (KOKKOSKERNELS_ENABLE_TPL_CUBLAS) +IF (KOKKOSKERNELS_ENABLE_TPL_CUBLAS OR KOKKOSKERNELS_ENABLE_TPL_MAGMA) LIST(APPEND SOURCES blas/tpls/KokkosBlas_Cuda_tpl.cpp ) diff --git a/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_abs_eti_spec_decl.hpp.in b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_abs_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..2780dee8ff9d --- /dev/null +++ b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_abs_eti_spec_decl.hpp.in @@ -0,0 +1,26 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS1_ABS_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS1_ABS_ETI_SPEC_DECL_HPP_ +namespace KokkosBlas { +namespace Impl { + +@BLAS1_ABS_ETI_DECL_BLOCK@ + +} // Impl +} // KokkosBlas +#endif // KOKKOSBLAS1_ABS_ETI_SPEC_DECL_HPP_ diff --git a/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_abs_mv_eti_spec_decl.hpp.in b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_abs_mv_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..c7af4806beeb --- /dev/null +++ b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_abs_mv_eti_spec_decl.hpp.in @@ -0,0 +1,26 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS1_ABS_MV_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS1_ABS_MV_ETI_SPEC_DECL_HPP_ +namespace KokkosBlas { +namespace Impl { + +@BLAS1_ABS_MV_ETI_DECL_BLOCK@ + +} // Impl +} // KokkosBlas +#endif // KOKKOSBLAS1_ABS_MV_ETI_SPEC_DECL_HPP_ diff --git a/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_axpby_eti_spec_avail.hpp.in b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_axpby_eti_spec_avail.hpp.in index 66fae7ebd3a7..bebe6679330a 100644 --- a/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_axpby_eti_spec_avail.hpp.in +++ b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_axpby_eti_spec_avail.hpp.in @@ -19,6 +19,6 @@ namespace KokkosBlas { namespace Impl { @BLAS1_AXPBY_ETI_AVAIL_BLOCK@ - } //IMPL +} //IMPL } //Kokkos #endif diff --git a/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_axpby_eti_spec_decl.hpp.in b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_axpby_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..cb42d1c5fddb --- /dev/null +++ b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_axpby_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS1_AXPBY_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS1_AXPBY_ETI_SPEC_DECL_HPP_ +namespace KokkosBlas { +namespace Impl { +@BLAS1_AXPBY_ETI_DECL_BLOCK@ +} //IMPL +} //Kokkos +#endif diff --git a/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_axpby_mv_eti_spec_avail.hpp.in b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_axpby_mv_eti_spec_avail.hpp.in index 6a462246057d..3262fa2485fb 100644 --- a/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_axpby_mv_eti_spec_avail.hpp.in +++ b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_axpby_mv_eti_spec_avail.hpp.in @@ -19,6 +19,6 @@ namespace KokkosBlas { namespace Impl { @BLAS1_AXPBY_MV_ETI_AVAIL_BLOCK@ - } //IMPL +} //IMPL } //Kokkos #endif diff --git a/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_axpby_mv_eti_spec_decl.hpp.in b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_axpby_mv_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..94033316cf07 --- /dev/null +++ b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_axpby_mv_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS1_AXPBY_MV_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS1_AXPBY_MV_ETI_SPEC_DECL_HPP_ +namespace KokkosBlas { +namespace Impl { +@BLAS1_AXPBY_MV_ETI_DECL_BLOCK@ +} //IMPL +} //Kokkos +#endif diff --git a/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_dot_eti_spec_avail.hpp.in b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_dot_eti_spec_avail.hpp.in index 525031c93f26..01ecc162880c 100644 --- a/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_dot_eti_spec_avail.hpp.in +++ b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_dot_eti_spec_avail.hpp.in @@ -19,6 +19,6 @@ namespace KokkosBlas { namespace Impl { @BLAS1_DOT_ETI_AVAIL_BLOCK@ - } //IMPL +} //IMPL } //Kokkos #endif diff --git a/packages/kokkos/core/unit_test/sycl/TestSYCL_Task.cpp b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_dot_eti_spec_decl.hpp.in similarity index 75% rename from packages/kokkos/core/unit_test/sycl/TestSYCL_Task.cpp rename to packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_dot_eti_spec_decl.hpp.in index 3c599b95a6f3..827abeef6f41 100644 --- a/packages/kokkos/core/unit_test/sycl/TestSYCL_Task.cpp +++ b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_dot_eti_spec_decl.hpp.in @@ -14,5 +14,11 @@ // //@HEADER -#include -#include +#ifndef KOKKOSBLAS1_DOT_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS1_DOT_ETI_SPEC_DECL_HPP_ +namespace KokkosBlas { +namespace Impl { +@BLAS1_DOT_ETI_DECL_BLOCK@ +} //IMPL +} //Kokkos +#endif diff --git a/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_dot_mv_eti_spec_avail.hpp.in b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_dot_mv_eti_spec_avail.hpp.in index 9145ec9461f7..ca52819bc3ed 100644 --- a/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_dot_mv_eti_spec_avail.hpp.in +++ b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_dot_mv_eti_spec_avail.hpp.in @@ -19,6 +19,6 @@ namespace KokkosBlas { namespace Impl { @BLAS1_DOT_MV_ETI_AVAIL_BLOCK@ - } //IMPL +} //IMPL } //Kokkos #endif diff --git a/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_dot_mv_eti_spec_decl.hpp.in b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_dot_mv_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..96acefb10891 --- /dev/null +++ b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_dot_mv_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS1_DOT_MV_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS1_DOT_MV_ETI_SPEC_DECL_HPP_ +namespace KokkosBlas { +namespace Impl { +@BLAS1_DOT_MV_ETI_DECL_BLOCK@ +} //IMPL +} //Kokkos +#endif diff --git a/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_iamax_eti_spec_decl.hpp.in b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_iamax_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..17b61a885799 --- /dev/null +++ b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_iamax_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS1_IAMAX_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS1_IAMAX_ETI_SPEC_DECL_HPP_ +namespace KokkosBlas { +namespace Impl { +@BLAS1_IAMAX_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_iamax_mv_eti_spec_decl.hpp.in b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_iamax_mv_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..35d654012e95 --- /dev/null +++ b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_iamax_mv_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS1_IAMAX_MV_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS1_IAMAX_MV_ETI_SPEC_DECL_HPP_ +namespace KokkosBlas { +namespace Impl { +@BLAS1_IAMAX_MV_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_mult_eti_spec_avail.hpp.in b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_mult_eti_spec_avail.hpp.in index 0fa727b8bafb..9fe55baf2a4b 100644 --- a/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_mult_eti_spec_avail.hpp.in +++ b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_mult_eti_spec_avail.hpp.in @@ -19,6 +19,6 @@ namespace KokkosBlas { namespace Impl { @BLAS1_MULT_ETI_AVAIL_BLOCK@ - } //IMPL +} //IMPL } //Kokkos #endif diff --git a/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_mult_eti_spec_decl.hpp.in b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_mult_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..8a64db6ba640 --- /dev/null +++ b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_mult_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS1_MULT_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS1_MULT_ETI_SPEC_DECL_HPP_ +namespace KokkosBlas { +namespace Impl { +@BLAS1_MULT_ETI_DECL_BLOCK@ +} //IMPL +} //Kokkos +#endif diff --git a/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_mult_mv_eti_spec_avail.hpp.in b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_mult_mv_eti_spec_avail.hpp.in index 56f777195683..b1483c8fb0a3 100644 --- a/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_mult_mv_eti_spec_avail.hpp.in +++ b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_mult_mv_eti_spec_avail.hpp.in @@ -19,6 +19,6 @@ namespace KokkosBlas { namespace Impl { @BLAS1_MULT_MV_ETI_AVAIL_BLOCK@ - } //IMPL +} //IMPL } //Kokkos #endif diff --git a/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_mult_mv_eti_spec_decl.hpp.in b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_mult_mv_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..945d3a837a42 --- /dev/null +++ b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_mult_mv_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS1_MULT_MV_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS1_MULT_MV_ETI_SPEC_DECL_HPP_ +namespace KokkosBlas { +namespace Impl { +@BLAS1_MULT_MV_ETI_DECL_BLOCK@ +} //IMPL +} //Kokkos +#endif diff --git a/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm1_eti_spec_decl.hpp.in b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm1_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..1c9a0881222e --- /dev/null +++ b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm1_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS1_NRM1_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS1_NRM1_ETI_SPEC_DECL_HPP_ +namespace KokkosBlas { +namespace Impl { +@BLAS1_NRM1_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm1_mv_eti_spec_decl.hpp.in b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm1_mv_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..d2a322a0ad2c --- /dev/null +++ b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm1_mv_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS1_NRM1_MV_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS1_NRM1_MV_ETI_SPEC_DECL_HPP_ +namespace KokkosBlas { +namespace Impl { +@BLAS1_NRM1_MV_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm2_eti_spec_avail.hpp.in b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm2_eti_spec_avail.hpp.in index 5398eb1c3493..659c575afae8 100644 --- a/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm2_eti_spec_avail.hpp.in +++ b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm2_eti_spec_avail.hpp.in @@ -19,6 +19,6 @@ namespace KokkosBlas { namespace Impl { @BLAS1_NRM2_ETI_AVAIL_BLOCK@ - } //IMPL +} //IMPL } //Kokkos #endif diff --git a/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm2_eti_spec_decl.hpp.in b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm2_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..444776c72593 --- /dev/null +++ b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm2_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS1_NRM2_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS1_NRM2_ETI_SPEC_DECL_HPP_ +namespace KokkosBlas { +namespace Impl { +@BLAS1_NRM2_ETI_DECL_BLOCK@ +} //IMPL +} //Kokkos +#endif diff --git a/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm2_mv_eti_spec_avail.hpp.in b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm2_mv_eti_spec_avail.hpp.in index b0446fc62efe..1b1c8f3dbac0 100644 --- a/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm2_mv_eti_spec_avail.hpp.in +++ b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm2_mv_eti_spec_avail.hpp.in @@ -19,6 +19,6 @@ namespace KokkosBlas { namespace Impl { @BLAS1_NRM2_MV_ETI_AVAIL_BLOCK@ - } //IMPL +} //IMPL } //Kokkos #endif diff --git a/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm2_mv_eti_spec_decl.hpp.in b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm2_mv_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..a75bb102dc59 --- /dev/null +++ b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm2_mv_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS1_NRM2_MV_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS1_NRM2_MV_ETI_SPEC_DECL_HPP_ +namespace KokkosBlas { +namespace Impl { +@BLAS1_NRM2_MV_ETI_DECL_BLOCK@ +} //IMPL +} //Kokkos +#endif diff --git a/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm2w_eti_spec_decl.hpp.in b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm2w_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..bd7d1b11b825 --- /dev/null +++ b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm2w_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS1_NRM2W_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS1_NRM2W_ETI_SPEC_DECL_HPP_ +namespace KokkosBlas { +namespace Impl { +@BLAS1_NRM2W_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm2w_mv_eti_spec_decl.hpp.in b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm2w_mv_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..0a0aadc87a4c --- /dev/null +++ b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm2w_mv_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS1_NRM2W_MV_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS1_NRM2W_MV_ETI_SPEC_DECL_HPP_ +namespace KokkosBlas { +namespace Impl { +@BLAS1_NRM2W_MV_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_nrminf_eti_spec_decl.hpp.in b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_nrminf_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..3f1e874724b6 --- /dev/null +++ b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_nrminf_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS1_NRMINF_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS1_NRMINF_ETI_SPEC_DECL_HPP_ +namespace KokkosBlas { +namespace Impl { +@BLAS1_NRMINF_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_nrminf_mv_eti_spec_decl.hpp.in b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_nrminf_mv_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..17559306bf23 --- /dev/null +++ b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_nrminf_mv_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS1_NRMINF_MV_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS1_NRMINF_MV_ETI_SPEC_DECL_HPP_ +namespace KokkosBlas { +namespace Impl { +@BLAS1_NRMINF_MV_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_reciprocal_eti_spec_decl.hpp.in b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_reciprocal_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..7ac4b74ea496 --- /dev/null +++ b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_reciprocal_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS1_RECIPROCAL_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS1_RECIPROCAL_ETI_SPEC_DECL_HPP_ +namespace KokkosBlas { +namespace Impl { +@BLAS1_RECIPROCAL_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_reciprocal_mv_eti_spec_decl.hpp.in b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_reciprocal_mv_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..f40958465f36 --- /dev/null +++ b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_reciprocal_mv_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS1_RECIPROCAL_MV_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS1_RECIPROCAL_MV_ETI_SPEC_DECL_HPP_ +namespace KokkosBlas { +namespace Impl { +@BLAS1_RECIPROCAL_MV_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_rot_eti_spec_decl.hpp.in b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_rot_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..5e6b197460d6 --- /dev/null +++ b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_rot_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS1_ROT_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS1_ROT_ETI_SPEC_DECL_HPP_ +namespace KokkosBlas { +namespace Impl { +@BLAS1_ROT_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_rotg_eti_spec_decl.hpp.in b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_rotg_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..214dcd75c270 --- /dev/null +++ b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_rotg_eti_spec_decl.hpp.in @@ -0,0 +1,25 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS1_ROTG_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS1_ROTG_ETI_SPEC_DECL_HPP_ + +namespace KokkosBlas { +namespace Impl { +@BLAS1_ROTG_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_rotm_eti_spec_decl.hpp.in b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_rotm_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..3fdb009574fe --- /dev/null +++ b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_rotm_eti_spec_decl.hpp.in @@ -0,0 +1,25 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS1_ROTM_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS1_ROTM_ETI_SPEC_DECL_HPP_ + +namespace KokkosBlas { +namespace Impl { +@BLAS1_ROTM_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_rotmg_eti_spec_decl.hpp.in b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_rotmg_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..83465d83cff8 --- /dev/null +++ b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_rotmg_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS1_ROTMG_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS1_ROTMG_ETI_SPEC_DECL_HPP_ +namespace KokkosBlas { +namespace Impl { +@BLAS1_ROTMG_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_scal_eti_spec_decl.hpp.in b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_scal_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..56d40c9883e8 --- /dev/null +++ b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_scal_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS1_SCAL_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS1_SCAL_ETI_SPEC_DECL_HPP_ +namespace KokkosBlas { +namespace Impl { +@BLAS1_SCAL_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_scal_mv_eti_spec_decl.hpp.in b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_scal_mv_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..953f8e695404 --- /dev/null +++ b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_scal_mv_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS1_SCAL_MV_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS1_SCAL_MV_ETI_SPEC_DECL_HPP_ +namespace KokkosBlas { +namespace Impl { +@BLAS1_SCAL_MV_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_sum_eti_spec_decl.hpp.in b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_sum_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..bdac3456e852 --- /dev/null +++ b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_sum_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS1_SUM_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS1_SUM_ETI_SPEC_DECL_HPP_ +namespace KokkosBlas { +namespace Impl { +@BLAS1_SUM_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_sum_mv_eti_spec_decl.hpp.in b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_sum_mv_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..5182f619852e --- /dev/null +++ b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_sum_mv_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS1_SUM_MV_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS1_SUM_MV_ETI_SPEC_DECL_HPP_ +namespace KokkosBlas { +namespace Impl { +@BLAS1_SUM_MV_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_swap_eti_spec_decl.hpp.in b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_swap_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..e67c630a9846 --- /dev/null +++ b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_swap_eti_spec_decl.hpp.in @@ -0,0 +1,26 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +*/ +#ifndef KOKKOSBLAS1_SWAP_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS1_SWAP_ETI_SPEC_DECL_HPP_ + +namespace KokkosBlas { +namespace Impl { +@BLAS1_SWAP_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_update_eti_spec_decl.hpp.in b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_update_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..cff04c9fbef2 --- /dev/null +++ b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_update_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS1_UPDATE_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS1_UPDATE_ETI_SPEC_DECL_HPP_ +namespace KokkosBlas { +namespace Impl { +@BLAS1_UPDATE_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_update_mv_eti_spec_decl.hpp.in b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_update_mv_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..deec84712bd4 --- /dev/null +++ b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas1_update_mv_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS1_UPDATE_MV_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS1_UPDATE_MV_ETI_SPEC_DECL_HPP_ +namespace KokkosBlas { +namespace Impl { +@BLAS1_UPDATE_MV_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas2_gemv_eti_spec_avail.hpp.in b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas2_gemv_eti_spec_avail.hpp.in index 1a7cd9acce81..aa52a7d4b54d 100644 --- a/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas2_gemv_eti_spec_avail.hpp.in +++ b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas2_gemv_eti_spec_avail.hpp.in @@ -19,6 +19,6 @@ namespace KokkosBlas { namespace Impl { @BLAS2_GEMV_ETI_AVAIL_BLOCK@ - } //IMPL +} //IMPL } //Kokkos #endif diff --git a/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas2_gemv_eti_spec_decl.hpp.in b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas2_gemv_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..606ec52ae2da --- /dev/null +++ b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas2_gemv_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS2_GEMV_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS2_GEMV_ETI_SPEC_DECL_HPP_ +namespace KokkosBlas { +namespace Impl { +@BLAS2_GEMV_ETI_DECL_BLOCK@ +} //IMPL +} //Kokkos +#endif diff --git a/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas2_ger_eti_spec_decl.hpp.in b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas2_ger_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..3ca1a64a8ea4 --- /dev/null +++ b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas2_ger_eti_spec_decl.hpp.in @@ -0,0 +1,25 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS2_GER_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS2_GER_ETI_SPEC_DECL_HPP_ + +namespace KokkosBlas { +namespace Impl { +@BLAS2_GER_ETI_DECL_BLOCK@ +} // namespace Impl +} // namespace KokkosBlas +#endif diff --git a/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas2_syr2_eti_spec_decl.hpp.in b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas2_syr2_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..315f62fdfb79 --- /dev/null +++ b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas2_syr2_eti_spec_decl.hpp.in @@ -0,0 +1,25 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS2_SYR2_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS2_SYR2_ETI_SPEC_DECL_HPP_ + +namespace KokkosBlas { +namespace Impl { +@BLAS2_SYR2_ETI_DECL_BLOCK@ +} //IMPL +} //Kokkos +#endif diff --git a/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas2_syr_eti_spec_decl.hpp.in b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas2_syr_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..370c6bf96c9c --- /dev/null +++ b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas2_syr_eti_spec_decl.hpp.in @@ -0,0 +1,25 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS2_SYR_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS2_SYR_ETI_SPEC_DECL_HPP_ + +namespace KokkosBlas { +namespace Impl { +@BLAS2_SYR_ETI_DECL_BLOCK@ +} //IMPL +} //Kokkos +#endif diff --git a/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas3_gemm_eti_spec_decl.hpp.in b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas3_gemm_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..22ea9a1ed127 --- /dev/null +++ b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas3_gemm_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS3_GEMM_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS3_GEMM_ETI_SPEC_DECL_HPP_ +namespace KokkosBlas { +namespace Impl { +@BLAS3_GEMM_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas3_trmm_eti_spec_decl.hpp.in b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas3_trmm_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..e802ccf4fc57 --- /dev/null +++ b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas3_trmm_eti_spec_decl.hpp.in @@ -0,0 +1,26 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS3_TRMM_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS3_TRMM_ETI_SPEC_DECL_HPP_ +namespace KokkosBlas { +namespace Impl { + +@BLAS3_TRMM_ETI_DECL_BLOCK@ + +} // Impl +} // KokkosBlas +#endif // KOKKOSBLAS3_TRMM_ETI_SPEC_DECL_HPP_ diff --git a/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas3_trsm_eti_spec_decl.hpp.in b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas3_trsm_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..d1b6384e788d --- /dev/null +++ b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas3_trsm_eti_spec_decl.hpp.in @@ -0,0 +1,26 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef KOKKOSBLAS3_TRSM_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS3_TRSM_ETI_SPEC_DECL_HPP_ + +namespace KokkosBlas { +namespace Impl { + +@BLAS3_TRSM_ETI_DECL_BLOCK@ + +} // Impl +} // KokkosBlas +#endif // KOKKOSBLAS3_TRSM_ETI_SPEC_DECL_HPP_ diff --git a/packages/kokkos-kernels/blas/impl/KokkosBlas1_abs_impl.hpp b/packages/kokkos-kernels/blas/impl/KokkosBlas1_abs_impl.hpp index 0c674f25f5e8..181708e94faa 100644 --- a/packages/kokkos-kernels/blas/impl/KokkosBlas1_abs_impl.hpp +++ b/packages/kokkos-kernels/blas/impl/KokkosBlas1_abs_impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef KOKKOS_BLAS1_IMPL_ABS_HPP_ -#define KOKKOS_BLAS1_IMPL_ABS_HPP_ +#ifndef KOKKOSBLAS1_IMPL_ABS_HPP_ +#define KOKKOSBLAS1_IMPL_ABS_HPP_ #include #include @@ -201,4 +201,4 @@ void V_Abs_Generic(const execution_space& space, const RV& R, const XV& X) { } // namespace Impl } // namespace KokkosBlas -#endif // KOKKOS_BLAS1_MV_IMPL_ABS_HPP_ +#endif // KOKKOSBLAS1_IMPL_ABS_HPP_ diff --git a/packages/kokkos-kernels/blas/impl/KokkosBlas1_abs_spec.hpp b/packages/kokkos-kernels/blas/impl/KokkosBlas1_abs_spec.hpp index fb6357b38ee1..a00daf0683d0 100644 --- a/packages/kokkos-kernels/blas/impl/KokkosBlas1_abs_spec.hpp +++ b/packages/kokkos-kernels/blas/impl/KokkosBlas1_abs_spec.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef KOKKOS_BLAS1_IMPL_ABS_SPEC_HPP_ -#define KOKKOS_BLAS1_IMPL_ABS_SPEC_HPP_ +#ifndef KOKKOSBLAS1_IMPL_ABS_SPEC_HPP_ +#define KOKKOSBLAS1_IMPL_ABS_SPEC_HPP_ #include #include @@ -229,5 +229,7 @@ struct Abs; #include +#include +#include -#endif // KOKKOS_BLAS1_MV_IMPL_ABS_HPP_ +#endif // KOKKOSBLAS1_IMPL_ABS_SPEC_HPP_ diff --git a/packages/kokkos-kernels/blas/impl/KokkosBlas1_axpby_spec.hpp b/packages/kokkos-kernels/blas/impl/KokkosBlas1_axpby_spec.hpp index f4f85c8f6b42..37d3ccd56201 100644 --- a/packages/kokkos-kernels/blas/impl/KokkosBlas1_axpby_spec.hpp +++ b/packages/kokkos-kernels/blas/impl/KokkosBlas1_axpby_spec.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef KOKKOS_BLAS1_AXPBY_SPEC_HPP_ -#define KOKKOS_BLAS1_AXPBY_SPEC_HPP_ +#ifndef KOKKOSBLAS1_AXPBY_SPEC_HPP_ +#define KOKKOSBLAS1_AXPBY_SPEC_HPP_ #include "KokkosKernels_config.h" #include "Kokkos_Core.hpp" @@ -513,6 +513,8 @@ struct Axpby, Kokkos::MemoryTraits >, \ 1, false, true>; +#include + #define KOKKOSBLAS1_AXPBY_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ template struct Axpby< \ EXEC_SPACE, SCALAR, \ @@ -559,6 +561,8 @@ struct Axpby, Kokkos::MemoryTraits >, \ 2, false, true>; +#include + #define KOKKOSBLAS1_AXPBY_MV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ template struct Axpby< \ EXEC_SPACE, SCALAR, \ @@ -580,4 +584,4 @@ struct Axpby -#endif // KOKKOS_BLAS1_MV_IMPL_AXPBY_HPP_ +#endif // KOKKOSBLAS1_AXPBY_SPEC_HPP_ diff --git a/packages/kokkos-kernels/blas/impl/KokkosBlas1_axpby_unification_attempt_traits.hpp b/packages/kokkos-kernels/blas/impl/KokkosBlas1_axpby_unification_attempt_traits.hpp index 0a0300780166..84d91a106a4a 100644 --- a/packages/kokkos-kernels/blas/impl/KokkosBlas1_axpby_unification_attempt_traits.hpp +++ b/packages/kokkos-kernels/blas/impl/KokkosBlas1_axpby_unification_attempt_traits.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef KOKKOS_BLAS1_AXPBY_UNIFICATION_ATTEMPT_TRAITS_HPP_ -#define KOKKOS_BLAS1_AXPBY_UNIFICATION_ATTEMPT_TRAITS_HPP_ +#ifndef KOKKOSBLAS1_AXPBY_UNIFICATION_ATTEMPT_TRAITS_HPP_ +#define KOKKOSBLAS1_AXPBY_UNIFICATION_ATTEMPT_TRAITS_HPP_ #include #include @@ -103,7 +103,7 @@ struct AxpbyUnificationAttemptTraits { // - type names begin with upper case letters // ******************************************************************** public: - static constexpr bool onDevice = KokkosKernels::Impl::kk_is_gpu_exec_space(); + static constexpr bool onDevice = KokkosKernels::Impl::is_gpu_exec_space_v; private: static constexpr bool onHost = !onDevice; @@ -816,4 +816,4 @@ static void populateRank1Stride1ViewWithScalarOrNonStrideView(T_in const& coeff_ } // namespace Impl } // namespace KokkosBlas -#endif // KOKKOS_BLAS1_AXPBY_UNIFICATION_ATTEMPT_TRAITS_HPP_ +#endif // KOKKOSBLAS1_AXPBY_UNIFICATION_ATTEMPT_TRAITS_HPP_ diff --git a/packages/kokkos-kernels/blas/impl/KokkosBlas1_dot_spec.hpp b/packages/kokkos-kernels/blas/impl/KokkosBlas1_dot_spec.hpp index 982e2eaa0c7b..e2b2cc28c85a 100644 --- a/packages/kokkos-kernels/blas/impl/KokkosBlas1_dot_spec.hpp +++ b/packages/kokkos-kernels/blas/impl/KokkosBlas1_dot_spec.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef KOKKOS_BLAS1_IMPL_DOT_SPEC_HPP_ -#define KOKKOS_BLAS1_IMPL_DOT_SPEC_HPP_ +#ifndef KOKKOSBLAS1_IMPL_DOT_SPEC_HPP_ +#define KOKKOSBLAS1_IMPL_DOT_SPEC_HPP_ #include #include @@ -410,6 +410,8 @@ struct Dot>, \ true>; +#include + #define KOKKOSBLAS1_DOT_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ template struct Dot>, \ @@ -477,6 +479,8 @@ struct Dot>, \ 1, 2, false, true>; +#include + #define KOKKOSBLAS1_DOT_MV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ template struct Dot< \ EXEC_SPACE, \ @@ -508,4 +512,4 @@ struct Dot -#endif // KOKKOS_BLAS1_MV_IMPL_DOT_HPP_ +#endif // KOKKOSBLAS1_IMPL_DOT_SPEC_HPP_ diff --git a/packages/kokkos-kernels/blas/impl/KokkosBlas1_iamax_spec.hpp b/packages/kokkos-kernels/blas/impl/KokkosBlas1_iamax_spec.hpp index 80e4cb603643..28978c034ed7 100644 --- a/packages/kokkos-kernels/blas/impl/KokkosBlas1_iamax_spec.hpp +++ b/packages/kokkos-kernels/blas/impl/KokkosBlas1_iamax_spec.hpp @@ -294,5 +294,7 @@ struct Iamax +#include +#include #endif // KOKKOSBLAS1_IAMAX_SPEC_HPP_ diff --git a/packages/kokkos-kernels/blas/impl/KokkosBlas1_mult_spec.hpp b/packages/kokkos-kernels/blas/impl/KokkosBlas1_mult_spec.hpp index 3cd847dc1de6..3b8904b1dfa2 100644 --- a/packages/kokkos-kernels/blas/impl/KokkosBlas1_mult_spec.hpp +++ b/packages/kokkos-kernels/blas/impl/KokkosBlas1_mult_spec.hpp @@ -229,6 +229,8 @@ struct Mult >, \ 1, false, true>; +#include + #define KOKKOSBLAS1_MULT_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ template struct Mult< \ EXEC_SPACE, \ @@ -257,6 +259,8 @@ struct Mult >, \ 2, false, true>; +#include + #define KOKKOSBLAS1_MULT_MV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ template struct Mult< \ EXEC_SPACE, \ diff --git a/packages/kokkos-kernels/blas/impl/KokkosBlas1_nrm1_spec.hpp b/packages/kokkos-kernels/blas/impl/KokkosBlas1_nrm1_spec.hpp index 3977c5225c71..25eeae5ee2df 100644 --- a/packages/kokkos-kernels/blas/impl/KokkosBlas1_nrm1_spec.hpp +++ b/packages/kokkos-kernels/blas/impl/KokkosBlas1_nrm1_spec.hpp @@ -244,5 +244,7 @@ struct Nrm1; #include +#include +#include #endif // KOKKOSBLAS1_NRM1_SPEC_HPP_ diff --git a/packages/kokkos-kernels/blas/impl/KokkosBlas1_nrm2_spec.hpp b/packages/kokkos-kernels/blas/impl/KokkosBlas1_nrm2_spec.hpp index 4d0b2e139673..b13290cc1eee 100644 --- a/packages/kokkos-kernels/blas/impl/KokkosBlas1_nrm2_spec.hpp +++ b/packages/kokkos-kernels/blas/impl/KokkosBlas1_nrm2_spec.hpp @@ -199,6 +199,8 @@ struct Nrm2 >, \ 1, false, true>; +#include + // // Macro for definition of full specialization of // KokkosBlas::Impl::Nrm2 for rank == 2. This is NOT for users!!! We @@ -229,6 +231,8 @@ struct Nrm2 >, \ 2, false, true>; +#include + // // Macro for definition of full specialization of // KokkosBlas::Impl::Nrm2 for rank == 2. This is NOT for users!!! We diff --git a/packages/kokkos-kernels/blas/impl/KokkosBlas1_nrm2w_spec.hpp b/packages/kokkos-kernels/blas/impl/KokkosBlas1_nrm2w_spec.hpp index 566083213946..99f82c535b23 100644 --- a/packages/kokkos-kernels/blas/impl/KokkosBlas1_nrm2w_spec.hpp +++ b/packages/kokkos-kernels/blas/impl/KokkosBlas1_nrm2w_spec.hpp @@ -97,15 +97,15 @@ struct Nrm2w::value, "KokkosBlas::Impl::" - "Nrm2w<1-D>: RMV is not a Kokkos::View."); + "Nrm2w: RMV is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" - "Nrm2w<1-D>: XMV is not a Kokkos::View."); + "Nrm2w: XMV is not a Kokkos::View."); static_assert(RMV::rank == 0, - "KokkosBlas::Impl::Nrm2w<1-D>: " + "KokkosBlas::Impl::Nrm2w: " "RMV is not rank 0."); static_assert(XMV::rank == 1, - "KokkosBlas::Impl::Nrm2w<1-D>: " + "KokkosBlas::Impl::Nrm2w: " "XMV is not rank 1."); Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::nrm2w[ETI]" : "KokkosBlas::nrm2w[noETI]"); @@ -135,15 +135,15 @@ struct Nrm2w::value, "KokkosBlas::Impl::" - "Nrm2w<2-D>: RV is not a Kokkos::View."); + "Nrm2w_mv: RV is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" - "Nrm2w<2-D>: XMV is not a Kokkos::View."); + "Nrm2w_mv: XMV is not a Kokkos::View."); static_assert(RV::rank == 1, - "KokkosBlas::Impl::Nrm2w<2-D>: " + "KokkosBlas::Impl::Nrm2w_mv: " "RV is not rank 1."); static_assert(XMV::rank == 2, - "KokkosBlas::Impl::Nrm2w<2-D>: " + "KokkosBlas::Impl::Nrm2w_mv: " "XMV is not rank 2."); Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::nrm2w[ETI]" : "KokkosBlas::nrm2w[noETI]"); @@ -191,7 +191,8 @@ struct Nrm2w::mag_type, \ + extern template struct Nrm2w::mag_type, \ LAYOUT, Kokkos::HostSpace, Kokkos::MemoryTraits >, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -240,5 +241,7 @@ struct Nrm2w; #include +#include +#include #endif // KOKKOSBLAS1_NRM2W_SPEC_HPP_ diff --git a/packages/kokkos-kernels/blas/impl/KokkosBlas1_nrminf_spec.hpp b/packages/kokkos-kernels/blas/impl/KokkosBlas1_nrminf_spec.hpp index e7b365ce854c..eee326830b05 100644 --- a/packages/kokkos-kernels/blas/impl/KokkosBlas1_nrminf_spec.hpp +++ b/packages/kokkos-kernels/blas/impl/KokkosBlas1_nrminf_spec.hpp @@ -233,5 +233,7 @@ struct NrmInf; #include +#include +#include #endif // KOKKOSBLAS1_NRMINF_SPEC_HPP_ diff --git a/packages/kokkos-kernels/blas/impl/KokkosBlas1_reciprocal_impl.hpp b/packages/kokkos-kernels/blas/impl/KokkosBlas1_reciprocal_impl.hpp index 7ad6ab95dbea..b86ef116fb15 100644 --- a/packages/kokkos-kernels/blas/impl/KokkosBlas1_reciprocal_impl.hpp +++ b/packages/kokkos-kernels/blas/impl/KokkosBlas1_reciprocal_impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef KOKKOS_BLAS1_IMPL_RECIPROCAL_HPP_ -#define KOKKOS_BLAS1_IMPL_RECIPROCAL_HPP_ +#ifndef KOKKOSBLAS1_IMPL_RECIPROCAL_HPP_ +#define KOKKOSBLAS1_IMPL_RECIPROCAL_HPP_ #include #include @@ -203,4 +203,4 @@ void V_Reciprocal_Generic(const execution_space& space, const RV& R, const XV& X } // namespace Impl } // namespace KokkosBlas -#endif // KOKKOS_BLAS1_MV_IMPL_RECIPROCAL_HPP_ +#endif // KOKKOSBLAS1_IMPL_RECIPROCAL_HPP_ diff --git a/packages/kokkos-kernels/blas/impl/KokkosBlas1_reciprocal_spec.hpp b/packages/kokkos-kernels/blas/impl/KokkosBlas1_reciprocal_spec.hpp index 988043511b0f..efd6f5a9fe82 100644 --- a/packages/kokkos-kernels/blas/impl/KokkosBlas1_reciprocal_spec.hpp +++ b/packages/kokkos-kernels/blas/impl/KokkosBlas1_reciprocal_spec.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef KOKKOS_BLAS1_IMPL_RECIPROCAL_SPEC_HPP_ -#define KOKKOS_BLAS1_IMPL_RECIPROCAL_SPEC_HPP_ +#ifndef KOKKOSBLAS1_IMPL_RECIPROCAL_SPEC_HPP_ +#define KOKKOSBLAS1_IMPL_RECIPROCAL_SPEC_HPP_ #include #include @@ -230,5 +230,7 @@ struct Reciprocal; #include +#include +#include -#endif // KOKKOS_BLAS1_MV_IMPL_RECIPROCAL_HPP_ +#endif // KOKKOSBLAS1_IMPL_RECIPROCAL_SPEC_HPP_ diff --git a/packages/kokkos-kernels/blas/impl/KokkosBlas1_rot_spec.hpp b/packages/kokkos-kernels/blas/impl/KokkosBlas1_rot_spec.hpp index 4ca4d8d1ef97..493cd648cf2c 100644 --- a/packages/kokkos-kernels/blas/impl/KokkosBlas1_rot_spec.hpp +++ b/packages/kokkos-kernels/blas/impl/KokkosBlas1_rot_spec.hpp @@ -124,5 +124,6 @@ struct Rot; #include +#include #endif // KOKKOSBLAS1_ROT_SPEC_HPP_ diff --git a/packages/kokkos-kernels/blas/impl/KokkosBlas1_rotg_spec.hpp b/packages/kokkos-kernels/blas/impl/KokkosBlas1_rotg_spec.hpp index 87618f12c991..12752ab71c9b 100644 --- a/packages/kokkos-kernels/blas/impl/KokkosBlas1_rotg_spec.hpp +++ b/packages/kokkos-kernels/blas/impl/KokkosBlas1_rotg_spec.hpp @@ -123,5 +123,6 @@ struct Rotg; #include +#include #endif // KOKKOSBLAS1_ROTG_SPEC_HPP_ diff --git a/packages/kokkos-kernels/blas/impl/KokkosBlas1_rotm_spec.hpp b/packages/kokkos-kernels/blas/impl/KokkosBlas1_rotm_spec.hpp index 5000b35fc37a..8442a7b3e47c 100644 --- a/packages/kokkos-kernels/blas/impl/KokkosBlas1_rotm_spec.hpp +++ b/packages/kokkos-kernels/blas/impl/KokkosBlas1_rotm_spec.hpp @@ -120,5 +120,6 @@ struct Rotm; #include +#include #endif // KOKKOSBLAS1_ROTM_SPEC_HPP_ diff --git a/packages/kokkos-kernels/blas/impl/KokkosBlas1_rotmg_spec.hpp b/packages/kokkos-kernels/blas/impl/KokkosBlas1_rotmg_spec.hpp index caa44dda5dbb..631cee114e6f 100644 --- a/packages/kokkos-kernels/blas/impl/KokkosBlas1_rotmg_spec.hpp +++ b/packages/kokkos-kernels/blas/impl/KokkosBlas1_rotmg_spec.hpp @@ -124,5 +124,6 @@ struct Rotmg; #include +#include #endif // KOKKOSBLAS1_ROTMG_SPEC_HPP_ diff --git a/packages/kokkos-kernels/blas/impl/KokkosBlas1_scal_spec.hpp b/packages/kokkos-kernels/blas/impl/KokkosBlas1_scal_spec.hpp index 70a95d33e2b3..a52e9a3b6120 100644 --- a/packages/kokkos-kernels/blas/impl/KokkosBlas1_scal_spec.hpp +++ b/packages/kokkos-kernels/blas/impl/KokkosBlas1_scal_spec.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef KOKKOS_BLAS1_IMPL_SCAL_SPEC_HPP_ -#define KOKKOS_BLAS1_IMPL_SCAL_SPEC_HPP_ +#ifndef KOKKOSBLAS1_IMPL_SCAL_SPEC_HPP_ +#define KOKKOSBLAS1_IMPL_SCAL_SPEC_HPP_ #include #include @@ -292,6 +292,8 @@ struct Scal >, \ 1, false, true>; +#include + #define KOKKOSBLAS1_SCAL_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ template struct Scal< \ EXEC_SPACE, \ @@ -324,6 +326,8 @@ struct Scal >, \ 2, false, true>; +#include + #define KOKKOSBLAS1_SCAL_MV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ template struct Scal< \ EXEC_SPACE, \ @@ -343,4 +347,4 @@ struct Scal -#endif // KOKKOS_BLAS1_MV_IMPL_SCAL_HPP_ +#endif // KOKKOSBLAS1_IMPL_SCAL_SPEC_HPP_ diff --git a/packages/kokkos-kernels/blas/impl/KokkosBlas1_set_impl.hpp b/packages/kokkos-kernels/blas/impl/KokkosBlas1_set_impl.hpp index 037720253bca..c676246b68de 100644 --- a/packages/kokkos-kernels/blas/impl/KokkosBlas1_set_impl.hpp +++ b/packages/kokkos-kernels/blas/impl/KokkosBlas1_set_impl.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef __KOKKOSBLAS_SET_IMPL_HPP__ -#define __KOKKOSBLAS_SET_IMPL_HPP__ +#ifndef KOKKOSBLAS_SET_IMPL_HPP +#define KOKKOSBLAS_SET_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/blas/impl/KokkosBlas1_sum_spec.hpp b/packages/kokkos-kernels/blas/impl/KokkosBlas1_sum_spec.hpp index 6df41e030902..2ade8b49a17f 100644 --- a/packages/kokkos-kernels/blas/impl/KokkosBlas1_sum_spec.hpp +++ b/packages/kokkos-kernels/blas/impl/KokkosBlas1_sum_spec.hpp @@ -240,5 +240,7 @@ struct Sum; #include +#include +#include #endif // KOKKOSBLAS1_SUM_SPEC_HPP_ diff --git a/packages/kokkos-kernels/blas/impl/KokkosBlas1_swap_spec.hpp b/packages/kokkos-kernels/blas/impl/KokkosBlas1_swap_spec.hpp index 749552a81c30..314795125bb7 100644 --- a/packages/kokkos-kernels/blas/impl/KokkosBlas1_swap_spec.hpp +++ b/packages/kokkos-kernels/blas/impl/KokkosBlas1_swap_spec.hpp @@ -120,5 +120,6 @@ struct Swap; #include +#include #endif // KOKKOSBLAS1_SWAP_SPEC_HPP_ diff --git a/packages/kokkos-kernels/blas/impl/KokkosBlas1_update_spec.hpp b/packages/kokkos-kernels/blas/impl/KokkosBlas1_update_spec.hpp index b031a529b8d9..131ace8c25f4 100644 --- a/packages/kokkos-kernels/blas/impl/KokkosBlas1_update_spec.hpp +++ b/packages/kokkos-kernels/blas/impl/KokkosBlas1_update_spec.hpp @@ -335,5 +335,7 @@ struct Update; #include +#include +#include #endif // KOKKOSBLAS1_UPDATE_SPEC_HPP_ diff --git a/packages/kokkos-kernels/blas/impl/KokkosBlas2_gemv_impl.hpp b/packages/kokkos-kernels/blas/impl/KokkosBlas2_gemv_impl.hpp index b1976e262273..6ae3c90582e5 100644 --- a/packages/kokkos-kernels/blas/impl/KokkosBlas2_gemv_impl.hpp +++ b/packages/kokkos-kernels/blas/impl/KokkosBlas2_gemv_impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef KOKKOS_BLAS2_MV_IMPL_GEMV_HPP_ -#define KOKKOS_BLAS2_MV_IMPL_GEMV_HPP_ +#ifndef KOKKOSBLAS2_MV_IMPL_GEMV_HPP_ +#define KOKKOSBLAS2_MV_IMPL_GEMV_HPP_ #include "KokkosKernels_config.h" #include "Kokkos_Core.hpp" @@ -623,7 +623,7 @@ void twoLevelGemv(const ExecutionSpace& space, const char trans[], typename AVie // depending on whether execution space is CPU or GPU. enable_if makes sure // unused kernels are not instantiated. template ()>::type* = nullptr> + typename std::enable_if>::type* = nullptr> void generalGemvImpl(const ExecutionSpace& space, const char trans[], typename AViewType::const_value_type& alpha, const AViewType& A, const XViewType& x, typename YViewType::const_value_type& beta, const YViewType& y) { @@ -631,7 +631,7 @@ void generalGemvImpl(const ExecutionSpace& space, const char trans[], typename A } template ()>::type* = nullptr> + typename std::enable_if>::type* = nullptr> void generalGemvImpl(const ExecutionSpace& space, const char trans[], typename AViewType::const_value_type& alpha, const AViewType& A, const XViewType& x, typename YViewType::const_value_type& beta, const YViewType& y) { @@ -641,4 +641,4 @@ void generalGemvImpl(const ExecutionSpace& space, const char trans[], typename A } // namespace Impl } // namespace KokkosBlas -#endif // KOKKOS_BLAS2_MV_IMPL_GEMV_HPP_ +#endif // KOKKOSBLAS2_MV_IMPL_GEMV_HPP_ diff --git a/packages/kokkos-kernels/blas/impl/KokkosBlas2_gemv_spec.hpp b/packages/kokkos-kernels/blas/impl/KokkosBlas2_gemv_spec.hpp index 05e2d28bc7da..b56ac4c1f68c 100644 --- a/packages/kokkos-kernels/blas/impl/KokkosBlas2_gemv_spec.hpp +++ b/packages/kokkos-kernels/blas/impl/KokkosBlas2_gemv_spec.hpp @@ -121,6 +121,8 @@ struct GEMV { Kokkos::View, Kokkos::MemoryTraits >, \ false, true>; +#include + #define KOKKOSBLAS2_GEMV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ template struct GEMV< \ EXEC_SPACE, \ diff --git a/packages/kokkos-kernels/blas/impl/KokkosBlas2_ger_impl.hpp b/packages/kokkos-kernels/blas/impl/KokkosBlas2_ger_impl.hpp index 94eb1868f97b..bfcff1fb0e38 100644 --- a/packages/kokkos-kernels/blas/impl/KokkosBlas2_ger_impl.hpp +++ b/packages/kokkos-kernels/blas/impl/KokkosBlas2_ger_impl.hpp @@ -208,14 +208,14 @@ void teamParallelGer(const ExecutionSpace& space, const char trans[], const type // The 'enable_if' makes sure unused kernels are not instantiated. template ()>::type* = nullptr> + typename std::enable_if>::type* = nullptr> void generalGerImpl(const ExecutionSpace& space, const char trans[], const typename AViewType::const_value_type& alpha, const XViewType& x, const YViewType& y, const AViewType& A) { threadParallelGer(space, trans, alpha, x, y, A); } template ()>::type* = nullptr> + typename std::enable_if>::type* = nullptr> void generalGerImpl(const ExecutionSpace& space, const char trans[], const typename AViewType::const_value_type& alpha, const XViewType& x, const YViewType& y, const AViewType& A) { teamParallelGer(space, trans, alpha, x, y, A); diff --git a/packages/kokkos-kernels/blas/impl/KokkosBlas2_ger_spec.hpp b/packages/kokkos-kernels/blas/impl/KokkosBlas2_ger_spec.hpp index 04e25ab42225..806678fb2598 100644 --- a/packages/kokkos-kernels/blas/impl/KokkosBlas2_ger_spec.hpp +++ b/packages/kokkos-kernels/blas/impl/KokkosBlas2_ger_spec.hpp @@ -125,5 +125,6 @@ struct GER { false, true>; #include +#include #endif // KOKKOSBLAS2_GER_SPEC_HPP_ diff --git a/packages/kokkos-kernels/blas/impl/KokkosBlas2_serial_gemv_impl.hpp b/packages/kokkos-kernels/blas/impl/KokkosBlas2_serial_gemv_impl.hpp index 79f49fdd0e4d..d9fcb098158c 100644 --- a/packages/kokkos-kernels/blas/impl/KokkosBlas2_serial_gemv_impl.hpp +++ b/packages/kokkos-kernels/blas/impl/KokkosBlas2_serial_gemv_impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBLAS_GEMV_SERIAL_IMPL_HPP__ -#define __KOKKOSBLAS_GEMV_SERIAL_IMPL_HPP__ +#ifndef KOKKOSBLAS_GEMV_SERIAL_IMPL_HPP +#define KOKKOSBLAS_GEMV_SERIAL_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/blas/impl/KokkosBlas2_serial_gemv_inner_multiple_dot.hpp b/packages/kokkos-kernels/blas/impl/KokkosBlas2_serial_gemv_inner_multiple_dot.hpp index 1b70413119be..1a41ff4db3ba 100644 --- a/packages/kokkos-kernels/blas/impl/KokkosBlas2_serial_gemv_inner_multiple_dot.hpp +++ b/packages/kokkos-kernels/blas/impl/KokkosBlas2_serial_gemv_inner_multiple_dot.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBLAS_INNER_MULTIPLE_DOT_PRODUCT_SERIAL_IMPL_HPP__ -#define __KOKKOSBLAS_INNER_MULTIPLE_DOT_PRODUCT_SERIAL_IMPL_HPP__ +#ifndef KOKKOSBLAS_INNER_MULTIPLE_DOT_PRODUCT_SERIAL_IMPL_HPP +#define KOKKOSBLAS_INNER_MULTIPLE_DOT_PRODUCT_SERIAL_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) @@ -356,4 +356,4 @@ KOKKOS_INLINE_FUNCTION int InnerMultipleDotProduct<1>::serial_invoke(const Scala } // namespace Impl } // namespace KokkosBlas -#endif // __KOKKOSBLAS_INNER_MULTIPLE_DOT_PRODUCT_SERIAL_IMPL_HPP__ +#endif // KOKKOSBLAS_INNER_MULTIPLE_DOT_PRODUCT_SERIAL_IMPL_HPP diff --git a/packages/kokkos-kernels/blas/impl/KokkosBlas2_serial_gemv_internal.hpp b/packages/kokkos-kernels/blas/impl/KokkosBlas2_serial_gemv_internal.hpp index 912972c7ee4c..d0be91e84b87 100644 --- a/packages/kokkos-kernels/blas/impl/KokkosBlas2_serial_gemv_internal.hpp +++ b/packages/kokkos-kernels/blas/impl/KokkosBlas2_serial_gemv_internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBLAS_GEMV_SERIAL_INTERNAL_HPP__ -#define __KOKKOSBLAS_GEMV_SERIAL_INTERNAL_HPP__ +#ifndef KOKKOSBLAS_GEMV_SERIAL_INTERNAL_HPP +#define KOKKOSBLAS_GEMV_SERIAL_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/packages/kokkos-kernels/blas/impl/KokkosBlas2_syr2_impl.hpp b/packages/kokkos-kernels/blas/impl/KokkosBlas2_syr2_impl.hpp index 7bcb0069ab1c..5f134c097dc4 100644 --- a/packages/kokkos-kernels/blas/impl/KokkosBlas2_syr2_impl.hpp +++ b/packages/kokkos-kernels/blas/impl/KokkosBlas2_syr2_impl.hpp @@ -292,7 +292,7 @@ void teamParallelSyr2(const ExecutionSpace& space, const typename AViewType::con template ()>::type* = nullptr> + typename std::enable_if>::type* = nullptr> void generalSyr2Impl(const ExecutionSpace& space, const typename AViewType::const_value_type& alpha, const XViewType& x, const YViewType& y, const AViewType& A) { threadParallelSyr2(space, alpha, @@ -301,7 +301,7 @@ void generalSyr2Impl(const ExecutionSpace& space, const typename AViewType::cons template ()>::type* = nullptr> + typename std::enable_if>::type* = nullptr> void generalSyr2Impl(const ExecutionSpace& space, const typename AViewType::const_value_type& alpha, const XViewType& x, const YViewType& y, const AViewType& A) { teamParallelSyr2(space, alpha, x, diff --git a/packages/kokkos-kernels/blas/impl/KokkosBlas2_syr2_spec.hpp b/packages/kokkos-kernels/blas/impl/KokkosBlas2_syr2_spec.hpp index a8ae741edec2..7e4b1bcab202 100644 --- a/packages/kokkos-kernels/blas/impl/KokkosBlas2_syr2_spec.hpp +++ b/packages/kokkos-kernels/blas/impl/KokkosBlas2_syr2_spec.hpp @@ -154,5 +154,6 @@ struct SYR2 { false, true>; #include +#include #endif // KOKKOSBLAS2_SYR2_SPEC_HPP_ diff --git a/packages/kokkos-kernels/blas/impl/KokkosBlas2_syr_impl.hpp b/packages/kokkos-kernels/blas/impl/KokkosBlas2_syr_impl.hpp index 7685fd4b4b3d..11aa3a50522f 100644 --- a/packages/kokkos-kernels/blas/impl/KokkosBlas2_syr_impl.hpp +++ b/packages/kokkos-kernels/blas/impl/KokkosBlas2_syr_impl.hpp @@ -206,14 +206,14 @@ void teamParallelSyr(const ExecutionSpace& space, const typename AViewType::cons // The 'enable_if' makes sure unused kernels are not instantiated. template ()>::type* = nullptr> + typename std::enable_if>::type* = nullptr> void generalSyrImpl(const ExecutionSpace& space, const typename AViewType::const_value_type& alpha, const XViewType& x, const AViewType& A) { threadParallelSyr(space, alpha, x, A); } template ()>::type* = nullptr> + typename std::enable_if>::type* = nullptr> void generalSyrImpl(const ExecutionSpace& space, const typename AViewType::const_value_type& alpha, const XViewType& x, const AViewType& A) { teamParallelSyr(space, alpha, x, A); diff --git a/packages/kokkos-kernels/blas/impl/KokkosBlas2_syr_spec.hpp b/packages/kokkos-kernels/blas/impl/KokkosBlas2_syr_spec.hpp index 58c77536182a..74112c30cd12 100644 --- a/packages/kokkos-kernels/blas/impl/KokkosBlas2_syr_spec.hpp +++ b/packages/kokkos-kernels/blas/impl/KokkosBlas2_syr_spec.hpp @@ -146,5 +146,6 @@ struct SYR { false, true>; #include +#include #endif // KOKKOSBLAS2_SYR_SPEC_HPP_ diff --git a/packages/kokkos-kernels/blas/impl/KokkosBlas3_gemm_dotbased_impl.hpp b/packages/kokkos-kernels/blas/impl/KokkosBlas3_gemm_dotbased_impl.hpp index 15c3c74ecd66..72a08ec749ea 100644 --- a/packages/kokkos-kernels/blas/impl/KokkosBlas3_gemm_dotbased_impl.hpp +++ b/packages/kokkos-kernels/blas/impl/KokkosBlas3_gemm_dotbased_impl.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef KOKKOS_BLAS3_GEMM_DOTBASED_IMPL_HPP_ -#define KOKKOS_BLAS3_GEMM_DOTBASED_IMPL_HPP_ +#ifndef KOKKOSBLAS3_GEMM_DOTBASED_IMPL_HPP_ +#define KOKKOSBLAS3_GEMM_DOTBASED_IMPL_HPP_ #include "KokkosBlas_util.hpp" @@ -141,4 +141,4 @@ struct DotBasedGEMM { } // namespace Impl } // namespace KokkosBlas -#endif +#endif // KOKKOSBLAS3_GEMM_DOTBASED_IMPL_HPP_ diff --git a/packages/kokkos-kernels/blas/impl/KokkosBlas3_gemm_impl.hpp b/packages/kokkos-kernels/blas/impl/KokkosBlas3_gemm_impl.hpp index 675ef5d3a4c5..38f3fe5f8d4b 100644 --- a/packages/kokkos-kernels/blas/impl/KokkosBlas3_gemm_impl.hpp +++ b/packages/kokkos-kernels/blas/impl/KokkosBlas3_gemm_impl.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef KOKKOS_BLAS3_GEMM_IMPL_HPP_ -#define KOKKOS_BLAS3_GEMM_IMPL_HPP_ +#ifndef KOKKOSBLAS3_GEMM_IMPL_HPP_ +#define KOKKOSBLAS3_GEMM_IMPL_HPP_ #include #include "KokkosKernels_Macros.hpp" @@ -23,7 +23,7 @@ #ifdef KOKKOS_ENABLE_CXX14 #ifdef KOKKOS_COMPILER_GNU #if KOKKOS_COMPILER_GNU <= 740 -#define KOKKOS_IMPL_BATCHED_GEMM_GCC_CXX14_WORKAROUND +#define KOKKOSKERNELS_IMPL_BATCHED_GEMM_GCC_CXX14_WORKAROUND #endif #endif #endif @@ -69,11 +69,11 @@ struct impl_deep_copy_matrix_block(C.extent(0)); const int N = static_cast(C.extent(1)); - const bool is_device_space = KokkosKernels::Impl::kk_is_gpu_exec_space(); + const bool is_device_space = KokkosKernels::Impl::is_gpu_exec_space_v; const bool A_is_lr = std::is_same::value; const bool A_is_tr = ((transA[0] == 'T') || (transA[0] == 't') || (transA[0] == 'C') || (transA[0] == 'c')); const bool B_is_tr = ((transB[0] == 'T') || (transB[0] == 't') || (transB[0] == 'C') || (transB[0] == 'c')); @@ -272,5 +272,6 @@ struct GEMM { MEM_SPACE) #include +#include #endif // KOKKOSBLAS3_GEMM_SPEC_HPP_ diff --git a/packages/kokkos-kernels/blas/impl/KokkosBlas3_trmm_spec.hpp b/packages/kokkos-kernels/blas/impl/KokkosBlas3_trmm_spec.hpp index 6399f9e57e21..f5520656ad01 100644 --- a/packages/kokkos-kernels/blas/impl/KokkosBlas3_trmm_spec.hpp +++ b/packages/kokkos-kernels/blas/impl/KokkosBlas3_trmm_spec.hpp @@ -138,6 +138,8 @@ struct TRMM + #define KOKKOSBLAS3_TRMM_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ KOKKOSBLAS3_TRMM_ETI_SPEC_INST_LAYOUTS(SCALAR, LAYOUT, LAYOUT, EXEC_SPACE, MEM_SPACE) diff --git a/packages/kokkos-kernels/blas/impl/KokkosBlas3_trsm_spec.hpp b/packages/kokkos-kernels/blas/impl/KokkosBlas3_trsm_spec.hpp index 8c9088e97075..7c5c6fc3eb33 100644 --- a/packages/kokkos-kernels/blas/impl/KokkosBlas3_trsm_spec.hpp +++ b/packages/kokkos-kernels/blas/impl/KokkosBlas3_trsm_spec.hpp @@ -138,6 +138,8 @@ struct TRSM + #define KOKKOSBLAS3_TRSM_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ KOKKOSBLAS3_TRSM_ETI_SPEC_INST_LAYOUTS(SCALAR, LAYOUT, LAYOUT, EXEC_SPACE, MEM_SPACE) diff --git a/packages/kokkos-kernels/blas/impl/KokkosBlas_util.hpp b/packages/kokkos-kernels/blas/impl/KokkosBlas_util.hpp index 885625673f4d..c0777ac9ea2e 100644 --- a/packages/kokkos-kernels/blas/impl/KokkosBlas_util.hpp +++ b/packages/kokkos-kernels/blas/impl/KokkosBlas_util.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef KOKKOS_BLAS_UTIL_HPP -#define KOKKOS_BLAS_UTIL_HPP +#ifndef KOKKOSBLAS_UTIL_HPP +#define KOKKOSBLAS_UTIL_HPP #include "Kokkos_ArithTraits.hpp" @@ -86,6 +86,7 @@ struct Algo { using QR = Level3; using UTV = Level3; using Pttrf = Level3; + using Pttrs = Level3; struct Level2 { struct Unblocked {}; @@ -118,6 +119,8 @@ struct Algo { using Trsv = Level2; using ApplyQ = Level2; using Tbsv = Level2; + using Pbtrf = Level2; + using Pbtrs = Level2; }; namespace Impl { @@ -174,4 +177,4 @@ struct TakeSqrtFunctor { } // namespace Impl } // namespace KokkosBlas -#endif +#endif // KOKKOSBLAS_UTIL_HPP diff --git a/packages/kokkos-kernels/blas/src/KokkosBlas1_dot.hpp b/packages/kokkos-kernels/blas/src/KokkosBlas1_dot.hpp index 6e1a428b5170..477dac864ce6 100644 --- a/packages/kokkos-kernels/blas/src/KokkosBlas1_dot.hpp +++ b/packages/kokkos-kernels/blas/src/KokkosBlas1_dot.hpp @@ -77,9 +77,9 @@ typename Kokkos::Details::InnerProductSpaceTraits::type; using RVector_Internal = - Kokkos::View>; - using RVector_Result = - Kokkos::View>; + Kokkos::View>; + using RVector_Result = Kokkos::View>; XVector_Internal X = x; YVector_Internal Y = y; diff --git a/packages/kokkos-kernels/blas/src/KokkosBlas1_nrm1.hpp b/packages/kokkos-kernels/blas/src/KokkosBlas1_nrm1.hpp index bf7119a5855a..807475d8f4c9 100644 --- a/packages/kokkos-kernels/blas/src/KokkosBlas1_nrm1.hpp +++ b/packages/kokkos-kernels/blas/src/KokkosBlas1_nrm1.hpp @@ -49,8 +49,8 @@ typename Kokkos::Details::InnerProductSpaceTraits::array_layout, typename XVector::device_type, Kokkos::MemoryTraits >; - using RVector_Internal = - Kokkos::View >; + using RVector_Internal = Kokkos::View >; mag_type result; RVector_Internal R = RVector_Internal(&result); diff --git a/packages/kokkos-kernels/blas/src/KokkosBlas1_nrm2_squared.hpp b/packages/kokkos-kernels/blas/src/KokkosBlas1_nrm2_squared.hpp index 748ece36635d..e8b0a63d043c 100644 --- a/packages/kokkos-kernels/blas/src/KokkosBlas1_nrm2_squared.hpp +++ b/packages/kokkos-kernels/blas/src/KokkosBlas1_nrm2_squared.hpp @@ -55,7 +55,8 @@ typename Kokkos::Details::InnerProductSpaceTraits > XVector_Internal; - typedef Kokkos::View > + typedef Kokkos::View > RVector_Internal; mag_type result; diff --git a/packages/kokkos-kernels/blas/tpls/KokkosBlas1_axpby_tpl_spec_decl.hpp b/packages/kokkos-kernels/blas/tpls/KokkosBlas1_axpby_tpl_spec_decl.hpp index 5ab29e632f7f..34852fa5b8cc 100644 --- a/packages/kokkos-kernels/blas/tpls/KokkosBlas1_axpby_tpl_spec_decl.hpp +++ b/packages/kokkos-kernels/blas/tpls/KokkosBlas1_axpby_tpl_spec_decl.hpp @@ -222,9 +222,9 @@ namespace Impl { const int N = static_cast(numElems); \ constexpr int one = 1; \ KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDaxpy(s.handle, N, &alpha, X.data(), one, Y.data(), one)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasDaxpy(s.handle, N, &alpha, X.data(), one, Y.data(), one)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, NULL)); \ } else \ Axpby::axpby(space, alpha, X, beta, Y); \ Kokkos::Profiling::popRegion(); \ @@ -258,89 +258,89 @@ namespace Impl { const int N = static_cast(numElems); \ constexpr int one = 1; \ KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSaxpy(s.handle, N, &alpha, X.data(), one, Y.data(), one)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSaxpy(s.handle, N, &alpha, X.data(), one, Y.data(), one)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, NULL)); \ } else \ Axpby::axpby(space, alpha, X, beta, Y); \ Kokkos::Profiling::popRegion(); \ } \ }; -#define KOKKOSBLAS1_ZAXPBY_CUBLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ - template \ - struct Axpby, \ - Kokkos::View*, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::complex, \ - Kokkos::View*, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex AV; \ - typedef Kokkos::complex BV; \ - typedef Kokkos::View*, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef Kokkos::View*, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits > \ - YV; \ - typedef typename XV::size_type size_type; \ - \ - static void axpby(const ExecSpace& space, const AV& alpha, const XV& X, const BV& beta, const YV& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::axpby[TPL_CUBLAS,complex]"); \ - const size_type numElems = X.extent(0); \ - if ((numElems < static_cast(INT_MAX)) && (beta == 1.0f)) { \ - axpby_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int one = 1; \ - KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZaxpy(s.handle, N, reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - } else \ - Axpby::axpby(space, alpha, X, beta, Y); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_ZAXPBY_CUBLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + template \ + struct Axpby, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::complex, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex AV; \ + typedef Kokkos::complex BV; \ + typedef Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits > \ + YV; \ + typedef typename XV::size_type size_type; \ + \ + static void axpby(const ExecSpace& space, const AV& alpha, const XV& X, const BV& beta, const YV& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::axpby[TPL_CUBLAS,complex]"); \ + const size_type numElems = X.extent(0); \ + if ((numElems < static_cast(INT_MAX)) && (beta == 1.0f)) { \ + axpby_print_specialization(); \ + const int N = static_cast(numElems); \ + constexpr int one = 1; \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasZaxpy(s.handle, N, reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, NULL)); \ + } else \ + Axpby::axpby(space, alpha, X, beta, Y); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_CAXPBY_CUBLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ - template \ - struct Axpby, \ - Kokkos::View*, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::complex, \ - Kokkos::View*, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex AV; \ - typedef Kokkos::complex BV; \ - typedef Kokkos::View*, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef Kokkos::View*, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits > \ - YV; \ - typedef typename XV::size_type size_type; \ - \ - static void axpby(const ExecSpace& space, const AV& alpha, const XV& X, const BV& beta, const YV& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::axpby[TPL_CUBLAS,complex]"); \ - const size_type numElems = X.extent(0); \ - if ((numElems < static_cast(INT_MAX)) && (beta == 1.0f)) { \ - axpby_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int one = 1; \ - KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCaxpy(s.handle, N, reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - } else \ - Axpby::axpby(space, alpha, X, beta, Y); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_CAXPBY_CUBLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + template \ + struct Axpby, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::complex, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex AV; \ + typedef Kokkos::complex BV; \ + typedef Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits > \ + YV; \ + typedef typename XV::size_type size_type; \ + \ + static void axpby(const ExecSpace& space, const AV& alpha, const XV& X, const BV& beta, const YV& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::axpby[TPL_CUBLAS,complex]"); \ + const size_type numElems = X.extent(0); \ + if ((numElems < static_cast(INT_MAX)) && (beta == 1.0f)) { \ + axpby_print_specialization(); \ + const int N = static_cast(numElems); \ + constexpr int one = 1; \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasCaxpy(s.handle, N, reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, NULL)); \ + } else \ + Axpby::axpby(space, alpha, X, beta, Y); \ + Kokkos::Profiling::popRegion(); \ + } \ }; KOKKOSBLAS1_DAXPBY_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, true) diff --git a/packages/kokkos-kernels/blas/tpls/KokkosBlas1_dot_tpl_spec_decl.hpp b/packages/kokkos-kernels/blas/tpls/KokkosBlas1_dot_tpl_spec_decl.hpp index fa9d5fafce93..8533ca6f858d 100644 --- a/packages/kokkos-kernels/blas/tpls/KokkosBlas1_dot_tpl_spec_decl.hpp +++ b/packages/kokkos-kernels/blas/tpls/KokkosBlas1_dot_tpl_spec_decl.hpp @@ -117,11 +117,11 @@ namespace Impl { dot_print_specialization(); \ const int N = static_cast(numElems); \ KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(TPL_DOT(s.handle, N, reinterpret_cast(X.data()), 1, \ - reinterpret_cast(Y.data()), 1, \ - reinterpret_cast(&R()))); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(TPL_DOT(s.handle, N, reinterpret_cast(X.data()), 1, \ + reinterpret_cast(Y.data()), 1, \ + reinterpret_cast(&R()))); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, NULL)); \ } else { \ Dot::dot(space, R, X, Y); \ } \ @@ -175,11 +175,11 @@ namespace Impl { dot_print_specialization(); \ const rocblas_int N = static_cast(numElems); \ KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(TPL_DOT(s.handle, N, reinterpret_cast(X.data()), 1, \ - reinterpret_cast(Y.data()), 1, \ - reinterpret_cast(&R()))); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(TPL_DOT(s.handle, N, reinterpret_cast(X.data()), 1, \ + reinterpret_cast(Y.data()), 1, \ + reinterpret_cast(&R()))); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, NULL)); \ } else { \ Dot::dot(space, R, X, Y); \ } \ diff --git a/packages/kokkos-kernels/blas/tpls/KokkosBlas1_iamax_tpl_spec_decl.hpp b/packages/kokkos-kernels/blas/tpls/KokkosBlas1_iamax_tpl_spec_decl.hpp index c85de4d18638..c9c765215c34 100644 --- a/packages/kokkos-kernels/blas/tpls/KokkosBlas1_iamax_tpl_spec_decl.hpp +++ b/packages/kokkos-kernels/blas/tpls/KokkosBlas1_iamax_tpl_spec_decl.hpp @@ -146,17 +146,17 @@ using CUBLASUVM_DEVICE_TYPE = Kokkos::Device const int XST = X.stride(0); \ const int LDX = (XST == 0) ? 1 : XST; \ KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ cublasPointerMode_t prevPtrMode; \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasGetPointerMode(s.handle, &prevPtrMode)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasGetPointerMode(s.handle, &prevPtrMode)); \ if (prevPtrMode == CUBLAS_PTR_MODE_2) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(s.handle, CUBLAS_PTR_MODE_1)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetPointerMode(s.handle, CUBLAS_PTR_MODE_1)); \ } \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(CUBLAS_FN(s.handle, N, reinterpret_cast(X.data()), LDX, \ - reinterpret_cast(R.data()))); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(CUBLAS_FN(s.handle, N, reinterpret_cast(X.data()), \ + LDX, reinterpret_cast(R.data()))); \ if (prevPtrMode == CUBLAS_PTR_MODE_2) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(s.handle, CUBLAS_PTR_MODE_2)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetPointerMode(s.handle, CUBLAS_PTR_MODE_2)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, NULL)); \ } \ } else { \ Iamax::iamax(space, R, X); \ @@ -280,52 +280,53 @@ namespace Impl { using ROCBLAS_DEVICE_TYPE = Kokkos::Device; -#define KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_ROCBLAS_WRAPPER(SCALAR_TYPE, ROCBLAS_SCALAR_TYPE, ROCBLAS_FN, INDEX_TYPE, \ - LAYOUT, MEMSPACE, ETI_SPEC_AVAIL, RET_DEVICE_TYPE, \ - ROCBLAS_PTR_MODE_1, ROCBLAS_PTR_MODE_2) \ - template <> \ - struct Iamax >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - using execution_space = Kokkos::HIP; \ - typedef Kokkos::View > RV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void iamax(const execution_space& space, RV& R, const XV& X) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::iamax[TPL_ROCBLAS," #SCALAR_TYPE "]"); \ - const size_type numElems = X.extent(0); \ - if (numElems == 0) { \ - Kokkos::deep_copy(R, 0); \ - return; \ - } \ - if (numElems < static_cast(INT_MAX)) { \ - iamax_print_specialization(); \ - const int N = static_cast(numElems); \ - const int XST = X.stride(0); \ - const int LDX = (XST == 0) ? 1 : XST; \ - KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ - rocblas_pointer_mode prevPtrMode; \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_get_pointer_mode(s.handle, &prevPtrMode)); \ - if (prevPtrMode == ROCBLAS_PTR_MODE_2) { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, ROCBLAS_PTR_MODE_1)); \ - } \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(ROCBLAS_FN(s.handle, N, reinterpret_cast(X.data()), \ - LDX, reinterpret_cast(R.data()))); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - if (prevPtrMode == ROCBLAS_PTR_MODE_2) { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, ROCBLAS_PTR_MODE_2)); \ - } \ - } else { \ - Iamax::iamax(space, R, X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_ROCBLAS_WRAPPER(SCALAR_TYPE, ROCBLAS_SCALAR_TYPE, ROCBLAS_FN, INDEX_TYPE, \ + LAYOUT, MEMSPACE, ETI_SPEC_AVAIL, RET_DEVICE_TYPE, \ + ROCBLAS_PTR_MODE_1, ROCBLAS_PTR_MODE_2) \ + template <> \ + struct Iamax >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + using execution_space = Kokkos::HIP; \ + typedef Kokkos::View > RV; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef typename XV::size_type size_type; \ + \ + static void iamax(const execution_space& space, RV& R, const XV& X) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::iamax[TPL_ROCBLAS," #SCALAR_TYPE "]"); \ + const size_type numElems = X.extent(0); \ + if (numElems == 0) { \ + Kokkos::deep_copy(R, 0); \ + return; \ + } \ + if (numElems < static_cast(INT_MAX)) { \ + iamax_print_specialization(); \ + const int N = static_cast(numElems); \ + const int XST = X.stride(0); \ + const int LDX = (XST == 0) ? 1 : XST; \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, space.hip_stream())); \ + rocblas_pointer_mode prevPtrMode; \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_get_pointer_mode(s.handle, &prevPtrMode)); \ + if (prevPtrMode == ROCBLAS_PTR_MODE_2) { \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_pointer_mode(s.handle, ROCBLAS_PTR_MODE_1)); \ + } \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(ROCBLAS_FN(s.handle, N, \ + reinterpret_cast(X.data()), LDX, \ + reinterpret_cast(R.data()))); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, NULL)); \ + if (prevPtrMode == ROCBLAS_PTR_MODE_2) { \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_pointer_mode(s.handle, ROCBLAS_PTR_MODE_2)); \ + } \ + } else { \ + Iamax::iamax(space, R, X); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; #define KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_ROCBLAS(SCALAR_TYPE, ROCBLAS_SCALAR_TYPE, ROCBLAS_FN, INDEX_TYPE, LAYOUT, \ diff --git a/packages/kokkos-kernels/blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp b/packages/kokkos-kernels/blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp index 378fbc936f01..550344522764 100644 --- a/packages/kokkos-kernels/blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp +++ b/packages/kokkos-kernels/blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp @@ -119,22 +119,22 @@ void cublasAsumWrapper(const ExecutionSpace& space, RViewType& R, const XViewTyp constexpr int one = 1; KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); if constexpr (std::is_same_v) { - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSasum(s.handle, N, X.data(), one, R.data())); + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSasum(s.handle, N, X.data(), one, R.data())); } if constexpr (std::is_same_v) { - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDasum(s.handle, N, X.data(), one, R.data())); + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasDasum(s.handle, N, X.data(), one, R.data())); } if constexpr (std::is_same_v>) { - KOKKOS_CUBLAS_SAFE_CALL_IMPL( + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL( cublasScasum(s.handle, N, reinterpret_cast(X.data()), one, R.data())); } if constexpr (std::is_same_v>) { - KOKKOS_CUBLAS_SAFE_CALL_IMPL( + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL( cublasDzasum(s.handle, N, reinterpret_cast(X.data()), one, R.data())); } - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, NULL)); } #define KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(SCALAR, LAYOUT, MEMSPACE) \ @@ -201,22 +201,22 @@ void rocblasAsumWrapper(const ExecutionSpace& space, RViewType& R, const XViewTy constexpr int one = 1; KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, space.hip_stream())); if constexpr (std::is_same_v) { - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_sasum(s.handle, N, X.data(), one, R.data())); + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_sasum(s.handle, N, X.data(), one, R.data())); } if constexpr (std::is_same_v) { - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_dasum(s.handle, N, X.data(), one, R.data())); + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_dasum(s.handle, N, X.data(), one, R.data())); } if constexpr (std::is_same_v>) { - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL( rocblas_scasum(s.handle, N, reinterpret_cast(X.data()), one, R.data())); } if constexpr (std::is_same_v>) { - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL( rocblas_dzasum(s.handle, N, reinterpret_cast(X.data()), one, R.data())); } - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, NULL)); } #define KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_ROCBLAS(SCALAR, LAYOUT, MEMSPACE) \ diff --git a/packages/kokkos-kernels/blas/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp b/packages/kokkos-kernels/blas/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp index b1e4cd58b92e..3fba6d03e718 100644 --- a/packages/kokkos-kernels/blas/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp +++ b/packages/kokkos-kernels/blas/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp @@ -175,37 +175,37 @@ KOKKOSBLAS1_CNRM2_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, fals namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_CUBLAS(LAYOUT, KOKKOS_TYPE, TPL_TYPE, EXECSPACE, MEMSPACE, TPL_NRM2, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Nrm2::mag_type, LAYOUT, Kokkos::HostSpace, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - using RT = Kokkos::ArithTraits::mag_type; \ - using RV = Kokkos::View >; \ - using XV = Kokkos::View, \ - Kokkos::MemoryTraits >; \ - using size_type = typename XV::size_type; \ - \ - static void nrm2(const EXECSPACE& space, RV& R, const XV& X, const bool& take_sqrt) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::nrm2[TPL_CUBLAS," + Kokkos::ArithTraits::name() + "]"); \ - const size_type numElems = X.extent(0); \ - if (numElems <= static_cast(std::numeric_limits::max())) { \ - nrm2_print_specialization(); \ - const int N = static_cast(numElems); \ - KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(TPL_NRM2(s.handle, N, reinterpret_cast(X.data()), 1, &R())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - if (!take_sqrt) R() = R() * R(); \ - } else { \ - Nrm2::nrm2(space, R, X, take_sqrt); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_CUBLAS(LAYOUT, KOKKOS_TYPE, TPL_TYPE, EXECSPACE, MEMSPACE, TPL_NRM2, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct Nrm2::mag_type, LAYOUT, Kokkos::HostSpace, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + using RT = Kokkos::ArithTraits::mag_type; \ + using RV = Kokkos::View >; \ + using XV = Kokkos::View, \ + Kokkos::MemoryTraits >; \ + using size_type = typename XV::size_type; \ + \ + static void nrm2(const EXECSPACE& space, RV& R, const XV& X, const bool& take_sqrt) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::nrm2[TPL_CUBLAS," + Kokkos::ArithTraits::name() + "]"); \ + const size_type numElems = X.extent(0); \ + if (numElems <= static_cast(std::numeric_limits::max())) { \ + nrm2_print_specialization(); \ + const int N = static_cast(numElems); \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(TPL_NRM2(s.handle, N, reinterpret_cast(X.data()), 1, &R())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, NULL)); \ + if (!take_sqrt) R() = R() * R(); \ + } else { \ + Nrm2::nrm2(space, R, X, take_sqrt); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; #define KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_CUBLAS_EXT(ETI_SPEC_AVAIL) \ @@ -254,9 +254,10 @@ namespace Impl { nrm2_print_specialization(); \ const rocblas_int N = static_cast(numElems); \ KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(TPL_NRM2(s.handle, N, reinterpret_cast(X.data()), 1, &R())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL( \ + TPL_NRM2(s.handle, N, reinterpret_cast(X.data()), 1, &R())); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, NULL)); \ if (!take_sqrt) R() = R() * R(); \ } else { \ Nrm2::nrm2(space, R, X, take_sqrt); \ diff --git a/packages/kokkos-kernels/blas/tpls/KokkosBlas1_rot_tpl_spec_decl.hpp b/packages/kokkos-kernels/blas/tpls/KokkosBlas1_rot_tpl_spec_decl.hpp index 404c5c0e3b9b..dfe747bf885a 100644 --- a/packages/kokkos-kernels/blas/tpls/KokkosBlas1_rot_tpl_spec_decl.hpp +++ b/packages/kokkos-kernels/blas/tpls/KokkosBlas1_rot_tpl_spec_decl.hpp @@ -167,12 +167,12 @@ namespace Impl { Kokkos::Profiling::pushRegion("KokkosBlas::rot[TPL_CUBLAS,double]"); \ rot_print_specialization(); \ KokkosBlas::Impl::CudaBlasSingleton& singleton = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(singleton.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(singleton.handle, space.cuda_stream())); \ cublasPointerMode_t pointer_mode; \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasGetPointerMode(singleton.handle, &pointer_mode)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(singleton.handle, CUBLAS_POINTER_MODE_DEVICE)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasGetPointerMode(singleton.handle, &pointer_mode)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetPointerMode(singleton.handle, CUBLAS_POINTER_MODE_DEVICE)); \ cublasDrot(singleton.handle, X.extent_int(0), X.data(), 1, Y.data(), 1, c.data(), s.data()); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(singleton.handle, pointer_mode)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetPointerMode(singleton.handle, pointer_mode)); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -193,12 +193,12 @@ namespace Impl { Kokkos::Profiling::pushRegion("KokkosBlas::rot[TPL_CUBLAS,float]"); \ rot_print_specialization(); \ KokkosBlas::Impl::CudaBlasSingleton& singleton = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(singleton.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(singleton.handle, space.cuda_stream())); \ cublasPointerMode_t pointer_mode; \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasGetPointerMode(singleton.handle, &pointer_mode)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(singleton.handle, CUBLAS_POINTER_MODE_DEVICE)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasGetPointerMode(singleton.handle, &pointer_mode)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetPointerMode(singleton.handle, CUBLAS_POINTER_MODE_DEVICE)); \ cublasSrot(singleton.handle, X.extent_int(0), X.data(), 1, Y.data(), 1, c.data(), s.data()); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(singleton.handle, pointer_mode)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetPointerMode(singleton.handle, pointer_mode)); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -220,13 +220,13 @@ namespace Impl { Kokkos::Profiling::pushRegion("KokkosBlas::rot[TPL_CUBLAS,complex]"); \ rot_print_specialization(); \ KokkosBlas::Impl::CudaBlasSingleton& singleton = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(singleton.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(singleton.handle, space.cuda_stream())); \ cublasPointerMode_t pointer_mode; \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasGetPointerMode(singleton.handle, &pointer_mode)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(singleton.handle, CUBLAS_POINTER_MODE_DEVICE)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasGetPointerMode(singleton.handle, &pointer_mode)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetPointerMode(singleton.handle, CUBLAS_POINTER_MODE_DEVICE)); \ cublasZdrot(singleton.handle, X.extent_int(0), reinterpret_cast(X.data()), 1, \ reinterpret_cast(Y.data()), 1, c.data(), s.data()); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(singleton.handle, pointer_mode)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetPointerMode(singleton.handle, pointer_mode)); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -248,13 +248,13 @@ namespace Impl { Kokkos::Profiling::pushRegion("KokkosBlas::rot[TPL_CUBLAS,complex]"); \ rot_print_specialization(); \ KokkosBlas::Impl::CudaBlasSingleton& singleton = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(singleton.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(singleton.handle, space.cuda_stream())); \ cublasPointerMode_t pointer_mode; \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasGetPointerMode(singleton.handle, &pointer_mode)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(singleton.handle, CUBLAS_POINTER_MODE_DEVICE)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasGetPointerMode(singleton.handle, &pointer_mode)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetPointerMode(singleton.handle, CUBLAS_POINTER_MODE_DEVICE)); \ cublasCsrot(singleton.handle, X.extent_int(0), reinterpret_cast(X.data()), 1, \ reinterpret_cast(Y.data()), 1, c.data(), s.data()); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(singleton.handle, pointer_mode)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetPointerMode(singleton.handle, pointer_mode)); \ Kokkos::Profiling::popRegion(); \ } \ }; diff --git a/packages/kokkos-kernels/blas/tpls/KokkosBlas1_rotg_tpl_spec_decl.hpp b/packages/kokkos-kernels/blas/tpls/KokkosBlas1_rotg_tpl_spec_decl.hpp index e6583d5ae31e..33e855fdf6b3 100644 --- a/packages/kokkos-kernels/blas/tpls/KokkosBlas1_rotg_tpl_spec_decl.hpp +++ b/packages/kokkos-kernels/blas/tpls/KokkosBlas1_rotg_tpl_spec_decl.hpp @@ -193,12 +193,12 @@ namespace Impl { Kokkos::Profiling::pushRegion("KokkosBlas::rotg[TPL_CUBLAS,double]"); \ rotg_print_specialization(); \ KokkosBlas::Impl::CudaBlasSingleton& singleton = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(singleton.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(singleton.handle, space.cuda_stream())); \ cublasPointerMode_t pointer_mode; \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasGetPointerMode(singleton.handle, &pointer_mode)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(singleton.handle, CUBLAS_POINTER_MODE_DEVICE)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDrotg(singleton.handle, a.data(), b.data(), c.data(), s.data())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(singleton.handle, pointer_mode)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasGetPointerMode(singleton.handle, &pointer_mode)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetPointerMode(singleton.handle, CUBLAS_POINTER_MODE_DEVICE)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasDrotg(singleton.handle, a.data(), b.data(), c.data(), s.data())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetPointerMode(singleton.handle, pointer_mode)); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -219,12 +219,12 @@ namespace Impl { Kokkos::Profiling::pushRegion("KokkosBlas::rotg[TPL_CUBLAS,float]"); \ rotg_print_specialization(); \ KokkosBlas::Impl::CudaBlasSingleton& singleton = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(singleton.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(singleton.handle, space.cuda_stream())); \ cublasPointerMode_t pointer_mode; \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasGetPointerMode(singleton.handle, &pointer_mode)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(singleton.handle, CUBLAS_POINTER_MODE_DEVICE)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSrotg(singleton.handle, a.data(), b.data(), c.data(), s.data())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(singleton.handle, pointer_mode)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasGetPointerMode(singleton.handle, &pointer_mode)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetPointerMode(singleton.handle, CUBLAS_POINTER_MODE_DEVICE)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSrotg(singleton.handle, a.data(), b.data(), c.data(), s.data())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetPointerMode(singleton.handle, pointer_mode)); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -246,14 +246,14 @@ namespace Impl { Kokkos::Profiling::pushRegion("KokkosBlas::rotg[TPL_CUBLAS,complex]"); \ rotg_print_specialization, EXECSPACE>(); \ KokkosBlas::Impl::CudaBlasSingleton& singleton = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(singleton.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(singleton.handle, space.cuda_stream())); \ cublasPointerMode_t pointer_mode; \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasGetPointerMode(singleton.handle, &pointer_mode)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(singleton.handle, CUBLAS_POINTER_MODE_DEVICE)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZrotg(singleton.handle, reinterpret_cast(a.data()), \ - reinterpret_cast(b.data()), c.data(), \ - reinterpret_cast(s.data()))); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(singleton.handle, pointer_mode)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasGetPointerMode(singleton.handle, &pointer_mode)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetPointerMode(singleton.handle, CUBLAS_POINTER_MODE_DEVICE)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasZrotg(singleton.handle, reinterpret_cast(a.data()), \ + reinterpret_cast(b.data()), c.data(), \ + reinterpret_cast(s.data()))); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetPointerMode(singleton.handle, pointer_mode)); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -275,14 +275,14 @@ namespace Impl { Kokkos::Profiling::pushRegion("KokkosBlas::rotg[TPL_CUBLAS,complex]"); \ rotg_print_specialization, EXECSPACE>(); \ KokkosBlas::Impl::CudaBlasSingleton& singleton = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(singleton.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(singleton.handle, space.cuda_stream())); \ cublasPointerMode_t pointer_mode; \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasGetPointerMode(singleton.handle, &pointer_mode)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(singleton.handle, CUBLAS_POINTER_MODE_DEVICE)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCrotg(singleton.handle, reinterpret_cast(a.data()), \ - reinterpret_cast(b.data()), c.data(), \ - reinterpret_cast(s.data()))); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(singleton.handle, pointer_mode)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasGetPointerMode(singleton.handle, &pointer_mode)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetPointerMode(singleton.handle, CUBLAS_POINTER_MODE_DEVICE)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasCrotg(singleton.handle, reinterpret_cast(a.data()), \ + reinterpret_cast(b.data()), c.data(), \ + reinterpret_cast(s.data()))); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetPointerMode(singleton.handle, pointer_mode)); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -351,12 +351,12 @@ namespace Impl { Kokkos::Profiling::pushRegion("KokkosBlas::rotg[TPL_ROCBLAS,double]"); \ rotg_print_specialization(); \ KokkosBlas::Impl::RocBlasSingleton& singleton = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(singleton.handle, space.hip_stream())); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(singleton.handle, space.hip_stream())); \ rocblas_pointer_mode pointer_mode; \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_get_pointer_mode(singleton.handle, &pointer_mode)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(singleton.handle, rocblas_pointer_mode_device)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_drotg(singleton.handle, a.data(), b.data(), c.data(), s.data())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(singleton.handle, pointer_mode)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_get_pointer_mode(singleton.handle, &pointer_mode)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_pointer_mode(singleton.handle, rocblas_pointer_mode_device)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_drotg(singleton.handle, a.data(), b.data(), c.data(), s.data())); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_pointer_mode(singleton.handle, pointer_mode)); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -377,44 +377,44 @@ namespace Impl { Kokkos::Profiling::pushRegion("KokkosBlas::rotg[TPL_ROCBLAS,float]"); \ rotg_print_specialization(); \ KokkosBlas::Impl::RocBlasSingleton& singleton = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(singleton.handle, space.hip_stream())); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(singleton.handle, space.hip_stream())); \ rocblas_pointer_mode pointer_mode; \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_get_pointer_mode(singleton.handle, &pointer_mode)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(singleton.handle, rocblas_pointer_mode_device)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_srotg(singleton.handle, a.data(), b.data(), c.data(), s.data())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(singleton.handle, pointer_mode)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_get_pointer_mode(singleton.handle, &pointer_mode)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_pointer_mode(singleton.handle, rocblas_pointer_mode_device)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_srotg(singleton.handle, a.data(), b.data(), c.data(), s.data())); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_pointer_mode(singleton.handle, pointer_mode)); \ Kokkos::Profiling::popRegion(); \ } \ }; -#define KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct Rotg< \ - EXECSPACE, \ - Kokkos::View, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using SViewType = Kokkos::View, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits>; \ - using MViewType = \ - Kokkos::View, Kokkos::MemoryTraits>; \ - static void rotg(EXECSPACE const& space, SViewType const& a, SViewType const& b, MViewType const& c, \ - SViewType const& s) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::rotg[TPL_ROCBLAS,complex]"); \ - rotg_print_specialization, EXECSPACE>(); \ - KokkosBlas::Impl::RocBlasSingleton& singleton = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(singleton.handle, space.hip_stream())); \ - rocblas_pointer_mode pointer_mode; \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_get_pointer_mode(singleton.handle, &pointer_mode)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(singleton.handle, rocblas_pointer_mode_device)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zrotg(singleton.handle, \ - reinterpret_cast(a.data()), \ - reinterpret_cast(b.data()), c.data(), \ - reinterpret_cast(s.data()))); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(singleton.handle, pointer_mode)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Rotg< \ + EXECSPACE, \ + Kokkos::View, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using SViewType = Kokkos::View, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>; \ + using MViewType = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + static void rotg(EXECSPACE const& space, SViewType const& a, SViewType const& b, MViewType const& c, \ + SViewType const& s) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::rotg[TPL_ROCBLAS,complex]"); \ + rotg_print_specialization, EXECSPACE>(); \ + KokkosBlas::Impl::RocBlasSingleton& singleton = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(singleton.handle, space.hip_stream())); \ + rocblas_pointer_mode pointer_mode; \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_get_pointer_mode(singleton.handle, &pointer_mode)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_pointer_mode(singleton.handle, rocblas_pointer_mode_device)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_zrotg(singleton.handle, \ + reinterpret_cast(a.data()), \ + reinterpret_cast(b.data()), c.data(), \ + reinterpret_cast(s.data()))); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_pointer_mode(singleton.handle, pointer_mode)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; #define KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ @@ -434,15 +434,15 @@ namespace Impl { Kokkos::Profiling::pushRegion("KokkosBlas::rotg[TPL_ROCBLAS,complex]"); \ rotg_print_specialization, EXECSPACE>(); \ KokkosBlas::Impl::RocBlasSingleton& singleton = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(singleton.handle, space.hip_stream())); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(singleton.handle, space.hip_stream())); \ rocblas_pointer_mode pointer_mode; \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_get_pointer_mode(singleton.handle, &pointer_mode)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(singleton.handle, rocblas_pointer_mode_device)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_crotg(singleton.handle, \ - reinterpret_cast(a.data()), \ - reinterpret_cast(b.data()), c.data(), \ - reinterpret_cast(s.data()))); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(singleton.handle, pointer_mode)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_get_pointer_mode(singleton.handle, &pointer_mode)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_pointer_mode(singleton.handle, rocblas_pointer_mode_device)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_crotg(singleton.handle, \ + reinterpret_cast(a.data()), \ + reinterpret_cast(b.data()), c.data(), \ + reinterpret_cast(s.data()))); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_pointer_mode(singleton.handle, pointer_mode)); \ Kokkos::Profiling::popRegion(); \ } \ }; diff --git a/packages/kokkos-kernels/blas/tpls/KokkosBlas1_rotm_tpl_spec_decl.hpp b/packages/kokkos-kernels/blas/tpls/KokkosBlas1_rotm_tpl_spec_decl.hpp index 7bde6d08357a..41b1719e7187 100644 --- a/packages/kokkos-kernels/blas/tpls/KokkosBlas1_rotm_tpl_spec_decl.hpp +++ b/packages/kokkos-kernels/blas/tpls/KokkosBlas1_rotm_tpl_spec_decl.hpp @@ -108,12 +108,12 @@ namespace Impl { Kokkos::Profiling::pushRegion("KokkosBlas::rotm[TPL_CUBLAS,double]"); \ rotm_print_specialization(); \ KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ cublasPointerMode_t pointer_mode; \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasGetPointerMode(s.handle, &pointer_mode)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(s.handle, CUBLAS_POINTER_MODE_DEVICE)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDrotm(s.handle, X.extent(0), X.data(), 1, Y.data(), 1, param.data())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(s.handle, pointer_mode)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasGetPointerMode(s.handle, &pointer_mode)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetPointerMode(s.handle, CUBLAS_POINTER_MODE_DEVICE)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasDrotm(s.handle, X.extent(0), X.data(), 1, Y.data(), 1, param.data())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetPointerMode(s.handle, pointer_mode)); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -140,12 +140,12 @@ KOKKOSBLAS1_DROTM_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos Kokkos::Profiling::pushRegion("KokkosBlas::rotm[TPL_CUBLAS,float]"); \ rotm_print_specialization(); \ KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ cublasPointerMode_t pointer_mode; \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasGetPointerMode(s.handle, &pointer_mode)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(s.handle, CUBLAS_POINTER_MODE_DEVICE)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSrotm(s.handle, X.extent(0), X.data(), 1, Y.data(), 1, param.data())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(s.handle, pointer_mode)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasGetPointerMode(s.handle, &pointer_mode)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetPointerMode(s.handle, CUBLAS_POINTER_MODE_DEVICE)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSrotm(s.handle, X.extent(0), X.data(), 1, Y.data(), 1, param.data())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetPointerMode(s.handle, pointer_mode)); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -184,13 +184,13 @@ namespace Impl { Kokkos::Profiling::pushRegion("KokkosBlas::rotm[TPL_ROCBLAS,double]"); \ rotm_print_specialization(); \ KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, space.hip_stream())); \ rocblas_pointer_mode pointer_mode; \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_get_pointer_mode(s.handle, &pointer_mode)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_device)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_get_pointer_mode(s.handle, &pointer_mode)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_device)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL( \ rocblas_drotm(s.handle, static_cast(X.extent(0)), X.data(), 1, Y.data(), 1, param.data())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, pointer_mode)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_pointer_mode(s.handle, pointer_mode)); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -217,13 +217,13 @@ KOKKOSBLAS1_DROTM_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos Kokkos::Profiling::pushRegion("KokkosBlas::rotm[TPL_ROCBLAS,float]"); \ rotm_print_specialization(); \ KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, space.hip_stream())); \ rocblas_pointer_mode pointer_mode; \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_get_pointer_mode(s.handle, &pointer_mode)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_device)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_get_pointer_mode(s.handle, &pointer_mode)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_device)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL( \ rocblas_srotm(s.handle, static_cast(X.extent(0)), X.data(), 1, Y.data(), 1, param.data())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, pointer_mode)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_pointer_mode(s.handle, pointer_mode)); \ Kokkos::Profiling::popRegion(); \ } \ }; diff --git a/packages/kokkos-kernels/blas/tpls/KokkosBlas1_rotmg_tpl_spec_decl.hpp b/packages/kokkos-kernels/blas/tpls/KokkosBlas1_rotmg_tpl_spec_decl.hpp index 0271cfd98165..9ededf991692 100644 --- a/packages/kokkos-kernels/blas/tpls/KokkosBlas1_rotmg_tpl_spec_decl.hpp +++ b/packages/kokkos-kernels/blas/tpls/KokkosBlas1_rotmg_tpl_spec_decl.hpp @@ -115,12 +115,13 @@ namespace Impl { Kokkos::Profiling::pushRegion("KokkosBlas::rotmg[TPL_CUBLAS,double]"); \ rotmg_print_specialization(); \ KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ cublasPointerMode_t pointer_mode; \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasGetPointerMode(s.handle, &pointer_mode)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(s.handle, CUBLAS_POINTER_MODE_DEVICE)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDrotmg(s.handle, d1.data(), d2.data(), x1.data(), y1.data(), param.data())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(s.handle, pointer_mode)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasGetPointerMode(s.handle, &pointer_mode)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetPointerMode(s.handle, CUBLAS_POINTER_MODE_DEVICE)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL( \ + cublasDrotmg(s.handle, d1.data(), d2.data(), x1.data(), y1.data(), param.data())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetPointerMode(s.handle, pointer_mode)); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -151,12 +152,13 @@ KOKKOSBLAS1_DROTMG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokko Kokkos::Profiling::pushRegion("KokkosBlas::rotmg[TPL_CUBLAS,float]"); \ rotmg_print_specialization(); \ KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ cublasPointerMode_t pointer_mode; \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasGetPointerMode(s.handle, &pointer_mode)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(s.handle, CUBLAS_POINTER_MODE_DEVICE)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSrotmg(s.handle, d1.data(), d2.data(), x1.data(), y1.data(), param.data())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(s.handle, pointer_mode)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasGetPointerMode(s.handle, &pointer_mode)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetPointerMode(s.handle, CUBLAS_POINTER_MODE_DEVICE)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL( \ + cublasSrotmg(s.handle, d1.data(), d2.data(), x1.data(), y1.data(), param.data())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetPointerMode(s.handle, pointer_mode)); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -199,13 +201,13 @@ namespace Impl { Kokkos::Profiling::pushRegion("KokkosBlas::rotmg[TPL_ROCBLAS,double]"); \ rotmg_print_specialization(); \ KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, space.hip_stream())); \ rocblas_pointer_mode pointer_mode; \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_get_pointer_mode(s.handle, &pointer_mode)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_device)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_get_pointer_mode(s.handle, &pointer_mode)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_device)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL( \ rocblas_drotmg(s.handle, d1.data(), d2.data(), x1.data(), y1.data(), param.data())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, pointer_mode)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_pointer_mode(s.handle, pointer_mode)); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -236,13 +238,13 @@ KOKKOSBLAS1_DROTMG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokko Kokkos::Profiling::pushRegion("KokkosBlas::rotmg[TPL_ROCBLAS,float]"); \ rotmg_print_specialization(); \ KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, space.hip_stream())); \ rocblas_pointer_mode pointer_mode; \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_get_pointer_mode(s.handle, &pointer_mode)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_device)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_get_pointer_mode(s.handle, &pointer_mode)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_device)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL( \ rocblas_srotmg(s.handle, d1.data(), d2.data(), x1.data(), y1.data(), param.data())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, pointer_mode)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_pointer_mode(s.handle, pointer_mode)); \ Kokkos::Profiling::popRegion(); \ } \ }; diff --git a/packages/kokkos-kernels/blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp b/packages/kokkos-kernels/blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp index 7083e28730b3..5ed846e1c7cd 100644 --- a/packages/kokkos-kernels/blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp +++ b/packages/kokkos-kernels/blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp @@ -108,42 +108,42 @@ KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, fals namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_CUBLAS(SCALAR_TYPE, CUDA_SCALAR_TYPE, CUBLAS_FN, LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template \ - struct Scal, \ - Kokkos::MemoryTraits >, \ - SCALAR_TYPE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - RV; \ - typedef SCALAR_TYPE AS; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void scal(const ExecSpace& space, const RV& R, const AS& alpha, const XV& X) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::scal[TPL_CUBLAS," #SCALAR_TYPE "]"); \ - const size_type numElems = X.extent(0); \ - if ((numElems < static_cast(INT_MAX)) && (R.data() == X.data())) { \ - scal_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int one = 1; \ - KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(CUBLAS_FN(s.handle, N, reinterpret_cast(&alpha), \ - reinterpret_cast(R.data()), one)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - } else { \ - Scal::scal(space, R, alpha, X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_CUBLAS(SCALAR_TYPE, CUDA_SCALAR_TYPE, CUBLAS_FN, LAYOUT, MEMSPACE, \ + ETI_SPEC_AVAIL) \ + template \ + struct Scal, \ + Kokkos::MemoryTraits >, \ + SCALAR_TYPE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + RV; \ + typedef SCALAR_TYPE AS; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef typename XV::size_type size_type; \ + \ + static void scal(const ExecSpace& space, const RV& R, const AS& alpha, const XV& X) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::scal[TPL_CUBLAS," #SCALAR_TYPE "]"); \ + const size_type numElems = X.extent(0); \ + if ((numElems < static_cast(INT_MAX)) && (R.data() == X.data())) { \ + scal_print_specialization(); \ + const int N = static_cast(numElems); \ + constexpr int one = 1; \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(CUBLAS_FN(s.handle, N, reinterpret_cast(&alpha), \ + reinterpret_cast(R.data()), one)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, NULL)); \ + } else { \ + Scal::scal(space, R, alpha, X); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; #define KOKKOSBLAS1_DSCAL_TPL_SPEC_DECL_CUBLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ @@ -196,46 +196,47 @@ KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_ROCBLAS(SCALAR_TYPE, ROCBLAS_SCALAR_TYPE, ROCBLAS_FN, LAYOUT, EXECSPACE, \ - MEMSPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct Scal, \ - Kokkos::MemoryTraits >, \ - SCALAR_TYPE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - using execution_space = EXECSPACE; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - RV; \ - typedef SCALAR_TYPE AS; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void scal(const execution_space& space, const RV& R, const AS& alpha, const XV& X) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::scal[TPL_ROCBLAS," #SCALAR_TYPE "]"); \ - const size_type numElems = X.extent(0); \ - if ((numElems < static_cast(INT_MAX)) && (R.data() == X.data())) { \ - scal_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int one = 1; \ - KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ - rocblas_pointer_mode pointer_mode; \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_get_pointer_mode(s.handle, &pointer_mode)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_host)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(ROCBLAS_FN(s.handle, N, reinterpret_cast(&alpha), \ - reinterpret_cast(R.data()), one)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, pointer_mode)); \ - } else { \ - Scal::scal(space, R, alpha, X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_ROCBLAS(SCALAR_TYPE, ROCBLAS_SCALAR_TYPE, ROCBLAS_FN, LAYOUT, EXECSPACE, \ + MEMSPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Scal, \ + Kokkos::MemoryTraits >, \ + SCALAR_TYPE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + using execution_space = EXECSPACE; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + RV; \ + typedef SCALAR_TYPE AS; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef typename XV::size_type size_type; \ + \ + static void scal(const execution_space& space, const RV& R, const AS& alpha, const XV& X) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::scal[TPL_ROCBLAS," #SCALAR_TYPE "]"); \ + const size_type numElems = X.extent(0); \ + if ((numElems < static_cast(INT_MAX)) && (R.data() == X.data())) { \ + scal_print_specialization(); \ + const int N = static_cast(numElems); \ + constexpr int one = 1; \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, space.hip_stream())); \ + rocblas_pointer_mode pointer_mode; \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_get_pointer_mode(s.handle, &pointer_mode)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_host)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(ROCBLAS_FN(s.handle, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(R.data()), one)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_pointer_mode(s.handle, pointer_mode)); \ + } else { \ + Scal::scal(space, R, alpha, X); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; #define KOKKOSBLAS1_DSCAL_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ diff --git a/packages/kokkos-kernels/blas/tpls/KokkosBlas1_swap_tpl_spec_decl.hpp b/packages/kokkos-kernels/blas/tpls/KokkosBlas1_swap_tpl_spec_decl.hpp index e74b498c33d3..0b475c722fef 100644 --- a/packages/kokkos-kernels/blas/tpls/KokkosBlas1_swap_tpl_spec_decl.hpp +++ b/packages/kokkos-kernels/blas/tpls/KokkosBlas1_swap_tpl_spec_decl.hpp @@ -174,8 +174,8 @@ namespace Impl { Kokkos::Profiling::pushRegion("KokkosBlas::swap[TPL_CUBLAS,double]"); \ swap_print_specialization(); \ KokkosBlas::Impl::CudaBlasSingleton& singleton = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(singleton.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDswap(singleton.handle, X.extent_int(0), X.data(), 1, Y.data(), 1)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(singleton.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasDswap(singleton.handle, X.extent_int(0), X.data(), 1, Y.data(), 1)); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -195,8 +195,8 @@ namespace Impl { Kokkos::Profiling::pushRegion("KokkosBlas::swap[TPL_CUBLAS,float]"); \ swap_print_specialization(); \ KokkosBlas::Impl::CudaBlasSingleton& singleton = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(singleton.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSswap(singleton.handle, X.extent_int(0), X.data(), 1, Y.data(), 1)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(singleton.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSswap(singleton.handle, X.extent_int(0), X.data(), 1, Y.data(), 1)); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -217,10 +217,10 @@ namespace Impl { Kokkos::Profiling::pushRegion("KokkosBlas::swap[TPL_CUBLAS,complex]"); \ swap_print_specialization(); \ KokkosBlas::Impl::CudaBlasSingleton& singleton = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(singleton.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZswap(singleton.handle, X.extent_int(0), \ - reinterpret_cast(X.data()), 1, \ - reinterpret_cast(Y.data()), 1)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(singleton.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasZswap(singleton.handle, X.extent_int(0), \ + reinterpret_cast(X.data()), 1, \ + reinterpret_cast(Y.data()), 1)); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -241,10 +241,10 @@ namespace Impl { Kokkos::Profiling::pushRegion("KokkosBlas::swap[TPL_CUBLAS,complex]"); \ swap_print_specialization(); \ KokkosBlas::Impl::CudaBlasSingleton& singleton = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(singleton.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCswap(singleton.handle, X.extent_int(0), \ - reinterpret_cast(X.data()), 1, \ - reinterpret_cast(Y.data()), 1)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(singleton.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasCswap(singleton.handle, X.extent_int(0), \ + reinterpret_cast(X.data()), 1, \ + reinterpret_cast(Y.data()), 1)); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -310,79 +310,79 @@ namespace Impl { Kokkos::Profiling::pushRegion("KokkosBlas::swap[TPL_ROCBLAS,double]"); \ swap_print_specialization(); \ KokkosBlas::Impl::RocBlasSingleton& singleton = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(singleton.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_dswap(singleton.handle, X.extent_int(0), X.data(), 1, Y.data(), 1)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(singleton.handle, space.hip_stream())); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_dswap(singleton.handle, X.extent_int(0), X.data(), 1, Y.data(), 1)); \ Kokkos::Profiling::popRegion(); \ } \ }; -#define KOKKOSBLAS1_SSWAP_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct Swap< \ - EXECSPACE, \ - Kokkos::View, Kokkos::MemoryTraits>, \ - Kokkos::View, Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using XVector = \ - Kokkos::View, Kokkos::MemoryTraits>; \ - using YVector = \ - Kokkos::View, Kokkos::MemoryTraits>; \ - static void swap(EXECSPACE const& space, XVector const& X, YVector const& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::swap[TPL_ROCBLAS,float]"); \ - swap_print_specialization(); \ - KokkosBlas::Impl::RocBlasSingleton& singleton = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(singleton.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_sswap(singleton.handle, X.extent_int(0), X.data(), 1, Y.data(), 1)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_SSWAP_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Swap< \ + EXECSPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using XVector = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + using YVector = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + static void swap(EXECSPACE const& space, XVector const& X, YVector const& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::swap[TPL_ROCBLAS,float]"); \ + swap_print_specialization(); \ + KokkosBlas::Impl::RocBlasSingleton& singleton = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(singleton.handle, space.hip_stream())); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_sswap(singleton.handle, X.extent_int(0), X.data(), 1, Y.data(), 1)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_ZSWAP_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct Swap*, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View*, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using XVector = Kokkos::View*, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits>; \ - using YVector = Kokkos::View*, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits>; \ - static void swap(EXECSPACE const& space, XVector const& X, YVector const& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::swap[TPL_ROCBLAS,complex_double]"); \ - swap_print_specialization(); \ - KokkosBlas::Impl::RocBlasSingleton& singleton = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(singleton.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zswap(singleton.handle, X.extent_int(0), \ - reinterpret_cast(X.data()), 1, \ - reinterpret_cast(Y.data()), 1)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_ZSWAP_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Swap*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using XVector = Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>; \ + using YVector = Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>; \ + static void swap(EXECSPACE const& space, XVector const& X, YVector const& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::swap[TPL_ROCBLAS,complex_double]"); \ + swap_print_specialization(); \ + KokkosBlas::Impl::RocBlasSingleton& singleton = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(singleton.handle, space.hip_stream())); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_zswap(singleton.handle, X.extent_int(0), \ + reinterpret_cast(X.data()), 1, \ + reinterpret_cast(Y.data()), 1)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_CSWAP_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct Swap*, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View*, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using XVector = Kokkos::View*, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits>; \ - using YVector = Kokkos::View*, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits>; \ - static void swap(EXECSPACE const& space, XVector const& X, YVector const& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::swap[TPL_ROCBLAS,complex_float]"); \ - swap_print_specialization(); \ - KokkosBlas::Impl::RocBlasSingleton& singleton = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(singleton.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_cswap(singleton.handle, X.extent_int(0), \ - reinterpret_cast(X.data()), 1, \ - reinterpret_cast(Y.data()), 1)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_CSWAP_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Swap*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using XVector = Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>; \ + using YVector = Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>; \ + static void swap(EXECSPACE const& space, XVector const& X, YVector const& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::swap[TPL_ROCBLAS,complex_float]"); \ + swap_print_specialization(); \ + KokkosBlas::Impl::RocBlasSingleton& singleton = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(singleton.handle, space.hip_stream())); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_cswap(singleton.handle, X.extent_int(0), \ + reinterpret_cast(X.data()), 1, \ + reinterpret_cast(Y.data()), 1)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; KOKKOSBLAS1_DSWAP_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, true) diff --git a/packages/kokkos-kernels/blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp b/packages/kokkos-kernels/blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp index fcc5762f571d..fdfd17e45437 100644 --- a/packages/kokkos-kernels/blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp +++ b/packages/kokkos-kernels/blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp @@ -253,10 +253,10 @@ namespace Impl { Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_CUBLAS,double]"); \ KOKKOSBLAS2_GEMV_CUBLAS_DETERMINE_ARGS(LAYOUTA); \ KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL( \ cublasDgemv(s.handle, transa, M, N, &alpha, A.data(), LDA, X.data(), one, &beta, Y.data(), one)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, NULL)); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -288,10 +288,10 @@ namespace Impl { Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_CUBLAS,float]"); \ KOKKOSBLAS2_GEMV_CUBLAS_DETERMINE_ARGS(LAYOUTA); \ KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL( \ cublasSgemv(s.handle, transa, M, N, &alpha, A.data(), LDA, X.data(), one, &beta, Y.data(), one)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, NULL)); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -323,12 +323,12 @@ namespace Impl { Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_CUBLAS,complex]"); \ KOKKOSBLAS2_GEMV_CUBLAS_DETERMINE_ARGS(LAYOUTA); \ KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZgemv( \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasZgemv( \ s.handle, transa, M, N, reinterpret_cast(&alpha), \ reinterpret_cast(A.data()), LDA, reinterpret_cast(X.data()), \ one, reinterpret_cast(&beta), reinterpret_cast(Y.data()), one)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, NULL)); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -360,12 +360,12 @@ namespace Impl { Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_CUBLAS,complex]"); \ KOKKOSBLAS2_GEMV_CUBLAS_DETERMINE_ARGS(LAYOUTA); \ KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL( \ cublasCgemv(s.handle, transa, M, N, reinterpret_cast(&alpha), \ reinterpret_cast(A.data()), LDA, reinterpret_cast(X.data()), \ one, reinterpret_cast(&beta), reinterpret_cast(Y.data()), one)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, NULL)); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -448,10 +448,10 @@ namespace Impl { Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_ROCBLAS,double]"); \ KOKKOSBLAS2_GEMV_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL( \ rocblas_dgemv(s.handle, transa, M, N, &alpha, A.data(), LDA, X.data(), one, &beta, Y.data(), one)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, NULL)); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -483,90 +483,90 @@ namespace Impl { Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_ROCBLAS,float]"); \ KOKKOSBLAS2_GEMV_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL( \ rocblas_sgemv(s.handle, transa, M, N, &alpha, A.data(), LDA, X.data(), one, &beta, Y.data(), one)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, NULL)); \ Kokkos::Profiling::popRegion(); \ } \ }; -#define KOKKOSBLAS2_ZGEMV_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct GEMV**, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - \ - static void gemv(const ExecSpace& space, const char trans[], typename AViewType::const_value_type& alpha, \ - const AViewType& A, const XViewType& X, typename YViewType::const_value_type& beta, \ - const YViewType& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_ROCBLAS,complex]"); \ - KOKKOSBLAS2_GEMV_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ - KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zgemv(s.handle, transa, M, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(&beta), \ - reinterpret_cast(Y.data()), one)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_ZGEMV_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct GEMV**, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + \ + static void gemv(const ExecSpace& space, const char trans[], typename AViewType::const_value_type& alpha, \ + const AViewType& A, const XViewType& X, typename YViewType::const_value_type& beta, \ + const YViewType& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_ROCBLAS,complex]"); \ + KOKKOSBLAS2_GEMV_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_zgemv(s.handle, transa, M, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(A.data()), LDA, \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(&beta), \ + reinterpret_cast(Y.data()), one)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_CGEMV_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct GEMV**, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - \ - static void gemv(const ExecSpace& space, const char trans[], typename AViewType::const_value_type& alpha, \ - const AViewType& A, const XViewType& X, typename YViewType::const_value_type& beta, \ - const YViewType& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_ROCBLAS,complex]"); \ - KOKKOSBLAS2_GEMV_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ - KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_cgemv(s.handle, transa, M, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(&beta), \ - reinterpret_cast(Y.data()), one)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_CGEMV_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct GEMV**, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + \ + static void gemv(const ExecSpace& space, const char trans[], typename AViewType::const_value_type& alpha, \ + const AViewType& A, const XViewType& X, typename YViewType::const_value_type& beta, \ + const YViewType& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_ROCBLAS,complex]"); \ + KOKKOSBLAS2_GEMV_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_cgemv(s.handle, transa, M, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(A.data()), LDA, \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(&beta), \ + reinterpret_cast(Y.data()), one)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; KOKKOSBLAS2_DGEMV_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, true) diff --git a/packages/kokkos-kernels/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_cublas.hpp b/packages/kokkos-kernels/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_cublas.hpp index fdb09d1c9170..6bc5eaef62ee 100644 --- a/packages/kokkos-kernels/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_cublas.hpp +++ b/packages/kokkos-kernels/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_cublas.hpp @@ -56,52 +56,56 @@ namespace Impl { Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_CUBLAS,double]"); \ KOKKOSBLAS2_GER_CUBLAS_DETERMINE_ARGS(LAYOUT); \ KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ if (A_is_ll) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDger(s.handle, M, N, &alpha, X.data(), one, Y.data(), one, A.data(), LDA)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL( \ + cublasDger(s.handle, M, N, &alpha, X.data(), one, Y.data(), one, A.data(), LDA)); \ } else { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDger(s.handle, M, N, &alpha, Y.data(), one, X.data(), one, A.data(), LDA)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL( \ + cublasDger(s.handle, M, N, &alpha, Y.data(), one, X.data(), one, A.data(), LDA)); \ } \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, NULL)); \ Kokkos::Profiling::popRegion(); \ } \ }; -#define KOKKOSBLAS2_SGER_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct GER< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef float SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - \ - static void ger(const EXEC_SPACE& space, const char /*trans*/[], typename AViewType::const_value_type& alpha, \ - const XViewType& X, const YViewType& Y, const AViewType& A) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_CUBLAS,float]"); \ - KOKKOSBLAS2_GER_CUBLAS_DETERMINE_ARGS(LAYOUT); \ - KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ - if (A_is_ll) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSger(s.handle, M, N, &alpha, X.data(), one, Y.data(), one, A.data(), LDA)); \ - } else { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSger(s.handle, M, N, &alpha, Y.data(), one, X.data(), one, A.data(), LDA)); \ - } \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_SGER_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct GER< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef float SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void ger(const EXEC_SPACE& space, const char /*trans*/[], typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_CUBLAS,float]"); \ + KOKKOSBLAS2_GER_CUBLAS_DETERMINE_ARGS(LAYOUT); \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ + if (A_is_ll) { \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL( \ + cublasSger(s.handle, M, N, &alpha, X.data(), one, Y.data(), one, A.data(), LDA)); \ + } else { \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL( \ + cublasSger(s.handle, M, N, &alpha, Y.data(), one, X.data(), one, A.data(), LDA)); \ + } \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; #define KOKKOSBLAS2_ZGER_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ @@ -131,31 +135,34 @@ namespace Impl { KOKKOSBLAS2_GER_CUBLAS_DETERMINE_ARGS(LAYOUT); \ bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ if (A_is_ll) { \ if (justTranspose) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZgeru(s.handle, M, N, reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasZgeru(s.handle, M, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ } else { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZgerc(s.handle, M, N, reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasZgerc(s.handle, M, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ } \ } else { \ if (justTranspose) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZgeru(s.handle, M, N, reinterpret_cast(&alpha), \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasZgeru(s.handle, M, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ } else { \ /* cublasZgerc() + ~A_ll => call kokkos-kernels' implementation */ \ GER::ger(space, trans, alpha, X, Y, A); \ } \ } \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, NULL)); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -187,31 +194,31 @@ namespace Impl { KOKKOSBLAS2_GER_CUBLAS_DETERMINE_ARGS(LAYOUT); \ bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ if (A_is_ll) { \ if (justTranspose) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCgeru(s.handle, M, N, reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasCgeru(s.handle, M, N, reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ } else { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCgerc(s.handle, M, N, reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasCgerc(s.handle, M, N, reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ } \ } else { \ if (justTranspose) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCgeru(s.handle, M, N, reinterpret_cast(&alpha), \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasCgeru(s.handle, M, N, reinterpret_cast(&alpha), \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ } else { \ /* cublasCgerc() + ~A_ll => call kokkos-kernels' implementation */ \ GER::ger(space, trans, alpha, X, Y, A); \ } \ } \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, NULL)); \ Kokkos::Profiling::popRegion(); \ } \ }; diff --git a/packages/kokkos-kernels/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_rocblas.hpp b/packages/kokkos-kernels/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_rocblas.hpp index 26a0da58649b..d1171208c5a0 100644 --- a/packages/kokkos-kernels/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_rocblas.hpp +++ b/packages/kokkos-kernels/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_rocblas.hpp @@ -56,15 +56,15 @@ namespace Impl { Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_ROCBLAS,double]"); \ KOKKOSBLAS2_GER_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, space.hip_stream())); \ if (A_is_ll) { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL( \ rocblas_dger(s.handle, M, N, &alpha, X.data(), one, Y.data(), one, A.data(), LDA)); \ } else { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL( \ rocblas_dger(s.handle, M, N, &alpha, Y.data(), one, X.data(), one, A.data(), LDA)); \ } \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, NULL)); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -95,15 +95,15 @@ namespace Impl { Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_ROCBLAS,float]"); \ KOKKOSBLAS2_GER_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, space.hip_stream())); \ if (A_is_ll) { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL( \ rocblas_sger(s.handle, M, N, &alpha, X.data(), one, Y.data(), one, A.data(), LDA)); \ } else { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL( \ rocblas_sger(s.handle, M, N, &alpha, Y.data(), one, X.data(), one, A.data(), LDA)); \ } \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, NULL)); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -135,34 +135,34 @@ namespace Impl { KOKKOSBLAS2_GER_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, space.hip_stream())); \ if (A_is_ll) { \ if (justTranspose) { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zgeru(s.handle, M, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL( \ + rocblas_zgeru(s.handle, M, N, reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ } else { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zgerc(s.handle, M, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL( \ + rocblas_zgerc(s.handle, M, N, reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ } \ } else { \ if (justTranspose) { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zgeru(s.handle, M, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL( \ + rocblas_zgeru(s.handle, M, N, reinterpret_cast(&alpha), \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ } else { \ /* rocblas_zgerc() + ~A_ll => call k-kernels' implementation */ \ GER::ger(space, trans, alpha, X, Y, A); \ } \ } \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, NULL)); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -194,34 +194,34 @@ namespace Impl { KOKKOSBLAS2_GER_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, space.hip_stream())); \ if (A_is_ll) { \ if (justTranspose) { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_cgeru(s.handle, M, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL( \ + rocblas_cgeru(s.handle, M, N, reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ } else { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_cgerc(s.handle, M, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL( \ + rocblas_cgerc(s.handle, M, N, reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ } \ } else { \ if (justTranspose) { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_cgeru(s.handle, M, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL( \ + rocblas_cgeru(s.handle, M, N, reinterpret_cast(&alpha), \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ } else { \ /* rocblas_cgerc() + ~A_ll => call k-kernels' implementation */ \ GER::ger(space, trans, alpha, X, Y, A); \ } \ } \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, NULL)); \ Kokkos::Profiling::popRegion(); \ } \ }; diff --git a/packages/kokkos-kernels/blas/tpls/KokkosBlas2_serial_gemv_tpl_spec_decl.hpp b/packages/kokkos-kernels/blas/tpls/KokkosBlas2_serial_gemv_tpl_spec_decl.hpp index d8944335403b..54c86491e6a1 100644 --- a/packages/kokkos-kernels/blas/tpls/KokkosBlas2_serial_gemv_tpl_spec_decl.hpp +++ b/packages/kokkos-kernels/blas/tpls/KokkosBlas2_serial_gemv_tpl_spec_decl.hpp @@ -35,7 +35,7 @@ namespace Impl { // Note: using GEMM because there is no GEMV in MKL compact routines -#define __IMPL_KK_MKL_DGEMM_COMPACT(SCALAR, MKL_ROUTINE) \ +#define KOKKOSBLAS_IMPL_MKL_DGEMM_COMPACT(SCALAR, MKL_ROUTINE) \ inline void kk_mkl_gemm_compact(MKL_LAYOUT layout, MKL_TRANSPOSE transa, MKL_TRANSPOSE transb, MKL_INT m, MKL_INT n, \ MKL_INT k, SCALAR alpha, const SCALAR *a, MKL_INT ldap, const SCALAR *b, \ MKL_INT ldbp, SCALAR beta, SCALAR *c, MKL_INT ldcp, MKL_COMPACT_PACK format, \ @@ -43,12 +43,12 @@ namespace Impl { MKL_ROUTINE(layout, transa, transb, m, n, k, alpha, a, ldap, b, ldbp, beta, c, ldcp, format, nm); \ } -__IMPL_KK_MKL_DGEMM_COMPACT(double, mkl_dgemm_compact) -__IMPL_KK_MKL_DGEMM_COMPACT(float, mkl_sgemm_compact) +KOKKOSBLAS_IMPL_MKL_DGEMM_COMPACT(double, mkl_dgemm_compact) +KOKKOSBLAS_IMPL_MKL_DGEMM_COMPACT(float, mkl_sgemm_compact) // Note: MKL compact format packs real and imaginary components separately // which makes it not directly compatible with our Vector types -#undef __IMPL_KK_MKL_DGEMM_COMPACT +#undef KOKKOSBLAS_IMPL_MKL_DGEMM_COMPACT template inline MKL_COMPACT_PACK mkl_compact_format() { diff --git a/packages/kokkos-kernels/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_cublas.hpp b/packages/kokkos-kernels/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_cublas.hpp index 4dd95aa79a82..6b3df91afe2d 100644 --- a/packages/kokkos-kernels/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_cublas.hpp +++ b/packages/kokkos-kernels/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_cublas.hpp @@ -58,10 +58,10 @@ namespace Impl { KOKKOSBLAS2_SYR2_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ if (A_is_ll) { \ KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL( \ cublasDsyr2(s.handle, fillMode, N, &alpha, X.data(), one, Y.data(), one, A.data(), LDA)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, NULL)); \ } else { \ /* cublasDsyr2() + ~A_ll => call kokkos-kernels' implementation */ \ SYR2::syr2(space, trans, uplo, alpha, X, \ @@ -99,10 +99,10 @@ namespace Impl { KOKKOSBLAS2_SYR2_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ if (A_is_ll) { \ KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL( \ cublasSsyr2(s.handle, fillMode, N, &alpha, X.data(), one, Y.data(), one, A.data(), LDA)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, NULL)); \ } else { \ /* cublasSsyr2() + ~A_ll => call kokkos-kernels' implementation */ \ SYR2::syr2(space, trans, uplo, alpha, X, \ @@ -142,13 +142,13 @@ namespace Impl { if (justTranspose) { \ if (A_is_ll) { \ KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZsyr2(s.handle, fillMode, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasZsyr2(s.handle, fillMode, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, NULL)); \ } else { \ /* cublasZsyr2() + ~A_ll => call kokkos-kernels' implementation */ \ SYR2::syr2(space, trans, uplo, alpha, X, \ @@ -157,13 +157,13 @@ namespace Impl { } else { \ if (A_is_ll) { \ KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZher2(s.handle, fillMode, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasZher2(s.handle, fillMode, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, NULL)); \ } else { \ /* cublasZher2() + ~A_ll => call kokkos-kernels' implementation */ \ SYR2::syr2(space, trans, uplo, alpha, X, \ @@ -204,12 +204,12 @@ namespace Impl { if (justTranspose) { \ if (A_is_ll) { \ KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCsyr2(s.handle, fillMode, N, reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasCsyr2( \ + s.handle, fillMode, N, reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, NULL)); \ } else { \ /* cublasCsyr2() + ~A_ll => call kokkos-kernels' implementation */ \ SYR2::syr2(space, trans, uplo, alpha, X, \ @@ -218,12 +218,12 @@ namespace Impl { } else { \ if (A_is_ll) { \ KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCher2(s.handle, fillMode, N, reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasCher2( \ + s.handle, fillMode, N, reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, NULL)); \ } else { \ /* cublasCher2() + ~A_ll => call kokkos-kernels' implementation */ \ SYR2::syr2(space, trans, uplo, alpha, X, \ diff --git a/packages/kokkos-kernels/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_rocblas.hpp b/packages/kokkos-kernels/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_rocblas.hpp index 84085224acc9..72f0115b076d 100644 --- a/packages/kokkos-kernels/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_rocblas.hpp +++ b/packages/kokkos-kernels/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_rocblas.hpp @@ -58,10 +58,10 @@ namespace Impl { KOKKOSBLAS2_SYR2_ROCBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ if (A_is_ll) { \ KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL( \ rocblas_dsyr2(s.handle, fillMode, N, &alpha, X.data(), one, Y.data(), one, A.data(), LDA)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, NULL)); \ } else { \ /* rocblas_dsyr2() + ~A_ll => call kokkos-kernels' implementation */ \ SYR2::syr2(space, trans, uplo, alpha, X, \ @@ -99,10 +99,10 @@ namespace Impl { KOKKOSBLAS2_SYR2_ROCBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ if (A_is_ll) { \ KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL( \ rocblas_ssyr2(s.handle, fillMode, N, &alpha, X.data(), one, Y.data(), one, A.data(), LDA)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, NULL)); \ } else { \ /* rocblas_ssyr2() + ~A_ll => call kokkos-kernels' implementation */ \ SYR2::syr2(space, trans, uplo, alpha, X, \ @@ -142,13 +142,13 @@ namespace Impl { if (justTranspose) { \ if (A_is_ll) { \ KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zsyr2(s.handle, fillMode, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL( \ + rocblas_zsyr2(s.handle, fillMode, N, reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, NULL)); \ } else { \ /* rocblas_zsyr2() + ~A_ll => call kokkos-kernels' implementation */ \ SYR2::syr2(space, trans, uplo, alpha, X, \ @@ -157,13 +157,13 @@ namespace Impl { } else { \ if (A_is_ll && (alpha.imag() == 0.)) { \ KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zher2(s.handle, fillMode, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL( \ + rocblas_zher2(s.handle, fillMode, N, reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, NULL)); \ } else { \ /* rocblas_zher2() + ~A_ll => call kokkos-kernels' implementation */ \ SYR2::syr2(space, trans, uplo, alpha, X, \ @@ -204,13 +204,13 @@ namespace Impl { if (justTranspose) { \ if (A_is_ll) { \ KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_csyr2(s.handle, fillMode, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL( \ + rocblas_csyr2(s.handle, fillMode, N, reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, NULL)); \ } else { \ /* rocblas_csyr2() + ~A_ll => call kokkos-kernels' implementation */ \ SYR2::syr2(space, trans, uplo, alpha, X, \ @@ -219,13 +219,13 @@ namespace Impl { } else { \ if (A_is_ll && (alpha.imag() == 0.)) { \ KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_cher2(s.handle, fillMode, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL( \ + rocblas_cher2(s.handle, fillMode, N, reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, NULL)); \ } else { \ /* rocblas_cher2() + ~A_ll => call kokkos-kernels' implementation */ \ SYR2::syr2(space, trans, uplo, alpha, X, \ diff --git a/packages/kokkos-kernels/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_cublas.hpp b/packages/kokkos-kernels/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_cublas.hpp index 43b177d9a5f0..0843d6c50f83 100644 --- a/packages/kokkos-kernels/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_cublas.hpp +++ b/packages/kokkos-kernels/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_cublas.hpp @@ -52,9 +52,9 @@ namespace Impl { KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ if (A_is_ll) { \ KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDsyr(s.handle, fillMode, N, &alpha, X.data(), one, A.data(), LDA)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasDsyr(s.handle, fillMode, N, &alpha, X.data(), one, A.data(), LDA)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, NULL)); \ } else { \ /* cublasDsyr() + ~A_ll => call kokkos-kernels' implementation */ \ SYR::syr(space, trans, uplo, alpha, X, A); \ @@ -85,9 +85,9 @@ namespace Impl { KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ if (A_is_ll) { \ KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSsyr(s.handle, fillMode, N, &alpha, X.data(), one, A.data(), LDA)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSsyr(s.handle, fillMode, N, &alpha, X.data(), one, A.data(), LDA)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, NULL)); \ } else { \ /* cublasSsyr() + ~A_ll => call kokkos-kernels' implementation */ \ SYR::syr(space, trans, uplo, alpha, X, A); \ @@ -96,109 +96,109 @@ namespace Impl { } \ }; -#define KOKKOSBLAS2_ZSYR_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct SYR*, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View**, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - \ - static void syr(const typename AViewType::execution_space& space, const char trans[], const char uplo[], \ - typename AViewType::const_value_type& alpha, const XViewType& X, const AViewType& A) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_CUBLAS,complex]"); \ - KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - if (justTranspose) { \ - if (A_is_ll) { \ - KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZsyr(s.handle, fillMode, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - } else { \ - /* cublasZsyr() + ~A_ll => call kokkos-kernels' implementation */ \ - SYR::syr(space, trans, uplo, alpha, X, A); \ - } \ - } else { \ - if (A_is_ll && (alpha.imag() == 0.)) { \ - const double alpha_val = alpha.real(); \ - KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZher(s.handle, fillMode, N, &alpha_val, \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - } else { \ - /* cublasZher() + [~A_ll or ~real alpha]=> call kokkos-kernels' \ - * implementation */ \ - SYR::syr(space, trans, uplo, alpha, X, A); \ - } \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_ZSYR_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr(const typename AViewType::execution_space& space, const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, const XViewType& X, const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_CUBLAS,complex]"); \ + KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + if (A_is_ll) { \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasZsyr(s.handle, fillMode, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, NULL)); \ + } else { \ + /* cublasZsyr() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR::syr(space, trans, uplo, alpha, X, A); \ + } \ + } else { \ + if (A_is_ll && (alpha.imag() == 0.)) { \ + const double alpha_val = alpha.real(); \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasZher(s.handle, fillMode, N, &alpha_val, \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, NULL)); \ + } else { \ + /* cublasZher() + [~A_ll or ~real alpha]=> call kokkos-kernels' \ + * implementation */ \ + SYR::syr(space, trans, uplo, alpha, X, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_CSYR_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct SYR*, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View**, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - \ - static void syr(const typename AViewType::execution_space& space, const char trans[], const char uplo[], \ - typename AViewType::const_value_type& alpha, const XViewType& X, const AViewType& A) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_CUBLAS,complex]"); \ - KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - if (justTranspose) { \ - if (A_is_ll) { \ - KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCsyr(s.handle, fillMode, N, reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - } else { \ - /* cublasCsyr() + ~A_ll => call kokkos-kernels' implementation */ \ - SYR::syr(space, trans, uplo, alpha, X, A); \ - } \ - } else { \ - if (A_is_ll && (alpha.imag() == 0.)) { \ - const float alpha_val = alpha.real(); \ - KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCher(s.handle, fillMode, N, &alpha_val, \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - } else { \ - /* cublasCher() + [~A_ll or ~real alpha]=> call kokkos-kernels' \ - * implementation */ \ - SYR::syr(space, trans, uplo, alpha, X, A); \ - } \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_CSYR_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr(const typename AViewType::execution_space& space, const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, const XViewType& X, const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_CUBLAS,complex]"); \ + KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + if (A_is_ll) { \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasCsyr( \ + s.handle, fillMode, N, reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, reinterpret_cast(A.data()), LDA)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, NULL)); \ + } else { \ + /* cublasCsyr() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR::syr(space, trans, uplo, alpha, X, A); \ + } \ + } else { \ + if (A_is_ll && (alpha.imag() == 0.)) { \ + const float alpha_val = alpha.real(); \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasCher(s.handle, fillMode, N, &alpha_val, \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, NULL)); \ + } else { \ + /* cublasCher() + [~A_ll or ~real alpha]=> call kokkos-kernels' \ + * implementation */ \ + SYR::syr(space, trans, uplo, alpha, X, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, true) diff --git a/packages/kokkos-kernels/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp b/packages/kokkos-kernels/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp index 59c99c1225ec..3780abefefd5 100644 --- a/packages/kokkos-kernels/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp +++ b/packages/kokkos-kernels/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp @@ -52,10 +52,10 @@ namespace Impl { KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ if (A_is_ll) { \ KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_host)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_dsyr(s.handle, fillMode, N, &alpha, X.data(), one, A.data(), LDA)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_host)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_dsyr(s.handle, fillMode, N, &alpha, X.data(), one, A.data(), LDA)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, NULL)); \ } else { \ /* rocblas_dsyr() + ~A_ll => call kokkos-kernels' implementation */ \ SYR::syr(space, trans, uplo, alpha, X, A); \ @@ -86,10 +86,10 @@ namespace Impl { KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ if (A_is_ll) { \ KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_host)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_ssyr(s.handle, fillMode, N, &alpha, X.data(), one, A.data(), LDA)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_host)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_ssyr(s.handle, fillMode, N, &alpha, X.data(), one, A.data(), LDA)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, NULL)); \ } else { \ /* rocblas_ssyr() + ~A_ll => call kokkos-kernels' implementation */ \ SYR::syr(space, trans, uplo, alpha, X, A); \ @@ -98,114 +98,114 @@ namespace Impl { } \ }; -#define KOKKOSBLAS2_ZSYR_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct SYR*, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View**, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - \ - static void syr(const typename AViewType::execution_space& space, const char trans[], const char uplo[], \ - typename AViewType::const_value_type& alpha, const XViewType& X, const AViewType& A) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_ROCBLAS,complex]"); \ - KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - if (justTranspose) { \ - if (A_is_ll) { \ - KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_host)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zsyr(s.handle, fillMode, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - } else { \ - /* rocblas_zsyr() + ~A_ll => call kokkos-kernels' implementation */ \ - SYR::syr(space, trans, uplo, alpha, X, A); \ - } \ - } else { \ - if (A_is_ll && (alpha.imag() == 0.)) { \ - const double alpha_val = alpha.real(); \ - KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_host)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zher(s.handle, fillMode, N, &alpha_val, \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - } else { \ - /* rocblas_zher() + [~A_ll or ~real alpha]=> call kokkos-kernels' \ - * implementation */ \ - SYR::syr(space, trans, uplo, alpha, X, A); \ - } \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_ZSYR_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr(const typename AViewType::execution_space& space, const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, const XViewType& X, const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_ROCBLAS,complex]"); \ + KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + if (A_is_ll) { \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_host)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL( \ + rocblas_zsyr(s.handle, fillMode, N, reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + /* rocblas_zsyr() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR::syr(space, trans, uplo, alpha, X, A); \ + } \ + } else { \ + if (A_is_ll && (alpha.imag() == 0.)) { \ + const double alpha_val = alpha.real(); \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_host)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_zher( \ + s.handle, fillMode, N, &alpha_val, reinterpret_cast(X.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + /* rocblas_zher() + [~A_ll or ~real alpha]=> call kokkos-kernels' \ + * implementation */ \ + SYR::syr(space, trans, uplo, alpha, X, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_CSYR_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct SYR*, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View**, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - \ - static void syr(const typename AViewType::execution_space& space, const char trans[], const char uplo[], \ - typename AViewType::const_value_type& alpha, const XViewType& X, const AViewType& A) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_ROCBLAS,complex]"); \ - KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - if (justTranspose) { \ - if (A_is_ll) { \ - KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_host)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_csyr(s.handle, fillMode, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - } else { \ - /* rocblas_csyr() + ~A_ll => call kokkos-kernels' implementation */ \ - SYR::syr(space, trans, uplo, alpha, X, A); \ - } \ - } else { \ - if (A_is_ll && (alpha.imag() == 0.)) { \ - const float alpha_val = alpha.real(); \ - KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_host)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_cher(s.handle, fillMode, N, &alpha_val, \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - } else { \ - /* rocblas_cher() + [~A_ll or ~real alpha]=> call kokkos-kernels' \ - * implementation */ \ - SYR::syr(space, trans, uplo, alpha, X, A); \ - } \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_CSYR_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr(const typename AViewType::execution_space& space, const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, const XViewType& X, const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_ROCBLAS,complex]"); \ + KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + if (A_is_ll) { \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_host)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL( \ + rocblas_csyr(s.handle, fillMode, N, reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + /* rocblas_csyr() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR::syr(space, trans, uplo, alpha, X, A); \ + } \ + } else { \ + if (A_is_ll && (alpha.imag() == 0.)) { \ + const float alpha_val = alpha.real(); \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_host)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_cher( \ + s.handle, fillMode, N, &alpha_val, reinterpret_cast(X.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + /* rocblas_cher() + [~A_ll or ~real alpha]=> call kokkos-kernels' \ + * implementation */ \ + SYR::syr(space, trans, uplo, alpha, X, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; KOKKOSBLAS2_DSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, true) diff --git a/packages/kokkos-kernels/blas/tpls/KokkosBlas3_gemm_tpl_spec_decl.hpp b/packages/kokkos-kernels/blas/tpls/KokkosBlas3_gemm_tpl_spec_decl.hpp index 52123a9daf8c..010411ef8064 100644 --- a/packages/kokkos-kernels/blas/tpls/KokkosBlas3_gemm_tpl_spec_decl.hpp +++ b/packages/kokkos-kernels/blas/tpls/KokkosBlas3_gemm_tpl_spec_decl.hpp @@ -175,20 +175,20 @@ namespace Impl { gemm.run(space, conjT); \ } else { \ KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ if (!A_is_lr && !B_is_lr && !C_is_lr) \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(CUBLAS_FN( \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(CUBLAS_FN( \ s.handle, transa, transb, M, N, K, reinterpret_cast(&alpha), \ reinterpret_cast(A.data()), LDA, \ reinterpret_cast(B.data()), LDB, \ reinterpret_cast(&beta), reinterpret_cast(C.data()), LDC)); \ if (A_is_lr && B_is_lr && C_is_lr) \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(CUBLAS_FN( \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(CUBLAS_FN( \ s.handle, transb, transa, N, M, K, reinterpret_cast(&alpha), \ reinterpret_cast(B.data()), LDB, \ reinterpret_cast(A.data()), LDA, \ reinterpret_cast(&beta), reinterpret_cast(C.data()), LDC)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, NULL)); \ } \ Kokkos::Profiling::popRegion(); \ } \ @@ -260,78 +260,78 @@ KOKKOSBLAS3_CGEMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::Layou namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS3_XGEMM_ROCBLAS(SCALAR_TYPE, ROCBLAS_SCALAR_TYPE, ROCBLAS_FN, LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct GEMM, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef SCALAR_TYPE SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - BViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - CViewType; \ - \ - static void gemm(const typename CViewType::execution_space& space, const char transA[], const char transB[], \ - typename AViewType::const_value_type& alpha, const AViewType& A, const BViewType& B, \ - typename CViewType::const_value_type& beta, const CViewType& C) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::gemm[TPL_ROCBLAS," #SCALAR_TYPE "]"); \ - \ - const bool A_t = (transA[0] != 'N') && (transA[0] != 'n'); \ - const int M = static_cast(C.extent(0)); \ - const int N = static_cast(C.extent(1)); \ - const int K = static_cast(A.extent(A_t ? 0 : 1)); \ - \ - bool is_lr = std::is_same::value; \ - \ - const int AST = is_lr ? A.stride(0) : A.stride(1), LDA = AST == 0 ? 1 : AST; \ - const int BST = is_lr ? B.stride(0) : B.stride(1), LDB = BST == 0 ? 1 : BST; \ - const int CST = is_lr ? C.stride(0) : C.stride(1), LDC = CST == 0 ? 1 : CST; \ - \ - rocblas_operation transa = trans_mode_kk_to_rocblas(transA); \ - rocblas_operation transb = trans_mode_kk_to_rocblas(transB); \ - \ - constexpr int numDotsLayoutLeftThreshold = 1600; \ - constexpr int numDotsLayoutRightThreshold = 100; \ - if ((!is_lr && transa != rocblas_operation_none && transb == rocblas_operation_none && \ - M * N < numDotsLayoutLeftThreshold) || \ - (is_lr && transa != rocblas_operation_none && transb == rocblas_operation_none && \ - M * N < numDotsLayoutRightThreshold)) { \ - DotBasedGEMM gemm(alpha, A, B, beta, C); \ - bool conjT = (std::is_same::value || std::is_same::value) \ - ? false \ - : (transa == rocblas_operation_conjugate_transpose ? true : false); \ - gemm.run(space, conjT); \ - } else { \ - KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ - if (!is_lr) \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(ROCBLAS_FN(s.handle, transa, transb, M, N, K, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(B.data()), LDB, \ - reinterpret_cast(&beta), \ - reinterpret_cast(C.data()), LDC)); \ - else \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(ROCBLAS_FN(s.handle, transb, transa, N, M, K, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(B.data()), LDB, \ - reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(&beta), \ - reinterpret_cast(C.data()), LDC)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS3_XGEMM_ROCBLAS(SCALAR_TYPE, ROCBLAS_SCALAR_TYPE, ROCBLAS_FN, LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct GEMM, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef SCALAR_TYPE SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + BViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + CViewType; \ + \ + static void gemm(const typename CViewType::execution_space& space, const char transA[], const char transB[], \ + typename AViewType::const_value_type& alpha, const AViewType& A, const BViewType& B, \ + typename CViewType::const_value_type& beta, const CViewType& C) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::gemm[TPL_ROCBLAS," #SCALAR_TYPE "]"); \ + \ + const bool A_t = (transA[0] != 'N') && (transA[0] != 'n'); \ + const int M = static_cast(C.extent(0)); \ + const int N = static_cast(C.extent(1)); \ + const int K = static_cast(A.extent(A_t ? 0 : 1)); \ + \ + bool is_lr = std::is_same::value; \ + \ + const int AST = is_lr ? A.stride(0) : A.stride(1), LDA = AST == 0 ? 1 : AST; \ + const int BST = is_lr ? B.stride(0) : B.stride(1), LDB = BST == 0 ? 1 : BST; \ + const int CST = is_lr ? C.stride(0) : C.stride(1), LDC = CST == 0 ? 1 : CST; \ + \ + rocblas_operation transa = trans_mode_kk_to_rocblas(transA); \ + rocblas_operation transb = trans_mode_kk_to_rocblas(transB); \ + \ + constexpr int numDotsLayoutLeftThreshold = 1600; \ + constexpr int numDotsLayoutRightThreshold = 100; \ + if ((!is_lr && transa != rocblas_operation_none && transb == rocblas_operation_none && \ + M * N < numDotsLayoutLeftThreshold) || \ + (is_lr && transa != rocblas_operation_none && transb == rocblas_operation_none && \ + M * N < numDotsLayoutRightThreshold)) { \ + DotBasedGEMM gemm(alpha, A, B, beta, C); \ + bool conjT = (std::is_same::value || std::is_same::value) \ + ? false \ + : (transa == rocblas_operation_conjugate_transpose ? true : false); \ + gemm.run(space, conjT); \ + } else { \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, space.hip_stream())); \ + if (!is_lr) \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(ROCBLAS_FN(s.handle, transa, transb, M, N, K, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(A.data()), LDA, \ + reinterpret_cast(B.data()), LDB, \ + reinterpret_cast(&beta), \ + reinterpret_cast(C.data()), LDC)); \ + else \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(ROCBLAS_FN(s.handle, transb, transa, N, M, K, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(B.data()), LDB, \ + reinterpret_cast(A.data()), LDA, \ + reinterpret_cast(&beta), \ + reinterpret_cast(C.data()), LDC)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, NULL)); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; #define KOKKOSBLAS3_DGEMM_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ diff --git a/packages/kokkos-kernels/blas/tpls/KokkosBlas3_trmm_tpl_spec_decl.hpp b/packages/kokkos-kernels/blas/tpls/KokkosBlas3_trmm_tpl_spec_decl.hpp index 4e68c08decfa..669477e3aa3a 100644 --- a/packages/kokkos-kernels/blas/tpls/KokkosBlas3_trmm_tpl_spec_decl.hpp +++ b/packages/kokkos-kernels/blas/tpls/KokkosBlas3_trmm_tpl_spec_decl.hpp @@ -199,19 +199,19 @@ namespace Impl { diag_ = CUBLAS_DIAG_NON_UNIT; \ \ KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ if (A_is_layout_left) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(CUBLAS_FN( \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(CUBLAS_FN( \ s.handle, side_, uplo_, trans_, diag_, M, N, reinterpret_cast(&alpha), \ reinterpret_cast(A.data()), LDA, reinterpret_cast(B.data()), \ LDB, reinterpret_cast(B.data()), LDB)); \ } else { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(CUBLAS_FN( \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(CUBLAS_FN( \ s.handle, side_, uplo_, trans_, diag_, N, M, reinterpret_cast(&alpha), \ reinterpret_cast(A.data()), LDA, reinterpret_cast(B.data()), \ LDB, reinterpret_cast(B.data()), LDB)); \ } \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, NULL)); \ Kokkos::Profiling::popRegion(); \ } \ }; diff --git a/packages/kokkos-kernels/blas/tpls/KokkosBlas3_trsm_tpl_spec_decl.hpp b/packages/kokkos-kernels/blas/tpls/KokkosBlas3_trsm_tpl_spec_decl.hpp index 7074a4e0e2ac..b0b8f0de7372 100644 --- a/packages/kokkos-kernels/blas/tpls/KokkosBlas3_trsm_tpl_spec_decl.hpp +++ b/packages/kokkos-kernels/blas/tpls/KokkosBlas3_trsm_tpl_spec_decl.hpp @@ -370,15 +370,15 @@ namespace Impl { diag_ = CUBLAS_DIAG_NON_UNIT; \ \ KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ if (A_is_ll) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL( \ cublasDtrsm(s.handle, side_, uplo_, trans_, diag_, M, N, &alpha, A.data(), LDA, B.data(), LDB)); \ } else { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL( \ cublasDtrsm(s.handle, side_, uplo_, trans_, diag_, N, M, &alpha, A.data(), LDA, B.data(), LDB)); \ } \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, NULL)); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -449,102 +449,102 @@ namespace Impl { diag_ = CUBLAS_DIAG_NON_UNIT; \ \ KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ if (A_is_ll) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL( \ cublasStrsm(s.handle, side_, uplo_, trans_, diag_, M, N, &alpha, A.data(), LDA, B.data(), LDB)); \ } else { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL( \ cublasStrsm(s.handle, side_, uplo_, trans_, diag_, N, M, &alpha, A.data(), LDA, B.data(), LDB)); \ } \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, NULL)); \ \ Kokkos::Profiling::popRegion(); \ } \ }; -#define KOKKOSBLAS3_ZTRSM_CUBLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct TRSM**, LAYOUTA, Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View**, LAYOUTB, Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - BViewType; \ - \ - static void trsm(const ExecSpace& space, const char side[], const char uplo[], const char trans[], \ - const char diag[], typename BViewType::const_value_type& alpha, const AViewType& A, \ - const BViewType& B) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::trsm[TPL_CUBLAS,complex]"); \ - const int M = static_cast(B.extent(0)); \ - const int N = static_cast(B.extent(1)); \ - \ - bool A_is_ll = std::is_same::value; \ - bool B_is_ll = std::is_same::value; \ - \ - const int AST = A_is_ll ? A.stride(1) : A.stride(0), LDA = (AST == 0) ? 1 : AST; \ - const int BST = B_is_ll ? B.stride(1) : B.stride(0), LDB = (BST == 0) ? 1 : BST; \ - \ - cublasSideMode_t side_; \ - cublasFillMode_t uplo_; \ - cublasOperation_t trans_; \ - cublasDiagType_t diag_; \ - \ - if (A_is_ll) { \ - if ((side[0] == 'L') || (side[0] == 'l')) \ - side_ = CUBLAS_SIDE_LEFT; \ - else \ - side_ = CUBLAS_SIDE_RIGHT; \ - if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ - uplo_ = CUBLAS_FILL_MODE_LOWER; \ - else \ - uplo_ = CUBLAS_FILL_MODE_UPPER; \ - } else { \ - if ((side[0] == 'L') || (side[0] == 'l')) \ - side_ = CUBLAS_SIDE_RIGHT; \ - else \ - side_ = CUBLAS_SIDE_LEFT; \ - if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ - uplo_ = CUBLAS_FILL_MODE_UPPER; \ - else \ - uplo_ = CUBLAS_FILL_MODE_LOWER; \ - } \ - \ - if ((trans[0] == 'N') || (trans[0] == 'n')) \ - trans_ = CUBLAS_OP_N; \ - else if ((trans[0] == 'T') || (trans[0] == 't')) \ - trans_ = CUBLAS_OP_T; \ - else \ - trans_ = CUBLAS_OP_C; \ - if ((diag[0] == 'U') || (diag[0] == 'u')) \ - diag_ = CUBLAS_DIAG_UNIT; \ - else \ - diag_ = CUBLAS_DIAG_NON_UNIT; \ - \ - KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ - if (A_is_ll) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZtrsm(s.handle, side_, uplo_, trans_, diag_, M, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(B.data()), LDB)); \ - } else { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZtrsm(s.handle, side_, uplo_, trans_, diag_, N, M, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(B.data()), LDB)); \ - } \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS3_ZTRSM_CUBLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct TRSM**, LAYOUTA, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUTB, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + BViewType; \ + \ + static void trsm(const ExecSpace& space, const char side[], const char uplo[], const char trans[], \ + const char diag[], typename BViewType::const_value_type& alpha, const AViewType& A, \ + const BViewType& B) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::trsm[TPL_CUBLAS,complex]"); \ + const int M = static_cast(B.extent(0)); \ + const int N = static_cast(B.extent(1)); \ + \ + bool A_is_ll = std::is_same::value; \ + bool B_is_ll = std::is_same::value; \ + \ + const int AST = A_is_ll ? A.stride(1) : A.stride(0), LDA = (AST == 0) ? 1 : AST; \ + const int BST = B_is_ll ? B.stride(1) : B.stride(0), LDB = (BST == 0) ? 1 : BST; \ + \ + cublasSideMode_t side_; \ + cublasFillMode_t uplo_; \ + cublasOperation_t trans_; \ + cublasDiagType_t diag_; \ + \ + if (A_is_ll) { \ + if ((side[0] == 'L') || (side[0] == 'l')) \ + side_ = CUBLAS_SIDE_LEFT; \ + else \ + side_ = CUBLAS_SIDE_RIGHT; \ + if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ + uplo_ = CUBLAS_FILL_MODE_LOWER; \ + else \ + uplo_ = CUBLAS_FILL_MODE_UPPER; \ + } else { \ + if ((side[0] == 'L') || (side[0] == 'l')) \ + side_ = CUBLAS_SIDE_RIGHT; \ + else \ + side_ = CUBLAS_SIDE_LEFT; \ + if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ + uplo_ = CUBLAS_FILL_MODE_UPPER; \ + else \ + uplo_ = CUBLAS_FILL_MODE_LOWER; \ + } \ + \ + if ((trans[0] == 'N') || (trans[0] == 'n')) \ + trans_ = CUBLAS_OP_N; \ + else if ((trans[0] == 'T') || (trans[0] == 't')) \ + trans_ = CUBLAS_OP_T; \ + else \ + trans_ = CUBLAS_OP_C; \ + if ((diag[0] == 'U') || (diag[0] == 'u')) \ + diag_ = CUBLAS_DIAG_UNIT; \ + else \ + diag_ = CUBLAS_DIAG_NON_UNIT; \ + \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ + if (A_is_ll) { \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasZtrsm(s.handle, side_, uplo_, trans_, diag_, M, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(A.data()), LDA, \ + reinterpret_cast(B.data()), LDB)); \ + } else { \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasZtrsm(s.handle, side_, uplo_, trans_, diag_, N, M, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(A.data()), LDA, \ + reinterpret_cast(B.data()), LDB)); \ + } \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, NULL)); \ + \ + Kokkos::Profiling::popRegion(); \ + } \ }; #define KOKKOSBLAS3_CTRSM_CUBLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ @@ -613,17 +613,17 @@ namespace Impl { diag_ = CUBLAS_DIAG_NON_UNIT; \ \ KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ if (A_is_ll) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCtrsm( \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasCtrsm( \ s.handle, side_, uplo_, trans_, diag_, M, N, reinterpret_cast(&alpha), \ reinterpret_cast(A.data()), LDA, reinterpret_cast(B.data()), LDB)); \ } else { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCtrsm( \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasCtrsm( \ s.handle, side_, uplo_, trans_, diag_, N, M, reinterpret_cast(&alpha), \ reinterpret_cast(A.data()), LDA, reinterpret_cast(B.data()), LDB)); \ } \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, NULL)); \ \ Kokkos::Profiling::popRegion(); \ } \ diff --git a/packages/kokkos-kernels/blas/tpls/KokkosBlas_Cuda_tpl.hpp b/packages/kokkos-kernels/blas/tpls/KokkosBlas_Cuda_tpl.hpp index d80e3a23d8e1..fa2749c9800d 100644 --- a/packages/kokkos-kernels/blas/tpls/KokkosBlas_Cuda_tpl.hpp +++ b/packages/kokkos-kernels/blas/tpls/KokkosBlas_Cuda_tpl.hpp @@ -25,12 +25,24 @@ namespace Impl { CudaBlasSingleton::CudaBlasSingleton() { cublasStatus_t stat = cublasCreate(&handle); if (stat != CUBLAS_STATUS_SUCCESS) Kokkos::abort("CUBLAS initialization failed\n"); - - Kokkos::push_finalize_hook([&]() { cublasDestroy(handle); }); } CudaBlasSingleton& CudaBlasSingleton::singleton() { - static CudaBlasSingleton s; + std::unique_ptr& instance = get_instance(); + if (!instance) { + instance = std::make_unique(); + Kokkos::push_finalize_hook([&]() { + cublasDestroy(instance->handle); + instance.reset(); + }); + } + return *instance; +} + +bool CudaBlasSingleton::is_initialized() { return get_instance() != nullptr; } + +std::unique_ptr& CudaBlasSingleton::get_instance() { + static std::unique_ptr s; return s; } diff --git a/packages/kokkos-kernels/blas/tpls/KokkosBlas_Host_tpl.cpp b/packages/kokkos-kernels/blas/tpls/KokkosBlas_Host_tpl.cpp index 6989aea34d51..c163dc726de4 100644 --- a/packages/kokkos-kernels/blas/tpls/KokkosBlas_Host_tpl.cpp +++ b/packages/kokkos-kernels/blas/tpls/KokkosBlas_Host_tpl.cpp @@ -790,6 +790,18 @@ double HostBlas >::nrm2(KK_INT n, const std::complex double HostBlas >::asum(KK_INT n, const std::complex* x, KK_INT x_inc) { + // see issue 2005 + // On some platforms with OpenBLAS < 0.3.26, dzasum on vectors less than 16 entries is producing 0. + // this has been observed on some (not all) systems with: + // clang 14.0.6 / 15.0.7 AND OpenBLAS 0.3.23 AND Sapphire Rapids CPU + // unfortunately, it's not clear exactly what the trigger is + if (n > 0 && n < 16) { + double ret = 0.0; + for (int i = 0; i < n; ++i) { + ret += Kokkos::abs(x[i].real()) + Kokkos::abs(x[i].imag()); + } + return ret; + } return F77_FUNC_DZASUM(&n, x, &x_inc); } template <> diff --git a/packages/kokkos-kernels/blas/tpls/KokkosBlas_Host_tpl.hpp b/packages/kokkos-kernels/blas/tpls/KokkosBlas_Host_tpl.hpp index 3ccf2f822aac..576fde847116 100644 --- a/packages/kokkos-kernels/blas/tpls/KokkosBlas_Host_tpl.hpp +++ b/packages/kokkos-kernels/blas/tpls/KokkosBlas_Host_tpl.hpp @@ -32,7 +32,7 @@ namespace KokkosBlas { namespace Impl { -#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) +#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) && defined(MKL_PROVIDES_BLAS_LAPACK) using KK_INT = MKL_INT; #else using KK_INT = int; diff --git a/packages/kokkos-kernels/blas/tpls/KokkosBlas_Magma_tpl.hpp b/packages/kokkos-kernels/blas/tpls/KokkosBlas_Magma_tpl.hpp index f149a790df53..bce5d4057a09 100644 --- a/packages/kokkos-kernels/blas/tpls/KokkosBlas_Magma_tpl.hpp +++ b/packages/kokkos-kernels/blas/tpls/KokkosBlas_Magma_tpl.hpp @@ -25,12 +25,24 @@ namespace Impl { MagmaSingleton::MagmaSingleton() { magma_int_t stat = magma_init(); if (stat != MAGMA_SUCCESS) Kokkos::abort("MAGMA initialization failed\n"); - - Kokkos::push_finalize_hook([&]() { magma_finalize(); }); } MagmaSingleton& MagmaSingleton::singleton() { - static MagmaSingleton s; + std::unique_ptr& instance = get_instance(); + if (!instance) { + instance = std::make_unique(); + Kokkos::push_finalize_hook([&]() { + magma_finalize(); + instance.reset(); + }); + } + return *instance; +} + +bool MagmaSingleton::is_initialized() { return get_instance() != nullptr; } + +std::unique_ptr& MagmaSingleton::get_instance() { + static std::unique_ptr s; return s; } diff --git a/packages/kokkos-kernels/blas/tpls/KokkosBlas_Rocm_tpl.hpp b/packages/kokkos-kernels/blas/tpls/KokkosBlas_Rocm_tpl.hpp index b5a7dabf6f2d..8285570c2b37 100644 --- a/packages/kokkos-kernels/blas/tpls/KokkosBlas_Rocm_tpl.hpp +++ b/packages/kokkos-kernels/blas/tpls/KokkosBlas_Rocm_tpl.hpp @@ -22,14 +22,24 @@ namespace KokkosBlas { namespace Impl { -RocBlasSingleton::RocBlasSingleton() { - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_create_handle(&handle)); +RocBlasSingleton::RocBlasSingleton() { KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_create_handle(&handle)); } - Kokkos::push_finalize_hook([&]() { KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_destroy_handle(handle)); }); +RocBlasSingleton& RocBlasSingleton::singleton() { + std::unique_ptr& instance = get_instance(); + if (!instance) { + instance = std::make_unique(); + Kokkos::push_finalize_hook([&]() { + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_destroy_handle(instance->handle)); + instance.reset(); + }); + } + return *instance; } -RocBlasSingleton& RocBlasSingleton::singleton() { - static RocBlasSingleton s; +bool RocBlasSingleton::is_initialized() { return get_instance() != nullptr; } + +std::unique_ptr& RocBlasSingleton::get_instance() { + static std::unique_ptr s; return s; } diff --git a/packages/kokkos-kernels/blas/tpls/KokkosBlas_magma.hpp b/packages/kokkos-kernels/blas/tpls/KokkosBlas_magma.hpp index 5f5fcfe4e189..e86d2ee2cd32 100644 --- a/packages/kokkos-kernels/blas/tpls/KokkosBlas_magma.hpp +++ b/packages/kokkos-kernels/blas/tpls/KokkosBlas_magma.hpp @@ -27,7 +27,11 @@ namespace Impl { struct MagmaSingleton { MagmaSingleton(); + static bool is_initialized(); static MagmaSingleton& singleton(); + + private: + static std::unique_ptr& get_instance(); }; } // namespace Impl diff --git a/packages/kokkos-kernels/blas/tpls/KokkosBlas_tpl_spec.hpp b/packages/kokkos-kernels/blas/tpls/KokkosBlas_tpl_spec.hpp index 7f40edf43592..c8593e919973 100644 --- a/packages/kokkos-kernels/blas/tpls/KokkosBlas_tpl_spec.hpp +++ b/packages/kokkos-kernels/blas/tpls/KokkosBlas_tpl_spec.hpp @@ -29,7 +29,11 @@ struct CudaBlasSingleton { CudaBlasSingleton(); + static bool is_initialized(); static CudaBlasSingleton& singleton(); + + private: + static std::unique_ptr& get_instance(); }; inline void cublas_internal_error_throw(cublasStatus_t cublasState, const char* name, const char* file, @@ -82,7 +86,8 @@ inline void cublas_internal_safe_call(cublasStatus_t cublasState, const char* na // The macro below defines the interface for the safe cublas calls. // The functions themselves are protected by impl namespace and this // is not meant to be used by external application or libraries. -#define KOKKOS_CUBLAS_SAFE_CALL_IMPL(call) KokkosBlas::Impl::cublas_internal_safe_call(call, #call, __FILE__, __LINE__) +#define KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(call) \ + KokkosBlas::Impl::cublas_internal_safe_call(call, #call, __FILE__, __LINE__) /// \brief This function converts KK transpose mode to cuBLAS transpose mode inline cublasOperation_t trans_mode_kk_to_cublas(const char kkMode[]) { @@ -111,7 +116,12 @@ struct RocBlasSingleton { RocBlasSingleton(); + static bool is_initialized(); + static RocBlasSingleton& singleton(); + + private: + static std::unique_ptr& get_instance(); }; inline void rocblas_internal_error_throw(rocblas_status rocblasState, const char* name, const char* file, @@ -171,7 +181,7 @@ inline void rocblas_internal_safe_call(rocblas_status rocblasState, const char* // The macro below defines the interface for the safe rocblas calls. // The functions themselves are protected by impl namespace and this // is not meant to be used by external application or libraries. -#define KOKKOS_ROCBLAS_SAFE_CALL_IMPL(call) \ +#define KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(call) \ KokkosBlas::Impl::rocblas_internal_safe_call(call, #call, __FILE__, __LINE__) /// \brief This function converts KK transpose mode to rocBLAS transpose mode diff --git a/packages/kokkos-kernels/blas/unit_test/Test_Blas2_gemv.hpp b/packages/kokkos-kernels/blas/unit_test/Test_Blas2_gemv.hpp index d70935c2acd8..be6e4a6d5102 100644 --- a/packages/kokkos-kernels/blas/unit_test/Test_Blas2_gemv.hpp +++ b/packages/kokkos-kernels/blas/unit_test/Test_Blas2_gemv.hpp @@ -19,6 +19,7 @@ #include #include #include +#include "KokkosKernels_TestVanilla.hpp" namespace Test { template diff --git a/packages/kokkos-kernels/blas/unit_test/Test_Blas2_ger.hpp b/packages/kokkos-kernels/blas/unit_test/Test_Blas2_ger.hpp index 6e975532e138..19e69a1c98e2 100644 --- a/packages/kokkos-kernels/blas/unit_test/Test_Blas2_ger.hpp +++ b/packages/kokkos-kernels/blas/unit_test/Test_Blas2_ger.hpp @@ -153,7 +153,7 @@ GerTester::GerT std::is_same>::value), _A_is_lr(std::is_same::value), _A_is_ll(std::is_same::value), - _testIsGpu(KokkosKernels::Impl::kk_is_gpu_exec_space()) + _testIsGpu(KokkosKernels::Impl::is_gpu_exec_space_v) #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS , _vanillaUsesDifferentOrderOfOps(_A_is_lr && _testIsGpu) diff --git a/packages/kokkos-kernels/blas/unit_test/Test_Blas2_syr.hpp b/packages/kokkos-kernels/blas/unit_test/Test_Blas2_syr.hpp index 8dc7cadf517b..cc1d011351ae 100644 --- a/packages/kokkos-kernels/blas/unit_test/Test_Blas2_syr.hpp +++ b/packages/kokkos-kernels/blas/unit_test/Test_Blas2_syr.hpp @@ -153,7 +153,7 @@ SyrTester::SyrTester() std::is_same>::value), _A_is_lr(std::is_same::value), _A_is_ll(std::is_same::value), - _testIsGpu(KokkosKernels::Impl::kk_is_gpu_exec_space()) + _testIsGpu(KokkosKernels::Impl::is_gpu_exec_space_v) #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS , _vanillaUsesDifferentOrderOfOps(_A_is_lr) diff --git a/packages/kokkos-kernels/blas/unit_test/Test_Blas2_syr2.hpp b/packages/kokkos-kernels/blas/unit_test/Test_Blas2_syr2.hpp index 2d6792f8c840..de475b5340be 100644 --- a/packages/kokkos-kernels/blas/unit_test/Test_Blas2_syr2.hpp +++ b/packages/kokkos-kernels/blas/unit_test/Test_Blas2_syr2.hpp @@ -164,7 +164,7 @@ Syr2Tester::Syr std::is_same>::value), _A_is_lr(std::is_same::value), _A_is_ll(std::is_same::value), - _testIsGpu(KokkosKernels::Impl::kk_is_gpu_exec_space()) + _testIsGpu(KokkosKernels::Impl::is_gpu_exec_space_v) #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS , _vanillaUsesDifferentOrderOfOps(_A_is_lr) diff --git a/packages/kokkos-kernels/blas/unit_test/Test_Blas_rocblas.hpp b/packages/kokkos-kernels/blas/unit_test/Test_Blas_rocblas.hpp index 091fac725921..aac11b180b53 100644 --- a/packages/kokkos-kernels/blas/unit_test/Test_Blas_rocblas.hpp +++ b/packages/kokkos-kernels/blas/unit_test/Test_Blas_rocblas.hpp @@ -41,11 +41,11 @@ void test_rocblas_safe_call() { bool caught_exception = false; rocblas_status myStatus = rocblas_status_success; - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(myStatus); + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(myStatus); try { myStatus = rocblas_status_internal_error; - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(myStatus); + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(myStatus); } catch (std::runtime_error& e) { caught_exception = true; } diff --git a/packages/kokkos-kernels/cmake/KokkosKernels_config.h.in b/packages/kokkos-kernels/cmake/KokkosKernels_config.h.in index c3865559c0b3..fa9f55684758 100644 --- a/packages/kokkos-kernels/cmake/KokkosKernels_config.h.in +++ b/packages/kokkos-kernels/cmake/KokkosKernels_config.h.in @@ -153,6 +153,9 @@ #endif #endif +/* Whether MKL is providing the BLAS and LAPACK implementation */ +#cmakedefine MKL_PROVIDES_BLAS_LAPACK + #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) && \ !defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ENABLE_OPENMPTARGET) #define KOKKOSKERNELS_ENABLE_HOST_ONLY @@ -171,4 +174,12 @@ #define KOKKOSKERNELS_IMPL_COMPILE_LIBRARY false #endif +/* Enabled components */ +#cmakedefine KOKKOSKERNELS_ENABLE_COMPONENT_BATCHED +#cmakedefine KOKKOSKERNELS_ENABLE_COMPONENT_BLAS +#cmakedefine KOKKOSKERNELS_ENABLE_COMPONENT_LAPACK +#cmakedefine KOKKOSKERNELS_ENABLE_COMPONENT_SPARSE +#cmakedefine KOKKOSKERNELS_ENABLE_COMPONENT_GRAPH +#cmakedefine KOKKOSKERNELS_ENABLE_COMPONENT_ODE + #endif // KOKKOSKERNELS_CONFIG_H diff --git a/packages/kokkos-kernels/cmake/Modules/FindTPLBLAS.cmake b/packages/kokkos-kernels/cmake/Modules/FindTPLBLAS.cmake index 0bc73fc73ffb..67e4cc9a08a2 100644 --- a/packages/kokkos-kernels/cmake/Modules/FindTPLBLAS.cmake +++ b/packages/kokkos-kernels/cmake/Modules/FindTPLBLAS.cmake @@ -8,4 +8,3 @@ ELSE() FIND_PACKAGE(BLAS REQUIRED) KOKKOSKERNELS_CREATE_IMPORTED_TPL(BLAS INTERFACE LINK_LIBRARIES ${BLAS_LIBRARIES}) ENDIF() - diff --git a/packages/kokkos-kernels/cmake/Modules/FindTPLLAPACK.cmake b/packages/kokkos-kernels/cmake/Modules/FindTPLLAPACK.cmake index 463f61afeb76..f6d345d5ee70 100644 --- a/packages/kokkos-kernels/cmake/Modules/FindTPLLAPACK.cmake +++ b/packages/kokkos-kernels/cmake/Modules/FindTPLLAPACK.cmake @@ -8,4 +8,3 @@ ELSE() FIND_PACKAGE(LAPACK REQUIRED) KOKKOSKERNELS_CREATE_IMPORTED_TPL(LAPACK INTERFACE LINK_LIBRARIES ${LAPACK_LIBRARIES}) ENDIF() - diff --git a/packages/kokkos-kernels/cmake/Modules/FindTPLMKL.cmake b/packages/kokkos-kernels/cmake/Modules/FindTPLMKL.cmake index 52f457197630..1ecd882e711b 100644 --- a/packages/kokkos-kernels/cmake/Modules/FindTPLMKL.cmake +++ b/packages/kokkos-kernels/cmake/Modules/FindTPLMKL.cmake @@ -74,3 +74,7 @@ ELSE() ) ENDIF() ENDIF() +# This logic to find MKL is only used in non-Trilinos builds. +# In this case, MKL can always be used as the host BLAS/LAPACK implementation +# (whether MKL_INT is 32- or 64-bit). +set (MKL_PROVIDES_BLAS_LAPACK ON INTERNAL) diff --git a/packages/kokkos-kernels/cmake/kokkoskernels_components.cmake b/packages/kokkos-kernels/cmake/kokkoskernels_components.cmake index 16a784bd1ffe..951900e9a968 100644 --- a/packages/kokkos-kernels/cmake/kokkoskernels_components.cmake +++ b/packages/kokkos-kernels/cmake/kokkoskernels_components.cmake @@ -102,4 +102,13 @@ IF ( KokkosKernels_ENABLE_COMPONENT_BATCHED ELSE() SET(KOKKOSKERNELS_ALL_COMPONENTS_ENABLED OFF CACHE BOOL "" FORCE) ENDIF() -mark_as_advanced(FORCE KOKKOSKERNELS_ALL_COMPONENTS_ENABLED) \ No newline at end of file +mark_as_advanced(FORCE KOKKOSKERNELS_ALL_COMPONENTS_ENABLED) +# Now that component enables are finalized, also set upper-case +# versions of component enables for the config.h + +SET(KOKKOSKERNELS_ENABLE_COMPONENT_BATCHED ${KokkosKernels_ENABLE_COMPONENT_BATCHED}) +SET(KOKKOSKERNELS_ENABLE_COMPONENT_BLAS ${KokkosKernels_ENABLE_COMPONENT_BLAS}) +SET(KOKKOSKERNELS_ENABLE_COMPONENT_LAPACK ${KokkosKernels_ENABLE_COMPONENT_LAPACK}) +SET(KOKKOSKERNELS_ENABLE_COMPONENT_GRAPH ${KokkosKernels_ENABLE_COMPONENT_GRAPH}) +SET(KOKKOSKERNELS_ENABLE_COMPONENT_SPARSE ${KokkosKernels_ENABLE_COMPONENT_SPARSE}) +SET(KOKKOSKERNELS_ENABLE_COMPONENT_ODE ${KokkosKernels_ENABLE_COMPONENT_ODE}) diff --git a/packages/kokkos-kernels/cmake/kokkoskernels_eti.cmake b/packages/kokkos-kernels/cmake/kokkoskernels_eti.cmake index 524cad11f98a..6c7629900d0a 100644 --- a/packages/kokkos-kernels/cmake/kokkoskernels_eti.cmake +++ b/packages/kokkos-kernels/cmake/kokkoskernels_eti.cmake @@ -131,6 +131,7 @@ MACRO(KOKKOSKERNELS_GENERATE_ETI FUNCTION_NAME SUBFOLDER) STRING(TOUPPER "${FUNCTION_NAME}" UPPER_NAME) SET(ETI_AVAIL_MACRO "KOKKOS${UPPER_NAME}_ETI_SPEC_AVAIL") + SET(ETI_DECL_MACRO "KOKKOS${UPPER_NAME}_ETI_SPEC_DECL") SET(ETI_INST_MACRO "KOKKOS${UPPER_NAME}_ETI_SPEC_INST") # if this is tied to particular components @@ -152,6 +153,7 @@ MACRO(KOKKOSKERNELS_GENERATE_ETI FUNCTION_NAME SUBFOLDER) STRING(REPLACE ",)" ")" MACRO_STRING ${MACRO_STRING}) #Make a single header file for all instances LIST(APPEND ${UPPER_NAME}_ETI_AVAIL_LIST "${ETI_AVAIL_MACRO}${MACRO_STRING}") + LIST(APPEND ${UPPER_NAME}_ETI_DECL_LIST "${ETI_DECL_MACRO}${MACRO_STRING}") #Make a different source file for each instance SET(INST_SOURCE "${ETI_COMPONENTS}/eti/generated_specializations_cpp/${SUBFOLDER}/${ETI}.cpp") SET(INST_TEMPLATE "${ETI_COMPONENTS}/eti/generated_specializations_cpp/${SUBFOLDER}/Kokkos${FUNCTION_NAME}_eti_spec_inst.cpp.in") @@ -174,4 +176,14 @@ MACRO(KOKKOSKERNELS_GENERATE_ETI FUNCTION_NAME SUBFOLDER) ${CMAKE_CURRENT_BINARY_DIR}/${AVAIL_HEADER}) LIST(APPEND ${ETI_HEADER_LIST} ${CMAKE_CURRENT_BINARY_DIR}/${AVAIL_HEADER}) + + SET(DECL_HEADER "${ETI_COMPONENTS}/eti/generated_specializations_hpp/Kokkos${FUNCTION_NAME}_eti_spec_decl.hpp") + SET(DECL_TEMPLATE "${DECL_HEADER}.in") + + STRING(REPLACE ";" "\n" ${UPPER_NAME}_ETI_DECL_BLOCK "${${UPPER_NAME}_ETI_DECL_LIST}") + + CONFIGURE_FILE(${CMAKE_CURRENT_SOURCE_DIR}/${DECL_TEMPLATE} + ${CMAKE_CURRENT_BINARY_DIR}/${DECL_HEADER}) + + LIST(APPEND ${ETI_HEADER_LIST} ${CMAKE_CURRENT_BINARY_DIR}/${DECL_HEADER}) ENDMACRO(KOKKOSKERNELS_GENERATE_ETI) diff --git a/packages/kokkos-kernels/common/CMakeLists.txt b/packages/kokkos-kernels/common/CMakeLists.txt index b06586929688..fd180f7827d7 100644 --- a/packages/kokkos-kernels/common/CMakeLists.txt +++ b/packages/kokkos-kernels/common/CMakeLists.txt @@ -2,3 +2,5 @@ LIST(APPEND KK_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/common/src) LIST(APPEND KK_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/common/impl) LIST(APPEND KK_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/common/unit_test) + +LIST(APPEND SOURCES common/src/KokkosKernels_EagerInitialize.cpp) diff --git a/packages/kokkos-kernels/common/src/KokkosKernels_EagerInitialize.cpp b/packages/kokkos-kernels/common/src/KokkosKernels_EagerInitialize.cpp new file mode 100644 index 000000000000..214de93109c3 --- /dev/null +++ b/packages/kokkos-kernels/common/src/KokkosKernels_EagerInitialize.cpp @@ -0,0 +1,78 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include "KokkosKernels_EagerInitialize.hpp" +#include "KokkosKernels_config.h" +#include "Kokkos_Core.hpp" + +// Include the minimal set of headers that declare all TPL singletons +#ifdef KOKKOSKERNELS_ENABLE_COMPONENT_BLAS +#include "KokkosBlas_tpl_spec.hpp" //cuBLAS, rocBLAS +#ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA +#include "KokkosBlas_magma.hpp" +#endif +#endif + +#ifdef KOKKOSKERNELS_ENABLE_COMPONENT_SPARSE +// note: this file declares both cuSPARSE and rocSPARSE singletons +#include "KokkosKernels_tpl_handles_decl.hpp" +#endif + +#ifdef KOKKOSKERNELS_ENABLE_COMPONENT_LAPACK +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSOLVER +#include "KokkosLapack_cusolver.hpp" +#endif +#ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA +#include "KokkosLapack_magma.hpp" +#endif +#endif + +namespace KokkosKernels { +void eager_initialize() { + if (!Kokkos::is_initialized()) { + throw std::runtime_error("Kokkos::intialize must be called before KokkosKernels::eager_initialize"); + } +#ifdef KOKKOSKERNELS_ENABLE_COMPONENT_BLAS +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS + (void)KokkosBlas::Impl::CudaBlasSingleton::singleton(); +#endif +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS + (void)KokkosBlas::Impl::RocBlasSingleton::singleton(); +#endif +#ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA + (void)KokkosBlas::Impl::MagmaSingleton::singleton(); +#endif +#endif + +#ifdef KOKKOSKERNELS_ENABLE_COMPONENT_SPARSE +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + (void)KokkosKernels::Impl::CusparseSingleton::singleton(); +#endif +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE + (void)KokkosKernels::Impl::RocsparseSingleton::singleton(); +#endif +#endif + +#ifdef KOKKOSKERNELS_ENABLE_COMPONENT_LAPACK +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSOLVER + (void)KokkosLapack::Impl::CudaLapackSingleton::singleton(); +#endif +#ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA + (void)KokkosLapack::Impl::MagmaSingleton::singleton(); +#endif +#endif +} +} // namespace KokkosKernels diff --git a/packages/kokkos-kernels/common/src/KokkosKernels_EagerInitialize.hpp b/packages/kokkos-kernels/common/src/KokkosKernels_EagerInitialize.hpp new file mode 100644 index 000000000000..83ddba74eec8 --- /dev/null +++ b/packages/kokkos-kernels/common/src/KokkosKernels_EagerInitialize.hpp @@ -0,0 +1,38 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOKERNELS_EAGER_INITIALIZE_HPP +#define KOKKOKERNELS_EAGER_INITIALIZE_HPP + +namespace KokkosKernels { +// \brief Eagerly initialize handles for all enabled TPLs, as well +// as any other globally shared resources that would otherwise be lazily initialized. +// +// Eagerly initializing a TPL means that it doesn't have to be +// lazily initialized when first calling a kernel that uses it. +// For example, \c eager_initialize() will call \c cusparseCreate() upfront +// so that the first call to \c KokkosSparse::spmv doesn't have to. +// This can add a significant amount of apparent runtime to that first kernel +// call, even though the added time isn't really spent in the kernel. +// +// Calling this before using any kernels/TPLs is optional. +// This function is idempotent (any calls after the first have no effect). +// +// \pre \c Kokkos::initialize() has been called. +void eager_initialize(); +} // namespace KokkosKernels + +#endif diff --git a/packages/kokkos-kernels/common/src/KokkosKernels_ExecSpaceUtils.hpp b/packages/kokkos-kernels/common/src/KokkosKernels_ExecSpaceUtils.hpp index 2d167f5c7333..625fa3e71002 100644 --- a/packages/kokkos-kernels/common/src/KokkosKernels_ExecSpaceUtils.hpp +++ b/packages/kokkos-kernels/common/src/KokkosKernels_ExecSpaceUtils.hpp @@ -78,29 +78,21 @@ KOKKOS_FORCEINLINE_FUNCTION ExecSpaceType kk_get_exec_space_type() { //////////////////////////////////////////////////////////////////////////////// template -constexpr KOKKOS_INLINE_FUNCTION bool kk_is_gpu_exec_space() { - return false; -} +constexpr inline bool is_gpu_exec_space_v = false; #ifdef KOKKOS_ENABLE_CUDA template <> -constexpr KOKKOS_INLINE_FUNCTION bool kk_is_gpu_exec_space() { - return true; -} +constexpr inline bool is_gpu_exec_space_v = true; #endif #ifdef KOKKOS_ENABLE_HIP template <> -constexpr KOKKOS_INLINE_FUNCTION bool kk_is_gpu_exec_space() { - return true; -} +constexpr inline bool is_gpu_exec_space_v = true; #endif #ifdef KOKKOS_ENABLE_SYCL template <> -constexpr KOKKOS_INLINE_FUNCTION bool kk_is_gpu_exec_space() { - return true; -} +constexpr inline bool is_gpu_exec_space_v = true; #endif //////////////////////////////////////////////////////////////////////////////// diff --git a/packages/kokkos-kernels/common/src/KokkosKernels_HashmapAccumulator.hpp b/packages/kokkos-kernels/common/src/KokkosKernels_HashmapAccumulator.hpp index c57dfa83fd53..c5ea080e05f8 100644 --- a/packages/kokkos-kernels/common/src/KokkosKernels_HashmapAccumulator.hpp +++ b/packages/kokkos-kernels/common/src/KokkosKernels_HashmapAccumulator.hpp @@ -516,7 +516,7 @@ struct HashmapAccumulator { Kokkos::atomic_add(values + hash, value); return __insert_success; } else if (keys[hash] == -1) { - if (Kokkos::atomic_compare_exchange_strong(keys + hash, -1, key)) { + if (-1 == Kokkos::atomic_compare_exchange(keys + hash, -1, key)) { // should only be here if we used a new hash used_hashes[Kokkos::atomic_fetch_add(used_hash_size, size_type(1))] = hash; Kokkos::atomic_add(values + hash, value); diff --git a/packages/kokkos-kernels/common/src/KokkosKernels_Sorting.hpp b/packages/kokkos-kernels/common/src/KokkosKernels_Sorting.hpp index f91f11c16498..c8e7ee477123 100644 --- a/packages/kokkos-kernels/common/src/KokkosKernels_Sorting.hpp +++ b/packages/kokkos-kernels/common/src/KokkosKernels_Sorting.hpp @@ -17,8 +17,9 @@ #define _KOKKOSKERNELS_SORTING_HPP #include "Kokkos_Core.hpp" +#include "Kokkos_Sort.hpp" #include "KokkosKernels_SimpleUtils.hpp" //for kk_exclusive_parallel_prefix_sum -#include "KokkosKernels_ExecSpaceUtils.hpp" //for kk_is_gpu_exec_space +#include "KokkosKernels_ExecSpaceUtils.hpp" //for is_gpu_exec_space #include namespace KokkosKernels { @@ -59,30 +60,13 @@ KOKKOS_INLINE_FUNCTION void SerialRadixSort2(ValueType* values, ValueType* value // Team-level parallel sorting (callable inside any TeamPolicy kernel) // ------------------------------------------------------------------- -// Comparison based sorting that uses the entire team (described by mem) to sort -// raw array according to the comparator. -template > -KOKKOS_INLINE_FUNCTION void TeamBitonicSort(ValueType* values, Ordinal n, const TeamMember mem, - const Comparator& comp = Comparator()); - -// Same as SerialRadixSort, but also permutes perm[0...n] as it sorts -// values[0...n]. -template > -KOKKOS_INLINE_FUNCTION void TeamBitonicSort2(ValueType* values, PermType* perm, Ordinal n, const TeamMember mem, - const Comparator& comp = Comparator()); - namespace Impl { // Functor that sorts a view on one team template struct BitonicSingleTeamFunctor { BitonicSingleTeamFunctor(View& v_, const Comparator& comp_) : v(v_), comp(comp_) {} - KOKKOS_INLINE_FUNCTION void operator()(const TeamMember t) const { - KokkosKernels::TeamBitonicSort(v.data(), v.extent(0), t, - comp); - }; + KOKKOS_INLINE_FUNCTION void operator()(const TeamMember t) const { Kokkos::Experimental::sort_team(t, v, comp); }; View v; Comparator comp; }; @@ -97,8 +81,7 @@ struct BitonicChunkFunctor { Ordinal chunkStart = chunk * chunkSize; Ordinal n = chunkSize; if (chunkStart + n > Ordinal(v.extent(0))) n = v.extent(0) - chunkStart; - KokkosKernels::TeamBitonicSort(v.data() + chunkStart, n, - t, comp); + Kokkos::Experimental::sort_team(t, Kokkos::subview(v, Kokkos::make_pair(chunkStart, chunkStart + n)), comp); }; View v; Comparator comp; @@ -217,10 +200,11 @@ void bitonicSort(View v, const Comparator& comp) { Ordinal npot = 1; while (npot < n) npot <<= 1; // Partition the data equally among fixed number of teams - Ordinal chunkSize = 512; - Ordinal numTeams = npot / chunkSize; + Ordinal chunkSize = 512; + Ordinal numTeamsChunkSort = (n + chunkSize - 1) / chunkSize; + Ordinal numTeams = npot / chunkSize; // First, sort within teams - Kokkos::parallel_for(team_policy(numTeams, Kokkos::AUTO()), + Kokkos::parallel_for(team_policy(numTeamsChunkSort, Kokkos::AUTO()), Impl::BitonicChunkFunctor(v, comp, chunkSize)); for (int teamsPerBox = 2; teamsPerBox <= npot / chunkSize; teamsPerBox *= 2) { Ordinal boxSize = teamsPerBox * chunkSize; @@ -388,165 +372,23 @@ KOKKOS_INLINE_FUNCTION void SerialRadixSort2(ValueType* values, ValueType* value // trivially-copyable) Pros: In-place, plenty of parallelism for GPUs, and // memory references are coalesced Con: O(n log^2(n)) serial time is bad on CPUs // Good diagram of the algorithm at https://en.wikipedia.org/wiki/Bitonic_sorter -template -KOKKOS_INLINE_FUNCTION void TeamBitonicSort(ValueType* values, Ordinal n, const TeamMember mem, - const Comparator& comp) { - // Algorithm only works on power-of-two input size only. - // If n is not a power-of-two, will implicitly pretend - // that values[i] for i >= n is just the max for ValueType, so it never gets - // swapped - Ordinal npot = 1; - Ordinal levels = 0; - while (npot < n) { - levels++; - npot <<= 1; - } - for (Ordinal i = 0; i < levels; i++) { - for (Ordinal j = 0; j <= i; j++) { - // n/2 pairs of items are compared in parallel - Kokkos::parallel_for(Kokkos::TeamVectorRange(mem, npot / 2), [=](const Ordinal t) { - // How big are the brown/pink boxes? - Ordinal boxSize = Ordinal(2) << (i - j); - // Which box contains this thread? - Ordinal boxID = t >> (i - j); // t * 2 / boxSize; - Ordinal boxStart = boxID << (1 + i - j); // boxID * boxSize - Ordinal boxOffset = t - (boxStart >> 1); // t - boxID * boxSize / - // 2; - Ordinal elem1 = boxStart + boxOffset; - if (j == 0) { - // first phase (brown box): within a block, compare with the - // opposite value in the box - Ordinal elem2 = boxStart + boxSize - 1 - boxOffset; - if (elem2 < n) { - // both elements in bounds, so compare them and swap if out of - // order - if (comp(values[elem2], values[elem1])) { - ValueType temp = values[elem1]; - values[elem1] = values[elem2]; - values[elem2] = temp; - } - } - } else { - // later phases (pink box): within a block, compare with fixed - // distance (boxSize / 2) apart - Ordinal elem2 = elem1 + boxSize / 2; - if (elem2 < n) { - if (comp(values[elem2], values[elem1])) { - ValueType temp = values[elem1]; - values[elem1] = values[elem2]; - values[elem2] = temp; - } - } - } - }); - mem.team_barrier(); - } - } -} - -// Sort "values", while applying the same swaps to "perm" -template -KOKKOS_INLINE_FUNCTION void TeamBitonicSort2(ValueType* values, PermType* perm, Ordinal n, const TeamMember mem, - const Comparator& comp) { - // Algorithm only works on power-of-two input size only. - // If n is not a power-of-two, will implicitly pretend - // that values[i] for i >= n is just the max for ValueType, so it never gets - // swapped - Ordinal npot = 1; - Ordinal levels = 0; - while (npot < n) { - levels++; - npot <<= 1; - } - for (Ordinal i = 0; i < levels; i++) { - for (Ordinal j = 0; j <= i; j++) { - // n/2 pairs of items are compared in parallel - Kokkos::parallel_for(Kokkos::TeamVectorRange(mem, npot / 2), [=](const Ordinal t) { - // How big are the brown/pink boxes? - Ordinal boxSize = Ordinal(2) << (i - j); - // Which box contains this thread? - Ordinal boxID = t >> (i - j); // t * 2 / boxSize; - Ordinal boxStart = boxID << (1 + i - j); // boxID * boxSize - Ordinal boxOffset = t - (boxStart >> 1); // t - boxID * boxSize / - // 2; - Ordinal elem1 = boxStart + boxOffset; - if (j == 0) { - // first phase (brown box): within a block, compare with the - // opposite value in the box - Ordinal elem2 = boxStart + boxSize - 1 - boxOffset; - if (elem2 < n) { - // both elements in bounds, so compare them and swap if out of - // order - if (comp(values[elem2], values[elem1])) { - ValueType temp1 = values[elem1]; - values[elem1] = values[elem2]; - values[elem2] = temp1; - PermType temp2 = perm[elem1]; - perm[elem1] = perm[elem2]; - perm[elem2] = temp2; - } - } - } else { - // later phases (pink box): within a block, compare with fixed - // distance (boxSize / 2) apart - Ordinal elem2 = elem1 + boxSize / 2; - if (elem2 < n) { - if (comp(values[elem2], values[elem1])) { - ValueType temp1 = values[elem1]; - values[elem1] = values[elem2]; - values[elem2] = temp1; - PermType temp2 = perm[elem1]; - perm[elem1] = perm[elem2]; - perm[elem2] = temp2; - } - } - } - }); - mem.team_barrier(); - } - } -} - -// For backward compatibility: keep the public interface accessible in -// KokkosKernels::Impl:: -namespace Impl { - -template > -[[deprecated]] void bitonicSort(View v, const Comparator& comp = Comparator()) { - KokkosKernels::bitonicSort(v, comp); -} - -template -[[deprecated]] KOKKOS_INLINE_FUNCTION void SerialRadixSort(ValueType* values, ValueType* valuesAux, Ordinal n) { - KokkosKernels::SerialRadixSort(values, valuesAux, n); -} - -// Same as SerialRadixSort, but also permutes perm[0...n] as it sorts -// values[0...n]. -template -[[deprecated]] KOKKOS_INLINE_FUNCTION void SerialRadixSort2(ValueType* values, ValueType* valuesAux, PermType* perm, - PermType* permAux, Ordinal n) { - KokkosKernels::SerialRadixSort2(values, valuesAux, perm, permAux, n); -} - template > -[[deprecated]] KOKKOS_INLINE_FUNCTION void TeamBitonicSort(ValueType* values, Ordinal n, const TeamMember mem, - const Comparator& comp = Comparator()) { - KokkosKernels::TeamBitonicSort(values, n, mem, comp); +[[deprecated("Use Kokkos::Experimental::sort_team instead")]] KOKKOS_INLINE_FUNCTION void TeamBitonicSort( + ValueType* values, Ordinal n, const TeamMember mem, const Comparator& comp = Comparator()) { + Kokkos::View valuesView(values, n); + Kokkos::Experimental::sort_team(mem, valuesView, comp); } -// Same as SerialRadixSort, but also permutes perm[0...n] as it sorts -// values[0...n]. +// Sort "values", while applying the same swaps to "perm" template > -[[deprecated]] KOKKOS_INLINE_FUNCTION void TeamBitonicSort2(ValueType* values, PermType* perm, Ordinal n, - const TeamMember mem, - const Comparator& comp = Comparator()) { - KokkosKernels::TeamBitonicSort2(values, perm, n, mem, comp); +[[deprecated("Use Kokkos::Experimental::sort_by_key_team instead")]] KOKKOS_INLINE_FUNCTION void TeamBitonicSort2( + ValueType* values, PermType* perm, Ordinal n, const TeamMember mem, const Comparator& comp = Comparator()) { + Kokkos::View valuesView(values, n); + Kokkos::View permView(perm, n); + Kokkos::Experimental::sort_by_key_team(mem, valuesView, permView, comp); } -} // namespace Impl } // namespace KokkosKernels diff --git a/packages/kokkos-kernels/common/src/KokkosKernels_Uniform_Initialized_MemoryPool.hpp b/packages/kokkos-kernels/common/src/KokkosKernels_Uniform_Initialized_MemoryPool.hpp index aa477815d617..caf8e6b3070e 100644 --- a/packages/kokkos-kernels/common/src/KokkosKernels_Uniform_Initialized_MemoryPool.hpp +++ b/packages/kokkos-kernels/common/src/KokkosKernels_Uniform_Initialized_MemoryPool.hpp @@ -294,7 +294,7 @@ class UniformMemoryPool { data_type *get_arbitrary_free_chunk(const size_t &thread_index, const size_t max_tries) const { size_t chunk_index = thread_index & modular_num_chunks; size_t num_try = 0; - while (!Kokkos::atomic_compare_exchange_strong(pchunk_locks + chunk_index, 0, 1)) { + while (0 != Kokkos::atomic_compare_exchange(pchunk_locks + chunk_index, 0, 1)) { chunk_index = (chunk_index + 1) & modular_num_chunks; ++num_try; if (num_try > max_tries) { diff --git a/packages/kokkos-kernels/common/src/KokkosKernels_Utils.hpp b/packages/kokkos-kernels/common/src/KokkosKernels_Utils.hpp index f0add80c50ed..f24b6362ebb5 100644 --- a/packages/kokkos-kernels/common/src/KokkosKernels_Utils.hpp +++ b/packages/kokkos-kernels/common/src/KokkosKernels_Utils.hpp @@ -58,7 +58,7 @@ void get_suggested_vector_size(int &suggested_vector_size_, idx nr, idx nnz) { template int get_suggested_team_size(Functor &f, int vector_size) { using execution_space = typename team_policy_t::traits::execution_space; - if (kk_is_gpu_exec_space()) { + if (is_gpu_exec_space_v) { team_policy_t temp(1, 1, vector_size); return temp.team_size_recommended(f, ParallelTag()); } else @@ -68,7 +68,7 @@ int get_suggested_team_size(Functor &f, int vector_size) { template int get_suggested_team_size(Functor &f, int vector_size, size_t sharedPerTeam, size_t sharedPerThread) { using execution_space = typename team_policy_t::traits::execution_space; - if (kk_is_gpu_exec_space()) { + if (is_gpu_exec_space_v) { team_policy_t temp = team_policy_t(1, 1, vector_size) .set_scratch_size(0, Kokkos::PerTeam(sharedPerTeam), Kokkos::PerThread(sharedPerThread)); return temp.team_size_recommended(f, ParallelTag()); diff --git a/packages/kokkos-kernels/common/src/KokkosKernels_default_types.hpp b/packages/kokkos-kernels/common/src/KokkosKernels_default_types.hpp index 1da965a0829c..0a55be6d0b43 100644 --- a/packages/kokkos-kernels/common/src/KokkosKernels_default_types.hpp +++ b/packages/kokkos-kernels/common/src/KokkosKernels_default_types.hpp @@ -20,57 +20,77 @@ #include "Kokkos_Core.hpp" //for LayoutLeft/LayoutRight #include //for all the ETI #cmakedefine macros +// define a deprecated symbol = type in the global namespace +// and a non-deprecated version in Kokkos Kernels +// these deprecations were done in 4.4. +// Intel 19 doesn't seem to like deprecating a type alias +#if defined(KOKKOS_COMPILER_INTEL) && (KOKKOS_COMPILER_INTEL < 2000) +#define KK_IMPL_MAKE_TYPE_ALIAS(symbol, type) \ + using symbol = type; \ + namespace KokkosKernels { \ + using symbol = type; \ + } +#else +#define KK_IMPL_MAKE_TYPE_ALIAS(symbol, type) \ + using symbol [[deprecated("use KokkosKernels::" #symbol ".")]] = type; \ + namespace KokkosKernels { \ + using symbol = type; \ + } +#endif + #if defined(KOKKOSKERNELS_INST_ORDINAL_INT) -using default_lno_t = int; +KK_IMPL_MAKE_TYPE_ALIAS(default_lno_t, int) #elif defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) -using default_lno_t = int64_t; +KK_IMPL_MAKE_TYPE_ALIAS(default_lno_t, int64_t) #else // Non-ETI build: default to int -using default_lno_t = int; +KK_IMPL_MAKE_TYPE_ALIAS(default_lno_t, int) #endif // Prefer int as the default offset type, because cuSPARSE doesn't support // size_t for rowptrs. #if defined(KOKKOSKERNELS_INST_OFFSET_INT) -using default_size_type = int; +KK_IMPL_MAKE_TYPE_ALIAS(default_size_type, int) #elif defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T) -using default_size_type = size_t; +KK_IMPL_MAKE_TYPE_ALIAS(default_size_type, size_t) #else // Non-ETI build: default to int -using default_size_type = int; +KK_IMPL_MAKE_TYPE_ALIAS(default_size_type, int) #endif #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) -using default_layout = Kokkos::LayoutLeft; +KK_IMPL_MAKE_TYPE_ALIAS(default_layout, Kokkos::LayoutLeft) #elif defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) -using default_layout = Kokkos::LayoutRight; +KK_IMPL_MAKE_TYPE_ALIAS(default_layout, Kokkos::LayoutRight) #else -using default_layout = Kokkos::LayoutLeft; +KK_IMPL_MAKE_TYPE_ALIAS(default_layout, Kokkos::LayoutLeft) #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) -using default_scalar = double; +KK_IMPL_MAKE_TYPE_ALIAS(default_scalar, double) #elif defined(KOKKOSKERNELS_INST_FLOAT) -using default_scalar = float; +KK_IMPL_MAKE_TYPE_ALIAS(default_scalar, float) #elif defined(KOKKOSKERNELS_INST_HALF) -using default_scalar = Kokkos::Experimental::half_t; +KK_IMPL_MAKE_TYPE_ALIAS(default_scalar, Kokkos::Experimental::half_t) #elif defined(KOKKOSKERNELS_INST_BHALF) -using default_scalar = Kokkos::Experimental::bhalf_t; +KK_IMPL_MAKE_TYPE_ALIAS(default_scalar, Kokkos::Experimental::bhalf_t) #else -using default_scalar = double; +KK_IMPL_MAKE_TYPE_ALIAS(default_scalar, double) #endif #if defined(KOKKOS_ENABLE_CUDA) -using default_device = Kokkos::Cuda; +KK_IMPL_MAKE_TYPE_ALIAS(default_device, Kokkos::Cuda) #elif defined(KOKKOS_ENABLE_HIP) -using default_device = Kokkos::HIP; +KK_IMPL_MAKE_TYPE_ALIAS(default_device, Kokkos::HIP) #elif defined(KOKKOS_ENABLE_OPENMPTARGET) -using default_device = Kokkos::Experimental::OpenMPTarget; +KK_IMPL_MAKE_TYPE_ALIAS(default_device, Kokkos::Experimental::OpenMPTarget) #elif defined(KOKKOS_ENABLE_OPENMP) -using default_device = Kokkos::OpenMP; +KK_IMPL_MAKE_TYPE_ALIAS(default_device, Kokkos::OpenMP) #elif defined(KOKKOS_ENABLE_THREADS) -using default_device = Kokkos::Threads; +KK_IMPL_MAKE_TYPE_ALIAS(default_device, Kokkos::Threads) #else -using default_device = Kokkos::Serial; +KK_IMPL_MAKE_TYPE_ALIAS(default_device, Kokkos::Serial) #endif +#undef KK_IMPL_MAKE_TYPE_ALIAS + #endif // KOKKOSKERNELS_DEFAULT_TYPES_H diff --git a/packages/kokkos-kernels/common/src/Kokkos_ArithTraits.hpp b/packages/kokkos-kernels/common/src/Kokkos_ArithTraits.hpp index 25089613d4ba..f2d84edf817e 100644 --- a/packages/kokkos-kernels/common/src/Kokkos_ArithTraits.hpp +++ b/packages/kokkos-kernels/common/src/Kokkos_ArithTraits.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef KOKKOS_ARITHTRAITS_HPP -#define KOKKOS_ARITHTRAITS_HPP +#ifndef KOKKOSKERNELS_KOKKOS_ARITHTRAITS_HPP +#define KOKKOSKERNELS_KOKKOS_ARITHTRAITS_HPP /// \file Kokkos_ArithTraits.hpp /// \brief Declaration and definition of Kokkos::ArithTraits @@ -1641,4 +1641,4 @@ using ArithTraits [[deprecated("Use Kokkos::ArithTraits instead")]] = ::Kokkos:: } // namespace Details } // namespace Kokkos -#endif // KOKKOS_ARITHTRAITS_HPP +#endif // KOKKOSKERNELS_KOKKOS_ARITHTRAITS_HPP diff --git a/packages/kokkos-kernels/common/src/Kokkos_InnerProductSpaceTraits.hpp b/packages/kokkos-kernels/common/src/Kokkos_InnerProductSpaceTraits.hpp index 25337c925f13..d190009168dd 100644 --- a/packages/kokkos-kernels/common/src/Kokkos_InnerProductSpaceTraits.hpp +++ b/packages/kokkos-kernels/common/src/Kokkos_InnerProductSpaceTraits.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef KOKKOS_INNERPRODUCTSPACETRAITS_HPP -#define KOKKOS_INNERPRODUCTSPACETRAITS_HPP +#ifndef KOKKOSKERNELS_KOKKOS_INNERPRODUCTSPACETRAITS_HPP +#define KOKKOSKERNELS_KOKKOS_INNERPRODUCTSPACETRAITS_HPP /// \file Kokkos_InnerProductSpaceTraits.hpp /// \brief Declaration and definition of @@ -238,13 +238,12 @@ KOKKOS_INLINE_FUNCTION void updateDot(ResultType& sum, const InputType1& x, cons KOKKOS_INLINE_FUNCTION void updateDot(double& sum, const double x, const double y) { sum += x * y; } -KOKKOS_INLINE_FUNCTION void updateDot(double& sum, const float x, const float y) { sum += x * y; } +KOKKOS_INLINE_FUNCTION void updateDot(double& sum, const float x, const float y) { sum += static_cast(x) * y; } // This exists because complex += complex is not defined. KOKKOS_INLINE_FUNCTION void updateDot(Kokkos::complex& sum, const Kokkos::complex x, const Kokkos::complex y) { - const auto tmp = Kokkos::conj(x) * y; - sum += Kokkos::complex(tmp.real(), tmp.imag()); + sum += Kokkos::conj(Kokkos::complex(x)) * Kokkos::complex(y); } // This exists in case people call the overload of KokkosBlas::dot @@ -272,4 +271,4 @@ struct CastPossiblyComplex, Kokkos::complex> { } // namespace Details } // namespace Kokkos -#endif // KOKKOS_INNERPRODUCTSPACETRAITS_HPP +#endif // KOKKOSKERNELS_KOKKOS_INNERPRODUCTSPACETRAITS_HPP diff --git a/packages/kokkos-kernels/common/unit_test/CMakeLists.txt b/packages/kokkos-kernels/common/unit_test/CMakeLists.txt index c0d8fc116f38..c963e908e54d 100644 --- a/packages/kokkos-kernels/common/unit_test/CMakeLists.txt +++ b/packages/kokkos-kernels/common/unit_test/CMakeLists.txt @@ -95,3 +95,10 @@ IF (KOKKOS_ENABLE_THREADS) ) ENDIF () +# Add eager_initialize test, which is not backend-specific +KOKKOSKERNELS_ADD_UNIT_TEST( + common_eager_initialize + SOURCES Test_Common_EagerInitialize.cpp + COMPONENTS common +) + diff --git a/packages/kokkos-kernels/common/unit_test/Test_Common_ArithTraits.hpp b/packages/kokkos-kernels/common/unit_test/Test_Common_ArithTraits.hpp index 73a4ebfefee3..b1fe860f8a46 100644 --- a/packages/kokkos-kernels/common/unit_test/Test_Common_ArithTraits.hpp +++ b/packages/kokkos-kernels/common/unit_test/Test_Common_ArithTraits.hpp @@ -26,8 +26,8 @@ /// use Kokkos::ArithTraits, so it may be useful for users to /// read it. -#ifndef KOKKOS_ARITHTRAITSTEST_HPP -#define KOKKOS_ARITHTRAITSTEST_HPP +#ifndef KOKKOSKERNELS_TEST_COMMON_ARITHTRAITSTEST_HPP +#define KOKKOSKERNELS_TEST_COMMON_ARITHTRAITSTEST_HPP #include #include "Kokkos_ArithTraits.hpp" @@ -1693,4 +1693,4 @@ void test_ArithTraits() { } TEST_F(TestCategory, common_ArithTraits) { test_ArithTraits(); } -#endif // KOKKOS_ARITHTRAITSTEST_HPP +#endif // KOKKOSKERNELS_TEST_COMMON_ARITHTRAITSTEST_HPP diff --git a/packages/kokkos-kernels/common/unit_test/Test_Common_EagerInitialize.cpp b/packages/kokkos-kernels/common/unit_test/Test_Common_EagerInitialize.cpp new file mode 100644 index 000000000000..fc495e78fcae --- /dev/null +++ b/packages/kokkos-kernels/common/unit_test/Test_Common_EagerInitialize.cpp @@ -0,0 +1,113 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KK_EAGERINIT_TEST_HPP +#define KK_EAGERINIT_TEST_HPP + +#include +#include "Kokkos_Core.hpp" +#include "KokkosKernels_config.h" +#include "KokkosKernels_EagerInitialize.hpp" + +#ifdef KOKKOSKERNELS_ENABLE_COMPONENT_BLAS +#include "KokkosBlas_tpl_spec.hpp" //cuBLAS, rocBLAS +#ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA +#include "KokkosBlas_magma.hpp" +#endif +#endif + +#ifdef KOKKOSKERNELS_ENABLE_COMPONENT_SPARSE +// note: this file declares both cuSPARSE and rocSPARSE singletons +#include "KokkosKernels_tpl_handles_decl.hpp" +#endif + +#ifdef KOKKOSKERNELS_ENABLE_COMPONENT_LAPACK +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSOLVER +#include "KokkosLapack_cusolver.hpp" +#endif +#ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA +#include "KokkosLapack_magma.hpp" +#endif +#endif + +// Count the number of singletons which are currently initialized, +// and the numInitialized number of singleton classes that are currently enabled +// (based on which TPLs and components were enabled at configure-time) +void countSingletons(int& numInitialized, int& numEnabled) { + numInitialized = 0; + numEnabled = 0; +#ifdef KOKKOSKERNELS_ENABLE_COMPONENT_BLAS +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS + numEnabled++; + if (KokkosBlas::Impl::CudaBlasSingleton::is_initialized()) numInitialized++; +#endif +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS + numEnabled++; + if (KokkosBlas::Impl::RocBlasSingleton::is_initialized()) numInitialized++; +#endif +#ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA + numEnabled++; + if (KokkosBlas::Impl::MagmaSingleton::is_initialized()) numInitialized++; +#endif +#endif + +#ifdef KOKKOSKERNELS_ENABLE_COMPONENT_SPARSE +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + numEnabled++; + if (KokkosKernels::Impl::CusparseSingleton::is_initialized()) numInitialized++; +#endif +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE + numEnabled++; + if (KokkosKernels::Impl::RocsparseSingleton::is_initialized()) numInitialized++; +#endif +#endif + +#ifdef KOKKOSKERNELS_ENABLE_COMPONENT_LAPACK +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSOLVER + numEnabled++; + if (KokkosLapack::Impl::CudaLapackSingleton::is_initialized()) numInitialized++; +#endif +#ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA + numEnabled++; + if (KokkosLapack::Impl::MagmaSingleton::is_initialized()) numInitialized++; +#endif +#endif +} + +int main() { + int numInitialized, numEnabled; + Kokkos::initialize(); + { + // Check that no singletons are already initialized. + countSingletons(numInitialized, numEnabled); + if (numInitialized != 0) + throw std::runtime_error("At least one singleton was initialized before it should have been"); + KokkosKernels::eager_initialize(); + // Check that all singletons are now initialized. + countSingletons(numInitialized, numEnabled); + std::cout << "Kokkos::eager_initialize() set up " << numInitialized << " of " << numEnabled << " TPL singletons.\n"; + if (numInitialized != numEnabled) + throw std::runtime_error("At least one singleton was not initialized by eager_initialize()"); + } + Kokkos::finalize(); + // Finally, make sure that all singletons were finalized during Kokkos::finalize(). + countSingletons(numInitialized, numEnabled); + if (numInitialized != 0) + throw std::runtime_error("At least one singleton was not correctly finalized by Kokkos::finalize()"); + return 0; +} + +#endif diff --git a/packages/kokkos-kernels/common/unit_test/Test_Common_IOUtils.hpp b/packages/kokkos-kernels/common/unit_test/Test_Common_IOUtils.hpp index 121930442130..db59a0ee6999 100644 --- a/packages/kokkos-kernels/common/unit_test/Test_Common_IOUtils.hpp +++ b/packages/kokkos-kernels/common/unit_test/Test_Common_IOUtils.hpp @@ -42,7 +42,7 @@ class ViewPrintHelper { template void testPrintView() { - using scalar_t = default_scalar; + using scalar_t = KokkosKernels::default_scalar; using Unmanaged = Kokkos::MemoryTraits; using rank0_view = Kokkos::View; using rank1_view = Kokkos::View; diff --git a/packages/kokkos-kernels/common/unit_test/Test_Common_LowerBound.hpp b/packages/kokkos-kernels/common/unit_test/Test_Common_LowerBound.hpp index d471801a3078..54ce1c2e0030 100644 --- a/packages/kokkos-kernels/common/unit_test/Test_Common_LowerBound.hpp +++ b/packages/kokkos-kernels/common/unit_test/Test_Common_LowerBound.hpp @@ -117,7 +117,7 @@ void test_lower_bound_team(const std::vector &_haystack, const T _needle) { // test lower_bound search const int leagueSize = 1; - const int teamSize = KokkosKernels::Impl::kk_is_gpu_exec_space() ? 64 : 1; + const int teamSize = KokkosKernels::Impl::is_gpu_exec_space_v ? 64 : 1; int errCount; Kokkos::parallel_reduce(Policy(leagueSize, teamSize), TeamLowerBoundFunctor(expected, haystack, _needle), errCount); diff --git a/packages/kokkos-kernels/common/unit_test/Test_Common_Sorting.hpp b/packages/kokkos-kernels/common/unit_test/Test_Common_Sorting.hpp index 30623a8691e9..e4e62e59368e 100644 --- a/packages/kokkos-kernels/common/unit_test/Test_Common_Sorting.hpp +++ b/packages/kokkos-kernels/common/unit_test/Test_Common_Sorting.hpp @@ -248,125 +248,6 @@ void testSerialRadixSort2(size_t k, size_t subArraySize) { } } -template -struct TestTeamBitonicFunctor { - typedef typename ValView::value_type Value; - - TestTeamBitonicFunctor(ValView& values_, OrdView& counts_, OrdView& offsets_) - : values(values_), counts(counts_), offsets(offsets_) {} - - template - KOKKOS_INLINE_FUNCTION void operator()(const TeamMem t) const { - int i = t.league_rank(); - KokkosKernels::TeamBitonicSort(values.data() + offsets(i), counts(i), t); - } - - ValView values; - OrdView counts; - OrdView offsets; -}; - -template -struct TestTeamBitonic2Functor { - typedef typename KeyView::value_type Key; - typedef typename ValView::value_type Value; - - TestTeamBitonic2Functor(KeyView& keys_, ValView& values_, OrdView& counts_, OrdView& offsets_) - : keys(keys_), values(values_), counts(counts_), offsets(offsets_) {} - - template - KOKKOS_INLINE_FUNCTION void operator()(const TeamMem t) const { - int i = t.league_rank(); - KokkosKernels::TeamBitonicSort2(keys.data() + offsets(i), values.data() + offsets(i), - counts(i), t); - } - - KeyView keys; - ValView values; - OrdView counts; - OrdView offsets; -}; - -template -void testTeamBitonicSort(size_t k, size_t subArraySize) { - // Create a view of randomized data - typedef typename Device::execution_space exec_space; - typedef typename Device::memory_space mem_space; - typedef Kokkos::View OrdView; - typedef Kokkos::View ValView; - OrdView counts("Subarray Sizes", k); - OrdView offsets("Subarray Offsets", k); - // Generate k sub-array sizes, each with size about 20 - size_t n = generateRandomOffsets(counts, offsets, k, subArraySize); - ValView data("Bitonic sort testing data", n); - fillRandom(data); - Kokkos::View gold("Host sorted", n); - Kokkos::deep_copy(gold, data); - // Run the sorting on device in all sub-arrays in parallel - Kokkos::parallel_for(Kokkos::TeamPolicy(k, Kokkos::AUTO()), - TestTeamBitonicFunctor(data, counts, offsets)); - // Copy result to host - auto dataHost = Kokkos::create_mirror_view(data); - Kokkos::deep_copy(dataHost, data); - // Sort using std::sort on host to do correctness test - exec_space().fence(); - auto countsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), counts); - auto offsetsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offsets); - for (size_t i = 0; i < k; i++) { - Scalar* begin = gold.data() + offsetsHost(i); - Scalar* end = begin + countsHost(i); - std::sort(begin, end); - } - for (size_t i = 0; i < n; i++) { - ASSERT_EQ(dataHost(i), gold(i)); - } -} - -template -void testTeamBitonicSort2(size_t k, size_t subArraySize) { - // Create a view of randomized data - typedef typename Device::execution_space exec_space; - typedef typename Device::memory_space mem_space; - typedef Kokkos::View OrdView; - typedef Kokkos::View KeyView; - typedef Kokkos::View ValView; - OrdView counts("Subarray Sizes", k); - OrdView offsets("Subarray Offsets", k); - // Generate k sub-array sizes, each with size about 20 - size_t n = generateRandomOffsets(counts, offsets, k, subArraySize); - KeyView keys("Bitonic test keys", n); - ValView data("Bitonic test data", n); - // The keys are randomized - fillRandom(keys, data); - Kokkos::View gold("Host sorted", n); - Kokkos::deep_copy(gold, keys); - // Run the sorting on device in all sub-arrays in parallel, just using vector - // loops Deliberately using a weird number for vector length - Kokkos::parallel_for(Kokkos::TeamPolicy(k, Kokkos::AUTO()), - TestTeamBitonic2Functor(keys, data, counts, offsets)); - exec_space().fence(); - auto countsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), counts); - auto offsetsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offsets); - // Sort using std::sort on host to do correctness test - for (size_t i = 0; i < k; i++) { - Key* begin = gold.data() + offsetsHost(i); - Key* end = begin + countsHost(i); - std::sort(begin, end); - } - // Copy results to host - auto keysHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), keys); - auto dataHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), data); - // Make sure keys are sorted exactly (stability of sort doesn't matter) - for (size_t i = 0; i < n; i++) { - ASSERT_EQ(keysHost(i), gold(i)); - } - // Make sure the hashes of each key still matches the corresponding value - for (size_t i = 0; i < n; i++) { - auto correctHash = kvHash()(keysHost(i)); - ASSERT_EQ(dataHost(i), correctHash); - } -} - template struct CheckSortedFunctor { CheckSortedFunctor(View& v_) : v(v_) {} @@ -480,27 +361,6 @@ TEST_F(TestCategory, common_serial_radix2) { } } -TEST_F(TestCategory, common_team_bitonic) { - // Test team-level bitonic over some contiguous medium arrays - // 1st arg is #arrays, 2nd arg is max subarray size - size_t numArrays = 20; - for (size_t arrayMax = 0; arrayMax < 10000; arrayMax = 1 + 4 * arrayMax) { - testTeamBitonicSort(numArrays, arrayMax); - testTeamBitonicSort(numArrays, arrayMax); - } -} - -TEST_F(TestCategory, common_team_bitonic2) { - // Test team-level bitonic over some contiguous medium arrays - // 1st arg is #arrays, 2nd arg is max subarray size - size_t numArrays = 20; - for (size_t arrayMax = 0; arrayMax < 10000; arrayMax = 1 + 4 * arrayMax) { - testTeamBitonicSort2(numArrays, arrayMax); - testTeamBitonicSort2(numArrays, arrayMax); - testTeamBitonicSort2>(numArrays, arrayMax); - } -} - TEST_F(TestCategory, common_device_bitonic) { // Test device-level bitonic with some larger arrays testBitonicSort(243743); diff --git a/packages/kokkos-kernels/common/unit_test/Test_Common_UpperBound.hpp b/packages/kokkos-kernels/common/unit_test/Test_Common_UpperBound.hpp index abd4cf655ad4..ad84c8272d89 100644 --- a/packages/kokkos-kernels/common/unit_test/Test_Common_UpperBound.hpp +++ b/packages/kokkos-kernels/common/unit_test/Test_Common_UpperBound.hpp @@ -117,7 +117,7 @@ void test_upper_bound_team(const std::vector &_haystack, const T _needle) { // test upper_bound search const int leagueSize = 1; - const int teamSize = KokkosKernels::Impl::kk_is_gpu_exec_space() ? 64 : 1; + const int teamSize = KokkosKernels::Impl::is_gpu_exec_space_v ? 64 : 1; int errCount; Kokkos::parallel_reduce(Policy(leagueSize, teamSize), TeamUpperBoundFunctor(expected, haystack, _needle), errCount); diff --git a/packages/kokkos-kernels/common/unit_test/Test_Common_float128.hpp b/packages/kokkos-kernels/common/unit_test/Test_Common_float128.hpp index 063fd06d80a9..b1177534939a 100644 --- a/packages/kokkos-kernels/common/unit_test/Test_Common_float128.hpp +++ b/packages/kokkos-kernels/common/unit_test/Test_Common_float128.hpp @@ -131,8 +131,8 @@ void testfloat128() { } // Assign to the first entry, atomically. - Kokkos::atomic_assign(&view(0), z); - cout << "view(0) after atomic_assign (z) = " << view(0) << endl; + Kokkos::atomic_store(&view(0), z); + cout << "view(0) after atomic_store (z) = " << view(0) << endl; if (view(0) != z) { success = false; } diff --git a/packages/kokkos-kernels/example/gmres/test_prec.cpp b/packages/kokkos-kernels/example/gmres/test_prec.cpp index 942dc176b64f..c557a735bd3f 100644 --- a/packages/kokkos-kernels/example/gmres/test_prec.cpp +++ b/packages/kokkos-kernels/example/gmres/test_prec.cpp @@ -126,6 +126,7 @@ int main(int argc, char* argv[]) { if (endRes < convTol && numIters == 1) { pass = true; } + delete myPrec; } Kokkos::finalize(); diff --git a/packages/kokkos-kernels/example/half/xpy.cpp b/packages/kokkos-kernels/example/half/xpy.cpp index cf3b5767f710..f2c56383cad8 100644 --- a/packages/kokkos-kernels/example/half/xpy.cpp +++ b/packages/kokkos-kernels/example/half/xpy.cpp @@ -94,7 +94,7 @@ int main(int argc, char **argv) { return 1; } using LayoutType = Kokkos::LayoutLeft; - using DeviceType = default_device; + using DeviceType = KokkosKernels::default_device; size_t n = atoi(argv[1]); bool time_only = static_cast(atoi(argv[2])); do_xpy(n, time_only); diff --git a/packages/kokkos-kernels/example/wiki/graph/KokkosGraph_wiki_9pt_stencil.hpp b/packages/kokkos-kernels/example/wiki/graph/KokkosGraph_wiki_9pt_stencil.hpp index 2137bf09e5ab..000fa18c2539 100644 --- a/packages/kokkos-kernels/example/wiki/graph/KokkosGraph_wiki_9pt_stencil.hpp +++ b/packages/kokkos-kernels/example/wiki/graph/KokkosGraph_wiki_9pt_stencil.hpp @@ -25,16 +25,16 @@ #include #include -using Ordinal = default_lno_t; -using Offset = default_size_type; -using Layout = default_layout; +using Ordinal = KokkosKernels::default_lno_t; +using Offset = KokkosKernels::default_size_type; +using Layout = KokkosKernels::default_layout; using ExecSpace = Kokkos::DefaultExecutionSpace; using DeviceSpace = typename ExecSpace::memory_space; using Kokkos::HostSpace; using RowmapType = Kokkos::View; using ColindsType = Kokkos::View; -using Handle = KokkosKernels::Experimental::KokkosKernelsHandle; +using Handle = KokkosKernels::Experimental::KokkosKernelsHandle; namespace GraphDemo { Ordinal gridX = 15; diff --git a/packages/kokkos-kernels/example/wiki/sparse/KokkosSparse_wiki_bsrmatrix.cpp b/packages/kokkos-kernels/example/wiki/sparse/KokkosSparse_wiki_bsrmatrix.cpp index 49721e595ed4..0e129e2baa4d 100644 --- a/packages/kokkos-kernels/example/wiki/sparse/KokkosSparse_wiki_bsrmatrix.cpp +++ b/packages/kokkos-kernels/example/wiki/sparse/KokkosSparse_wiki_bsrmatrix.cpp @@ -23,10 +23,10 @@ #include "KokkosSparse_BsrMatrix.hpp" #include "KokkosSparse_CrsMatrix.hpp" -using Scalar = default_scalar; -using Ordinal = default_lno_t; -using Offset = default_size_type; -using Layout = default_layout; +using Scalar = KokkosKernels::default_scalar; +using Ordinal = KokkosKernels::default_lno_t; +using Offset = KokkosKernels::default_size_type; +using Layout = KokkosKernels::default_layout; int main() { Kokkos::initialize(); diff --git a/packages/kokkos-kernels/example/wiki/sparse/KokkosSparse_wiki_bsrmatrix_2.cpp b/packages/kokkos-kernels/example/wiki/sparse/KokkosSparse_wiki_bsrmatrix_2.cpp index 527b0d56c4e7..83d1ea665e02 100644 --- a/packages/kokkos-kernels/example/wiki/sparse/KokkosSparse_wiki_bsrmatrix_2.cpp +++ b/packages/kokkos-kernels/example/wiki/sparse/KokkosSparse_wiki_bsrmatrix_2.cpp @@ -23,10 +23,10 @@ #include "KokkosKernels_default_types.hpp" #include "KokkosSparse_BsrMatrix.hpp" -using Scalar = default_scalar; -using Ordinal = default_lno_t; -using Offset = default_size_type; -using Layout = default_layout; +using Scalar = KokkosKernels::default_scalar; +using Ordinal = KokkosKernels::default_lno_t; +using Offset = KokkosKernels::default_size_type; +using Layout = KokkosKernels::default_layout; template struct bsr_fill { diff --git a/packages/kokkos-kernels/example/wiki/sparse/KokkosSparse_wiki_crsmatrix.cpp b/packages/kokkos-kernels/example/wiki/sparse/KokkosSparse_wiki_crsmatrix.cpp index 21257d803481..228958908be0 100644 --- a/packages/kokkos-kernels/example/wiki/sparse/KokkosSparse_wiki_crsmatrix.cpp +++ b/packages/kokkos-kernels/example/wiki/sparse/KokkosSparse_wiki_crsmatrix.cpp @@ -21,10 +21,10 @@ #include "KokkosSparse_CrsMatrix.hpp" #include "KokkosSparse_spmv.hpp" -using Scalar = default_scalar; -using Ordinal = default_lno_t; -using Offset = default_size_type; -using Layout = default_layout; +using Scalar = KokkosKernels::default_scalar; +using Ordinal = KokkosKernels::default_lno_t; +using Offset = KokkosKernels::default_size_type; +using Layout = KokkosKernels::default_layout; int main() { Kokkos::initialize(); diff --git a/packages/kokkos-kernels/example/wiki/sparse/KokkosSparse_wiki_gauss_seidel.cpp b/packages/kokkos-kernels/example/wiki/sparse/KokkosSparse_wiki_gauss_seidel.cpp index 31ccea3b0a0e..0ecf5c182829 100644 --- a/packages/kokkos-kernels/example/wiki/sparse/KokkosSparse_wiki_gauss_seidel.cpp +++ b/packages/kokkos-kernels/example/wiki/sparse/KokkosSparse_wiki_gauss_seidel.cpp @@ -30,17 +30,17 @@ // Helper to print out colors in the shape of the grid int main() { - using Scalar = default_scalar; + using Scalar = KokkosKernels::default_scalar; using Mag = Kokkos::ArithTraits::mag_type; - using Ordinal = default_lno_t; - using Offset = default_size_type; + using Ordinal = KokkosKernels::default_lno_t; + using Offset = KokkosKernels::default_size_type; using ExecSpace = Kokkos::DefaultExecutionSpace; using MemSpace = typename ExecSpace::memory_space; using Device = Kokkos::Device; - using Handle = - KokkosKernels::Experimental::KokkosKernelsHandle; - using Matrix = KokkosSparse::CrsMatrix; - using Vector = typename Matrix::values_type; + using Handle = KokkosKernels::Experimental::KokkosKernelsHandle; + using Matrix = KokkosSparse::CrsMatrix; + using Vector = typename Matrix::values_type; constexpr Ordinal numRows = 10000; const Scalar one = Kokkos::ArithTraits::one(); const Mag magOne = Kokkos::ArithTraits::one(); diff --git a/packages/kokkos-kernels/example/wiki/sparse/KokkosSparse_wiki_spadd.cpp b/packages/kokkos-kernels/example/wiki/sparse/KokkosSparse_wiki_spadd.cpp index c9edd7bc0c99..6282b8f250e1 100644 --- a/packages/kokkos-kernels/example/wiki/sparse/KokkosSparse_wiki_spadd.cpp +++ b/packages/kokkos-kernels/example/wiki/sparse/KokkosSparse_wiki_spadd.cpp @@ -20,10 +20,10 @@ #include "KokkosKernels_Test_Structured_Matrix.hpp" -using Scalar = default_scalar; -using Ordinal = default_lno_t; -using Offset = default_size_type; -using Layout = default_layout; +using Scalar = KokkosKernels::default_scalar; +using Ordinal = KokkosKernels::default_lno_t; +using Offset = KokkosKernels::default_size_type; +using Layout = KokkosKernels::default_layout; int main() { Kokkos::initialize(); diff --git a/packages/kokkos-kernels/example/wiki/sparse/KokkosSparse_wiki_spgemm.cpp b/packages/kokkos-kernels/example/wiki/sparse/KokkosSparse_wiki_spgemm.cpp index 2b3ccd13d2b2..2e622f8b6161 100644 --- a/packages/kokkos-kernels/example/wiki/sparse/KokkosSparse_wiki_spgemm.cpp +++ b/packages/kokkos-kernels/example/wiki/sparse/KokkosSparse_wiki_spgemm.cpp @@ -20,10 +20,10 @@ #include "KokkosKernels_Test_Structured_Matrix.hpp" -using Scalar = default_scalar; -using Ordinal = default_lno_t; -using Offset = default_size_type; -using Layout = default_layout; +using Scalar = KokkosKernels::default_scalar; +using Ordinal = KokkosKernels::default_lno_t; +using Offset = KokkosKernels::default_size_type; +using Layout = KokkosKernels::default_layout; int main() { Kokkos::initialize(); diff --git a/packages/kokkos-kernels/example/wiki/sparse/KokkosSparse_wiki_spmv.cpp b/packages/kokkos-kernels/example/wiki/sparse/KokkosSparse_wiki_spmv.cpp index 5778684a8a16..1caeec7e54de 100644 --- a/packages/kokkos-kernels/example/wiki/sparse/KokkosSparse_wiki_spmv.cpp +++ b/packages/kokkos-kernels/example/wiki/sparse/KokkosSparse_wiki_spmv.cpp @@ -21,10 +21,10 @@ #include "KokkosKernels_Test_Structured_Matrix.hpp" -using Scalar = default_scalar; -using Ordinal = default_lno_t; -using Offset = default_size_type; -using Layout = default_layout; +using Scalar = KokkosKernels::default_scalar; +using Ordinal = KokkosKernels::default_lno_t; +using Offset = KokkosKernels::default_size_type; +using Layout = KokkosKernels::default_layout; template struct check_spmv_functor { diff --git a/packages/kokkos-kernels/graph/eti/generated_specializations_hpp/KokkosGraph_color_d1_eti_spec_decl.hpp.in b/packages/kokkos-kernels/graph/eti/generated_specializations_hpp/KokkosGraph_color_d1_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..23e16995573f --- /dev/null +++ b/packages/kokkos-kernels/graph/eti/generated_specializations_hpp/KokkosGraph_color_d1_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSGRAPH_COLOR_D1_ETI_SPEC_DECL_HPP_ +#define KOKKOSGRAPH_COLOR_D1_ETI_SPEC_DECL_HPP_ +namespace KokkosGraph { +namespace Impl { +@GRAPH_COLOR_D1_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/packages/kokkos-kernels/graph/impl/KokkosGraph_Distance1Color_impl.hpp b/packages/kokkos-kernels/graph/impl/KokkosGraph_Distance1Color_impl.hpp index 2abc5c76e4c1..8d3808453be4 100644 --- a/packages/kokkos-kernels/graph/impl/KokkosGraph_Distance1Color_impl.hpp +++ b/packages/kokkos-kernels/graph/impl/KokkosGraph_Distance1Color_impl.hpp @@ -2529,7 +2529,7 @@ class GraphColor_EB : public GraphColor(&(color_ban(uncolored_vertex)), src_col | dst_col); + Kokkos::atomic_fetch_or(&(color_ban(uncolored_vertex)), src_col | dst_col); edge_conflict_marker(work_index) = 0; } } @@ -2616,7 +2616,7 @@ class GraphColor_EB : public GraphColor(&(tentative_color_ban(smaller_index)), -src_col); + Kokkos::atomic_fetch_or(&(tentative_color_ban(smaller_index)), -src_col); nnz_lno_t banned_colors = ~(color_ban(smaller_index) | tentative_color_ban(smaller_index)); nnz_lno_t larger_col = banned_colors & (-banned_colors); kokcolors(smaller_index) = -(larger_col); @@ -2625,16 +2625,14 @@ class GraphColor_EB : public GraphColor(&(color_ban(dst_id)), - // -src_col); - Kokkos::atomic_fetch_or(&(tentative_color_ban(dst_id)), -src_col); + // Kokkos::atomic_fetch_or(&(color_ban(dst_id)), -src_col); + Kokkos::atomic_fetch_or(&(tentative_color_ban(dst_id)), -src_col); } else if (dst_col != 0) { // if it is dst tentatively colors, but src is not colored, // then we send the dst color info to src's tentative_ban - // Kokkos::atomic_fetch_or(&(color_ban(src_id)), - // -dst_col); - Kokkos::atomic_fetch_or(&(tentative_color_ban(src_id)), -dst_col); + // Kokkos::atomic_fetch_or(&(color_ban(src_id)), -dst_col); + Kokkos::atomic_fetch_or(&(tentative_color_ban(src_id)), -dst_col); } else { // idx smaller_index = src_id < dst_id > 0 ? src_id: dst_id; // idx larger_index = src_id < dst_id > 0 ? dst_id : src_id; @@ -2660,9 +2658,8 @@ class GraphColor_EB : public GraphColor(&(tentative_color_ban(larger_index)), src_col); - // Kokkos::atomic_fetch_or(&(color_ban(dst_id)), - // src_col); + Kokkos::atomic_fetch_or(&(tentative_color_ban(larger_index)), src_col); + // Kokkos::atomic_fetch_or(&(color_ban(dst_id)), src_col); } } } diff --git a/packages/kokkos-kernels/graph/impl/KokkosGraph_Distance2Color_impl.hpp b/packages/kokkos-kernels/graph/impl/KokkosGraph_Distance2Color_impl.hpp index cfa5186283e2..2a6515851579 100644 --- a/packages/kokkos-kernels/graph/impl/KokkosGraph_Distance2Color_impl.hpp +++ b/packages/kokkos-kernels/graph/impl/KokkosGraph_Distance2Color_impl.hpp @@ -616,7 +616,7 @@ class GraphColorDistance2 { const lno_t numVerts = this->nr; const lno_t numCols = this->nc; // note: relying on forbidden and colors_out being initialized to 0 - forbidden_view forbidden("Forbidden", batch * numCols); + forbidden_view forbidden("Forbidden", static_cast(batch) * numCols); int iter = 0; Kokkos::Timer timer; lno_t currentWork = this->nr; diff --git a/packages/kokkos-kernels/graph/impl/KokkosGraph_Distance2MIS_impl.hpp b/packages/kokkos-kernels/graph/impl/KokkosGraph_Distance2MIS_impl.hpp index e39e1e7ad3e6..89562fd9014c 100644 --- a/packages/kokkos-kernels/graph/impl/KokkosGraph_Distance2MIS_impl.hpp +++ b/packages/kokkos-kernels/graph/impl/KokkosGraph_Distance2MIS_impl.hpp @@ -317,7 +317,7 @@ struct D2_MIS_RandomPriority { KokkosKernels::Impl::sequential_fill(colWorklist); worklist_t thirdWorklist = Kokkos::subview(allWorklists, Kokkos::ALL(), 2); auto execSpaceEnum = KokkosKernels::Impl::kk_get_exec_space_type(); - bool useTeams = KokkosKernels::Impl::kk_is_gpu_exec_space() && (entries.extent(0) / numVerts >= 16); + bool useTeams = KokkosKernels::Impl::is_gpu_exec_space_v && (entries.extent(0) / numVerts >= 16); int vectorLength = KokkosKernels::Impl::kk_get_suggested_vector_size(numVerts, entries.extent(0), execSpaceEnum); int round = 0; lno_t rowWorkLen = numVerts; @@ -396,7 +396,7 @@ struct D2_MIS_RandomPriority { Kokkos::deep_copy(rowStatus, ~(status_t(0))); worklist_t thirdWorklist = Kokkos::subview(allWorklists, Kokkos::ALL(), 2); auto execSpaceEnum = KokkosKernels::Impl::kk_get_exec_space_type(); - bool useTeams = KokkosKernels::Impl::kk_is_gpu_exec_space() && (entries.extent(0) / numVerts >= 16); + bool useTeams = KokkosKernels::Impl::is_gpu_exec_space_v && (entries.extent(0) / numVerts >= 16); int vectorLength = KokkosKernels::Impl::kk_get_suggested_vector_size(numVerts, entries.extent(0), execSpaceEnum); int round = 0; int refreshColTeamSize = 0; @@ -963,7 +963,7 @@ struct D2_MIS_Aggregation { KOKKOS_INLINE_FUNCTION void operator()(lno_t i) const { lno_t agg = labels_(i); if (agg != -1) { - Kokkos::atomic_increment(&aggSizes_(agg)); + Kokkos::atomic_inc(&aggSizes_(agg)); // compute connectivity of i size_type rowBegin = rowmap_(i); size_type rowEnd = rowmap_(i + 1); diff --git a/packages/kokkos-kernels/graph/impl/KokkosGraph_ExplicitCoarsening_impl.hpp b/packages/kokkos-kernels/graph/impl/KokkosGraph_ExplicitCoarsening_impl.hpp index dc0e802485bd..7aff62f31823 100644 --- a/packages/kokkos-kernels/graph/impl/KokkosGraph_ExplicitCoarsening_impl.hpp +++ b/packages/kokkos-kernels/graph/impl/KokkosGraph_ExplicitCoarsening_impl.hpp @@ -33,7 +33,7 @@ struct ExplicitGraphCoarsening { struct ClusterSizeFunctor { ClusterSizeFunctor(const ordinal_view_t& counts_, const labels_t& vertClusters_) : counts(counts_), vertClusters(vertClusters_) {} - KOKKOS_INLINE_FUNCTION void operator()(const lno_t i) const { Kokkos::atomic_increment(&counts(vertClusters(i))); } + KOKKOS_INLINE_FUNCTION void operator()(const lno_t i) const { Kokkos::atomic_inc(&counts(vertClusters(i))); } ordinal_view_t counts; labels_t vertClusters; }; @@ -98,7 +98,7 @@ struct ExplicitGraphCoarsening { KOKKOS_INLINE_FUNCTION bool insert(lno_t cluster, lno_t nei, int* table) const { unsigned h = xorshiftHash(nei); for (unsigned i = h; i < h + 2; i++) { - if (Kokkos::atomic_compare_exchange_strong(&table[i % tableSize()], cluster, nei)) return true; + if (cluster == Kokkos::atomic_compare_exchange(&table[i % tableSize()], cluster, nei)) return true; } return false; } diff --git a/packages/kokkos-kernels/graph/impl/KokkosGraph_color_d1_spec.hpp b/packages/kokkos-kernels/graph/impl/KokkosGraph_color_d1_spec.hpp index 178fdd9182ba..21b9dae2c7b3 100644 --- a/packages/kokkos-kernels/graph/impl/KokkosGraph_color_d1_spec.hpp +++ b/packages/kokkos-kernels/graph/impl/KokkosGraph_color_d1_spec.hpp @@ -103,4 +103,6 @@ struct COLOR_D1>, \ false, true>; +#include + #endif diff --git a/packages/kokkos-kernels/graph/src/KokkosGraph_CoarsenConstruct.hpp b/packages/kokkos-kernels/graph/src/KokkosGraph_CoarsenConstruct.hpp index 8e1cce3ddb52..d0a48a6f88ca 100644 --- a/packages/kokkos-kernels/graph/src/KokkosGraph_CoarsenConstruct.hpp +++ b/packages/kokkos-kernels/graph/src/KokkosGraph_CoarsenConstruct.hpp @@ -73,8 +73,8 @@ struct SortLowDegreeCrsMatrixFunctor { Kokkos::single(Kokkos::PerTeam(t), [&]() { reducer++; }); return; } - KokkosKernels::TeamBitonicSort2(entries.data() + rowStart, - values.data() + rowStart, rowNum, t); + Kokkos::Experimental::sort_by_key_team(t, Kokkos::subview(entries, Kokkos::make_pair(rowStart, rowEnd)), + Kokkos::subview(values, Kokkos::make_pair(rowStart, rowEnd))); } rowmap_t rowmap; @@ -96,7 +96,7 @@ typename entries_t::non_const_value_type sort_low_degree_rows_crs_matrix( const typename entries_t::non_const_value_type degreeLimit) { using lno_t = typename entries_t::non_const_value_type; using team_pol = Kokkos::TeamPolicy; - bool useRadix = !KokkosKernels::Impl::kk_is_gpu_exec_space(); + bool useRadix = !KokkosKernels::Impl::is_gpu_exec_space_v; Impl::SortLowDegreeCrsMatrixFunctor funct(useRadix, rowmap, entries, values, degreeLimit); lno_t numRows = rowmap.extent(0) ? rowmap.extent(0) - 1 : 0; diff --git a/packages/kokkos-kernels/graph/src/KokkosGraph_CoarsenHeuristics.hpp b/packages/kokkos-kernels/graph/src/KokkosGraph_CoarsenHeuristics.hpp index f136882d8992..3a6dfbda6d00 100644 --- a/packages/kokkos-kernels/graph/src/KokkosGraph_CoarsenHeuristics.hpp +++ b/packages/kokkos-kernels/graph/src/KokkosGraph_CoarsenHeuristics.hpp @@ -86,7 +86,7 @@ class coarsen_heuristics { if (bucket >= t_buckets) bucket -= t_buckets; if (buckets(bucket) == ORD_MAX) { // attempt to insert into bucket - if (Kokkos::atomic_compare_exchange_strong(&buckets(bucket), ORD_MAX, i)) { + if (ORD_MAX == Kokkos::atomic_compare_exchange(&buckets(bucket), ORD_MAX, i)) { break; } } @@ -140,8 +140,8 @@ class coarsen_heuristics { // need to enforce an ordering condition to allow hard-stall // conditions to be broken if (condition ^ swap) { - if (Kokkos::atomic_compare_exchange_strong(&match(u), ORD_MAX, v)) { - if (u == v || Kokkos::atomic_compare_exchange_strong(&match(v), ORD_MAX, u)) { + if (ORD_MAX == Kokkos::atomic_compare_exchange(&match(u), ORD_MAX, v)) { + if (u == v || ORD_MAX == Kokkos::atomic_compare_exchange(&match(v), ORD_MAX, u)) { ordinal_t cv = Kokkos::atomic_fetch_add(&nvertices_coarse(), 1); vcmap(u) = cv; vcmap(v) = cv; @@ -201,8 +201,8 @@ class coarsen_heuristics { // need to enforce an ordering condition to allow hard-stall // conditions to be broken if (condition ^ swap) { - if (Kokkos::atomic_compare_exchange_strong(&match(u), ORD_MAX, v)) { - if (u == v || Kokkos::atomic_compare_exchange_strong(&match(v), ORD_MAX, u)) { + if (ORD_MAX == Kokkos::atomic_compare_exchange(&match(u), ORD_MAX, v)) { + if (u == v || ORD_MAX == Kokkos::atomic_compare_exchange(&match(v), ORD_MAX, u)) { ordinal_t cv = u; if (v < u) { cv = v; @@ -859,8 +859,8 @@ class coarsen_heuristics { // need to enforce an ordering condition to allow hard-stall // conditions to be broken if (condition ^ swap) { - if (Kokkos::atomic_compare_exchange_strong(&match(u), ORD_MAX, v)) { - if (u == v || Kokkos::atomic_compare_exchange_strong(&match(v), ORD_MAX, u)) { + if (ORD_MAX == Kokkos::atomic_compare_exchange(&match(u), ORD_MAX, v)) { + if (u == v || ORD_MAX == Kokkos::atomic_compare_exchange(&match(v), ORD_MAX, u)) { // u == v avoids problems if there is a self-loop edge ordinal_t cv = Kokkos::atomic_fetch_add(&nvertices_coarse(), 1); vcmap(u) = cv; @@ -1084,8 +1084,8 @@ class coarsen_heuristics { // need to enforce an ordering condition to allow hard-stall // conditions to be broken if (condition ^ swap) { - if (Kokkos::atomic_compare_exchange_strong(&match(u), ORD_MAX, v)) { - if (Kokkos::atomic_compare_exchange_strong(&match(v), ORD_MAX, u)) { + if (ORD_MAX == Kokkos::atomic_compare_exchange(&match(u), ORD_MAX, v)) { + if (ORD_MAX == Kokkos::atomic_compare_exchange(&match(v), ORD_MAX, u)) { ordinal_t cv = Kokkos::atomic_fetch_add(&nvertices_coarse(), 1); vcmap(u) = cv; vcmap(v) = cv; diff --git a/packages/kokkos-kernels/graph/src/KokkosGraph_Distance1ColorHandle.hpp b/packages/kokkos-kernels/graph/src/KokkosGraph_Distance1ColorHandle.hpp index 1eefd07c4d29..bcc71f147f4b 100644 --- a/packages/kokkos-kernels/graph/src/KokkosGraph_Distance1ColorHandle.hpp +++ b/packages/kokkos-kernels/graph/src/KokkosGraph_Distance1ColorHandle.hpp @@ -226,7 +226,7 @@ class GraphColoringHandle { #ifdef VERBOSE std::cout << ExecutionSpace::name() << " Execution Space, Default Algorithm: COLORING_VBBIT\n"; #endif - } else if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { + } else if (KokkosKernels::Impl::is_gpu_exec_space_v) { this->coloring_algorithm_type = COLORING_EB; #ifdef VERBOSE std::cout << ExecutionSpace::name() << " Execution Space, Default Algorithm: COLORING_EB\n"; @@ -402,7 +402,7 @@ class GraphColoringHandle { size_type_temp_work_view_t lower_count("LowerXADJ", nv + 1); size_type new_num_edge = 0; typedef Kokkos::RangePolicy my_exec_space; - if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { + if (KokkosKernels::Impl::is_gpu_exec_space_v) { int teamSizeMax = 0; int vector_size = 0; diff --git a/packages/kokkos-kernels/graph/unit_test/CMakeLists.txt b/packages/kokkos-kernels/graph/unit_test/CMakeLists.txt index b49795315915..00fa135481b6 100644 --- a/packages/kokkos-kernels/graph/unit_test/CMakeLists.txt +++ b/packages/kokkos-kernels/graph/unit_test/CMakeLists.txt @@ -10,12 +10,17 @@ KOKKOSKERNELS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_C # # ##################### +SET(KK_ENABLE_GRAPH_TESTS ON) + IF (KokkosKernels_TEST_ETI_ONLY) IF (NOT KokkosKernels_INST_DOUBLE AND NOT KokkosKernels_INST_FLOAT) - MESSAGE(FATAL_ERROR "Because only ETI'd type combinations are enabled for testing, the Kokkos Kernels graph tests require that double or float is enabled in ETI.") + MESSAGE(WARNING "Because only ETI'd type combinations are enabled for testing, the Kokkos Kernels graph tests require that double or float is enabled in ETI.") + SET(KK_ENABLE_GRAPH_TESTS OFF) ENDIF () ENDIF () +IF(KK_ENABLE_GRAPH_TESTS) + ##################### # # # Add GPU backends # @@ -97,4 +102,4 @@ IF (KOKKOS_ENABLE_THREADS) COMPONENTS graph ) ENDIF () - +ENDIF () diff --git a/packages/kokkos-kernels/graph/unit_test/Test_Graph_graph_color.hpp b/packages/kokkos-kernels/graph/unit_test/Test_Graph_graph_color.hpp index 3ddfa7c9b0de..e53261edf76e 100644 --- a/packages/kokkos-kernels/graph/unit_test/Test_Graph_graph_color.hpp +++ b/packages/kokkos-kernels/graph/unit_test/Test_Graph_graph_color.hpp @@ -169,30 +169,30 @@ void test_coloring(lno_t numRows, size_type nnz, lno_t bandwidth, lno_t row_size // device::execution_space::finalize(); } -#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ - TEST_F(TestCategory, graph##_##graph_color##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_coloring(50000, 50000 * 30, 200, 10); \ - test_coloring(50000, 50000 * 30, 100, 10); \ +#define EXECUTE_TEST(ORDINAL, OFFSET, DEVICE) \ + TEST_F(TestCategory, graph##_##graph_color##_default_scalar_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_coloring(50000, 50000 * 30, 200, 10); \ + test_coloring(50000, 50000 * 30, 100, 10); \ } #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(default_scalar, int, int, TestDevice) +EXECUTE_TEST(int, int, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(default_scalar, int64_t, int, TestDevice) +EXECUTE_TEST(int64_t, int, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(default_scalar, int, size_t, TestDevice) +EXECUTE_TEST(int, size_t, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(default_scalar, int64_t, size_t, TestDevice) +EXECUTE_TEST(int64_t, size_t, TestDevice) #endif #undef EXECUTE_TEST diff --git a/packages/kokkos-kernels/graph/unit_test/Test_Graph_graph_color_deterministic.hpp b/packages/kokkos-kernels/graph/unit_test/Test_Graph_graph_color_deterministic.hpp index 87771de84fd7..66cda713dda9 100644 --- a/packages/kokkos-kernels/graph/unit_test/Test_Graph_graph_color_deterministic.hpp +++ b/packages/kokkos-kernels/graph/unit_test/Test_Graph_graph_color_deterministic.hpp @@ -222,30 +222,30 @@ void test_coloring_deterministic(lno_t numRows, size_type nnz) { } } -#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ - TEST_F(TestCategory, graph##_##graph_color_deterministic##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_coloring_deterministic(18, 74); \ - test_coloring_deterministic(18, 74); \ +#define EXECUTE_TEST(ORDINAL, OFFSET, DEVICE) \ + TEST_F(TestCategory, graph##_##graph_color_deterministic##_default_scalar_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_coloring_deterministic(18, 74); \ + test_coloring_deterministic(18, 74); \ } #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(default_scalar, int, int, TestDevice) +EXECUTE_TEST(int, int, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(default_scalar, int64_t, int, TestDevice) +EXECUTE_TEST(int64_t, int, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(default_scalar, int, size_t, TestDevice) +EXECUTE_TEST(int, size_t, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(default_scalar, int64_t, size_t, TestDevice) +EXECUTE_TEST(int64_t, size_t, TestDevice) #endif #undef EXECUTE_TEST diff --git a/packages/kokkos-kernels/graph/unit_test/Test_Graph_rcm.hpp b/packages/kokkos-kernels/graph/unit_test/Test_Graph_rcm.hpp index 0a9543367a53..096bdb696162 100644 --- a/packages/kokkos-kernels/graph/unit_test/Test_Graph_rcm.hpp +++ b/packages/kokkos-kernels/graph/unit_test/Test_Graph_rcm.hpp @@ -119,7 +119,7 @@ void test_rcm(const rowmap_t& rowmap, const entries_t& entries, bool expectBandw template void test_rcm_zerorows() { - using graph_t = Kokkos::StaticCrsGraph; + using graph_t = Kokkos::StaticCrsGraph; using rowmap_t = typename graph_t::row_map_type::non_const_type; using entries_t = typename graph_t::entries_type::non_const_type; rowmap_t rowmap; @@ -129,7 +129,7 @@ void test_rcm_zerorows() { template void test_rcm_7pt(lno_t gridX, lno_t gridY, lno_t gridZ, bool expectBandwidthReduced) { - using graph_t = Kokkos::StaticCrsGraph; + using graph_t = Kokkos::StaticCrsGraph; using rowmap_t = typename graph_t::row_map_type::non_const_type; using entries_t = typename graph_t::entries_type::non_const_type; rowmap_t rowmap; @@ -140,7 +140,7 @@ void test_rcm_7pt(lno_t gridX, lno_t gridY, lno_t gridZ, bool expectBandwidthRed template void test_rcm_4clique() { - using graph_t = Kokkos::StaticCrsGraph; + using graph_t = Kokkos::StaticCrsGraph; using rowmap_t = typename graph_t::row_map_type::non_const_type; using entries_t = typename graph_t::entries_type::non_const_type; rowmap_t rowmap("rowmap", 5); @@ -156,7 +156,7 @@ void test_rcm_4clique() { template void test_rcm_multiple_components() { - using graph_t = Kokkos::StaticCrsGraph; + using graph_t = Kokkos::StaticCrsGraph; using rowmap_t = typename graph_t::row_map_type::non_const_type; using entries_t = typename graph_t::entries_type::non_const_type; // Generate a single 3D grid first diff --git a/packages/kokkos-kernels/lapack/eti/generated_specializations_hpp/KokkosLapack_gesv_eti_spec_decl.hpp.in b/packages/kokkos-kernels/lapack/eti/generated_specializations_hpp/KokkosLapack_gesv_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..89f5ac3cc154 --- /dev/null +++ b/packages/kokkos-kernels/lapack/eti/generated_specializations_hpp/KokkosLapack_gesv_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSLAPACK_GESV_ETI_SPEC_DECL_HPP_ +#define KOKKOSLAPACK_GESV_ETI_SPEC_DECL_HPP_ +namespace KokkosLapack { +namespace Impl { +@LAPACK_GESV_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/packages/kokkos-kernels/lapack/eti/generated_specializations_hpp/KokkosLapack_svd_eti_spec_decl.hpp.in b/packages/kokkos-kernels/lapack/eti/generated_specializations_hpp/KokkosLapack_svd_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..a6d110429b45 --- /dev/null +++ b/packages/kokkos-kernels/lapack/eti/generated_specializations_hpp/KokkosLapack_svd_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSLAPACK_SVD_ETI_SPEC_DECL_HPP_ +#define KOKKOSLAPACK_SVD_ETI_SPEC_DECL_HPP_ +namespace KokkosLapack { +namespace Impl { +@LAPACK_SVD_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/packages/kokkos-kernels/lapack/eti/generated_specializations_hpp/KokkosLapack_trtri_eti_spec_decl.hpp.in b/packages/kokkos-kernels/lapack/eti/generated_specializations_hpp/KokkosLapack_trtri_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..3e82f0f8e6cc --- /dev/null +++ b/packages/kokkos-kernels/lapack/eti/generated_specializations_hpp/KokkosLapack_trtri_eti_spec_decl.hpp.in @@ -0,0 +1,26 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSLAPACK_TRTRI_ETI_SPEC_DECL_HPP_ +#define KOKKOSLAPACK_TRTRI_ETI_SPEC_DECL_HPP_ +namespace KokkosLapack { +namespace Impl { + +@LAPACK_TRTRI_ETI_DECL_BLOCK@ + +} // Impl +} // KokkosLapack +#endif // KOKKOSLAPACK_TRTRI_ETI_SPEC_DECL_HPP_ diff --git a/packages/kokkos-kernels/lapack/impl/KokkosLapack_gesv_spec.hpp b/packages/kokkos-kernels/lapack/impl/KokkosLapack_gesv_spec.hpp index 60a69e72b3e2..ebed196e3bac 100644 --- a/packages/kokkos-kernels/lapack/impl/KokkosLapack_gesv_spec.hpp +++ b/packages/kokkos-kernels/lapack/impl/KokkosLapack_gesv_spec.hpp @@ -120,5 +120,6 @@ struct GESV; #include +#include #endif // KOKKOSLAPACK_IMPL_GESV_SPEC_HPP_ diff --git a/packages/kokkos-kernels/lapack/impl/KokkosLapack_svd_spec.hpp b/packages/kokkos-kernels/lapack/impl/KokkosLapack_svd_spec.hpp index b0dfe3d091bf..470a8c0e8e45 100644 --- a/packages/kokkos-kernels/lapack/impl/KokkosLapack_svd_spec.hpp +++ b/packages/kokkos-kernels/lapack/impl/KokkosLapack_svd_spec.hpp @@ -128,5 +128,6 @@ struct SVD; #include +#include #endif // KOKKOSLAPACK_IMPL_SVD_SPEC_HPP_ diff --git a/packages/kokkos-kernels/lapack/impl/KokkosLapack_trtri_spec.hpp b/packages/kokkos-kernels/lapack/impl/KokkosLapack_trtri_spec.hpp index ef458f7e5703..0af55f4c6cea 100644 --- a/packages/kokkos-kernels/lapack/impl/KokkosLapack_trtri_spec.hpp +++ b/packages/kokkos-kernels/lapack/impl/KokkosLapack_trtri_spec.hpp @@ -114,5 +114,6 @@ struct TRTRI { false, true>; #include +#include #endif // KOKKOSLAPACK_TRTRI_SPEC_HPP_ diff --git a/packages/kokkos-kernels/lapack/src/KokkosLapack_gesv.hpp b/packages/kokkos-kernels/lapack/src/KokkosLapack_gesv.hpp index 281d6a565148..1b183981fe65 100644 --- a/packages/kokkos-kernels/lapack/src/KokkosLapack_gesv.hpp +++ b/packages/kokkos-kernels/lapack/src/KokkosLapack_gesv.hpp @@ -63,9 +63,15 @@ void gesv(const ExecutionSpace& space, const AMatrix& A, const BXMV& B, const IP static_assert(Kokkos::SpaceAccessibility::accessible); static_assert(Kokkos::SpaceAccessibility::accessible); #if defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) +#if defined(KOKKOS_ENABLE_CUDA) if constexpr (!std::is_same_v) { static_assert(Kokkos::SpaceAccessibility::accessible); } +#elif defined(KOKKOS_ENABLE_HIP) + if constexpr (!std::is_same_v) { + static_assert(Kokkos::SpaceAccessibility::accessible); + } +#endif #else static_assert(Kokkos::SpaceAccessibility::accessible); #endif @@ -96,6 +102,7 @@ void gesv(const ExecutionSpace& space, const AMatrix& A, const BXMV& B, const IP // Check for no pivoting case. Only MAGMA supports no pivoting interface #ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA // have MAGMA TPL #ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK // and have LAPACK TPL +#if defined(KOKKOS_ENABLE_CUDA) if ((!std::is_same::value) && (IPIV0 == 0) && (IPIV.data() == nullptr)) { std::ostringstream os; @@ -103,6 +110,15 @@ void gesv(const ExecutionSpace& space, const AMatrix& A, const BXMV& B, const IP << "LAPACK TPL does not support no pivoting."; KokkosKernels::Impl::throw_runtime_exception(os.str()); } +#elif defined(KOKKOS_ENABLE_HIP) + if ((!std::is_same::value) && (IPIV0 == 0) && + (IPIV.data() == nullptr)) { + std::ostringstream os; + os << "KokkosLapack::gesv: IPIV: " << IPIV0 << ". " + << "LAPACK TPL does not support no pivoting."; + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } +#endif #endif #else // not have MAGMA TPL #ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK // but have LAPACK TPL diff --git a/packages/kokkos-kernels/lapack/src/KokkosLapack_svd.hpp b/packages/kokkos-kernels/lapack/src/KokkosLapack_svd.hpp index c0c962fb19d9..e050e75b59fd 100644 --- a/packages/kokkos-kernels/lapack/src/KokkosLapack_svd.hpp +++ b/packages/kokkos-kernels/lapack/src/KokkosLapack_svd.hpp @@ -127,20 +127,34 @@ void svd(const ExecutionSpace& space, const char jobu[], const char jobvt[], con is_extent_invalid = true; os << "KokkosLapack::svd: S has extent " << S.extent(0) << ", instead of " << rankA << ".\n"; } - if ((jobu[0] == 'A') || (jobu[0] == 'a') || (jobu[0] == 'S') || (jobu[0] == 's')) { + if ((jobu[0] == 'A') || (jobu[0] == 'a')) { if (U.extent_int(0) != m || U.extent_int(1) != m) { is_extent_invalid = true; os << "KokkosLapack::svd: U has extents (" << U.extent(0) << ", " << U.extent(1) << ") instead of (" << m << ", " << m << ").\n"; } } - if ((jobvt[0] == 'A') || (jobvt[0] == 'a') || (jobvt[0] == 'S') || (jobvt[0] == 's')) { + if ((jobu[0] == 'S') || (jobu[0] == 's')) { + if (U.extent_int(0) != m || U.extent_int(1) != std::min(m, n)) { + is_extent_invalid = true; + os << "KokkosLapack::svd: U has extents (" << U.extent(0) << ", " << U.extent(1) << ") instead of (" << m << ", " + << std::min(m, n) << ").\n"; + } + } + if ((jobvt[0] == 'A') || (jobvt[0] == 'a')) { if (Vt.extent_int(0) != n || Vt.extent_int(1) != n) { is_extent_invalid = true; os << "KokkosLapack::svd: V has extents (" << Vt.extent(0) << ", " << Vt.extent(1) << ") instead of (" << n << ", " << n << ").\n"; } } + if ((jobvt[0] == 'S') || (jobvt[0] == 's')) { + if (Vt.extent_int(0) != std::min(m, n) || Vt.extent_int(1) != n) { + is_extent_invalid = true; + os << "KokkosLapack::svd: V has extents (" << Vt.extent(0) << ", " << Vt.extent(1) << ") instead of (" + << std::min(m, n) << ", " << n << ").\n"; + } + } if (is_extent_invalid) { KokkosKernels::Impl::throw_runtime_exception(os.str()); } diff --git a/packages/kokkos-kernels/lapack/tpls/KokkosLapack_Cuda_tpl.hpp b/packages/kokkos-kernels/lapack/tpls/KokkosLapack_Cuda_tpl.hpp index 3ead12d5f4de..e3191ea93bd1 100644 --- a/packages/kokkos-kernels/lapack/tpls/KokkosLapack_Cuda_tpl.hpp +++ b/packages/kokkos-kernels/lapack/tpls/KokkosLapack_Cuda_tpl.hpp @@ -25,12 +25,24 @@ namespace Impl { CudaLapackSingleton::CudaLapackSingleton() { cusolverStatus_t stat = cusolverDnCreate(&handle); if (stat != CUSOLVER_STATUS_SUCCESS) Kokkos::abort("CUSOLVER initialization failed\n"); - - Kokkos::push_finalize_hook([&]() { cusolverDnDestroy(handle); }); } CudaLapackSingleton& CudaLapackSingleton::singleton() { - static CudaLapackSingleton s; + std::unique_ptr& instance = get_instance(); + if (!instance) { + instance = std::make_unique(); + Kokkos::push_finalize_hook([&]() { + cusolverDnDestroy(instance->handle); + instance.reset(); + }); + } + return *instance; +} + +bool CudaLapackSingleton::is_initialized() { return get_instance() != nullptr; } + +std::unique_ptr& CudaLapackSingleton::get_instance() { + static std::unique_ptr s; return s; } diff --git a/packages/kokkos-kernels/lapack/tpls/KokkosLapack_Magma_tpl.hpp b/packages/kokkos-kernels/lapack/tpls/KokkosLapack_Magma_tpl.hpp index 636c40735db6..542f681281e8 100644 --- a/packages/kokkos-kernels/lapack/tpls/KokkosLapack_Magma_tpl.hpp +++ b/packages/kokkos-kernels/lapack/tpls/KokkosLapack_Magma_tpl.hpp @@ -25,12 +25,24 @@ namespace Impl { MagmaSingleton::MagmaSingleton() { magma_int_t stat = magma_init(); if (stat != MAGMA_SUCCESS) Kokkos::abort("MAGMA initialization failed\n"); - - Kokkos::push_finalize_hook([&]() { magma_finalize(); }); } MagmaSingleton& MagmaSingleton::singleton() { - static MagmaSingleton s; + std::unique_ptr& instance = get_instance(); + if (!instance) { + instance = std::make_unique(); + Kokkos::push_finalize_hook([&]() { + magma_finalize(); + instance.reset(); + }); + } + return *instance; +} + +bool MagmaSingleton::is_initialized() { return get_instance() != nullptr; } + +std::unique_ptr& MagmaSingleton::get_instance() { + static std::unique_ptr s; return s; } diff --git a/packages/kokkos-kernels/lapack/tpls/KokkosLapack_cusolver.hpp b/packages/kokkos-kernels/lapack/tpls/KokkosLapack_cusolver.hpp index 272fb8b3b83f..0084b70429a5 100644 --- a/packages/kokkos-kernels/lapack/tpls/KokkosLapack_cusolver.hpp +++ b/packages/kokkos-kernels/lapack/tpls/KokkosLapack_cusolver.hpp @@ -32,6 +32,11 @@ struct CudaLapackSingleton { CudaLapackSingleton(); static CudaLapackSingleton& singleton(); + + static bool is_initialized(); + + private: + static std::unique_ptr& get_instance(); }; inline void cusolver_internal_error_throw(cusolverStatus_t cusolverStatus, const char* name, const char* file, @@ -69,7 +74,7 @@ inline void cusolver_internal_safe_call(cusolverStatus_t cusolverStatus, const c // The macro below defines is the public interface for the safe cusolver calls. // The functions themselves are protected by impl namespace. -#define KOKKOS_CUSOLVER_SAFE_CALL_IMPL(call) \ +#define KOKKOSLAPACK_IMPL_CUSOLVER_SAFE_CALL(call) \ KokkosLapack::Impl::cusolver_internal_safe_call(call, #call, __FILE__, __LINE__) } // namespace Impl diff --git a/packages/kokkos-kernels/lapack/tpls/KokkosLapack_gesv_tpl_spec_avail.hpp b/packages/kokkos-kernels/lapack/tpls/KokkosLapack_gesv_tpl_spec_avail.hpp index 472b79ce85fc..50a6863b8094 100644 --- a/packages/kokkos-kernels/lapack/tpls/KokkosLapack_gesv_tpl_spec_avail.hpp +++ b/packages/kokkos-kernels/lapack/tpls/KokkosLapack_gesv_tpl_spec_avail.hpp @@ -52,23 +52,28 @@ KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, Kokkos::LayoutLe namespace KokkosLapack { namespace Impl { -#define KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(SCALAR, LAYOUT, MEMSPACE) \ - template <> \ - struct gesv_tpl_spec_avail< \ - Kokkos::Cuda, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(SCALAR, LAYOUT, EXECSPACE, MEMSPACE) \ + template <> \ + struct gesv_tpl_spec_avail< \ + EXECSPACE, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; - -KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(double, Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(float, Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) +#if defined(KOKKOS_ENABLE_CUDA) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(double, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(float, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) +#endif +#if defined(KOKKOS_ENABLE_HIP) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(double, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(float, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) +#endif } // namespace Impl } // namespace KokkosLapack #endif // KOKKOSKERNELS_ENABLE_TPL_MAGMA diff --git a/packages/kokkos-kernels/lapack/tpls/KokkosLapack_gesv_tpl_spec_decl.hpp b/packages/kokkos-kernels/lapack/tpls/KokkosLapack_gesv_tpl_spec_decl.hpp index 559f5d0509cd..7185f385cdd2 100644 --- a/packages/kokkos-kernels/lapack/tpls/KokkosLapack_gesv_tpl_spec_decl.hpp +++ b/packages/kokkos-kernels/lapack/tpls/KokkosLapack_gesv_tpl_spec_decl.hpp @@ -197,42 +197,48 @@ void magmaGesvWrapper(const ExecSpace& space, const AViewType& A, const BViewTyp Kokkos::Profiling::popRegion(); } -#define KOKKOSLAPACK_GESV_MAGMA(SCALAR, LAYOUT, MEM_SPACE) \ - template <> \ - struct GESV, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, \ - gesv_eti_spec_avail, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>>::value> { \ - using AViewType = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using BViewType = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using PViewType = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - \ - static void gesv(const Kokkos::Cuda& space, const AViewType& A, const BViewType& B, const PViewType& IPIV) { \ - magmaGesvWrapper(space, A, B, IPIV); \ - } \ +#define KOKKOSLAPACK_GESV_MAGMA(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct GESV< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, \ + gesv_eti_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>>::value> { \ + using AViewType = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using BViewType = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using PViewType = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + \ + static void gesv(const EXEC_SPACE& space, const AViewType& A, const BViewType& B, const PViewType& IPIV) { \ + magmaGesvWrapper(space, A, B, IPIV); \ + } \ }; -KOKKOSLAPACK_GESV_MAGMA(float, Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSLAPACK_GESV_MAGMA(double, Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSLAPACK_GESV_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSLAPACK_GESV_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) - +#if defined(KOKKOS_ENABLE_CUDA) +KOKKOSLAPACK_GESV_MAGMA(float, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSLAPACK_GESV_MAGMA(double, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSLAPACK_GESV_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSLAPACK_GESV_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) +#endif +#if defined(KOKKOS_ENABLE_HIP) +KOKKOSLAPACK_GESV_MAGMA(float, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSLAPACK_GESV_MAGMA(double, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSLAPACK_GESV_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSLAPACK_GESV_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) +#endif } // namespace Impl } // namespace KokkosLapack #endif // KOKKOSKERNELS_ENABLE_TPL_MAGMA @@ -264,54 +270,54 @@ void cusolverGesvWrapper(const ExecutionSpace& space, const IPIVViewType& IPIV, Kokkos::View info("getrf info"); CudaLapackSingleton& s = CudaLapackSingleton::singleton(); - KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnSetStream(s.handle, space.cuda_stream())); + KOKKOSLAPACK_IMPL_CUSOLVER_SAFE_CALL(cusolverDnSetStream(s.handle, space.cuda_stream())); if constexpr (std::is_same_v) { - KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnSgetrf_bufferSize(s.handle, m, n, A.data(), lda, &lwork)); + KOKKOSLAPACK_IMPL_CUSOLVER_SAFE_CALL(cusolverDnSgetrf_bufferSize(s.handle, m, n, A.data(), lda, &lwork)); Kokkos::View Workspace("getrf workspace", lwork); - KOKKOS_CUSOLVER_SAFE_CALL_IMPL( + KOKKOSLAPACK_IMPL_CUSOLVER_SAFE_CALL( cusolverDnSgetrf(s.handle, m, n, A.data(), lda, Workspace.data(), IPIV.data(), info.data())); - KOKKOS_CUSOLVER_SAFE_CALL_IMPL( + KOKKOSLAPACK_IMPL_CUSOLVER_SAFE_CALL( cusolverDnSgetrs(s.handle, CUBLAS_OP_N, m, nrhs, A.data(), lda, IPIV.data(), B.data(), ldb, info.data())); } if constexpr (std::is_same_v) { - KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnDgetrf_bufferSize(s.handle, m, n, A.data(), lda, &lwork)); + KOKKOSLAPACK_IMPL_CUSOLVER_SAFE_CALL(cusolverDnDgetrf_bufferSize(s.handle, m, n, A.data(), lda, &lwork)); Kokkos::View Workspace("getrf workspace", lwork); - KOKKOS_CUSOLVER_SAFE_CALL_IMPL( + KOKKOSLAPACK_IMPL_CUSOLVER_SAFE_CALL( cusolverDnDgetrf(s.handle, m, n, A.data(), lda, Workspace.data(), IPIV.data(), info.data())); - KOKKOS_CUSOLVER_SAFE_CALL_IMPL( + KOKKOSLAPACK_IMPL_CUSOLVER_SAFE_CALL( cusolverDnDgetrs(s.handle, CUBLAS_OP_N, m, nrhs, A.data(), lda, IPIV.data(), B.data(), ldb, info.data())); } if constexpr (std::is_same_v>) { - KOKKOS_CUSOLVER_SAFE_CALL_IMPL( + KOKKOSLAPACK_IMPL_CUSOLVER_SAFE_CALL( cusolverDnCgetrf_bufferSize(s.handle, m, n, reinterpret_cast(A.data()), lda, &lwork)); Kokkos::View Workspace("getrf workspace", lwork); - KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnCgetrf(s.handle, m, n, reinterpret_cast(A.data()), lda, - reinterpret_cast(Workspace.data()), IPIV.data(), - info.data())); + KOKKOSLAPACK_IMPL_CUSOLVER_SAFE_CALL(cusolverDnCgetrf(s.handle, m, n, reinterpret_cast(A.data()), lda, + reinterpret_cast(Workspace.data()), IPIV.data(), + info.data())); - KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnCgetrs(s.handle, CUBLAS_OP_N, m, nrhs, - reinterpret_cast(A.data()), lda, IPIV.data(), - reinterpret_cast(B.data()), ldb, info.data())); + KOKKOSLAPACK_IMPL_CUSOLVER_SAFE_CALL(cusolverDnCgetrs(s.handle, CUBLAS_OP_N, m, nrhs, + reinterpret_cast(A.data()), lda, IPIV.data(), + reinterpret_cast(B.data()), ldb, info.data())); } if constexpr (std::is_same_v>) { - KOKKOS_CUSOLVER_SAFE_CALL_IMPL( + KOKKOSLAPACK_IMPL_CUSOLVER_SAFE_CALL( cusolverDnZgetrf_bufferSize(s.handle, m, n, reinterpret_cast(A.data()), lda, &lwork)); Kokkos::View Workspace("getrf workspace", lwork); - KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnZgetrf(s.handle, m, n, reinterpret_cast(A.data()), lda, - reinterpret_cast(Workspace.data()), IPIV.data(), - info.data())); + KOKKOSLAPACK_IMPL_CUSOLVER_SAFE_CALL(cusolverDnZgetrf(s.handle, m, n, reinterpret_cast(A.data()), + lda, reinterpret_cast(Workspace.data()), + IPIV.data(), info.data())); - KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnZgetrs(s.handle, CUBLAS_OP_N, m, nrhs, - reinterpret_cast(A.data()), lda, IPIV.data(), - reinterpret_cast(B.data()), ldb, info.data())); + KOKKOSLAPACK_IMPL_CUSOLVER_SAFE_CALL( + cusolverDnZgetrs(s.handle, CUBLAS_OP_N, m, nrhs, reinterpret_cast(A.data()), lda, IPIV.data(), + reinterpret_cast(B.data()), ldb, info.data())); } - KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnSetStream(s.handle, NULL)); + KOKKOSLAPACK_IMPL_CUSOLVER_SAFE_CALL(cusolverDnSetStream(s.handle, NULL)); } #define KOKKOSLAPACK_GESV_CUSOLVER(SCALAR, LAYOUT, MEM_SPACE) \ @@ -385,26 +391,26 @@ void rocsolverGesvWrapper(const ExecutionSpace& space, const IPIVViewType& IPIV, Kokkos::View info("rocsolver info"); KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, space.hip_stream())); if constexpr (std::is_same_v) { - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL( rocsolver_sgesv(s.handle, N, nrhs, A.data(), lda, IPIV.data(), B.data(), ldb, info.data())); } if constexpr (std::is_same_v) { - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL( rocsolver_dgesv(s.handle, N, nrhs, A.data(), lda, IPIV.data(), B.data(), ldb, info.data())); } if constexpr (std::is_same_v>) { - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocsolver_cgesv(s.handle, N, nrhs, reinterpret_cast(A.data()), - lda, IPIV.data(), reinterpret_cast(B.data()), - ldb, info.data())); + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL( + rocsolver_cgesv(s.handle, N, nrhs, reinterpret_cast(A.data()), lda, IPIV.data(), + reinterpret_cast(B.data()), ldb, info.data())); } if constexpr (std::is_same_v>) { - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL( rocsolver_zgesv(s.handle, N, nrhs, reinterpret_cast(A.data()), lda, IPIV.data(), reinterpret_cast(B.data()), ldb, info.data())); } - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, NULL)); } #define KOKKOSLAPACK_GESV_ROCSOLVER(SCALAR, LAYOUT, MEM_SPACE) \ diff --git a/packages/kokkos-kernels/lapack/tpls/KokkosLapack_magma.hpp b/packages/kokkos-kernels/lapack/tpls/KokkosLapack_magma.hpp index dfde113fa663..b1b7bb1ab6ae 100644 --- a/packages/kokkos-kernels/lapack/tpls/KokkosLapack_magma.hpp +++ b/packages/kokkos-kernels/lapack/tpls/KokkosLapack_magma.hpp @@ -30,6 +30,11 @@ struct MagmaSingleton { MagmaSingleton(); static MagmaSingleton& singleton(); + + static bool is_initialized(); + + private: + static std::unique_ptr& get_instance(); }; } // namespace Impl diff --git a/packages/kokkos-kernels/lapack/tpls/KokkosLapack_svd_tpl_spec_decl.hpp b/packages/kokkos-kernels/lapack/tpls/KokkosLapack_svd_tpl_spec_decl.hpp index 01255bf427cd..03bc8d9c3025 100644 --- a/packages/kokkos-kernels/lapack/tpls/KokkosLapack_svd_tpl_spec_decl.hpp +++ b/packages/kokkos-kernels/lapack/tpls/KokkosLapack_svd_tpl_spec_decl.hpp @@ -324,42 +324,42 @@ void cusolverSvdWrapper(const ExecutionSpace& space, const char jobu[], const ch Kokkos::View rwork("svd rwork buffer", Kokkos::min(m, n) - 1); CudaLapackSingleton& s = CudaLapackSingleton::singleton(); - KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnSetStream(s.handle, space.cuda_stream())); + KOKKOSLAPACK_IMPL_CUSOLVER_SAFE_CALL(cusolverDnSetStream(s.handle, space.cuda_stream())); if constexpr (std::is_same_v) { - KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnSgesvd_bufferSize(s.handle, m, n, &lwork)); + KOKKOSLAPACK_IMPL_CUSOLVER_SAFE_CALL(cusolverDnSgesvd_bufferSize(s.handle, m, n, &lwork)); Kokkos::View work("svd work buffer", lwork); - KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnSgesvd(s.handle, jobu[0], jobvt[0], m, n, A.data(), lda, S.data(), - U.data(), ldu, Vt.data(), ldvt, work.data(), lwork, rwork.data(), - info.data())); + KOKKOSLAPACK_IMPL_CUSOLVER_SAFE_CALL(cusolverDnSgesvd(s.handle, jobu[0], jobvt[0], m, n, A.data(), lda, S.data(), + U.data(), ldu, Vt.data(), ldvt, work.data(), lwork, + rwork.data(), info.data())); } if constexpr (std::is_same_v) { - KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnDgesvd_bufferSize(s.handle, m, n, &lwork)); + KOKKOSLAPACK_IMPL_CUSOLVER_SAFE_CALL(cusolverDnDgesvd_bufferSize(s.handle, m, n, &lwork)); Kokkos::View work("svd work buffer", lwork); - KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnDgesvd(s.handle, jobu[0], jobvt[0], m, n, A.data(), lda, S.data(), - U.data(), ldu, Vt.data(), ldvt, work.data(), lwork, rwork.data(), - info.data())); + KOKKOSLAPACK_IMPL_CUSOLVER_SAFE_CALL(cusolverDnDgesvd(s.handle, jobu[0], jobvt[0], m, n, A.data(), lda, S.data(), + U.data(), ldu, Vt.data(), ldvt, work.data(), lwork, + rwork.data(), info.data())); } if constexpr (std::is_same_v>) { - KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnCgesvd_bufferSize(s.handle, m, n, &lwork)); + KOKKOSLAPACK_IMPL_CUSOLVER_SAFE_CALL(cusolverDnCgesvd_bufferSize(s.handle, m, n, &lwork)); Kokkos::View work("svd work buffer", lwork); - KOKKOS_CUSOLVER_SAFE_CALL_IMPL( + KOKKOSLAPACK_IMPL_CUSOLVER_SAFE_CALL( cusolverDnCgesvd(s.handle, jobu[0], jobvt[0], m, n, reinterpret_cast(A.data()), lda, S.data(), reinterpret_cast(U.data()), ldu, reinterpret_cast(Vt.data()), ldvt, reinterpret_cast(work.data()), lwork, rwork.data(), info.data())); } if constexpr (std::is_same_v>) { - KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnZgesvd_bufferSize(s.handle, m, n, &lwork)); + KOKKOSLAPACK_IMPL_CUSOLVER_SAFE_CALL(cusolverDnZgesvd_bufferSize(s.handle, m, n, &lwork)); Kokkos::View work("svd work buffer", lwork); - KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnZgesvd( + KOKKOSLAPACK_IMPL_CUSOLVER_SAFE_CALL(cusolverDnZgesvd( s.handle, jobu[0], jobvt[0], m, n, reinterpret_cast(A.data()), lda, S.data(), reinterpret_cast(U.data()), ldu, reinterpret_cast(Vt.data()), ldvt, reinterpret_cast(work.data()), lwork, rwork.data(), info.data())); } - KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnSetStream(s.handle, NULL)); + KOKKOSLAPACK_IMPL_CUSOLVER_SAFE_CALL(cusolverDnSetStream(s.handle, NULL)); } #define KOKKOSLAPACK_SVD_CUSOLVER(SCALAR, LAYOUT, MEM_SPACE) \ @@ -473,30 +473,30 @@ void rocsolverSvdWrapper(const ExecutionSpace& space, const char jobu[], const c Kokkos::View rwork("svd rwork buffer", Kokkos::min(m, n) - 1); KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, space.hip_stream())); if constexpr (std::is_same_v) { - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocsolver_sgesvd(s.handle, UVecMode, VVecMode, m, n, A.data(), lda, S.data(), - U.data(), ldu, Vt.data(), ldvt, rwork.data(), WorkMode, - info.data())); + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocsolver_sgesvd(s.handle, UVecMode, VVecMode, m, n, A.data(), lda, S.data(), + U.data(), ldu, Vt.data(), ldvt, rwork.data(), WorkMode, + info.data())); } if constexpr (std::is_same_v) { - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocsolver_dgesvd(s.handle, UVecMode, VVecMode, m, n, A.data(), lda, S.data(), - U.data(), ldu, Vt.data(), ldvt, rwork.data(), WorkMode, - info.data())); + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocsolver_dgesvd(s.handle, UVecMode, VVecMode, m, n, A.data(), lda, S.data(), + U.data(), ldu, Vt.data(), ldvt, rwork.data(), WorkMode, + info.data())); } if constexpr (std::is_same_v>) { - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocsolver_cgesvd( + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocsolver_cgesvd( s.handle, UVecMode, VVecMode, m, n, reinterpret_cast(A.data()), lda, S.data(), reinterpret_cast(U.data()), ldu, reinterpret_cast(Vt.data()), ldvt, rwork.data(), WorkMode, info.data())); } if constexpr (std::is_same_v>) { - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocsolver_zgesvd( + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocsolver_zgesvd( s.handle, UVecMode, VVecMode, m, n, reinterpret_cast(A.data()), lda, S.data(), reinterpret_cast(U.data()), ldu, reinterpret_cast(Vt.data()), ldvt, rwork.data(), WorkMode, info.data())); } - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, NULL)); } #define KOKKOSLAPACK_SVD_ROCSOLVER(SCALAR, LAYOUT, MEM_SPACE) \ diff --git a/packages/kokkos-kernels/lapack/unit_test/Test_Lapack_gesv.hpp b/packages/kokkos-kernels/lapack/unit_test/Test_Lapack_gesv.hpp index 653ed2cbf26b..fb3f37192707 100644 --- a/packages/kokkos-kernels/lapack/unit_test/Test_Lapack_gesv.hpp +++ b/packages/kokkos-kernels/lapack/unit_test/Test_Lapack_gesv.hpp @@ -15,11 +15,12 @@ //@HEADER // only enable this test where KokkosLapack supports gesv: -// CUDA+(MAGMA or CUSOLVER), HIP+ROCSOLVER and HOST+LAPACK -#if (defined(TEST_CUDA_LAPACK_CPP) && \ - (defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) || defined(KOKKOSKERNELS_ENABLE_TPL_CUSOLVER))) || \ - (defined(TEST_HIP_LAPACK_CPP) && defined(KOKKOSKERNELS_ENABLE_TPL_ROCSOLVER)) || \ - (defined(KOKKOSKERNELS_ENABLE_TPL_LAPACK) && \ +// CUDA+(MAGMA or CUSOLVER), HIP+(MAGMA or ROCSOLVER) and HOST+LAPACK +#if (defined(TEST_CUDA_LAPACK_CPP) && \ + (defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) || defined(KOKKOSKERNELS_ENABLE_TPL_CUSOLVER))) || \ + (defined(TEST_HIP_LAPACK_CPP) && \ + (defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) || defined(KOKKOSKERNELS_ENABLE_TPL_ROCSOLVER))) || \ + (defined(KOKKOSKERNELS_ENABLE_TPL_LAPACK) && \ (defined(TEST_OPENMP_LAPACK_CPP) || defined(TEST_SERIAL_LAPACK_CPP) || defined(TEST_THREADS_LAPACK_CPP))) #include @@ -97,8 +98,13 @@ void impl_test_gesv(const char* mode, const char* padding, int N) { bool notpl_runtime_err = false; #ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA // have MAGMA TPL #ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK // and have LAPACK TPL +#if defined(KOKKOS_ENABLE_CUDA) nopivot_runtime_err = (!std::is_same::value) && (ipiv.extent(0) == 0) && (ipiv.data() == nullptr); +#elif defined(KOKKOS_ENABLE_HIP) + nopivot_runtime_err = (!std::is_same::value) && + (ipiv.extent(0) == 0) && (ipiv.data() == nullptr); +#endif notpl_runtime_err = false; #else notpl_runtime_err = true; @@ -200,8 +206,13 @@ void impl_test_gesv_mrhs(const char* mode, const char* padding, int N, int nrhs) bool notpl_runtime_err = false; #ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA // have MAGMA TPL #ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK // and have LAPACK TPL +#if defined(KOKKOS_ENABLE_CUDA) nopivot_runtime_err = (!std::is_same::value) && (ipiv.extent(0) == 0) && (ipiv.data() == nullptr); +#elif defined(KOKKOS_ENABLE_HIP) + nopivot_runtime_err = (!std::is_same::value) && + (ipiv.extent(0) == 0) && (ipiv.data() == nullptr); +#endif notpl_runtime_err = false; #else notpl_runtime_err = true; @@ -222,9 +233,9 @@ void impl_test_gesv_mrhs(const char* mode, const char* padding, int N, int nrhs) // Get the solution vector. Kokkos::deep_copy(h_B, B); - // Checking vs ref on CPU, this eps is about 10^-9 + // Checking vs ref on CPU, this eps is about 10^-8 typedef typename ats::mag_type mag_type; - const mag_type eps = 1.0e7 * ats::epsilon(); + const mag_type eps = 1.0e8 * ats::epsilon(); bool test_flag = true; for (int j = 0; j < nrhs; j++) { for (int i = 0; i < N; i++) { @@ -268,6 +279,19 @@ int test_gesv(const char* mode) { Test::impl_test_gesv(&mode[0], "N", 64); // no padding Test::impl_test_gesv(&mode[0], "N", 1024); // no padding + Test::impl_test_gesv(&mode[0], "Y", + 13); // padding + Test::impl_test_gesv(&mode[0], "Y", + 179); // padding + } +#elif defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) && defined(KOKKOS_ENABLE_HIP) + if constexpr (std::is_same_v) { + Test::impl_test_gesv(&mode[0], "N", 2); // no padding + Test::impl_test_gesv(&mode[0], "N", 13); // no padding + Test::impl_test_gesv(&mode[0], "N", 179); // no padding + Test::impl_test_gesv(&mode[0], "N", 64); // no padding + Test::impl_test_gesv(&mode[0], "N", 1024); // no padding + Test::impl_test_gesv(&mode[0], "Y", 13); // padding Test::impl_test_gesv(&mode[0], "Y", @@ -307,6 +331,17 @@ int test_gesv_mrhs(const char* mode) { Test::impl_test_gesv_mrhs(&mode[0], "N", 64, 5); // no padding Test::impl_test_gesv_mrhs(&mode[0], "N", 1024, 5); // no padding + Test::impl_test_gesv_mrhs(&mode[0], "Y", 13, 5); // padding + Test::impl_test_gesv_mrhs(&mode[0], "Y", 179, 5); // padding + } +#elif defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) && defined(KOKKOS_ENABLE_HIP) + if constexpr (std::is_same_v) { + Test::impl_test_gesv_mrhs(&mode[0], "N", 2, 5); // no padding + Test::impl_test_gesv_mrhs(&mode[0], "N", 13, 5); // no padding + Test::impl_test_gesv_mrhs(&mode[0], "N", 179, 5); // no padding + Test::impl_test_gesv_mrhs(&mode[0], "N", 64, 5); // no padding + Test::impl_test_gesv_mrhs(&mode[0], "N", 1024, 5); // no padding + Test::impl_test_gesv_mrhs(&mode[0], "Y", 13, 5); // padding Test::impl_test_gesv_mrhs(&mode[0], "Y", 179, 5); // padding } diff --git a/packages/kokkos-kernels/master_history.txt b/packages/kokkos-kernels/master_history.txt index c712462dd81e..a02c157740d6 100644 --- a/packages/kokkos-kernels/master_history.txt +++ b/packages/kokkos-kernels/master_history.txt @@ -28,3 +28,4 @@ tag: 4.3.00 date: 04/03/2024 master: afd65f03 release: ebbf4b78 tag: 4.3.01 date: 05/07/2024 master: 1b0a15f5 release: 58785c1b tag: 4.4.00 date: 08/08/2024 master: d1a91b8a release: 1145f529 tag: 4.4.01 date: 09/12/2024 master: 0608a337 release: 6b340287 +tag: 4.5.00 date: 11/11/2024 master: 0b43169e release: 4a7590af diff --git a/packages/kokkos-kernels/ode/impl/KokkosODE_BDF_impl.hpp b/packages/kokkos-kernels/ode/impl/KokkosODE_BDF_impl.hpp index 3119ff0e3aa4..3e817563da17 100644 --- a/packages/kokkos-kernels/ode/impl/KokkosODE_BDF_impl.hpp +++ b/packages/kokkos-kernels/ode/impl/KokkosODE_BDF_impl.hpp @@ -142,7 +142,7 @@ struct BDF_system_wrapper2 { if (compute_jac) { mySys.evaluate_jacobian(t, dt, y, jac); - // J = I - dt*(dy/dy) + // J = I - dt*(df/dy) for (int rowIdx = 0; rowIdx < neqs; ++rowIdx) { for (int colIdx = 0; colIdx < neqs; ++colIdx) { jac(rowIdx, colIdx) = -dt * jac(rowIdx, colIdx); diff --git a/packages/kokkos-kernels/ode/impl/KokkosODE_RungeKuttaTables_impl.hpp b/packages/kokkos-kernels/ode/impl/KokkosODE_RungeKuttaTables_impl.hpp index 6a0770d1a7ab..f5b844a35808 100644 --- a/packages/kokkos-kernels/ode/impl/KokkosODE_RungeKuttaTables_impl.hpp +++ b/packages/kokkos-kernels/ode/impl/KokkosODE_RungeKuttaTables_impl.hpp @@ -257,6 +257,59 @@ struct ButcherTableau<4, 6> // Referred to as DOPRI5 or RKDP 11.0 / 84.0 - 187.0 / 2100.0, -1.0 / 40.0}}; }; +// Coefficients obtained from: +// J. H. Verner +// "Explicit Runge-Kutta methods with estimates of the local truncation error", +// Journal of Numerical Analysis, Volume 15, Issue 4, 1978, +// https://doi.org/10.1137/0715051. +template <> +struct ButcherTableau<5, 7> // Referred to as Verner 5-6 or VER56 +{ + static constexpr int order = 6; + static constexpr int nstages = 8; + Kokkos::Array a{{0.0, + 1.0 / 6.0, + 0.0, + 4.0 / 75.0, + 16.0 / 75.0, + 0.0, + 5.0 / 6.0, + -8.0 / 3.0, + 5.0 / 2.0, + 0.0, + -165.0 / 64.0, + 55.0 / 6.0, + -425.0 / 64.0, + 85.0 / 96.0, + 0.0, + 12.0 / 5.0, + -8.0, + 4015.0 / 612.0, + -11.0 / 36.0, + 88.0 / 255.0, + 0.0, + -8263.0 / 15000.0, + 124.0 / 75.0, + -643.0 / 680.0, + -81.0 / 250.0, + 2484.0 / 10625.0, + 0.0, + 0.0, + 3501.0 / 1720.0, + -300.0 / 43.0, + 297275.0 / 52632.0, + -319.0 / 2322.0, + 24068.0 / 84065.0, + 3850.0 / 26703.0, + 0.0}}; + Kokkos::Array b{ + {3.0 / 4.0, 0.0, 875.0 / 2244.0, 23.0 / 72.0, 264.0 / 1955.0, 0.0, 125.0 / 11592.0, 43.0 / 616.0}}; + Kokkos::Array c{{0.0, 1.0 / 6.0, 4.0 / 15.0, 2.0 / 3.0, 5.0 / 6.0, 1.0, 1.0 / 15.0, 1.0}}; + Kokkos::Array e{{3.0 / 4.0 - 13.0 / 160.0, 0.0, 875.0 / 2244.0 - 2375.0 / 5984.0, + 23.0 / 72.0 - 5.0 / 16.0, 264.0 / 1955.0 - 12.0 / 85.0, -3.0 / 44.0, + 125.0 / 11592.0, 43.0 / 616.0}}; +}; + } // namespace Impl } // namespace KokkosODE diff --git a/packages/kokkos-kernels/ode/impl/KokkosODE_RungeKutta_impl.hpp b/packages/kokkos-kernels/ode/impl/KokkosODE_RungeKutta_impl.hpp index 83ab76758f96..533914477de9 100644 --- a/packages/kokkos-kernels/ode/impl/KokkosODE_RungeKutta_impl.hpp +++ b/packages/kokkos-kernels/ode/impl/KokkosODE_RungeKutta_impl.hpp @@ -23,19 +23,84 @@ #include "KokkosODE_RungeKuttaTables_impl.hpp" #include "KokkosODE_Types.hpp" +#include "iostream" + namespace KokkosODE { namespace Impl { +// This algorithm is mostly derived from +// E. Hairer, S. P. Norsett G. Wanner, +// "Solving Ordinary Differential Equations I: +// Nonstiff Problems", Sec. II.4. +// Note that all floating point values below +// have been heuristically selected for +// convergence performance. +template +KOKKOS_FUNCTION void first_step_size(const ode_type ode, const int order, const scalar_type t0, const scalar_type atol, + const scalar_type rtol, const vec_type& y0, const res_type& f0, const vec_type y1, + const mat_type temp, scalar_type& dt_ini) { + using KAT = Kokkos::ArithTraits; + + // Extract subviews to store intermediate data + auto f1 = Kokkos::subview(temp, 1, Kokkos::ALL()); + + // Compute norms for y0 and f0 + double n0 = KAT::zero(), n1 = KAT::zero(), dt0, scale; + for (int eqIdx = 0; eqIdx < ode.neqs; ++eqIdx) { + scale = atol + rtol * Kokkos::abs(y0(eqIdx)); + n0 += Kokkos::pow(y0(eqIdx) / scale, 2); + n1 += Kokkos::pow(f0(eqIdx) / scale, 2); + } + n0 = Kokkos::sqrt(n0) / Kokkos::sqrt(ode.neqs); + n1 = Kokkos::sqrt(n1) / Kokkos::sqrt(ode.neqs); + + // Select dt0 + if ((n0 < 1e-5) || (n1 < 1e-5)) { + dt0 = 1e-6; + } else { + dt0 = 0.01 * n0 / n1; + } + + // Estimate y at t0 + dt0 + for (int eqIdx = 0; eqIdx < ode.neqs; ++eqIdx) { + y1(eqIdx) = y0(eqIdx) + dt0 * f0(eqIdx); + } + + // Compute f at t0+dt0 and y1, + // then compute the norm of f(t0+dt0, y1) - f(t0, y0) + scalar_type n2 = KAT::zero(); + ode.evaluate_function(t0 + dt0, dt0, y1, f1); + for (int eqIdx = 0; eqIdx < ode.neqs; ++eqIdx) { + n2 += Kokkos::pow((f1(eqIdx) - f0(eqIdx)) / (atol + rtol * Kokkos::abs(y0(eqIdx))), 2); + } + n2 = Kokkos::sqrt(n2) / (dt0 * Kokkos::sqrt(ode.neqs)); + + // Finally select initial time step dt_ini + if ((n1 <= 1e-15) && (n2 <= 1e-15)) { + dt_ini = Kokkos::max(1e-6, dt0 * 1e-3); + } else { + dt_ini = Kokkos::pow(0.01 / Kokkos::max(n1, n2), KAT::one() / order); + } + + dt_ini = Kokkos::min(100 * dt0, dt_ini); + + // Zero out temp variables just to be safe... + for (int eqIdx = 0; eqIdx < ode.neqs; ++eqIdx) { + f0(eqIdx) = 0.0; + y1(eqIdx) = 0.0; + f1(eqIdx) = 0.0; + } +} // first_step_size + // y_new = y_old + dt*sum(b_i*k_i) i in [1, nstages] // k_i = f(t+c_i*dt, y_old+sum(a_{ij}*k_i)) j in [1, i-1] // we need to compute the k_i and store them as we go // to use them for k_{i+1} computation. template -KOKKOS_FUNCTION void RKStep(ode_type& ode, const table_type& table, const bool adaptivity, scalar_type t, - scalar_type dt, const vec_type& y_old, const vec_type& y_new, const vec_type& temp, - const mv_type& k_vecs) { - const int neqs = ode.neqs; - const int nstages = table.nstages; +KOKKOS_FUNCTION void RKStep(ode_type& ode, const table_type& table, scalar_type t, scalar_type dt, + const vec_type& y_old, const vec_type& y_new, const vec_type& temp, const mv_type& k_vecs) { + const int neqs = ode.neqs; + constexpr int nstages = table_type::nstages; // first set y_new = y_old for (int eqIdx = 0; eqIdx < ode.neqs; ++eqIdx) { @@ -72,34 +137,42 @@ KOKKOS_FUNCTION void RKStep(ode_type& ode, const table_type& table, const bool a y_new(eqIdx) += dt * table.b[stageIdx] * k(eqIdx); } } - - // Compute estimation of the error using k_vecs and table.e - if (adaptivity == true) { - for (int eqIdx = 0; eqIdx < neqs; ++eqIdx) { - temp(eqIdx) = 0; - for (int stageIdx = 0; stageIdx < nstages; ++stageIdx) { - temp(eqIdx) += dt * table.e[stageIdx] * k_vecs(stageIdx, eqIdx); - } - } - } } // RKStep +// Note that the control values for +// time step increase/decrease are +// heuristically chosen based on +// L. F. Shampine and M. W. Reichelt +// "The Matlab ODE suite" SIAM J. Sci. +// Comput. Vol. 18, No. 1, pp. 1-22 +// Jan. 1997 template KOKKOS_FUNCTION Experimental::ode_solver_status RKSolve(const ode_type& ode, const table_type& table, const KokkosODE::Experimental::ODE_params& params, const scalar_type t_start, const scalar_type t_end, const vec_type& y0, const vec_type& y, const vec_type& temp, - const mv_type& k_vecs) { + const mv_type& k_vecs, int* const step_count) { constexpr scalar_type error_threshold = 1; - bool adapt = params.adaptivity; + scalar_type error_n; + bool adapt = params.adaptivity; bool dt_was_reduced; - if (std::is_same_v>) { + if constexpr (std::is_same_v>) { adapt = false; } // Set current time and initial time step - scalar_type t_now = t_start; - scalar_type dt = (t_end - t_start) / params.max_steps; + scalar_type t_now = t_start, dt = 0.0; + if (adapt == true) { + ode.evaluate_function(t_start, 0, y0, temp); + first_step_size(ode, table_type::order, t_start, params.abs_tol, params.rel_tol, y0, temp, y, k_vecs, dt); + if (dt < params.min_step_size) { + dt = params.min_step_size; + } + } else { + dt = (t_end - t_start) / params.num_steps; + } + + *step_count = 0; // Loop over time steps to integrate ODE for (int stepIdx = 0; (stepIdx < params.max_steps) && (t_now <= t_end); ++stepIdx) { @@ -119,28 +192,33 @@ KOKKOS_FUNCTION Experimental::ode_solver_status RKSolve(const ode_type& ode, con // Take tentative steps until the requested error // is met. This of course only works for adaptive // solvers, for fix time steps we simply do not - // compute and check what error of the current step + // compute and check the error of the current step while (error_threshold < error) { // Take a step of Runge-Kutta integrator - RKStep(ode, table, adapt, t_now, dt, y0, y, temp, k_vecs); + RKStep(ode, table, t_now, dt, y0, y, temp, k_vecs); // Compute the largest error and decide on // the size of the next time step to take. error = 0; - if (adapt) { + + // Compute estimation of the error using k_vecs and table.e + if (adapt == true) { // Compute the error for (int eqIdx = 0; eqIdx < ode.neqs; ++eqIdx) { - error = Kokkos::max(error, Kokkos::abs(temp(eqIdx))); - tol = Kokkos::max( - tol, params.abs_tol + params.rel_tol * Kokkos::max(Kokkos::abs(y(eqIdx)), Kokkos::abs(y0(eqIdx)))); + tol = params.abs_tol + params.rel_tol * Kokkos::max(Kokkos::abs(y(eqIdx)), Kokkos::abs(y0(eqIdx))); + error_n = 0; + for (int stageIdx = 0; stageIdx < table.nstages; ++stageIdx) { + error_n += dt * table.e[stageIdx] * k_vecs(stageIdx, eqIdx); + } + error += (error_n * error_n) / (tol * tol); } - error = error / tol; + error = Kokkos::sqrt(error / ode.neqs); // Reduce the time step if error // is too large and current step // is rejected. if (error > 1) { - dt = dt * Kokkos::max(0.2, 0.8 / Kokkos::pow(error, 1 / table.order)); + dt = dt * Kokkos::max(0.2, 0.8 * Kokkos::pow(error, -1.0 / table.order)); dt_was_reduced = true; } @@ -150,6 +228,7 @@ KOKKOS_FUNCTION Experimental::ode_solver_status RKSolve(const ode_type& ode, con // Update time and initial condition for next time step t_now += dt; + *step_count += 1; for (int eqIdx = 0; eqIdx < ode.neqs; ++eqIdx) { y0(eqIdx) = y(eqIdx); } @@ -157,7 +236,7 @@ KOKKOS_FUNCTION Experimental::ode_solver_status RKSolve(const ode_type& ode, con if (t_now < t_end) { if (adapt && !dt_was_reduced && error < 0.5) { // Compute new time increment - dt = dt * Kokkos::min(10.0, Kokkos::max(2.0, 0.9 * Kokkos::pow(error, 1 / table.order))); + dt = dt * Kokkos::min(10.0, Kokkos::max(2.0, 0.9 * Kokkos::pow(error, -1.0 / table.order))); } } else { return Experimental::ode_solver_status::SUCCESS; diff --git a/packages/kokkos-kernels/ode/src/KokkosODE_BDF.hpp b/packages/kokkos-kernels/ode/src/KokkosODE_BDF.hpp index 419316ba4539..2afb6e46e2b4 100644 --- a/packages/kokkos-kernels/ode/src/KokkosODE_BDF.hpp +++ b/packages/kokkos-kernels/ode/src/KokkosODE_BDF.hpp @@ -93,6 +93,7 @@ struct BDF { const double dt = (t_end - t_start) / num_steps; double t = t_start; + int count = 0; // Load y0 into y_vecs(:, 0) for (int eqIdx = 0; eqIdx < ode.neqs; ++eqIdx) { @@ -107,7 +108,7 @@ struct BDF { } KokkosODE::Experimental::ODE_params params(table.order - 1); for (int stepIdx = 0; stepIdx < init_steps; ++stepIdx) { - KokkosODE::Experimental::RungeKutta::Solve(ode, params, t, t + dt, y0, y, update, kstack); + KokkosODE::Experimental::RungeKutta::Solve(ode, params, t, t + dt, y0, y, update, kstack, &count); for (int eqIdx = 0; eqIdx < ode.neqs; ++eqIdx) { y_vecs(eqIdx, stepIdx + 1) = y(eqIdx); diff --git a/packages/kokkos-kernels/ode/src/KokkosODE_RungeKutta.hpp b/packages/kokkos-kernels/ode/src/KokkosODE_RungeKutta.hpp index 2d298a65689c..39c1800b6ca1 100644 --- a/packages/kokkos-kernels/ode/src/KokkosODE_RungeKutta.hpp +++ b/packages/kokkos-kernels/ode/src/KokkosODE_RungeKutta.hpp @@ -38,7 +38,8 @@ enum RK_type : int { RK4 = 4, ///< Runge-Kutta classic order 4 method RKF45 = 5, ///< Fehlberg order 5 method RKCK = 6, ///< Cash-Karp method - RKDP = 7 ///< Dormand-Prince method + RKDP = 7, ///< Dormand-Prince method + VER56 = 8 ///< Verner order 6 method }; template @@ -86,6 +87,11 @@ struct RK_Tableau_helper { using table_type = KokkosODE::Impl::ButcherTableau<4, 6>; }; +template <> +struct RK_Tableau_helper { + using table_type = KokkosODE::Impl::ButcherTableau<5, 7>; +}; + /// \brief Unspecialized version of the RungeKutta solvers /// /// \tparam RK_type an RK_type enum value used to specify @@ -128,9 +134,10 @@ struct RungeKutta { template KOKKOS_FUNCTION static ode_solver_status Solve(const ode_type& ode, const KokkosODE::Experimental::ODE_params& params, const scalar_type t_start, const scalar_type t_end, const vec_type& y0, - const vec_type& y, const vec_type& temp, const mv_type& k_vecs) { + const vec_type& y, const vec_type& temp, const mv_type& k_vecs, + int* const count) { table_type table; - return KokkosODE::Impl::RKSolve(ode, table, params, t_start, t_end, y0, y, temp, k_vecs); + return KokkosODE::Impl::RKSolve(ode, table, params, t_start, t_end, y0, y, temp, k_vecs, count); } }; diff --git a/packages/kokkos-kernels/ode/src/KokkosODE_Types.hpp b/packages/kokkos-kernels/ode/src/KokkosODE_Types.hpp index 2145afb71823..e28bcb82f020 100644 --- a/packages/kokkos-kernels/ode/src/KokkosODE_Types.hpp +++ b/packages/kokkos-kernels/ode/src/KokkosODE_Types.hpp @@ -27,12 +27,21 @@ struct ODE_params { int num_steps, max_steps; double abs_tol, rel_tol, min_step_size; + KOKKOS_FUNCTION + ODE_params() + : adaptivity(true), num_steps(100), max_steps(10000), abs_tol(1e-12), rel_tol(1e-6), min_step_size(1e-9) {} + // Constructor that only specify the desired number of steps. // In this case no adaptivity is provided, the time step will // be constant such that dt = (tend - tstart) / num_steps; KOKKOS_FUNCTION ODE_params(const int num_steps_) - : adaptivity(false), num_steps(num_steps_), max_steps(num_steps_), abs_tol(0), rel_tol(0), min_step_size(0) {} + : adaptivity(false), + num_steps(num_steps_), + max_steps(num_steps_ + 1), + abs_tol(1e-12), + rel_tol(1e-6), + min_step_size(0) {} /// ODE_parms construtor for adaptive time stepping. KOKKOS_FUNCTION diff --git a/packages/kokkos-kernels/ode/unit_test/Test_ODE.hpp b/packages/kokkos-kernels/ode/unit_test/Test_ODE.hpp index 1b55171581ee..f30ff39bd2d1 100644 --- a/packages/kokkos-kernels/ode/unit_test/Test_ODE.hpp +++ b/packages/kokkos-kernels/ode/unit_test/Test_ODE.hpp @@ -19,6 +19,7 @@ // Explicit integrators #include "Test_ODE_RK.hpp" #include "Test_ODE_RK_chem.hpp" +#include "Test_ODE_RK_counts.hpp" // Implicit integrators #include "Test_ODE_Newton.hpp" diff --git a/packages/kokkos-kernels/ode/unit_test/Test_ODE_BDF.hpp b/packages/kokkos-kernels/ode/unit_test/Test_ODE_BDF.hpp index 8f8319cb1df8..dfee9b62a043 100644 --- a/packages/kokkos-kernels/ode/unit_test/Test_ODE_BDF.hpp +++ b/packages/kokkos-kernels/ode/unit_test/Test_ODE_BDF.hpp @@ -545,7 +545,7 @@ void update_D(const int order, const scalar_type factor, const mat_type& coeffs, template void test_Nordsieck() { using execution_space = Kokkos::HostSpace; - StiffChemistry mySys{}; + [[maybe_unused]] StiffChemistry mySys{}; Kokkos::View R("coeffs", 6, 6), U("coeffs", 6, 6); Kokkos::View D("D", 8, mySys.neqs), tempD("tmp", 8, mySys.neqs); diff --git a/packages/kokkos-kernels/ode/unit_test/Test_ODE_RK.hpp b/packages/kokkos-kernels/ode/unit_test/Test_ODE_RK.hpp index 90bec0e1841d..6d6f7877fe00 100644 --- a/packages/kokkos-kernels/ode/unit_test/Test_ODE_RK.hpp +++ b/packages/kokkos-kernels/ode/unit_test/Test_ODE_RK.hpp @@ -21,12 +21,14 @@ namespace Test { -// damped harmonic undriven oscillator +// damped undriven harmonic oscillator // m y'' + c y' + k y = 0 -// solution: y=A * exp(-xi * omega_0 * t) * sin(sqrt(1-xi^2) * omega_0 * t + -// phi) omega_0 = sqrt(k/m); xi = c / sqrt(4*m*k) A and phi depend on y(0) and -// y'(0); Change of variables: x(t) = y(t)*exp(-c/(2m)*t) = y(t)*exp(-xi * -// omega_0 * t) Change of variables: X = [x ] +// solution: y=A * exp(-xi * omega_0 * t) * sin(sqrt(1-xi^2) * omega_0 * t + phi) +// omega_0 = sqrt(k/m) +// xi = c / sqrt(4*m*k) +// A and phi depend on y(0) and y'(0); +// Change of variables: x(t) = y(t)*exp(-c/(2m)*t) = y(t)*exp(-xi * omega_0 * t) +// Change of variables: X = [x ] // [x'] // Leads to X' = A*X with A = [ 0 1] // [-d 0] @@ -74,7 +76,8 @@ struct solution_wrapper { void operator()(const int /*idx*/) const { ode.solution(t, y_old, y_ref); } }; -template +template struct RKSolve_wrapper { using ode_params = KokkosODE::Experimental::ODE_params; @@ -84,10 +87,11 @@ struct RKSolve_wrapper { int max_steps; vec_type y_old, y_new, tmp; mv_type kstack; + count_type count; RKSolve_wrapper(const ode_type& my_ode_, const ode_params& params_, const scalar_type tstart_, const scalar_type tend_, const vec_type& y_old_, const vec_type& y_new_, const vec_type& tmp_, - const mv_type& kstack_) + const mv_type& kstack_, const count_type& count_) : my_ode(my_ode_), params(params_), tstart(tstart_), @@ -95,11 +99,13 @@ struct RKSolve_wrapper { y_old(y_old_), y_new(y_new_), tmp(tmp_), - kstack(kstack_) {} + kstack(kstack_), + count(count_) {} KOKKOS_FUNCTION void operator()(const int /*idx*/) const { - KokkosODE::Experimental::RungeKutta::Solve(my_ode, params, tstart, tend, y_old, y_new, tmp, kstack); + KokkosODE::Experimental::RungeKutta::Solve(my_ode, params, tstart, tend, y_old, y_new, tmp, kstack, + count.data()); } }; @@ -110,14 +116,16 @@ void test_method(const std::string label, ode_type& my_ode, const scalar_type& t const Kokkos::View& sol, typename vec_type::HostMirror y_ref_h) { using execution_space = typename vec_type::execution_space; using solver_type = KokkosODE::Experimental::RungeKutta; + using count_type = Kokkos::View; KokkosODE::Experimental::ODE_params params(num_steps); vec_type tmp("tmp vector", my_ode.neqs); mv_type kstack("k stack", solver_type::num_stages(), my_ode.neqs); + count_type count("time step count", 1); Kokkos::RangePolicy my_policy(0, 1); - RKSolve_wrapper solve_wrapper(my_ode, params, tstart, tend, y_old, - y_new, tmp, kstack); + RKSolve_wrapper solve_wrapper( + my_ode, params, tstart, tend, y_old, y_new, tmp, kstack, count); Kokkos::parallel_for(my_policy, solve_wrapper); auto y_new_h = Kokkos::create_mirror_view(y_new); @@ -284,7 +292,9 @@ void test_rate(ode_type& my_ode, const scalar_type& tstart, const scalar_type& t typename vec_type::HostMirror& y_ref_h, typename vec_type::HostMirror& error) { using execution_space = typename vec_type::execution_space; using solver_type = KokkosODE::Experimental::RungeKutta; + using count_type = Kokkos::View; + count_type count("time step count", 1); vec_type tmp("tmp vector", my_ode.neqs); mv_type kstack("k stack", solver_type::num_stages(), my_ode.neqs); @@ -295,10 +305,11 @@ void test_rate(ode_type& my_ode, const scalar_type& tstart, const scalar_type& t Kokkos::RangePolicy my_policy(0, 1); for (int idx = 0; idx < num_steps.extent_int(0); ++idx) { KokkosODE::Experimental::ODE_params params(num_steps(idx)); + params.adaptivity = false; Kokkos::deep_copy(y_old, y_old_h); Kokkos::deep_copy(y_new, y_old_h); - RKSolve_wrapper solve_wrapper(my_ode, params, tstart, tend, - y_old, y_new, tmp, kstack); + RKSolve_wrapper solve_wrapper( + my_ode, params, tstart, tend, y_old, y_new, tmp, kstack, count); Kokkos::parallel_for(my_policy, solve_wrapper); Kokkos::deep_copy(y_new_h, y_new); @@ -306,8 +317,8 @@ void test_rate(ode_type& my_ode, const scalar_type& tstart, const scalar_type& t #if defined(HAVE_KOKKOSKERNELS_DEBUG) scalar_type dt = (tend - tstart) / num_steps(idx); - std::cout << "dt=" << dt << ", error=" << error(idx) << ", solution: {" << y_new_h(0) << ", " << y_new_h(1) << "}" - << std::endl; + std::cout << "count=" << count(0) << ", dt=" << dt << ", error=" << error(idx) << ", solution: {" << y_new_h(0) + << ", " << y_new_h(1) << "}" << std::endl; #endif } @@ -424,9 +435,11 @@ void test_adaptivity() { using RK_type = KokkosODE::Experimental::RK_type; using vec_type = Kokkos::View; using mv_type = Kokkos::View; + using count_type = Kokkos::View; duho my_oscillator(1, 1, 4); const int neqs = my_oscillator.neqs; + count_type count("time step count", 1); vec_type y("solution", neqs), f("function", neqs); auto y_h = Kokkos::create_mirror(y); @@ -471,8 +484,8 @@ void test_adaptivity() { KokkosODE::Experimental::ODE_params params(numSteps, maxSteps, absTol, relTol, minStepSize); Kokkos::deep_copy(y_old, y_old_h); Kokkos::deep_copy(y_new, y_old_h); - RKSolve_wrapper solve_wrapper(my_oscillator, params, tstart, tend, - y_old, y_new, tmp, kstack); + RKSolve_wrapper solve_wrapper( + my_oscillator, params, tstart, tend, y_old, y_new, tmp, kstack, count); Kokkos::parallel_for(my_policy, solve_wrapper); auto y_new_h = Kokkos::create_mirror(y_new); diff --git a/packages/kokkos-kernels/ode/unit_test/Test_ODE_RK_chem.hpp b/packages/kokkos-kernels/ode/unit_test/Test_ODE_RK_chem.hpp index 690e271c84f4..aabdcbb490b1 100644 --- a/packages/kokkos-kernels/ode/unit_test/Test_ODE_RK_chem.hpp +++ b/packages/kokkos-kernels/ode/unit_test/Test_ODE_RK_chem.hpp @@ -92,6 +92,7 @@ void test_chem() { using mv_type = Kokkos::View; using RK_type = KokkosODE::Experimental::RK_type; using solver_type = KokkosODE::Experimental::RungeKutta; + using count_type = Kokkos::View; { chem_model_1 chem_model; @@ -101,6 +102,7 @@ void test_chem() { KokkosODE::Experimental::ODE_params params(num_steps); vec_type tmp("tmp vector", neqs); mv_type kstack("k stack", solver_type::num_stages(), neqs); + count_type count("time steps count", 1); // Set initial conditions vec_type y_new("solution", neqs); @@ -112,8 +114,8 @@ void test_chem() { Kokkos::deep_copy(y_new, y_old_h); Kokkos::RangePolicy my_policy(0, 1); - RKSolve_wrapper solve_wrapper( - chem_model, params, chem_model.tstart, chem_model.tend, y_old, y_new, tmp, kstack); + RKSolve_wrapper solve_wrapper( + chem_model, params, chem_model.tstart, chem_model.tend, y_old, y_new, tmp, kstack, count); Kokkos::parallel_for(my_policy, solve_wrapper); auto y_new_h = Kokkos::create_mirror(y_new); @@ -137,6 +139,7 @@ void test_chem() { KokkosODE::Experimental::ODE_params params(num_steps); vec_type tmp("tmp vector", neqs); mv_type kstack("k stack", solver_type::num_stages(), neqs); + count_type count("time steps count", 1); // Set initial conditions vec_type y_new("solution", neqs); @@ -153,8 +156,8 @@ void test_chem() { Kokkos::deep_copy(y_new, y_old_h); Kokkos::RangePolicy my_policy(0, 1); - RKSolve_wrapper solve_wrapper( - chem_model, params, chem_model.tstart, chem_model.tend, y_old, y_new, tmp, kstack); + RKSolve_wrapper solve_wrapper( + chem_model, params, chem_model.tstart, chem_model.tend, y_old, y_new, tmp, kstack, count); Kokkos::parallel_for(my_policy, solve_wrapper); auto y_new_h = Kokkos::create_mirror(y_new); diff --git a/packages/kokkos-kernels/ode/unit_test/Test_ODE_RK_counts.hpp b/packages/kokkos-kernels/ode/unit_test/Test_ODE_RK_counts.hpp new file mode 100644 index 000000000000..f76fcb0134bb --- /dev/null +++ b/packages/kokkos-kernels/ode/unit_test/Test_ODE_RK_counts.hpp @@ -0,0 +1,162 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include +#include "KokkosKernels_TestUtils.hpp" + +#include "KokkosODE_RungeKutta.hpp" +#include "Test_ODE_TestProblems.hpp" + +namespace Test { + +std::string RK_type_to_name(const KokkosODE::Experimental::RK_type RK) { + std::string name; + + switch (RK) { + case KokkosODE::Experimental::RK_type::RKFE: name = "Forward-Euler"; break; + case KokkosODE::Experimental::RK_type::RKEH: name = "Euler-Heun"; break; + case KokkosODE::Experimental::RK_type::RKF12: name = "Fehlberg 1-2"; break; + case KokkosODE::Experimental::RK_type::RKBS: name = "Bogacki-Shampine"; break; + case KokkosODE::Experimental::RK_type::RK4: name = "Classic RK order 4"; break; + case KokkosODE::Experimental::RK_type::RKF45: name = "Fehlberg 4-5"; break; + case KokkosODE::Experimental::RK_type::RKCK: name = "Cash-Karp"; break; + case KokkosODE::Experimental::RK_type::RKDP: name = "Dormand-Prince"; break; + default: name = "Unknown Runge-Kutta method"; + } + + return name; +} + +template +void RK_Count(const Device, const OdeType myODE, const double relTol, const double absTol, + const int /*expected_count*/) { + using execution_space = typename Device::execution_space; + using vec_type = Kokkos::View; + using mv_type = Kokkos::View; + using count_type = Kokkos::View; + + constexpr int neqs = myODE.neqs; + + constexpr double tstart = myODE.tstart(), tend = myODE.tend(); + constexpr int num_steps = myODE.numsteps(); + constexpr int maxSteps = 1e6; + constexpr double minStepSize = (tend - tstart) / (100 * maxSteps); + KokkosODE::Experimental::ODE_params params( + num_steps, maxSteps, 1.0e-12, (RK == KokkosODE::Experimental::RK_type::RKF12) ? 1.0e-8 : 1.0e-6, minStepSize); + + vec_type y("solution", neqs), f("function", neqs); + vec_type y_new("y new", neqs), y_old("y old", neqs); + count_type count("time step count", 1); + + auto y_h = Kokkos::create_mirror_view(y); + typename vec_type::HostMirror y_old_h = Kokkos::create_mirror(y_old); + auto y_ref_h = Kokkos::create_mirror(y); + for (int dofIdx = 0; dofIdx < neqs; ++dofIdx) { + y_h(dofIdx) = myODE.expected_val(tstart, dofIdx); + y_old_h(dofIdx) = y_h(dofIdx); + y_ref_h(dofIdx) = myODE.expected_val(tend, dofIdx); + } + Kokkos::deep_copy(y, y_h); + + vec_type tmp("tmp vector", neqs); + mv_type kstack("k stack", KokkosODE::Experimental::RungeKutta::num_stages(), neqs); + + Kokkos::RangePolicy my_policy(0, 1); + Kokkos::deep_copy(y_old, y_old_h); + Kokkos::deep_copy(y_new, y_old_h); + RKSolve_wrapper solve_wrapper(myODE, params, tstart, tend, y_old, + y_new, tmp, kstack, count); + Kokkos::parallel_for(my_policy, solve_wrapper); + + auto y_new_h = Kokkos::create_mirror(y_new); + Kokkos::deep_copy(y_new_h, y_new); + + typename count_type::HostMirror count_h = Kokkos::create_mirror_view(count); + Kokkos::deep_copy(count_h, count); + + double error = 0.0; + for (int eqIdx = 0; eqIdx < neqs; ++eqIdx) { + error += Kokkos::pow(y_ref_h(eqIdx) - y_new_h(eqIdx), 2.0) / + Kokkos::pow(absTol + relTol * Kokkos::abs(y_new_h(eqIdx)), 2.0); + } + error = Kokkos::sqrt(error / neqs); + + std::string msg = std::string(OdeType::name) + ", " + RK_type_to_name(RK); + EXPECT_LE(error, 1.0) << msg.c_str(); + // EXPECT_LE(count_h(0), expected_count); +} // RK_Count + +} // namespace Test + +template +void test_RK_count() { + // RK_Count (Device, OdeType, relTol, absTol, /*expected_count*/) + Test::RK_Count(TestDevice(), TestProblem::DegreeOnePoly(), 1.0e-6, 1e-12, 2); + Test::RK_Count(TestDevice(), TestProblem::DegreeTwoPoly(), 1.0e-6, 1e-12, 2); + Test::RK_Count(TestDevice(), TestProblem::DegreeThreePoly(), 1.0e-6, 1e-12, 2); + Test::RK_Count(TestDevice(), TestProblem::DegreeFivePoly(), 1.0e-6, 1e-12, 5); + Test::RK_Count(TestDevice(), TestProblem::Exponential(0.7), 2.0e-6, 1e-12, 4); + Test::RK_Count(TestDevice(), TestProblem::SpringMassDamper(1001., 1000.), 1.0e-4, 0.0, 272); + Test::RK_Count(TestDevice(), TestProblem::CosExp(-10., 2., 1.), 5.3e-5, 0.0, 25); + if constexpr (RK == KokkosODE::Experimental::RK_type::RKF12) { + Test::RK_Count(TestDevice(), TestProblem::StiffChemicalDecayProcess(1e4, 1.), 4e-9, 1e-9, 2786); + } else { + Test::RK_Count(TestDevice(), TestProblem::StiffChemicalDecayProcess(1e4, 1.), 4e-9, 1.8e-10, 2786); + } + Test::RK_Count(TestDevice(), TestProblem::Tracer(10.0), 0.0, 1e-3, 10); + Test::RK_Count(TestDevice(), TestProblem::EnrightB5(), 1.3e-2, 0.0, 90); + if constexpr (RK == KokkosODE::Experimental::RK_type::RKF12) { + Test::RK_Count(TestDevice(), TestProblem::EnrightC1(), 1.e-4, 1e-14, 90); + } else { + Test::RK_Count(TestDevice(), TestProblem::EnrightC1(), 1.e-5, 1e-14, 90); + } + if constexpr (RK == KokkosODE::Experimental::RK_type::RKF12) { + Test::RK_Count(TestDevice(), TestProblem::EnrightC5(), 1.e-4, 1e-14, 97); + } else { + Test::RK_Count(TestDevice(), TestProblem::EnrightC5(), 1.e-5, 1e-14, 97); + } + if constexpr (RK == KokkosODE::Experimental::RK_type::RKF12) { + Test::RK_Count(TestDevice(), TestProblem::EnrightD2(), 2.e-4, 0.0, 590); + } else { + Test::RK_Count(TestDevice(), TestProblem::EnrightD2(), 1.e-5, 0.0, 590); + } + Test::RK_Count(TestDevice(), TestProblem::EnrightD4(), 1.e-5, 1.e-9, 932); +#if defined(KOKKOS_ENABLE_SYCL) + if constexpr ((RK != KokkosODE::Experimental::RK_type::RKF12) && + !std::is_same_v) { +#else + if constexpr (RK != KokkosODE::Experimental::RK_type::RKF12) { +#endif + Test::RK_Count(TestDevice(), TestProblem::KKStiffChemistry(), 1e-5, 0.0, 1); + } +} + +void test_count() { + using RK_type = KokkosODE::Experimental::RK_type; + + // test_RK_count(); + test_RK_count(); + test_RK_count(); + test_RK_count(); + test_RK_count(); + test_RK_count(); + test_RK_count(); + // test_RK_count(); +} + +#if defined(KOKKOSKERNELS_INST_DOUBLE) +TEST_F(TestCategory, RK_Count) { test_count(); } +#endif diff --git a/packages/kokkos-kernels/ode/unit_test/Test_ODE_TestProblems.hpp b/packages/kokkos-kernels/ode/unit_test/Test_ODE_TestProblems.hpp new file mode 100644 index 000000000000..2b66ec682d9f --- /dev/null +++ b/packages/kokkos-kernels/ode/unit_test/Test_ODE_TestProblems.hpp @@ -0,0 +1,649 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef TEST_ODE_TESTPROBLEMS_HPP +#define TEST_ODE_TESTPROBLEMS_HPP + +namespace TestProblem { + +struct DegreeOnePoly { + template + KOKKOS_FUNCTION void evaluate_function(double /*t*/, double /*dt*/, View1& /*y*/, View2& dydt) const { + for (int dofIdx = 0; dofIdx < neqs; ++dofIdx) { + dydt(dofIdx) = 1; + } + } + + template + KOKKOS_FUNCTION void evaluate_jacobian(double /*t*/, double /*dt*/, View1& /*y*/, View2& jac) const { + for (int rowIdx = 0; rowIdx < neqs; ++rowIdx) { + for (int colIdx = 0; colIdx < neqs; ++colIdx) { + jac(rowIdx, colIdx) = 0; + } + } + } + + KOKKOS_FUNCTION constexpr double tstart() const { return 0.0; } + KOKKOS_FUNCTION constexpr double tend() const { return 1.0; } + KOKKOS_FUNCTION constexpr int numsteps() const { return 10; } + KOKKOS_FUNCTION double expected_val(const double t, const int /*n*/) const { return t + 1.0; } + KOKKOS_FUNCTION static constexpr int num_equations() { return neqs; } + static constexpr int neqs = 1; + static constexpr char name[] = "DegreeOnePoly"; +}; + +struct DegreeTwoPoly { + template + KOKKOS_FUNCTION void evaluate_function(double t, double /*dt*/, View1& /*y*/, View2& dydt) const { + for (int dofIdx = 0; dofIdx < neqs; ++dofIdx) { + dydt(dofIdx) = t + 1; + } + } + + template + KOKKOS_FUNCTION void evaluate_jacobian(double /*t*/, double /*dt*/, View1& /*y*/, View2& jac) const { + for (int rowIdx = 0; rowIdx < neqs; ++rowIdx) { + for (int colIdx = 0; colIdx < neqs; ++colIdx) { + jac(rowIdx, colIdx) = 0; + } + } + } + + KOKKOS_FUNCTION constexpr double tstart() const { return 0.0; } + KOKKOS_FUNCTION constexpr double tend() const { return 1.0; } + KOKKOS_FUNCTION constexpr int numsteps() const { return 10; } + KOKKOS_FUNCTION double expected_val(const double t, const int /*n*/) const { return 0.5 * t * t + t + 1.0; } + KOKKOS_FUNCTION static constexpr int num_equations() { return neqs; } + static constexpr int neqs = 1; + static constexpr char name[] = "DegreeTwoPoly"; +}; + +struct DegreeThreePoly { + template + KOKKOS_FUNCTION void evaluate_function(double t, double /*dt*/, View1& /*y*/, View2& dydt) const { + for (int dofIdx = 0; dofIdx < neqs; ++dofIdx) { + dydt(dofIdx) = (t * t) + t + 1; + } + } + + template + KOKKOS_FUNCTION void evaluate_jacobian(double /*t*/, double /*dt*/, View1& /*y*/, View2& jac) const { + for (int rowIdx = 0; rowIdx < neqs; ++rowIdx) { + for (int colIdx = 0; colIdx < neqs; ++colIdx) { + jac(rowIdx, colIdx) = 0; + } + } + } + + KOKKOS_FUNCTION constexpr double tstart() const { return 0.0; } + KOKKOS_FUNCTION constexpr double tend() const { return 1.0; } + KOKKOS_FUNCTION constexpr int numsteps() const { return 10; } + KOKKOS_FUNCTION double expected_val(const double t, const int /*n*/) const { + return (1. / 3) * (t * t * t) + (1. / 2) * (t * t) + t + 1; + } + KOKKOS_FUNCTION static constexpr int num_equations() { return neqs; } + static constexpr int neqs = 1; + static constexpr char name[] = "DegreeThreePoly"; +}; + +struct DegreeFivePoly { + template + KOKKOS_FUNCTION void evaluate_function(double t, double /*dt*/, View1& /*y*/, View2& dydt) const { + for (int dofIdx = 0; dofIdx < neqs; ++dofIdx) { + dydt(dofIdx) = (t * t * t * t) + (t * t * t) + (t * t) + t + 1; + } + } + + template + KOKKOS_FUNCTION void evaluate_jacobian(double /*t*/, double /*dt*/, View1& /*y*/, View2& jac) const { + for (int rowIdx = 0; rowIdx < neqs; ++rowIdx) { + for (int colIdx = 0; colIdx < neqs; ++colIdx) { + jac(rowIdx, colIdx) = 0; + } + } + } + + KOKKOS_FUNCTION constexpr double tstart() const { return 0.0; } + KOKKOS_FUNCTION constexpr double tend() const { return 1.0; } + KOKKOS_FUNCTION constexpr int numsteps() const { return 10; } + KOKKOS_FUNCTION double expected_val(const double t, const int /*n*/) const { + return (1. / 5) * (t * t * t * t * t) + (1. / 4) * (t * t * t * t) + (1. / 3) * (t * t * t) + (1. / 2) * (t * t) + + t + 1; + } + KOKKOS_FUNCTION static constexpr int num_equations() { return neqs; } + static constexpr int neqs = 1; + static constexpr char name[] = "DegreeFivePoly"; +}; + +struct Exponential { + Exponential(double rate_) : rate(rate_) {} + + template + KOKKOS_FUNCTION void evaluate_function(double /*t*/, double /*dt*/, View1& y, View2& dydt) const { + for (int dofIdx = 0; dofIdx < neqs; ++dofIdx) { + dydt(dofIdx) = rate * y(dofIdx); + } + } + + template + KOKKOS_FUNCTION void evaluate_jacobian(double /*t*/, double /*dt*/, View1& /*y*/, View2& jac) const { + for (int rowIdx = 0; rowIdx < neqs; ++rowIdx) { + for (int colIdx = 0; colIdx < neqs; ++colIdx) { + jac(rowIdx, colIdx) = 0; + } + } + + for (int rowIdx = 0; rowIdx < neqs; ++rowIdx) { + jac(rowIdx, rowIdx) = rate; + } + } + + KOKKOS_FUNCTION constexpr double tstart() const { return 0.0; } + KOKKOS_FUNCTION constexpr double tend() const { return 1.0; } + KOKKOS_FUNCTION constexpr int numsteps() const { return 10; } + KOKKOS_FUNCTION double expected_val(const double t, const int /*n*/) const { return Kokkos::exp(rate * t); } + KOKKOS_FUNCTION static constexpr int num_equations() { return neqs; } + static constexpr int neqs = 1; + const double rate; + static constexpr char name[] = "Exponential"; +}; + +struct SpringMassDamper { + SpringMassDamper(double c_, double k_) + : c(c_), + k(k_), + lambda1((-c + Kokkos::pow(c * c - 4. * k, 0.5)) / 2.), + lambda2((-c - Kokkos::pow(c * c - 4. * k, 0.5)) / 2.) {} + + template + KOKKOS_FUNCTION void evaluate_function(double /*t*/, double /*dt*/, View1& y, View2& dydt) const { + dydt[0] = y[1]; + dydt[1] = -k * y[0] - c * y[1]; + } + + template + KOKKOS_FUNCTION void evaluate_jacobian(double /*t*/, double /*dt*/, View1& /*y*/, View2& jac) const { + jac(0, 0) = 0.; + jac(0, 1) = 1.; + jac(1, 0) = -k; + jac(1, 1) = -c; + } + + KOKKOS_FUNCTION constexpr double tstart() const { return 0.0; } + KOKKOS_FUNCTION constexpr double tend() const { return 1.0; } + KOKKOS_FUNCTION constexpr int numsteps() const { return 10; } + KOKKOS_FUNCTION double expected_val(const double t, const int n) const { + using Kokkos::exp; + + const double det = lambda1 - lambda2; + double val = 0; + + if (n == 0) { + val = -(lambda2 / det) * exp(lambda1 * t) + (lambda1 / det) * exp(lambda2 * t); + } else { + val = -(lambda2 * lambda1 / det) * exp(lambda1 * t) + (lambda1 * lambda2 / det) * exp(lambda2 * t); + } + + return val; + } + + KOKKOS_FUNCTION static constexpr int num_equations() { return neqs; } + + static constexpr int neqs = 2; + const double c; + const double k; + const double lambda1; + const double lambda2; + static constexpr char name[] = "SpringMassDamper"; +}; + +// Example 8.1 from Leveque + +struct CosExp { + CosExp(double lambda_, double t0_, double eta_) : lambda(lambda_), t0(t0_), eta(eta_) {} + + template + KOKKOS_FUNCTION void evaluate_function(double t, double /*dt*/, View1& y, View2& dydt) const { + for (int i = 0; i < neqs; i++) { + dydt(i) = lambda * (y(i) - Kokkos::cos(t)) - Kokkos::sin(t); + } + } + + template + KOKKOS_FUNCTION void evaluate_jacobian(double /*t*/, double /*dt*/, View1& /*y*/, View2& jac) const { + jac(0, 0) = 0.0; + + for (int i = 0; i < neqs; ++i) { + jac(i, i) = lambda; + } + } + + KOKKOS_FUNCTION constexpr double tstart() const { return 0.0; } + KOKKOS_FUNCTION constexpr double tend() const { return 10.0; } + KOKKOS_FUNCTION constexpr int numsteps() const { return 10; } + KOKKOS_FUNCTION double expected_val(const double t, const int /*n*/) const { + return Kokkos::exp(lambda * (t - t0)) * (eta - Kokkos::cos(t0)) + Kokkos::cos(t); + } + KOKKOS_FUNCTION static constexpr int num_equations() { return neqs; } + + static constexpr int neqs = 1; + const double lambda; + const double t0; + const double eta; + static constexpr char name[] = "CosExp"; +}; + +// Example 7.9 in LeVeque + +struct StiffChemicalDecayProcess { + StiffChemicalDecayProcess(double K1_, double K2_) : K1(K1_), K2(K2_) {} + + template + KOKKOS_FUNCTION void evaluate_function(double /*t*/, double /*dt*/, View1& y, View2& dydt) const { + dydt[0] = -K1 * y[0]; + dydt[1] = K1 * y[0] - K2 * y[1]; + dydt[2] = K2 * y[1]; + } + + template + KOKKOS_FUNCTION void evaluate_jacobian(double /*t*/, double /*dt*/, View1& /*y*/, View2& jac) const { + jac(0, 0) = -K1; + jac(0, 1) = 0.; + jac(0, 2) = 0.; + + jac(1, 0) = K1; + jac(1, 1) = -K2; + jac(1, 2) = 0.; + + jac(2, 0) = 0.; + jac(2, 1) = K2; + jac(2, 2) = 0.; + } + + KOKKOS_FUNCTION constexpr double tstart() const { return 0.0; } + KOKKOS_FUNCTION constexpr double tend() const { return 0.2; } + KOKKOS_FUNCTION constexpr int numsteps() const { return 10; } + KOKKOS_FUNCTION double expected_val(const double t, const int n) const { + using Kokkos::exp; + + const double C21 = y1_init * K1 / (K2 - K1); + const double C22 = y2_init - C21; + + double val = 0.0; + + if (n == 0) { + val = y1_init * exp(-K1 * t); + } else if (n == 1) { + val = C21 * exp(-K1 * t) + C22 * exp(-K2 * t); + } else { + const double C31 = -K2 * C21 / K1; + const double C32 = -C22; + const double C33 = y1_init + y2_init + y3_init; + + val = C31 * exp(-K1 * t) + C32 * exp(-K2 * t) + C33; + } + + return val; + } + KOKKOS_FUNCTION static constexpr int num_equations() { return neqs; } + + static constexpr int neqs = 3; + const double y1_init = 3.0; + const double y2_init = 4.0; + const double y3_init = 2.0; + const double K1; + const double K2; + static constexpr char name[] = "StiffChemicalDecay"; +}; + +struct Tracer { + Tracer(double rate_) : rate(rate_) {} + + template + KOKKOS_FUNCTION void evaluate_function(double /*t*/, double /*dt*/, View1& y, View2& dydt) const { + for (int i = 0; i < neqs; i += 2) { + // const double R = Kokkos::sqrt(y[i] * y[i] + y[i + 1] * y[i + 1]); + // dydt[i] = -rate * y[i + 1] / R; + // dydt[i + 1] = rate * y[i] / R; + dydt[i] = -rate * y[i + 1]; + dydt[i + 1] = rate * y[i]; + } + } + + KOKKOS_FUNCTION constexpr double tstart() const { return 0.0; } + KOKKOS_FUNCTION constexpr double tend() const { return 2.0 * pi; } + KOKKOS_FUNCTION constexpr int numsteps() const { return 10; } + KOKKOS_FUNCTION double expected_val(const double t, const int n) const { + const double theta = rate * t; + double val = 0.0; + + if (n % 2 == 0) { + val = Kokkos::cos(theta); + } else { + val = Kokkos::sin(theta); + } + return val; + } + KOKKOS_FUNCTION static constexpr int num_equations() { return neqs; } + + static constexpr int neqs = 2; + static constexpr double pi = 3.14159265358979323846; + const double rate; + static constexpr char name[] = "Tracer"; +}; + +struct EnrightB5 { + EnrightB5(double alpha_ = 100.0) : alpha(alpha_) {} + + template + KOKKOS_FUNCTION void evaluate_function(double /*t*/, double /*dt*/, View1& y, View2& dydt) const { + dydt[0] = -10. * y[0] + alpha * y[1]; + dydt[1] = -alpha * y[0] - 10. * y[1]; + dydt[2] = -4. * y[2]; + dydt[3] = -y[3]; + dydt[4] = -0.5 * y[4]; + dydt[5] = -0.1 * y[5]; + } + + template + KOKKOS_FUNCTION void evaluate_jacobian(double /*t*/, double /*dt*/, View1& /*y*/, View2& jac) const { + for (int rowIdx = 0; rowIdx < neqs; ++rowIdx) { + for (int colIdx = 0; colIdx < neqs; ++colIdx) { + jac(rowIdx, colIdx) = 0.0; + } + } + + jac(0, 0) = -10.; + jac(0, 1) = alpha; + jac(1, 0) = -alpha; + jac(1, 1) = -10.; + jac(2, 2) = -4.; + jac(3, 3) = -1.; + jac(4, 4) = -0.5; + jac(5, 5) = -0.1; + } + + KOKKOS_FUNCTION constexpr double tstart() const { return 0.0; } + KOKKOS_FUNCTION constexpr double tend() const { return 0.2; } + KOKKOS_FUNCTION constexpr int numsteps() const { return 10; } + KOKKOS_FUNCTION double expected_val(const double t, const int n) const { + using Kokkos::cos; + using Kokkos::exp; + using Kokkos::sin; + + double val = 0.0; + + const double c1 = 1.0; + const double c2 = -1.0; + + const double a[2] = {0, 1}; + const double b[2] = {-1, 0}; + + if (n < 2) { + val = exp(-10. * t) * (c1 * (a[n] * cos(alpha * t) - b[n] * sin(alpha * t)) + + c2 * (a[n] * sin(alpha * t) + b[n] * cos(alpha * t))); + } else if (n == 2) { + val = exp(-4. * t); + } else if (n == 3) { + val = exp(-t); + } else if (n == 4) { + val = exp(-0.5 * t); + } else { + val = exp(-0.1 * t); + } + + return val; + } + + KOKKOS_FUNCTION static constexpr int num_equations() { return neqs; } + static constexpr int neqs = 6; + const double alpha; + static constexpr char name[] = "EnrightB5"; +}; // EnrightB5 + +struct EnrightC1 { + template + KOKKOS_FUNCTION void evaluate_function(double /*t*/, double /*dt*/, View1& y, View2& dydt) const { + dydt[0] = -y[0] + y[1] * y[1] + y[2] * y[2] + y[3] * y[3]; + dydt[1] = -10. * y[1] + 10. * (y[2] * y[2] + y[3] * y[3]); + dydt[2] = -40. * y[2] + 40. * y[3] * y[3]; + dydt[3] = -100.0 * y[3] + 2.; + } + + template + KOKKOS_FUNCTION void evaluate_jacobian(double /*t*/, double /*dt*/, View1& y, View2& jac) const { + for (int rowIdx = 0; rowIdx < neqs; ++rowIdx) { + for (int colIdx = 0; colIdx < neqs; ++colIdx) { + jac(rowIdx, colIdx) = 0.0; + } + } + + jac(0, 0) = -1.; + jac(0, 1) = 2. * y[1]; + jac(0, 2) = 2. * y[2]; + jac(0, 3) = 2. * y[3]; + + jac(1, 1) = -10.; + jac(1, 2) = 20. * y[2]; + jac(1, 3) = 20. * y[3]; + + jac(2, 2) = -40.; + jac(2, 3) = 80. * y[3]; + + jac(3, 3) = -100.; + } + + KOKKOS_FUNCTION constexpr double tstart() const { return 0.0; } + KOKKOS_FUNCTION constexpr double tend() const { return 20.0; } + KOKKOS_FUNCTION constexpr int numsteps() const { return 10; } + KOKKOS_FUNCTION double expected_val(const double t, const int n) const { + if (t == 0) { + return 1.; + } else { + // cvode sol + constexpr Kokkos::Array y = {4.003235e-04, 4.001600e-04, 4.000000e-04, 2.000000e-02}; + return y[n]; + } + } + KOKKOS_FUNCTION static constexpr int num_equations() { return neqs; } + static constexpr int neqs = 4; + static constexpr char name[] = "EnrightC1"; +}; + +struct EnrightC5 { + EnrightC5(const double beta_ = 20.0) : beta(beta_) {} + + template + KOKKOS_FUNCTION void evaluate_function(double /*t*/, double /*dt*/, View1& y, View2& dydt) const { + dydt[0] = -y[0] + 2.; + dydt[1] = -10. * y[1] + beta * y[0] * y[0]; + dydt[2] = -40. * y[2] + 4. * beta * (y[0] * y[0] + y[1] * y[1]); + dydt[3] = -100.0 * y[3] + 10. * beta * (y[0] * y[0] + y[1] * y[1] + y[2] * y[2]); + } + + template + KOKKOS_FUNCTION void evaluate_jacobian(double /*t*/, double /*dt*/, View1& y, View2& jac) const { + for (int rowIdx = 0; rowIdx < neqs; ++rowIdx) { + for (int colIdx = 0; colIdx < neqs; ++colIdx) { + jac(rowIdx, colIdx) = 0.0; + } + } + + jac(0, 0) = -1.; + + jac(1, 0) = 2 * beta * y[0]; + jac(1, 1) = -10.; + + jac(2, 0) = 8. * beta * y[0]; + jac(2, 1) = 8. * beta * y[1]; + jac(2, 2) = -40.; + + jac(3, 0) = 20. * beta * y[0]; + jac(3, 1) = 20. * beta * y[1]; + jac(3, 2) = 20. * beta * y[2]; + jac(3, 3) = -100.; + } + + KOKKOS_FUNCTION constexpr double tstart() const { return 0.0; } + KOKKOS_FUNCTION constexpr double tend() const { return 20.0; } + KOKKOS_FUNCTION constexpr int numsteps() const { return 10; } + KOKKOS_FUNCTION double expected_val(const double t, const int n) const { + if (t == 0) { + return 1.; + } else { + // cvode sol + constexpr Kokkos::Array y = {2.000000e+00, 8.000000e+00, 1.360000e+02, 3.712800e+04}; + return y[n]; + } + } + KOKKOS_FUNCTION static constexpr int num_equations() { return neqs; } + static constexpr int neqs = 4; + const double beta; + static constexpr char name[] = "EnrightC5"; +}; + +struct EnrightD2 { + template + KOKKOS_FUNCTION void evaluate_function(double /*t*/, double /*dt*/, View1& y, View2& dydt) const { + dydt[0] = -0.04 * y[0] + 0.01 * y[1] * y[2]; + dydt[1] = 400.0 * y[0] - 100.0 * y[1] * y[2] - 3000. * y[1] * y[1]; + dydt[2] = 30. * y[1] * y[1]; + } + + template + KOKKOS_FUNCTION void evaluate_jacobian(double /*t*/, double /*dt*/, View1& y, View2& jac) const { + for (int rowIdx = 0; rowIdx < neqs; ++rowIdx) { + for (int colIdx = 0; colIdx < neqs; ++colIdx) { + jac(rowIdx, colIdx) = 0.0; + } + } + + jac(0, 0) = -0.04; + jac(0, 1) = 0.01 * y[2]; + jac(0, 2) = 0.01 * y[1]; + + jac(1, 0) = 400.; + jac(1, 1) = -100. * y[2] - 6000. * y[1]; + jac(1, 2) = -100. * y[1]; + + jac(2, 1) = 60. * y[1]; + } + + KOKKOS_FUNCTION constexpr double tstart() const { return 0.0; } + KOKKOS_FUNCTION constexpr double tend() const { return 40.0; } + KOKKOS_FUNCTION constexpr int numsteps() const { return 100; } + KOKKOS_FUNCTION double expected_val(const double t, const int n) const { + if (t == 0.) { + constexpr Kokkos::Array y = {1., 0., 0.}; + return y[n]; + } else { + // cvode solution + constexpr Kokkos::Array y = {7.158278e-01, 9.185559e-02, 2.841630e+01}; + return y[n]; + } + } + KOKKOS_FUNCTION static constexpr int num_equations() { return neqs; } + static constexpr int neqs = 3; + static constexpr char name[] = "EnrightD2"; +}; + +struct EnrightD4 { + template + KOKKOS_FUNCTION void evaluate_function(double /*t*/, double /*dt*/, View1& y, View2& dydt) const { + dydt[0] = -0.013 * y[0] - 1000. * y[0] * y[2]; + dydt[1] = -2500. * y[1] * y[2]; + dydt[2] = dydt[0] + dydt[1]; + } + + template + KOKKOS_FUNCTION void evaluate_jacobian(double /*t*/, double /*dt*/, View1& y, View2& jac) const { + for (int rowIdx = 0; rowIdx < neqs; ++rowIdx) { + for (int colIdx = 0; colIdx < neqs; ++colIdx) { + jac(rowIdx, colIdx) = 0.0; + } + } + + jac(0, 0) = -0.013 - 1000. * y[2]; + jac(0, 2) = -1000. * y[0]; + + jac(1, 1) = -2500. * y[2]; + jac(1, 2) = -2500. * y[1]; + + jac(2, 0) = jac(0, 0); + jac(2, 1) = jac(1, 1); + jac(2, 2) = jac(0, 2) + jac(1, 2); + } + + KOKKOS_FUNCTION constexpr double tstart() const { return 0.0; } + KOKKOS_FUNCTION constexpr double tend() const { return 50.0; } // 50.0; } + KOKKOS_FUNCTION constexpr int numsteps() const { return 10; } + KOKKOS_FUNCTION double expected_val(const double t, const int n) const { + if (t == 0.) { + constexpr Kokkos::Array y = {1., 1., 0}; + return y[n]; + } else { + // cvode solution at tend + constexpr Kokkos::Array y = {5.976506e-01, 1.402347e+00, -1.893371e-06}; + return y[n]; + } + } + KOKKOS_FUNCTION static constexpr int num_equations() { return neqs; } + static constexpr int neqs = 3; + static constexpr char name[] = "EnrightD4"; +}; + +// Robertson Autocatalytic reaction +struct KKStiffChemistry { + template + KOKKOS_FUNCTION void evaluate_function(double /*t*/, double /*dt*/, View1& y, View2& dydt) const { + dydt(0) = -0.04 * y(0) + 1.e4 * y(1) * y(2); + dydt(1) = 0.04 * y(0) - 1.e4 * y(1) * y(2) - 3.e7 * y(1) * y(1); + dydt(2) = 3.e7 * y(1) * y(1); + } + + template + KOKKOS_FUNCTION void evaluate_jacobian(double /*t*/, double /*dt*/, View1& y, View2& jac) const { + jac(0, 0) = -0.04; + jac(0, 1) = 1.e4 * y(2); + jac(0, 2) = 1.e4 * y(1); + jac(1, 0) = 0.04; + jac(1, 1) = -1.e4 * y(2) - 3.e7 * 2.0 * y(1); + jac(1, 2) = -1.e4 * y(1); + jac(2, 0) = 0.0; + jac(2, 1) = 3.e7 * 2.0 * y(1); + jac(2, 2) = 0.0; + } + + KOKKOS_FUNCTION constexpr double tstart() const { return 0.0; } + KOKKOS_FUNCTION constexpr double tend() const { return 500.0; } + KOKKOS_FUNCTION constexpr int numsteps() const { return 1000; } + KOKKOS_FUNCTION double expected_val(const double t, const int n) const { + if (t == 0) { + return n == 0 ? 1. : 0.; + } else { + // cvode solution + constexpr Kokkos::Array y = {4.226713e-01, 2.885221e-06, 5.773258e-01}; + return y[n]; + } + } + KOKKOS_FUNCTION static constexpr int num_equations() { return neqs; } + static constexpr int neqs = 3; + static constexpr char name[] = "Robertson Autocatalytic"; +}; + +} // namespace TestProblem + +#endif // TEST_ODE_TESTPROBLEMS_HPP diff --git a/packages/kokkos-kernels/perf_test/CMakeLists.txt b/packages/kokkos-kernels/perf_test/CMakeLists.txt index 28271dfb0d36..8d740966f039 100644 --- a/packages/kokkos-kernels/perf_test/CMakeLists.txt +++ b/packages/kokkos-kernels/perf_test/CMakeLists.txt @@ -11,21 +11,6 @@ if (KokkosKernels_ENABLE_PERFTESTS) #build correctly with or without MPI, but only run them with a single #MPI process. - SET(GTEST_SOURCE_DIR ${PACKAGE_SOURCE_DIR}/tpls/gtest) - - KOKKOSKERNELS_ADD_TEST_LIBRARY( - kokkoskernelsperf_gtest - HEADERS ${GTEST_SOURCE_DIR}/gtest/gtest.h - SOURCES ${GTEST_SOURCE_DIR}/gtest/gtest-all.cc - ) - #Disables pthreads, this is a problem for serial builds in Trilinos & Sierra if it's enabled. - - TARGET_COMPILE_DEFINITIONS(kokkoskernelsperf_gtest PUBLIC "-DGTEST_HAS_PTHREAD=0") - TARGET_INCLUDE_DIRECTORIES(kokkoskernelsperf_gtest PUBLIC $) - - #Gtest minimally requires C++ 11 - TARGET_COMPILE_FEATURES(kokkoskernelsperf_gtest PUBLIC cxx_std_11) - KOKKOSKERNELS_INCLUDE_DIRECTORIES(sparse) if(KokkosKernels_ENABLE_TESTS_AND_PERFSUITE) diff --git a/packages/kokkos-kernels/perf_test/KokkosKernels_perf_test_utilities.hpp b/packages/kokkos-kernels/perf_test/KokkosKernels_perf_test_utilities.hpp index ec767c68f756..6d11e021a96a 100644 --- a/packages/kokkos-kernels/perf_test/KokkosKernels_perf_test_utilities.hpp +++ b/packages/kokkos-kernels/perf_test/KokkosKernels_perf_test_utilities.hpp @@ -20,7 +20,7 @@ #ifndef KOKKOSKERNELS_PERF_TEST_UTILITIES_HPP #define KOKKOSKERNELS_PERF_TEST_UTILITIES_HPP -#include "KokkosKernels_TestUtils.hpp" // for string_compare_no_case +#include "KokkosKernels_TestStringUtils.hpp" // for string_compare_no_case // Namepsace that defines common utilities // for performance tests diff --git a/packages/kokkos-kernels/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test.cpp b/packages/kokkos-kernels/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test.cpp index 23d53ba10669..49c7a7621032 100644 --- a/packages/kokkos-kernels/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test.cpp +++ b/packages/kokkos-kernels/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test.cpp @@ -14,10 +14,12 @@ // //@HEADER +#include + #include #include #include -#include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_TestStringUtils.hpp" struct Params { int use_cuda = 0; diff --git a/packages/kokkos-kernels/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test_benchmark.cpp b/packages/kokkos-kernels/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test_benchmark.cpp index d4e3754f14be..9dcf5d651547 100644 --- a/packages/kokkos-kernels/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test_benchmark.cpp +++ b/packages/kokkos-kernels/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test_benchmark.cpp @@ -85,7 +85,7 @@ static void run(benchmark::State& state) { using MemSpace = typename ExecSpace::memory_space; using Device = Kokkos::Device; - std::cout << "Running BLAS Level 1 DOT perfomrance experiment (" << ExecSpace::name() << ")\n"; + std::cout << "Running BLAS Level 1 DOT performance experiment (" << ExecSpace::name() << ")\n"; std::cout << "Each test input vector has a length of " << m << std::endl; diff --git a/packages/kokkos-kernels/perf_test/blas/blas1/KokkosBlas_dot_perf_test.cpp b/packages/kokkos-kernels/perf_test/blas/blas1/KokkosBlas_dot_perf_test.cpp index a383739aacb6..36f11733ce81 100644 --- a/packages/kokkos-kernels/perf_test/blas/blas1/KokkosBlas_dot_perf_test.cpp +++ b/packages/kokkos-kernels/perf_test/blas/blas1/KokkosBlas_dot_perf_test.cpp @@ -14,12 +14,14 @@ // //@HEADER +#include + #include #include // For RPS implementation #include "KokkosBlas_dot_perf_test.hpp" -#include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_TestStringUtils.hpp" struct Params { int use_cuda = 0; @@ -110,7 +112,7 @@ void run(int m, int repeat) { using MemSpace = typename ExecSpace::memory_space; using Device = Kokkos::Device; - std::cout << "Running BLAS Level 1 DOT perfomrance experiment (" << ExecSpace::name() << ")\n"; + std::cout << "Running BLAS Level 1 DOT performance experiment (" << ExecSpace::name() << ")\n"; std::cout << "Each test input vector has a length of " << m << std::endl; diff --git a/packages/kokkos-kernels/perf_test/blas/blas1/KokkosBlas_dot_perf_test_benchmark.cpp b/packages/kokkos-kernels/perf_test/blas/blas1/KokkosBlas_dot_perf_test_benchmark.cpp index be622ffaec29..10d3aa82d72d 100644 --- a/packages/kokkos-kernels/perf_test/blas/blas1/KokkosBlas_dot_perf_test_benchmark.cpp +++ b/packages/kokkos-kernels/perf_test/blas/blas1/KokkosBlas_dot_perf_test_benchmark.cpp @@ -84,7 +84,7 @@ static void run(benchmark::State& state) { using MemSpace = typename ExecSpace::memory_space; using Device = Kokkos::Device; - std::cout << "Running BLAS Level 1 DOT perfomrance experiment (" << ExecSpace::name() << ")\n"; + std::cout << "Running BLAS Level 1 DOT performance experiment (" << ExecSpace::name() << ")\n"; std::cout << "Each test input vector has a length of " << m << std::endl; diff --git a/packages/kokkos-kernels/perf_test/blas/blas1/KokkosBlas_team_dot_perf_test.cpp b/packages/kokkos-kernels/perf_test/blas/blas1/KokkosBlas_team_dot_perf_test.cpp index 16dadd34b618..ca30ed50ace2 100644 --- a/packages/kokkos-kernels/perf_test/blas/blas1/KokkosBlas_team_dot_perf_test.cpp +++ b/packages/kokkos-kernels/perf_test/blas/blas1/KokkosBlas_team_dot_perf_test.cpp @@ -14,10 +14,12 @@ // //@HEADER +#include + #include #include #include -#include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_TestStringUtils.hpp" struct Params { int use_cuda = 0; diff --git a/packages/kokkos-kernels/perf_test/blas/blas1/KokkosBlas_team_dot_perf_test_benchmark.cpp b/packages/kokkos-kernels/perf_test/blas/blas1/KokkosBlas_team_dot_perf_test_benchmark.cpp index 776c291ad053..ab0750e5b6cf 100644 --- a/packages/kokkos-kernels/perf_test/blas/blas1/KokkosBlas_team_dot_perf_test_benchmark.cpp +++ b/packages/kokkos-kernels/perf_test/blas/blas1/KokkosBlas_team_dot_perf_test_benchmark.cpp @@ -42,10 +42,12 @@ //@HEADER */ +#include + #include #include #include -#include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_TestStringUtils.hpp" #include diff --git a/packages/kokkos-kernels/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test.cpp b/packages/kokkos-kernels/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test.cpp index 924b63eceeab..90fb06574112 100644 --- a/packages/kokkos-kernels/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test.cpp +++ b/packages/kokkos-kernels/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test.cpp @@ -16,7 +16,7 @@ #include "KokkosBlas2_gemv.hpp" #include -#include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_TestStringUtils.hpp" struct Params { int use_cuda = 0; diff --git a/packages/kokkos-kernels/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp b/packages/kokkos-kernels/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp index cdae173ce01b..2ff22b7078be 100644 --- a/packages/kokkos-kernels/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp +++ b/packages/kokkos-kernels/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp @@ -19,7 +19,7 @@ #include "KokkosBlas2_gemv.hpp" -#include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_TestStringUtils.hpp" #include "KokkosKernels_perf_test_utilities.hpp" #include diff --git a/packages/kokkos-kernels/perf_test/blas/blas2/KokkosBlas2_ger_perf_test_benchmark.cpp b/packages/kokkos-kernels/perf_test/blas/blas2/KokkosBlas2_ger_perf_test_benchmark.cpp index 8efd9920b1bd..2feeb2c2ff03 100644 --- a/packages/kokkos-kernels/perf_test/blas/blas2/KokkosBlas2_ger_perf_test_benchmark.cpp +++ b/packages/kokkos-kernels/perf_test/blas/blas2/KokkosBlas2_ger_perf_test_benchmark.cpp @@ -21,7 +21,7 @@ #include "KokkosBlas2_ger.hpp" #include -#include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_TestStringUtils.hpp" #include "KokkosKernels_perf_test_utilities.hpp" #include diff --git a/packages/kokkos-kernels/perf_test/blas/blas3/CMakeLists.txt b/packages/kokkos-kernels/perf_test/blas/blas3/CMakeLists.txt index 80c9d25c1c01..4dbf0ae5dddf 100644 --- a/packages/kokkos-kernels/perf_test/blas/blas3/CMakeLists.txt +++ b/packages/kokkos-kernels/perf_test/blas/blas3/CMakeLists.txt @@ -4,7 +4,6 @@ KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) KOKKOSKERNELS_ADD_EXECUTABLE( KokkosBlas3_perf_test SOURCES KokkosBlas3_perf_test.cpp - TESTONLYLIBS kokkoskernelsperf_gtest ) KOKKOSKERNELS_ADD_EXECUTABLE( diff --git a/packages/kokkos-kernels/perf_test/blas/blas3/KokkosBlas3_common.hpp b/packages/kokkos-kernels/perf_test/blas/blas3/KokkosBlas3_common.hpp index 3f13dbf8d8c5..1916eb5c4ade 100644 --- a/packages/kokkos-kernels/perf_test/blas/blas3/KokkosBlas3_common.hpp +++ b/packages/kokkos-kernels/perf_test/blas/blas3/KokkosBlas3_common.hpp @@ -42,14 +42,14 @@ /************************ blas routine structure definitions **********/ struct perf_test_trmm_args { std::string trmm_args; - default_scalar alpha; + KokkosKernels::default_scalar alpha; }; typedef struct perf_test_trmm_args pt_trmm_args_t; struct perf_test_gemm_args { std::string gemm_args; //[N,T,C][N,T,C] for transA and transB - default_scalar alpha; - default_scalar beta; + KokkosKernels::default_scalar alpha; + KokkosKernels::default_scalar beta; }; typedef struct perf_test_gemm_args pt_gemm_args_t; // ADD MORE BLAS3 ROUTINE ARG STRUCTS HERE. diff --git a/packages/kokkos-kernels/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/packages/kokkos-kernels/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index 6710091e607f..75a3c4c00496 100644 --- a/packages/kokkos-kernels/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/packages/kokkos-kernels/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -25,11 +25,12 @@ #include +#include "Kokkos_ArithTraits.hpp" #include "KokkosBatched_HostLevel_Gemm.hpp" #include "KokkosBatched_Gemm_Decl.hpp" #include "KokkosBatched_Util.hpp" -#include "gtest/gtest.h" // EXPECT_NEAR -#include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_TestStringUtils.hpp" +#include "KokkosKernels_TestVanilla.hpp" #include @@ -107,19 +108,24 @@ void (*do_gemm_invoke[LOOP_N][TEST_N])(options_t) = { #define DEFAULT_GEMM_ALPHA 1.0 #define DEFAULT_GEMM_BETA 1.0 -using view_type_3d = Kokkos::View; -using view_type_4d = Kokkos::View; -using view_type_5d = Kokkos::View; +using view_type_3d = + Kokkos::View; +using view_type_4d = + Kokkos::View; +using view_type_5d = + Kokkos::View; // Construct the vector type -using memory_space = typename default_device::execution_space::memory_space; -constexpr int simd_vector_size = KokkosBatched::DefaultVectorLength::value; +using memory_space = typename KokkosKernels::default_device::execution_space::memory_space; +constexpr int simd_vector_size = KokkosBatched::DefaultVectorLength::value; constexpr int simd_internal_vector_size = - KokkosBatched::DefaultInternalVectorLength::value; -using vector_type = KokkosBatched::Vector, simd_vector_size>; -using internal_vector_type = KokkosBatched::Vector, simd_internal_vector_size>; -using vector_view_type_3d = Kokkos::View; -using internal_vector_view_type_4d = Kokkos::View; + KokkosBatched::DefaultInternalVectorLength::value; +using vector_type = KokkosBatched::Vector, simd_vector_size>; +using internal_vector_type = + KokkosBatched::Vector, simd_internal_vector_size>; +using vector_view_type_3d = Kokkos::View; +using internal_vector_view_type_4d = + Kokkos::View; struct batched_params { int team_size; @@ -163,7 +169,7 @@ typedef struct gemm_simd_args gemm_simd_args_t; * https://developer.arm.com/documentation/101004/2100/Interleave-batch-functions/armpl-dgemm-interleave-batch. */ struct gemm_armpl_args { - default_scalar *mat; + KokkosKernels::default_scalar *mat; armpl_int_t jstrd, istrd, bstrd; }; typedef struct gemm_armpl_args gemm_armpl_args_t; @@ -199,8 +205,8 @@ typedef struct gemm_armpl_args gemm_armpl_args_t; */ struct gemm_args { char transA, transB; - default_scalar alpha; - default_scalar beta; + KokkosKernels::default_scalar alpha; + KokkosKernels::default_scalar beta; view_type_3d A, B, C; batched_params_t bp; // Below are matrices for simd tests @@ -224,9 +230,10 @@ static std::string gemm_csv_header_str = // http://www.icl.utk.edu/~mgates3/docs/lawn41.pdf static inline double __gemm_flop_count(double a_m, double a_n, double b_n) { // TODO: if not Kokkos::complex. - if (std::is_same::value || std::is_same::value || - std::is_same::value || - std::is_same::value) + if (std::is_same::value || + std::is_same::value || + std::is_same::value || + std::is_same::value) return 2 * a_m * b_n * a_n; else // For complex, we need to count 2 flops for each add and 6 flops for each @@ -285,9 +292,9 @@ static void __print_gemm_perf_test_options(options_t options) { printf("options.n = %d\n", options.n); printf("options.blas_args.gemm.gemm_args = %s\n", options.blas_args.gemm.gemm_args.c_str()); printf("options.out_file = %s\n", options.out_file.c_str()); - if (std::is_same::value) + if (std::is_same::value) printf("options.alpha = %lf\n", options.blas_args.gemm.alpha); - else if (std::is_same::value) + else if (std::is_same::value) printf("options.alpha = %f\n", options.blas_args.gemm.alpha); return; } @@ -1140,7 +1147,7 @@ void __do_gemm_parallel_experiment5(options_t options, gemm_args_t gemm_args) { using scalar_type = typename view_type_3d::value_type; constexpr int vl = KokkosBatched::DefaultVectorLength::value; using simd_type = KokkosBatched::Vector, simd_vector_size>; - using simd_view_type = Kokkos::View; + using simd_view_type = Kokkos::View; using functor_type = parallel_batched_gemm_experiment5; uint32_t warm_up_n = options.warm_up_n; @@ -1229,9 +1236,9 @@ void __do_gemm_parallel_experiment6(options_t options, gemm_args_t gemm_args) { KokkosBatched::DefaultVectorLength::value; constexpr int il = KokkosBatched::DefaultInternalVectorLength::value; - using view_type = Kokkos::View; - using vector_view_type = Kokkos::View; - using internal_vector_view_type = Kokkos::View; + using view_type = Kokkos::View; + using vector_view_type = Kokkos::View; + using internal_vector_view_type = Kokkos::View; using functor_type = parallel_batched_gemm_experiment6; @@ -1301,7 +1308,7 @@ void __do_gemm_armpl(options_t options, gemm_args_t gemm_args) { char transa = std::is_same::value ? 'N' : 'T'; char transb = std::is_same::value ? 'N' : 'T'; - if (!std::is_same::value) FATAL_ERROR("only double scalars are supported!"); + if (!std::is_same::value) FATAL_ERROR("only double scalars are supported!"); STATUS; @@ -1362,7 +1369,8 @@ static inline bool __gemm_print_compare_failure(ViewType h_expected, ViewType h_ */ template static inline bool __gemm_do_compare(view_type_3d expected, view_type_3d actual) { - double epsilon = Test::epsilon::value * 1e3; + double epsilon = Kokkos::ArithTraits::eps() * 1e3; + STATUS; typename view_type_3d::HostMirror h_expected = Kokkos::create_mirror_view(expected); @@ -1418,14 +1426,16 @@ static inline void __gemm_copy_simd_view_to_3d_view(gemm_simd_args_t src, dstVie vector_batch_size = src.ivec_4d.extent(0); simd_batch_size = src.ivec_4d.extent(3); last_batch = dst.extent(2); - if (std::is_same::value && remainder == 0) data_layout_same_as_3d_view = true; + if (std::is_same::value && remainder == 0) + data_layout_same_as_3d_view = true; } else { remainder = dst.extent(0) % simd_internal_vector_size; vector_batch_size = src.ivec_4d.extent(3); simd_batch_size = src.ivec_4d.extent(0); last_batch = dst.extent(0); - if (std::is_same::value && remainder == 0) data_layout_same_as_3d_view = true; + if (std::is_same::value && remainder == 0) + data_layout_same_as_3d_view = true; } // When the batch_size is a multiple of the simd_vector_size and the @@ -1448,7 +1458,7 @@ static inline void __gemm_copy_simd_view_to_3d_view(gemm_simd_args_t src, dstVie remainder += simd_internal_vector_size; // Views needed for slow manual copy - using h_view_type_5d = Kokkos::View; + using h_view_type_5d = Kokkos::View; using h_subview_type_2d = Kokkos::View; using h_subview_type_3d = Kokkos::View; using h_subview_type_4d = Kokkos::View; @@ -1458,7 +1468,7 @@ static inline void __gemm_copy_simd_view_to_3d_view(gemm_simd_args_t src, dstVie h_subview_type_2d h_sv2; // TODO: Clean everything below this point up... - if (std::is_same::value) + if (std::is_same::value) h_src_raw = h_view_type_5d((src_scalar_type *)h_src.data(), src.ivec_4d.extent(0), src.ivec_4d.extent(1), src.ivec_4d.extent(2), src.ivec_4d.extent(3), simd_internal_vector_size); else @@ -1468,7 +1478,7 @@ static inline void __gemm_copy_simd_view_to_3d_view(gemm_simd_args_t src, dstVie // The below loops copies each corresponding 2-rank matrix within the simd // view back to the 3-rank view. for (size_t simd_internal_vec_idx = 0; simd_internal_vec_idx < remainder; simd_internal_vec_idx++) { - if (std::is_same::value) + if (std::is_same::value) h_sv0 = Kokkos::subview(h_src_raw, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), simd_internal_vec_idx); else @@ -1599,7 +1609,8 @@ static inline void __gemm_do_verify(options_t options, gemm_args_t gemm_args, vo if (gemm_args.C.data() != nullptr) { #if defined(KOKKOSKERNELS_ENABLE_TPL_ARMPL) && ARMPL_BUILD >= 1058 if (options.test == EXPERIMENT) { - using view_type_2d = Kokkos::View; + using view_type_2d = + Kokkos::View; view_type_2d C; for (int ib = 0; ib < gemm_args.nbatch; ++ib) { for (int i = 0; i < gemm_args.ninter; ++i) { @@ -1703,7 +1714,7 @@ gemm_args_t __do_setup(options_t options, matrix_dims_t dims) { // Use the non-simd 4-rank view type to randomly populate the gemm simd // arguments - using tmp_view_type_4d = Kokkos::View; + using tmp_view_type_4d = Kokkos::View; tmp_view_type_4d tmpA("tmpA", gemm_args.Av.mat_4d.extent(0), gemm_args.Av.mat_4d.extent(1), gemm_args.Av.mat_4d.extent(2), gemm_args.Av.mat_4d.extent(3)); Kokkos::fill_random(tmpA, rand_pool, Kokkos::rand, double>::max()); @@ -1729,7 +1740,7 @@ gemm_args_t __do_setup(options_t options, matrix_dims_t dims) { gemm_args.C = vtc("gemm_args.C", dims.c.k, dims.c.m, dims.c.n); } - using tmp_view_type_3d = Kokkos::View; + using tmp_view_type_3d = Kokkos::View; tmp_view_type_3d tmpA("tmpA", gemm_args.A.extent(0), gemm_args.A.extent(1), gemm_args.A.extent(2)); Kokkos::fill_random(tmpA, rand_pool, Kokkos::rand, double>::max()); tmp_view_type_3d tmpB("tmpB", gemm_args.B.extent(0), gemm_args.B.extent(1), gemm_args.B.extent(2)); @@ -1776,11 +1787,14 @@ gemm_args_t __do_setup(options_t options, matrix_dims_t dims) { gemm_args.B_pl.bstrd = bstrd_B; gemm_args.C_pl.bstrd = bstrd_C; - default_scalar *A_p = (default_scalar *)malloc(sizeof(default_scalar) * bstrd_A * nbatch); - default_scalar *B_p = (default_scalar *)malloc(sizeof(default_scalar) * bstrd_B * nbatch); - default_scalar *C_p = (default_scalar *)malloc(sizeof(default_scalar) * bstrd_C * nbatch); + KokkosKernels::default_scalar *A_p = + (KokkosKernels::default_scalar *)malloc(sizeof(KokkosKernels::default_scalar) * bstrd_A * nbatch); + KokkosKernels::default_scalar *B_p = + (KokkosKernels::default_scalar *)malloc(sizeof(KokkosKernels::default_scalar) * bstrd_B * nbatch); + KokkosKernels::default_scalar *C_p = + (KokkosKernels::default_scalar *)malloc(sizeof(KokkosKernels::default_scalar) * bstrd_C * nbatch); - using view_type_2d = Kokkos::View; + using view_type_2d = Kokkos::View; view_type_2d A, B, C; // Populate interleave-batch matrices @@ -1837,8 +1851,10 @@ void __do_loop_and_invoke(options_t options, void (*fn)(options_t, gemm_args_t)) STATUS; __print_gemm_perf_test_options(options); - std::cout << "SCALAR:" << typeid(default_scalar).name() << ", LAYOUT:" << typeid(default_layout).name() - << ", DEVICE:" << typeid(default_device).name() << ", SPACE:" << typeid(memory_space).name() << std::endl; + std::cout << "SCALAR:" << typeid(KokkosKernels::default_scalar).name() + << ", LAYOUT:" << typeid(KokkosKernels::default_layout).name() + << ", DEVICE:" << typeid(KokkosKernels::default_device).name() << ", SPACE:" << typeid(memory_space).name() + << std::endl; options.out[0] << gemm_csv_header_str << std::endl; @@ -1847,10 +1863,12 @@ void __do_loop_and_invoke(options_t options, void (*fn)(options_t, gemm_args_t)) cur_dims.b.n <= options.stop.b.n && cur_dims.c.m <= options.stop.c.m && cur_dims.c.n <= options.stop.c.n; cur_dims.a.m += options.step, cur_dims.a.n += options.step, cur_dims.b.m += options.step, cur_dims.b.n += options.step, cur_dims.c.m += options.step, cur_dims.c.n += options.step) { - gemm_args = __do_setup(options, cur_dims); + gemm_args = __do_setup(options, cur_dims); if (options.verify) { - __gemm_do_verify(options, gemm_args, fn); + __gemm_do_verify( + options, gemm_args, fn); } else { fn(options, gemm_args); } @@ -1867,21 +1885,25 @@ void __do_loop_and_invoke(options_t options, void (*fn)(options_t, gemm_args_t)) /*************************** External fns **************************/ void do_gemm_serial_blas(options_t options) { STATUS; - __do_loop_and_invoke(options, __do_gemm_serial_blas); + __do_loop_and_invoke( + options, + __do_gemm_serial_blas); return; } void do_gemm_serial_batched(options_t options) { STATUS; - __do_loop_and_invoke(options, __do_gemm_serial_batched); + __do_loop_and_invoke(options, + __do_gemm_serial_batched); return; } void do_gemm_serial_batched_blocked(options_t options) { STATUS; - __do_loop_and_invoke(options, __do_gemm_serial_batched); + __do_loop_and_invoke(options, + __do_gemm_serial_batched); return; } @@ -1893,29 +1915,31 @@ void do_gemm_heuristic_batched_parallel(options_t options) { exit(-EINVAL); } - __do_loop_and_invoke(options, __do_gemm_parallel_batched_heuristic); + __do_loop_and_invoke(options, __do_gemm_parallel_batched_heuristic); return; } void do_gemm_serial_batched_parallel(options_t options) { STATUS; if (options.blas_args.batch_size_last_dim) - __do_loop_and_invoke( - options, __do_gemm_parallel_batched); + __do_loop_and_invoke(options, __do_gemm_parallel_batched); else - __do_loop_and_invoke(options, - __do_gemm_parallel_batched); + __do_loop_and_invoke( + options, + __do_gemm_parallel_batched); return; } void do_gemm_serial_batched_blocked_parallel(options_t options) { STATUS; if (options.blas_args.batch_size_last_dim) - __do_loop_and_invoke( - options, __do_gemm_parallel_batched); + __do_loop_and_invoke(options, __do_gemm_parallel_batched); else - __do_loop_and_invoke(options, - __do_gemm_parallel_batched); + __do_loop_and_invoke( + options, + __do_gemm_parallel_batched); return; } @@ -1925,11 +1949,13 @@ void do_gemm_serial_simd_batched_parallel(options_t options) { // SerialSimdTag options.use_simd = true; if (options.blas_args.batch_size_last_dim) - __do_loop_and_invoke(options, __do_gemm_parallel_batched); + __do_loop_and_invoke(options, + __do_gemm_parallel_batched); else - __do_loop_and_invoke(options, __do_gemm_parallel_batched); + __do_loop_and_invoke(options, + __do_gemm_parallel_batched); return; } @@ -1939,35 +1965,38 @@ void do_gemm_serial_simd_batched_blocked_parallel(options_t options) { // SerialSimdTag options.use_simd = true; if (options.blas_args.batch_size_last_dim) - __do_loop_and_invoke(options, __do_gemm_parallel_batched); + __do_loop_and_invoke(options, + __do_gemm_parallel_batched); else - __do_loop_and_invoke(options, __do_gemm_parallel_batched); + __do_loop_and_invoke(options, + __do_gemm_parallel_batched); return; } -#if defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) && defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__) && \ +#if defined(KOKKOSBATCHED_IMPL_ENABLE_INTEL_MKL) && defined(KOKKOSBATCHED_IMPL_ENABLE_INTEL_MKL_BATCHED) && \ defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__) void do_gemm_serial_batched_compact_mkl_parallel(options_t options) { STATUS; if (options.blas_args.batch_size_last_dim) - __do_loop_and_invoke( - options, - __do_gemm_parallel_batched); + __do_loop_and_invoke(options, + __do_gemm_parallel_batched); else - __do_loop_and_invoke( - options, __do_gemm_parallel_batched); + __do_loop_and_invoke(options, __do_gemm_parallel_batched); return; } #else void do_gemm_serial_batched_compact_mkl_parallel(options_t) { STATUS; -#if !defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) - std::cerr << std::string(__func__) << " disabled since __KOKKOSBATCHED_ENABLE_INTEL_MKL__ is undefined." << std::endl; -#elif !defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__) +#if !defined(KOKKOSBATCHED_IMPL_ENABLE_INTEL_MKL) + std::cerr << std::string(__func__) << " disabled since KOKKOSBATCHED_IMPL_ENABLE_INTEL_MKL is undefined." + << std::endl; +#elif !defined(KOKKOSBATCHED_IMPL_ENABLE_INTEL_MKL_BATCHED) std::cerr << std::string(__func__) - << " disabled since __KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__ is " + << " disabled since KOKKOSBATCHED_IMPL_ENABLE_INTEL_MKL_BATCHED is " "undefined." << std::endl; #elif !defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__) @@ -1983,34 +2012,37 @@ void do_gemm_serial_batched_compact_mkl_parallel(options_t) { void do_gemm_team_batched_parallel(options_t options) { STATUS; if (options.blas_args.batch_size_last_dim) - __do_loop_and_invoke( - options, __do_gemm_parallel_batched); + __do_loop_and_invoke(options, __do_gemm_parallel_batched); else - __do_loop_and_invoke(options, - __do_gemm_parallel_batched); + __do_loop_and_invoke( + options, + __do_gemm_parallel_batched); return; } void do_gemm_team_batched_blocked_parallel(options_t options) { STATUS; if (options.blas_args.batch_size_last_dim) - __do_loop_and_invoke( - options, __do_gemm_parallel_batched); + __do_loop_and_invoke(options, __do_gemm_parallel_batched); else - __do_loop_and_invoke(options, - __do_gemm_parallel_batched); + __do_loop_and_invoke( + options, + __do_gemm_parallel_batched); return; } void do_gemm_team_vector_batched_parallel(options_t options) { STATUS; if (options.blas_args.batch_size_last_dim) - __do_loop_and_invoke( - options, - __do_gemm_parallel_batched); + __do_loop_and_invoke(options, + __do_gemm_parallel_batched); else __do_loop_and_invoke( - options, __do_gemm_parallel_batched); + options, + __do_gemm_parallel_batched); return; } @@ -2019,10 +2051,10 @@ void do_gemm_team_simd_batched_parallel(options_t options) { options.use_simd = true; if (options.blas_args.batch_size_last_dim) __do_loop_and_invoke(options, __do_gemm_parallel_batched); + KokkosKernels::default_device, KokkosBatched::Mode::Team>); else __do_loop_and_invoke(options, __do_gemm_parallel_batched); + KokkosKernels::default_device, KokkosBatched::Mode::Team>); return; } @@ -2031,10 +2063,10 @@ void do_gemm_team_simd_batched_blocked_parallel(options_t options) { options.use_simd = true; if (options.blas_args.batch_size_last_dim) __do_loop_and_invoke(options, __do_gemm_parallel_batched); + KokkosKernels::default_device, KokkosBatched::Mode::Team>); else __do_loop_and_invoke(options, __do_gemm_parallel_batched); + KokkosKernels::default_device, KokkosBatched::Mode::Team>); return; } @@ -2043,7 +2075,7 @@ void do_gemm_team_simd_batched_blocked_parallel(options_t options) { STATUS; __do_loop_and_invoke( options, __do_gemm_parallel_batched); return; +KokkosBatched::Algo::Gemm::Blocked, KokkosKernels::default_device>); return; } */ void do_gemm_experiment_parallel(options_t options) { @@ -2054,23 +2086,23 @@ void do_gemm_experiment_parallel(options_t options) { // __do_loop_and_invoke( // options, __do_gemm_parallel_experiment1); + // BlockingType, KokkosKernels::default_device>); // __do_loop_and_invoke( // options, __do_gemm_parallel_experiment2); + // BlockingType, KokkosKernels::default_device>); // __do_loop_and_invoke( // options, __do_gemm_parallel_experiment3); + // BlockingType, KokkosKernels::default_device>); // __do_loop_and_invoke( // options, __do_gemm_parallel_experiment4); + // BlockingType, KokkosKernels::default_device>); // __do_loop_and_invoke( // options, __do_gemm_parallel_experiment5); + // BlockingType, KokkosKernels::default_device>); // __do_loop_and_invoke( // options, __do_gemm_parallel_experiment6); - __do_loop_and_invoke(options, __do_gemm_armpl); + // BlockingType, KokkosKernels::default_device>); + __do_loop_and_invoke(options, __do_gemm_armpl); } #endif // KOKKOSBLAS3_GEMM_PERF_TEST_H_ diff --git a/packages/kokkos-kernels/perf_test/blas/blas3/KokkosBlas3_gemm_standalone_perf_test.cpp b/packages/kokkos-kernels/perf_test/blas/blas3/KokkosBlas3_gemm_standalone_perf_test.cpp index 22a268c2e6c6..a9f8db5be441 100644 --- a/packages/kokkos-kernels/perf_test/blas/blas3/KokkosBlas3_gemm_standalone_perf_test.cpp +++ b/packages/kokkos-kernels/perf_test/blas/blas3/KokkosBlas3_gemm_standalone_perf_test.cpp @@ -16,7 +16,7 @@ #include "KokkosBlas3_gemm.hpp" #include -#include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_TestStringUtils.hpp" struct Params { int use_cuda = 0; @@ -103,7 +103,8 @@ template void run(int m, int n, int k, int repeat) { using LL = Kokkos::LayoutLeft; using LR = Kokkos::LayoutRight; - std::cout << "** Running GEMM experiments (" << ExecSpace::name() << ") **\n"; + std::cout << "** Running GEMM experiments (" << ExecSpace::name() << " m=" << m << " n=" << n << " k=" << k + << ") **\n"; std::cout << "Running: A LayoutLeft, B LayoutLeft : "; runImpl(m, n, k, repeat); std::cout << "Running: A LayoutLeft, B LayoutRight : "; diff --git a/packages/kokkos-kernels/perf_test/blas/blas3/KokkosBlas3_gemm_standalone_perf_test_benchmark.cpp b/packages/kokkos-kernels/perf_test/blas/blas3/KokkosBlas3_gemm_standalone_perf_test_benchmark.cpp index eee2eea53c1d..6f0737180e0b 100644 --- a/packages/kokkos-kernels/perf_test/blas/blas3/KokkosBlas3_gemm_standalone_perf_test_benchmark.cpp +++ b/packages/kokkos-kernels/perf_test/blas/blas3/KokkosBlas3_gemm_standalone_perf_test_benchmark.cpp @@ -16,7 +16,7 @@ #include "KokkosBlas3_gemm.hpp" #include -#include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_TestStringUtils.hpp" #include "KokkosKernels_perf_test_utilities.hpp" #include "Benchmark_Context.hpp" #include diff --git a/packages/kokkos-kernels/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp b/packages/kokkos-kernels/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp index 3b21fb7e703a..cce1d8036116 100644 --- a/packages/kokkos-kernels/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp +++ b/packages/kokkos-kernels/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp @@ -301,12 +301,12 @@ int main(int argc, char **argv) { double alpha, beta; if (sscanf(optarg, "%lf,%lf", &alpha, &beta) != 2) __blas3_perf_test_input_error(argv, ret, optarg); - options.blas_args.gemm.alpha = static_cast(alpha); - options.blas_args.gemm.beta = static_cast(beta); + options.blas_args.gemm.alpha = static_cast(alpha); + options.blas_args.gemm.beta = static_cast(beta); break; case 'a': // printf("optarg=%s. %d\n", optarg, strncasecmp(optarg, "blas", 4)); - options.blas_args.trmm.alpha = (default_scalar)atof(optarg); + options.blas_args.trmm.alpha = (KokkosKernels::default_scalar)atof(optarg); break; case 'l': for (i = 0; i < LOOP_N; i++) { diff --git a/packages/kokkos-kernels/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp b/packages/kokkos-kernels/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp index 9b23ce4d51fa..084c0759d288 100644 --- a/packages/kokkos-kernels/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp +++ b/packages/kokkos-kernels/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp @@ -64,8 +64,9 @@ static inline int __trmm_impl_flop_count(char side, int b_m, int b_n, int /*a_m* flops = (b_n * (b_n + 1)) * b_m; } - if (std::is_same::value || std::is_same::value || - std::is_same::value) + if (std::is_same::value || + std::is_same::value || + std::is_same::value) return flops; // Account for 6 additional flops when complex numbers are used. @@ -86,8 +87,9 @@ static inline double __trmm_flop_count(char side, double b_m, double b_n, double flops = b_n * b_n * b_m; } - if (std::is_same::value || std::is_same::value || - std::is_same::value) + if (std::is_same::value || + std::is_same::value || + std::is_same::value) return flops; // Account for 6 additional flops when complex numbers are used. @@ -97,10 +99,11 @@ static inline double __trmm_flop_count(char side, double b_m, double b_n, double return flops * 4; } -using view_type_3d = Kokkos::View; +using view_type_3d = + Kokkos::View; struct trmm_args { char side, uplo, trans, diag; - default_scalar alpha; + KokkosKernels::default_scalar alpha; view_type_3d A, B; }; typedef struct trmm_args trmm_args_t; @@ -117,7 +120,8 @@ static void __trmm_output_csv_row(options_t options, trmm_args_t trmm_args, doub double gflops = flops / 1e9; double average_time = time_in_seconds / options.n; double gbytes_in_matrix = - (trmm_args.B.extent(0) * trmm_args.B.extent(1) * trmm_args.B.extent(2) * sizeof(default_scalar)) / 1e9; + (trmm_args.B.extent(0) * trmm_args.B.extent(1) * trmm_args.B.extent(2) * sizeof(KokkosKernels::default_scalar)) / + 1e9; double min_memory_transactions, max_memory_transactions; // Assuming infinite cache size @@ -157,9 +161,9 @@ static void __print_trmm_perf_test_options(options_t options) { printf("options.n = %d\n", options.n); printf("options.blas_args.trmm.trmm_args = %s\n", options.blas_args.trmm.trmm_args.c_str()); printf("options.out_file = %s\n", options.out_file.c_str()); - if (std::is_same::value) + if (std::is_same::value) printf("options.alpha = %lf\n", options.blas_args.trmm.alpha); - else if (std::is_same::value) + else if (std::is_same::value) printf("options.alpha = %f\n", options.blas_args.trmm.alpha); return; } @@ -551,8 +555,8 @@ trmm_args_t __do_setup(options_t options, matrix_dims_t dim) { host_A = Kokkos::create_mirror_view(trmm_args.A); { - Kokkos::View tmp("tmp", trmm_args.A.extent(0), trmm_args.A.extent(1), - trmm_args.A.extent(2)); + Kokkos::View tmp( + "tmp", trmm_args.A.extent(0), trmm_args.A.extent(1), trmm_args.A.extent(2)); Kokkos::fill_random(tmp, rand_pool, Kokkos::rand, double>::max()); Kokkos::deep_copy(host_A, tmp); } @@ -592,8 +596,8 @@ trmm_args_t __do_setup(options_t options, matrix_dims_t dim) { Kokkos::deep_copy(trmm_args.A, host_A); { - Kokkos::View tmp("tmp", trmm_args.B.extent(0), trmm_args.B.extent(1), - trmm_args.B.extent(2)); + Kokkos::View tmp( + "tmp", trmm_args.B.extent(0), trmm_args.B.extent(1), trmm_args.B.extent(2)); Kokkos::fill_random(tmp, rand_pool, Kokkos::rand, double>::max()); Kokkos::deep_copy(trmm_args.B, tmp); } @@ -608,8 +612,9 @@ void __do_loop_and_invoke(options_t options, void (*fn)(options_t, trmm_args_t)) STATUS; __print_trmm_perf_test_options(options); - std::cout << "SCALAR:" << typeid(default_scalar).name() << ", LAYOUT:" << typeid(default_layout).name() - << ", DEVICE:" << typeid(default_device).name() << std::endl; + std::cout << "SCALAR:" << typeid(KokkosKernels::default_scalar).name() + << ", LAYOUT:" << typeid(KokkosKernels::default_layout).name() + << ", DEVICE:" << typeid(KokkosKernels::default_device).name() << std::endl; options.out[0] << trmm_csv_header_str << std::endl; @@ -617,7 +622,8 @@ void __do_loop_and_invoke(options_t options, void (*fn)(options_t, trmm_args_t)) cur_dims.b.m <= options.stop.b.m && cur_dims.b.n <= options.stop.b.n; cur_dims.a.m += options.step, cur_dims.a.n += options.step, cur_dims.b.m += options.step, cur_dims.b.n += options.step) { - trmm_args = __do_setup(options, cur_dims); + trmm_args = __do_setup( + options, cur_dims); fn(options, trmm_args); } return; @@ -626,25 +632,30 @@ void __do_loop_and_invoke(options_t options, void (*fn)(options_t, trmm_args_t)) /*************************** External fns **************************/ void do_trmm_serial_blas(options_t options) { STATUS; - __do_loop_and_invoke(options, __do_trmm_serial_blas); + __do_loop_and_invoke( + options, + __do_trmm_serial_blas); return; } void do_trmm_serial_batched(options_t options) { STATUS; - __do_loop_and_invoke(options, __do_trmm_serial_batched); + __do_loop_and_invoke(options, __do_trmm_serial_batched); return; } void do_trmm_parallel_blas(options_t options) { STATUS; - __do_loop_and_invoke(options, __do_trmm_parallel_blas); + __do_loop_and_invoke(options, __do_trmm_parallel_blas); return; } void do_trmm_parallel_batched(options_t options) { STATUS; - __do_loop_and_invoke(options, __do_trmm_parallel_batched); + __do_loop_and_invoke(options, __do_trmm_parallel_batched); return; } diff --git a/packages/kokkos-kernels/perf_test/blas/blas3/KokkosBlas_trtri_perf_test.hpp b/packages/kokkos-kernels/perf_test/blas/blas3/KokkosBlas_trtri_perf_test.hpp index d2d246055132..6040cd5c43ff 100644 --- a/packages/kokkos-kernels/perf_test/blas/blas3/KokkosBlas_trtri_perf_test.hpp +++ b/packages/kokkos-kernels/perf_test/blas/blas3/KokkosBlas_trtri_perf_test.hpp @@ -59,8 +59,9 @@ static inline double __trtri_impl_flop_count(double a_m, double /*a_n*/) { double flop_count = 0; double flops_per_div, flops_per_mul, flops_per_add; - if (std::is_same::value || std::is_same::value || - std::is_same::value) { + if (std::is_same::value || + std::is_same::value || + std::is_same::value) { flops_per_div = 1; flops_per_mul = 1; flops_per_add = 1; @@ -93,8 +94,9 @@ static inline double __trtri_flop_count(double a_m, double a_n) { exit(255); } - if (std::is_same::value || std::is_same::value || - std::is_same::value) { + if (std::is_same::value || + std::is_same::value || + std::is_same::value) { flops_per_mul = 1; flops_per_add = 1; } else { @@ -110,7 +112,8 @@ static inline double __trtri_flop_count(double a_m, double a_n) { return flops; } -using view_type_3d = Kokkos::View; +using view_type_3d = + Kokkos::View; struct trtri_args { char uplo, diag; view_type_3d A; @@ -145,8 +148,9 @@ static void __print_trtri_perf_test_options(options_t options) { printf("options.n = %d\n", options.n); printf("options.blas_args.trtri.trtri_args = %s\n", options.blas_args.trtri.trtri_args.c_str()); printf("options.out_file = %s\n", options.out_file.c_str()); - std::cout << "SCALAR:" << typeid(default_scalar).name() << ", LAYOUT:" << typeid(default_layout).name() - << ", DEVICE:." << typeid(default_device).name() << std::endl; + std::cout << "SCALAR:" << typeid(KokkosKernels::default_scalar).name() + << ", LAYOUT:" << typeid(KokkosKernels::default_layout).name() << ", DEVICE:." + << typeid(KokkosKernels::default_device).name() << std::endl; #else static void __print_trtri_perf_test_options(options_t) { #endif // TRTRI_PERF_TEST_DEBUG @@ -456,8 +460,9 @@ void __do_loop_and_invoke(options_t options, void (*fn)(options_t, trtri_args_t) STATUS; __print_trtri_perf_test_options(options); - std::cout << "SCALAR:" << typeid(default_scalar).name() << ", LAYOUT:" << typeid(default_layout).name() - << ", DEVICE:." << typeid(default_device).name() << std::endl; + std::cout << "SCALAR:" << typeid(KokkosKernels::default_scalar).name() + << ", LAYOUT:" << typeid(KokkosKernels::default_layout).name() << ", DEVICE:." + << typeid(KokkosKernels::default_device).name() << std::endl; options.out[0] << trtri_csv_header_str << std::endl; @@ -465,7 +470,8 @@ void __do_loop_and_invoke(options_t options, void (*fn)(options_t, trtri_args_t) cur_dims.b.m <= options.stop.b.m && cur_dims.b.n <= options.stop.b.n; cur_dims.a.m += options.step, cur_dims.a.n += options.step, cur_dims.b.m += options.step, cur_dims.b.n += options.step) { - trtri_args = __do_setup(options, cur_dims); + trtri_args = + __do_setup(options, cur_dims); fn(options, trtri_args); } return; @@ -474,25 +480,29 @@ void __do_loop_and_invoke(options_t options, void (*fn)(options_t, trtri_args_t) /*************************** External fns **************************/ void do_trtri_serial_blas(options_t options) { STATUS; - __do_loop_and_invoke(options, __do_trtri_serial_blas); + __do_loop_and_invoke( + options, __do_trtri_serial_blas); return; } void do_trtri_serial_batched(options_t options) { STATUS; - __do_loop_and_invoke(options, __do_trtri_serial_batched); + __do_loop_and_invoke( + options, __do_trtri_serial_batched); return; } void do_trtri_parallel_blas(options_t options) { STATUS; - __do_loop_and_invoke(options, __do_trtri_parallel_blas); + __do_loop_and_invoke( + options, __do_trtri_parallel_blas); return; } void do_trtri_parallel_batched(options_t options) { STATUS; - __do_loop_and_invoke(options, __do_trtri_parallel_batched); + __do_loop_and_invoke( + options, __do_trtri_parallel_batched); return; } diff --git a/packages/kokkos-kernels/perf_test/graph/KokkosGraph_color.cpp b/packages/kokkos-kernels/perf_test/graph/KokkosGraph_color.cpp index 548abd0052f0..0c3706bdfee0 100644 --- a/packages/kokkos-kernels/perf_test/graph/KokkosGraph_color.cpp +++ b/packages/kokkos-kernels/perf_test/graph/KokkosGraph_color.cpp @@ -26,7 +26,7 @@ #include "KokkosSparse_CrsMatrix.hpp" #include "KokkosKernels_TestParameters.hpp" #include "KokkosGraph_Distance1Color.hpp" -#include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_TestStringUtils.hpp" #include "KokkosSparse_IOUtils.hpp" void print_options(std::ostream &os, const char *app_name, unsigned int indent = 0) { diff --git a/packages/kokkos-kernels/perf_test/graph/KokkosGraph_color_d2.cpp b/packages/kokkos-kernels/perf_test/graph/KokkosGraph_color_d2.cpp index dc54250e9e0b..04242f31f0a6 100644 --- a/packages/kokkos-kernels/perf_test/graph/KokkosGraph_color_d2.cpp +++ b/packages/kokkos-kernels/perf_test/graph/KokkosGraph_color_d2.cpp @@ -36,7 +36,7 @@ #include #include #include "KokkosKernels_default_types.hpp" -#include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_TestStringUtils.hpp" #include "KokkosSparse_IOUtils.hpp" using namespace KokkosGraph; @@ -69,9 +69,9 @@ struct D2Parameters { } }; -typedef default_scalar kk_scalar_t; -typedef default_size_type kk_size_type; -typedef default_lno_t kk_lno_t; +using kk_scalar_t = KokkosKernels::default_scalar; +using kk_size_type = KokkosKernels::default_size_type; +using kk_lno_t = KokkosKernels::default_lno_t; using KokkosKernels::Impl::xorshiftHash; diff --git a/packages/kokkos-kernels/perf_test/graph/KokkosGraph_mis_d2.cpp b/packages/kokkos-kernels/perf_test/graph/KokkosGraph_mis_d2.cpp index d45ecb4f1ee1..952f28e64c4d 100644 --- a/packages/kokkos-kernels/perf_test/graph/KokkosGraph_mis_d2.cpp +++ b/packages/kokkos-kernels/perf_test/graph/KokkosGraph_mis_d2.cpp @@ -37,7 +37,7 @@ #include "KokkosSparse_spadd.hpp" #include "KokkosGraph_MIS2.hpp" #include "KokkosKernels_default_types.hpp" -#include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_TestStringUtils.hpp" #include "KokkosSparse_IOUtils.hpp" using namespace KokkosGraph; @@ -202,14 +202,15 @@ int parse_inputs(MIS2Parameters& params, int argc, char** argv) { template void run_mis2(const MIS2Parameters& params) { - using size_type = default_size_type; - using lno_t = default_lno_t; - using exec_space = typename device_t::execution_space; - using mem_space = typename device_t::memory_space; - using crsMat_t = typename KokkosSparse::CrsMatrix; - using lno_view_t = typename crsMat_t::index_type::non_const_type; - using KKH = KokkosKernels::Experimental::KokkosKernelsHandle; + using size_type = KokkosKernels::default_size_type; + using lno_t = KokkosKernels::default_lno_t; + using scalar_type = KokkosKernels::default_scalar; + using exec_space = typename device_t::execution_space; + using mem_space = typename device_t::memory_space; + using crsMat_t = typename KokkosSparse::CrsMatrix; + using lno_view_t = typename crsMat_t::index_type::non_const_type; + using KKH = + KokkosKernels::Experimental::KokkosKernelsHandle; Kokkos::Timer t; crsMat_t A_in = KokkosSparse::Impl::read_kokkos_crst_matrix(params.mtx_file); @@ -219,7 +220,7 @@ void run_mis2(const MIS2Parameters& params) { crsMat_t At_in = KokkosSparse::Impl::transpose_matrix(A_in); crsMat_t A; KKH kkh; - const default_scalar one = Kokkos::ArithTraits::one(); + const scalar_type one = Kokkos::ArithTraits::one(); kkh.create_spadd_handle(false); KokkosSparse::spadd_symbolic(&kkh, A_in, At_in, A); KokkosSparse::spadd_numeric(&kkh, one, A_in, one, At_in, A); diff --git a/packages/kokkos-kernels/perf_test/graph/KokkosGraph_triangle.cpp b/packages/kokkos-kernels/perf_test/graph/KokkosGraph_triangle.cpp index 3b5802a1efd3..84676607d927 100644 --- a/packages/kokkos-kernels/perf_test/graph/KokkosGraph_triangle.cpp +++ b/packages/kokkos-kernels/perf_test/graph/KokkosGraph_triangle.cpp @@ -18,7 +18,7 @@ #include "KokkosGraph_Triangle.hpp" #include "KokkosSparse_CrsMatrix.hpp" #include "KokkosSparse_IOUtils.hpp" //for read_kokkos_crst_graph -#include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_TestStringUtils.hpp" #include "KokkosKernels_TestParameters.hpp" #include "KokkosKernels_perf_test_utilities.hpp" @@ -230,13 +230,13 @@ void run_experiment(int argc, char **argv, perf_test::CommonInputParams) { using namespace KokkosSparse; using mem_space = typename exec_space::memory_space; using device_t = Kokkos::Device; - using lno_t = default_lno_t; - using size_type = default_size_type; - using graph_t = Kokkos::StaticCrsGraph; + using lno_t = KokkosKernels::default_lno_t; + using size_type = KokkosKernels::default_size_type; + using graph_t = Kokkos::StaticCrsGraph; using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle; - if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { + if (KokkosKernels::Impl::is_gpu_exec_space_v) { std::cerr << "** Triangle counting is currently not supported on GPU backends.\n"; return; } diff --git a/packages/kokkos-kernels/perf_test/lapack/KokkosLapack_SVD_benchmark.cpp b/packages/kokkos-kernels/perf_test/lapack/KokkosLapack_SVD_benchmark.cpp index 8336b3b737a8..55aa70b9cfec 100644 --- a/packages/kokkos-kernels/perf_test/lapack/KokkosLapack_SVD_benchmark.cpp +++ b/packages/kokkos-kernels/perf_test/lapack/KokkosLapack_SVD_benchmark.cpp @@ -14,9 +14,11 @@ // //@HEADER -#include "KokkosLapack_svd.hpp" +#include -#include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_IOUtils.hpp" // getRandomBounds +#include "KokkosLapack_svd.hpp" +#include "KokkosKernels_TestStringUtils.hpp" #include "KokkosKernels_perf_test_utilities.hpp" #include @@ -72,7 +74,7 @@ void run_svd_benchmark(benchmark::State& state, const svd_parameters& svd_params // Initialize A with random numbers double randStart = 0, randEnd = 0; - Test::getRandomBounds(10.0, randStart, randEnd); + KokkosKernels::Impl::getRandomBounds(10.0, randStart, randEnd); Kokkos::fill_random(A, rand_pool, randStart, randEnd); for (auto _ : state) { diff --git a/packages/kokkos-kernels/perf_test/ode/KokkosODE_BDF.cpp b/packages/kokkos-kernels/perf_test/ode/KokkosODE_BDF.cpp index 60cc5e01ff26..0b1ecdf2765e 100644 --- a/packages/kokkos-kernels/perf_test/ode/KokkosODE_BDF.cpp +++ b/packages/kokkos-kernels/perf_test/ode/KokkosODE_BDF.cpp @@ -16,7 +16,7 @@ #include "KokkosODE_BDF.hpp" -#include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_TestStringUtils.hpp" #include "KokkosKernels_perf_test_utilities.hpp" #include diff --git a/packages/kokkos-kernels/perf_test/ode/KokkosODE_RK.cpp b/packages/kokkos-kernels/perf_test/ode/KokkosODE_RK.cpp index 83635d7af697..7a5fd33999de 100644 --- a/packages/kokkos-kernels/perf_test/ode/KokkosODE_RK.cpp +++ b/packages/kokkos-kernels/perf_test/ode/KokkosODE_RK.cpp @@ -16,7 +16,7 @@ #include "KokkosODE_RungeKutta.hpp" -#include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_TestStringUtils.hpp" #include "KokkosKernels_perf_test_utilities.hpp" #include @@ -92,7 +92,7 @@ struct chem_model_2 { } }; -template +template struct RKSolve_wrapper { using ode_params = KokkosODE::Experimental::ODE_params; @@ -103,10 +103,11 @@ struct RKSolve_wrapper { scalar_type tstart, tend; vec_type y_old, y_new, tmp; mv_type kstack; + count_type count; RKSolve_wrapper(const ode_type& my_ode_, const table_type& table_, const ode_params& params_, const scalar_type tstart_, const scalar_type tend_, const vec_type& y_old_, const vec_type& y_new_, - const vec_type& tmp_, const mv_type& kstack_) + const vec_type& tmp_, const mv_type& kstack_, const count_type& count_) : my_ode(my_ode_), table(table_), params(params_), @@ -115,7 +116,8 @@ struct RKSolve_wrapper { y_old(y_old_), y_new(y_new_), tmp(tmp_), - kstack(kstack_) {} + kstack(kstack_), + count(count_) {} KOKKOS_FUNCTION void operator()(const int idx) const { @@ -124,10 +126,12 @@ struct RKSolve_wrapper { auto local_y_new = Kokkos::subview(y_new, Kokkos::pair(2 * idx, 2 * idx + 1)); auto local_tmp = Kokkos::subview(tmp, Kokkos::pair(2 * idx, 2 * idx + 1)); auto local_kstack = Kokkos::subview(kstack, Kokkos::ALL(), Kokkos::pair(2 * idx, 2 * idx + 1)); + auto local_count = Kokkos::subview(count, idx, Kokkos::ALL()); // Run Runge-Kutta time integrator - KokkosODE::Impl::RKSolve( - my_ode, table, params, tstart, tend, local_y_old, local_y_new, local_tmp, local_kstack); + // This should be replaced by a call to the public interface! + KokkosODE::Impl::RKSolve(my_ode, table, params, tstart, tend, local_y_old, local_y_new, local_tmp, local_kstack, + local_count.data()); } }; @@ -149,6 +153,7 @@ void run_ode_chem(benchmark::State& state, const rk_input_parameters& inputs) { using mv_type = Kokkos::View; using table_type = KokkosODE::Impl::ButcherTableau<4, 5, 1>; using ode_params = KokkosODE::Experimental::ODE_params; + using count_type = Kokkos::View; const int num_odes = inputs.num_odes; const int model = inputs.model; @@ -159,6 +164,7 @@ void run_ode_chem(benchmark::State& state, const rk_input_parameters& inputs) { const int neqs = chem_model.neqs; const int num_steps = 15000; const double dt = 0.1; + count_type count("time steps count", num_odes, 1); table_type table; ode_params params(num_steps); @@ -176,7 +182,7 @@ void run_ode_chem(benchmark::State& state, const rk_input_parameters& inputs) { Kokkos::RangePolicy my_policy(0, num_odes); RKSolve_wrapper solve_wrapper(chem_model, table, params, chem_model.tstart, chem_model.tend, y_old, y_new, tmp, - kstack); + kstack, count); Kokkos::Timer time; time.reset(); @@ -206,6 +212,7 @@ void run_ode_chem(benchmark::State& state, const rk_input_parameters& inputs) { const int neqs = chem_model.neqs; const int num_steps = 15000; const double dt = 0.1; + count_type count("time steps count", num_odes, 1); table_type table; ode_params params(num_steps); @@ -228,7 +235,7 @@ void run_ode_chem(benchmark::State& state, const rk_input_parameters& inputs) { Kokkos::RangePolicy my_policy(0, num_odes); RKSolve_wrapper solve_wrapper(chem_model, table, params, chem_model.tstart, chem_model.tend, y_old, y_new, tmp, - kstack); + kstack, count); Kokkos::Timer time; time.reset(); diff --git a/packages/kokkos-kernels/perf_test/sparse/CMakeLists.txt b/packages/kokkos-kernels/perf_test/sparse/CMakeLists.txt index 514ef0ed8253..70b5928efb4f 100644 --- a/packages/kokkos-kernels/perf_test/sparse/CMakeLists.txt +++ b/packages/kokkos-kernels/perf_test/sparse/CMakeLists.txt @@ -103,7 +103,6 @@ KOKKOSKERNELS_ADD_EXECUTABLE( KOKKOSKERNELS_ADD_EXECUTABLE( sparse_gs SOURCES KokkosSparse_gs.cpp - TESTONLYLIBS kokkoskernelsperf_gtest ) KOKKOSKERNELS_ADD_EXECUTABLE( diff --git a/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_block_pcg.cpp b/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_block_pcg.cpp index 528f4a20ea2c..91a599ad7119 100644 --- a/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_block_pcg.cpp +++ b/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_block_pcg.cpp @@ -23,7 +23,7 @@ #include "KokkosKernels_Utils.hpp" #include "KokkosSparse_IOUtils.hpp" -#include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_TestStringUtils.hpp" #define MAXVAL 1 @@ -62,7 +62,7 @@ crsMat_t create_crs_matrix(char *mtx_bin_file) { cols_view_t columns_view("colsmap_view", ne); values_view_t values_view("values_view", ne); - if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { + if (KokkosKernels::Impl::is_gpu_exec_space_v) { typename row_map_view_t::HostMirror hr = Kokkos::create_mirror_view(rowmap_view); typename cols_view_t::HostMirror hc = Kokkos::create_mirror_view(columns_view); typename values_view_t::HostMirror hv = Kokkos::create_mirror_view(values_view); diff --git a/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_gs.cpp b/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_gs.cpp index b4633fd4c07b..0ed984a8f807 100644 --- a/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_gs.cpp +++ b/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_gs.cpp @@ -14,10 +14,17 @@ // //@HEADER +#include +#include +#include +#include +#include + #include #include + #include -#include +#include #include #include #include @@ -25,11 +32,7 @@ #include #include "KokkosKernels_default_types.hpp" #include "KokkosSparse_IOUtils.hpp" -#include -#include -#include -#include -#include +#include "KokkosKernels_TestMatrixUtils.hpp" using std::cout; using std::string; @@ -151,9 +154,9 @@ crsMat_t generateLongRowMatrix(const GS_Parameters& params) { template void runGS(const GS_Parameters& params) { - typedef default_scalar scalar_t; - typedef default_lno_t lno_t; - typedef default_size_type size_type; + using scalar_t = KokkosKernels::default_scalar; + using lno_t = KokkosKernels::default_lno_t; + using size_type = KokkosKernels::default_size_type; typedef typename device_t::execution_space exec_space; typedef typename device_t::memory_space mem_space; typedef KokkosKernels::Experimental::KokkosKernelsHandle diff --git a/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_kk_spmv.cpp b/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_kk_spmv.cpp index 999a913e0629..a0a9e3456a5d 100644 --- a/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_kk_spmv.cpp +++ b/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_kk_spmv.cpp @@ -32,9 +32,9 @@ #include #include "KokkosKernels_default_types.hpp" -using Scalar = default_scalar; -using Ordinal = default_lno_t; -using Offset = default_size_type; +using Scalar = KokkosKernels::default_scalar; +using Ordinal = KokkosKernels::default_lno_t; +using Offset = KokkosKernels::default_size_type; using KAT = Kokkos::ArithTraits; struct SPMVBenchmarking { @@ -200,7 +200,7 @@ void print_help() { int main(int argc, char** argv) { SPMVBenchmarking sb; char layout; - if (std::is_same::value) + if (std::is_same::value) layout = 'L'; else layout = 'R'; diff --git a/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_mdf.cpp b/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_mdf.cpp index 4db45ce55f26..0cd33f2eca94 100644 --- a/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_mdf.cpp +++ b/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_mdf.cpp @@ -19,7 +19,7 @@ #include "KokkosKernels_Handle.hpp" #include "KokkosSparse_IOUtils.hpp" #include "KokkosSparse_Utils_cusparse.hpp" -#include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_TestStringUtils.hpp" #include "KokkosKernels_perf_test_utilities.hpp" #include "KokkosSparse_mdf.hpp" diff --git a/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_par_ilut.cpp b/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_par_ilut.cpp index 73a6c6fc5e6a..5143e13e28c4 100644 --- a/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_par_ilut.cpp +++ b/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_par_ilut.cpp @@ -54,9 +54,9 @@ using KokkosSparse::Experimental::spiluk_symbolic; using KokkosSparse::Experimental::SPILUKAlgorithm; // Build up useful types -using scalar_t = default_scalar; -using lno_t = default_lno_t; -using size_type = default_size_type; +using scalar_t = KokkosKernels::default_scalar; +using lno_t = KokkosKernels::default_lno_t; +using size_type = KokkosKernels::default_size_type; using exe_space = Kokkos::DefaultExecutionSpace; using mem_space = typename exe_space::memory_space; using device = Kokkos::Device; @@ -126,7 +126,7 @@ void run_par_ilut_test(benchmark::State& state, KernelHandle& kh, const sp_matri #ifdef USE_GINKGO /////////////////////////////////////////////////////////////////////////////// -static constexpr bool IS_GPU = KokkosKernels::Impl::kk_is_gpu_exec_space(); +static constexpr bool IS_GPU = KokkosKernels::Impl::is_gpu_exec_space_v; using ginkgo_exec = std::conditional_t; @@ -277,7 +277,7 @@ int test_par_ilut_perf(const std::string& matrix_file, int rows, int nnz_per_row // Now that we have A, we can set team_size if (team_size == -1) { - team_size = KokkosKernels::Impl::kk_is_gpu_exec_space() ? nnz_per_row : 1; + team_size = KokkosKernels::Impl::is_gpu_exec_space_v ? nnz_per_row : 1; } KokkosSparse::sort_crs_matrix(A); diff --git a/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_pcg.cpp b/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_pcg.cpp index d299952cb4e5..00466ae6c978 100644 --- a/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_pcg.cpp +++ b/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_pcg.cpp @@ -20,21 +20,22 @@ #include "KokkosKernels_Utils.hpp" #include "KokkosKernels_IOUtils.hpp" #include "KokkosKernels_default_types.hpp" -#include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_TestStringUtils.hpp" #include "KokkosSparse_IOUtils.hpp" #include #define MAXVAL 1 template -scalar_view_t create_x_vector(default_lno_t nv, default_scalar max_value = 1.0) { +scalar_view_t create_x_vector(KokkosKernels::default_lno_t nv, KokkosKernels::default_scalar max_value = 1.0) { scalar_view_t kok_x("X", nv); typename scalar_view_t::HostMirror h_x = Kokkos::create_mirror_view(kok_x); - for (default_lno_t i = 0; i < nv; ++i) { - default_scalar r = static_cast(rand()) / static_cast(RAND_MAX / max_value); - h_x(i) = r; + for (KokkosKernels::default_lno_t i = 0; i < nv; ++i) { + KokkosKernels::default_scalar r = static_cast(rand()) / + static_cast(RAND_MAX / max_value); + h_x(i) = r; } Kokkos::deep_copy(kok_x, h_x); return kok_x; @@ -57,9 +58,9 @@ void run_experiment(crsMat_t crsmat, int clusterSize, bool useSequential) { typedef typename lno_view_t::value_type size_type; typedef typename scalar_view_t::value_type scalar_t; - default_lno_t nv = crsmat.numRows(); - scalar_view_t kok_x_original = create_x_vector(nv, MAXVAL); - scalar_view_t kok_b_vector = create_y_vector(crsmat, kok_x_original); + KokkosKernels::default_lno_t nv = crsmat.numRows(); + scalar_view_t kok_x_original = create_x_vector(nv, MAXVAL); + scalar_view_t kok_b_vector = create_y_vector(crsmat, kok_x_original); // create X vector scalar_view_t kok_x_vector("kok_x_vector", nv); @@ -220,13 +221,15 @@ enum { template void run_pcg(int *cmdline, const char *mtx_file) { - default_lno_t nv = 0, ne = 0; - default_lno_t *xadj, *adj; - default_scalar *ew; + using lno_t = KokkosKernels::default_lno_t; + lno_t nv = 0, ne = 0; + lno_t *xadj, *adj; + KokkosKernels::default_scalar *ew; - KokkosSparse::Impl::read_matrix(&nv, &ne, &xadj, &adj, &ew, mtx_file); + KokkosSparse::Impl::read_matrix(&nv, &ne, &xadj, &adj, &ew, mtx_file); - typedef typename KokkosSparse::CrsMatrix + typedef typename KokkosSparse::CrsMatrix crsMat_t; typedef typename crsMat_t::StaticCrsGraphType graph_t; @@ -243,11 +246,11 @@ void run_pcg(int *cmdline, const char *mtx_file) { typename cols_view_t::HostMirror hc = Kokkos::create_mirror_view(columns_view); typename values_view_t::HostMirror hv = Kokkos::create_mirror_view(values_view); - for (default_lno_t i = 0; i <= nv; ++i) { + for (lno_t i = 0; i <= nv; ++i) { hr(i) = xadj[i]; } - for (default_lno_t i = 0; i < ne; ++i) { + for (lno_t i = 0; i < ne; ++i) { hc(i) = adj[i]; hv(i) = ew[i]; } diff --git a/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_pcg.hpp b/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_pcg.hpp index 6d0b9180a0f7..12234d5773f0 100644 --- a/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_pcg.hpp +++ b/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_pcg.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef KOKKOS_EXAMPLE_CG_SOLVE -#define KOKKOS_EXAMPLE_CG_SOLVE +#ifndef KOKKOSSPARSE_PCG_HPP +#define KOKKOSSPARSE_PCG_HPP #include #include @@ -468,4 +468,4 @@ void pcgsolve(KernelHandle_t &kh, const crsMatrix_t &crsMat, const y_vector_t &y //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- -#endif /* #ifndef KOKKOS_EXAMPLE_CG_SOLVE */ +#endif /* #ifndef KOKKOSSPARSE_PCG_HPP */ diff --git a/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_sort_crs.cpp b/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_sort_crs.cpp index cd3ed91521d5..db5a0559b3a0 100644 --- a/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_sort_crs.cpp +++ b/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_sort_crs.cpp @@ -57,9 +57,9 @@ void run_experiment(int argc, char** argv, const CommonInputParams& common_param using mem_space = typename exec_space::memory_space; using device_t = typename Kokkos::Device; - using size_type = default_size_type; - using lno_t = default_lno_t; - using scalar_t = default_scalar; + using size_type = KokkosKernels::default_size_type; + using lno_t = KokkosKernels::default_lno_t; + using scalar_t = KokkosKernels::default_scalar; using crsMat_t = KokkosSparse::CrsMatrix; using graph_t = typename crsMat_t::StaticCrsGraphType; @@ -79,8 +79,12 @@ void run_experiment(int argc, char** argv, const CommonInputParams& common_param // Randomly shuffle the entries within each row, so that the rows aren't // already sorted. Leave the values alone; this changes the matrix numerically // but this doesn't affect sorting. + std::random_device rd; + std::mt19937 g(rd()); for (lno_t i = 0; i < m; i++) { - std::random_shuffle(entriesHost.data() + i, entriesHost.data() + i + 1); + const size_type rowBegin = rowmapHost(i); + const size_type rowEnd = rowmapHost(i + 1); + std::shuffle(entriesHost.data() + rowBegin, entriesHost.data() + rowEnd, g); } Kokkos::deep_copy(shuffledEntries, entriesHost); exec_space exec; diff --git a/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_spadd.cpp b/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_spadd.cpp index ac7bfe636b9d..063c151812d2 100644 --- a/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_spadd.cpp +++ b/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_spadd.cpp @@ -20,7 +20,7 @@ #include "KokkosSparse_IOUtils.hpp" #include "KokkosSparse_Utils_cusparse.hpp" #include "KokkosSparse_Utils_mkl.hpp" -#include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_TestStringUtils.hpp" #include "KokkosKernels_perf_test_utilities.hpp" #include "KokkosSparse_spadd.hpp" @@ -129,9 +129,9 @@ void run_experiment(int argc, char** argv, CommonInputParams) { using mem_space = typename exec_space::memory_space; using device_t = typename Kokkos::Device; - using size_type = default_size_type; - using lno_t = default_lno_t; - using scalar_t = default_scalar; + using size_type = KokkosKernels::default_size_type; + using lno_t = KokkosKernels::default_lno_t; + using scalar_t = KokkosKernels::default_scalar; using crsMat_t = KokkosSparse::CrsMatrix; using KernelHandle = diff --git a/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_spgemm.cpp b/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_spgemm.cpp index cff88e4998ce..2c63127c2dcf 100644 --- a/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_spgemm.cpp +++ b/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_spgemm.cpp @@ -23,7 +23,7 @@ #include "KokkosBlas1_nrminf.hpp" #include "KokkosBlas1_axpby.hpp" #include "KokkosKernels_TestParameters.hpp" -#include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_TestStringUtils.hpp" #include "KokkosKernels_perf_test_utilities.hpp" #define TRANSPOSEFIRST false @@ -253,9 +253,9 @@ void run_spgemm(int argc, char** argv, perf_test::CommonInputParams) { using namespace KokkosSparse::Experimental; using MemSpace = typename ExecSpace::memory_space; - using size_type = default_size_type; - using lno_t = default_lno_t; - using scalar_t = default_scalar; + using size_type = KokkosKernels::default_size_type; + using lno_t = KokkosKernels::default_lno_t; + using scalar_t = KokkosKernels::default_scalar; using device_t = Kokkos::Device; using crsMat_t = typename KokkosSparse::CrsMatrix; using KernelHandle = diff --git a/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_spgemm_jacobi.cpp b/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_spgemm_jacobi.cpp index 46610a8dab1b..a5646603b725 100644 --- a/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_spgemm_jacobi.cpp +++ b/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_spgemm_jacobi.cpp @@ -18,7 +18,7 @@ #include "KokkosKernels_default_types.hpp" #include "KokkosKernels_IOUtils.hpp" #include "KokkosSparse_run_spgemm_jacobi.hpp" -#include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_TestStringUtils.hpp" void print_options() { std::cerr << "Options\n" << std::endl; @@ -201,9 +201,9 @@ int parse_inputs(KokkosKernels::Experiment::Parameters& params, int argc, char** } int main(int argc, char** argv) { - using size_type = default_size_type; - using lno_t = default_lno_t; - using scalar_t = default_scalar; + using size_type = KokkosKernels::default_size_type; + using lno_t = KokkosKernels::default_lno_t; + using scalar_t = KokkosKernels::default_scalar; KokkosKernels::Experiment::Parameters params; diff --git a/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_spiluk.cpp b/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_spiluk.cpp index 4aa05f9522a8..13c6eefea283 100644 --- a/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_spiluk.cpp +++ b/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_spiluk.cpp @@ -54,7 +54,7 @@ enum { DEFAULT, CUSPARSE, LVLSCHED_RP, LVLSCHED_TP1 /*, LVLSCHED_TP2*/ }; int test_spiluk_perf(std::vector tests, std::string afilename, int kin, int team_size, int /*vector_length*/, /*int idx_offset,*/ int loop) { - typedef default_scalar scalar_t; + using scalar_t = KokkosKernels::default_scalar; typedef int lno_t; typedef int size_type; typedef Kokkos::DefaultExecutionSpace execution_space; diff --git a/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_spmv_benchmark.cpp b/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_spmv_benchmark.cpp index 5ecf2f724822..6a2f92165bc1 100644 --- a/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_spmv_benchmark.cpp +++ b/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_spmv_benchmark.cpp @@ -21,7 +21,7 @@ #include #include #include "KokkosKernels_default_types.hpp" -#include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_TestStringUtils.hpp" #include "KokkosKernels_perf_test_utilities.hpp" // Headers for benchmark library diff --git a/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_spmv_merge.cpp b/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_spmv_merge.cpp index b04142ed08d3..8e509939a2c8 100644 --- a/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_spmv_merge.cpp +++ b/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_spmv_merge.cpp @@ -154,8 +154,8 @@ void print_help() { } int main(int argc, char** argv) { - using Scalar = default_scalar; - using lno_t = default_lno_t; + using Scalar = KokkosKernels::default_scalar; + using lno_t = KokkosKernels::default_lno_t; bool compare = false; lno_t loop = 100; diff --git a/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_spmv_struct.cpp b/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_spmv_struct.cpp index 862f6c8175ed..1e95bd7de5e2 100644 --- a/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_spmv_struct.cpp +++ b/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_spmv_struct.cpp @@ -151,9 +151,9 @@ int main(int argc, char **argv) { Kokkos::initialize(argc, argv); { - typedef default_size_type size_type; - typedef default_lno_t lno_t; - typedef default_scalar Scalar; + using size_type = KokkosKernels::default_size_type; + using lno_t = KokkosKernels::default_lno_t; + using Scalar = KokkosKernels::default_scalar; typedef KokkosSparse::CrsMatrix matrix_type; typedef typename Kokkos::View mv_type; // typedef typename diff --git a/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_spmv_test.hpp b/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_spmv_test.hpp index 616543117af2..f6008b9cd810 100644 --- a/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_spmv_test.hpp +++ b/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_spmv_test.hpp @@ -54,10 +54,10 @@ void armpl_matvec(AType /*A*/, XType x, YType y, spmv_additional_data* data); enum { KOKKOS, MKL, ARMPL, CUSPARSE, KK_KERNELS, KK_KERNELS_INSP, KK_INSP, OMP_STATIC, OMP_DYNAMIC, OMP_INSP }; enum { AUTO, DYNAMIC, STATIC }; -using Scalar = default_scalar; -using Ordinal = default_lno_t; -using Offset = default_size_type; -using Layout = default_layout; +using Scalar = KokkosKernels::default_scalar; +using Ordinal = KokkosKernels::default_lno_t; +using Offset = KokkosKernels::default_size_type; +using Layout = KokkosKernels::default_layout; #ifdef KOKKOSKERNELS_ENABLE_TESTS_AND_PERFSUITE std::vector make_spmv_kernel_base(const rajaperf::RunParams& params); diff --git a/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_sptrsv.cpp b/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_sptrsv.cpp index eb36b67fbf0d..ae5c0ad7ebeb 100644 --- a/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_sptrsv.cpp +++ b/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_sptrsv.cpp @@ -98,9 +98,9 @@ void check_entries_sorted(const RowMapType drow_map, const EntriesType dentries) int test_sptrsv_perf(std::vector tests, const std::string &lfilename, const std::string &ufilename, const int team_size, const int vector_length, const int /*idx_offset*/, const int loop, const int chain_threshold = 0, const float /*dense_row_percent*/ = -1.0) { - typedef default_scalar scalar_t; - typedef default_lno_t lno_t; - typedef default_size_type size_type; + using scalar_t = KokkosKernels::default_scalar; + using lno_t = KokkosKernels::default_lno_t; + using size_type = KokkosKernels::default_size_type; typedef Kokkos::DefaultExecutionSpace execution_space; typedef typename execution_space::memory_space memory_space; diff --git a/packages/kokkos-kernels/perf_test/sparse/spmv/Kokkos_SPMV.hpp b/packages/kokkos-kernels/perf_test/sparse/spmv/Kokkos_SPMV.hpp index 50f3f0cdd49e..6e4578f93edf 100644 --- a/packages/kokkos-kernels/perf_test/sparse/spmv/Kokkos_SPMV.hpp +++ b/packages/kokkos-kernels/perf_test/sparse/spmv/Kokkos_SPMV.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef KOKKOS_SPMV_HPP_ -#define KOKKOS_SPMV_HPP_ +#ifndef KOKKOSKERNELS_PERFTEST_KOKKOS_SPMV_HPP_ +#define KOKKOSKERNELS_PERFTEST_KOKKOS_SPMV_HPP_ template struct SPMV_Functor { @@ -142,4 +142,4 @@ void kokkos_matvec(AType A, XType x, YType y, int rows_per_thread, int team_size Kokkos::parallel_for("KokkosSparse::PerfTest::SpMV", policy, func); } -#endif /* KOKKOS_SPMV_HPP_ */ +#endif /* KOKKOSKERNELS_PERFTEST_KOKKOS_SPMV_HPP_ */ diff --git a/packages/kokkos-kernels/perf_test/sparse/spmv/Kokkos_SPMV_Inspector.hpp b/packages/kokkos-kernels/perf_test/sparse/spmv/Kokkos_SPMV_Inspector.hpp index 6d82b9782ba8..258f1a10878c 100644 --- a/packages/kokkos-kernels/perf_test/sparse/spmv/Kokkos_SPMV_Inspector.hpp +++ b/packages/kokkos-kernels/perf_test/sparse/spmv/Kokkos_SPMV_Inspector.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef KOKKOS_SPMV_INSPECTOR_HPP_ -#define KOKKOS_SPMV_INSPECTOR_HPP_ +#ifndef KOKKOSKERNELS_PERFTEST_SPMV_INSPECTOR_HPP +#define KOKKOSKERNELS_PERFTEST_SPMV_INSPECTOR_HPP #include "Kokkos_SPMV.hpp" @@ -125,4 +125,4 @@ void kk_inspector_matvec(AType A, XType x, YType y, int team_size, int vector_le Kokkos::parallel_for("KokkosSparse::PerfTest::SpMV_Inspector", policy, func); } -#endif /* KOKKOS_SPMV_HPP_ */ +#endif // KOKKOSKERNELS_PERFTEST_SPMV_INSPECTOR_HPP diff --git a/packages/kokkos-kernels/perf_test/sparse/spmv/OpenMPSmartStatic_SPMV.hpp b/packages/kokkos-kernels/perf_test/sparse/spmv/OpenMPSmartStatic_SPMV.hpp index 1967d0d39230..37a1fd05ed7b 100644 --- a/packages/kokkos-kernels/perf_test/sparse/spmv/OpenMPSmartStatic_SPMV.hpp +++ b/packages/kokkos-kernels/perf_test/sparse/spmv/OpenMPSmartStatic_SPMV.hpp @@ -20,6 +20,8 @@ #ifdef KOKKOS_ENABLE_OPENMP #include +#include +#include #define OMP_BENCH_RESTRICT __restrict__ @@ -33,8 +35,16 @@ void establishSmartSchedule(AType A) { // Generate a schedule Ordinal* rowSizes = NULL; - posix_memalign((void**)&rowSizes, 64, sizeof(int) * A.numRows()); - posix_memalign((void**)&threadStarts, 128, sizeof(int) * (omp_get_max_threads() + 1)); + if (posix_memalign((void**)&rowSizes, 64, sizeof(int) * A.numRows())) { + std::stringstream ss; + ss << __FILE__ << ":" << __LINE__ << " posix_memalign failed"; + throw std::runtime_error(ss.str()); + } + if (posix_memalign((void**)&threadStarts, 128, sizeof(int) * (omp_get_max_threads() + 1))) { + std::stringstream ss; + ss << __FILE__ << ":" << __LINE__ << " posix_memalign failed"; + throw std::runtime_error(ss.str()); + } for (int i = 0; i < omp_get_max_threads(); ++i) { threadStarts[i] = A.numRows(); diff --git a/packages/kokkos-kernels/scripts/cm_test_all_sandia b/packages/kokkos-kernels/scripts/cm_test_all_sandia index 55749a67bb0c..f22d82ed6a27 100755 --- a/packages/kokkos-kernels/scripts/cm_test_all_sandia +++ b/packages/kokkos-kernels/scripts/cm_test_all_sandia @@ -746,22 +746,22 @@ elif [ "$MACHINE" = "solo" ]; then module load cmake BASE_MODULE_LIST="cmake,/" - BASE_MODULE_LIST_LLVM="cmake,/,gnu/10.2.1" + BASE_MODULE_LIST_LLVM="cmake,/,gnu/10.3.1" BASE_MODULE_LIST_INTEL="cmake,/" ONEAPI_WARNING_FLAGS="" - GNU102_MODULE_TPL_LIST="$BASE_MODULE_LIST,openblas/0.3.21" + GNU103_MODULE_TPL_LIST="$BASE_MODULE_LIST,openblas/0.3.21" if [ "$SPOT_CHECK" = "True" ]; then - COMPILERS=("gnu/10.2.1 $BASE_MODULE_LIST "Threads_Serial,OpenMP" g++ $GNU_WARNING_FLAGS" + COMPILERS=("gnu/10.3.1 $BASE_MODULE_LIST "Threads_Serial,OpenMP" g++ $GNU_WARNING_FLAGS" "llvm/10.0.1 $BASE_MODULE_LIST_LLVM "Threads_Serial" clang++ $CLANG_WARNING_FLAGS" ) elif [ "$SPOT_CHECK_TPLS" = "True" ]; then COMPILERS=("intel/19.1 $BASE_MODULE_LIST_INTEL,mkl/19.1 "OpenMP,Threads" icpc $INTEL_WARNING_FLAGS" - "gnu/10.2.1 $GNU102_MODULE_TPL_LIST "OpenMP_Serial" g++ $GNU_WARNING_FLAGS" + "gnu/10.3.1 $GNU103_MODULE_TPL_LIST "OpenMP_Serial" g++ $GNU_WARNING_FLAGS" ) else - COMPILERS=("gnu/10.2.1 $BASE_MODULE_LIST $GNU_BUILD_LIST g++ $GNU_WARNING_FLAGS" + COMPILERS=("gnu/10.3.1 $BASE_MODULE_LIST $GNU_BUILD_LIST g++ $GNU_WARNING_FLAGS" "gnu/11.2.1 $BASE_MODULE_LIST $GNU_BUILD_LIST g++ $GNU_WARNING_FLAGS" "gnu/12.1.1 $BASE_MODULE_LIST $GNU_BUILD_LIST g++ $GNU_WARNING_FLAGS" "llvm/10.0.1 $BASE_MODULE_LIST_LLVM $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" diff --git a/packages/kokkos-kernels/sparse/CMakeLists.txt b/packages/kokkos-kernels/sparse/CMakeLists.txt index 97076655f7e2..1e34990bb52d 100644 --- a/packages/kokkos-kernels/sparse/CMakeLists.txt +++ b/packages/kokkos-kernels/sparse/CMakeLists.txt @@ -18,13 +18,6 @@ LIST(APPEND SOURCES sparse/tpls/KokkosKernels_tpl_handles.cpp) #that should be instantiated based on input options #Generate @X@ variables in the template X.hpp.in and X.cpp.in #files containing the list of all needed macros -KOKKOSKERNELS_GENERATE_ETI(Sparse_sptrsv_solve sptrsv_solve - COMPONENTS sparse - HEADER_LIST ETI_HEADERS - SOURCE_LIST SOURCES - TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES -) - KOKKOSKERNELS_GENERATE_ETI(Sparse_spmv_struct spmv COMPONENTS sparse HEADER_LIST ETI_HEADERS @@ -160,6 +153,13 @@ KOKKOSKERNELS_GENERATE_ETI(Sparse_sptrsv_symbolic sptrsv_symbolic TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES ) +KOKKOSKERNELS_GENERATE_ETI(Sparse_sptrsv_solve sptrsv_solve + COMPONENTS sparse + HEADER_LIST ETI_HEADERS + SOURCE_LIST SOURCES + TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES +) + KOKKOSKERNELS_GENERATE_ETI(Sparse_trsv trsv COMPONENTS sparse HEADER_LIST ETI_HEADERS diff --git a/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_bspgemm_numeric_eti_spec_decl.hpp.in b/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_bspgemm_numeric_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..2fdcd740e2d8 --- /dev/null +++ b/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_bspgemm_numeric_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSSPARSE_BSPGEMM_NUMERIC_ETI_SPEC_DECL_HPP_ +#define KOKKOSSPARSE_BSPGEMM_NUMERIC_ETI_SPEC_DECL_HPP_ +namespace KokkosSparse { +namespace Impl { +@SPARSE_BSPGEMM_NUMERIC_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_gauss_seidel_apply_eti_spec_decl.hpp.in b/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_gauss_seidel_apply_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..9be44095f0c4 --- /dev/null +++ b/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_gauss_seidel_apply_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSSPARSE_GAUSS_SEIDEL_APPLY_ETI_SPEC_DECL_HPP_ +#define KOKKOSSPARSE_GAUSS_SEIDEL_APPLY_ETI_SPEC_DECL_HPP_ +namespace KokkosSparse { +namespace Impl { +@SPARSE_GAUSS_SEIDEL_APPLY_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_gauss_seidel_numeric_eti_spec_decl.hpp.in b/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_gauss_seidel_numeric_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..1e3befcc89c1 --- /dev/null +++ b/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_gauss_seidel_numeric_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSSPARSE_GAUSS_SEIDEL_NUMERIC_ETI_SPEC_DECL_HPP_ +#define KOKKOSSPARSE_GAUSS_SEIDEL_NUMERIC_ETI_SPEC_DECL_HPP_ +namespace KokkosSparse { +namespace Impl { +@SPARSE_GAUSS_SEIDEL_NUMERIC_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_gauss_seidel_symbolic_eti_spec_decl.hpp.in b/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_gauss_seidel_symbolic_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..493740dfb2ea --- /dev/null +++ b/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_gauss_seidel_symbolic_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSSPARSE_GAUSS_SEIDEL_SYMBOLIC_ETI_SPEC_DECL_HPP_ +#define KOKKOSSPARSE_GAUSS_SEIDEL_SYMBOLIC_ETI_SPEC_DECL_HPP_ +namespace KokkosSparse { +namespace Impl { +@SPARSE_GAUSS_SEIDEL_SYMBOLIC_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_gmres_eti_spec_avail.hpp.in b/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_gmres_eti_spec_avail.hpp.in index 16b0bf66b745..97f3ac20dfa2 100644 --- a/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_gmres_eti_spec_avail.hpp.in +++ b/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_gmres_eti_spec_avail.hpp.in @@ -1,5 +1,3 @@ -#ifndef KOKKOSSPARSE_GMRES_ETI_SPEC_AVAIL_HPP_ -#define KOKKOSSPARSE_GMRES_ETI_SPEC_AVAIL_HPP_ /* //@HEADER // ************************************************************************ @@ -18,11 +16,11 @@ //@HEADER */ +#ifndef KOKKOSSPARSE_GMRES_ETI_SPEC_AVAIL_HPP_ +#define KOKKOSSPARSE_GMRES_ETI_SPEC_AVAIL_HPP_ namespace KokkosSparse { namespace Impl { - @SPARSE_GMRES_ETI_AVAIL_BLOCK@ - } // Impl } // KokkosSparse #endif // KOKKOSSPARSE_GMRES_ETI_SPEC_AVAIL_HPP_ diff --git a/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_gmres_eti_spec_decl.hpp.in b/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_gmres_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..96fdca40b4b4 --- /dev/null +++ b/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_gmres_eti_spec_decl.hpp.in @@ -0,0 +1,26 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +*/ + +#ifndef KOKKOSSPARSE_GMRES_ETI_SPEC_DECL_HPP_ +#define KOKKOSSPARSE_GMRES_ETI_SPEC_DECL_HPP_ +namespace KokkosSparse { +namespace Impl { +@SPARSE_GMRES_ETI_DECL_BLOCK@ +} // Impl +} // KokkosSparse +#endif // KOKKOSSPARSE_GMRES_ETI_SPEC_DECL_HPP_ diff --git a/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_par_ilut_numeric_eti_spec_decl.hpp.in b/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_par_ilut_numeric_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..0fd514c1ca4d --- /dev/null +++ b/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_par_ilut_numeric_eti_spec_decl.hpp.in @@ -0,0 +1,26 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSSPARSE_PAR_ILUT_NUMERIC_ETI_SPEC_DECL_HPP_ +#define KOKKOSSPARSE_PAR_ILUT_NUMERIC_ETI_SPEC_DECL_HPP_ +namespace KokkosSparse { +namespace Impl { + +@SPARSE_PAR_ILUT_NUMERIC_ETI_DECL_BLOCK@ + +} // Impl +} // KokkosSparse +#endif // KOKKOSSPARSE_PAR_ILUT_NUMERIC_ETI_SPEC_DECL_HPP_ diff --git a/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_par_ilut_symbolic_eti_spec_decl.hpp.in b/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_par_ilut_symbolic_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..c30fe10f824c --- /dev/null +++ b/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_par_ilut_symbolic_eti_spec_decl.hpp.in @@ -0,0 +1,26 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSSPARSE_PAR_ILUT_SYMBOLIC_ETI_SPEC_DECL_HPP_ +#define KOKKOSSPARSE_PAR_ILUT_SYMBOLIC_ETI_SPEC_DECL_HPP_ +namespace KokkosSparse { +namespace Impl { + +@SPARSE_PAR_ILUT_SYMBOLIC_ETI_DECL_BLOCK@ + +} // Impl +} // KokkosSparse +#endif // KOKKOSSPARSE_PAR_ILUT_SYMBOLIC_ETI_SPEC_DECL_HPP_ diff --git a/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_spadd_numeric_eti_spec_decl.hpp.in b/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_spadd_numeric_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..43b1da79d1eb --- /dev/null +++ b/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_spadd_numeric_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSSPARSE_SPADD_NUMERIC_ETI_SPEC_DECL_HPP_ +#define KOKKOSSPARSE_SPADD_NUMERIC_ETI_SPEC_DECL_HPP_ +namespace KokkosSparse { +namespace Impl { +@SPARSE_SPADD_NUMERIC_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_spadd_symbolic_eti_spec_decl.hpp.in b/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_spadd_symbolic_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..131960272e0f --- /dev/null +++ b/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_spadd_symbolic_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSSPARSE_SPADD_SYMBOLIC_ETI_SPEC_DECL_HPP_ +#define KOKKOSSPARSE_SPADD_SYMBOLIC_ETI_SPEC_DECL_HPP_ +namespace KokkosSparse { +namespace Impl { +@SPARSE_SPADD_SYMBOLIC_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_spgemm_jacobi_eti_spec_decl.hpp.in b/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_spgemm_jacobi_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..6356f7c4388c --- /dev/null +++ b/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_spgemm_jacobi_eti_spec_decl.hpp.in @@ -0,0 +1,25 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSSPARSE_SPGEMM_JACOBI_ETI_SPEC_DECL_HPP_ +#define KOKKOSSPARSE_SPGEMM_JACOBI_ETI_SPEC_DECL_HPP_ + +namespace KokkosSparse { +namespace Impl { +@SPARSE_SPGEMM_JACOBI_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_spgemm_noreuse_eti_spec_decl.hpp.in b/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_spgemm_noreuse_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..2ca1ecf07bc4 --- /dev/null +++ b/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_spgemm_noreuse_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSSPARSE_SPGEMM_NOREUSE_ETI_SPEC_DECL_HPP_ +#define KOKKOSSPARSE_SPGEMM_NOREUSE_ETI_SPEC_DECL_HPP_ +namespace KokkosSparse { +namespace Impl { +@SPARSE_SPGEMM_NOREUSE_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_spgemm_numeric_eti_spec_decl.hpp.in b/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_spgemm_numeric_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..af422e6fe57d --- /dev/null +++ b/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_spgemm_numeric_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSSPARSE_SPGEMM_NUMERIC_ETI_SPEC_DECL_HPP_ +#define KOKKOSSPARSE_SPGEMM_NUMERIC_ETI_SPEC_DECL_HPP_ +namespace KokkosSparse { +namespace Impl { +@SPARSE_SPGEMM_NUMERIC_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_spgemm_symbolic_eti_spec_decl.hpp.in b/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_spgemm_symbolic_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..2f3870e948e4 --- /dev/null +++ b/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_spgemm_symbolic_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSSPARSE_SPGEMM_SYMBOLIC_ETI_SPEC_DECL_HPP_ +#define KOKKOSSPARSE_SPGEMM_SYMBOLIC_ETI_SPEC_DECL_HPP_ +namespace KokkosSparse { +namespace Impl { +@SPARSE_SPGEMM_SYMBOLIC_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_spiluk_numeric_eti_spec_decl.hpp.in b/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_spiluk_numeric_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..4cc6d57e9be7 --- /dev/null +++ b/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_spiluk_numeric_eti_spec_decl.hpp.in @@ -0,0 +1,26 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSSPARSE_SPILUK_NUMERIC_ETI_SPEC_DECL_HPP_ +#define KOKKOSSPARSE_SPILUK_NUMERIC_ETI_SPEC_DECL_HPP_ +namespace KokkosSparse { +namespace Impl { + +@SPARSE_SPILUK_NUMERIC_ETI_DECL_BLOCK@ + +} // Impl +} // KokkosSparse +#endif // KOKKOSSPARSE_SPILUK_NUMERIC_ETI_SPEC_DECL_HPP_ diff --git a/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_spiluk_symbolic_eti_spec_decl.hpp.in b/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_spiluk_symbolic_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..bfffae9dc0d8 --- /dev/null +++ b/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_spiluk_symbolic_eti_spec_decl.hpp.in @@ -0,0 +1,26 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSSPARSE_SPILUK_SYMBOLIC_ETI_SPEC_DECL_HPP_ +#define KOKKOSSPARSE_SPILUK_SYMBOLIC_ETI_SPEC_DECL_HPP_ +namespace KokkosSparse { +namespace Impl { + +@SPARSE_SPILUK_SYMBOLIC_ETI_DECL_BLOCK@ + +} // Impl +} // KokkosSparse +#endif // KOKKOSSPARSE_SPILUK_SYMBOLIC_ETI_SPEC_DECL_HPP_ diff --git a/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_bsrmatrix_eti_spec_decl.hpp.in b/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_bsrmatrix_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..ad3166a1e9ba --- /dev/null +++ b/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_bsrmatrix_eti_spec_decl.hpp.in @@ -0,0 +1,26 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSSPARSE_SPMV_BSRMATRIX_ETI_SPEC_DECL_HPP_ +#define KOKKOSSPARSE_SPMV_BSRMATRIX_ETI_SPEC_DECL_HPP_ +namespace KokkosSparse { +namespace Impl { +// clang-format off +@SPARSE_SPMV_BSRMATRIX_ETI_DECL_BLOCK@ +// clang-format on +} // namespace Impl +} // namespace KokkosSparse +#endif diff --git a/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_eti_spec_decl.hpp.in b/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..14813536f040 --- /dev/null +++ b/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSSPARSE_SPMV_ETI_SPEC_DECL_HPP_ +#define KOKKOSSPARSE_SPMV_ETI_SPEC_DECL_HPP_ +namespace KokkosSparse { +namespace Impl { +@SPARSE_SPMV_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_mv_bsrmatrix_eti_spec_decl.hpp.in b/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_mv_bsrmatrix_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..9a44ea21e555 --- /dev/null +++ b/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_mv_bsrmatrix_eti_spec_decl.hpp.in @@ -0,0 +1,27 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSSPARSE_SPMV_MV_BSRMATRIX_ETI_SPEC_DECL_HPP_ +#define KOKKOSSPARSE_SPMV_MV_BSRMATRIX_ETI_SPEC_DECL_HPP_ + +namespace KokkosSparse { +namespace Impl { +// clang-format off +@SPARSE_SPMV_MV_BSRMATRIX_ETI_DECL_BLOCK@ +// clang-format on +} // namespace Impl +} // namespace KokkosSparse +#endif diff --git a/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_mv_eti_spec_decl.hpp.in b/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_mv_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..af58d3e7fce9 --- /dev/null +++ b/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_mv_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSSPARSE_SPMV_MV_ETI_SPEC_DECL_HPP_ +#define KOKKOSSPARSE_SPMV_MV_ETI_SPEC_DECL_HPP_ +namespace KokkosSparse { +namespace Impl { +@SPARSE_SPMV_MV_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_mv_struct_eti_spec_decl.hpp.in b/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_mv_struct_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..11ba625f3cf6 --- /dev/null +++ b/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_mv_struct_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSSPARSE_SPMV_MV_STRUCT_ETI_SPEC_DECL_HPP_ +#define KOKKOSSPARSE_SPMV_MV_STRUCT_ETI_SPEC_DECL_HPP_ +namespace KokkosSparse { +namespace Impl { +@SPARSE_SPMV_MV_STRUCT_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_struct_eti_spec_decl.hpp.in b/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_struct_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..a03fcf586e7e --- /dev/null +++ b/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_struct_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSSPARSE_SPMV_STRUCT_ETI_SPEC_DECL_HPP_ +#define KOKKOSSPARSE_SPMV_STRUCT_ETI_SPEC_DECL_HPP_ +namespace KokkosSparse { +namespace Impl { +@SPARSE_SPMV_STRUCT_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_sptrsv_solve_eti_spec_decl.hpp.in b/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_sptrsv_solve_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..aa3d2b2cef36 --- /dev/null +++ b/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_sptrsv_solve_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSSPARSE_SPTRSV_SOLVE_ETI_SPEC_DECL_HPP_ +#define KOKKOSSPARSE_SPTRSV_SOLVE_ETI_SPEC_DECL_HPP_ +namespace KokkosSparse { +namespace Impl { +@SPARSE_SPTRSV_SOLVE_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_sptrsv_symbolic_eti_spec_decl.hpp.in b/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_sptrsv_symbolic_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..4c48c895a1e1 --- /dev/null +++ b/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_sptrsv_symbolic_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSSPARSE_SPTRSV_SYMBOLIC_ETI_SPEC_DECL_HPP_ +#define KOKKOSSPARSE_SPTRSV_SYMBOLIC_ETI_SPEC_DECL_HPP_ +namespace KokkosSparse { +namespace Impl { +@SPARSE_SPTRSV_SYMBOLIC_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_trsv_eti_spec_decl.hpp.in b/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_trsv_eti_spec_decl.hpp.in new file mode 100644 index 000000000000..5b24a276d0c0 --- /dev/null +++ b/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_trsv_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSSPARSE_TRSV_ETI_SPEC_DECL_HPP_ +#define KOKKOSSPARSE_TRSV_ETI_SPEC_DECL_HPP_ +namespace KokkosSparse { +namespace Impl { +@SPARSE_TRSV_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/packages/kokkos-kernels/sparse/impl/KokkosSparse_bspgemm_impl.hpp b/packages/kokkos-kernels/sparse/impl/KokkosSparse_bspgemm_impl.hpp index ae0604cff44b..7f40d2ffc2ee 100644 --- a/packages/kokkos-kernels/sparse/impl/KokkosSparse_bspgemm_impl.hpp +++ b/packages/kokkos-kernels/sparse/impl/KokkosSparse_bspgemm_impl.hpp @@ -91,7 +91,7 @@ class KokkosBSPGEMM : public KokkosSPGEMM alignof(nnz_lno_t)) ? (alignof(scalar_t) - alignof(nnz_lno_t)) : 0; - static constexpr bool exec_gpu = KokkosKernels::Impl::kk_is_gpu_exec_space(); + static constexpr bool exec_gpu = KokkosKernels::Impl::is_gpu_exec_space_v; private: nnz_lno_t block_dim; diff --git a/packages/kokkos-kernels/sparse/impl/KokkosSparse_bspgemm_impl_kkmem.hpp b/packages/kokkos-kernels/sparse/impl/KokkosSparse_bspgemm_impl_kkmem.hpp index 1e2e59815163..b306c219f177 100644 --- a/packages/kokkos-kernels/sparse/impl/KokkosSparse_bspgemm_impl_kkmem.hpp +++ b/packages/kokkos-kernels/sparse/impl/KokkosSparse_bspgemm_impl_kkmem.hpp @@ -376,7 +376,7 @@ struct KokkosBSPGEMM thread_shmem_key_size) { - volatile nnz_lno_t *tmp = NULL; + nnz_lno_t *tmp = NULL; // size_t tid = get_thread_id(row_index); // the code gets internal compiler error on gcc 4.7.2 // assuming that this part only runs on GPUs for now, below fix @@ -386,10 +386,7 @@ struct KokkosBSPGEMM max_first_level_hash_size) { { while (tmp == NULL) { Kokkos::single( Kokkos::PerTeam(teamMember), - [&](volatile nnz_lno_t *&memptr) { - memptr = (volatile nnz_lno_t *)(memory_space.allocate_chunk(row_index)); - }, - tmp); + [&](nnz_lno_t *&memptr) { memptr = (nnz_lno_t *)(memory_space.allocate_chunk(row_index)); }, tmp); } global_acc_row_keys = (nnz_lno_t *)(tmp); global_acc_row_vals = KokkosKernels::Impl::alignPtrTo(tmp + pow2_hash_size); @@ -613,9 +607,9 @@ struct KokkosBSPGEMM max_first_level_hash_size) insert_is_on = false; fail = 0; break; @@ -635,9 +629,9 @@ struct KokkosBSPGEMM max_first_level_hash_size) insert_is_on = false; fail = 0; break; @@ -657,9 +651,10 @@ struct KokkosBSPGEMMspgemm_algorithm == SPGEMM_KK || SPGEMM_KK_LP == this->spgemm_algorithm) { - if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { + if (KokkosKernels::Impl::is_gpu_exec_space_v) { // then chose the best method and parameters. size_type average_row_nnz = 0; size_t average_row_flops = 0; @@ -1207,7 +1203,7 @@ void KokkosBSPGEMM< // END OF SHARED MEMORY SIZE CALCULATIONS // required memory for L2 - if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { + if (KokkosKernels::Impl::is_gpu_exec_space_v) { if (algorithm_to_run == SPGEMM_KK_MEMORY_SPREADTEAM) { tmp_max_nnz = 1; } else if (algorithm_to_run == SPGEMM_KK_MEMORY_BIGSPREADTEAM) { @@ -1257,7 +1253,7 @@ void KokkosBSPGEMM< KokkosKernels::Impl::PoolType my_pool_type = KokkosKernels::Impl::OneThread2OneChunk; - if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { + if (KokkosKernels::Impl::is_gpu_exec_space_v) { my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; } @@ -1288,7 +1284,7 @@ void KokkosBSPGEMM< } timer1.reset(); - if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { + if (KokkosKernels::Impl::is_gpu_exec_space_v) { if (algorithm_to_run == SPGEMM_KK_MEMORY_SPREADTEAM) { if (thread_shmem_key_size <= 0) { std::cout << "KokkosBSPGEMM_numeric_hash SPGEMM_KK_MEMORY_SPREADTEAM: " diff --git a/packages/kokkos-kernels/sparse/impl/KokkosSparse_bspgemm_impl_speed.hpp b/packages/kokkos-kernels/sparse/impl/KokkosSparse_bspgemm_impl_speed.hpp index 98dca331c682..a358d5253206 100644 --- a/packages/kokkos-kernels/sparse/impl/KokkosSparse_bspgemm_impl_speed.hpp +++ b/packages/kokkos-kernels/sparse/impl/KokkosSparse_bspgemm_impl_speed.hpp @@ -451,7 +451,7 @@ void KokkosBSPGEMM< Kokkos::Timer numeric_speed_timer_with_free; - if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { + if (KokkosKernels::Impl::is_gpu_exec_space_v) { // allocate memory for begins and next to be used by the hashmap nnz_lno_temp_work_view_t beginsC(Kokkos::view_alloc(Kokkos::WithoutInitializing, "C keys"), valuesC_.extent(0)); nnz_lno_temp_work_view_t nextsC(Kokkos::view_alloc(Kokkos::WithoutInitializing, "C nexts"), valuesC_.extent(0)); diff --git a/packages/kokkos-kernels/sparse/impl/KokkosSparse_bspgemm_numeric_spec.hpp b/packages/kokkos-kernels/sparse/impl/KokkosSparse_bspgemm_numeric_spec.hpp index 2315339858de..2d2d70955396 100644 --- a/packages/kokkos-kernels/sparse/impl/KokkosSparse_bspgemm_numeric_spec.hpp +++ b/packages/kokkos-kernels/sparse/impl/KokkosSparse_bspgemm_numeric_spec.hpp @@ -243,6 +243,8 @@ struct BSPGEMM_NUMERIC >, \ false, true>; +#include + #define KOKKOSSPARSE_BSPGEMM_NUMERIC_ETI_SPEC_INST(SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, \ EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ template struct BSPGEMM_NUMERIC< \ diff --git a/packages/kokkos-kernels/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp b/packages/kokkos-kernels/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp index fa4fa4a54ea1..2b15dcaa005e 100644 --- a/packages/kokkos-kernels/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp +++ b/packages/kokkos-kernels/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp @@ -514,7 +514,7 @@ class PointGaussSeidel { }); #if KOKKOSSPARSE_IMPL_PRINTDEBUG - if (!KokkosKernels::Impl::kk_is_gpu_exec_space() && + if (!KokkosKernels::Impl::is_gpu_exec_space_v && (ii == 0 || (block_size == 1 && ii < 2))) { std::cout << "\n\n\nrow:" << ii * block_size + i; std::cout << "\nneighbors:"; @@ -606,8 +606,8 @@ class PointGaussSeidel { nnz_lno_t color = t.league_rank(); nnz_lno_t colorBegin = color_xadj(color); nnz_lno_t colorLen = color_xadj(color + 1) - colorBegin; - KokkosKernels::TeamBitonicSort(color_adj.data() + colorBegin, colorLen, t, comp); - t.team_barrier(); + Kokkos::Experimental::sort_team( + t, Kokkos::subview(color_adj, Kokkos::make_pair(colorBegin, colorBegin + colorLen)), comp); // Now that the color set is sorted, count how many long rows there were nnz_lno_t numLongRows; Kokkos::parallel_reduce( @@ -897,7 +897,7 @@ class PointGaussSeidel { nnz_lno_t num_values_in_l2 = 0; nnz_lno_t num_big_rows = 0; - if (!KokkosKernels::Impl::kk_is_gpu_exec_space()) { + if (!KokkosKernels::Impl::is_gpu_exec_space_v) { // again, if it is on CPUs, we make L1 as big as we need. size_t l1mem = 1; while (l1mem < level_1_mem) { @@ -937,7 +937,7 @@ class PointGaussSeidel { KOKKOSKERNELS_MACRO_MIN(num_large_rows, (size_type)(my_exec_space.concurrency() / suggested_vector_size)); // std::cout << "num_big_rows:" << num_big_rows << std::endl; - if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { + if (KokkosKernels::Impl::is_gpu_exec_space_v) { // check if we have enough memory for this. lower the concurrency if // we do not have enugh memory. size_t free_byte; @@ -1208,7 +1208,7 @@ class PointGaussSeidel { // this!!!!!!!!!!!!!!!!!! change fill_matrix_numeric so that they store // the internal matrix as above. the rest will wok fine. - if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { + if (KokkosKernels::Impl::is_gpu_exec_space_v) { Kokkos::parallel_for("KokkosSparse::GaussSeidel::Team_fill_matrix_numeric", team_policy_t(my_exec_space, (num_rows + rows_per_team - 1) / rows_per_team, suggested_team_size, suggested_vector_size), @@ -1239,7 +1239,7 @@ class PointGaussSeidel { Get_Matrix_Diagonals gmd(newxadj_, newadj_, permuted_adj_vals, permuted_inverse_diagonal, this->num_rows, rows_per_team, block_size, block_matrix_size); - if (KokkosKernels::Impl::kk_is_gpu_exec_space() || block_size > 1) { + if (KokkosKernels::Impl::is_gpu_exec_space_v || block_size > 1) { Kokkos::parallel_for("KokkosSparse::GaussSeidel::team_get_matrix_diagonals", team_policy_t(my_exec_space, (num_rows + rows_per_team - 1) / rows_per_team, suggested_team_size, suggested_vector_size), diff --git a/packages/kokkos-kernels/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp b/packages/kokkos-kernels/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp index 0581a332cf7a..6f62de44b134 100644 --- a/packages/kokkos-kernels/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp +++ b/packages/kokkos-kernels/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp @@ -282,6 +282,8 @@ struct GAUSS_SEIDEL_APPLY >, \ false, true>; +#include + #define KOKKOSSPARSE_GAUSS_SEIDEL_SYMBOLIC_ETI_SPEC_INST(SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, \ EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ template struct GAUSS_SEIDEL_SYMBOLIC< \ @@ -321,6 +323,8 @@ struct GAUSS_SEIDEL_APPLY >, \ false, true>; +#include + #define KOKKOSSPARSE_GAUSS_SEIDEL_NUMERIC_ETI_SPEC_INST(SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, \ EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ template struct GAUSS_SEIDEL_NUMERIC< \ @@ -383,6 +387,8 @@ struct GAUSS_SEIDEL_APPLY >, \ false, true>; +#include + #define KOKKOSSPARSE_GAUSS_SEIDEL_APPLY_ETI_SPEC_INST(SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, \ EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ template struct GAUSS_SEIDEL_APPLY< \ diff --git a/packages/kokkos-kernels/sparse/impl/KokkosSparse_getDiagCopyWithOffsets_impl.hpp b/packages/kokkos-kernels/sparse/impl/KokkosSparse_getDiagCopyWithOffsets_impl.hpp index f4ed853e670f..0c20595892ef 100644 --- a/packages/kokkos-kernels/sparse/impl/KokkosSparse_getDiagCopyWithOffsets_impl.hpp +++ b/packages/kokkos-kernels/sparse/impl/KokkosSparse_getDiagCopyWithOffsets_impl.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef KOKKOS_SPARSE_IMPL_GETDIAGCOPYWITHOFFSETS_HPP_ -#define KOKKOS_SPARSE_IMPL_GETDIAGCOPYWITHOFFSETS_HPP_ +#ifndef KOKKOSSPARSE_IMPL_GETDIAGCOPYWITHOFFSETS_HPP_ +#define KOKKOSSPARSE_IMPL_GETDIAGCOPYWITHOFFSETS_HPP_ #include "KokkosSparse_CrsMatrix.hpp" #include "KokkosSparse_OrdinalTraits.hpp" @@ -124,4 +124,4 @@ struct CrsMatrixGetDiagCopyWithOffsets { } // namespace Impl } // namespace KokkosSparse -#endif // KOKKOS_SPARSE_IMPL_GETDIAGCOPYWITHOFFSETS_HPP_ +#endif // KOKKOSSPARSE_IMPL_GETDIAGCOPYWITHOFFSETS_HPP_ diff --git a/packages/kokkos-kernels/sparse/impl/KokkosSparse_gmres_spec.hpp b/packages/kokkos-kernels/sparse/impl/KokkosSparse_gmres_spec.hpp index ef9dd508bb69..8e9dbe504197 100644 --- a/packages/kokkos-kernels/sparse/impl/KokkosSparse_gmres_spec.hpp +++ b/packages/kokkos-kernels/sparse/impl/KokkosSparse_gmres_spec.hpp @@ -128,6 +128,8 @@ struct GMRES >, \ false, true>; +#include + #define KOKKOSSPARSE_GMRES_ETI_SPEC_INST(SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ MEM_SPACE_TYPE) \ template struct GMRES< \ diff --git a/packages/kokkos-kernels/sparse/impl/KokkosSparse_mdf_impl.hpp b/packages/kokkos-kernels/sparse/impl/KokkosSparse_mdf_impl.hpp index 4949887a7ddd..537dd354681e 100644 --- a/packages/kokkos-kernels/sparse/impl/KokkosSparse_mdf_impl.hpp +++ b/packages/kokkos-kernels/sparse/impl/KokkosSparse_mdf_impl.hpp @@ -568,7 +568,8 @@ struct MDF_compute_list_length { team.team_reduce(Kokkos::Sum(updateIdx)); // Sort update list - KokkosKernels::TeamBitonicSort(&update_list(0), updateIdx, team); + Kokkos::Experimental::sort_team( + team, Kokkos::subview(update_list, Kokkos::make_pair(0, updateIdx))); } { size_type numEntrU = 0; diff --git a/packages/kokkos-kernels/sparse/impl/KokkosSparse_par_ilut_numeric_spec.hpp b/packages/kokkos-kernels/sparse/impl/KokkosSparse_par_ilut_numeric_spec.hpp index c347b24647ba..471dedfb7637 100644 --- a/packages/kokkos-kernels/sparse/impl/KokkosSparse_par_ilut_numeric_spec.hpp +++ b/packages/kokkos-kernels/sparse/impl/KokkosSparse_par_ilut_numeric_spec.hpp @@ -147,6 +147,8 @@ struct PAR_ILUT_NUMERIC >, \ false, true>; +#include + #define KOKKOSSPARSE_PAR_ILUT_NUMERIC_ETI_SPEC_INST(SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, \ EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ template struct PAR_ILUT_NUMERIC< \ diff --git a/packages/kokkos-kernels/sparse/impl/KokkosSparse_par_ilut_symbolic_spec.hpp b/packages/kokkos-kernels/sparse/impl/KokkosSparse_par_ilut_symbolic_spec.hpp index dfb66ca9c9e6..8b816e35bea1 100644 --- a/packages/kokkos-kernels/sparse/impl/KokkosSparse_par_ilut_symbolic_spec.hpp +++ b/packages/kokkos-kernels/sparse/impl/KokkosSparse_par_ilut_symbolic_spec.hpp @@ -115,6 +115,8 @@ struct PAR_ILUT_SYMBOLIC >, \ false, true>; +#include + #define KOKKOSSPARSE_PAR_ILUT_SYMBOLIC_ETI_SPEC_INST(SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, \ EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ template struct PAR_ILUT_SYMBOLIC< \ diff --git a/packages/kokkos-kernels/sparse/impl/KokkosSparse_partitioning_impl.hpp b/packages/kokkos-kernels/sparse/impl/KokkosSparse_partitioning_impl.hpp index b926b7452317..f336f42b06cc 100644 --- a/packages/kokkos-kernels/sparse/impl/KokkosSparse_partitioning_impl.hpp +++ b/packages/kokkos-kernels/sparse/impl/KokkosSparse_partitioning_impl.hpp @@ -113,7 +113,7 @@ struct BalloonClustering { auto state = randPool.get_state(); do { root = state.rand(numRows); - } while (!Kokkos::atomic_compare_exchange_strong(&vertClusters(root), numClusters, i)); + } while (numClusters != Kokkos::atomic_compare_exchange(&vertClusters(root), numClusters, i)); randPool.free_state(state); distances(root) = 0; pressure(root) = 1; @@ -125,7 +125,7 @@ struct BalloonClustering { nnz_lno_t cluster = state.rand(numClusters); randPool.free_state(state); vertClusters(i) = cluster; - Kokkos::atomic_increment(&clusterCounts(cluster)); + Kokkos::atomic_inc(&clusterCounts(cluster)); distances(i) = numRows; pressure(i) = 0.1; } @@ -175,8 +175,8 @@ struct BalloonClustering { // this cluster will take over weakNei if (vertLocks.set(i)) { if (vertLocks.set(weakNei)) { - Kokkos::atomic_increment(&clusterCounts(cluster)); - if (weakNeiCluster != numClusters) Kokkos::atomic_decrement(&clusterCounts(weakNeiCluster)); + Kokkos::atomic_inc(&clusterCounts(cluster)); + if (weakNeiCluster != numClusters) Kokkos::atomic_dec(&clusterCounts(weakNeiCluster)); vertClusters(weakNei) = cluster; pressure(i) -= pressure(weakNei); pressure(weakNei) = pressure(i); diff --git a/packages/kokkos-kernels/sparse/impl/KokkosSparse_sort_crs_impl.hpp b/packages/kokkos-kernels/sparse/impl/KokkosSparse_sort_crs_impl.hpp index edde2e93a571..a90885a9fa96 100644 --- a/packages/kokkos-kernels/sparse/impl/KokkosSparse_sort_crs_impl.hpp +++ b/packages/kokkos-kernels/sparse/impl/KokkosSparse_sort_crs_impl.hpp @@ -340,7 +340,7 @@ bool useBulkSortHeuristic(Ordinal avgDeg, Ordinal maxDeg) { // * GPU execution space, HIP is enabled, but no ROCTHRUST // * GPU execution space, HIP is enabled, and GPU is GFX942 // (Kokkos seems to require thrust when CUDA is enabled) - if constexpr (KokkosKernels::Impl::kk_is_gpu_exec_space()) { + if constexpr (KokkosKernels::Impl::is_gpu_exec_space_v) { #if (defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ONEDPL_HAS_SORT_BY_KEY)) || \ (defined(KOKKOS_ENABLE_HIP) && !defined(KOKKOS_ENABLE_ROCTHRUST)) || \ (defined(KOKKOS_ENABLE_HIP) && defined(KOKKOS_ARCH_AMD_GFX942)) diff --git a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spadd_numeric_spec.hpp b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spadd_numeric_spec.hpp index 1d4ccee2205e..a5367bc562a1 100644 --- a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spadd_numeric_spec.hpp +++ b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spadd_numeric_spec.hpp @@ -145,6 +145,8 @@ struct SPADD_NUMERIC >, \ false, true>; +#include + #define KOKKOSSPARSE_SPADD_NUMERIC_ETI_SPEC_INST(SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ MEM_SPACE_TYPE) \ template struct SPADD_NUMERIC< \ diff --git a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spadd_symbolic_impl.hpp b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spadd_symbolic_impl.hpp index bde3b0253106..9ba03dae9a2c 100644 --- a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spadd_symbolic_impl.hpp +++ b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spadd_symbolic_impl.hpp @@ -342,7 +342,7 @@ template ()>::type* = nullptr) { + typename std::enable_if>::type* = nullptr) { using size_type = typename KernelHandle::size_type; using ordinal_type = typename KernelHandle::nnz_lno_t; using range_type = Kokkos::RangePolicy; @@ -361,7 +361,7 @@ template ()>::type* = nullptr) { + typename std::enable_if>::type* = nullptr) { using size_type = typename KernelHandle::size_type; using ordinal_type = typename KernelHandle::nnz_lno_t; using RangePol = Kokkos::RangePolicy; diff --git a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spadd_symbolic_spec.hpp b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spadd_symbolic_spec.hpp index f5286fd9d997..ca6898dbc90c 100644 --- a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spadd_symbolic_spec.hpp +++ b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spadd_symbolic_spec.hpp @@ -117,6 +117,8 @@ struct SPADD_SYMBOLIC >, \ false, true>; +#include + #define KOKKOSSPARSE_SPADD_SYMBOLIC_ETI_SPEC_INST(SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, \ EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ template struct SPADD_SYMBOLIC< \ diff --git a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spgemm_impl.hpp b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spgemm_impl.hpp index 228840381e29..99541537fbc3 100644 --- a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spgemm_impl.hpp +++ b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spgemm_impl.hpp @@ -650,7 +650,7 @@ class KokkosSPGEMM { // chunk (no contention) template size_t compute_num_pool_chunks(size_t chunk_bytes, size_t ideal_num_chunks) { - if (!KokkosKernels::Impl::kk_is_gpu_exec_space()) return ideal_num_chunks; + if (!KokkosKernels::Impl::is_gpu_exec_space_v) return ideal_num_chunks; size_t free_byte, total_byte; KokkosKernels::Impl::kk_get_free_total_memory(free_byte, total_byte); size_t required_size = ideal_num_chunks * chunk_bytes; diff --git a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spgemm_impl_compression.hpp b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spgemm_impl_compression.hpp index bfbdaa9a45d7..0fb1a618ca2b 100644 --- a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spgemm_impl_compression.hpp +++ b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spgemm_impl_compression.hpp @@ -523,7 +523,7 @@ struct KokkosSPGEMMhandle->get_handle_exec_space(); - constexpr bool exec_gpu = KokkosKernels::Impl::kk_is_gpu_exec_space(); + constexpr bool exec_gpu = KokkosKernels::Impl::is_gpu_exec_space_v; // get the suggested vectorlane size based on the execution space, and average // number of nnzs per row. int suggested_vector_size = this->handle->get_suggested_vector_size(n, nnz); @@ -723,7 +723,7 @@ bool KokkosSPGEMM()) { + if (KokkosKernels::Impl::is_gpu_exec_space_v) { #ifndef KOKKOSKERNELSMOREMEM size_type max_row_nnz = 0; KokkosKernels::Impl::view_reduce_maxsizerow(n, in_row_map, max_row_nnz); diff --git a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spgemm_impl_def.hpp b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spgemm_impl_def.hpp index 86f25ba47da0..106c5825ffa2 100644 --- a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spgemm_impl_def.hpp +++ b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spgemm_impl_def.hpp @@ -60,7 +60,7 @@ void KokkosSPGEMMhandle->get_spgemm_handle()->get_compression_step(); // compress in single step if it is GPU. - if (KokkosKernels::Impl::kk_is_gpu_exec_space()) compress_in_single_step = true; + if (KokkosKernels::Impl::is_gpu_exec_space_v) compress_in_single_step = true; // compressed B fields. row_lno_temp_work_view_t new_row_mapB(Kokkos::view_alloc(Kokkos::WithoutInitializing, "new row map"), n + 1); diff --git a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp index 8d81117f933f..cf61676b5d23 100644 --- a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp +++ b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp @@ -692,9 +692,9 @@ struct KokkosSPGEMM max_first_level_hash_size) insert_is_on = false; fail = 0; break; @@ -714,9 +714,9 @@ struct KokkosSPGEMM max_first_level_hash_size) insert_is_on = false; fail = 0; break; @@ -737,9 +737,10 @@ struct KokkosSPGEMMspgemm_algorithm == SPGEMM_KK || SPGEMM_KK_LP == this->spgemm_algorithm) { - if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { + if (KokkosKernels::Impl::is_gpu_exec_space_v) { // then chose the best method and parameters. size_type average_row_nnz = 0; size_t average_row_flops = 0; @@ -1308,7 +1310,7 @@ void KokkosSPGEMM()) { + if (KokkosKernels::Impl::is_gpu_exec_space_v) { if (algorithm_to_run == SPGEMM_KK_MEMORY_SPREADTEAM) { tmp_max_nnz = 1; } else if (algorithm_to_run == SPGEMM_KK_MEMORY_BIGSPREADTEAM) { @@ -1358,7 +1360,7 @@ void KokkosSPGEMM()) { + if (KokkosKernels::Impl::is_gpu_exec_space_v) { my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; } @@ -1390,7 +1392,7 @@ void KokkosSPGEMM()) { + if (KokkosKernels::Impl::is_gpu_exec_space_v) { if (algorithm_to_run == SPGEMM_KK_MEMORY_SPREADTEAM) { if (thread_shmem_key_size <= 0) { std::cout << "KokkosSPGEMM_numeric_hash SPGEMM_KK_MEMORY_SPREADTEAM: " @@ -1516,7 +1518,7 @@ void KokkosSPGEMM()) { + if (KokkosKernels::Impl::is_gpu_exec_space_v) { my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; } @@ -1548,7 +1550,7 @@ void KokkosSPGEMM()) { + if (KokkosKernels::Impl::is_gpu_exec_space_v) { Kokkos::parallel_for( "KOKKOSPARSE::SPGEMM::SPGEMM_KK_MEMORY2", gpu_team_policy_t(a_row_cnt / team_row_chunk_size + 1, suggested_team_size, suggested_vector_size), sc); diff --git a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spgemm_impl_speed.hpp b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spgemm_impl_speed.hpp index 6dc365b58e63..3a82679562a4 100644 --- a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spgemm_impl_speed.hpp +++ b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spgemm_impl_speed.hpp @@ -428,7 +428,7 @@ void KokkosSPGEMM()) { + if (KokkosKernels::Impl::is_gpu_exec_space_v) { // allocate memory for begins and next to be used by the hashmap nnz_lno_temp_work_view_t beginsC(Kokkos::view_alloc(Kokkos::WithoutInitializing, "C keys"), valuesC_.extent(0)); nnz_lno_temp_work_view_t nextsC(Kokkos::view_alloc(Kokkos::WithoutInitializing, "C nexts"), valuesC_.extent(0)); diff --git a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp index ccf747883b65..8a6e8d0a6223 100644 --- a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp +++ b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp @@ -1281,7 +1281,7 @@ void KokkosSPGEMMspgemm_algorithm; - constexpr bool exec_gpu = KokkosKernels::Impl::kk_is_gpu_exec_space(); + constexpr bool exec_gpu = KokkosKernels::Impl::is_gpu_exec_space_v; KokkosKernels::Impl::ExecSpaceType lcl_my_exec_space = this->handle->get_handle_exec_space(); if (exec_gpu) { current_spgemm_algorithm = SPGEMM_KK_MEMORY; @@ -1523,7 +1523,7 @@ void KokkosSPGEMMspgemm_algorithm; - constexpr bool exec_gpu = KokkosKernels::Impl::kk_is_gpu_exec_space(); + constexpr bool exec_gpu = KokkosKernels::Impl::is_gpu_exec_space_v; KokkosKernels::Impl::ExecSpaceType lcl_my_exec_space = this->handle->get_handle_exec_space(); if (exec_gpu) { current_spgemm_algorithm = SPGEMM_KK_MEMORY; diff --git a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp index 5b0352d99b6c..d9ca251e8906 100644 --- a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp +++ b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp @@ -1165,7 +1165,7 @@ void KokkosSPGEMM< nnz_lno_t *entriesC, // null if it is symbolic, otherwise not null! struct_visit_t visit_applier) { bool apply_compression = this->handle->get_spgemm_handle()->get_compression(); - constexpr bool exec_gpu = KokkosKernels::Impl::kk_is_gpu_exec_space(); + constexpr bool exec_gpu = KokkosKernels::Impl::is_gpu_exec_space_v; const nnz_lno_t *min_result_row_for_each_row = this->handle->get_spgemm_handle()->get_min_col_of_row().data(); nnz_lno_t max_row_size = this->handle->get_spgemm_handle()->get_max_result_nnz( @@ -1468,7 +1468,7 @@ template void KokkosSPGEMM::KokkosSPGEMM_symbolic_triangle_setup() { - constexpr bool exec_gpu = KokkosKernels::Impl::kk_is_gpu_exec_space(); + constexpr bool exec_gpu = KokkosKernels::Impl::is_gpu_exec_space_v; nnz_lno_t n = this->row_mapB.extent(0) - 1; size_type nnz = this->entriesB.extent(0); diff --git a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spgemm_impl_triangle_no_compression.hpp b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spgemm_impl_triangle_no_compression.hpp index 298af30aea8e..5da138f04a23 100644 --- a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spgemm_impl_triangle_no_compression.hpp +++ b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spgemm_impl_triangle_no_compression.hpp @@ -778,7 +778,7 @@ void KokkosSPGEMMhandle->get_spgemm_handle()->get_min_col_of_row().data(); nnz_lno_t max_row_size = this->handle->get_spgemm_handle()->get_max_result_nnz(); - constexpr bool exec_gpu = KokkosKernels::Impl::kk_is_gpu_exec_space(); + constexpr bool exec_gpu = KokkosKernels::Impl::is_gpu_exec_space_v; typedef KokkosKernels::Impl::UniformMemoryPool pool_memory_space; int suggested_vector_size = this->handle->get_suggested_vector_size(this->b_row_cnt, bnnz); diff --git a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp index 3730c6ec51e7..5642a9f11a22 100644 --- a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp +++ b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp @@ -591,7 +591,7 @@ struct KokkosSPGEMM max_first_level_hash_size) insert_is_on = false; fail = 0; break; @@ -835,9 +835,9 @@ struct KokkosSPGEMM max_first_level_hash_size) insert_is_on = false; fail = 0; break; @@ -856,7 +856,7 @@ struct KokkosSPGEMM max_first_level_hash_size) insert_is_on = false; fail = 0; break; @@ -960,9 +961,9 @@ struct KokkosSPGEMM max_first_level_hash_size) insert_is_on = false; fail = 0; break; @@ -980,7 +981,7 @@ struct KokkosSPGEMM; - constexpr bool exec_gpu = KokkosKernels::Impl::kk_is_gpu_exec_space(); + constexpr bool exec_gpu = KokkosKernels::Impl::is_gpu_exec_space_v; if (KOKKOSKERNELS_VERBOSE) { std::cout << "\tSPARSE ACC MODE" << std::endl; } diff --git a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spgemm_jacobi_spec.hpp b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spgemm_jacobi_spec.hpp index 65ab7b4cc888..7dbc3083a48e 100644 --- a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spgemm_jacobi_spec.hpp +++ b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spgemm_jacobi_spec.hpp @@ -200,6 +200,8 @@ struct SPGEMM_JACOBI >, \ false, true>; +#include + #define KOKKOSSPARSE_SPGEMM_JACOBI_ETI_SPEC_INST(SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ MEM_SPACE_TYPE) \ template struct SPGEMM_JACOBI< \ diff --git a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spgemm_noreuse_spec.hpp b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spgemm_noreuse_spec.hpp index bdbcbe1e856c..e80d9dfb26bc 100644 --- a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spgemm_noreuse_spec.hpp +++ b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spgemm_noreuse_spec.hpp @@ -120,6 +120,8 @@ struct SPGEMM_NOREUSE, const OFFSET_TYPE>, \ false, true>; +#include + #define KOKKOSSPARSE_SPGEMM_NOREUSE_ETI_SPEC_INST(SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, EXEC_SPACE_TYPE, \ MEM_SPACE_TYPE) \ template struct SPGEMM_NOREUSE< \ diff --git a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spgemm_numeric_spec.hpp b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spgemm_numeric_spec.hpp index 18e35c1df8fb..460033e95089 100644 --- a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spgemm_numeric_spec.hpp +++ b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spgemm_numeric_spec.hpp @@ -173,6 +173,8 @@ struct SPGEMM_NUMERIC >, \ false, true>; +#include + #define KOKKOSSPARSE_SPGEMM_NUMERIC_ETI_SPEC_INST(SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, \ EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ template struct SPGEMM_NUMERIC< \ diff --git a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spgemm_symbolic_spec.hpp b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spgemm_symbolic_spec.hpp index 971ddec716bc..117492e21d19 100644 --- a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spgemm_symbolic_spec.hpp +++ b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spgemm_symbolic_spec.hpp @@ -152,6 +152,8 @@ struct SPGEMM_SYMBOLIC >, \ false, true>; +#include + #define KOKKOSSPARSE_SPGEMM_SYMBOLIC_ETI_SPEC_INST(SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, \ EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ template struct SPGEMM_SYMBOLIC< \ diff --git a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spiluk_numeric_spec.hpp b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spiluk_numeric_spec.hpp index cd16aa0f4f12..92c9b8075311 100644 --- a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spiluk_numeric_spec.hpp +++ b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spiluk_numeric_spec.hpp @@ -181,6 +181,8 @@ struct SPILUK_NUMERIC >, \ false, true>; +#include + #define KOKKOSSPARSE_SPILUK_NUMERIC_ETI_SPEC_INST(SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, \ EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ template struct SPILUK_NUMERIC< \ diff --git a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spiluk_symbolic_spec.hpp b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spiluk_symbolic_spec.hpp index 622e929dfc63..433c9c4e80e0 100644 --- a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spiluk_symbolic_spec.hpp +++ b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spiluk_symbolic_spec.hpp @@ -131,6 +131,8 @@ struct SPILUK_SYMBOLIC >, \ false, true>; +#include + #define KOKKOSSPARSE_SPILUK_SYMBOLIC_ETI_SPEC_INST(SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, \ EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ template struct SPILUK_SYMBOLIC< \ diff --git a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp index 0223087f1fe8..d9702af900a9 100644 --- a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp +++ b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp @@ -589,7 +589,7 @@ struct BSR_GEMV_Functor { // template ()>::type + typename std::enable_if>::type * = nullptr> void spMatVec_no_transpose( const typename AD::execution_space &exec, Handle *handle, const AlphaType &alpha, @@ -636,8 +636,8 @@ void spMatVec_no_transpose( // template ()>::type - * = nullptr> + typename std::enable_if>::type * = + nullptr> void spMatVec_no_transpose( const typename AD::execution_space &exec, Handle *handle, const AlphaType &alpha, const KokkosSparse::Experimental::BsrMatrix, AS> &A, @@ -839,7 +839,7 @@ struct BSR_GEMV_Transpose_Functor { /// trivial serial impl used) template ()>::type + typename std::enable_if>::type * = nullptr> void spMatVec_transpose( const typename AD::execution_space &exec, Handle *handle, const AlphaType &alpha, @@ -885,8 +885,8 @@ void spMatVec_transpose( // spMatVec_transpose: version for GPU execution spaces (TeamPolicy used) // template ()>::type - * = nullptr> + typename std::enable_if>::type * = + nullptr> void spMatVec_transpose(const typename AMatrix::execution_space &exec, Handle *handle, const AlphaType &alpha, const AMatrix &A, const XVector &x, const BetaType &beta, YVector &y, bool useConjugate) { if (A.numRows() <= 0) { @@ -1093,7 +1093,7 @@ struct BSR_GEMM_Functor { // template ()>::type + typename std::enable_if>::type * = nullptr> void spMatMultiVec_no_transpose( const typename AD::execution_space &exec, Handle *handle, const AlphaType &alpha, @@ -1139,8 +1139,8 @@ void spMatMultiVec_no_transpose( // template ()>::type - * = nullptr> + typename std::enable_if>::type * = + nullptr> void spMatMultiVec_no_transpose( const typename AD::execution_space &exec, Handle *handle, const AlphaType &alpha, const KokkosSparse::Experimental::BsrMatrix, AS> &A, @@ -1352,7 +1352,7 @@ struct BSR_GEMM_Transpose_Functor { /// (RangePolicy or trivial serial impl used) template ()>::type + typename std::enable_if>::type * = nullptr> void spMatMultiVec_transpose( const execution_space &exec, Handle *handle, const AlphaType &alpha, @@ -1391,7 +1391,7 @@ void spMatMultiVec_transpose( // template ()>::type * = nullptr> + typename std::enable_if>::type * = nullptr> void spMatMultiVec_transpose(const execution_space &exec, Handle *handle, const AlphaType &alpha, const AMatrix &A, const XVector &x, const BetaType &beta, YVector &y, bool useConjugate) { if (A.numRows() <= 0) { diff --git a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp index 1c4564f1ba38..b6ba3e5b882e 100644 --- a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp +++ b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp @@ -134,7 +134,7 @@ struct SPMV_BSRMATRIX() || handle->algo == SPMV_BSR_V42) { + if (KokkosKernels::Impl::is_gpu_exec_space_v || handle->algo == SPMV_BSR_V42) { if (modeIsNoTrans) { ::KokkosSparse::Impl::apply_v42(space, alpha, A, X, beta, Y); return; @@ -258,7 +258,7 @@ struct SPMV_MV_BSRMATRIX() || handle->algo == SPMV_BSR_V42) { + if (KokkosKernels::Impl::is_gpu_exec_space_v || handle->algo == SPMV_BSR_V42) { if (modeIsNoTrans) { ::KokkosSparse::Impl::apply_v42(space, alpha, A, X, beta, Y); return; @@ -321,6 +321,8 @@ struct SPMV_MV_BSRMATRIX>, \ false, true>; +#include + #define KOKKOSSPARSE_SPMV_BSRMATRIX_ETI_SPEC_INST(SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, \ EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ template struct SPMV_BSRMATRIX< \ @@ -352,6 +354,8 @@ struct SPMV_MV_BSRMATRIX>, \ std::is_integral_v, false, true>; +#include + #define KOKKOSSPARSE_SPMV_MV_BSRMATRIX_ETI_SPEC_INST(SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, \ EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ template struct SPMV_MV_BSRMATRIX< \ diff --git a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spmv_impl.hpp b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spmv_impl.hpp index 5fefcb897d10..ce1bd9c3a4cf 100644 --- a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spmv_impl.hpp +++ b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spmv_impl.hpp @@ -187,7 +187,7 @@ int64_t spmv_launch_parameters(int64_t numRows, int64_t nnz, int64_t rows_per_th // Determine rows per thread if (rows_per_thread < 1) { - if (KokkosKernels::Impl::kk_is_gpu_exec_space()) + if (KokkosKernels::Impl::is_gpu_exec_space_v) rows_per_thread = 1; else { if (nnz_per_row < 20 && nnz > 5000000) { @@ -198,7 +198,7 @@ int64_t spmv_launch_parameters(int64_t numRows, int64_t nnz, int64_t rows_per_th } if (team_size < 1) { - if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { + if (KokkosKernels::Impl::is_gpu_exec_space_v) { team_size = 256 / vector_length; } else { team_size = 1; @@ -220,7 +220,7 @@ int64_t spmv_launch_parameters(int64_t numRows, int64_t nnz, int64_t rows_per_th // spmv_beta_no_transpose: version for CPU execution spaces (RangePolicy or // trivial serial impl used) template ()>::type* = nullptr> + typename std::enable_if>::type* = nullptr> static void spmv_beta_no_transpose(const execution_space& exec, Handle* handle, typename YVector::const_value_type& alpha, const AMatrix& A, const XVector& x, typename YVector::const_value_type& beta, const YVector& y) { @@ -334,7 +334,7 @@ static void spmv_beta_no_transpose(const execution_space& exec, Handle* handle, // spmv_beta_no_transpose: version for GPU execution spaces (TeamPolicy used) template ()>::type* = nullptr> + typename std::enable_if>::type* = nullptr> static void spmv_beta_no_transpose(const execution_space& exec, Handle* handle, typename YVector::const_value_type& alpha, const AMatrix& A, const XVector& x, typename YVector::const_value_type& beta, const YVector& y) { @@ -380,7 +380,7 @@ static void spmv_beta_no_transpose(const execution_space& exec, Handle* handle, // spmv_beta_transpose: version for CPU execution spaces (RangePolicy or trivial // serial impl used) template ()>::type* = nullptr> + typename std::enable_if>::type* = nullptr> static void spmv_beta_transpose(const execution_space& exec, typename YVector::const_value_type& alpha, const AMatrix& A, const XVector& x, typename YVector::const_value_type& beta, const YVector& y) { @@ -460,7 +460,7 @@ static void spmv_beta_transpose(const execution_space& exec, typename YVector::c // spmv_beta_transpose: version for GPU execution spaces (TeamPolicy used) template ()>::type* = nullptr> + typename std::enable_if>::type* = nullptr> static void spmv_beta_transpose(const execution_space& exec, typename YVector::const_value_type& alpha, const AMatrix& A, const XVector& x, typename YVector::const_value_type& beta, const YVector& y) { @@ -1005,7 +1005,7 @@ struct SPMV_MV_LayoutLeft_Functor { // spmv_alpha_beta_mv_no_transpose: version for CPU execution spaces // (RangePolicy) template ()>::type* = nullptr> + typename std::enable_if>::type* = nullptr> static void spmv_alpha_beta_mv_no_transpose(const execution_space& exec, const typename YVector::non_const_value_type& alpha, const AMatrix& A, const XVector& x, const typename YVector::non_const_value_type& beta, @@ -1054,7 +1054,7 @@ static void spmv_alpha_beta_mv_no_transpose(const execution_space& exec, // spmv_alpha_beta_mv_no_transpose: version for GPU execution spaces // (TeamPolicy) template ()>::type* = nullptr> + typename std::enable_if>::type* = nullptr> static void spmv_alpha_beta_mv_no_transpose(const execution_space& exec, const typename YVector::non_const_value_type& alpha, const AMatrix& A, const XVector& x, const typename YVector::non_const_value_type& beta, @@ -1119,7 +1119,7 @@ static void spmv_alpha_beta_mv_no_transpose(const execution_space& exec, // spmv_alpha_beta_mv_transpose: version for CPU execution spaces (RangePolicy) template ()>::type* = nullptr> + typename std::enable_if>::type* = nullptr> static void spmv_alpha_beta_mv_transpose(const execution_space& exec, const typename YVector::non_const_value_type& alpha, const AMatrix& A, const XVector& x, const typename YVector::non_const_value_type& beta, @@ -1160,7 +1160,7 @@ static void spmv_alpha_beta_mv_transpose(const execution_space& exec, // spmv_alpha_beta_mv_transpose: version for GPU execution spaces (TeamPolicy) template ()>::type* = nullptr> + typename std::enable_if>::type* = nullptr> static void spmv_alpha_beta_mv_transpose(const execution_space& exec, const typename YVector::non_const_value_type& alpha, const AMatrix& A, const XVector& x, const typename YVector::non_const_value_type& beta, diff --git a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spmv_impl_merge.hpp b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spmv_impl_merge.hpp index 0db896adec60..b74bcc5e2039 100644 --- a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spmv_impl_merge.hpp +++ b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spmv_impl_merge.hpp @@ -291,7 +291,7 @@ struct SpmvMergeHierarchical { const A_size_type pathLength = A.numRows() + A.nnz(); A_size_type pathLengthThreadChunk; int teamSize; - if constexpr (KokkosKernels::Impl::kk_is_gpu_exec_space()) { + if constexpr (KokkosKernels::Impl::is_gpu_exec_space_v) { pathLengthThreadChunk = 4; teamSize = 128; } else { @@ -315,7 +315,7 @@ struct SpmvMergeHierarchical { using GpuOp = SpmvMergeImplFunctor; using CpuOp = SpmvMergeImplFunctor; using Op = - typename std::conditional(), GpuOp, CpuOp>::type; + typename std::conditional, GpuOp, CpuOp>::type; Op op(alpha, A, x, y, pathLengthThreadChunk); Kokkos::parallel_for("SpmvMergeHierarchical::spmv", policy, op); } else if (KokkosSparse::Conjugate[0] == mode[0]) { @@ -323,7 +323,7 @@ struct SpmvMergeHierarchical { using GpuOp = SpmvMergeImplFunctor; using CpuOp = SpmvMergeImplFunctor; using Op = - typename std::conditional(), GpuOp, CpuOp>::type; + typename std::conditional, GpuOp, CpuOp>::type; Op op(alpha, A, x, y, pathLengthThreadChunk); Kokkos::parallel_for("SpmvMergeHierarchical::spmv", policy, op); } else { diff --git a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spmv_spec.hpp b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spmv_spec.hpp index a03b15241b99..d9a0175ea681 100644 --- a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spmv_spec.hpp +++ b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spmv_spec.hpp @@ -222,6 +222,8 @@ struct SPMV_MV>, \ false, true>; +#include + #define KOKKOSSPARSE_SPMV_ETI_SPEC_INST(SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ MEM_SPACE_TYPE) \ template struct SPMV< \ @@ -248,6 +250,8 @@ struct SPMV_MV>, \ std::is_integral_v, false, true>; +#include + #define KOKKOSSPARSE_SPMV_MV_ETI_SPEC_INST(SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ MEM_SPACE_TYPE) \ template struct SPMV_MV< \ diff --git a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spmv_struct_impl.hpp b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spmv_struct_impl.hpp index 80b2c908f703..e3283fe53292 100644 --- a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spmv_struct_impl.hpp +++ b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spmv_struct_impl.hpp @@ -617,7 +617,7 @@ int64_t spmv_struct_launch_parameters(int64_t numInterior, int64_t nnz, int nnz_ // Determine rows per thread if (rows_per_thread < 1) { - if (KokkosKernels::Impl::kk_is_gpu_exec_space()) + if (KokkosKernels::Impl::is_gpu_exec_space_v) rows_per_thread = 1; else { if (nnz_per_row < 20 && numInterior * nnz_per_row > 5000000) { @@ -628,7 +628,7 @@ int64_t spmv_struct_launch_parameters(int64_t numInterior, int64_t nnz, int nnz_ } if (team_size < 1) { - if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { + if (KokkosKernels::Impl::is_gpu_exec_space_v) { team_size = 128 / vector_length; } else { team_size = 1; diff --git a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spmv_struct_spec.hpp b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spmv_struct_spec.hpp index 8770f51ea0fd..ac0bc580f4d2 100644 --- a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spmv_struct_spec.hpp +++ b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spmv_struct_spec.hpp @@ -232,6 +232,8 @@ struct SPMV_MV_STRUCT>, \ false, true>; +#include + #define KOKKOSSPARSE_SPMV_STRUCT_ETI_SPEC_INST(SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ MEM_SPACE_TYPE) \ template struct SPMV_STRUCT< \ @@ -256,6 +258,8 @@ struct SPMV_MV_STRUCT>, \ std::is_integral_v, false, true>; +#include + #define KOKKOSSPARSE_SPMV_MV_STRUCT_ETI_SPEC_INST(SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, \ EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ template struct SPMV_MV_STRUCT< \ diff --git a/packages/kokkos-kernels/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/packages/kokkos-kernels/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index 77d790e288dc..3afeadc5599d 100644 --- a/packages/kokkos-kernels/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/packages/kokkos-kernels/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -817,10 +817,10 @@ struct SptrsvWrap { const int nsrow = colptr(j1 + 1) - i1; // create a view for the s-th supernocal column - // NOTE: we currently supports only default_layout = LayoutLeft + // NOTE: we currently supports only KokkosKernels::default_layout = LayoutLeft scalar_t *dataL = const_cast(values.data()); - Kokkos::View viewL(&dataL[i1], nsrow, - nscol); + Kokkos::View viewL( + &dataL[i1], nsrow, nscol); // extract part of the solution, corresponding to the diagonal block auto Xj = Kokkos::subview(X, range_type(j1, j2)); @@ -859,8 +859,9 @@ struct SptrsvWrap { KokkosBlas::TeamGemv::invoke(team, one, Ljj, Y, zero, Xj); } else { - // NOTE: we currently supports only default_layout = LayoutLeft - Kokkos::View Xjj(Xj.data(), nscol, 1); + // NOTE: we currently supports only KokkosKernels::default_layout = LayoutLeft + Kokkos::View Xjj( + Xj.data(), nscol, 1); if (unit_diagonal) { KokkosBatched::TeamTrsm struct UpperTriSupernodalFunctor { - // NOTE: we currently supports only default_layout = LayoutLeft - using SupernodeView = typename Kokkos::View; + // NOTE: we currently supports only KokkosKernels::default_layout = LayoutLeft + using SupernodeView = + typename Kokkos::View; bool invert_diagonal; const int *supercols; @@ -1022,8 +1024,9 @@ struct SptrsvWrap { KokkosBlas::TeamGemv:: template invoke(team, one, Ujj, Y, zero, Xj); } else { - // NOTE: we currently supports only default_layout = LayoutLeft - Kokkos::View Xjj(Xj.data(), nscol, 1); + // NOTE: we currently supports only KokkosKernels::default_layout = LayoutLeft + Kokkos::View Xjj( + Xj.data(), nscol, 1); KokkosBatched::TeamTrsm::invoke(team, one, Ujj, Xjj); @@ -1113,10 +1116,10 @@ struct SptrsvWrap { const int nsrow2 = nsrow - nscol; // create a view of the s-th supernocal column of U - // NOTE: we currently supports only default_layout = LayoutLeft + // NOTE: we currently supports only KokkosKernels::default_layout = LayoutLeft scalar_t *dataU = const_cast(values.data()); - Kokkos::View viewU(&dataU[i1], nsrow, - nscol); + Kokkos::View viewU( + &dataU[i1], nsrow, nscol); // extract part of solution, corresponding to the diagonal block U(s, s) auto Xj = Kokkos::subview(X, range_type(j1, j2)); @@ -1152,8 +1155,9 @@ struct SptrsvWrap { KokkosBlas::TeamGemv::invoke(team, one, Ujj, Y, zero, Xj); } else { - // NOTE: we currently supports only default_layout = LayoutLeft - Kokkos::View Xjj(Xj.data(), nscol, 1); + // NOTE: we currently supports only KokkosKernels::default_layout = LayoutLeft + Kokkos::View Xjj( + Xj.data(), nscol, 1); KokkosBatched::TeamTrsm::invoke(team, one, Ujj, Xjj); @@ -1363,8 +1367,9 @@ struct SptrsvWrap { timer.reset(); #endif - // NOTE: we currently supports only default_layout = LayoutLeft - using supernode_view_type = Kokkos::View; + // NOTE: we currently supports only KokkosKernels::default_layout = LayoutLeft + using supernode_view_type = + Kokkos::View; if (diag_kernel_type_host(lvl) == 3) { // using device-level kernels (functor is called to scatter the // results) @@ -1423,9 +1428,10 @@ struct SptrsvWrap { KokkosBlas::gemv(space, "N", one, Ljj, Y, zero, Xj); } else { char unit_diag = (unit_diagonal ? 'U' : 'N'); - // NOTE: we currently supports only default_layout = + // NOTE: we currently supports only KokkosKernels::default_layout = // LayoutLeft - Kokkos::View Xjj(Xj.data(), nscol, 1); + Kokkos::View Xjj( + Xj.data(), nscol, 1); KokkosBlas::trsm(space, "L", "L", "N", &unit_diag, one, Ljj, Xjj); // TODO: space.fence(); Kokkos::fence(); @@ -1695,9 +1701,9 @@ struct SptrsvWrap { int workoffset = work_offset_host(s); // create a view for the s-th supernocal block column - // NOTE: we currently supports only default_layout = LayoutLeft - Kokkos::View viewU(&dataU[i1], nsrow, - nscol); + // NOTE: we currently supports only KokkosKernels::default_layout = LayoutLeft + Kokkos::View viewU( + &dataU[i1], nsrow, nscol); if (invert_offdiagonal) { auto Uij = Kokkos::subview(viewU, range_type(0, nsrow), Kokkos::ALL()); @@ -1720,10 +1726,10 @@ struct SptrsvWrap { // instead of trmv/trsv KokkosBlas::gemv(space, "N", one, Ujj, Y, zero, Xj); } else { - // NOTE: we currently supports only default_layout = + // NOTE: we currently supports only KokkosKernels::default_layout = // LayoutLeft - Kokkos::View Xjj(Xj.data(), nscol, - 1); + Kokkos::View Xjj( + Xj.data(), nscol, 1); KokkosBlas::trsm(space, "L", "U", "N", "N", one, Ujj, Xjj); } // update off-diagonal blocks @@ -1796,9 +1802,9 @@ struct SptrsvWrap { int workoffset = work_offset_host(s); // create a view for the s-th supernocal block column - // NOTE: we currently supports only default_layout = LayoutLeft - Kokkos::View viewU(&dataU[i1], nsrow, - nscol); + // NOTE: we currently supports only KokkosKernels::default_layout = LayoutLeft + Kokkos::View viewU( + &dataU[i1], nsrow, nscol); // extract part of the solution, corresponding to the diagonal // block @@ -1824,9 +1830,10 @@ struct SptrsvWrap { if (invert_diagonal) { KokkosBlas::gemv(space, "T", one, Ujj, Xj, zero, Y); } else { - // NOTE: we currently supports only default_layout = + // NOTE: we currently supports only KokkosKernels::default_layout = // LayoutLeft - Kokkos::View Xjj(Xj.data(), nscol, 1); + Kokkos::View Xjj( + Xj.data(), nscol, 1); KokkosBlas::trsm(space, "L", "L", "T", "N", one, Ujj, Xjj); } } diff --git a/packages/kokkos-kernels/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp b/packages/kokkos-kernels/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp index 87cf72686c81..de9f31dbd6ee 100644 --- a/packages/kokkos-kernels/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp +++ b/packages/kokkos-kernels/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp @@ -217,6 +217,8 @@ struct SPTRSV_SOLVE >, \ false, true>; +#include + #define KOKKOSSPARSE_SPTRSV_SOLVE_ETI_SPEC_INST(SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ MEM_SPACE_TYPE) \ template struct SPTRSV_SOLVE< \ diff --git a/packages/kokkos-kernels/sparse/impl/KokkosSparse_sptrsv_symbolic_spec.hpp b/packages/kokkos-kernels/sparse/impl/KokkosSparse_sptrsv_symbolic_spec.hpp index 2229e90414c7..0b009f0e71a4 100644 --- a/packages/kokkos-kernels/sparse/impl/KokkosSparse_sptrsv_symbolic_spec.hpp +++ b/packages/kokkos-kernels/sparse/impl/KokkosSparse_sptrsv_symbolic_spec.hpp @@ -113,6 +113,8 @@ struct SPTRSV_SYMBOLIC >, \ false, true>; +#include + #define KOKKOSSPARSE_SPTRSV_SYMBOLIC_ETI_SPEC_INST(SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, \ EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ template struct SPTRSV_SYMBOLIC< \ diff --git a/packages/kokkos-kernels/sparse/impl/KokkosSparse_trsv_impl.hpp b/packages/kokkos-kernels/sparse/impl/KokkosSparse_trsv_impl.hpp index 443a91ed02f2..d16602404918 100644 --- a/packages/kokkos-kernels/sparse/impl/KokkosSparse_trsv_impl.hpp +++ b/packages/kokkos-kernels/sparse/impl/KokkosSparse_trsv_impl.hpp @@ -261,10 +261,8 @@ struct TrsvWrap { return; } - // Don't use r >= 0 as the test, because that fails if - // lno_t is unsigned. We do r == 0 (last - // iteration) below. - for (lno_t r = numRows - 1; r != 0; --r) { + // Iterate backwards with care due to potentially unsigned type + for (lno_t r = numRows - 1; r != static_cast(-1); --r) { const offset_type beg = ptr(r); const offset_type end = ptr(r + 1); for (offset_type k = beg; k < end; ++k) { @@ -275,20 +273,6 @@ struct TrsvWrap { } } // for each entry A_rc in the current row r } // for each row r - - // Last iteration: r = 0. - { - const lno_t r = 0; - const offset_type beg = ptr(r); - const offset_type end = ptr(r + 1); - for (offset_type k = beg; k < end; ++k) { - const scalar_t A_rc = val(k); - const lno_t c = ind(k); - for (lno_t j = 0; j < numVecs; ++j) { - X(r, j) -= A_rc * X(c, j); - } - } // for each entry A_rc in the current row r - } // last iteration: r = 0 } static void upperTriSolveCsr(RangeMultiVectorType X, const CrsMatrixType& A, DomainMultiVectorType Y) { @@ -312,10 +296,8 @@ struct TrsvWrap { return; } - // Don't use r >= 0 as the test, because that fails if - // lno_t is unsigned. We do r == 0 (last - // iteration) below. - for (lno_t r = numRows - 1; r != 0; --r) { + // Iterate backwards with care due to potentially unsigned type + for (lno_t r = numRows - 1; r != static_cast(-1); --r) { const offset_type beg = ptr(r); const offset_type end = ptr(r + 1); auto A_rr = co.zero(); @@ -334,28 +316,6 @@ struct TrsvWrap { co.template divide(X, A_rr, r, j); } } // for each row r - - // Last iteration: r = 0. - { - const lno_t r = 0; - const offset_type beg = ptr(r); - const offset_type end = ptr(r + 1); - auto A_rr = co.zero(); - for (offset_type k = beg; k < end; ++k) { - const auto A_rc = co.get(val, k); - const lno_t c = ind(k); - if (r == c) { - co.pluseq(A_rr, A_rc); - } else { - for (lno_t j = 0; j < numVecs; ++j) { - co.gemv(X, A_rc, r, c, j); - } - } - } // for each entry A_rc in the current row r - for (lno_t j = 0; j < numVecs; ++j) { - co.template divide(X, A_rr, r, j); - } - } // last iteration: r = 0 } static void upperTriSolveCscUnitDiag(RangeMultiVectorType X, const CrsMatrixType& A, DomainMultiVectorType Y) { @@ -380,10 +340,8 @@ struct TrsvWrap { return; } - // Don't use c >= 0 as the test, because that fails if - // lno_t is unsigned. We do c == 0 (last - // iteration) below. - for (lno_t c = numCols - 1; c != 0; --c) { + // Iterate backwards with care due to potentially unsigned type + for (lno_t c = numCols - 1; c != static_cast(-1); --c) { const offset_type beg = ptr(c); const offset_type end = ptr(c + 1); for (offset_type k = beg; k < end; ++k) { @@ -394,20 +352,6 @@ struct TrsvWrap { } } // for each entry A_rc in the current column c } // for each column c - - // Last iteration: c = 0. - { - const lno_t c = 0; - const offset_type beg = ptr(c); - const offset_type end = ptr(c + 1); - for (offset_type k = beg; k < end; ++k) { - const scalar_t A_rc = val(k); - const lno_t r = ind(k); - for (lno_t j = 0; j < numVecs; ++j) { - X(r, j) -= A_rc * X(c, j); - } - } // for each entry A_rc in the current column c - } } static void upperTriSolveCsc(RangeMultiVectorType X, const CrsMatrixType& A, DomainMultiVectorType Y) { @@ -432,13 +376,11 @@ struct TrsvWrap { return; } - // Don't use c >= 0 as the test, because that fails if - // lno_t is unsigned. We do c == 0 (last - // iteration) below. - for (lno_t c = numCols - 1; c != 0; --c) { + // Iterate backwards with care due to potentially unsigned type + for (lno_t c = numCols - 1; c != static_cast(-1); --c) { const offset_type beg = ptr(c); const offset_type end = ptr(c + 1); - for (offset_type k = end - 1; k >= beg; --k) { + for (offset_type k = end - 1; k != beg - 1; --k) { const lno_t r = ind(k); const auto A_rc = val(k); /*(vqd 20 Jul 2020) This assumes that the diagonal entry @@ -454,19 +396,6 @@ struct TrsvWrap { } } // for each entry A_rc in the current column c } // for each column c - - // Last iteration: c = 0. - { - const offset_type beg = ptr(0); - const auto A_rc = val(beg); - /*(vqd 20 Jul 2020) This assumes that the diagonal entry - has equal local row and column indices. That may not - necessarily hold, depending on the row and column Maps. See - note above.*/ - for (lno_t j = 0; j < numVecs; ++j) { - X(0, j) = X(0, j) / A_rc; - } - } } static void lowerTriSolveCscUnitDiag(RangeMultiVectorType X, const CrsMatrixType& A, DomainMultiVectorType Y) { @@ -520,10 +449,8 @@ struct TrsvWrap { return; } - // Don't use c >= 0 as the test, because that fails if - // lno_t is unsigned. We do c == 0 (last - // iteration) below. - for (lno_t c = numCols - 1; c != 0; --c) { + // Iterate backwards with care due to potentially unsigned type + for (lno_t c = numCols - 1; c != static_cast(-1); --c) { const offset_type beg = ptr(c); const offset_type end = ptr(c + 1); for (offset_type k = beg; k < end; ++k) { @@ -534,20 +461,6 @@ struct TrsvWrap { } } // for each entry A_rc in the current column c } // for each column c - - // Last iteration: c = 0. - { - const lno_t c = 0; - const offset_type beg = ptr(c); - const offset_type end = ptr(c + 1); - for (offset_type k = beg; k < end; ++k) { - const lno_t r = ind(k); - const scalar_t A_rc = STS::conj(val(k)); - for (lno_t j = 0; j < numVecs; ++j) { - X(r, j) -= A_rc * X(c, j); - } - } // for each entry A_rc in the current column c - } } static void upperTriSolveCscConj(RangeMultiVectorType X, const CrsMatrixType& A, DomainMultiVectorType Y) { @@ -572,13 +485,11 @@ struct TrsvWrap { return; } - // Don't use c >= 0 as the test, because that fails if - // lno_t is unsigned. We do c == 0 (last - // iteration) below. - for (lno_t c = numCols - 1; c != 0; --c) { + // Iterate backwards with care due to potentially unsigned type + for (lno_t c = numCols - 1; c != static_cast(-1); --c) { const offset_type beg = ptr(c); const offset_type end = ptr(c + 1); - for (offset_type k = end - 1; k >= beg; --k) { + for (offset_type k = end - 1; k != beg - 1; --k) { const lno_t r = ind(k); const scalar_t A_rc = STS::conj(val(k)); /*(vqd 20 Jul 2020) This assumes that the diagonal entry @@ -594,19 +505,6 @@ struct TrsvWrap { } } // for each entry A_rc in the current column c } // for each column c - - // Last iteration: c = 0. - { - const offset_type beg = ptr(0); - const scalar_t A_rc = STS::conj(val(beg)); - /*(vqd 20 Jul 2020) This assumes that the diagonal entry - has equal local row and column indices. That may not - necessarily hold, depending on the row and column Maps. See - note above.*/ - for (lno_t j = 0; j < numVecs; ++j) { - X(0, j) = X(0, j) / A_rc; - } - } } static void lowerTriSolveCsc(RangeMultiVectorType X, const CrsMatrixType& A, DomainMultiVectorType Y) { diff --git a/packages/kokkos-kernels/sparse/impl/KokkosSparse_trsv_spec.hpp b/packages/kokkos-kernels/sparse/impl/KokkosSparse_trsv_spec.hpp index ff852f97e23c..95c6d23c5f0b 100644 --- a/packages/kokkos-kernels/sparse/impl/KokkosSparse_trsv_spec.hpp +++ b/packages/kokkos-kernels/sparse/impl/KokkosSparse_trsv_spec.hpp @@ -174,6 +174,8 @@ struct TRSV >, \ false, true>; +#include + #define KOKKOSSPARSE_TRSV_ETI_SPEC_INST(SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ MEM_SPACE_TYPE) \ template struct TRSV< \ diff --git a/packages/kokkos-kernels/sparse/src/KokkosKernels_Handle.hpp b/packages/kokkos-kernels/sparse/src/KokkosKernels_Handle.hpp index 4d01e23a1d6c..f5e3fb4a0009 100644 --- a/packages/kokkos-kernels/sparse/src/KokkosKernels_Handle.hpp +++ b/packages/kokkos-kernels/sparse/src/KokkosKernels_Handle.hpp @@ -223,7 +223,7 @@ class KokkosKernelsHandle { typename size_type_persistent_work_view_t::HostMirror size_type_persistent_work_host_view_t; // Host view type typedef typename Kokkos::View scalar_temp_work_view_t; typedef typename Kokkos::View scalar_persistent_work_view_t; - typedef typename Kokkos::View + typedef typename Kokkos::View scalar_persistent_work_view2d_t; typedef typename Kokkos::View nnz_lno_temp_work_view_t; typedef typename Kokkos::View nnz_lno_persistent_work_view_t; diff --git a/packages/kokkos-kernels/sparse/src/KokkosSparse_BsrMatrix.hpp b/packages/kokkos-kernels/sparse/src/KokkosSparse_BsrMatrix.hpp index d174aa9b31c3..1fecb3b7b94f 100644 --- a/packages/kokkos-kernels/sparse/src/KokkosSparse_BsrMatrix.hpp +++ b/packages/kokkos-kernels/sparse/src/KokkosSparse_BsrMatrix.hpp @@ -21,8 +21,8 @@ /// This implements a local (no MPI) sparse matrix stored in block-by-block /// compressed row sparse format. -#ifndef KOKKOS_SPARSE_BSRMATRIX_HPP_ -#define KOKKOS_SPARSE_BSRMATRIX_HPP_ +#ifndef KOKKOSSPARSE_BSRMATRIX_HPP_ +#define KOKKOSSPARSE_BSRMATRIX_HPP_ #include #include @@ -300,7 +300,7 @@ struct BsrRowViewConst { /// storage for sparse matrices, as described, for example, in Saad /// (2nd ed.). template + class SizeType = KokkosKernels::default_size_type> class BsrMatrix { static_assert(std::is_signed::value, "BsrMatrix requires that OrdinalType is a signed integer type."); static_assert(Kokkos::is_memory_traits_v || std::is_void_v, @@ -529,7 +529,9 @@ class BsrMatrix { auto it = blocks.find(block); if (it == blocks.end()) { std::vector entries = {entry}; - entries.reserve(blockDim_ * blockDim_); + entries.reserve( + static_cast::size_type, ordinal_type>>(blockDim_) * + blockDim_); blocks[block] = std::move(entries); // new block with entry } else { it->second.push_back(entry); // add entry to block @@ -724,9 +726,9 @@ class BsrMatrix { Kokkos::deep_copy(h_crs_values, crs_mtx.values); typename values_type::HostMirror h_values = Kokkos::create_mirror_view(values); - if (h_values.extent(0) < size_t(numBlocks * blockDim_ * blockDim_)) { - Kokkos::resize(h_values, numBlocks * blockDim_ * blockDim_); - Kokkos::resize(values, numBlocks * blockDim_ * blockDim_); + if (h_values.extent(0) < static_cast(numBlocks) * blockDim_ * blockDim_) { + Kokkos::resize(h_values, static_cast(numBlocks) * blockDim_ * blockDim_); + Kokkos::resize(values, static_cast(numBlocks) * blockDim_ * blockDim_); } Kokkos::deep_copy(h_values, 0); @@ -967,7 +969,7 @@ class BsrMatrix { case BsrMatrix::valueOperation::ASSIGN: { for (ordinal_type lcol = 0; lcol < block_size; ++lcol) { if (force_atomic) { - Kokkos::atomic_assign(&(local_row_values[lcol]), vals[offset_into_vals + lrow * block_size + lcol]); + Kokkos::atomic_store(&(local_row_values[lcol]), vals[offset_into_vals + lrow * block_size + lcol]); } else { local_row_values[lcol] = vals[offset_into_vals + lrow * block_size + lcol]; } @@ -1005,4 +1007,4 @@ inline constexpr bool is_bsr_matrix_v = is_bsr_matrix::value; } // namespace Experimental } // namespace KokkosSparse -#endif +#endif // KOKKOSSPARSE_BSRMATRIX_HPP_ diff --git a/packages/kokkos-kernels/sparse/src/KokkosSparse_CcsMatrix.hpp b/packages/kokkos-kernels/sparse/src/KokkosSparse_CcsMatrix.hpp index 58665708b1b4..e39ab730e6b6 100644 --- a/packages/kokkos-kernels/sparse/src/KokkosSparse_CcsMatrix.hpp +++ b/packages/kokkos-kernels/sparse/src/KokkosSparse_CcsMatrix.hpp @@ -21,8 +21,8 @@ /// local (no MPI) sparse matrix stored in compressed column sparse /// ("Ccs") format. -#ifndef KOKKOS_SPARSE_CCSMATRIX_HPP_ -#define KOKKOS_SPARSE_CCSMATRIX_HPP_ +#ifndef KOKKOSSPARSE_CCSMATRIX_HPP_ +#define KOKKOSSPARSE_CCSMATRIX_HPP_ #include "Kokkos_Core.hpp" #include @@ -142,7 +142,7 @@ class CcsMatrix { //! Type of each (column) index in the matrix. typedef OrdinalType ordinal_type; //! Type of the graph structure of the sparse matrix - consistent with Kokkos. - typedef Kokkos::StaticCcsGraph + typedef Kokkos::StaticCcsGraph staticccsgraph_type; //! Type of the "column map" (which contains the offset for each column's //! data). @@ -236,4 +236,4 @@ template struct is_ccs_matrix> : public std::true_type {}; } // namespace KokkosSparse -#endif +#endif // KOKKOSSPARSE_CCSMATRIX_HPP_ diff --git a/packages/kokkos-kernels/sparse/src/KokkosSparse_CooMatrix.hpp b/packages/kokkos-kernels/sparse/src/KokkosSparse_CooMatrix.hpp index 996b3c29aaf2..c91a05f86371 100644 --- a/packages/kokkos-kernels/sparse/src/KokkosSparse_CooMatrix.hpp +++ b/packages/kokkos-kernels/sparse/src/KokkosSparse_CooMatrix.hpp @@ -21,8 +21,8 @@ /// local (no MPI) sparse matrix stored in coordinate ("Coo") format /// which is also known as ivj or triplet format. -#ifndef KOKKOS_SPARSE_COOMATRIX_HPP_ -#define KOKKOS_SPARSE_COOMATRIX_HPP_ +#ifndef KOKKOSSPARSE_COOMATRIX_HPP_ +#define KOKKOSSPARSE_COOMATRIX_HPP_ #include "Kokkos_Core.hpp" #include "KokkosKernels_Error.hpp" @@ -147,4 +147,4 @@ template struct is_coo_matrix> : public std::true_type {}; } // namespace KokkosSparse -#endif +#endif // KOKKOSSPARSE_COOMATRIX_HPP_ diff --git a/packages/kokkos-kernels/sparse/src/KokkosSparse_CrsMatrix.hpp b/packages/kokkos-kernels/sparse/src/KokkosSparse_CrsMatrix.hpp index 86586401cd78..676dfb64cb30 100644 --- a/packages/kokkos-kernels/sparse/src/KokkosSparse_CrsMatrix.hpp +++ b/packages/kokkos-kernels/sparse/src/KokkosSparse_CrsMatrix.hpp @@ -21,8 +21,8 @@ /// local (no MPI) sparse matrix stored in compressed row sparse /// ("Crs") format. -#ifndef KOKKOS_SPARSE_CRSMATRIX_HPP_ -#define KOKKOS_SPARSE_CRSMATRIX_HPP_ +#ifndef KOKKOSSPARSE_CRSMATRIX_HPP_ +#define KOKKOSSPARSE_CRSMATRIX_HPP_ #include "Kokkos_Core.hpp" #include "Kokkos_StaticCrsGraph.hpp" @@ -315,7 +315,7 @@ struct SparseRowViewConst { /// storage for sparse matrices, as described, for example, in Saad /// (2nd ed.). template + class SizeType = KokkosKernels::default_size_type> class CrsMatrix { static_assert(std::is_signed::value, "CrsMatrix requires that OrdinalType is a signed integer type."); @@ -344,10 +344,10 @@ class CrsMatrix { //! Type of a host-memory mirror of the sparse matrix. typedef CrsMatrix HostMirror; //! Type of the graph structure of the sparse matrix. - typedef Kokkos::StaticCrsGraph + typedef Kokkos::StaticCrsGraph StaticCrsGraphType; //! Type of the graph structure of the sparse matrix - consistent with Kokkos. - typedef Kokkos::StaticCrsGraph + typedef Kokkos::StaticCrsGraph staticcrsgraph_type; //! Type of column indices in the sparse matrix. typedef typename staticcrsgraph_type::entries_type index_type; @@ -611,7 +611,7 @@ class CrsMatrix { const ordinal_type offset = findRelOffset(&(row_view.colidx(0)), length, cols[i], hint, is_sorted); if (offset != length) { if (force_atomic) { - Kokkos::atomic_assign(&(row_view.value(offset)), vals[i]); + Kokkos::atomic_store(&(row_view.value(offset)), vals[i]); } else { row_view.value(offset) = vals[i]; } @@ -787,4 +787,4 @@ template inline constexpr bool is_crs_matrix_v = is_crs_matrix::value; } // namespace KokkosSparse -#endif +#endif // KOKKOSSPARSE_CRSMATRIX_HPP_ diff --git a/packages/kokkos-kernels/sparse/src/KokkosSparse_IOUtils.hpp b/packages/kokkos-kernels/sparse/src/KokkosSparse_IOUtils.hpp index 588c9dbca970..edb2cb6e97c2 100644 --- a/packages/kokkos-kernels/sparse/src/KokkosSparse_IOUtils.hpp +++ b/packages/kokkos-kernels/sparse/src/KokkosSparse_IOUtils.hpp @@ -73,7 +73,7 @@ void kk_sparseMatrix_generate(OrdinalType nrows, OrdinalType ncols, SizeType &nn } // Sample each value from uniform (-50, 50) for real types, or (-50 - 50i, 50 // + 50i) for complex types. - Kokkos::View valuesView(values, nnz * block_elem_count); + Kokkos::View valuesView(values, nnz * static_cast(block_elem_count)); ScalarType randStart, randEnd; KokkosKernels::Impl::getRandomBounds(50.0, randStart, randEnd); Kokkos::Random_XorShift64_Pool pool(13718); @@ -888,7 +888,7 @@ int read_mtx(const char *fileName, lno_t *nrows, lno_t *ncols, size_type *ne, si if (mtx_format == COORDINATE) ss >> nnz; else - nnz = nr * nc; + nnz = static_cast(nr) * nc; size_type numEdges = nnz; symmetrize = symmetrize || mtx_sym != GENERAL; if (symmetrize && nr != nc) { diff --git a/packages/kokkos-kernels/sparse/src/KokkosSparse_OrdinalTraits.hpp b/packages/kokkos-kernels/sparse/src/KokkosSparse_OrdinalTraits.hpp index ef08eb89e2f2..42cd564dc973 100644 --- a/packages/kokkos-kernels/sparse/src/KokkosSparse_OrdinalTraits.hpp +++ b/packages/kokkos-kernels/sparse/src/KokkosSparse_OrdinalTraits.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef KOKKOS_SPARSE_ORDINALTRAITS_HPP_ -#define KOKKOS_SPARSE_ORDINALTRAITS_HPP_ +#ifndef KOKKOSSPARSE_ORDINALTRAITS_HPP_ +#define KOKKOSSPARSE_ORDINALTRAITS_HPP_ /// \file KokkosSparse_OrdinalTraits.hpp /// \brief Declaration and definition of KokkosSparse::OrdinalTraits, @@ -95,4 +95,4 @@ struct OrdinalTraits { } // namespace KokkosSparse -#endif // KOKKOS_SPARSE_ORDINALTRAITS_HPP_ +#endif // KOKKOSSPARSE_ORDINALTRAITS_HPP_ diff --git a/packages/kokkos-kernels/sparse/src/KokkosSparse_SortCrs.hpp b/packages/kokkos-kernels/sparse/src/KokkosSparse_SortCrs.hpp index 838144dd5937..8da608be5645 100644 --- a/packages/kokkos-kernels/sparse/src/KokkosSparse_SortCrs.hpp +++ b/packages/kokkos-kernels/sparse/src/KokkosSparse_SortCrs.hpp @@ -63,7 +63,7 @@ void sort_crs_matrix(const execution_space& exec, const rowmap_t& rowmap, const return; } Ordinal numRows = rowmap.extent(0) ? rowmap.extent(0) - 1 : 0; - if constexpr (!KokkosKernels::Impl::kk_is_gpu_exec_space()) { + if constexpr (!KokkosKernels::Impl::is_gpu_exec_space_v) { // On CPUs, use a sequential radix sort within each row. Kokkos::parallel_for("sort_crs_matrix[CPU,radix]", Kokkos::RangePolicy>(exec, 0, numRows), @@ -240,7 +240,7 @@ void sort_crs_graph(const execution_space& exec, const rowmap_t& rowmap, const e if (entries.extent(0) <= size_t(1)) { return; } - if constexpr (!KokkosKernels::Impl::kk_is_gpu_exec_space()) { + if constexpr (!KokkosKernels::Impl::is_gpu_exec_space_v) { // If on CPU, sort each row independently. Don't need to know numCols for // this. Kokkos::parallel_for("sort_crs_graph[CPU,radix]", diff --git a/packages/kokkos-kernels/sparse/src/KokkosSparse_Utils.hpp b/packages/kokkos-kernels/sparse/src/KokkosSparse_Utils.hpp index d73787481e0e..e7d7b1c5bf8f 100644 --- a/packages/kokkos-kernels/sparse/src/KokkosSparse_Utils.hpp +++ b/packages/kokkos-kernels/sparse/src/KokkosSparse_Utils.hpp @@ -526,7 +526,7 @@ bsrMat_t transpose_bsr_matrix(const bsrMat_t &A) { rowmap_t AT_rowmap("Transpose rowmap", A.numCols() + 1); entries_t AT_entries(Kokkos::view_alloc(Kokkos::WithoutInitializing, "Transpose entries"), A.nnz()); values_t AT_values(Kokkos::view_alloc(Kokkos::WithoutInitializing, "Transpose values"), - A.nnz() * A.blockDim() * A.blockDim()); + A.nnz() * static_cast(A.blockDim()) * A.blockDim()); transpose_bsr_matrix(A.numRows(), A.numCols(), A.blockDim(), A.graph.row_map, A.graph.entries, A.values, AT_rowmap, AT_entries, AT_values); @@ -983,7 +983,7 @@ struct LowerTriangularMatrix { // TODO: Write GPU (vector-level) version here: /* - if(kk_is_gpu_exec_space()) + if(is_gpu_exec_space_v) { Kokkos::parallel_for( Kokkos::ThreadVectorRange(teamMember, read_left_work), diff --git a/packages/kokkos-kernels/sparse/src/KokkosSparse_Utils_rocsparse.hpp b/packages/kokkos-kernels/sparse/src/KokkosSparse_Utils_rocsparse.hpp index 4b99c96c8121..be939d589044 100644 --- a/packages/kokkos-kernels/sparse/src/KokkosSparse_Utils_rocsparse.hpp +++ b/packages/kokkos-kernels/sparse/src/KokkosSparse_Utils_rocsparse.hpp @@ -80,7 +80,7 @@ inline void rocsparse_internal_safe_call(rocsparse_status rocsparseStatus, const // The macro below defines is the public interface for the safe cusparse calls. // The functions themselves are protected by impl namespace. -#define KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(call) \ +#define KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(call) \ KokkosSparse::Impl::rocsparse_internal_safe_call(call, #call, __FILE__, __LINE__) inline rocsparse_operation mode_kk_to_rocsparse(const char kk_mode[]) { @@ -169,10 +169,10 @@ struct kokkos_to_rocsparse_type> { // destructed. struct TemporarySetRocsparseStream { TemporarySetRocsparseStream(rocsparse_handle handle_, const Kokkos::HIP& exec_) : handle(handle_) { - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_set_stream(handle, exec_.hip_stream())); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_set_stream(handle, exec_.hip_stream())); } - ~TemporarySetRocsparseStream() { KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_set_stream(handle, NULL)); } + ~TemporarySetRocsparseStream() { KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_set_stream(handle, NULL)); } rocsparse_handle handle; }; diff --git a/packages/kokkos-kernels/sparse/src/KokkosSparse_findRelOffset.hpp b/packages/kokkos-kernels/sparse/src/KokkosSparse_findRelOffset.hpp index 6dffcdd3d705..fe9a3968c0ad 100644 --- a/packages/kokkos-kernels/sparse/src/KokkosSparse_findRelOffset.hpp +++ b/packages/kokkos-kernels/sparse/src/KokkosSparse_findRelOffset.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef KOKKOS_SPARSE_FINDRELOFFSET_HPP -#define KOKKOS_SPARSE_FINDRELOFFSET_HPP +#ifndef KOKKOSSPARSE_FINDRELOFFSET_HPP +#define KOKKOSSPARSE_FINDRELOFFSET_HPP /// \file KokkosSparse_findRelOffset.hpp /// \brief Find the relative offset of a column index in a sparse @@ -147,4 +147,4 @@ KOKKOS_FUNCTION OffsetType findRelOffset(const IndexViewType& indsToSearch, cons } // namespace KokkosSparse -#endif // KOKKOS_SPARSE_FINDRELOFFSET_HPP +#endif // KOKKOSSPARSE_FINDRELOFFSET_HPP diff --git a/packages/kokkos-kernels/sparse/src/KokkosSparse_gauss_seidel_handle.hpp b/packages/kokkos-kernels/sparse/src/KokkosSparse_gauss_seidel_handle.hpp index bf5ee8633b14..8531f39bd036 100644 --- a/packages/kokkos-kernels/sparse/src/KokkosSparse_gauss_seidel_handle.hpp +++ b/packages/kokkos-kernels/sparse/src/KokkosSparse_gauss_seidel_handle.hpp @@ -208,7 +208,7 @@ class PointGaussSeidelHandle : public GaussSeidelHandle scalar_temp_work_view_t; typedef typename Kokkos::View scalar_persistent_work_view_t; - typedef typename Kokkos::View + typedef typename Kokkos::View scalar_persistent_work_view2d_t; typedef typename scalar_persistent_work_view_t::HostMirror scalar_persistent_work_host_view_t; // Host view type @@ -283,7 +283,7 @@ class PointGaussSeidelHandle : public GaussSeidelHandleblock_size; } void choose_default_algorithm() { - if (KokkosKernels::Impl::kk_is_gpu_exec_space()) + if (KokkosKernels::Impl::is_gpu_exec_space_v) this->algorithm_type = GS_TEAM; else this->algorithm_type = GS_PERMUTED; @@ -492,7 +492,7 @@ class ClusterGaussSeidelHandle : public GaussSeidelHandle(); } + bool use_teams() const { return KokkosKernels::Impl::is_gpu_exec_space_v; } ~ClusterGaussSeidelHandle() = default; @@ -532,7 +532,7 @@ class TwoStageGaussSeidelHandle using const_ordinal_t = typename const_entries_view_t::value_type; using const_scalar_t = typename const_values_view_t::value_type; - using vector_view_t = Kokkos::View; + using vector_view_t = Kokkos::View; using GSHandle = GaussSeidelHandle; diff --git a/packages/kokkos-kernels/sparse/src/KokkosSparse_getDiagCopy.hpp b/packages/kokkos-kernels/sparse/src/KokkosSparse_getDiagCopy.hpp index 8f67d8a1c6f5..bfe612d1cb00 100644 --- a/packages/kokkos-kernels/sparse/src/KokkosSparse_getDiagCopy.hpp +++ b/packages/kokkos-kernels/sparse/src/KokkosSparse_getDiagCopy.hpp @@ -17,8 +17,8 @@ /// \file KokkosSparse_getDiagCopy.hpp /// \brief Get a copy of the diagonal entries of a KokkosSparse::CrsMatrix. -#ifndef KOKKOS_SPARSE_GETDIAGCOPY_HPP_ -#define KOKKOS_SPARSE_GETDIAGCOPY_HPP_ +#ifndef KOKKOSSPARSE_GETDIAGCOPY_HPP_ +#define KOKKOSSPARSE_GETDIAGCOPY_HPP_ #include "KokkosSparse_getDiagCopyWithOffsets_impl.hpp" #include @@ -55,4 +55,4 @@ void getDiagCopy(const DiagType& D, const OffsetsType& offsets, const CrsMatrixT } // namespace KokkosSparse -#endif // KOKKOS_SPARSE_GETDIAGCOPY_HPP_ +#endif // KOKOS_SPARSE_GETDIAGCOPY_HPP_ diff --git a/packages/kokkos-kernels/sparse/src/KokkosSparse_spgemm_handle.hpp b/packages/kokkos-kernels/sparse/src/KokkosSparse_spgemm_handle.hpp index 9e7679a3a92e..1857e0bbc71d 100644 --- a/packages/kokkos-kernels/sparse/src/KokkosSparse_spgemm_handle.hpp +++ b/packages/kokkos-kernels/sparse/src/KokkosSparse_spgemm_handle.hpp @@ -135,11 +135,11 @@ class SPGEMMHandle { bufferSize = 0; buffer = nullptr; - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_create_mat_descr(&descr_A)); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_create_mat_descr(&descr_B)); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_create_mat_descr(&descr_C)); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_create_mat_descr(&descr_D)); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_create_mat_info(&info_C)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_create_mat_descr(&descr_A)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_create_mat_descr(&descr_B)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_create_mat_descr(&descr_C)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_create_mat_descr(&descr_D)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_create_mat_info(&info_C)); rocsparseHandle = kkControls.getRocsparseHandle(); } @@ -671,7 +671,7 @@ class SPGEMMHandle { // them in the handle suggested_vector_size_ = KokkosKernels::Impl::kk_get_suggested_vector_size( nr, nnz, KokkosKernels::Impl::kk_get_exec_space_type()); - if (KokkosKernels::Impl::kk_is_gpu_exec_space()) + if (KokkosKernels::Impl::is_gpu_exec_space_v) suggested_team_size_ = max_allowed_team_size / suggested_vector_size_; else suggested_team_size = max_allowed_team_size; diff --git a/packages/kokkos-kernels/sparse/src/KokkosSparse_spmv.hpp b/packages/kokkos-kernels/sparse/src/KokkosSparse_spmv.hpp index e31ff2ef8d60..86171b6cc771 100644 --- a/packages/kokkos-kernels/sparse/src/KokkosSparse_spmv.hpp +++ b/packages/kokkos-kernels/sparse/src/KokkosSparse_spmv.hpp @@ -119,8 +119,8 @@ void spmv(const ExecutionSpace& space, Handle* handle, const char mode[], const m = A.numRows(); n = A.numCols(); } else { - m = A.numRows() * A.blockDim(); - n = A.numCols() * A.blockDim(); + m = static_cast(A.numRows()) * A.blockDim(); + n = static_cast(A.numCols()) * A.blockDim(); } if ((mode[0] == NoTranspose[0]) || (mode[0] == Conjugate[0])) { diff --git a/packages/kokkos-kernels/sparse/src/KokkosSparse_spmv_handle.hpp b/packages/kokkos-kernels/sparse/src/KokkosSparse_spmv_handle.hpp index a1cd8895a338..4da6e4755139 100644 --- a/packages/kokkos-kernels/sparse/src/KokkosSparse_spmv_handle.hpp +++ b/packages/kokkos-kernels/sparse/src/KokkosSparse_spmv_handle.hpp @@ -148,7 +148,7 @@ struct RocSparse_CRS_SpMV_Data : public TPL_SpMV_Data { ~RocSparse_CRS_SpMV_Data() { // note: hipFree includes an implicit device synchronize KOKKOS_IMPL_HIP_SAFE_CALL(hipFree(buffer)); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_destroy_spmat_descr(mat)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_destroy_spmat_descr(mat)); } rocsparse_spmat_descr mat; @@ -159,9 +159,9 @@ struct RocSparse_CRS_SpMV_Data : public TPL_SpMV_Data { struct RocSparse_BSR_SpMV_Data : public TPL_SpMV_Data { RocSparse_BSR_SpMV_Data(const Kokkos::HIP& exec_) : TPL_SpMV_Data(exec_) {} ~RocSparse_BSR_SpMV_Data() { - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_destroy_mat_descr(mat)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_destroy_mat_descr(mat)); #if (KOKKOSSPARSE_IMPL_ROCM_VERSION >= 50400) - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_destroy_mat_info(info)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_destroy_mat_info(info)); #endif } diff --git a/packages/kokkos-kernels/sparse/src/KokkosSparse_sptrsv_supernode.hpp b/packages/kokkos-kernels/sparse/src/KokkosSparse_sptrsv_supernode.hpp index 586e8f3a6449..1f67054ca3f8 100644 --- a/packages/kokkos-kernels/sparse/src/KokkosSparse_sptrsv_supernode.hpp +++ b/packages/kokkos-kernels/sparse/src/KokkosSparse_sptrsv_supernode.hpp @@ -1358,8 +1358,9 @@ void invert_supernodal_columns(KernelHandle *kernelHandle, bool unit_diag, int n char uplo_char = (lower ? 'L' : 'U'); char diag_char = (unit_diag ? 'U' : 'N'); - // NOTE: we currently supports only default_layout = LayoutLeft - Kokkos::View viewL(&hv(nnzD), nsrow, nscol); + // NOTE: we currently supports only KokkosKernels::default_layout = LayoutLeft + Kokkos::View viewL( + &hv(nnzD), nsrow, nscol); auto Ljj = Kokkos::subview(viewL, range_type(0, nscol), Kokkos::ALL()); #ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE @@ -1403,7 +1404,7 @@ void invert_supernodal_columns(KernelHandle *kernelHandle, bool unit_diag, int n Kokkos::deep_copy(dViewL, viewL); #endif - // NOTE: we currently supports only default_layout = LayoutLeft + // NOTE: we currently supports only KokkosKernels::default_layout = LayoutLeft auto dViewLjj = Kokkos::subview(dViewL, range_type(0, nscol), Kokkos::ALL()); auto dViewLij = Kokkos::subview(dViewL, range_type(nscol, nsrow), Kokkos::ALL()); @@ -1946,7 +1947,7 @@ void split_crsmat(KernelHandle *kernelHandleL, host_crsmat_t superluL) { } // allocate for all the subgraphs row_map_view_t total_rowmap_view(Kokkos::view_alloc(Kokkos::WithoutInitializing, "rowmap_view"), - 2 * nlevels * (nrows + 1)); + static_cast(2 * nlevels) * (nrows + 1)); cols_view_t total_column_view(Kokkos::view_alloc(Kokkos::WithoutInitializing, "colmap_view"), newNnz); values_view_t total_values_view(Kokkos::view_alloc(Kokkos::WithoutInitializing, "values_view"), newNnz); // create host-mirrors diff --git a/packages/kokkos-kernels/sparse/tpls/KokkosKernels_tpl_handles_decl.hpp b/packages/kokkos-kernels/sparse/tpls/KokkosKernels_tpl_handles_decl.hpp index a1cd3c97f5f5..4a14d43df065 100644 --- a/packages/kokkos-kernels/sparse/tpls/KokkosKernels_tpl_handles_decl.hpp +++ b/packages/kokkos-kernels/sparse/tpls/KokkosKernels_tpl_handles_decl.hpp @@ -30,7 +30,11 @@ struct CusparseSingleton { CusparseSingleton(); + static bool is_initialized(); static CusparseSingleton& singleton(); + + private: + static std::unique_ptr& get_instance(); }; } // namespace Impl @@ -48,7 +52,11 @@ struct RocsparseSingleton { RocsparseSingleton(); + static bool is_initialized(); static RocsparseSingleton& singleton(); + + private: + static std::unique_ptr& get_instance(); }; } // namespace Impl diff --git a/packages/kokkos-kernels/sparse/tpls/KokkosKernels_tpl_handles_def.hpp b/packages/kokkos-kernels/sparse/tpls/KokkosKernels_tpl_handles_def.hpp index a88ad12130c6..d52959a59186 100644 --- a/packages/kokkos-kernels/sparse/tpls/KokkosKernels_tpl_handles_def.hpp +++ b/packages/kokkos-kernels/sparse/tpls/KokkosKernels_tpl_handles_def.hpp @@ -25,14 +25,24 @@ namespace KokkosKernels { namespace Impl { -CusparseSingleton::CusparseSingleton() { - KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreate(&cusparseHandle)); +CusparseSingleton::CusparseSingleton() { KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreate(&cusparseHandle)); } - Kokkos::push_finalize_hook([&]() { cusparseDestroy(cusparseHandle); }); +CusparseSingleton& CusparseSingleton::singleton() { + std::unique_ptr& instance = get_instance(); + if (!instance) { + instance = std::make_unique(); + Kokkos::push_finalize_hook([&]() { + KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroy(instance->cusparseHandle)); + instance.reset(); + }); + } + return *instance; } -CusparseSingleton& CusparseSingleton::singleton() { - static CusparseSingleton s; +bool CusparseSingleton::is_initialized() { return get_instance() != nullptr; } + +std::unique_ptr& CusparseSingleton::get_instance() { + static std::unique_ptr s; return s; } @@ -47,13 +57,25 @@ namespace KokkosKernels { namespace Impl { RocsparseSingleton::RocsparseSingleton() { - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_create_handle(&rocsparseHandle)); - - Kokkos::push_finalize_hook([&]() { KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_destroy_handle(rocsparseHandle)); }); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_create_handle(&rocsparseHandle)); } RocsparseSingleton& RocsparseSingleton::singleton() { - static RocsparseSingleton s; + std::unique_ptr& instance = get_instance(); + if (!instance) { + instance = std::make_unique(); + Kokkos::push_finalize_hook([&]() { + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_destroy_handle(instance->rocsparseHandle)); + instance.reset(); + }); + } + return *instance; +} + +bool RocsparseSingleton::is_initialized() { return get_instance() != nullptr; } + +std::unique_ptr& RocsparseSingleton::get_instance() { + static std::unique_ptr s; return s; } diff --git a/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spadd_numeric_tpl_spec_decl.hpp b/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spadd_numeric_tpl_spec_decl.hpp index 14eac2aee1cf..eff19977cfe0 100644 --- a/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spadd_numeric_tpl_spec_decl.hpp +++ b/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spadd_numeric_tpl_spec_decl.hpp @@ -184,21 +184,21 @@ KOKKOSSPARSE_SPADD_NUMERIC_TPL_SPEC_DECL_CUSPARSE_EXT(false) auto &rocspHandle = KokkosKernels::Impl::RocsparseSingleton::singleton().rocsparseHandle; \ rocsparse_pointer_mode oldPtrMode; \ \ - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_set_stream(rocspHandle, exec.hip_stream())); \ - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_get_pointer_mode(rocspHandle, &oldPtrMode)); \ - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( \ + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_set_stream(rocspHandle, exec.hip_stream())); \ + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_get_pointer_mode(rocspHandle, &oldPtrMode)); \ + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL( \ rocsparse_set_pointer_mode(rocspHandle, rocsparse_pointer_mode_host)); /* alpha, beta on host*/ \ OFFSET_TYPE nnzA = colidxA.extent(0); \ OFFSET_TYPE nnzB = colidxB.extent(0); \ - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_##TOKEN##csrgeam( \ + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_##TOKEN##csrgeam( \ rocspHandle, m, n, reinterpret_cast(&alpha), rocData.descrA, nnzA, \ reinterpret_cast(valuesA.data()), rowmapA.data(), colidxA.data(), \ reinterpret_cast(&beta), rocData.descrB, nnzB, \ reinterpret_cast(valuesB.data()), rowmapB.data(), colidxB.data(), rocData.descrC, \ reinterpret_cast(valuesC.data()), const_cast(rowmapC.data()), \ colidxC.data())); \ - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_set_pointer_mode(rocspHandle, oldPtrMode)); \ - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_set_stream(rocspHandle, NULL)); \ + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_set_pointer_mode(rocspHandle, oldPtrMode)); \ + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_set_stream(rocspHandle, NULL)); \ \ Kokkos::Profiling::popRegion(); \ } \ diff --git a/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spadd_symbolic_tpl_spec_decl.hpp b/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spadd_symbolic_tpl_spec_decl.hpp index 514b019f1b32..69286e5e985d 100644 --- a/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spadd_symbolic_tpl_spec_decl.hpp +++ b/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spadd_symbolic_tpl_spec_decl.hpp @@ -160,15 +160,15 @@ KOKKOSSPARSE_SPADD_SYMBOLIC_TPL_SPEC_DECL_CUSPARSE_EXT(false) OFFSET_TYPE nnzB = colidxB.extent(0); \ OFFSET_TYPE nnzC = 0; \ \ - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_set_stream(rocspHandle, exec.hip_stream())); \ - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_create_mat_descr(&rocData.descrA)); \ - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_create_mat_descr(&rocData.descrB)); \ - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_create_mat_descr(&rocData.descrC)); \ - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_csrgeam_nnz(rocspHandle, m, n, rocData.descrA, nnzA, rowmapA.data(), \ - colidxA.data(), rocData.descrB, nnzB, rowmapB.data(), \ - colidxB.data(), rocData.descrC, rowmapC.data(), &nnzC)); \ + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_set_stream(rocspHandle, exec.hip_stream())); \ + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_create_mat_descr(&rocData.descrA)); \ + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_create_mat_descr(&rocData.descrB)); \ + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_create_mat_descr(&rocData.descrC)); \ + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_csrgeam_nnz( \ + rocspHandle, m, n, rocData.descrA, nnzA, rowmapA.data(), colidxA.data(), rocData.descrB, nnzB, \ + rowmapB.data(), colidxB.data(), rocData.descrC, rowmapC.data(), &nnzC)); \ addHandle->set_c_nnz(nnzC); \ - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_set_stream(rocspHandle, NULL)); \ + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_set_stream(rocspHandle, NULL)); \ Kokkos::Profiling::popRegion(); \ } \ }; diff --git a/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_avail.hpp b/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_avail.hpp index 6609d77a81da..bbec0e402aa8 100644 --- a/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_avail.hpp +++ b/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_avail.hpp @@ -45,23 +45,23 @@ struct spgemm_numeric_tpl_spec_avail { struct spgemm_numeric_tpl_spec_avail< \ KokkosKernels::Experimental::KokkosKernelsHandle, \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits > > { \ enum : bool { value = true }; \ }; @@ -80,30 +80,30 @@ SPGEMM_NUMERIC_AVAIL_CUSPARSE_S(Kokkos::complex) #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE -#define SPGEMM_NUMERIC_AVAIL_ROCSPARSE(SCALAR) \ - template <> \ - struct spgemm_numeric_tpl_spec_avail< \ - KokkosKernels::Experimental::KokkosKernelsHandle, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define SPGEMM_NUMERIC_AVAIL_ROCSPARSE(SCALAR) \ + template <> \ + struct spgemm_numeric_tpl_spec_avail< \ + KokkosKernels::Experimental::KokkosKernelsHandle, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; SPGEMM_NUMERIC_AVAIL_ROCSPARSE(float) @@ -113,30 +113,30 @@ SPGEMM_NUMERIC_AVAIL_ROCSPARSE(Kokkos::complex) #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL -#define SPGEMM_NUMERIC_AVAIL_MKL(SCALAR, EXEC) \ - template <> \ - struct spgemm_numeric_tpl_spec_avail< \ - KokkosKernels::Experimental::KokkosKernelsHandle, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define SPGEMM_NUMERIC_AVAIL_MKL(SCALAR, EXEC) \ + template <> \ + struct spgemm_numeric_tpl_spec_avail< \ + KokkosKernels::Experimental::KokkosKernelsHandle, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; #define SPGEMM_NUMERIC_AVAIL_MKL_E(EXEC) \ diff --git a/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_decl.hpp b/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_decl.hpp index 5e636eea0e07..c26118fec9b6 100644 --- a/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_decl.hpp +++ b/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_decl.hpp @@ -191,50 +191,53 @@ void spgemm_numeric_cusparse(KernelHandle *handle, lno_t m, lno_t n, lno_t k, co #endif -#define SPGEMM_NUMERIC_DECL_CUSPARSE(SCALAR, MEMSPACE, TPL_AVAIL) \ - template <> \ - struct SPGEMM_NUMERIC, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, TPL_AVAIL> { \ - using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle; \ - using c_int_view_t = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using int_view_t = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using c_scalar_view_t = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using scalar_view_t = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - static void spgemm_numeric(KernelHandle *handle, typename KernelHandle::nnz_lno_t m, \ - typename KernelHandle::nnz_lno_t n, typename KernelHandle::nnz_lno_t k, \ - c_int_view_t row_mapA, c_int_view_t entriesA, c_scalar_view_t valuesA, bool, \ - c_int_view_t row_mapB, c_int_view_t entriesB, c_scalar_view_t valuesB, bool, \ - c_int_view_t row_mapC, int_view_t entriesC, scalar_view_t valuesC) { \ - std::string label = "KokkosSparse::spgemm_numeric[TPL_CUSPARSE," + Kokkos::ArithTraits::name() + "]"; \ - Kokkos::Profiling::pushRegion(label); \ - spgemm_numeric_cusparse(handle->get_spgemm_handle(), m, n, k, row_mapA, entriesA, valuesA, row_mapB, entriesB, \ - valuesB, row_mapC, entriesC, valuesC); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define SPGEMM_NUMERIC_DECL_CUSPARSE(SCALAR, MEMSPACE, TPL_AVAIL) \ + template <> \ + struct SPGEMM_NUMERIC, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, TPL_AVAIL> { \ + using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle; \ + using c_int_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using int_view_t = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using c_scalar_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using scalar_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + static void spgemm_numeric(KernelHandle *handle, typename KernelHandle::nnz_lno_t m, \ + typename KernelHandle::nnz_lno_t n, typename KernelHandle::nnz_lno_t k, \ + c_int_view_t row_mapA, c_int_view_t entriesA, c_scalar_view_t valuesA, bool, \ + c_int_view_t row_mapB, c_int_view_t entriesB, c_scalar_view_t valuesB, bool, \ + c_int_view_t row_mapC, int_view_t entriesC, scalar_view_t valuesC) { \ + std::string label = "KokkosSparse::spgemm_numeric[TPL_CUSPARSE," + Kokkos::ArithTraits::name() + "]"; \ + Kokkos::Profiling::pushRegion(label); \ + spgemm_numeric_cusparse(handle->get_spgemm_handle(), m, n, k, row_mapA, entriesA, valuesA, row_mapB, entriesB, \ + valuesB, row_mapC, entriesC, valuesC); \ + Kokkos::Profiling::popRegion(); \ + } \ }; #define SPGEMM_NUMERIC_DECL_CUSPARSE_S(SCALAR, TPL_AVAIL) \ @@ -301,18 +304,18 @@ void spgemm_numeric_rocsparse(KernelHandle *handle, typename KernelHandle::nnz_l auto nnz_B = colidxB.extent(0); auto nnz_C = colidxC.extent(0); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_get_pointer_mode(h->rocsparseHandle, &oldPtrMode)); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_set_pointer_mode(h->rocsparseHandle, rocsparse_pointer_mode_host)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_get_pointer_mode(h->rocsparseHandle, &oldPtrMode)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_set_pointer_mode(h->rocsparseHandle, rocsparse_pointer_mode_host)); if (!handle->are_entries_computed()) { - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_csrgemm_symbolic( + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_csrgemm_symbolic( h->rocsparseHandle, h->opA, h->opB, m, k, n, h->descr_A, nnz_A, rowptrA.data(), colidxA.data(), h->descr_B, nnz_B, rowptrB.data(), colidxB.data(), h->descr_D, 0, nullptr, nullptr, h->descr_C, nnz_C, rowptrC.data(), colidxC.data(), h->info_C, h->buffer)); handle->set_computed_entries(); } - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_Xcsrgemm_numeric( + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_Xcsrgemm_numeric( h->rocsparseHandle, h->opA, h->opB, m, k, n, reinterpret_cast(&alpha), h->descr_A, nnz_A, reinterpret_cast(valuesA.data()), rowptrA.data(), colidxA.data(), h->descr_B, nnz_B, reinterpret_cast(valuesB.data()), rowptrB.data(), @@ -320,45 +323,49 @@ void spgemm_numeric_rocsparse(KernelHandle *handle, typename KernelHandle::nnz_l h->descr_C, nnz_C, reinterpret_cast(valuesC.data()), rowptrC.data(), colidxC.data(), h->info_C, h->buffer)); // Restore old pointer mode - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_set_pointer_mode(h->rocsparseHandle, oldPtrMode)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_set_pointer_mode(h->rocsparseHandle, oldPtrMode)); handle->set_call_numeric(); } #define SPGEMM_NUMERIC_DECL_ROCSPARSE(SCALAR, TPL_AVAIL) \ template <> \ - struct SPGEMM_NUMERIC, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, TPL_AVAIL> { \ + struct SPGEMM_NUMERIC< \ + KokkosKernels::Experimental::KokkosKernelsHandle, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, TPL_AVAIL> { \ using KernelHandle = \ KokkosKernels::Experimental::KokkosKernelsHandle; \ - using c_int_view_t = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using int_view_t = Kokkos::View, \ - Kokkos::MemoryTraits>; \ + using c_int_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using int_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ using c_scalar_view_t = \ - Kokkos::View, \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using scalar_view_t = \ + Kokkos::View, \ Kokkos::MemoryTraits>; \ - using scalar_view_t = Kokkos::View, \ - Kokkos::MemoryTraits>; \ static void spgemm_numeric(KernelHandle *handle, typename KernelHandle::nnz_lno_t m, \ typename KernelHandle::nnz_lno_t n, typename KernelHandle::nnz_lno_t k, \ c_int_view_t row_mapA, c_int_view_t entriesA, c_scalar_view_t valuesA, bool, \ @@ -437,50 +444,54 @@ void spgemm_numeric_mkl(KernelHandle *handle, typename KernelHandle::nnz_lno_t m handle->set_computed_entries(); } -#define SPGEMM_NUMERIC_DECL_MKL(SCALAR, EXEC, TPL_AVAIL) \ - template <> \ - struct SPGEMM_NUMERIC, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, TPL_AVAIL> { \ - using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle \ + struct SPGEMM_NUMERIC< \ + KokkosKernels::Experimental::KokkosKernelsHandle, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, TPL_AVAIL> { \ + using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle; \ - using c_int_view_t = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using int_view_t = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using c_scalar_view_t = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using scalar_view_t = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - static void spgemm_numeric(KernelHandle *handle, typename KernelHandle::nnz_lno_t m, \ - typename KernelHandle::nnz_lno_t n, typename KernelHandle::nnz_lno_t k, \ - c_int_view_t row_mapA, c_int_view_t entriesA, c_scalar_view_t valuesA, bool, \ - c_int_view_t row_mapB, c_int_view_t entriesB, c_scalar_view_t valuesB, bool, \ - c_int_view_t row_mapC, int_view_t entriesC, scalar_view_t valuesC) { \ - std::string label = "KokkosSparse::spgemm_numeric[TPL_MKL," + Kokkos::ArithTraits::name() + "]"; \ - Kokkos::Profiling::pushRegion(label); \ - spgemm_numeric_mkl(handle->get_spgemm_handle(), m, n, k, row_mapA, entriesA, valuesA, row_mapB, entriesB, \ - valuesB, row_mapC, entriesC, valuesC); \ - Kokkos::Profiling::popRegion(); \ - } \ + using c_int_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using int_view_t = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using c_scalar_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using scalar_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + static void spgemm_numeric(KernelHandle *handle, typename KernelHandle::nnz_lno_t m, \ + typename KernelHandle::nnz_lno_t n, typename KernelHandle::nnz_lno_t k, \ + c_int_view_t row_mapA, c_int_view_t entriesA, c_scalar_view_t valuesA, bool, \ + c_int_view_t row_mapB, c_int_view_t entriesB, c_scalar_view_t valuesB, bool, \ + c_int_view_t row_mapC, int_view_t entriesC, scalar_view_t valuesC) { \ + std::string label = "KokkosSparse::spgemm_numeric[TPL_MKL," + Kokkos::ArithTraits::name() + "]"; \ + Kokkos::Profiling::pushRegion(label); \ + spgemm_numeric_mkl(handle->get_spgemm_handle(), m, n, k, row_mapA, entriesA, valuesA, row_mapB, entriesB, \ + valuesB, row_mapC, entriesC, valuesC); \ + Kokkos::Profiling::popRegion(); \ + } \ }; #define SPGEMM_NUMERIC_DECL_MKL_SE(SCALAR, EXEC) \ diff --git a/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_avail.hpp b/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_avail.hpp index 75615623a525..392eb761c525 100644 --- a/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_avail.hpp +++ b/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_avail.hpp @@ -44,15 +44,15 @@ struct spgemm_symbolic_tpl_spec_avail { struct spgemm_symbolic_tpl_spec_avail< \ KokkosKernels::Experimental::KokkosKernelsHandle, \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits > > { \ enum : bool { value = true }; \ }; @@ -70,22 +70,22 @@ SPGEMM_SYMBOLIC_AVAIL_CUSPARSE_S(Kokkos::complex) #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE -#define SPGEMM_SYMBOLIC_AVAIL_ROCSPARSE(SCALAR) \ - template <> \ - struct spgemm_symbolic_tpl_spec_avail< \ - KokkosKernels::Experimental::KokkosKernelsHandle, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define SPGEMM_SYMBOLIC_AVAIL_ROCSPARSE(SCALAR) \ + template <> \ + struct spgemm_symbolic_tpl_spec_avail< \ + KokkosKernels::Experimental::KokkosKernelsHandle, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; SPGEMM_SYMBOLIC_AVAIL_ROCSPARSE(float) @@ -95,22 +95,22 @@ SPGEMM_SYMBOLIC_AVAIL_ROCSPARSE(Kokkos::complex) #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL -#define SPGEMM_SYMBOLIC_AVAIL_MKL(SCALAR, EXEC) \ - template <> \ - struct spgemm_symbolic_tpl_spec_avail< \ - KokkosKernels::Experimental::KokkosKernelsHandle, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define SPGEMM_SYMBOLIC_AVAIL_MKL(SCALAR, EXEC) \ + template <> \ + struct spgemm_symbolic_tpl_spec_avail< \ + KokkosKernels::Experimental::KokkosKernelsHandle, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; #define SPGEMM_SYMBOLIC_AVAIL_MKL_E(EXEC) \ diff --git a/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_decl.hpp b/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_decl.hpp index 4ac41ca80dde..6385fff83534 100644 --- a/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_decl.hpp +++ b/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_decl.hpp @@ -313,25 +313,27 @@ void spgemm_symbolic_cusparse(KernelHandle *handle, lno_t m, lno_t n, lno_t k, c #define SPGEMM_SYMBOLIC_DECL_CUSPARSE(SCALAR, MEMSPACE, TPL_AVAIL) \ template <> \ - struct SPGEMM_SYMBOLIC, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, TPL_AVAIL> { \ + struct SPGEMM_SYMBOLIC< \ + KokkosKernels::Experimental::KokkosKernelsHandle, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, TPL_AVAIL> { \ using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle; \ - using c_int_view_t = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using int_view_t = Kokkos::View, \ - Kokkos::MemoryTraits>; \ + using c_int_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using int_view_t = Kokkos::View, \ + Kokkos::MemoryTraits>; \ static void spgemm_symbolic(KernelHandle *handle, typename KernelHandle::nnz_lno_t m, \ typename KernelHandle::nnz_lno_t n, typename KernelHandle::nnz_lno_t k, \ c_int_view_t row_mapA, c_int_view_t entriesA, bool, c_int_view_t row_mapB, \ @@ -409,11 +411,11 @@ void spgemm_symbolic_rocsparse(KernelHandle *handle, typename KernelHandle::nnz_ const auto beta = Kokkos::ArithTraits::zero(); rocsparse_pointer_mode oldPtrMode; - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_get_pointer_mode(h->rocsparseHandle, &oldPtrMode)); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_set_pointer_mode(h->rocsparseHandle, rocsparse_pointer_mode_host)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_get_pointer_mode(h->rocsparseHandle, &oldPtrMode)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_set_pointer_mode(h->rocsparseHandle, rocsparse_pointer_mode_host)); // C = alpha * OpA(A) * OpB(B) + beta * D - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_Xcsrgemm_buffer_size( + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_Xcsrgemm_buffer_size( h->rocsparseHandle, h->opA, h->opB, m, k, n, reinterpret_cast(&alpha), h->descr_A, nnz_A, rowptrA.data(), colidxA.data(), h->descr_B, nnz_B, rowptrB.data(), colidxB.data(), reinterpret_cast(&beta), h->descr_D, 0, nullptr, nullptr, h->info_C, @@ -422,16 +424,16 @@ void spgemm_symbolic_rocsparse(KernelHandle *handle, typename KernelHandle::nnz_ KOKKOS_IMPL_HIP_SAFE_CALL(hipMalloc(&h->buffer, h->bufferSize)); rocsparse_int nnz_C = 0; - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_csrgemm_nnz(h->rocsparseHandle, h->opA, h->opB, m, k, n, h->descr_A, nnz_A, - rowptrA.data(), colidxA.data(), h->descr_B, nnz_B, - rowptrB.data(), colidxB.data(), h->descr_D, 0, nullptr, nullptr, - h->descr_C, rowptrC.data(), &nnz_C, h->info_C, h->buffer)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL( + rocsparse_csrgemm_nnz(h->rocsparseHandle, h->opA, h->opB, m, k, n, h->descr_A, nnz_A, rowptrA.data(), + colidxA.data(), h->descr_B, nnz_B, rowptrB.data(), colidxB.data(), h->descr_D, 0, nullptr, + nullptr, h->descr_C, rowptrC.data(), &nnz_C, h->info_C, h->buffer)); // If C has zero rows, its rowptrs are not populated if (m == 0) { KOKKOS_IMPL_HIP_SAFE_CALL(hipMemset(rowptrC.data(), 0, rowptrC.extent(0) * sizeof(index_type))); } // Restore previous pointer mode - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_set_pointer_mode(h->rocsparseHandle, oldPtrMode)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_set_pointer_mode(h->rocsparseHandle, oldPtrMode)); handle->set_c_nnz(nnz_C); handle->set_call_symbolic(); @@ -440,26 +442,29 @@ void spgemm_symbolic_rocsparse(KernelHandle *handle, typename KernelHandle::nnz_ #define SPGEMM_SYMBOLIC_DECL_ROCSPARSE(SCALAR, TPL_AVAIL) \ template <> \ - struct SPGEMM_SYMBOLIC, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, TPL_AVAIL> { \ + struct SPGEMM_SYMBOLIC< \ + KokkosKernels::Experimental::KokkosKernelsHandle, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, TPL_AVAIL> { \ using KernelHandle = \ KokkosKernels::Experimental::KokkosKernelsHandle; \ - using c_int_view_t = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using int_view_t = Kokkos::View, \ - Kokkos::MemoryTraits>; \ + using c_int_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using int_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ static void spgemm_symbolic(KernelHandle *handle, typename KernelHandle::nnz_lno_t m, \ typename KernelHandle::nnz_lno_t n, typename KernelHandle::nnz_lno_t k, \ c_int_view_t row_mapA, c_int_view_t entriesA, bool, c_int_view_t row_mapB, \ @@ -529,25 +534,27 @@ void spgemm_symbolic_mkl(KernelHandle *handle, typename KernelHandle::nnz_lno_t #define SPGEMM_SYMBOLIC_DECL_MKL(SCALAR, EXEC, TPL_AVAIL) \ template <> \ - struct SPGEMM_SYMBOLIC, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, TPL_AVAIL> { \ + struct SPGEMM_SYMBOLIC< \ + KokkosKernels::Experimental::KokkosKernelsHandle, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, TPL_AVAIL> { \ using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle; \ - using c_int_view_t = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using int_view_t = Kokkos::View, \ - Kokkos::MemoryTraits>; \ + using c_int_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using int_view_t = Kokkos::View, \ + Kokkos::MemoryTraits>; \ static void spgemm_symbolic(KernelHandle *handle, typename KernelHandle::nnz_lno_t m, \ typename KernelHandle::nnz_lno_t n, typename KernelHandle::nnz_lno_t k, \ c_int_view_t row_mapA, c_int_view_t entriesA, bool, c_int_view_t row_mapB, \ diff --git a/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp b/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp index 11bf82f7b46f..defb13044f0c 100644 --- a/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp +++ b/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp @@ -667,26 +667,26 @@ void spmv_bsr_rocsparse(const Kokkos::HIP& exec, Handle* handle, const char mode } else { subhandle = new KokkosSparse::Impl::RocSparse_BSR_SpMV_Data(exec); handle->tpl_rank1 = subhandle; - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_create_mat_descr(&subhandle->mat)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_create_mat_descr(&subhandle->mat)); // *_ex* functions deprecated in introduced in 6+ #if KOKKOSSPARSE_IMPL_ROCM_VERSION >= 60000 - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_create_mat_info(&subhandle->info)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_create_mat_info(&subhandle->info)); if constexpr (std::is_same_v) { - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_sbsrmv_analysis(rocsparseHandle, dir, trans, mb, nb, nnzb, - subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, - block_dim, subhandle->info)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_sbsrmv_analysis(rocsparseHandle, dir, trans, mb, nb, nnzb, + subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, + block_dim, subhandle->info)); } else if constexpr (std::is_same_v) { - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_dbsrmv_analysis(rocsparseHandle, dir, trans, mb, nb, nnzb, - subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, - block_dim, subhandle->info)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_dbsrmv_analysis(rocsparseHandle, dir, trans, mb, nb, nnzb, + subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, + block_dim, subhandle->info)); } else if constexpr (std::is_same_v>) { - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_cbsrmv_analysis(rocsparseHandle, dir, trans, mb, nb, nnzb, - subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, - block_dim, subhandle->info)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_cbsrmv_analysis(rocsparseHandle, dir, trans, mb, nb, nnzb, + subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, + block_dim, subhandle->info)); } else if constexpr (std::is_same_v>) { - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_zbsrmv_analysis(rocsparseHandle, dir, trans, mb, nb, nnzb, - subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, - block_dim, subhandle->info)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_zbsrmv_analysis(rocsparseHandle, dir, trans, mb, nb, nnzb, + subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, + block_dim, subhandle->info)); } else { static_assert(KokkosKernels::Impl::always_false_v, "unsupported value type for rocsparse_*bsrmv"); } @@ -694,23 +694,23 @@ void spmv_bsr_rocsparse(const Kokkos::HIP& exec, Handle* handle, const char mode #elif KOKKOSSPARSE_IMPL_ROCM_VERSION < 50400 // No analysis step in the older versions #else - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_create_mat_info(&subhandle->info)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_create_mat_info(&subhandle->info)); if constexpr (std::is_same_v) { - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_sbsrmv_ex_analysis(rocsparseHandle, dir, trans, mb, nb, nnzb, - subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, - block_dim, subhandle->info)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_sbsrmv_ex_analysis(rocsparseHandle, dir, trans, mb, nb, nnzb, + subhandle->mat, bsr_val, bsr_row_ptr, + bsr_col_ind, block_dim, subhandle->info)); } else if constexpr (std::is_same_v) { - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_dbsrmv_ex_analysis(rocsparseHandle, dir, trans, mb, nb, nnzb, - subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, - block_dim, subhandle->info)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_dbsrmv_ex_analysis(rocsparseHandle, dir, trans, mb, nb, nnzb, + subhandle->mat, bsr_val, bsr_row_ptr, + bsr_col_ind, block_dim, subhandle->info)); } else if constexpr (std::is_same_v>) { - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_cbsrmv_ex_analysis(rocsparseHandle, dir, trans, mb, nb, nnzb, - subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, - block_dim, subhandle->info)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_cbsrmv_ex_analysis(rocsparseHandle, dir, trans, mb, nb, nnzb, + subhandle->mat, bsr_val, bsr_row_ptr, + bsr_col_ind, block_dim, subhandle->info)); } else if constexpr (std::is_same_v>) { - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_zbsrmv_ex_analysis(rocsparseHandle, dir, trans, mb, nb, nnzb, - subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, - block_dim, subhandle->info)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_zbsrmv_ex_analysis(rocsparseHandle, dir, trans, mb, nb, nnzb, + subhandle->mat, bsr_val, bsr_row_ptr, + bsr_col_ind, block_dim, subhandle->info)); } else { static_assert(KokkosKernels::Impl::always_false_v, "unsupported value type for rocsparse_*bsrmv"); } @@ -720,58 +720,62 @@ void spmv_bsr_rocsparse(const Kokkos::HIP& exec, Handle* handle, const char mode // *_ex* functions deprecated in introduced in 6+ #if KOKKOSSPARSE_IMPL_ROCM_VERSION >= 60000 if constexpr (std::is_same_v) { - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_sbsrmv(rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, subhandle->mat, - bsr_val, bsr_row_ptr, bsr_col_ind, block_dim, subhandle->info, x_, - beta_, y_)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_sbsrmv(rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, + subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, block_dim, + subhandle->info, x_, beta_, y_)); } else if constexpr (std::is_same_v) { - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_dbsrmv(rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, subhandle->mat, - bsr_val, bsr_row_ptr, bsr_col_ind, block_dim, subhandle->info, x_, - beta_, y_)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_dbsrmv(rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, + subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, block_dim, + subhandle->info, x_, beta_, y_)); } else if constexpr (std::is_same_v>) { - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_cbsrmv(rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, subhandle->mat, - bsr_val, bsr_row_ptr, bsr_col_ind, block_dim, subhandle->info, x_, - beta_, y_)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_cbsrmv(rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, + subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, block_dim, + subhandle->info, x_, beta_, y_)); } else if constexpr (std::is_same_v>) { - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_zbsrmv(rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, subhandle->mat, - bsr_val, bsr_row_ptr, bsr_col_ind, block_dim, subhandle->info, x_, - beta_, y_)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_zbsrmv(rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, + subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, block_dim, + subhandle->info, x_, beta_, y_)); } else { static_assert(KokkosKernels::Impl::always_false_v, "unsupported value type for rocsparse_*bsrmv"); } // *_ex* functions introduced in 5.4.0 #elif KOKKOSSPARSE_IMPL_ROCM_VERSION < 50400 if constexpr (std::is_same_v) { - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_sbsrmv(rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, subhandle->mat, - bsr_val, bsr_row_ptr, bsr_col_ind, block_dim, x_, beta_, y_)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_sbsrmv(rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, + subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, block_dim, + x_, beta_, y_)); } else if constexpr (std::is_same_v) { - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_dbsrmv(rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, subhandle->mat, - bsr_val, bsr_row_ptr, bsr_col_ind, block_dim, x_, beta_, y_)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_dbsrmv(rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, + subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, block_dim, + x_, beta_, y_)); } else if constexpr (std::is_same_v>) { - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_cbsrmv(rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, subhandle->mat, - bsr_val, bsr_row_ptr, bsr_col_ind, block_dim, x_, beta_, y_)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_cbsrmv(rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, + subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, block_dim, + x_, beta_, y_)); } else if constexpr (std::is_same_v>) { - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_zbsrmv(rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, subhandle->mat, - bsr_val, bsr_row_ptr, bsr_col_ind, block_dim, x_, beta_, y_)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_zbsrmv(rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, + subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, block_dim, + x_, beta_, y_)); } else { static_assert(KokkosKernels::Impl::always_false_v, "unsupported value type for rocsparse_*bsrmv"); } #else if constexpr (std::is_same_v) { - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_sbsrmv_ex(rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, - subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, block_dim, - subhandle->info, x_, beta_, y_)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_sbsrmv_ex(rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, + subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, + block_dim, subhandle->info, x_, beta_, y_)); } else if constexpr (std::is_same_v) { - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_dbsrmv_ex(rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, - subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, block_dim, - subhandle->info, x_, beta_, y_)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_dbsrmv_ex(rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, + subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, + block_dim, subhandle->info, x_, beta_, y_)); } else if constexpr (std::is_same_v>) { - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_cbsrmv_ex(rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, - subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, block_dim, - subhandle->info, x_, beta_, y_)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_cbsrmv_ex(rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, + subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, + block_dim, subhandle->info, x_, beta_, y_)); } else if constexpr (std::is_same_v>) { - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_zbsrmv_ex(rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, - subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, block_dim, - subhandle->info, x_, beta_, y_)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_zbsrmv_ex(rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, + subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, + block_dim, subhandle->info, x_, beta_, y_)); } else { static_assert(KokkosKernels::Impl::always_false_v, "unsupported value type for rocsparse_*bsrmv"); } diff --git a/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp b/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp index c4cc3fbc88bd..bcd367b9bb8b 100644 --- a/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp +++ b/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp @@ -331,10 +331,10 @@ void spmv_mv_rocsparse(const Kokkos::HIP &exec, Handle *handle, const char mode[ } rocsparse_dnmat_descr vecX, vecY; - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL( rocsparse_create_dnmat_descr(&vecX, x.extent(0), x.extent(1), x_ld, x_data, rocsparse_compute_type(), x_order)); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL( rocsparse_create_dnmat_descr(&vecY, y.extent(0), y.extent(1), y_ld, y_data, rocsparse_compute_type(), y_order)); @@ -359,30 +359,30 @@ void spmv_mv_rocsparse(const Kokkos::HIP &exec, Handle *handle, const char mode[ void *csr_col_ind = static_cast(const_cast(A.graph.entries.data())); void *csr_val = static_cast(const_cast(A.values.data())); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_create_csr_descr( + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_create_csr_descr( &subhandle->mat, A.numRows(), A.numCols(), A.nnz(), csr_row_ptr, csr_col_ind, csr_val, offset_index_type, entry_index_type, rocsparse_index_base_zero, compute_type)); // Size and allocate buffer, and analyze the matrix - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_spmm(rocsparseHandle, rocsparseOperation, rocsparse_operation_none, - &alpha, subhandle->mat, vecX, &beta, vecY, compute_type, alg, - rocsparse_spmm_stage_buffer_size, &subhandle->bufferSize, nullptr)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_spmm( + rocsparseHandle, rocsparseOperation, rocsparse_operation_none, &alpha, subhandle->mat, vecX, &beta, vecY, + compute_type, alg, rocsparse_spmm_stage_buffer_size, &subhandle->bufferSize, nullptr)); KOKKOS_IMPL_HIP_SAFE_CALL(hipMalloc(&subhandle->buffer, subhandle->bufferSize)); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_spmm( + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_spmm( rocsparseHandle, rocsparseOperation, rocsparse_operation_none, &alpha, subhandle->mat, vecX, &beta, vecY, compute_type, alg, rocsparse_spmm_stage_preprocess, &subhandle->bufferSize, subhandle->buffer)); } // Perform the actual computation - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL( rocsparse_spmm(rocsparseHandle, rocsparseOperation, rocsparse_operation_none, &alpha, subhandle->mat, vecX, &beta, vecY, compute_type, alg, rocsparse_spmm_stage_compute, &subhandle->bufferSize, subhandle->buffer)); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_destroy_dnmat_descr(vecY)); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_destroy_dnmat_descr(vecX)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_destroy_dnmat_descr(vecY)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_destroy_dnmat_descr(vecX)); } #define KOKKOSSPARSE_SPMV_MV_ROCSPARSE(SCALAR, XL, YL, MEMSPACE) \ diff --git a/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp b/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp index 6de2a70359ca..8cf1f49e516a 100644 --- a/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp +++ b/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp @@ -301,9 +301,9 @@ void spmv_rocsparse(const Kokkos::HIP& exec, Handle* handle, const char mode[], rocsparse_dnvec_descr vecX, vecY; void* x_data = static_cast(const_cast(x.data())); void* y_data = static_cast(const_cast(y.data())); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_create_dnvec_descr( + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_create_dnvec_descr( &vecX, x.extent_int(0), x_data, rocsparse_compute_type())); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_create_dnvec_descr( + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_create_dnvec_descr( &vecY, y.extent_int(0), y_data, rocsparse_compute_type())); // Default to using the "stream" algorithm which has almost no setup cost, @@ -332,31 +332,32 @@ void spmv_rocsparse(const Kokkos::HIP& exec, Handle* handle, const char mode[], void* csr_col_ind = static_cast(const_cast(A.graph.entries.data())); void* csr_val = static_cast(const_cast(A.values.data())); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_create_csr_descr( + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_create_csr_descr( &subhandle->mat, A.numRows(), A.numCols(), A.nnz(), csr_row_ptr, csr_col_ind, csr_val, offset_index_type, entry_index_type, rocsparse_index_base_zero, compute_type)); /* Size and allocate buffer, and analyze the matrix */ #if KOKKOSSPARSE_IMPL_ROCM_VERSION >= 60000 - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_spmv(rocsparseHandle, myRocsparseOperation, &alpha, subhandle->mat, vecX, - &beta, vecY, compute_type, alg, rocsparse_spmv_stage_buffer_size, - &subhandle->bufferSize, nullptr)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL( + rocsparse_spmv(rocsparseHandle, myRocsparseOperation, &alpha, subhandle->mat, vecX, &beta, vecY, compute_type, + alg, rocsparse_spmv_stage_buffer_size, &subhandle->bufferSize, nullptr)); KOKKOS_IMPL_HIP_SAFE_CALL(hipMalloc(&subhandle->buffer, subhandle->bufferSize)); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_spmv(rocsparseHandle, myRocsparseOperation, &alpha, subhandle->mat, vecX, - &beta, vecY, compute_type, alg, rocsparse_spmv_stage_preprocess, - &subhandle->bufferSize, subhandle->buffer)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL( + rocsparse_spmv(rocsparseHandle, myRocsparseOperation, &alpha, subhandle->mat, vecX, &beta, vecY, compute_type, + alg, rocsparse_spmv_stage_preprocess, &subhandle->bufferSize, subhandle->buffer)); #elif KOKKOSSPARSE_IMPL_ROCM_VERSION >= 50400 - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_spmv_ex(rocsparseHandle, myRocsparseOperation, &alpha, subhandle->mat, - vecX, &beta, vecY, compute_type, alg, rocsparse_spmv_stage_auto, - &subhandle->bufferSize, nullptr)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL( + rocsparse_spmv_ex(rocsparseHandle, myRocsparseOperation, &alpha, subhandle->mat, vecX, &beta, vecY, + compute_type, alg, rocsparse_spmv_stage_auto, &subhandle->bufferSize, nullptr)); KOKKOS_IMPL_HIP_SAFE_CALL(hipMalloc(&subhandle->buffer, subhandle->bufferSize)); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_spmv_ex( + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_spmv_ex( rocsparseHandle, myRocsparseOperation, &alpha, subhandle->mat, vecX, &beta, vecY, compute_type, alg, rocsparse_spmv_stage_preprocess, &subhandle->bufferSize, subhandle->buffer)); #else - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_spmv(rocsparseHandle, myRocsparseOperation, &alpha, subhandle->mat, vecX, - &beta, vecY, compute_type, alg, &subhandle->bufferSize, nullptr)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_spmv(rocsparseHandle, myRocsparseOperation, &alpha, subhandle->mat, + vecX, &beta, vecY, compute_type, alg, &subhandle->bufferSize, + nullptr)); KOKKOS_IMPL_HIP_SAFE_CALL(hipMalloc(&subhandle->buffer, subhandle->bufferSize)); #endif } @@ -364,21 +365,21 @@ void spmv_rocsparse(const Kokkos::HIP& exec, Handle* handle, const char mode[], /* Perform the actual computation */ #if KOKKOSSPARSE_IMPL_ROCM_VERSION >= 60000 - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_spmv(rocsparseHandle, myRocsparseOperation, &alpha, subhandle->mat, vecX, - &beta, vecY, compute_type, alg, rocsparse_spmv_stage_compute, - &subhandle->bufferSize, subhandle->buffer)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL( + rocsparse_spmv(rocsparseHandle, myRocsparseOperation, &alpha, subhandle->mat, vecX, &beta, vecY, compute_type, + alg, rocsparse_spmv_stage_compute, &subhandle->bufferSize, subhandle->buffer)); #elif KOKKOSSPARSE_IMPL_ROCM_VERSION >= 50400 - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_spmv_ex(rocsparseHandle, myRocsparseOperation, &alpha, subhandle->mat, vecX, - &beta, vecY, compute_type, alg, rocsparse_spmv_stage_compute, - &subhandle->bufferSize, subhandle->buffer)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL( + rocsparse_spmv_ex(rocsparseHandle, myRocsparseOperation, &alpha, subhandle->mat, vecX, &beta, vecY, compute_type, + alg, rocsparse_spmv_stage_compute, &subhandle->bufferSize, subhandle->buffer)); #else - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_spmv(rocsparseHandle, myRocsparseOperation, &alpha, subhandle->mat, vecX, - &beta, vecY, compute_type, alg, &subhandle->bufferSize, - subhandle->buffer)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_spmv(rocsparseHandle, myRocsparseOperation, &alpha, subhandle->mat, + vecX, &beta, vecY, compute_type, alg, &subhandle->bufferSize, + subhandle->buffer)); #endif - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_destroy_dnvec_descr(vecY)); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_destroy_dnvec_descr(vecX)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_destroy_dnvec_descr(vecY)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_destroy_dnvec_descr(vecX)); } #define KOKKOSSPARSE_SPMV_ROCSPARSE(SCALAR, LAYOUT) \ diff --git a/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_SortCrs.hpp b/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_SortCrs.hpp index 6898b8aa9d9d..0877665ae0f9 100644 --- a/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_SortCrs.hpp +++ b/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_SortCrs.hpp @@ -42,11 +42,11 @@ enum : int { } template -void testSortCRS(default_lno_t numRows, default_lno_t numCols, default_size_type nnz, bool doValues, - bool doStructInterface, int howExecSpecified) { - using scalar_t = default_scalar; - using lno_t = default_lno_t; - using size_type = default_size_type; +void testSortCRS(KokkosKernels::default_lno_t numRows, KokkosKernels::default_lno_t numCols, + KokkosKernels::default_size_type nnz, bool doValues, bool doStructInterface, int howExecSpecified) { + using scalar_t = KokkosKernels::default_scalar; + using lno_t = KokkosKernels::default_lno_t; + using size_type = KokkosKernels::default_size_type; using exec_space = typename device_t::execution_space; using crsMat_t = KokkosSparse::CrsMatrix; // Create a random matrix on device @@ -135,9 +135,9 @@ void testSortCRS(default_lno_t numRows, default_lno_t numCols, default_size_type template void testSortCRSUnmanaged(bool doValues, bool doStructInterface) { // This test is about bug #960. - using scalar_t = default_scalar; - using lno_t = default_lno_t; - using size_type = default_size_type; + using scalar_t = KokkosKernels::default_scalar; + using lno_t = KokkosKernels::default_lno_t; + using size_type = KokkosKernels::default_size_type; using exec_space = typename device_t::execution_space; using crsMat_t = KokkosSparse::CrsMatrix, size_type>; @@ -175,9 +175,9 @@ void testSortCRSUnmanaged(bool doValues, bool doStructInterface) { template void testSortAndMerge(bool justGraph, int howExecSpecified, bool doStructInterface, bool inPlace, int testCase) { - using size_type = default_size_type; - using lno_t = default_lno_t; - using scalar_t = default_scalar; + using size_type = KokkosKernels::default_size_type; + using lno_t = KokkosKernels::default_lno_t; + using scalar_t = KokkosKernels::default_scalar; using exec_space = typename device_t::execution_space; using crsMat_t = KokkosSparse::CrsMatrix; using graph_t = typename crsMat_t::staticcrsgraph_type; diff --git a/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_Transpose.hpp b/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_Transpose.hpp index da430c6ca4a0..d8ddaf500b5c 100644 --- a/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_Transpose.hpp +++ b/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_Transpose.hpp @@ -44,9 +44,9 @@ template void testTranspose(int numRows, int numCols, bool doValues) { using exec_space = typename device_t::execution_space; using range_pol = Kokkos::RangePolicy; - using scalar_t = default_scalar; - using lno_t = default_lno_t; - using size_type = default_size_type; + using scalar_t = KokkosKernels::default_scalar; + using lno_t = KokkosKernels::default_lno_t; + using size_type = KokkosKernels::default_size_type; using crsMat_t = typename KokkosSparse::CrsMatrix; using c_rowmap_t = typename crsMat_t::row_map_type; using c_entries_t = typename crsMat_t::index_type; @@ -106,7 +106,7 @@ template void CompareBsrMatrices(bsrMat_t& A, bsrMat_t& B) { using exec_space = typename bsrMat_t::execution_space; using range_pol = Kokkos::RangePolicy; - using size_type = default_size_type; + using size_type = KokkosKernels::default_size_type; using c_rowmap_t = typename bsrMat_t::row_map_type; using c_entries_t = typename bsrMat_t::index_type; using values_t = typename bsrMat_t::values_type::non_const_type; @@ -133,9 +133,9 @@ void CompareBsrMatrices(bsrMat_t& A, bsrMat_t& B) { template void testTransposeBsrRef() { - using scalar_t = default_scalar; - using lno_t = default_lno_t; - using size_type = default_size_type; + using scalar_t = KokkosKernels::default_scalar; + using lno_t = KokkosKernels::default_lno_t; + using size_type = KokkosKernels::default_size_type; using bsrMat_t = typename KokkosSparse::Experimental::BsrMatrix; using rowmap_t = typename bsrMat_t::row_map_type::non_const_type; using entries_t = typename bsrMat_t::index_type::non_const_type; @@ -199,9 +199,9 @@ void testTransposeBsrRef() { template void testTransposeBsr(int numRows, int numCols, int blockSize) { - using scalar_t = default_scalar; - using lno_t = default_lno_t; - using size_type = default_size_type; + using scalar_t = KokkosKernels::default_scalar; + using lno_t = KokkosKernels::default_lno_t; + using size_type = KokkosKernels::default_size_type; using exec_space = typename device_t::execution_space; using bsrMat_t = typename KokkosSparse::Experimental::BsrMatrix; using c_rowmap_t = typename bsrMat_t::row_map_type; diff --git a/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_block_gauss_seidel.hpp b/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_block_gauss_seidel.hpp index 80c23356ce4e..7c6bddfd4d2a 100644 --- a/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_block_gauss_seidel.hpp +++ b/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_block_gauss_seidel.hpp @@ -220,7 +220,7 @@ void test_block_gauss_seidel_rank2(lno_t numRows, size_type nnz, lno_t bandwidth typedef typename crsMat_t::values_type::non_const_type scalar_view_t; typedef typename crsMat_t::StaticCrsGraphType::row_map_type::non_const_type lno_view_t; typedef typename crsMat_t::StaticCrsGraphType::entries_type::non_const_type lno_nnz_view_t; - typedef Kokkos::View scalar_view2d_t; + typedef Kokkos::View scalar_view2d_t; typedef typename Kokkos::ArithTraits::mag_type mag_t; lno_t numCols = numRows; diff --git a/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_bspgemm.hpp b/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_bspgemm.hpp index 32168f1686b4..f5314e516dfa 100644 --- a/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_bspgemm.hpp +++ b/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_bspgemm.hpp @@ -169,7 +169,7 @@ void test_bspgemm(lno_t blkDim, lno_t m, lno_t k, lno_t n, size_type nnz, lno_t SPGEMM_KK, SPGEMM_KK_MEMORY /* alias SPGEMM_KK_MEMSPEED */, SPGEMM_KK_SPEED /* alias SPGEMM_KK_DENSE */ }; - if (!KokkosKernels::Impl::kk_is_gpu_exec_space()) { + if (!KokkosKernels::Impl::is_gpu_exec_space_v) { // SPGEMM_KK_LP is useful on CPU to cover MultiCoreTag4 functor // (otherwise skipped) but on GPU it's same as SPGEMM_KK, so we can skip it. algorithms.push_back(SPGEMM_KK_LP); diff --git a/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_gauss_seidel.hpp b/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_gauss_seidel.hpp index 92109f02dd67..3704b63fbd57 100644 --- a/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_gauss_seidel.hpp +++ b/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_gauss_seidel.hpp @@ -14,6 +14,13 @@ // //@HEADER +#include +#include +#include +#include +#include +#include + #include #include @@ -25,17 +32,13 @@ #include #include #include -#include -#include -#include -#include -#include -#include + #include "KokkosSparse_gauss_seidel.hpp" #include "KokkosSparse_partitioning_impl.hpp" #include "KokkosSparse_sor_sequential_impl.hpp" #include "KokkosSparse_SortCrs.hpp" #include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_TestMatrixUtils.hpp" #include "Test_Sparse_Utils.hpp" // #ifndef kokkos_complex_double @@ -248,8 +251,8 @@ void test_gauss_seidel_rank2(lno_t numRows, size_type nnz, lno_t bandwidth, lno_ using namespace Test; srand(245); typedef typename KokkosSparse::CrsMatrix crsMat_t; - typedef Kokkos::View scalar_view2d_t; - typedef Kokkos::View host_scalar_view2d_t; + typedef Kokkos::View scalar_view2d_t; + typedef Kokkos::View host_scalar_view2d_t; typedef typename Kokkos::ArithTraits::mag_type mag_t; lno_t numCols = numRows; diff --git a/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_removeCrsMatrixZeros.hpp b/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_removeCrsMatrixZeros.hpp index 8df02a1d4d84..dc2d199f09ce 100644 --- a/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_removeCrsMatrixZeros.hpp +++ b/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_removeCrsMatrixZeros.hpp @@ -217,7 +217,8 @@ void getTestInput(int test, Matrix& A, Matrix& Afiltered_ref) { void testRemoveCrsMatrixZeros(int testCase) { using namespace TestRemoveCrsMatrixZeros; - using Matrix = KokkosSparse::CrsMatrix; + using Matrix = KokkosSparse::CrsMatrix; Matrix A, Afiltered_ref; getTestInput(testCase, A, Afiltered_ref); Matrix Afiltered_actual = KokkosSparse::removeCrsMatrixZeros(A); diff --git a/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_rocsparse.hpp b/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_rocsparse.hpp index 379b422d7fe6..6ec48e66654f 100644 --- a/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_rocsparse.hpp +++ b/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_rocsparse.hpp @@ -47,11 +47,11 @@ void test_rocsparse_safe_call() { bool caught_exception = false; rocsparse_status myStatus = rocsparse_status_success; - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(myStatus); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(myStatus); try { myStatus = rocsparse_status_internal_error; - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(myStatus); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(myStatus); } catch (std::runtime_error& e) { caught_exception = true; } diff --git a/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_spiluk.hpp b/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_spiluk.hpp index 1fde1ac5ed1a..89511dcf3756 100644 --- a/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_spiluk.hpp +++ b/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_spiluk.hpp @@ -134,12 +134,12 @@ struct SpilukTest { const scalar_t MONE = scalar_t(-1); // Create a reference view e set to all 1's - ValuesType e_one("e_one", nrows * block_size); + ValuesType e_one("e_one", static_cast(nrows) * block_size); Kokkos::deep_copy(e_one, ONE); // Create two views for spmv results - ValuesType bb("bb", nrows * block_size); - ValuesType bb_tmp("bb_tmp", nrows * block_size); + ValuesType bb("bb", static_cast(nrows) * block_size); + ValuesType bb_tmp("bb_tmp", static_cast(nrows) * block_size); // Compute norm2(L*U*e_one - A*e_one)/norm2(A*e_one) KokkosSparse::spmv("N", ONE, A, e_one, ZERO, bb); @@ -244,8 +244,8 @@ struct SpilukTest { Kokkos::resize(L_entries, spiluk_handle->get_nnzL()); Kokkos::resize(U_entries, spiluk_handle->get_nnzU()); - ValuesType L_values("L_values", spiluk_handle->get_nnzL() * block_items); - ValuesType U_values("U_values", spiluk_handle->get_nnzU() * block_items); + ValuesType L_values("L_values", spiluk_handle->get_nnzL() * static_cast(block_items)); + ValuesType U_values("U_values", spiluk_handle->get_nnzU() * static_cast(block_items)); spiluk_numeric(&kh, fill_lev, row_map, entries, values, L_row_map, L_entries, L_values, U_row_map, U_entries, U_values); @@ -579,7 +579,7 @@ struct SpilukTest { RowMapType_hostmirror hrow_map("hrow_map", bnrows + 1); EntriesType_hostmirror hentries("hentries", bnnz); - ValuesType_hostmirror hvalues("hvalues", bnnz * block_items); + ValuesType_hostmirror hvalues("hvalues", static_cast(bnnz) * block_items); Kokkos::deep_copy(hrow_map, brow_map); Kokkos::deep_copy(hentries, bentries); @@ -591,7 +591,7 @@ struct SpilukTest { // Allocate A as input A_row_map_v[i] = RowMapType("A_row_map", bnrows + 1); A_entries_v[i] = EntriesType("A_entries", bnnz); - A_values_v[i] = ValuesType("A_values", bnnz * block_items); + A_values_v[i] = ValuesType("A_values", static_cast(bnnz) * block_items); // Copy from host to device Kokkos::deep_copy(A_row_map_v[i], hrow_map); @@ -619,8 +619,8 @@ struct SpilukTest { Kokkos::resize(L_entries_v[i], spiluk_handle->get_nnzL()); Kokkos::resize(U_entries_v[i], spiluk_handle->get_nnzU()); - L_values_v[i] = ValuesType("L_values", spiluk_handle->get_nnzL() * block_items); - U_values_v[i] = ValuesType("U_values", spiluk_handle->get_nnzU() * block_items); + L_values_v[i] = ValuesType("L_values", spiluk_handle->get_nnzL() * static_cast(block_items)); + U_values_v[i] = ValuesType("U_values", spiluk_handle->get_nnzU() * static_cast(block_items)); } // Done handle creation and spiluk_symbolic on all streams // Numeric phase diff --git a/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_spmv.hpp b/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_spmv.hpp index 43a9bd11e986..200a982d612d 100644 --- a/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_spmv.hpp +++ b/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_spmv.hpp @@ -864,9 +864,9 @@ void test_github_issue_101() { // vectors. Include a little extra in case the implementers decide // to strip-mine that. constexpr int numVecs = 22; - Kokkos::View X("X", numCols, numVecs); + Kokkos::View X("X", numCols, numVecs); Kokkos::deep_copy(X, static_cast(1.0)); - Kokkos::View Y("Y", numRows, numVecs); + Kokkos::View Y("Y", numRows, numVecs); auto Y_h = Kokkos::create_mirror_view(Y); // we'll want this later // Start with the easy test case, where the matrix and the vectors diff --git a/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_spmv_bsr.hpp b/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_spmv_bsr.hpp index dbc1fb7e027c..53f1de5e565f 100644 --- a/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_spmv_bsr.hpp +++ b/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_spmv_bsr.hpp @@ -316,8 +316,8 @@ auto random_vecs_for_spmv(const char *mode, const Bsr &a, const bool nans = fals using execution_space = typename Bsr::execution_space; using policy_type = Kokkos::RangePolicy; - size_t nx = a.numCols() * a.blockDim(); - size_t ny = a.numRows() * a.blockDim(); + size_t nx = static_cast(a.numCols()) * a.blockDim(); + size_t ny = static_cast(a.numRows()) * a.blockDim(); if (mode_is_transpose(mode)) { std::swap(nx, ny); } @@ -366,7 +366,7 @@ void test_spmv_combos(const char *mode, const Bsr &a, const Crs &acrs, size_t ma // Tensor core algorithm temporarily disabled, fails on V100 /* - if constexpr (KokkosKernels::Impl::kk_is_gpu_exec_space()) { + if constexpr (KokkosKernels::Impl::is_gpu_exec_space_v) { #if defined(KOKKOS_ENABLE_CUDA) if constexpr (std::is_same_v) { #if defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_VOLTA) @@ -557,8 +557,8 @@ auto random_multivecs_for_spm_mv(const char *mode, const Bsr &a, const size_t nu using execution_space = typename Bsr::execution_space; using policy_type = Kokkos::RangePolicy; - size_t nx = a.numCols() * a.blockDim(); - size_t ny = a.numRows() * a.blockDim(); + size_t nx = static_cast(a.numCols()) * a.blockDim(); + size_t ny = static_cast(a.numRows()) * a.blockDim(); if (mode_is_transpose(mode)) { std::swap(nx, ny); } @@ -607,7 +607,7 @@ void test_spm_mv_combos(const char *mode, const Bsr &a, const Crs &acrs, size_t // Tensor core algorithm temporarily disabled, fails on V100 /* - if constexpr (KokkosKernels::Impl::kk_is_gpu_exec_space()) { + if constexpr (KokkosKernels::Impl::is_gpu_exec_space_v) { #if defined(KOKKOS_ENABLE_CUDA) if constexpr (std::is_same_v) { #if defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_VOLTA) diff --git a/packages/kokkos-kernels/test_common/KokkosKernels_TestMatrixUtils.hpp b/packages/kokkos-kernels/test_common/KokkosKernels_TestMatrixUtils.hpp new file mode 100644 index 000000000000..998ccdba908c --- /dev/null +++ b/packages/kokkos-kernels/test_common/KokkosKernels_TestMatrixUtils.hpp @@ -0,0 +1,89 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSKERNELS_TESTMATRIXUTILS_HPP +#define KOKKOSKERNELS_TESTMATRIXUTILS_HPP + +#include + +#include "KokkosKernels_Utils.hpp" +#include "KokkosKernels_IOUtils.hpp" +#include "Kokkos_ArithTraits.hpp" +#include "KokkosBatched_Vector.hpp" +// Make this include-able from all subdirectories + +namespace Test { + +template +crsMat_t symmetrize(crsMat_t A) { + typedef typename crsMat_t::StaticCrsGraphType graph_t; + typedef typename crsMat_t::values_type::non_const_type scalar_view_t; + typedef typename graph_t::row_map_type::non_const_type lno_view_t; + typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t; + auto host_rowmap = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.graph.row_map); + auto host_entries = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.graph.entries); + auto host_values = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.values); + lno_t numRows = A.numRows(); + // symmetrize as input_mat + input_mat^T, to still have a diagonally dominant + // matrix + typedef std::map Row; + std::vector symRows(numRows); + for (lno_t r = 0; r < numRows; r++) { + auto& row = symRows[r]; + for (size_type i = host_rowmap(r); i < host_rowmap(r + 1); i++) { + lno_t c = host_entries(i); + auto& col = symRows[c]; + auto it = row.find(c); + if (it == row.end()) + row[c] = host_values(i); + else + row[c] += host_values(i); + it = col.find(r); + if (it == col.end()) + col[r] = host_values(i); + else + col[r] += host_values(i); + } + } + // Count entries + Kokkos::View new_host_rowmap("Rowmap", numRows + 1); + size_t accum = 0; + for (lno_t r = 0; r <= numRows; r++) { + new_host_rowmap(r) = accum; + if (r < numRows) accum += symRows[r].size(); + } + // Allocate new entries/values + Kokkos::View new_host_entries("Entries", accum); + Kokkos::View new_host_values("Values", accum); + for (lno_t r = 0; r < numRows; r++) { + auto rowIt = symRows[r].begin(); + for (size_type i = new_host_rowmap(r); i < new_host_rowmap(r + 1); i++) { + new_host_entries(i) = rowIt->first; + new_host_values(i) = rowIt->second; + rowIt++; + } + } + lno_view_t new_rowmap("Rowmap", numRows + 1); + lno_nnz_view_t new_entries("Entries", accum); + scalar_view_t new_values("Values", accum); + Kokkos::deep_copy(new_rowmap, new_host_rowmap); + Kokkos::deep_copy(new_entries, new_host_entries); + Kokkos::deep_copy(new_values, new_host_values); + return crsMat_t("SymA", numRows, numRows, accum, new_values, new_rowmap, new_entries); +} + +} // namespace Test +#endif diff --git a/packages/kokkos-kernels/test_common/KokkosKernels_TestStringUtils.hpp b/packages/kokkos-kernels/test_common/KokkosKernels_TestStringUtils.hpp new file mode 100644 index 000000000000..fec1b09361e7 --- /dev/null +++ b/packages/kokkos-kernels/test_common/KokkosKernels_TestStringUtils.hpp @@ -0,0 +1,38 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSKERNELS_TESTSTRINGUTILS_HPP +#define KOKKOSKERNELS_TESTSTRINGUTILS_HPP + +#include +#include + +namespace Test { + +inline int string_compare_no_case(const char* str1, const char* str2) { + std::string str1_s(str1); + std::string str2_s(str2); + for (size_t i = 0; i < str1_s.size(); i++) str1_s[i] = std::tolower(str1_s[i]); + for (size_t i = 0; i < str2_s.size(); i++) str2_s[i] = std::tolower(str2_s[i]); + return std::strcmp(str1_s.c_str(), str2_s.c_str()); +} + +inline int string_compare_no_case(const std::string& str1, const std::string& str2) { + return string_compare_no_case(str1.c_str(), str2.c_str()); +} + +} // namespace Test +#endif diff --git a/packages/kokkos-kernels/test_common/KokkosKernels_TestUtils.hpp b/packages/kokkos-kernels/test_common/KokkosKernels_TestUtils.hpp index 24c07925e547..1d33bc8b75ec 100644 --- a/packages/kokkos-kernels/test_common/KokkosKernels_TestUtils.hpp +++ b/packages/kokkos-kernels/test_common/KokkosKernels_TestUtils.hpp @@ -212,153 +212,6 @@ using halfScalarType = Kokkos::Experimental::half_t; using bhalfScalarType = Kokkos::Experimental::bhalf_t; #endif // KOKKOS_BHALF_T_IS_FLOAT -template -struct SharedVanillaGEMM { - bool A_t, B_t, A_c, B_c; - int C_rows, C_cols, A_cols; - ViewTypeA A; - ViewTypeB B; - ViewTypeC C; - - typedef typename ViewTypeA::value_type ScalarA; - typedef typename ViewTypeB::value_type ScalarB; - typedef typename ViewTypeC::value_type ScalarC; - typedef Kokkos::View SubviewTypeA; - typedef Kokkos::View SubviewTypeB; - typedef Kokkos::ArithTraits APT; - typedef typename APT::mag_type mag_type; - ScalarA alpha; - ScalarC beta; - - KOKKOS_INLINE_FUNCTION - void operator()(const typename Kokkos::TeamPolicy::member_type& team) const { - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, C_rows), [&](const int& i) { - // Give each kokkos thread a vector of A - SubviewTypeA a_vec; - if (A_t) - a_vec = Kokkos::subview(A, Kokkos::ALL(), i); - else - a_vec = Kokkos::subview(A, i, Kokkos::ALL()); - - // Have all vector lanes perform the dot product - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, C_cols), [&](const int& j) { - SubviewTypeB b_vec; - if (B_t) - b_vec = Kokkos::subview(B, j, Kokkos::ALL()); - else - b_vec = Kokkos::subview(B, Kokkos::ALL(), j); - ScalarC ab = ScalarC(0); - for (int k = 0; k < A_cols; k++) { - auto a = A_c ? APT::conj(a_vec(k)) : a_vec(k); - auto b = B_c ? APT::conj(b_vec(k)) : b_vec(k); - ab += a * b; - } - C(i, j) = beta * C(i, j) + alpha * ab; - }); - }); - } -}; -// C(i,:,:) = alpha * (A(i,:,:) * B(i,:,:)) + beta * C(i,:,:) -template -struct Functor_BatchedVanillaGEMM { - bool A_t, B_t, A_c, B_c, batch_size_last_dim = false; - ViewTypeA A; - ViewTypeB B; - ViewTypeC C; - - using ScalarA = typename ViewTypeA::value_type; - using ScalarB = typename ViewTypeB::value_type; - using ScalarC = typename ViewTypeC::value_type; - using SubviewTypeA = typename Kokkos::View; - using SubviewTypeB = typename Kokkos::View; - using SubviewTypeC = typename Kokkos::View; - - ScalarA alpha; - ScalarC beta; - - KOKKOS_INLINE_FUNCTION - void operator()(const typename Kokkos::TeamPolicy::member_type& team) const { - int i = team.league_rank(); - SubviewTypeA _A; - SubviewTypeB _B; - SubviewTypeC _C; - - if (batch_size_last_dim) { - _A = Kokkos::subview(A, Kokkos::ALL(), Kokkos::ALL(), i); - _B = Kokkos::subview(B, Kokkos::ALL(), Kokkos::ALL(), i); - _C = Kokkos::subview(C, Kokkos::ALL(), Kokkos::ALL(), i); - } else { - _A = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); - _B = Kokkos::subview(B, i, Kokkos::ALL(), Kokkos::ALL()); - _C = Kokkos::subview(C, i, Kokkos::ALL(), Kokkos::ALL()); - } - struct SharedVanillaGEMM vgemm; - vgemm.A_t = A_t; - vgemm.B_t = B_t; - vgemm.A_c = A_c; - vgemm.B_c = B_c; - vgemm.C_rows = batch_size_last_dim ? C.extent(0) : C.extent(1); - vgemm.C_cols = batch_size_last_dim ? C.extent(1) : C.extent(2); - vgemm.A_cols = batch_size_last_dim ? (A_t ? A.extent(0) : A.extent(1)) : (A_t ? A.extent(1) : A.extent(2)); - vgemm.A = _A; - vgemm.B = _B; - vgemm.C = _C; - vgemm.alpha = alpha; - vgemm.beta = beta; - vgemm(team); - } - - inline void run() { - Kokkos::parallel_for( - "Test::VanillaGEMM", - Kokkos::TeamPolicy(batch_size_last_dim ? C.extent(2) : C.extent(0), Kokkos::AUTO, - KokkosKernels::Impl::kk_get_max_vector_size()), - *this); - } -}; - -// Compute C := alpha * AB + beta * C -template -void vanillaGEMM(typename ViewTypeC::non_const_value_type alpha, const ViewTypeA& A, const ViewTypeB& B, - typename ViewTypeC::non_const_value_type beta, const ViewTypeC& C) { - using value_type = typename ViewTypeC::non_const_value_type; - using KAT = Kokkos::ArithTraits; - int m = A.extent(0); - int k = A.extent(1); - int n = B.extent(1); - for (int i = 0; i < m; i++) { - for (int j = 0; j < n; j++) { - value_type sum = KAT::zero(); - for (int ii = 0; ii < k; ii++) { - sum += A(i, ii) * B(ii, j); - } - C(i, j) = alpha * sum + beta * C(i, j); - } - } -} - -template -KOKKOS_INLINE_FUNCTION void vanillaGEMV(char mode, AlphaType alpha, const ViewTypeA& A, const ViewTypeX& x, - BetaType beta, const ViewTypeY& y) { - using ScalarY = typename ViewTypeY::non_const_value_type; - using KAT_A = Kokkos::ArithTraits; - const bool transposed = mode == 'T' || mode == 'C'; - const bool conjugated = mode == 'C'; - const bool has_beta = beta != Kokkos::ArithTraits::zero(); - int M = A.extent(transposed ? 1 : 0); - int N = A.extent(transposed ? 0 : 1); - for (int i = 0; i < M; i++) { - ScalarY y_i{}; - if (has_beta) y_i = beta * y(i); - for (int j = 0; j < N; j++) { - const auto a = transposed ? A(j, i) : A(i, j); - const auto Aij = conjugated ? KAT_A::conj(a) : a; - y_i += alpha * Aij * x(j); - } - y(i) = y_i; - } -} - template class epsilon { public: @@ -367,64 +220,6 @@ class epsilon { using KokkosKernels::Impl::getRandomBounds; -template -crsMat_t symmetrize(crsMat_t A) { - typedef typename crsMat_t::StaticCrsGraphType graph_t; - typedef typename crsMat_t::values_type::non_const_type scalar_view_t; - typedef typename graph_t::row_map_type::non_const_type lno_view_t; - typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t; - auto host_rowmap = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.graph.row_map); - auto host_entries = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.graph.entries); - auto host_values = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.values); - lno_t numRows = A.numRows(); - // symmetrize as input_mat + input_mat^T, to still have a diagonally dominant - // matrix - typedef std::map Row; - std::vector symRows(numRows); - for (lno_t r = 0; r < numRows; r++) { - auto& row = symRows[r]; - for (size_type i = host_rowmap(r); i < host_rowmap(r + 1); i++) { - lno_t c = host_entries(i); - auto& col = symRows[c]; - auto it = row.find(c); - if (it == row.end()) - row[c] = host_values(i); - else - row[c] += host_values(i); - it = col.find(r); - if (it == col.end()) - col[r] = host_values(i); - else - col[r] += host_values(i); - } - } - // Count entries - Kokkos::View new_host_rowmap("Rowmap", numRows + 1); - size_t accum = 0; - for (lno_t r = 0; r <= numRows; r++) { - new_host_rowmap(r) = accum; - if (r < numRows) accum += symRows[r].size(); - } - // Allocate new entries/values - Kokkos::View new_host_entries("Entries", accum); - Kokkos::View new_host_values("Values", accum); - for (lno_t r = 0; r < numRows; r++) { - auto rowIt = symRows[r].begin(); - for (size_type i = new_host_rowmap(r); i < new_host_rowmap(r + 1); i++) { - new_host_entries(i) = rowIt->first; - new_host_values(i) = rowIt->second; - rowIt++; - } - } - lno_view_t new_rowmap("Rowmap", numRows + 1); - lno_nnz_view_t new_entries("Entries", accum); - scalar_view_t new_values("Values", accum); - Kokkos::deep_copy(new_rowmap, new_host_rowmap); - Kokkos::deep_copy(new_entries, new_host_entries); - Kokkos::deep_copy(new_values, new_host_values); - return crsMat_t("SymA", numRows, numRows, accum, new_values, new_rowmap, new_entries); -} - // create_random_x_vector and create_random_y_vector can be used together to // generate a random linear system Ax = y. template @@ -485,17 +280,6 @@ std::string value_type_name>() { return "::ComplexDouble"; } -int string_compare_no_case(const char* str1, const char* str2) { - std::string str1_s(str1); - std::string str2_s(str2); - for (size_t i = 0; i < str1_s.size(); i++) str1_s[i] = std::tolower(str1_s[i]); - for (size_t i = 0; i < str2_s.size(); i++) str2_s[i] = std::tolower(str2_s[i]); - return strcmp(str1_s.c_str(), str2_s.c_str()); -} - -int string_compare_no_case(const std::string& str1, const std::string& str2) { - return string_compare_no_case(str1.c_str(), str2.c_str()); -} /// /brief Coo matrix class for testing purposes. /// \tparam ScalarType /// \tparam LayoutType @@ -561,7 +345,7 @@ class RandCooMat { /// \tparam LayoutType /// \tparam Device template + typename Size = KokkosKernels::default_size_type> class RandCsMatrix { public: using value_type = ScalarType; diff --git a/packages/kokkos-kernels/test_common/KokkosKernels_TestVanilla.hpp b/packages/kokkos-kernels/test_common/KokkosKernels_TestVanilla.hpp new file mode 100644 index 000000000000..43ab7a3cf0c5 --- /dev/null +++ b/packages/kokkos-kernels/test_common/KokkosKernels_TestVanilla.hpp @@ -0,0 +1,177 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSKERNELS_TESTVANILLA_HPP +#define KOKKOSKERNELS_TESTVANILLA_HPP + +#include + +#include "KokkosKernels_Utils.hpp" +#include "KokkosKernels_IOUtils.hpp" +#include "Kokkos_ArithTraits.hpp" +#include "KokkosBatched_Vector.hpp" + +namespace Test { + +template +struct SharedVanillaGEMM { + bool A_t, B_t, A_c, B_c; + int C_rows, C_cols, A_cols; + ViewTypeA A; + ViewTypeB B; + ViewTypeC C; + + typedef typename ViewTypeA::value_type ScalarA; + typedef typename ViewTypeB::value_type ScalarB; + typedef typename ViewTypeC::value_type ScalarC; + typedef Kokkos::View SubviewTypeA; + typedef Kokkos::View SubviewTypeB; + typedef Kokkos::ArithTraits APT; + typedef typename APT::mag_type mag_type; + ScalarA alpha; + ScalarC beta; + + KOKKOS_INLINE_FUNCTION + void operator()(const typename Kokkos::TeamPolicy::member_type& team) const { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, C_rows), [&](const int& i) { + // Give each kokkos thread a vector of A + SubviewTypeA a_vec; + if (A_t) + a_vec = Kokkos::subview(A, Kokkos::ALL(), i); + else + a_vec = Kokkos::subview(A, i, Kokkos::ALL()); + + // Have all vector lanes perform the dot product + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, C_cols), [&](const int& j) { + SubviewTypeB b_vec; + if (B_t) + b_vec = Kokkos::subview(B, j, Kokkos::ALL()); + else + b_vec = Kokkos::subview(B, Kokkos::ALL(), j); + ScalarC ab = ScalarC(0); + for (int k = 0; k < A_cols; k++) { + auto a = A_c ? APT::conj(a_vec(k)) : a_vec(k); + auto b = B_c ? APT::conj(b_vec(k)) : b_vec(k); + ab += a * b; + } + C(i, j) = beta * C(i, j) + alpha * ab; + }); + }); + } +}; +// C(i,:,:) = alpha * (A(i,:,:) * B(i,:,:)) + beta * C(i,:,:) +template +struct Functor_BatchedVanillaGEMM { + bool A_t, B_t, A_c, B_c, batch_size_last_dim = false; + ViewTypeA A; + ViewTypeB B; + ViewTypeC C; + + using ScalarA = typename ViewTypeA::value_type; + using ScalarB = typename ViewTypeB::value_type; + using ScalarC = typename ViewTypeC::value_type; + using SubviewTypeA = typename Kokkos::View; + using SubviewTypeB = typename Kokkos::View; + using SubviewTypeC = typename Kokkos::View; + + ScalarA alpha; + ScalarC beta; + + KOKKOS_INLINE_FUNCTION + void operator()(const typename Kokkos::TeamPolicy::member_type& team) const { + int i = team.league_rank(); + SubviewTypeA _A; + SubviewTypeB _B; + SubviewTypeC _C; + + if (batch_size_last_dim) { + _A = Kokkos::subview(A, Kokkos::ALL(), Kokkos::ALL(), i); + _B = Kokkos::subview(B, Kokkos::ALL(), Kokkos::ALL(), i); + _C = Kokkos::subview(C, Kokkos::ALL(), Kokkos::ALL(), i); + } else { + _A = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); + _B = Kokkos::subview(B, i, Kokkos::ALL(), Kokkos::ALL()); + _C = Kokkos::subview(C, i, Kokkos::ALL(), Kokkos::ALL()); + } + struct SharedVanillaGEMM vgemm; + vgemm.A_t = A_t; + vgemm.B_t = B_t; + vgemm.A_c = A_c; + vgemm.B_c = B_c; + vgemm.C_rows = batch_size_last_dim ? C.extent(0) : C.extent(1); + vgemm.C_cols = batch_size_last_dim ? C.extent(1) : C.extent(2); + vgemm.A_cols = batch_size_last_dim ? (A_t ? A.extent(0) : A.extent(1)) : (A_t ? A.extent(1) : A.extent(2)); + vgemm.A = _A; + vgemm.B = _B; + vgemm.C = _C; + vgemm.alpha = alpha; + vgemm.beta = beta; + vgemm(team); + } + + inline void run() { + Kokkos::parallel_for( + "Test::VanillaGEMM", + Kokkos::TeamPolicy(batch_size_last_dim ? C.extent(2) : C.extent(0), Kokkos::AUTO, + KokkosKernels::Impl::kk_get_max_vector_size()), + *this); + } +}; + +// Compute C := alpha * AB + beta * C +template +void vanillaGEMM(typename ViewTypeC::non_const_value_type alpha, const ViewTypeA& A, const ViewTypeB& B, + typename ViewTypeC::non_const_value_type beta, const ViewTypeC& C) { + using value_type = typename ViewTypeC::non_const_value_type; + using KAT = Kokkos::ArithTraits; + int m = A.extent(0); + int k = A.extent(1); + int n = B.extent(1); + for (int i = 0; i < m; i++) { + for (int j = 0; j < n; j++) { + value_type sum = KAT::zero(); + for (int ii = 0; ii < k; ii++) { + sum += A(i, ii) * B(ii, j); + } + C(i, j) = alpha * sum + beta * C(i, j); + } + } +} + +template +KOKKOS_INLINE_FUNCTION void vanillaGEMV(char mode, AlphaType alpha, const ViewTypeA& A, const ViewTypeX& x, + BetaType beta, const ViewTypeY& y) { + using ScalarY = typename ViewTypeY::non_const_value_type; + using KAT_A = Kokkos::ArithTraits; + const bool transposed = mode == 'T' || mode == 'C'; + const bool conjugated = mode == 'C'; + const bool has_beta = beta != Kokkos::ArithTraits::zero(); + int M = A.extent(transposed ? 1 : 0); + int N = A.extent(transposed ? 0 : 1); + for (int i = 0; i < M; i++) { + ScalarY y_i{}; + if (has_beta) y_i = beta * y(i); + for (int j = 0; j < N; j++) { + const auto a = transposed ? A(j, i) : A(i, j); + const auto Aij = conjugated ? KAT_A::conj(a) : a; + y_i += alpha * Aij * x(j); + } + y(i) = y_i; + } +} + +} // namespace Test +#endif diff --git a/packages/kokkos-kernels/test_common/Kokkos_Performance.hpp b/packages/kokkos-kernels/test_common/Kokkos_Performance.hpp index 648f7c53563d..10e366bc3cb5 100644 --- a/packages/kokkos-kernels/test_common/Kokkos_Performance.hpp +++ b/packages/kokkos-kernels/test_common/Kokkos_Performance.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef KOKKOS_PERFORMANCE_HPP -#define KOKKOS_PERFORMANCE_HPP +#ifndef KOKKOSKERNELS_KOKKOS_PERFORMANCE_HPP +#define KOKKOSKERNELS_KOKKOS_PERFORMANCE_HPP #include "Kokkos_Core.hpp" #include @@ -555,4 +555,4 @@ void Performance::Tolerance::from_string(const std::string& valtol_str) { } // namespace KokkosKernels -#endif // KOKKOS_PERFORMANCE_HPP +#endif // KOKKOSKERNELS_KOKKOS_PERFORMANCE_HPP diff --git a/packages/kokkos/appveyor.yml b/packages/kokkos/.appveyor.yml similarity index 82% rename from packages/kokkos/appveyor.yml rename to packages/kokkos/.appveyor.yml index d0a5645ef7b6..23cac222ca38 100644 --- a/packages/kokkos/appveyor.yml +++ b/packages/kokkos/.appveyor.yml @@ -5,6 +5,6 @@ build_script: - cmd: >- mkdir build && cd build && - cmake c:\projects\source -DKokkos_ENABLE_IMPL_MDSPAN=OFF -DKokkos_ENABLE_TESTS=ON -DCMAKE_CXX_FLAGS="/W0 /EHsc" -DKokkos_ENABLE_DEPRECATED_CODE_4=ON -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF && + cmake c:\projects\source -DKokkos_ENABLE_IMPL_MDSPAN=OFF -DKokkos_ENABLE_TESTS=ON -DCMAKE_CXX_FLAGS="/W0 /EHsc" -DKokkos_ENABLE_DEPRECATED_CODE_4=ON && cmake --build . --target install && ctest -C Debug --output-on-failure diff --git a/packages/kokkos/.clang-format b/packages/kokkos/.clang-format index db5f94fa2ebb..090edc2c51f1 100644 --- a/packages/kokkos/.clang-format +++ b/packages/kokkos/.clang-format @@ -1,4 +1,4 @@ -#Official Tool: clang-format version 8.0.0 +#Official Tool: clang-format version 16.0.0 BasedOnStyle: google SortIncludes: false AlignConsecutiveAssignments: true diff --git a/packages/kokkos/.clang-format-ignore b/packages/kokkos/.clang-format-ignore deleted file mode 100644 index 43d242c3106a..000000000000 --- a/packages/kokkos/.clang-format-ignore +++ /dev/null @@ -1,3 +0,0 @@ -core/unit_test/config/results/* -tpls/gtest/gtest/* -core/src/desul/* diff --git a/packages/kokkos/.clang-tidy b/packages/kokkos/.clang-tidy index 2b0d6e51d438..f1aba1f52e5f 100644 --- a/packages/kokkos/.clang-tidy +++ b/packages/kokkos/.clang-tidy @@ -1,3 +1,3 @@ -Checks: '-*,kokkos-*,modernize-use-using,modernize-use-nullptr,cppcoreguidelines-pro-type-cstyle-cast' +Checks: '-*,kokkos-*,modernize-type-traits,modernize-use-using,modernize-use-nullptr,cppcoreguidelines-pro-type-cstyle-cast' FormatStyle: file -HeaderFilterRegex: '.*/*.hpp' +HeaderFilterRegex: '(algorithms|benchmarks|containers|core|example|simd).*\.hpp' diff --git a/packages/kokkos/.cmake-format.py b/packages/kokkos/.cmake-format.py new file mode 100644 index 000000000000..6a66b6a14ec8 --- /dev/null +++ b/packages/kokkos/.cmake-format.py @@ -0,0 +1,28 @@ +# ----------------------------- +# Options affecting formatting. +# ----------------------------- +with section("format"): + + # How wide to allow formatted cmake files + line_width = 120 + + # If an argument group contains more than this many sub-groups (parg or kwarg + # groups) then force it to a vertical layout. + max_subgroups_hwrap = 3 + + # If a statement is wrapped to more than one line, than dangle the closing + # parenthesis on its own line. + dangle_parens = True + + # If the trailing parenthesis must be 'dangled' on its on line, then align it + # to this reference: `prefix`: the start of the statement, `prefix-indent`: + # the start of the statement, plus one indentation level, `child`: align to + # the column of the arguments + dangle_align = 'prefix' + +# ------------------------------------------------ +# Options affecting comment reflow and formatting. +# ------------------------------------------------ +with section("markup"): + # enable comment markup parsing and reflow + enable_markup = False diff --git a/packages/kokkos/.codecov.yml b/packages/kokkos/.codecov.yml deleted file mode 100644 index 097b0264a272..000000000000 --- a/packages/kokkos/.codecov.yml +++ /dev/null @@ -1,11 +0,0 @@ -coverage: - precision: 1 - round: down - range: "70...100" -ignore: - - tpls/ - - algorithms/unit_tests - - core/perf_test/ - - core/unit_test/ - - containers/performance_tests - - containers/unit_tests diff --git a/packages/kokkos/.jenkins b/packages/kokkos/.jenkins index 1635a69f298f..5790ccb93ce1 100644 --- a/packages/kokkos/.jenkins +++ b/packages/kokkos/.jenkins @@ -30,7 +30,157 @@ pipeline { sh './scripts/docker/check_format_cpp.sh' } } - stage('Build') { + stage('Build-1') { + parallel { + stage('GCC-8.4.0') { + agent { + dockerfile { + filename 'Dockerfile.gcc' + dir 'scripts/docker' + label 'docker' + } + } + environment { + OMP_NUM_THREADS = 8 + OMP_NESTED = 'true' + OMP_MAX_ACTIVE_LEVELS = 3 + OMP_PROC_BIND = 'true' + } + steps { + sh '''rm -rf build && mkdir -p build && cd build && \ + cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_CXX_STANDARD=17 \ + -DCMAKE_CXX_FLAGS=-Werror \ + -DKokkos_ARCH_NATIVE=ON \ + -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ + -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ + -DKokkos_ENABLE_TESTS=ON \ + -DKokkos_ENABLE_BENCHMARKS=ON \ + -DKokkos_ENABLE_OPENMP=ON \ + -DKokkos_ENABLE_LIBDL=OFF \ + -DKokkos_ENABLE_LIBQUADMATH=ON \ + -DKokkos_ENABLE_SERIAL=ON \ + .. && \ + make -j8 && ctest --no-compress-output -T Test --verbose && gcc -I$PWD/../core/src/ ../core/unit_test/tools/TestCInterface.c''' + } + post { + always { + xunit([CTest(deleteOutputFiles: true, failIfNotNew: true, pattern: 'build/Testing/**/Test.xml', skipNoTestFiles: false, stopProcessingIfError: true)]) + } + } + } + stage('HIP-ROCm-5.6-C++20') { + agent { + dockerfile { + filename 'Dockerfile.hipcc' + dir 'scripts/docker' + additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-22.04:5.6-complete@sha256:578a310fb1037d9c5e23fded2564f239acf6dc7231ff4742d2e7279fe7cc5c4a' + label 'rocm-docker' + args '-v /tmp/ccache.kokkos:/tmp/ccache --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --env HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES' + } + } + steps { + sh 'ccache --zero-stats' + sh '''rm -rf build && mkdir -p build && cd build && \ + cmake \ + -DBUILD_SHARED_LIBS=ON \ + -DCMAKE_BUILD_TYPE=RelWithDebInfo \ + -DCMAKE_CXX_COMPILER=hipcc \ + -DCMAKE_CXX_FLAGS="-Werror -Wno-unused-command-line-argument" \ + -DCMAKE_CXX_STANDARD=20 \ + -DKokkos_ARCH_NATIVE=ON \ + -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ + -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ + -DKokkos_ENABLE_TESTS=ON \ + -DKokkos_ENABLE_BENCHMARKS=ON \ + -DKokkos_ENABLE_HIP=ON \ + .. && \ + make -j8 && ctest --no-compress-output -T Test --verbose''' + } + post { + always { + sh 'ccache --show-stats' + xunit([CTest(deleteOutputFiles: true, failIfNotNew: true, pattern: 'build/Testing/**/Test.xml', skipNoTestFiles: false, stopProcessingIfError: true)]) + } + } + } + stage('CUDA-11.0-NVCC-RDC') { + agent { + dockerfile { + filename 'Dockerfile.nvcc' + dir 'scripts/docker' + additionalBuildArgs '--build-arg BASE=nvcr.io/nvidia/cuda:11.0.3-devel-ubuntu20.04@sha256:10ab0f09fcdc796b4a2325ef1bce8f766f4a3500eab5a83780f80475ae26c7a6 --build-arg ADDITIONAL_PACKAGES="g++-8 gfortran clang" --build-arg CMAKE_VERSION=3.17.3' + label 'nvidia-docker && (volta || ampere)' + args '-v /tmp/ccache.kokkos:/tmp/ccache --env NVIDIA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES' + } + } + environment { + OMP_NUM_THREADS = 8 + // Nested OpenMP does not work for this configuration, + // so disabling it + OMP_MAX_ACTIVE_LEVELS = 1 + OMP_PLACES = 'threads' + OMP_PROC_BIND = 'spread' + NVCC_WRAPPER_DEFAULT_COMPILER = 'g++-8' + } + steps { + sh 'ccache --zero-stats' + sh '''rm -rf install && mkdir -p install && \ + rm -rf build && mkdir -p build && cd build && \ + cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_CXX_COMPILER=g++-8 \ + -DCMAKE_CXX_FLAGS=-Werror \ + -DCMAKE_CXX_STANDARD=17 \ + -DKokkos_ARCH_NATIVE=ON \ + -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ + -DKokkos_ENABLE_OPENMP=OFF \ + -DKokkos_ENABLE_CUDA=ON \ + -DKokkos_ENABLE_CUDA_LAMBDA=OFF \ + -DKokkos_ENABLE_CUDA_UVM=ON \ + -DKokkos_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE=ON \ + -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ + \ + -DCMAKE_INSTALL_PREFIX=${PWD}/../install \ + .. && \ + make -j8 install && \ + cd .. && \ + rm -rf build-tests && mkdir -p build-tests && cd build-tests && \ + export CMAKE_PREFIX_PATH=${PWD}/../install && \ + cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + -DCMAKE_CXX_COMPILER=$WORKSPACE/bin/nvcc_wrapper \ + -DCMAKE_CXX_FLAGS="-Werror --Werror=all-warnings -Xcudafe --diag_suppress=940" \ + -DCMAKE_EXE_LINKER_FLAGS="-Xnvlink -suppress-stack-size-warning" \ + -DCMAKE_CXX_STANDARD=17 \ + -DKokkos_INSTALL_TESTING=ON \ + .. && \ + make -j8 && ctest --no-compress-output -T Test --verbose && \ + cd ../example/build_cmake_installed && \ + rm -rf build && mkdir -p build && cd build && \ + cmake \ + -DCMAKE_CXX_COMPILER=g++-8 \ + -DCMAKE_CXX_FLAGS=-Werror \ + -DCMAKE_CXX_STANDARD=17 \ + .. && \ + make -j8 && ctest --verbose && \ + cd ../.. && \ + cmake -B build_cmake_installed_different_compiler/build -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_CXX_FLAGS=-Werror -DCMAKE_CXX_STANDARD=17 build_cmake_installed_different_compiler && \ + cmake --build build_cmake_installed_different_compiler/build --target all && \ + cmake --build build_cmake_installed_different_compiler/build --target test''' + } + post { + always { + sh 'ccache --show-stats' + xunit([CTest(deleteOutputFiles: true, failIfNotNew: true, pattern: 'build-tests/Testing/**/Test.xml', skipNoTestFiles: false, stopProcessingIfError: true)]) + } + } + } + } + } + stage('Build-2') { parallel { stage('OPENACC-NVHPC-CUDA-12.2') { agent { @@ -49,14 +199,21 @@ pipeline { /opt/cmake/bin/cmake \ -DCMAKE_CXX_COMPILER=nvc++ \ -DCMAKE_CXX_STANDARD=17 \ + -DCMAKE_CXX_FLAGS=-Werror \ -DKokkos_ARCH_NATIVE=ON \ -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ -DKokkos_ENABLE_TESTS=ON \ -DKokkos_ENABLE_OPENACC=ON \ -DKokkos_ARCH_VOLTA70=ON \ .. && \ - make -j8 && ctest --verbose''' + make -j8 && ctest --no-compress-output -T Test --verbose''' } + post { + always { + xunit([CTest(deleteOutputFiles: true, failIfNotNew: true, pattern: 'build/Testing/**/Test.xml', skipNoTestFiles: false, stopProcessingIfError: true)]) + } + } + } stage('CUDA-12.2-NVHPC-AS-HOST-COMPILER') { agent { @@ -82,17 +239,22 @@ pipeline { -DCMAKE_BUILD_TYPE=RelWithDebInfo \ -DCMAKE_CXX_COMPILER=nvc++ \ -DCMAKE_CXX_STANDARD=17 \ - -DCMAKE_CXX_FLAGS="--diag_suppress=implicit_return_from_non_void_function" \ + -DCMAKE_CXX_FLAGS="-Werror --diag_suppress=implicit_return_from_non_void_function" \ -DKokkos_ARCH_NATIVE=ON \ -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ -DKokkos_ENABLE_DEPRECATED_CODE_4=OFF \ -DKokkos_ENABLE_TESTS=ON \ -DKokkos_ENABLE_CUDA=ON \ - -DKokkos_ENABLE_CUDA_LAMBDA=ON \ -DKokkos_ENABLE_OPENMP=ON \ .. && \ - make -j8 && ctest --verbose''' + make -j8 && ctest --no-compress-output -T Test --verbose''' + } + post { + always { + xunit([CTest(deleteOutputFiles: true, failIfNotNew: true, pattern: 'build/Testing/**/Test.xml', skipNoTestFiles: false, stopProcessingIfError: true)]) + } } + } stage('SYCL-OneAPI') { agent { @@ -117,19 +279,20 @@ pipeline { -DKokkos_ARCH_AMPERE80=ON \ -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ -DKokkos_ENABLE_DEPRECATED_CODE_4=OFF \ - -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ -DKokkos_ENABLE_EXAMPLES=ON \ -DKokkos_ENABLE_TESTS=ON \ -DKokkos_ENABLE_BENCHMARKS=ON \ -DKokkos_ENABLE_SYCL=ON \ + -DKokkos_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE=ON \ -DKokkos_ENABLE_UNSUPPORTED_ARCHS=ON \ -DCMAKE_CXX_STANDARD=17 \ .. && \ - make -j8 && ctest --verbose''' + make -j8 && ctest --no-compress-output -T Test --verbose''' } post { always { sh 'ccache --show-stats' + xunit([CTest(deleteOutputFiles: true, failIfNotNew: true, pattern: 'build/Testing/**/Test.xml', skipNoTestFiles: false, stopProcessingIfError: true)]) } } } @@ -138,7 +301,7 @@ pipeline { dockerfile { filename 'Dockerfile.hipcc' dir 'scripts/docker' - additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-20.04:5.2-complete' + additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-20.04:5.2-complete@sha256:4030c8af0c06c286174758523dabe4b3850bf72d4a8c1ef275d3ec69aa475f65' label 'rocm-docker ' args '-v /tmp/ccache.kokkos:/tmp/ccache --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --env HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES' } @@ -168,46 +331,12 @@ pipeline { -DKokkos_ENABLE_IMPL_MDSPAN=OFF \ -DKokkos_ENABLE_HIP_MULTIPLE_KERNEL_INSTANTIATIONS=ON \ .. && \ - make -j8 && ctest --verbose''' - } - post { - always { - sh 'ccache --show-stats' - } - } - } - stage('HIP-ROCm-5.6-C++20') { - agent { - dockerfile { - filename 'Dockerfile.hipcc' - dir 'scripts/docker' - additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-22.04:5.6-complete' - label 'rocm-docker' - args '-v /tmp/ccache.kokkos:/tmp/ccache --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --env HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES' - } - } - steps { - sh 'ccache --zero-stats' - sh '''rm -rf build && mkdir -p build && cd build && \ - cmake \ - -DBUILD_SHARED_LIBS=ON \ - -DCMAKE_BUILD_TYPE=RelWithDebInfo \ - -DCMAKE_CXX_COMPILER=hipcc \ - -DCMAKE_CXX_FLAGS="-Werror -Wno-unused-command-line-argument" \ - -DCMAKE_CXX_STANDARD=20 \ - -DKokkos_ARCH_NATIVE=ON \ - -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ - -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ - -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ - -DKokkos_ENABLE_TESTS=ON \ - -DKokkos_ENABLE_BENCHMARKS=ON \ - -DKokkos_ENABLE_HIP=ON \ - .. && \ - make -j8 && ctest --verbose''' + make -j8 && ctest --no-compress-output -T Test --verbose''' } post { always { sh 'ccache --show-stats' + xunit([CTest(deleteOutputFiles: true, failIfNotNew: true, pattern: 'build/Testing/**/Test.xml', skipNoTestFiles: false, stopProcessingIfError: true)]) } } } @@ -248,7 +377,7 @@ pipeline { -DKokkos_ARCH_AMD_GFX906=ON \ && \ cmake --build build --parallel ${BUILD_JOBS} && \ - cd build && ctest --output-on-failure + cd build && ctest --no-compress-output -T Test --output-on-failure ''' } post { @@ -277,7 +406,6 @@ pipeline { -DKokkos_ARCH_NATIVE=ON \ -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ - -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ -DKokkos_ENABLE_TESTS=ON \ -DKokkos_ENABLE_BENCHMARKS=ON \ -DKokkos_ENABLE_TUNING=ON \ @@ -285,11 +413,12 @@ pipeline { -DKokkos_ARCH_VOLTA70=ON \ -DCMAKE_CXX_STANDARD=17 \ .. && \ - make -j8 && ctest --verbose''' + make -j8 && ctest --no-compress-output -T Test --verbose''' } post { always { sh 'ccache --show-stats' + xunit([CTest(deleteOutputFiles: true, failIfNotNew: true, pattern: 'build/Testing/**/Test.xml', skipNoTestFiles: false, stopProcessingIfError: true)]) } } } @@ -315,119 +444,18 @@ pipeline { -DKokkos_ARCH_NATIVE=ON \ -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ - -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ -DKokkos_ENABLE_TESTS=ON \ -DKokkos_ENABLE_BENCHMARKS=ON \ -DKokkos_ENABLE_CUDA=ON \ - -DKokkos_ENABLE_CUDA_LAMBDA=ON \ -DKokkos_ENABLE_TUNING=ON \ -DKokkos_ARCH_VOLTA70=ON \ .. && \ - make -j8 && ctest --verbose''' - } - post { - always { - sh 'ccache --show-stats' - } - } - } - stage('CUDA-11.7-NVCC') { - agent { - dockerfile { - filename 'Dockerfile.nvcc' - dir 'scripts/docker' - additionalBuildArgs '--build-arg BASE=nvidia/cuda:11.7.1-devel-ubuntu20.04' - label 'nvidia-docker && volta' - args '-v /tmp/ccache.kokkos:/tmp/ccache --env NVIDIA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES' - } - } - steps { - sh 'ccache --zero-stats' - sh '''rm -rf build && mkdir -p build && cd build && \ - ../gnu_generate_makefile.bash \ - --with-options=compiler_warnings \ - --cxxflags="-Werror -Werror all-warnings -Xcudafe --diag_suppress=20208" \ - --cxxstandard=c++17 \ - --with-cuda \ - --with-cuda-options=enable_lambda \ - --arch=Volta70 \ - && \ - make test -j8''' - } - post { - always { - sh 'ccache --show-stats' - } - } - } - stage('CUDA-11.0-NVCC-RDC') { - agent { - dockerfile { - filename 'Dockerfile.nvcc' - dir 'scripts/docker' - additionalBuildArgs '--build-arg BASE=nvidia/cuda:11.0.3-devel-ubuntu18.04 --build-arg ADDITIONAL_PACKAGES="g++-8 gfortran clang" --build-arg CMAKE_VERSION=3.17.3' - label 'nvidia-docker && (volta || ampere)' - args '-v /tmp/ccache.kokkos:/tmp/ccache --env NVIDIA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES' - } - } - environment { - OMP_NUM_THREADS = 8 - // Nested OpenMP does not work for this configuration, - // so disabling it - OMP_MAX_ACTIVE_LEVELS = 1 - OMP_PLACES = 'threads' - OMP_PROC_BIND = 'spread' - NVCC_WRAPPER_DEFAULT_COMPILER = 'g++-8' - } - steps { - sh 'ccache --zero-stats' - sh '''rm -rf install && mkdir -p install && \ - rm -rf build && mkdir -p build && cd build && \ - cmake \ - -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_CXX_COMPILER=g++-8 \ - -DCMAKE_CXX_FLAGS=-Werror \ - -DCMAKE_CXX_STANDARD=17 \ - -DKokkos_ARCH_NATIVE=ON \ - -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ - -DKokkos_ENABLE_OPENMP=OFF \ - -DKokkos_ENABLE_CUDA=ON \ - -DKokkos_ENABLE_CUDA_LAMBDA=OFF \ - -DKokkos_ENABLE_CUDA_UVM=ON \ - -DKokkos_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE=ON \ - -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ - -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ - -DCMAKE_INSTALL_PREFIX=${PWD}/../install \ - .. && \ - make -j8 install && \ - cd .. && \ - rm -rf build-tests && mkdir -p build-tests && cd build-tests && \ - export CMAKE_PREFIX_PATH=${PWD}/../install && \ - cmake \ - -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ - -DCMAKE_CXX_COMPILER=$WORKSPACE/bin/nvcc_wrapper \ - -DCMAKE_CXX_FLAGS=-Werror --Werror=all-warnings -Xcudafe --diag_suppress=3159 \ - -DCMAKE_CXX_STANDARD=17 \ - -DKokkos_INSTALL_TESTING=ON \ - .. && \ - make -j8 && ctest --verbose && \ - cd ../example/build_cmake_installed && \ - rm -rf build && mkdir -p build && cd build && \ - cmake \ - -DCMAKE_CXX_COMPILER=g++-8 \ - -DCMAKE_CXX_FLAGS=-Werror \ - -DCMAKE_CXX_STANDARD=17 \ - .. && \ - make -j8 && ctest --verbose && \ - cd ../.. && \ - cmake -B build_cmake_installed_different_compiler/build -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_CXX_FLAGS=-Werror -DCMAKE_CXX_STANDARD=17 build_cmake_installed_different_compiler && \ - cmake --build build_cmake_installed_different_compiler/build --target all && \ - cmake --build build_cmake_installed_different_compiler/build --target test''' + make -j8 && ctest --no-compress-output -T Test --verbose''' } post { always { sh 'ccache --show-stats' + xunit([CTest(deleteOutputFiles: true, failIfNotNew: true, pattern: 'build/Testing/**/Test.xml', skipNoTestFiles: false, stopProcessingIfError: true)]) } } } @@ -436,7 +464,7 @@ pipeline { dockerfile { filename 'Dockerfile.nvcc' dir 'scripts/docker' - additionalBuildArgs '--build-arg BASE=nvidia/cuda:11.6.2-devel-ubuntu20.04' + additionalBuildArgs '--build-arg BASE=nvcr.io/nvidia/cuda:11.6.2-devel-ubuntu20.04@sha256:d95d54bc231f8aea7fda79f60da620324584b20ed31a8ebdb0686cffd34dd405' label 'nvidia-docker && (volta || ampere)' args '-v /tmp/ccache.kokkos:/tmp/ccache --env NVIDIA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES' } @@ -449,7 +477,7 @@ pipeline { -DCMAKE_BUILD_TYPE=Debug \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ -DCMAKE_CXX_COMPILER=$WORKSPACE/bin/nvcc_wrapper \ - -DCMAKE_CXX_FLAGS="-Werror -Werror all-warnings -Xcudafe --diag_suppress=20208" \ + -DCMAKE_CXX_FLAGS="-Werror -Werror=all-warnings" \ -DCMAKE_CXX_STANDARD=17 \ -DKokkos_ARCH_NATIVE=ON \ -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ @@ -459,13 +487,12 @@ pipeline { -DKokkos_ENABLE_TESTS=ON \ -DKokkos_ENABLE_BENCHMARKS=ON \ -DKokkos_ENABLE_CUDA=ON \ - -DKokkos_ENABLE_CUDA_LAMBDA=ON \ -DKokkos_ENABLE_LIBDL=OFF \ -DKokkos_ENABLE_OPENMP=ON \ -DKokkos_ENABLE_IMPL_MDSPAN=OFF \ - -DKokkos_ENABLE_IMPL_CUDA_MALLOC_ASYNC=OFF \ + -DKokkos_ENABLE_IMPL_CUDA_MALLOC_ASYNC=ON \ .. && \ - make -j8 && ctest --verbose && \ + make -j8 && ctest --no-compress-output -T Test --verbose && \ cd ../example/build_cmake_in_tree && \ rm -rf build && mkdir -p build && cd build && \ cmake -DCMAKE_CXX_STANDARD=17 .. && make -j8 && ctest --verbose''' @@ -473,41 +500,37 @@ pipeline { post { always { sh 'ccache --show-stats' + xunit([CTest(deleteOutputFiles: true, failIfNotNew: true, pattern: 'build/Testing/**/Test.xml', skipNoTestFiles: false, stopProcessingIfError: true)]) } } } - stage('GCC-8.4.0') { + stage('CUDA-11.7-NVCC') { agent { dockerfile { - filename 'Dockerfile.gcc' + filename 'Dockerfile.nvcc' dir 'scripts/docker' - label 'docker' + additionalBuildArgs '--build-arg BASE=nvcr.io/nvidia/cuda:11.7.1-devel-ubuntu20.04@sha256:fc997521e612899a01dce92820f5f5a201dd943ebfdc3e49ba0706d491a39d2d' + label 'nvidia-docker && volta' + args '-v /tmp/ccache.kokkos:/tmp/ccache --env NVIDIA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES' } } - environment { - OMP_NUM_THREADS = 8 - OMP_NESTED = 'true' - OMP_MAX_ACTIVE_LEVELS = 3 - OMP_PROC_BIND = 'true' - } steps { + sh 'ccache --zero-stats' sh '''rm -rf build && mkdir -p build && cd build && \ - cmake \ - -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_CXX_STANDARD=17 \ - -DCMAKE_CXX_FLAGS=-Werror \ - -DKokkos_ARCH_NATIVE=ON \ - -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ - -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ - -DKokkos_ENABLE_DEPRECATION_WARNINGS=ON \ - -DKokkos_ENABLE_TESTS=ON \ - -DKokkos_ENABLE_BENCHMARKS=ON \ - -DKokkos_ENABLE_OPENMP=ON \ - -DKokkos_ENABLE_LIBDL=OFF \ - -DKokkos_ENABLE_LIBQUADMATH=ON \ - -DKokkos_ENABLE_SERIAL=ON \ - .. && \ - make -j8 && ctest --verbose && gcc -I$PWD/../core/src/ ../core/unit_test/tools/TestCInterface.c''' + ../gnu_generate_makefile.bash \ + --with-options=compiler_warnings \ + --cxxflags="-Werror -Werror=all-warnings" \ + --cxxstandard=c++17 \ + --with-cuda \ + --with-cuda-options=enable_lambda \ + --arch=Volta70 \ + && \ + make test -j8''' + } + post { + always { + sh 'ccache --show-stats' + } } } } diff --git a/packages/kokkos/.jenkins_nightly b/packages/kokkos/.jenkins_nightly index 8dd02e9f028a..15bef607258f 100644 --- a/packages/kokkos/.jenkins_nightly +++ b/packages/kokkos/.jenkins_nightly @@ -93,19 +93,23 @@ pipeline { -DKokkos_ENABLE_EXAMPLES=ON \ -DKokkos_ENABLE_TESTS=ON \ -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ - -DKokkos_ENABLE_DEPRECATION_WARNINGS=ON \ -DKokkos_ENABLE_SERIAL=ON \ .. && \ - make -j8 && ctest --verbose + make -j8 && ctest --no-compress-output -T Test --verbose ''' } + post { + always { + xunit([CTest(deleteOutputFiles: true, failIfNotNew: true, pattern: 'build/Testing/**/Test.xml', skipNoTestFiles: false, stopProcessingIfError: true)]) + } + } } - stage('HIP-ROCM-6.1') { + stage('HIP-ROCM-6.2') { agent { dockerfile { filename 'Dockerfile.hipcc' dir 'scripts/docker' - additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-22.04:6.1.2-complete' + additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-22.04:6.2-complete' label 'rocm-docker && AMD_Radeon_Instinct_MI210' args '-v /tmp/ccache.kokkos:/tmp/ccache --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --env HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES' } @@ -125,16 +129,16 @@ pipeline { -DKokkos_ARCH_NATIVE=ON \ -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ - -DKokkos_ENABLE_DEPRECATION_WARNINGS=ON \ -DKokkos_ENABLE_TESTS=ON \ -DKokkos_ENABLE_BENCHMARKS=ON \ -DKokkos_ENABLE_HIP=ON \ .. && \ - make -j8 && ctest --verbose''' + make -j8 && ctest --no-compress-output -T Test --verbose''' } post { always { sh 'ccache --show-stats' + xunit([CTest(deleteOutputFiles: true, failIfNotNew: true, pattern: 'build/Testing/**/Test.xml', skipNoTestFiles: false, stopProcessingIfError: true)]) } } } diff --git a/packages/kokkos/CHANGELOG.md b/packages/kokkos/CHANGELOG.md index 7b1d69e56630..6c237ebca867 100644 --- a/packages/kokkos/CHANGELOG.md +++ b/packages/kokkos/CHANGELOG.md @@ -1,7 +1,101 @@ # CHANGELOG +## 4.5.00 + +[Full Changelog](https://github.com/kokkos/kokkos/compare/4.4.01...4.5.00) + +### Features + +* SYCL backend graduated to production ready +* Introduce new `SequentialHostInit` view allocation property [\#7229](https://github.com/kokkos/kokkos/pull/7229) (backported in 4.4.01) +* Support building with Run-Time Type Information (RTTI) disabled +* Add new `KOKKOS_RELOCATABLE_FUNCTION` function annotation macro [\#5993](https://github.com/kokkos/kokkos/pull/5993) + +### Backend and Architecture Enhancements + +#### CUDA + +* Adding occupancy tuning for CUDA architectures [\#6788](https://github.com/kokkos/kokkos/pull/6788) +* By default disable `cudaMallocAsync` (i.e., revert the change made in version 4.2) [\#7353](https://github.com/kokkos/kokkos/pull/7353) + +#### HIP + +* Add support for AMD Phoenix APUs with Radeon 740M/760M/780M/880M/890M [\#7162](https://github.com/kokkos/kokkos/pull/7162) +* Update maximum waves per CU values for consumer card [\#7347](https://github.com/kokkos/kokkos/pull/7347) +* Check that Kokkos is running on the architecture it was compiled for [\#7379](https://github.com/kokkos/kokkos/pull/7379) +* Add opt-in option to use `hipMallocAsync` instead of `hipMalloc` [\#7324](https://github.com/kokkos/kokkos/pull/7324) +* Introduce new architecture option `AMD_GFX942_APU` for MI300A [\#7462](https://github.com/kokkos/kokkos/pull/7462) + +#### SYCL + +* Move the `SYCL` backend out of the `Experimental` namespace [\#7171](https://github.com/kokkos/kokkos/pull/7171) +* Introduce `KOKKOS_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE` as CMake option [\#5993](https://github.com/kokkos/kokkos/pull/5993) + +#### OpenACC + +* Add support for building with the Clacc compiler [\#7198](https://github.com/kokkos/kokkos/pull/7198) +* Workaround NVHPC collapse clause bug for `MDRangePolicy` [\#7425](https://github.com/kokkos/kokkos/pull/7425) + +#### HPX + +* Implement `Experimental::partition_space` to produce truly independent execution spaces [\#7287](https://github.com/kokkos/kokkos/pull/7287) + +#### Threads + +* Fix compilation for `parallel_reduce` `MDRange` with `Dynamic` scheduling [\#7478](https://github.com/kokkos/kokkos/pull/7478) +* Fix race conditions on ARM architectures [\#7498](https://github.com/kokkos/kokkos/pull/7498) + +#### OpenMP + +* Fix run time behavior when compiling with `-fvisibility-hidden` [\#7284](https://github.com/kokkos/kokkos/pull/7284) (backported in 4.4.01) +* Fix linking with Cray Clang compiler [\#7341](https://github.com/kokkos/kokkos/pull/7341) + +#### Serial + +* Allow `Kokkos_ENABLE_ATOMICS_BYPASS` to skip mutexes to remediate performance regression in 4.4 [\#7369](https://github.com/kokkos/kokkos/pull/7369) + +### General Enhancements + +* Improve `View` initialization/destruction for non-scalar trivial and trivially-destructible types [\#7219](https://github.com/kokkos/kokkos/pull/7219) [\#7225](https://github.com/kokkos/kokkos/pull/7225) +* Add getters for default tile sizes used in `MDRangePolicy` [\#6839](https://github.com/kokkos/kokkos/pull/6839) +* Improve performance of `Kokkos::sort` when `std::sort` is used [\#7264](https://github.com/kokkos/kokkos/pull/7264) +* Add range-based for loop support for `Array` [\#7293](https://github.com/kokkos/kokkos/pull/7293) +* Allow functors as reducers for nested team parallel reduce [\#6921](https://github.com/kokkos/kokkos/pull/6921) +* Avoid making copies of string rvalue reference arguments to `view_alloc()` [\#7364](https://github.com/kokkos/kokkos/pull/7364) +* Add `atomic_{mod,xor,nand,lshift,rshift}` [\#7458](https://github.com/kokkos/kokkos/pull/7458) +* Allow using `SequentialHostInit` with `Kokkos::DualView` [\#7456](https://github.com/kokkos/kokkos/pull/7456) +* Add `Graph::instantiate()` [\#7240](https://github.com/kokkos/kokkos/pull/7240) +* Allow an arbitrary execution space instance to be used in `Kokkos::Graph::submit()` [\#7249](https://github.com/kokkos/kokkos/pull/7249) +* Enable compile-time diagnostic of illegal reduction target for graphs [\#7460](https://github.com/kokkos/kokkos/pull/7460) + +### Build System Changes + +* Make sure backend-specific options such as `IMPL_CUDA_MALLOC_ASYNC` only show when that backend is actually enabled [\#7228](https://github.com/kokkos/kokkos/pull/7228) +* Major refactoring removing `TriBITS` paths [\#6164](https://github.com/kokkos/kokkos/pull/6164) +* Add support for SpacemiT K60 (RISC-V) [\#7160](https://github.com/kokkos/kokkos/pull/7160) + +### Deprecations + +* Deprecate Tasking interface [\#7393](https://github.com/kokkos/kokkos/pull/7393) +* Deprecate `atomic_query_version`, `atomic_assign`, `atomic_compare_exchange_strong`, `atomic_{inc, dec}rement` [\#7458](https://github.com/kokkos/kokkos/pull/7458) +* Deprecate `{OpenMP,HPX}::is_asynchronous()` [\#7322](https://github.com/kokkos/kokkos/pull/7322) + +### Bug Fixes + +* Fix undefined behavior in `BinSort` when sorting within bins on host [\#7223](https://github.com/kokkos/kokkos/pull/7223) +* Using CUDA limits to set extents for blocks, grids [\#7235](https://github.com/kokkos/kokkos/pull/7235) +* Fix `deep_copy (serial_exec, dst, src)` with multiple host backends [\#7245](https://github.com/kokkos/kokkos/pull/7245) +* Skip `RangePolicy` bounds conversion checks if roundtrip convertibility is not provided [\#7172](https://github.com/kokkos/kokkos/pull/7172) +* Allow extracting host and device views from `DualView` with `const` value type [\#7242](https://github.com/kokkos/kokkos/pull/7242) +* Fix `TeamPolicy` array reduction for CUDA and HIP [\#6296](https://github.com/kokkos/kokkos/pull/6296) +* Fix implicit copy assignment operators in few AVX2 masks being deleted [\#7296](https://github.com/kokkos/kokkos/pull/7296) +* Fix configuring without architecture flags for SYCL [\#7303](https://github.com/kokkos/kokkos/pull/7303) +* Set an initial value index during join of `MinLoc`, `MaxLoc` or `MinMaxLoc` [\#7330](https://github.com/kokkos/kokkos/pull/7330) +* Fix storage lifetime of driver for global launch of graph nodes for CUDA and HIP [\#7365](https://github.com/kokkos/kokkos/pull/7365) +* Make `value_type` for `RandomAccessIterator` non-`const` [\#7485](https://github.com/kokkos/kokkos/pull/7485) + ## [4.4.01](https://github.com/kokkos/kokkos/tree/4.4.01) -[Full Changelog](https://github.com/kokkos/kokkos/compare/4.0.00...4.4.01) +[Full Changelog](https://github.com/kokkos/kokkos/compare/4.4.00...4.4.01) ### Features: * Introduce new SequentialHostInit view allocation property [\#7229](https://github.com/kokkos/kokkos/pull/7229) @@ -13,7 +107,7 @@ ### Bug Fixes * OpenMP: Fix issue related to the visibility of an internal symbol with shared libraries that affected `ScatterView` in particular [\#7284](https://github.com/kokkos/kokkos/pull/7284) -* Fix implicit copy assignment operators in few AVX2 masks being deleted [#7296](https://github.com/kokkos/kokkos/pull/7296) +* Fix implicit copy assignment operators in few AVX2 masks being deleted [\#7296](https://github.com/kokkos/kokkos/pull/7296) ## [4.4.00](https://github.com/kokkos/kokkos/tree/4.4.00) [Full Changelog](https://github.com/kokkos/kokkos/compare/4.3.01...4.4.00) @@ -57,6 +151,7 @@ * SIMD: Allow flexible vector width for 32 bit types [\#6802](https://github.com/kokkos/kokkos/pull/6802) * Updates for `Kokkos::Array`: add `kokkos_swap(Array)` specialization [\#6943](https://github.com/kokkos/kokkos/pull/6943), add `Kokkos::to_array` [\#6375](https://github.com/kokkos/kokkos/pull/6375), make `Kokkos::Array` equality-comparable [\#7148](https://github.com/kokkos/kokkos/pull/7148) * Structured binding support for `Kokkos::complex` [\#7040](https://github.com/kokkos/kokkos/pull/7040) +* Introduce `KOKKOS_DEDUCTION_GUIDE` macro to allow for portable user-defined deduction guides [\#6954](https://github.com/kokkos/kokkos/pull/6954) ### Build System Changes * Do not require OpenMP support for languages other than CXX [\#6965](https://github.com/kokkos/kokkos/pull/6965) @@ -1388,7 +1483,7 @@ **Closed issues:** - Silent error (Validate storage level arg to set_scratch_size) [\#3097](https://github.com/kokkos/kokkos/issues/3097) -- Remove KOKKKOS\_ENABLE\_PROFILING Option [\#3095](https://github.com/kokkos/kokkos/issues/3095) +- Remove KOKKOS\_ENABLE\_PROFILING Option [\#3095](https://github.com/kokkos/kokkos/issues/3095) - Cuda 11 -\> allow C++17 [\#3083](https://github.com/kokkos/kokkos/issues/3083) - In source build failure not explained [\#3081](https://github.com/kokkos/kokkos/issues/3081) - Allow naming of Views for initialization kernel [\#3070](https://github.com/kokkos/kokkos/issues/3070) diff --git a/packages/kokkos/CMakeLists.txt b/packages/kokkos/CMakeLists.txt index 736cbac218c2..f0bf8e3634a9 100644 --- a/packages/kokkos/CMakeLists.txt +++ b/packages/kokkos/CMakeLists.txt @@ -1,12 +1,11 @@ cmake_minimum_required(VERSION 3.16 FATAL_ERROR) # Disable in-source builds to prevent source tree corruption. -if( "${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_BINARY_DIR}" ) - message( FATAL_ERROR "FATAL: In-source builds are not allowed. You should create a separate directory for build files and delete CMakeCache.txt." ) -endif() - -if (COMMAND TRIBITS_PACKAGE) - TRIBITS_PACKAGE(Kokkos) +if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_BINARY_DIR}") + message( + FATAL_ERROR + "FATAL: In-source builds are not allowed. You should create a separate directory for build files and delete CMakeCache.txt." + ) endif() # We want to determine if options are given with the wrong case @@ -15,143 +14,142 @@ endif() # form a list of all the given variables. If it begins with any # case of KoKkOS, we add it to the list. -GET_CMAKE_PROPERTY(_variableNames VARIABLES) -SET(KOKKOS_GIVEN_VARIABLES) -FOREACH (var ${_variableNames}) - STRING(TOUPPER ${var} UC_VAR) - STRING(FIND ${UC_VAR} KOKKOS IDX) - IF (${IDX} EQUAL 0) - LIST(APPEND KOKKOS_GIVEN_VARIABLES ${var}) - ENDIF() -ENDFOREACH() +get_cmake_property(_variableNames VARIABLES) +set(KOKKOS_GIVEN_VARIABLES) +foreach(var ${_variableNames}) + string(TOUPPER ${var} UC_VAR) + string(FIND ${UC_VAR} KOKKOS IDX) + if(${IDX} EQUAL 0) + list(APPEND KOKKOS_GIVEN_VARIABLES ${var}) + endif() +endforeach() # Basic initialization (Used in KOKKOS_SETTINGS) -SET(Kokkos_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) -SET(KOKKOS_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) -SET(KOKKOS_SRC_PATH ${Kokkos_SOURCE_DIR}) -SET(KOKKOS_PATH ${Kokkos_SOURCE_DIR}) -SET(KOKKOS_TOP_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}) - -# Is this a build as part of Trilinos? -IF(COMMAND TRIBITS_PACKAGE_DECL) - SET(KOKKOS_HAS_TRILINOS ON) -ELSE() - SET(KOKKOS_HAS_TRILINOS OFF) - SET(PACKAGE_NAME Kokkos) - SET(PACKAGE_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}") -ENDIF() -# Is this build a subdirectory of another project -GET_DIRECTORY_PROPERTY(HAS_PARENT PARENT_DIRECTORY) +set(Kokkos_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) +set(KOKKOS_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) +set(KOKKOS_SRC_PATH ${Kokkos_SOURCE_DIR}) +set(KOKKOS_PATH ${Kokkos_SOURCE_DIR}) +set(KOKKOS_TOP_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}) +set(PACKAGE_NAME Kokkos) +set(PACKAGE_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}") -INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_functions.cmake) -INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_pick_cxx_std.cmake) +# Is this build a subdirectory of another project +get_directory_property(HAS_PARENT PARENT_DIRECTORY) -SET(KOKKOS_ENABLED_OPTIONS) #exported in config file -SET(KOKKOS_ENABLED_DEVICES) #exported in config file -SET(KOKKOS_ENABLED_TPLS) #exported in config file -SET(KOKKOS_ENABLED_ARCH_LIST) #exported in config file +include(${KOKKOS_SRC_PATH}/cmake/kokkos_functions.cmake) +include(${KOKKOS_SRC_PATH}/cmake/kokkos_pick_cxx_std.cmake) + +set(KOKKOS_ENABLED_OPTIONS) #exported in config file +set(KOKKOS_ENABLED_DEVICES) #exported in config file +set(KOKKOS_ENABLED_TPLS) #exported in config file +set(KOKKOS_ENABLED_ARCH_LIST) #exported in config file #These are helper flags used for sanity checks during config #Certain features should depend on other features being configured first -SET(KOKKOS_CFG_DAG_NONE On) #sentinel to indicate no dependencies -SET(KOKKOS_CFG_DAG_DEVICES_DONE Off) -SET(KOKKOS_CFG_DAG_OPTIONS_DONE Off) -SET(KOKKOS_CFG_DAG_ARCH_DONE Off) -SET(KOKKOS_CFG_DAG_CXX_STD_DONE Off) -SET(KOKKOS_CFG_DAG_COMPILER_ID_DONE Off) -FUNCTION(KOKKOS_CFG_DEPENDS SUCCESSOR PRECURSOR) - SET(PRE_FLAG KOKKOS_CFG_DAG_${PRECURSOR}) - SET(POST_FLAG KOKKOS_CFG_DAG_${SUCCESSOR}) - IF (NOT ${PRE_FLAG}) - MESSAGE(FATAL_ERROR "Bad CMake refactor: feature ${SUCCESSOR} cannot be configured until ${PRECURSOR} is configured") - ENDIF() - GLOBAL_SET(${POST_FLAG} On) -ENDFUNCTION() - - -LIST(APPEND CMAKE_MODULE_PATH cmake/Modules) - -IF(NOT KOKKOS_HAS_TRILINOS) - set(CMAKE_DISABLE_SOURCE_CHANGES ON) - set(CMAKE_DISABLE_IN_SOURCE_BUILD ON) - - # What language are we compiling Kokkos as - # downstream dependencies need to match this! - SET(KOKKOS_COMPILE_LANGUAGE CXX) - # use lower case here since we didn't parse options yet - IF (Kokkos_ENABLE_COMPILE_AS_CMAKE_LANGUAGE AND Kokkos_ENABLE_CUDA) - - # Without this as a language for the package we would get a C++ compiler enabled. - # but we still need a C++ compiler even if we build all our cpp files as CUDA only - # because otherwise the C++ features don't work etc. - # This is just the rather odd way CMake does this, since CUDA doesn't imply C++ even - # though it is a C++ extension ... (but I guess it didn't use to be back in CUDA 4 or 5 - # days. - SET(KOKKOS_INTERNAL_EXTRA_COMPILE_LANGUAGE CXX) - - SET(KOKKOS_COMPILE_LANGUAGE CUDA) - ENDIF() - # use lower case here since we haven't parsed options yet - IF (Kokkos_ENABLE_COMPILE_AS_CMAKE_LANGUAGE AND Kokkos_ENABLE_HIP) - - # Without this as a language for the package we would get a C++ compiler enabled. - # but we still need a C++ compiler even if we build all our cpp files as HIP only - # because otherwise the C++ features don't work etc. - SET(KOKKOS_INTERNAL_EXTRA_COMPILE_LANGUAGE CXX) - - SET(KOKKOS_COMPILE_LANGUAGE HIP) - ENDIF() - - IF (Spack_WORKAROUND) - IF (Kokkos_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) - MESSAGE(FATAL_ERROR "Can't currently use Kokkos_ENABLE_COMPILER_AS_CMAKE_LANGUAGE in a spack installation!") - ENDIF() - - #if we are explicitly using Spack for development, - #nuke the Spack compiler - SET(SPACK_CXX $ENV{SPACK_CXX}) - IF(SPACK_CXX) - SET(CMAKE_CXX_COMPILER ${SPACK_CXX} CACHE STRING "the C++ compiler" FORCE) - SET(ENV{CXX} ${SPACK_CXX}) - ENDIF() - ENDIF() - # Always call the project command to define Kokkos_ variables - # and to make sure that C++ is an enabled language - PROJECT(Kokkos ${KOKKOS_COMPILE_LANGUAGE} ${KOKKOS_INTERNAL_EXTRA_COMPILE_LANGUAGE}) - IF(NOT HAS_PARENT) - IF (NOT CMAKE_BUILD_TYPE) - SET(DEFAULT_BUILD_TYPE "RelWithDebInfo") - MESSAGE(STATUS "Setting build type to '${DEFAULT_BUILD_TYPE}' as none was specified.") - SET(CMAKE_BUILD_TYPE "${DEFAULT_BUILD_TYPE}" CACHE STRING - "Choose the type of build, options are: Debug, Release, RelWithDebInfo and MinSizeRel." - FORCE) - ENDIF() - ENDIF() -ELSE() - SET(KOKKOS_COMPILE_LANGUAGE CXX) -ENDIF() - -IF (NOT CMAKE_SIZEOF_VOID_P) - STRING(FIND ${CMAKE_CXX_COMPILER} nvcc_wrapper FIND_IDX) - IF (NOT FIND_IDX STREQUAL -1) - MESSAGE(FATAL_ERROR "Kokkos did not configure correctly and failed to validate compiler. The most likely cause is CUDA linkage using nvcc_wrapper. Please ensure your CUDA environment is correctly configured.") - ELSE() - MESSAGE(FATAL_ERROR "Kokkos did not configure correctly and failed to validate compiler. The most likely cause is linkage errors during CMake compiler validation. Please consult the CMake error log shown below for the exact error during compiler validation") - ENDIF() -ELSEIF (NOT CMAKE_SIZEOF_VOID_P EQUAL 8) - IF(CMAKE_SIZEOF_VOID_P EQUAL 4) - MESSAGE(WARNING "32-bit builds are experimental and not officially supported.") - SET(KOKKOS_IMPL_32BIT ON) - ELSE() - MESSAGE(FATAL_ERROR "Kokkos assumes a 64-bit build, i.e., 8-byte pointers, but found ${CMAKE_SIZEOF_VOID_P}-byte pointers instead;") - ENDIF() -ENDIF() +set(KOKKOS_CFG_DAG_NONE On) #sentinel to indicate no dependencies +set(KOKKOS_CFG_DAG_DEVICES_DONE Off) +set(KOKKOS_CFG_DAG_OPTIONS_DONE Off) +set(KOKKOS_CFG_DAG_ARCH_DONE Off) +set(KOKKOS_CFG_DAG_CXX_STD_DONE Off) +set(KOKKOS_CFG_DAG_COMPILER_ID_DONE Off) +function(KOKKOS_CFG_DEPENDS SUCCESSOR PRECURSOR) + set(PRE_FLAG KOKKOS_CFG_DAG_${PRECURSOR}) + set(POST_FLAG KOKKOS_CFG_DAG_${SUCCESSOR}) + if(NOT ${PRE_FLAG}) + message( + FATAL_ERROR "Bad CMake refactor: feature ${SUCCESSOR} cannot be configured until ${PRECURSOR} is configured" + ) + endif() + global_set(${POST_FLAG} On) +endfunction() + +list(APPEND CMAKE_MODULE_PATH cmake/Modules) + +set(CMAKE_DISABLE_SOURCE_CHANGES ON) +set(CMAKE_DISABLE_IN_SOURCE_BUILD ON) + +# What language are we compiling Kokkos as +# downstream dependencies need to match this! +set(KOKKOS_COMPILE_LANGUAGE CXX) +# use lower case here since we didn't parse options yet +if(Kokkos_ENABLE_COMPILE_AS_CMAKE_LANGUAGE AND Kokkos_ENABLE_CUDA) + + # Without this as a language for the package we would get a C++ compiler enabled. + # but we still need a C++ compiler even if we build all our cpp files as CUDA only + # because otherwise the C++ features don't work etc. + # This is just the rather odd way CMake does this, since CUDA doesn't imply C++ even + # though it is a C++ extension ... (but I guess it didn't use to be back in CUDA 4 or 5 + # days. + set(KOKKOS_INTERNAL_EXTRA_COMPILE_LANGUAGE CXX) + + set(KOKKOS_COMPILE_LANGUAGE CUDA) +endif() +# use lower case here since we haven't parsed options yet +if(Kokkos_ENABLE_COMPILE_AS_CMAKE_LANGUAGE AND Kokkos_ENABLE_HIP) + + # Without this as a language for the package we would get a C++ compiler enabled. + # but we still need a C++ compiler even if we build all our cpp files as HIP only + # because otherwise the C++ features don't work etc. + set(KOKKOS_INTERNAL_EXTRA_COMPILE_LANGUAGE CXX) + set(KOKKOS_COMPILE_LANGUAGE HIP) +endif() + +if(Spack_WORKAROUND) + if(Kokkos_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) + message(FATAL_ERROR "Can't currently use Kokkos_ENABLE_COMPILER_AS_CMAKE_LANGUAGE in a spack installation!") + endif() + + #if we are explicitly using Spack for development, + #nuke the Spack compiler + set(SPACK_CXX $ENV{SPACK_CXX}) + if(SPACK_CXX) + set(CMAKE_CXX_COMPILER ${SPACK_CXX} CACHE STRING "the C++ compiler" FORCE) + set(ENV{CXX} ${SPACK_CXX}) + endif() +endif() +# Always call the project command to define Kokkos_ variables +# and to make sure that C++ is an enabled language +project(Kokkos ${KOKKOS_COMPILE_LANGUAGE} ${KOKKOS_INTERNAL_EXTRA_COMPILE_LANGUAGE}) +if(NOT HAS_PARENT) + if(NOT CMAKE_BUILD_TYPE) + set(DEFAULT_BUILD_TYPE "RelWithDebInfo") + message(STATUS "Setting build type to '${DEFAULT_BUILD_TYPE}' as none was specified.") + set(CMAKE_BUILD_TYPE "${DEFAULT_BUILD_TYPE}" + CACHE STRING "Choose the type of build, options are: Debug, Release, RelWithDebInfo and MinSizeRel." FORCE + ) + endif() +endif() + +if(NOT CMAKE_SIZEOF_VOID_P) + string(FIND ${CMAKE_CXX_COMPILER} nvcc_wrapper FIND_IDX) + if(NOT FIND_IDX STREQUAL -1) + message( + FATAL_ERROR + "Kokkos did not configure correctly and failed to validate compiler. The most likely cause is CUDA linkage using nvcc_wrapper. Please ensure your CUDA environment is correctly configured." + ) + else() + message( + FATAL_ERROR + "Kokkos did not configure correctly and failed to validate compiler. The most likely cause is linkage errors during CMake compiler validation. Please consult the CMake error log shown below for the exact error during compiler validation" + ) + endif() +elseif(NOT CMAKE_SIZEOF_VOID_P EQUAL 8) + if(CMAKE_SIZEOF_VOID_P EQUAL 4) + message(WARNING "32-bit builds are experimental and not officially supported.") + set(KOKKOS_IMPL_32BIT ON) + else() + message( + FATAL_ERROR + "Kokkos assumes a 64-bit build, i.e., 8-byte pointers, but found ${CMAKE_SIZEOF_VOID_P}-byte pointers instead;" + ) + endif() +endif() set(Kokkos_VERSION_MAJOR 4) -set(Kokkos_VERSION_MINOR 4) -set(Kokkos_VERSION_PATCH 1) +set(Kokkos_VERSION_MINOR 5) +set(Kokkos_VERSION_PATCH 0) set(Kokkos_VERSION "${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR}.${Kokkos_VERSION_PATCH}") message(STATUS "Kokkos version: ${Kokkos_VERSION}") math(EXPR KOKKOS_VERSION "${Kokkos_VERSION_MAJOR} * 10000 + ${Kokkos_VERSION_MINOR} * 100 + ${Kokkos_VERSION_PATCH}") @@ -164,58 +162,54 @@ math(EXPR KOKKOS_VERSION_PATCH "${KOKKOS_VERSION} % 100") # Load either the real TriBITS or a TriBITS wrapper # for certain utility functions that are universal (like GLOBAL_SET) -INCLUDE(${KOKKOS_SRC_PATH}/cmake/fake_tribits.cmake) +include(${KOKKOS_SRC_PATH}/cmake/fake_tribits.cmake) -IF (Kokkos_ENABLE_CUDA) +if(Kokkos_ENABLE_CUDA) # If we are building CUDA, we have tricked CMake because we declare a CXX project # If the default C++ standard for a given compiler matches the requested # standard, then CMake just omits the -std flag in later versions of CMake # This breaks CUDA compilation (CUDA compiler can have a different default # -std then the underlying host compiler by itself). Setting this variable # forces CMake to always add the -std flag even if it thinks it doesn't need it - GLOBAL_SET(CMAKE_CXX_STANDARD_DEFAULT 98) -ENDIF() + global_set(CMAKE_CXX_STANDARD_DEFAULT 98) +endif() # These are the variables we will append to as we go # I really wish these were regular variables # but scoping issues can make it difficult -GLOBAL_SET(KOKKOS_COMPILE_OPTIONS) -GLOBAL_SET(KOKKOS_LINK_OPTIONS) -GLOBAL_SET(KOKKOS_AMDGPU_OPTIONS) -GLOBAL_SET(KOKKOS_CUDA_OPTIONS) -GLOBAL_SET(KOKKOS_CUDAFE_OPTIONS) -GLOBAL_SET(KOKKOS_XCOMPILER_OPTIONS) +global_set(KOKKOS_COMPILE_OPTIONS) +global_set(KOKKOS_LINK_OPTIONS) +global_set(KOKKOS_AMDGPU_OPTIONS) +global_set(KOKKOS_CUDA_OPTIONS) +global_set(KOKKOS_CUDAFE_OPTIONS) +global_set(KOKKOS_XCOMPILER_OPTIONS) # We need to append text here for making sure TPLs # we import are available for an installed Kokkos -GLOBAL_SET(KOKKOS_TPL_EXPORTS) +global_set(KOKKOS_TPL_EXPORTS) # KOKKOS_DEPENDENCE is used by kokkos_launch_compiler -GLOBAL_SET(KOKKOS_COMPILE_DEFINITIONS KOKKOS_DEPENDENCE) +global_set(KOKKOS_COMPILE_DEFINITIONS KOKKOS_DEPENDENCE) # MSVC never goes through kokkos_launch_compiler -IF(NOT MSVC) - GLOBAL_APPEND(KOKKOS_LINK_OPTIONS -DKOKKOS_DEPENDENCE) -ENDIF() +if(NOT MSVC) + global_append(KOKKOS_LINK_OPTIONS -DKOKKOS_DEPENDENCE) +endif() + +include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/kokkos_configure_trilinos.cmake) -IF(Kokkos_ENABLE_TESTS AND NOT KOKKOS_HAS_TRILINOS) +if(Kokkos_ENABLE_TESTS) find_package(GTest QUIET) -ENDIF() +endif() # Include a set of Kokkos-specific wrapper functions that # will either call raw CMake or TriBITS # These are functions like KOKKOS_INCLUDE_DIRECTORIES -INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_tribits.cmake) - +include(${KOKKOS_SRC_PATH}/cmake/kokkos_tribits.cmake) # Check the environment and set certain variables # to allow platform-specific checks -INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_check_env.cmake) +include(${KOKKOS_SRC_PATH}/cmake/kokkos_check_env.cmake) -IF(NOT KOKKOS_HAS_TRILINOS) - # This does not work in Trilinos and we simply don't care - # to fix it for Trilinos - # Gather information about the runtime environment - INCLUDE(${KOKKOS_SRC_PATH}/cmake/build_env_info.cmake) - check_git_setup() -ENDIF() +include(${KOKKOS_SRC_PATH}/cmake/build_env_info.cmake) +check_git_setup() # The build environment setup goes in the following steps # 1) Check all the enable options. This includes checking Kokkos_DEVICES @@ -223,102 +217,54 @@ ENDIF() # 3) Check the CXX standard and select important CXX flags # 4) Check for any third-party libraries (TPLs) like hwloc # 5) Check if optimizing for a particular architecture and add arch-specific flags -KOKKOS_SETUP_BUILD_ENVIRONMENT() +kokkos_setup_build_environment() # Finish off the build # 6) Recurse into subdirectories and configure individual libraries # 7) Export and install targets -OPTION(BUILD_SHARED_LIBS "Build shared libraries" OFF) +option(BUILD_SHARED_LIBS "Build shared libraries" OFF) -SET(KOKKOS_COMPONENT_LIBRARIES kokkoscore kokkoscontainers kokkosalgorithms kokkossimd) -SET_PROPERTY(GLOBAL PROPERTY KOKKOS_INT_LIBRARIES kokkos ${KOKKOS_COMPONENT_LIBRARIES}) +set(KOKKOS_COMPONENT_LIBRARIES kokkoscore kokkoscontainers kokkosalgorithms kokkossimd) +set_property(GLOBAL PROPERTY KOKKOS_INT_LIBRARIES kokkos ${KOKKOS_COMPONENT_LIBRARIES}) -IF (KOKKOS_HAS_TRILINOS) - SET(TRILINOS_INCDIR ${${PROJECT_NAME}_INSTALL_INCLUDE_DIR}) - SET(KOKKOS_HEADER_DIR ${TRILINOS_INCDIR}) - SET(KOKKOS_IS_SUBDIRECTORY TRUE) -ELSEIF(HAS_PARENT) - SET(KOKKOS_HEADER_DIR "include/kokkos") - SET(KOKKOS_IS_SUBDIRECTORY TRUE) -ELSE() - SET(KOKKOS_HEADER_DIR "${CMAKE_INSTALL_INCLUDEDIR}") - SET(KOKKOS_IS_SUBDIRECTORY FALSE) -ENDIF() +if(HAS_PARENT) + set(KOKKOS_HEADER_DIR "include/kokkos") + set(KOKKOS_IS_SUBDIRECTORY TRUE) +else() + set(KOKKOS_HEADER_DIR "${CMAKE_INSTALL_INCLUDEDIR}") + set(KOKKOS_IS_SUBDIRECTORY FALSE) +endif() #------------------------------------------------------------------------------ # # A) Forward declare the package so that certain options are also defined for # subpackages -## This restores the old behavior of ProjectCompilerPostConfig.cmake -# We must do this before KOKKOS_PACKAGE_DECL -IF (KOKKOS_HAS_TRILINOS) - # Overwrite the old flags at the top-level - # Because Tribits doesn't use lists, it uses spaces for the list of CXX flags - # we have to match the annoying behavior, also we have to preserve quotes - # which needs another workaround. - SET(KOKKOS_COMPILE_OPTIONS_TMP) - IF (KOKKOS_ENABLE_HIP) - LIST(APPEND KOKKOS_COMPILE_OPTIONS ${KOKKOS_AMDGPU_OPTIONS}) - ENDIF() - FOREACH(OPTION ${KOKKOS_COMPILE_OPTIONS}) - STRING(FIND "${OPTION}" " " OPTION_HAS_WHITESPACE) - IF(OPTION_HAS_WHITESPACE EQUAL -1) - LIST(APPEND KOKKOS_COMPILE_OPTIONS_TMP "${OPTION}") - ELSE() - LIST(APPEND KOKKOS_COMPILE_OPTIONS_TMP "\"${OPTION}\"") - ENDIF() - ENDFOREACH() - STRING(REPLACE ";" " " KOKKOSCORE_COMPILE_OPTIONS "${KOKKOS_COMPILE_OPTIONS_TMP}") - LIST(APPEND KOKKOS_ALL_COMPILE_OPTIONS ${KOKKOS_COMPILE_OPTIONS}) - IF (KOKKOS_ENABLE_CUDA) - LIST(APPEND KOKKOS_ALL_COMPILE_OPTIONS ${KOKKOS_CUDA_OPTIONS}) - ENDIF() - FOREACH(XCOMP_FLAG ${KOKKOS_XCOMPILER_OPTIONS}) - SET(KOKKOSCORE_XCOMPILER_OPTIONS "${KOKKOSCORE_XCOMPILER_OPTIONS} -Xcompiler ${XCOMP_FLAG}") - LIST(APPEND KOKKOS_ALL_COMPILE_OPTIONS -Xcompiler ${XCOMP_FLAG}) - ENDFOREACH() - IF (KOKKOS_ENABLE_CUDA) - STRING(REPLACE ";" " " KOKKOSCORE_CUDA_OPTIONS "${KOKKOS_CUDA_OPTIONS}") - FOREACH(CUDAFE_FLAG ${KOKKOS_CUDAFE_OPTIONS}) - SET(KOKKOSCORE_CUDAFE_OPTIONS "${KOKKOSCORE_CUDAFE_OPTIONS} -Xcudafe ${CUDAFE_FLAG}") - LIST(APPEND KOKKOS_ALL_COMPILE_OPTIONS -Xcudafe ${CUDAFE_FLAG}) - ENDFOREACH() - ENDIF() - #These flags get set up in KOKKOS_PACKAGE_DECL, which means they - #must be configured before KOKKOS_PACKAGE_DECL - SET(KOKKOS_ALL_COMPILE_OPTIONS - $<$:${KOKKOS_ALL_COMPILE_OPTIONS}>) -ENDIF() - - #------------------------------------------------------------------------------ # # D) Process the subpackages (subdirectories) for Kokkos # -KOKKOS_PROCESS_SUBPACKAGES() - +kokkos_process_subpackages() #------------------------------------------------------------------------------ # # E) If Kokkos itself is enabled, process the Kokkos package # -KOKKOS_PACKAGE_POSTPROCESS() -KOKKOS_CONFIGURE_CORE() +kokkos_configure_core() -IF (NOT KOKKOS_HAS_TRILINOS AND NOT Kokkos_INSTALL_TESTING) - ADD_LIBRARY(kokkos INTERFACE) +if(NOT Kokkos_INSTALL_TESTING) + add_library(kokkos INTERFACE) #Make sure in-tree projects can reference this as Kokkos:: #to match the installed target names - ADD_LIBRARY(Kokkos::kokkos ALIAS kokkos) + add_library(Kokkos::kokkos ALIAS kokkos) # all_libs target is required for TriBITS-compliance - ADD_LIBRARY(Kokkos::all_libs ALIAS kokkos) - TARGET_LINK_LIBRARIES(kokkos INTERFACE ${KOKKOS_COMPONENT_LIBRARIES}) - KOKKOS_INTERNAL_ADD_LIBRARY_INSTALL(kokkos) -ENDIF() -INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_install.cmake) + add_library(Kokkos::all_libs ALIAS kokkos) + target_link_libraries(kokkos INTERFACE ${KOKKOS_COMPONENT_LIBRARIES}) + kokkos_internal_add_library_install(kokkos) +endif() +include(${KOKKOS_SRC_PATH}/cmake/kokkos_install.cmake) # nvcc_wrapper is Kokkos' wrapper for NVIDIA's NVCC CUDA compiler. # Kokkos needs nvcc_wrapper in order to build. Other libraries and @@ -327,16 +273,15 @@ INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_install.cmake) # as relative to ${CMAKE_INSTALL_PATH}. # KOKKOS_INSTALL_ADDITIONAL_FILES will install nvcc wrapper and other generated # files -KOKKOS_INSTALL_ADDITIONAL_FILES() - +kokkos_install_additional_files() # Finally - if we are a subproject - make sure the enabled devices are visible -IF (HAS_PARENT) - FOREACH(DEV Kokkos_ENABLED_DEVICES) +if(HAS_PARENT) + foreach(DEV Kokkos_ENABLED_DEVICES) #I would much rather not make these cache variables or global properties, but I can't #make any guarantees on whether PARENT_SCOPE is good enough to make #these variables visible where I need them - SET(Kokkos_ENABLE_${DEV} ON PARENT_SCOPE) - SET_PROPERTY(GLOBAL PROPERTY Kokkos_ENABLE_${DEV} ON) - ENDFOREACH() -ENDIF() + set(Kokkos_ENABLE_${DEV} ON PARENT_SCOPE) + set_property(GLOBAL PROPERTY Kokkos_ENABLE_${DEV} ON) + endforeach() +endif() diff --git a/packages/kokkos/CONTRIBUTING.md b/packages/kokkos/CONTRIBUTING.md index b4f3057cef2c..e97f8c4d89c5 100644 --- a/packages/kokkos/CONTRIBUTING.md +++ b/packages/kokkos/CONTRIBUTING.md @@ -7,6 +7,8 @@ We actively welcome pull requests. 3. If you've changed APIs, update the documentation. 4. Ensure the test suite passes. +Before sending your patch for review, please try to ensure that it is formatted properly. We use clang-format version 16 for this. + ## Issues We use GitHub issues to track public bugs. Please ensure your description is clear and has sufficient instructions to be able to reproduce the issue. diff --git a/packages/kokkos/HOW_TO_SNAPSHOT b/packages/kokkos/HOW_TO_SNAPSHOT deleted file mode 100644 index ad3f78efb4f8..000000000000 --- a/packages/kokkos/HOW_TO_SNAPSHOT +++ /dev/null @@ -1,73 +0,0 @@ - -Developers of Kokkos (those who commit modifications to Kokkos) -must maintain the snapshot of Kokkos in the Trilinos repository. - -This file contains instructions for how to -snapshot Kokkos from github.com/kokkos to Trilinos. - ------------------------------------------------------------------------- -*** EVERYTHING GOES RIGHT WORKFLOW *** - -1) Given a 'git clone' of Kokkos and of Trilinos repositories. -1.1) Let ${KOKKOS} be the absolute path to the Kokkos clone. - This path *must* terminate with the directory name 'kokkos'; - e.g., ${HOME}/kokkos . -1.2) Let ${TRILINOS} be the absolute path to the Trilinos directory. - -2) Given that the Kokkos build & test is clean and - changes are committed to the Kokkos clone. - -3) Snapshot the current commit in the Kokkos clone into the Trilinos clone. - This overwrites ${TRILINOS}/packages/kokkos with the content of ${KOKKOS}: - ${KOKKOS}/scripts/snapshot.py --verbose ${KOKKOS} ${TRILINOS}/packages - -4) Verify the snapshot commit happened as expected - cd ${TRILINOS}/packages/kokkos - git log -1 --name-only - -5) Modify, build, and test Trilinos with the Kokkos snapshot. - -6) Given that that the Trilinos build & test is clean and - changes are committed to the Trilinos clone. - -7) Attempt push to the Kokkos repository. - If push fails then you must 'remove the Kokkos snapshot' - from your Trilinos clone. - See below. - -8) Attempt to push to the Trilinos repository. - If updating for a failed push requires you to change Kokkos you must - 'remove the Kokkos snapshot' from your Trilinos clone. - See below. - ------------------------------------------------------------------------- -*** WHEN SOMETHING GOES WRONG AND YOU MUST *** -*** REMOVE THE KOKKOS SNAPSHOT FROM YOUR TRILINOS CLONE *** - -1) Query the Trilinos clone commit log. - git log --oneline - -2) Note the of the commit to the Trillinos clone - immediately BEFORE the Kokkos snapshot commit. - Copy this for use in the next command. - -3) IF more than one outstanding commit then you can remove just the - Kokkos snapshot commit with 'git rebase -i'. Edit the rebase file. - Remove or comment out the Kokkos snapshot commit entry. - git rebase -i - -4) IF the Kokkos snapshot commit is the one and only - outstanding commit then remove just than commit. - git reset --hard HEAD~1 - ------------------------------------------------------------------------- -*** REGARDING 'snapshot.py' TOOL *** - -The 'snapshot.py' tool is developed and maintained by the -Center for Computing Research (CCR) -Software Engineering, Maintenance, and Support (SEMS) team. - -Contact Brent Perschbacher for questions> - ------------------------------------------------------------------------- - diff --git a/packages/kokkos/Makefile.kokkos b/packages/kokkos/Makefile.kokkos index ccb568a553ce..9e6ad3241564 100644 --- a/packages/kokkos/Makefile.kokkos +++ b/packages/kokkos/Makefile.kokkos @@ -1,8 +1,8 @@ # Default settings common options. KOKKOS_VERSION_MAJOR = 4 -KOKKOS_VERSION_MINOR = 4 -KOKKOS_VERSION_PATCH = 1 +KOKKOS_VERSION_MINOR = 5 +KOKKOS_VERSION_PATCH = 0 KOKKOS_VERSION = $(shell echo $(KOKKOS_VERSION_MAJOR)*10000+$(KOKKOS_VERSION_MINOR)*100+$(KOKKOS_VERSION_PATCH) | bc) # Options: Cuda,HIP,SYCL,OpenMPTarget,OpenMP,Threads,Serial @@ -13,7 +13,7 @@ KOKKOS_DEVICES ?= "Threads" # NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86,Ada89,Hopper90 # ARM: ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX,ARMv9-Grace # IBM: Power8,Power9 -# AMD-GPUS: AMD_GFX906,AMD_GFX908,AMD_GFX90A,AMD_GFX940,AMD_GFX942,AMD_GFX1030,AMD_GFX1100 +# AMD-GPUS: AMD_GFX906,AMD_GFX908,AMD_GFX90A,AMD_GFX940,AMD_GFX942,AMD_GFX1030,AMD_GFX1100,AMD_GFX1103 # AMD-CPUS: AMDAVX,Zen,Zen2,Zen3 # Intel-GPUs: Intel_Gen,Intel_Gen9,Intel_Gen11,Intel_Gen12LP,Intel_DG1,Intel_XeHP,Intel_PVC KOKKOS_ARCH ?= "" @@ -30,16 +30,19 @@ KOKKOS_TRIBITS ?= "no" KOKKOS_STANDALONE_CMAKE ?= "no" # Default settings specific options. -# Options: force_uvm,use_ldg,rdc,enable_lambda,enable_constexpr,disable_malloc_async +# Options: force_uvm,use_ldg,rdc,enable_lambda,enable_constexpr,enable_malloc_async KOKKOS_CUDA_OPTIONS ?= "" -# Options: rdc +# Options: rdc,enable_malloc_async KOKKOS_HIP_OPTIONS ?= "" # Default settings specific options. # Options: enable_async_dispatch KOKKOS_HPX_OPTIONS ?= "" +#Options : force_host_as_device +KOKKOS_OPENACC_OPTIONS ?= "" + # Helper functions for conversion to upper case uppercase_TABLE:=a,A b,B c,C d,D e,E f,F g,G h,H i,I j,J k,K l,L m,M n,N o,O p,P q,Q r,R s,S t,T u,U v,V w,W x,X y,Y z,Z uppercase_internal=$(if $1,$$(subst $(firstword $1),$(call uppercase_internal,$(wordlist 2,$(words $1),$1),$2)),$2) @@ -82,7 +85,7 @@ KOKKOS_INTERNAL_CUDA_USE_UVM := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS), KOKKOS_INTERNAL_CUDA_USE_RELOC := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),rdc) KOKKOS_INTERNAL_CUDA_USE_LAMBDA := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),enable_lambda) KOKKOS_INTERNAL_CUDA_USE_CONSTEXPR := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),enable_constexpr) -KOKKOS_INTERNAL_CUDA_DISABLE_MALLOC_ASYNC := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),disable_malloc_async) +KOKKOS_INTERNAL_CUDA_ENABLE_MALLOC_ASYNC := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),enable_malloc_async) KOKKOS_INTERNAL_HPX_ENABLE_ASYNC_DISPATCH := $(call kokkos_has_string,$(KOKKOS_HPX_OPTIONS),enable_async_dispatch) # deprecated KOKKOS_INTERNAL_ENABLE_DESUL_ATOMICS := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_desul_atomics) @@ -93,6 +96,8 @@ KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE := $(call kokkos_has_string,$(KOKKOS_OPT KOKKOS_INTERNAL_ENABLE_DEPRECATION_WARNINGS := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_deprecation_warnings) KOKKOS_INTERNAL_HIP_USE_RELOC := $(call kokkos_has_string,$(KOKKOS_HIP_OPTIONS),rdc) +KOKKOS_INTERNAL_HIP_ENABLE_MALLOC_ASYNC := $(call kokkos_has_string,$(KOKKOS_HIP_OPTIONS),enable_malloc_async) +KOKKOS_INTERNAL_OPENACC_FORCE_HOST_AS_DEVICE := $(call kokkos_has_string,$(KOKKOS_OPENACC_OPTIONS),force_host_as_device) # Check for Kokkos Host Execution Spaces one of which must be on. KOKKOS_INTERNAL_USE_OPENMP := $(call kokkos_has_string,$(subst OpenMPTarget,,$(KOKKOS_DEVICES)),OpenMP) @@ -168,7 +173,7 @@ KOKKOS_INTERNAL_COMPILER_CRAY := $(strip $(shell $(CXX) -craype-verbose 2 KOKKOS_INTERNAL_COMPILER_NVCC := $(strip $(shell echo "$(shell export OMPI_CXX=$(OMPI_CXX); export MPICH_CXX=$(MPICH_CXX); $(CXX) --version 2>&1 | grep -c nvcc)>0" | bc)) KOKKOS_INTERNAL_COMPILER_NVHPC := $(strip $(shell $(CXX) --version 2>&1 | grep -c "nvc++")) KOKKOS_INTERNAL_COMPILER_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),clang) -KOKKOS_INTERNAL_COMPILER_CRAY_CLANG := $(strip $(shell $(CXX) -craype-verbose 2>&1 | grep -c "clang++")) +KOKKOS_INTERNAL_COMPILER_CRAY_CLANG := $(strip $(shell $(CXX) -craype-verbose 2>&1 | grep -v "error:" | grep -c "clang++")) KOKKOS_INTERNAL_COMPILER_INTEL_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),oneAPI) KOKKOS_INTERNAL_COMPILER_APPLE_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),Apple clang) KOKKOS_INTERNAL_COMPILER_HCC := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),HCC) @@ -282,6 +287,8 @@ ifeq ($(KOKKOS_INTERNAL_USE_OPENACC), 1) # Set OpenACC flags. ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 1) KOKKOS_INTERNAL_OPENACC_FLAG := -acc + else ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) + KOKKOS_INTERNAL_OPENACC_FLAG := -fopenacc -fopenacc-fake-async-wait -fopenacc-implicit-worker=vector -Wno-openacc-and-cxx -Wno-openmp-mapping -Wno-unknown-cuda-version -Wno-pass-failed else $(error Makefile.kokkos: OpenACC is enabled but the compiler must be NVHPC (got version string $(KOKKOS_CXX_VERSION))) endif @@ -401,8 +408,8 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0) endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 1) + KOKKOS_INTERNAL_NVCC_PATH := $(shell which nvcc) ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) - KOKKOS_INTERNAL_NVCC_PATH := $(shell which nvcc) CUDA_PATH ?= $(KOKKOS_INTERNAL_NVCC_PATH:/bin/nvcc=) ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) KOKKOS_INTERNAL_OPENMPTARGET_FLAG := $(KOKKOS_INTERNAL_OPENMPTARGET_FLAG) --cuda-path=$(CUDA_PATH) @@ -455,6 +462,15 @@ KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100 := $(call kokkos_has_string,$(KOKKOS_ARCH), ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100), 0) KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100 := $(call kokkos_has_string,$(KOKKOS_ARCH),NAVI1100) endif +KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1103 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX1103) +KOKKOS_INTERNAL_USE_ARCH_AMD := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX906) \ + + $(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX908) \ + + $(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX90A) \ + + $(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX940) \ + + $(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX942) \ + + $(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1030) \ + + $(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100) \ + + $(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1103)) # Any AVX? KOKKOS_INTERNAL_USE_ARCH_AVX := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_SNB) + $(KOKKOS_INTERNAL_USE_ARCH_AMDAVX)) @@ -550,6 +566,9 @@ endif ifeq ($(KOKKOS_INTERNAL_USE_OPENACC), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_OPENACC") + ifeq ($(KOKKOS_INTERNAL_OPENACC_FORCE_HOST_AS_DEVICE), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE") + endif endif ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1) @@ -722,7 +741,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) endif endif - ifeq ($(KOKKOS_INTERNAL_CUDA_DISABLE_MALLOC_ASYNC), 0) + ifeq ($(KOKKOS_INTERNAL_CUDA_ENABLE_MALLOC_ASYNC), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC") else tmp := $(call kokkos_append_header,"/* $H""undef KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC */") @@ -1013,86 +1032,122 @@ ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) endif endif +ifeq ($(KOKKOS_INTERNAL_USE_OPENACC), 1) + ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) + KOKKOS_INTERNAL_CUDA_ARCH_FLAG=--offload-arch + endif +endif + # Do not add this flag if its the cray compiler or the nvhpc compiler. ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY_CLANG), 0) - ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) - # Lets start with adding architecture defines - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER30), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER30") + # Lets start with adding architecture defines + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER30), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER30") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_30 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER32), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER32") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER32), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER32") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_32 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER35), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER35") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER35), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER35") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_35 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER37), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER37") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER37), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER37") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_37 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL50") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL50") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_50 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL52") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL52") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_52 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL53") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL53") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_53 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_PASCAL60), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL60") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_PASCAL60), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL60") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_60 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_PASCAL61), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL61") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_PASCAL61), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL61") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_61 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VOLTA70), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA70") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VOLTA70), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA70") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_70 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VOLTA72), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA72") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VOLTA72), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA72") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_72 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_TURING75), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_TURING75") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_TURING75), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_TURING75") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_75 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMPERE80), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE80") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMPERE80), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE80") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_80 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMPERE86), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE86") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMPERE86), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE86") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_86 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ADA89), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ADA89") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ADA89), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ADA89") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_89 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_HOPPER90), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER90") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_HOPPER90), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER90") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_90 endif endif @@ -1108,6 +1163,9 @@ ifneq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0) ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG) endif + ifeq ($(KOKKOS_INTERNAL_USE_OPENACC), 1) + KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG) + endif endif endif @@ -1115,38 +1173,43 @@ endif # Figure out the architecture flag for ROCm. ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX906), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX906") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU") - KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx906 + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU \"gfx906\"") + KOKKOS_INTERNAL_AMD_ARCH_FLAG := --offload-arch=gfx906 endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX908), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX908") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU") - KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx908 + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU \"gfx908\"") + KOKKOS_INTERNAL_AMD_ARCH_FLAG := --offload-arch=gfx908 endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX90A), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX90A") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU") - KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx90a + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU \"gfx90A\"") + KOKKOS_INTERNAL_AMD_ARCH_FLAG := --offload-arch=gfx90a endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX940), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX940") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU") - KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx940 + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU \"gfx940\"") + KOKKOS_INTERNAL_AMD_ARCH_FLAG := --offload-arch=gfx940 endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX942), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX942") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU") - KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx942 + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU \"gfx942\"") + KOKKOS_INTERNAL_AMD_ARCH_FLAG := --offload-arch=gfx942 endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1030), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX1030") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU") - KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx1030 + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU \"gfx1030\"") + KOKKOS_INTERNAL_AMD_ARCH_FLAG := --offload-arch=gfx1030 endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX1100") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU") - KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx1100 + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU \"gfx1100\"") + KOKKOS_INTERNAL_AMD_ARCH_FLAG := --offload-arch=gfx1100 +endif +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1103), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX1103") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU \"gfx1103\"") + KOKKOS_INTERNAL_AMD_ARCH_FLAG := --offload-arch=gfx1103 endif @@ -1155,8 +1218,8 @@ ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1) KOKKOS_SRC += $(KOKKOS_PATH)/tpls/desul/src/Lock_Array_HIP.cpp KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/HIP/*.hpp) - KOKKOS_CXXFLAGS+=$(KOKKOS_INTERNAL_HIP_ARCH_FLAG) - KOKKOS_LDFLAGS+=$(KOKKOS_INTERNAL_HIP_ARCH_FLAG) + KOKKOS_CXXFLAGS+=$(KOKKOS_INTERNAL_AMD_ARCH_FLAG) + KOKKOS_LDFLAGS+=$(KOKKOS_INTERNAL_AMD_ARCH_FLAG) ifeq ($(KOKKOS_INTERNAL_HIP_USE_RELOC), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE") @@ -1166,6 +1229,21 @@ ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1) KOKKOS_CXXFLAGS+=-fno-gpu-rdc KOKKOS_LDFLAGS+=-fno-gpu-rdc endif + + ifeq ($(KOKKOS_INTERNAL_HIP_ENABLE_MALLOC_ASYNC), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_IMPL_HIP_MALLOC_ASYNC") + else + tmp := $(call kokkos_append_header,"/* $H""undef KOKKOS_ENABLE_IMPL_HIP_MALLOC_ASYNC */") + endif +endif + +ifneq ($(KOKKOS_INTERNAL_USE_ARCH_AMD), 0) + ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) + ifeq ($(KOKKOS_INTERNAL_USE_OPENACC), 1) + KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_AMD_ARCH_FLAG) + KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_AMD_ARCH_FLAG) + endif + endif endif # Figure out Intel architecture flags. @@ -1219,6 +1297,8 @@ ifeq ($(KOKKOS_INTERNAL_USE_SYCL), 1) KOKKOS_CXXFLAGS+=$(KOKKOS_INTERNAL_INTEL_ARCH_FLAG) KOKKOS_LDFLAGS+=-fsycl KOKKOS_LDFLAGS+=$(KOKKOS_INTERNAL_INTEL_ARCH_FLAG) + + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE") endif ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) @@ -1306,6 +1386,8 @@ ifneq ($(KOKKOS_INTERNAL_NEW_CONFIG), 0) endif KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/*.hpp) +KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/View/*.hpp) +KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/View/MDSpan/*.hpp) KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/impl/*.hpp) KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/containers/src/*.hpp) KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/containers/src/impl/*.hpp) @@ -1358,6 +1440,48 @@ ifeq ($(KOKKOS_INTERNAL_USE_OPENACC), 1) KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_OPENACC_FLAG) KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_OPENACC_FLAG) KOKKOS_LIBS += $(KOKKOS_INTERNAL_OPENACC_LIB) + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 1) + ifneq ($(CUDA_PATH),) + ifeq ($(call kokkos_path_exists,$(CUDA_PATH)/lib), 1) + CUDA_PATH := $(CUDA_PATH:/compilers=/cuda) + endif + endif + ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) + ifneq ($(CUDA_PATH),) + KOKKOS_LDFLAGS += -L$(CUDA_PATH)/lib64 + endif + KOKKOS_LIBS += -lcudart + endif + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 1) + KOKKOS_LIBS += -cuda + endif + ifeq ($(KOKKOS_INTERNAL_OPENACC_FORCE_HOST_AS_DEVICE), 1) + $(error If a GPU architecture is specified, KOKKOS_OPENACC_OPTIONS = force_host_as_device cannot be used. Disable the force_host_as_device option) + endif + else ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD), 1) + ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) + ifneq ($(ROCM_PATH),) + KOKKOS_CPPFLAGS += -I$(ROCM_PATH)/include + KOKKOS_LDFLAGS += -L$(ROCM_PATH)/lib + endif + KOKKOS_LIBS += -lamdhip64 + endif + ifeq ($(KOKKOS_INTERNAL_OPENACC_FORCE_HOST_AS_DEVICE), 1) + $(error If a GPU architecture is specified, KOKKOS_OPENACC_OPTIONS = force_host_as_device cannot be used. Disable the force_host_as_device option) + endif + else ifeq ($(KOKKOS_INTERNAL_OPENACC_FORCE_HOST_AS_DEVICE), 1) + # Compile for kernel execution on the host. In that case, + # memory is shared between the OpenACC space and the host space. + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 1) + KOKKOS_CXXFLAGS += -acc=multicore + endif + else + # Automatic fallback mode; try to offload any available GPU, and fall back + # to the host CPU if no available GPU is found. + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 1) + KOKKOS_CXXFLAGS += -acc=gpu,multicore + endif + endif endif ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1) @@ -1468,7 +1592,11 @@ else endif ifeq ($(KOKKOS_INTERNAL_USE_OPENACC), 1) - tmp := $(call desul_append_header,"$H""define DESUL_ATOMICS_ENABLE_OPENACC") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 1) + tmp := $(call desul_append_header,"$H""define DESUL_ATOMICS_ENABLE_OPENACC") + else + tmp := $(call desul_append_header,"/* $H""undef DESUL_ATOMICS_ENABLE_OPENACC */") + endif else tmp := $(call desul_append_header,"/* $H""undef DESUL_ATOMICS_ENABLE_OPENACC */") endif @@ -1496,6 +1624,12 @@ $(DESUL_CONFIG_HEADER): KOKKOS_CPP_DEPENDS := $(DESUL_CONFIG_HEADER) KokkosCore_config.h $(KOKKOS_HEADERS) +# Tasking is deprecated +ifeq ($(KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE), 1) + TMP_KOKKOS_SRC := $(KOKKOS_SRC) + KOKKOS_SRC = $(patsubst %Task.cpp,, $(TMP_KOKKOS_SRC)) +endif + KOKKOS_OBJ = $(KOKKOS_SRC:.cpp=.o) KOKKOS_OBJ_LINK = $(notdir $(KOKKOS_OBJ)) diff --git a/packages/kokkos/Makefile.targets b/packages/kokkos/Makefile.targets index e8e429e02750..be535eea3e7c 100644 --- a/packages/kokkos/Makefile.targets +++ b/packages/kokkos/Makefile.targets @@ -16,8 +16,6 @@ Kokkos_HostSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Ho $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostSpace.cpp Kokkos_hwloc.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_hwloc.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_hwloc.cpp -Kokkos_TaskQueue.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp Kokkos_HostThreadTeam.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostThreadTeam.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostThreadTeam.cpp Kokkos_HostBarrier.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostBarrier.cpp @@ -38,17 +36,21 @@ Kokkos_Abort.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Abort. ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1) Kokkos_Serial.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Serial/Kokkos_Serial.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Serial/Kokkos_Serial.cpp +ifneq ($(KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE), 1) Kokkos_Serial_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Serial/Kokkos_Serial_Task.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Serial/Kokkos_Serial_Task.cpp endif +endif ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) Kokkos_Cuda_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Instance.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Instance.cpp Kokkos_CudaSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_CudaSpace.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_CudaSpace.cpp +ifneq ($(KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE), 1) Kokkos_Cuda_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp +endif Lock_Array_CUDA.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/tpls/desul/src/Lock_Array_CUDA.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/tpls/desul/src/Lock_Array_CUDA.cpp endif @@ -73,6 +75,8 @@ Kokkos_HIP_Space.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_Space.cpp Kokkos_HIP_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_Instance.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_Instance.cpp +Kokkos_HIP_ZeroMemset.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_ZeroMemset.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_ZeroMemset.cpp Lock_Array_HIP.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/tpls/desul/src/Lock_Array_HIP.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/tpls/desul/src/Lock_Array_HIP.cpp endif @@ -89,26 +93,26 @@ Kokkos_OpenMP.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_Ope $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP.cpp Kokkos_OpenMP_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp +ifneq ($(KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE), 1) Kokkos_OpenMP_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Task.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Task.cpp endif +endif ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1) Kokkos_HPX.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/HPX/Kokkos_HPX.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/HPX/Kokkos_HPX.cpp +ifneq ($(KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE), 1) Kokkos_HPX_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/HPX/Kokkos_HPX_Task.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/HPX/Kokkos_HPX_Task.cpp endif +endif ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) -Kokkos_OpenMPTarget_Exec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp Kokkos_OpenMPTarget_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp Kokkos_OpenMPTargetSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp -Kokkos_OpenMPTarget_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp endif ifeq ($(KOKKOS_INTERNAL_USE_OPENACC), 1) diff --git a/packages/kokkos/README.md b/packages/kokkos/README.md index c8c6f8f7cf50..0ea07f9ea2f6 100644 --- a/packages/kokkos/README.md +++ b/packages/kokkos/README.md @@ -30,12 +30,12 @@ To start learning about Kokkos: The latest release of Kokkos can be obtained from the [GitHub releases page](https://github.com/kokkos/kokkos/releases/latest). -The current release is [4.3.01](https://github.com/kokkos/kokkos/releases/tag/4.3.01). +The current release is [4.5.00](https://github.com/kokkos/kokkos/releases/tag/4.5.00). ```bash -curl -OJ -L https://github.com/kokkos/kokkos/archive/refs/tags/4.3.01.tar.gz +curl -OJ -L https://github.com/kokkos/kokkos/releases/download/4.5.00/kokkos-4.5.00.tar.gz # Or with wget -wget https://github.com/kokkos/kokkos/archive/refs/tags/4.3.01.tar.gz +wget https://github.com/kokkos/kokkos/releases/download/4.5.00/kokkos-4.5.00.tar.gz ``` To clone the latest development version of Kokkos from GitHub: diff --git a/packages/kokkos/algorithms/CMakeLists.txt b/packages/kokkos/algorithms/CMakeLists.txt index 368984647e9f..73ce9f7ec552 100644 --- a/packages/kokkos/algorithms/CMakeLists.txt +++ b/packages/kokkos/algorithms/CMakeLists.txt @@ -1,7 +1,7 @@ -IF (NOT Kokkos_INSTALL_TESTING) - ADD_SUBDIRECTORY(src) -ENDIF() +if(NOT Kokkos_INSTALL_TESTING) + add_subdirectory(src) +endif() # FIXME_OPENACC: temporarily disabled due to unimplemented features -IF(NOT ((KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) OR KOKKOS_ENABLE_OPENACC)) - KOKKOS_ADD_TEST_DIRECTORIES(unit_tests) -ENDIF() +if(NOT ((KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) OR KOKKOS_ENABLE_OPENACC)) + kokkos_add_test_directories(unit_tests) +endif() diff --git a/packages/kokkos/algorithms/src/CMakeLists.txt b/packages/kokkos/algorithms/src/CMakeLists.txt index b490caca6282..9f10b85e0214 100644 --- a/packages/kokkos/algorithms/src/CMakeLists.txt +++ b/packages/kokkos/algorithms/src/CMakeLists.txt @@ -1,34 +1,29 @@ #I have to leave these here for tribits -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) +kokkos_include_directories(${CMAKE_CURRENT_BINARY_DIR}) +kokkos_include_directories(${CMAKE_CURRENT_SOURCE_DIR}) #----------------------------------------------------------------------------- -FILE(GLOB ALGO_HEADERS *.hpp) -FILE(GLOB ALGO_SOURCES *.cpp) -APPEND_GLOB(ALGO_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/std_algorithms/*.hpp) -APPEND_GLOB(ALGO_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/std_algorithms/impl/*.hpp) +file(GLOB ALGO_HEADERS *.hpp) +file(GLOB ALGO_SOURCES *.cpp) +append_glob(ALGO_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/std_algorithms/*.hpp) +append_glob(ALGO_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/std_algorithms/impl/*.hpp) -INSTALL ( +install( DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/" DESTINATION ${KOKKOS_HEADER_DIR} - FILES_MATCHING PATTERN "*.hpp" + FILES_MATCHING + PATTERN "*.hpp" ) #----------------------------------------------------------------------------- # We have to pass the sources in here for Tribits # These will get ignored for standalone CMake and a true interface library made -KOKKOS_ADD_INTERFACE_LIBRARY( - kokkosalgorithms - NOINSTALLHEADERS ${ALGO_HEADERS} - SOURCES ${ALGO_SOURCES} -) -KOKKOS_LIB_INCLUDE_DIRECTORIES(kokkosalgorithms - ${KOKKOS_TOP_BUILD_DIR} - ${CMAKE_CURRENT_BINARY_DIR} - ${CMAKE_CURRENT_SOURCE_DIR} +kokkos_add_interface_library(kokkosalgorithms NOINSTALLHEADERS ${ALGO_HEADERS} SOURCES ${ALGO_SOURCES}) +kokkos_lib_include_directories( + kokkosalgorithms ${KOKKOS_TOP_BUILD_DIR} ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR} ) -KOKKOS_LINK_TPL(kokkoscontainers PUBLIC ROCTHRUST) -KOKKOS_LINK_TPL(kokkoscore PUBLIC ONEDPL) +kokkos_link_tpl(kokkoscontainers PUBLIC ROCTHRUST) +kokkos_link_tpl(kokkoscore PUBLIC ONEDPL) diff --git a/packages/kokkos/algorithms/src/Kokkos_Random.hpp b/packages/kokkos/algorithms/src/Kokkos_Random.hpp index 7df12b8518eb..b28ea4c2ca9a 100644 --- a/packages/kokkos/algorithms/src/Kokkos_Random.hpp +++ b/packages/kokkos/algorithms/src/Kokkos_Random.hpp @@ -615,7 +615,7 @@ template struct Random_UniqueIndex { using locks_view_type = View; KOKKOS_FUNCTION - static int get_state_idx(const locks_view_type) { + static int get_state_idx(const locks_view_type&) { KOKKOS_IF_ON_HOST( (return DeviceType::execution_space::impl_hardware_thread_id();)) @@ -665,17 +665,16 @@ struct Random_UniqueIndex< #ifdef KOKKOS_ENABLE_SYCL template -struct Random_UniqueIndex< - Kokkos::Device> { +struct Random_UniqueIndex> { using locks_view_type = - View>; + View>; KOKKOS_FUNCTION static int get_state_idx(const locks_view_type& locks_) { auto item = sycl::ext::oneapi::experimental::this_nd_item<3>(); std::size_t threadIdx[3] = {item.get_local_id(2), item.get_local_id(1), item.get_local_id(0)}; std::size_t blockIdx[3] = {item.get_group(2), item.get_group(1), - item.get_group(0)}; + item.get_group(0)}; std::size_t blockDim[3] = {item.get_local_range(2), item.get_local_range(1), item.get_local_range(0)}; std::size_t gridDim[3] = { @@ -1121,7 +1120,7 @@ class Random_XorShift1024_Pool { using execution_space = typename device_type::execution_space; using locks_type = View; using int_view_type = View; - using state_data_type = View; + using state_data_type = View; locks_type locks_ = {}; state_data_type state_ = {}; diff --git a/packages/kokkos/algorithms/src/sorting/Kokkos_BinOpsPublicAPI.hpp b/packages/kokkos/algorithms/src/sorting/Kokkos_BinOpsPublicAPI.hpp index 73e751f572c5..8e7de32a07b2 100644 --- a/packages/kokkos/algorithms/src/sorting/Kokkos_BinOpsPublicAPI.hpp +++ b/packages/kokkos/algorithms/src/sorting/Kokkos_BinOpsPublicAPI.hpp @@ -35,11 +35,11 @@ struct BinOp1D { #endif // Construct BinOp with number of bins, minimum value and maximum value - BinOp1D(int max_bins__, typename KeyViewType::const_value_type min, + BinOp1D(int max_bins, typename KeyViewType::const_value_type min, typename KeyViewType::const_value_type max) - : max_bins_(max_bins__ + 1), + : max_bins_(max_bins + 1), // Cast to double to avoid possible overflow when using integer - mul_(static_cast(max_bins__) / + mul_(static_cast(max_bins) / (static_cast(max) - static_cast(min))), min_(static_cast(min)) { // For integral types the number of bins may be larger than the range @@ -47,7 +47,7 @@ struct BinOp1D { // and then don't need to sort bins. if (std::is_integral::value && (static_cast(max) - static_cast(min)) <= - static_cast(max_bins__)) { + static_cast(max_bins)) { mul_ = 1.; } } @@ -82,16 +82,16 @@ struct BinOp3D { BinOp3D() = delete; #endif - BinOp3D(int max_bins__[], typename KeyViewType::const_value_type min[], + BinOp3D(int max_bins[], typename KeyViewType::const_value_type min[], typename KeyViewType::const_value_type max[]) { - max_bins_[0] = max_bins__[0]; - max_bins_[1] = max_bins__[1]; - max_bins_[2] = max_bins__[2]; - mul_[0] = static_cast(max_bins__[0]) / + max_bins_[0] = max_bins[0]; + max_bins_[1] = max_bins[1]; + max_bins_[2] = max_bins[2]; + mul_[0] = static_cast(max_bins[0]) / (static_cast(max[0]) - static_cast(min[0])); - mul_[1] = static_cast(max_bins__[1]) / + mul_[1] = static_cast(max_bins[1]) / (static_cast(max[1]) - static_cast(min[1])); - mul_[2] = static_cast(max_bins__[2]) / + mul_[2] = static_cast(max_bins[2]) / (static_cast(max[2]) - static_cast(min[2])); min_[0] = static_cast(min[0]); min_[1] = static_cast(min[1]); diff --git a/packages/kokkos/algorithms/src/sorting/Kokkos_BinSortPublicAPI.hpp b/packages/kokkos/algorithms/src/sorting/Kokkos_BinSortPublicAPI.hpp index c399279fe48f..f417b6b13b3c 100644 --- a/packages/kokkos/algorithms/src/sorting/Kokkos_BinSortPublicAPI.hpp +++ b/packages/kokkos/algorithms/src/sorting/Kokkos_BinSortPublicAPI.hpp @@ -388,7 +388,8 @@ class BinSort { // reasonable experimentally. if (use_std_sort && bin_size > 10) { KOKKOS_IF_ON_HOST( - (std::sort(&sort_order(lower_bound), &sort_order(upper_bound), + (std::sort(sort_order.data() + lower_bound, + sort_order.data() + upper_bound, [this](int p, int q) { return bin_op(keys_rnd, p, q); });)) } else { for (int k = lower_bound + 1; k < upper_bound; ++k) { diff --git a/packages/kokkos/algorithms/src/sorting/Kokkos_SortPublicAPI.hpp b/packages/kokkos/algorithms/src/sorting/Kokkos_SortPublicAPI.hpp index 308e9e3a008b..20026c77e415 100644 --- a/packages/kokkos/algorithms/src/sorting/Kokkos_SortPublicAPI.hpp +++ b/packages/kokkos/algorithms/src/sorting/Kokkos_SortPublicAPI.hpp @@ -53,9 +53,13 @@ void sort(const ExecutionSpace& exec, if constexpr (Impl::better_off_calling_std_sort_v) { exec.fence("Kokkos::sort without comparator use std::sort"); - auto first = ::Kokkos::Experimental::begin(view); - auto last = ::Kokkos::Experimental::end(view); - std::sort(first, last); + if (view.span_is_contiguous()) { + std::sort(view.data(), view.data() + view.size()); + } else { + auto first = ::Kokkos::Experimental::begin(view); + auto last = ::Kokkos::Experimental::end(view); + std::sort(first, last); + } } else { Impl::sort_device_view_without_comparator(exec, view); } @@ -107,9 +111,13 @@ void sort(const ExecutionSpace& exec, if constexpr (Impl::better_off_calling_std_sort_v) { exec.fence("Kokkos::sort with comparator use std::sort"); - auto first = ::Kokkos::Experimental::begin(view); - auto last = ::Kokkos::Experimental::end(view); - std::sort(first, last, comparator); + if (view.span_is_contiguous()) { + std::sort(view.data(), view.data() + view.size(), comparator); + } else { + auto first = ::Kokkos::Experimental::begin(view); + auto last = ::Kokkos::Experimental::end(view); + std::sort(first, last, comparator); + } } else { Impl::sort_device_view_with_comparator(exec, view, comparator); } diff --git a/packages/kokkos/algorithms/src/sorting/impl/Kokkos_SortByKeyImpl.hpp b/packages/kokkos/algorithms/src/sorting/impl/Kokkos_SortByKeyImpl.hpp index f11f80704843..2a8f761d9b4f 100644 --- a/packages/kokkos/algorithms/src/sorting/impl/Kokkos_SortByKeyImpl.hpp +++ b/packages/kokkos/algorithms/src/sorting/impl/Kokkos_SortByKeyImpl.hpp @@ -30,6 +30,7 @@ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wshadow" +#pragma GCC diagnostic ignored "-Wsuggest-override" #if defined(KOKKOS_COMPILER_CLANG) // Some versions of Clang fail to compile Thrust, failing with errors like @@ -76,13 +77,10 @@ namespace Kokkos::Impl { template constexpr inline bool is_admissible_to_kokkos_sort_by_key = - ::Kokkos::is_view::value&& T::rank() == 1 && - (std::is_same::value || - std::is_same::value || - std::is_same::value); + ::Kokkos::is_view::value && T::rank() == 1 && + (std::is_same_v || + std::is_same_v || + std::is_same_v); template KOKKOS_INLINE_FUNCTION constexpr void @@ -144,7 +142,7 @@ void sort_by_key_rocthrust( #if defined(KOKKOS_ENABLE_ONEDPL) template -inline constexpr bool sort_on_device_v = +inline constexpr bool sort_on_device_v = std::is_same_v || std::is_same_v; @@ -152,7 +150,7 @@ inline constexpr bool sort_on_device_v = template void sort_by_key_onedpl( - const Kokkos::Experimental::SYCL& exec, + const Kokkos::SYCL& exec, const Kokkos::View& keys, const Kokkos::View& values, MaybeComparator&&... maybeComparator) { @@ -176,7 +174,7 @@ template void applyPermutation(const ExecutionSpace& space, const PermutationView& permutation, const ViewType& view) { - static_assert(std::is_integral::value); + static_assert(std::is_integral_v); auto view_copy = Kokkos::create_mirror( Kokkos::view_alloc(space, typename ExecutionSpace::memory_space{}, @@ -335,7 +333,7 @@ void sort_by_key_device_view_without_comparator( template void sort_by_key_device_view_without_comparator( - const Kokkos::Experimental::SYCL& exec, + const Kokkos::SYCL& exec, const Kokkos::View& keys, const Kokkos::View& values) { #ifdef KOKKOS_ONEDPL_HAS_SORT_BY_KEY @@ -392,7 +390,7 @@ void sort_by_key_device_view_with_comparator( template void sort_by_key_device_view_with_comparator( - const Kokkos::Experimental::SYCL& exec, + const Kokkos::SYCL& exec, const Kokkos::View& keys, const Kokkos::View& values, const ComparatorType& comparator) { diff --git a/packages/kokkos/algorithms/src/sorting/impl/Kokkos_SortImpl.hpp b/packages/kokkos/algorithms/src/sorting/impl/Kokkos_SortImpl.hpp index 08946228919b..734ce450f69e 100644 --- a/packages/kokkos/algorithms/src/sorting/impl/Kokkos_SortImpl.hpp +++ b/packages/kokkos/algorithms/src/sorting/impl/Kokkos_SortImpl.hpp @@ -34,6 +34,7 @@ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wshadow" +#pragma GCC diagnostic ignored "-Wsuggest-override" #if defined(KOKKOS_COMPILER_CLANG) // Some versions of Clang fail to compile Thrust, failing with errors like @@ -146,7 +147,7 @@ void sort_via_binsort(const ExecutionSpace& exec, bool sort_in_bins = true; // TODO: figure out better max_bins then this ... int64_t max_bins = view.extent(0) / 2; - if (std::is_integral::value) { + if (std::is_integral_v) { // Cast to double to avoid possible overflow when using integer auto const max_val = static_cast(result.max_val); auto const min_val = static_cast(result.min_val); @@ -157,7 +158,7 @@ void sort_via_binsort(const ExecutionSpace& exec, sort_in_bins = false; } } - if (std::is_floating_point::value) { + if (std::is_floating_point_v) { KOKKOS_ASSERT(std::isfinite(static_cast(result.max_val) - static_cast(result.min_val))); } @@ -211,11 +212,11 @@ void sort_rocthrust(const HIP& space, #if defined(KOKKOS_ENABLE_ONEDPL) template -void sort_onedpl(const Kokkos::Experimental::SYCL& space, +void sort_onedpl(const Kokkos::SYCL& space, const Kokkos::View& view, MaybeComparator&&... maybeComparator) { using ViewType = Kokkos::View; - static_assert(SpaceAccessibility::accessible, "SYCL execution space is not able to access the memory space " "of the View argument!"); @@ -268,19 +269,29 @@ void copy_to_host_run_stdsort_copy_back( KE::copy(exec, view, view_dc); // run sort on the mirror of view_dc - auto mv_h = create_mirror_view_and_copy(Kokkos::HostSpace(), view_dc); - auto first = KE::begin(mv_h); - auto last = KE::end(mv_h); - std::sort(first, last, std::forward(maybeComparator)...); + auto mv_h = create_mirror_view_and_copy(Kokkos::HostSpace(), view_dc); + if (view.span_is_contiguous()) { + std::sort(mv_h.data(), mv_h.data() + mv_h.size(), + std::forward(maybeComparator)...); + } else { + auto first = KE::begin(mv_h); + auto last = KE::end(mv_h); + std::sort(first, last, std::forward(maybeComparator)...); + } Kokkos::deep_copy(exec, view_dc, mv_h); // copy back to argument view KE::copy(exec, KE::cbegin(view_dc), KE::cend(view_dc), KE::begin(view)); } else { auto view_h = create_mirror_view_and_copy(Kokkos::HostSpace(), view); - auto first = KE::begin(view_h); - auto last = KE::end(view_h); - std::sort(first, last, std::forward(maybeComparator)...); + if (view.span_is_contiguous()) { + std::sort(view_h.data(), view_h.data() + view_h.size(), + std::forward(maybeComparator)...); + } else { + auto first = KE::begin(view_h); + auto last = KE::end(view_h); + std::sort(first, last, std::forward(maybeComparator)...); + } Kokkos::deep_copy(exec, view, view_h); } } @@ -310,7 +321,7 @@ void sort_device_view_without_comparator( #if defined(KOKKOS_ENABLE_ONEDPL) template void sort_device_view_without_comparator( - const Kokkos::Experimental::SYCL& exec, + const Kokkos::SYCL& exec, const Kokkos::View& view) { using ViewType = Kokkos::View; static_assert( @@ -365,8 +376,7 @@ void sort_device_view_with_comparator( #if defined(KOKKOS_ENABLE_ONEDPL) template void sort_device_view_with_comparator( - const Kokkos::Experimental::SYCL& exec, - const Kokkos::View& view, + const Kokkos::SYCL& exec, const Kokkos::View& view, const ComparatorType& comparator) { using ViewType = Kokkos::View; static_assert( @@ -397,12 +407,12 @@ sort_device_view_with_comparator( // and then copies data back. Potentially, this can later be changed // with a better solution like our own quicksort on device or similar. - using ViewType = Kokkos::View; - using MemSpace = typename ViewType::memory_space; // Note with HIP unified memory this code path is still the right thing to do // if we end up here when RocThrust is not enabled. // The create_mirror_view_and_copy will do the right thing (no copy). -#ifndef KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY +#ifndef KOKKOS_IMPL_HIP_UNIFIED_MEMORY + using ViewType = Kokkos::View; + using MemSpace = typename ViewType::memory_space; static_assert(!SpaceAccessibility::accessible, "Impl::sort_device_view_with_comparator: should not be called " "on a view that is already accessible on the host"); diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Reduce.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Reduce.hpp index b84f00f8bb50..ea7e55ca6190 100644 --- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Reduce.hpp +++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Reduce.hpp @@ -91,7 +91,7 @@ template = 0> ValueType reduce(const ExecutionSpace& ex, IteratorType first, IteratorType last, ValueType init_reduction_value) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); return Impl::reduce_default_functors_exespace_impl( @@ -105,7 +105,7 @@ template ::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); return Impl::reduce_default_functors_exespace_impl(label, ex, first, last, @@ -119,7 +119,7 @@ template & view, ValueType init_reduction_value) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); namespace KE = ::Kokkos::Experimental; @@ -137,7 +137,7 @@ template & view, ValueType init_reduction_value) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); namespace KE = ::Kokkos::Experimental; @@ -157,7 +157,7 @@ template ::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); return Impl::reduce_custom_functors_exespace_impl( @@ -172,7 +172,7 @@ template ::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); return Impl::reduce_custom_functors_exespace_impl( @@ -186,7 +186,7 @@ template & view, ValueType init_reduction_value, BinaryOp joiner) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); namespace KE = ::Kokkos::Experimental; @@ -204,7 +204,7 @@ template & view, ValueType init_reduction_value, BinaryOp joiner) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); namespace KE = ::Kokkos::Experimental; @@ -258,7 +258,7 @@ template < KOKKOS_FUNCTION ValueType reduce(const TeamHandleType& teamHandle, IteratorType first, IteratorType last, ValueType init_reduction_value) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); return Impl::reduce_default_functors_team_impl(teamHandle, first, last, @@ -273,7 +273,7 @@ KOKKOS_FUNCTION ValueType reduce(const TeamHandleType& teamHandle, const ::Kokkos::View& view, ValueType init_reduction_value) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); namespace KE = ::Kokkos::Experimental; @@ -294,7 +294,7 @@ KOKKOS_FUNCTION ValueType reduce(const TeamHandleType& teamHandle, IteratorType first, IteratorType last, ValueType init_reduction_value, BinaryOp joiner) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); return Impl::reduce_custom_functors_team_impl(teamHandle, first, last, @@ -309,7 +309,7 @@ KOKKOS_FUNCTION ValueType reduce(const TeamHandleType& teamHandle, const ::Kokkos::View& view, ValueType init_reduction_value, BinaryOp joiner) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); namespace KE = ::Kokkos::Experimental; diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_TransformReduce.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_TransformReduce.hpp index 101f5113f68a..89585ddbea0c 100644 --- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_TransformReduce.hpp +++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_TransformReduce.hpp @@ -117,7 +117,7 @@ ValueType transform_reduce(const ExecutionSpace& ex, IteratorType1 first1, ValueType init_reduction_value, BinaryJoinerType joiner, BinaryTransform transformer) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); return Impl::transform_reduce_custom_functors_exespace_impl( @@ -136,7 +136,7 @@ ValueType transform_reduce(const std::string& label, const ExecutionSpace& ex, IteratorType2 first2, ValueType init_reduction_value, BinaryJoinerType joiner, BinaryTransform transformer) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); return Impl::transform_reduce_custom_functors_exespace_impl( @@ -157,7 +157,7 @@ ValueType transform_reduce( ValueType init_reduction_value, BinaryJoinerType joiner, BinaryTransform transformer) { namespace KE = ::Kokkos::Experimental; - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(first_view); @@ -182,7 +182,7 @@ ValueType transform_reduce( ValueType init_reduction_value, BinaryJoinerType joiner, BinaryTransform transformer) { namespace KE = ::Kokkos::Experimental; - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(first_view); @@ -208,7 +208,7 @@ ValueType transform_reduce(const ExecutionSpace& ex, IteratorType first1, IteratorType last1, ValueType init_reduction_value, BinaryJoinerType joiner, UnaryTransform transformer) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); return Impl::transform_reduce_custom_functors_exespace_impl( @@ -228,7 +228,7 @@ ValueType transform_reduce(const std::string& label, const ExecutionSpace& ex, ValueType init_reduction_value, BinaryJoinerType joiner, UnaryTransform transformer) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); return Impl::transform_reduce_custom_functors_exespace_impl( @@ -248,7 +248,7 @@ ValueType transform_reduce(const ExecutionSpace& ex, BinaryJoinerType joiner, UnaryTransform transformer) { namespace KE = ::Kokkos::Experimental; - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); @@ -270,7 +270,7 @@ ValueType transform_reduce(const std::string& label, const ExecutionSpace& ex, BinaryJoinerType joiner, UnaryTransform transformer) { namespace KE = ::Kokkos::Experimental; - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); @@ -345,7 +345,7 @@ KOKKOS_FUNCTION ValueType transform_reduce( const TeamHandleType& teamHandle, IteratorType1 first1, IteratorType1 last1, IteratorType2 first2, ValueType init_reduction_value, BinaryJoinerType joiner, BinaryTransform transformer) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); return Impl::transform_reduce_custom_functors_team_impl( @@ -366,7 +366,7 @@ transform_reduce(const TeamHandleType& teamHandle, ValueType init_reduction_value, BinaryJoinerType joiner, BinaryTransform transformer) { namespace KE = ::Kokkos::Experimental; - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(first_view); @@ -393,7 +393,7 @@ KOKKOS_FUNCTION ValueType transform_reduce(const TeamHandleType& teamHandle, ValueType init_reduction_value, BinaryJoinerType joiner, UnaryTransform transformer) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); return Impl::transform_reduce_custom_functors_team_impl( @@ -412,7 +412,7 @@ transform_reduce(const TeamHandleType& teamHandle, ValueType init_reduction_value, BinaryJoinerType joiner, UnaryTransform transformer) { namespace KE = ::Kokkos::Experimental; - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp index 54bb13e25b9e..da16141f5a7f 100644 --- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp +++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp @@ -33,12 +33,12 @@ struct is_admissible_to_kokkos_std_algorithms : std::false_type {}; template struct is_admissible_to_kokkos_std_algorithms< T, std::enable_if_t<::Kokkos::is_view::value && T::rank() == 1 && - (std::is_same::value || - std::is_same::value || - std::is_same::value)>> + (std::is_same_v || + std::is_same_v || + std::is_same_v)>> : std::true_type {}; template @@ -102,8 +102,8 @@ struct are_random_access_iterators; template struct are_random_access_iterators { static constexpr bool value = - is_iterator_v && std::is_base_of::value; + is_iterator_v && std::is_base_of_v; }; template @@ -165,9 +165,8 @@ struct iterators_have_matching_difference_type { template struct iterators_have_matching_difference_type { - static constexpr bool value = - std::is_same::value; + static constexpr bool value = std::is_same_v; }; template diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_MoveBackward.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_MoveBackward.hpp index 9075562d460e..dc910861d507 100644 --- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_MoveBackward.hpp +++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_MoveBackward.hpp @@ -30,7 +30,7 @@ namespace Impl { template struct StdMoveBackwardFunctor { using index_type = typename IteratorType1::difference_type; - static_assert(std::is_signed::value, + static_assert(std::is_signed_v, "Kokkos: StdMoveBackwardFunctor requires signed index type"); IteratorType1 m_last; diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_RandomAccessIterator.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_RandomAccessIterator.hpp index 5bce89e98f7f..e8c638c94c75 100644 --- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_RandomAccessIterator.hpp +++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_RandomAccessIterator.hpp @@ -36,18 +36,18 @@ class RandomAccessIterator< ::Kokkos::View > { using iterator_type = RandomAccessIterator; using iterator_category = std::random_access_iterator_tag; - using value_type = typename view_type::value_type; + using value_type = typename view_type::non_const_value_type; using difference_type = ptrdiff_t; using pointer = typename view_type::pointer_type; using reference = typename view_type::reference_type; static_assert(view_type::rank == 1 && - (std::is_same::value || - std::is_same::value || - std::is_same::value), + (std::is_same_v || + std::is_same_v || + std::is_same_v), "RandomAccessIterator only supports 1D Views with LayoutLeft, " "LayoutRight, LayoutStride."); @@ -61,9 +61,9 @@ class RandomAccessIterator< ::Kokkos::View > { #ifndef KOKKOS_ENABLE_CXX17 // C++20 and beyond template - requires(std::is_constructible_v) KOKKOS_FUNCTION - explicit(!std::is_convertible_v) - RandomAccessIterator(const RandomAccessIterator& other) + requires(std::is_constructible_v) + KOKKOS_FUNCTION explicit(!std::is_convertible_v) + RandomAccessIterator(const RandomAccessIterator& other) : m_view(other.m_view), m_current_index(other.m_current_index) {} #else template < diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Reverse.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Reverse.hpp index b4046c7645bd..e6caa0728805 100644 --- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Reverse.hpp +++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Reverse.hpp @@ -30,7 +30,7 @@ namespace Impl { template struct StdReverseFunctor { using index_type = typename InputIterator::difference_type; - static_assert(std::is_signed::value, + static_assert(std::is_signed_v, "Kokkos: StdReverseFunctor requires signed index type"); InputIterator m_first; diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ReverseCopy.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ReverseCopy.hpp index dd20d90e3995..7aa0e4fc44c8 100644 --- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ReverseCopy.hpp +++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ReverseCopy.hpp @@ -30,7 +30,7 @@ namespace Impl { template struct StdReverseCopyFunctor { using index_type = typename InputIterator::difference_type; - static_assert(std::is_signed::value, + static_assert(std::is_signed_v, "Kokkos: StdReverseCopyFunctor requires signed index type"); InputIterator m_last; diff --git a/packages/kokkos/algorithms/unit_tests/CMakeLists.txt b/packages/kokkos/algorithms/unit_tests/CMakeLists.txt index db184bc8a999..31247af159b9 100644 --- a/packages/kokkos/algorithms/unit_tests/CMakeLists.txt +++ b/packages/kokkos/algorithms/unit_tests/CMakeLists.txt @@ -1,12 +1,10 @@ - #Leave these here for now - I don't need transitive deps anyway -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) -KOKKOS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../src ) -KOKKOS_INCLUDE_DIRECTORIES(${KOKKOS_SOURCE_DIR}/core/unit_test/category_files) - +kokkos_include_directories(${CMAKE_CURRENT_BINARY_DIR}) +kokkos_include_directories(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) +kokkos_include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../src) +kokkos_include_directories(${KOKKOS_SOURCE_DIR}/core/unit_test/category_files) -SET(ALGORITHM UnitTestMain.cpp) +set(ALGORITHM UnitTestMain.cpp) foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget) string(TOUPPER ${Tag} DEVICE) @@ -23,21 +21,11 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget) # Generate a .cpp file for each one that runs it on the current backend (Tag), # and add this .cpp file to the sources for UnitTest_RandomAndSort. set(ALGO_SORT_SOURCES) - foreach(SOURCE_Input - TestSort - TestSortByKey - TestSortCustomComp - TestBinSortA - TestBinSortB - TestNestedSort - ) + foreach(SOURCE_Input TestSort TestSortByKey TestSortCustomComp TestBinSortA TestBinSortB TestNestedSort) set(file ${dir}/${SOURCE_Input}.cpp) # Write to a temporary intermediate file and call configure_file to avoid # updating timestamps triggering unnecessary rebuilds on subsequent cmake runs. - file(WRITE ${dir}/dummy.cpp - "#include \n" - "#include <${SOURCE_Input}.hpp>\n" - ) + file(WRITE ${dir}/dummy.cpp "#include \n" "#include <${SOURCE_Input}.hpp>\n") configure_file(${dir}/dummy.cpp ${file}) list(APPEND ALGO_SORT_SOURCES ${file}) endforeach() @@ -47,14 +35,9 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget) # ------------------------------------------ # do as above set(ALGO_RANDOM_SOURCES) - foreach(SOURCE_Input - TestRandom - ) + foreach(SOURCE_Input TestRandom) set(file ${dir}/${SOURCE_Input}.cpp) - file(WRITE ${dir}/dummy.cpp - "#include \n" - "#include <${SOURCE_Input}.hpp>\n" - ) + file(WRITE ${dir}/dummy.cpp "#include \n" "#include <${SOURCE_Input}.hpp>\n") configure_file(${dir}/dummy.cpp ${file}) list(APPEND ALGO_RANDOM_SOURCES ${file}) endforeach() @@ -65,11 +48,7 @@ endforeach() # std set A # ------------------------------------------ set(STDALGO_SOURCES_A) -foreach(Name - StdReducers - StdAlgorithmsConstraints - RandomAccessIterator - ) +foreach(Name StdReducers StdAlgorithmsConstraints RandomAccessIterator) list(APPEND STDALGO_SOURCES_A Test${Name}.cpp) endforeach() @@ -77,10 +56,7 @@ endforeach() # std set B # ------------------------------------------ set(STDALGO_SOURCES_B) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsMinMaxElementOps - ) +foreach(Name StdAlgorithmsCommon StdAlgorithmsMinMaxElementOps) list(APPEND STDALGO_SOURCES_B Test${Name}.cpp) endforeach() @@ -88,22 +64,23 @@ endforeach() # std set C # ------------------------------------------ set(STDALGO_SOURCES_C) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsLexicographicalCompare - StdAlgorithmsForEach - StdAlgorithmsFind - StdAlgorithmsFindFirstOf - StdAlgorithmsFindEnd - StdAlgorithmsCount - StdAlgorithmsEqual - StdAlgorithmsAllAnyNoneOf - StdAlgorithmsAdjacentFind - StdAlgorithmsSearch - StdAlgorithmsSearch_n - StdAlgorithmsMismatch - StdAlgorithmsMoveBackward - ) +foreach( + Name + StdAlgorithmsCommon + StdAlgorithmsLexicographicalCompare + StdAlgorithmsForEach + StdAlgorithmsFind + StdAlgorithmsFindFirstOf + StdAlgorithmsFindEnd + StdAlgorithmsCount + StdAlgorithmsEqual + StdAlgorithmsAllAnyNoneOf + StdAlgorithmsAdjacentFind + StdAlgorithmsSearch + StdAlgorithmsSearch_n + StdAlgorithmsMismatch + StdAlgorithmsMoveBackward +) list(APPEND STDALGO_SOURCES_C Test${Name}.cpp) endforeach() @@ -111,27 +88,28 @@ endforeach() # std set D # ------------------------------------------ set(STDALGO_SOURCES_D) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsModOps - StdAlgorithmsModSeqOps - StdAlgorithmsReplace - StdAlgorithmsReplaceIf - StdAlgorithmsReplaceCopy - StdAlgorithmsReplaceCopyIf - StdAlgorithmsCopyIf - StdAlgorithmsUnique - StdAlgorithmsUniqueCopy - StdAlgorithmsRemove - StdAlgorithmsRemoveIf - StdAlgorithmsRemoveCopy - StdAlgorithmsRemoveCopyIf - StdAlgorithmsRotate - StdAlgorithmsRotateCopy - StdAlgorithmsReverse - StdAlgorithmsShiftLeft - StdAlgorithmsShiftRight - ) +foreach( + Name + StdAlgorithmsCommon + StdAlgorithmsModOps + StdAlgorithmsModSeqOps + StdAlgorithmsReplace + StdAlgorithmsReplaceIf + StdAlgorithmsReplaceCopy + StdAlgorithmsReplaceCopyIf + StdAlgorithmsCopyIf + StdAlgorithmsUnique + StdAlgorithmsUniqueCopy + StdAlgorithmsRemove + StdAlgorithmsRemoveIf + StdAlgorithmsRemoveCopy + StdAlgorithmsRemoveCopyIf + StdAlgorithmsRotate + StdAlgorithmsRotateCopy + StdAlgorithmsReverse + StdAlgorithmsShiftLeft + StdAlgorithmsShiftRight +) list(APPEND STDALGO_SOURCES_D Test${Name}.cpp) endforeach() @@ -139,20 +117,21 @@ endforeach() # std set E # ------------------------------------------ set(STDALGO_SOURCES_E) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsIsSorted - StdAlgorithmsIsSortedUntil - StdAlgorithmsPartitioningOps - StdAlgorithmsPartitionCopy - StdAlgorithmsNumerics - StdAlgorithmsAdjacentDifference - StdAlgorithmsExclusiveScan - StdAlgorithmsInclusiveScan - StdAlgorithmsTransformUnaryOp - StdAlgorithmsTransformExclusiveScan - StdAlgorithmsTransformInclusiveScan - ) +foreach( + Name + StdAlgorithmsCommon + StdAlgorithmsIsSorted + StdAlgorithmsIsSortedUntil + StdAlgorithmsPartitioningOps + StdAlgorithmsPartitionCopy + StdAlgorithmsNumerics + StdAlgorithmsAdjacentDifference + StdAlgorithmsExclusiveScan + StdAlgorithmsInclusiveScan + StdAlgorithmsTransformUnaryOp + StdAlgorithmsTransformExclusiveScan + StdAlgorithmsTransformInclusiveScan +) list(APPEND STDALGO_SOURCES_E Test${Name}.cpp) endforeach() @@ -160,11 +139,7 @@ endforeach() # std team Q # ------------------------------------------ set(STDALGO_TEAM_SOURCES_Q) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamInclusiveScan - StdAlgorithmsTeamTransformInclusiveScan - ) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamInclusiveScan StdAlgorithmsTeamTransformInclusiveScan) list(APPEND STDALGO_TEAM_SOURCES_Q Test${Name}.cpp) endforeach() @@ -172,11 +147,7 @@ endforeach() # std team P # ------------------------------------------ set(STDALGO_TEAM_SOURCES_P) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamExclusiveScan - StdAlgorithmsTeamTransformExclusiveScan - ) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamExclusiveScan StdAlgorithmsTeamTransformExclusiveScan) list(APPEND STDALGO_TEAM_SOURCES_P Test${Name}.cpp) endforeach() @@ -184,14 +155,9 @@ endforeach() # std team M # ------------------------------------------ set(STDALGO_TEAM_SOURCES_M) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamTransformUnaryOp - StdAlgorithmsTeamTransformBinaryOp - StdAlgorithmsTeamGenerate - StdAlgorithmsTeamGenerate_n - StdAlgorithmsTeamSwapRanges - ) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamTransformUnaryOp StdAlgorithmsTeamTransformBinaryOp + StdAlgorithmsTeamGenerate StdAlgorithmsTeamGenerate_n StdAlgorithmsTeamSwapRanges +) list(APPEND STDALGO_TEAM_SOURCES_M Test${Name}.cpp) endforeach() @@ -199,14 +165,9 @@ endforeach() # std team L # ------------------------------------------ set(STDALGO_TEAM_SOURCES_L) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamIsSorted - StdAlgorithmsTeamIsSortedUntil - StdAlgorithmsTeamIsPartitioned - StdAlgorithmsTeamPartitionCopy - StdAlgorithmsTeamPartitionPoint - ) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamIsSorted StdAlgorithmsTeamIsSortedUntil + StdAlgorithmsTeamIsPartitioned StdAlgorithmsTeamPartitionCopy StdAlgorithmsTeamPartitionPoint +) list(APPEND STDALGO_TEAM_SOURCES_L Test${Name}.cpp) endforeach() @@ -214,13 +175,9 @@ endforeach() # std team I # ------------------------------------------ set(STDALGO_TEAM_SOURCES_I) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamUnique - StdAlgorithmsTeamAdjacentDifference - StdAlgorithmsTeamReduce - StdAlgorithmsTeamTransformReduce - ) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamUnique StdAlgorithmsTeamAdjacentDifference StdAlgorithmsTeamReduce + StdAlgorithmsTeamTransformReduce +) list(APPEND STDALGO_TEAM_SOURCES_I Test${Name}.cpp) endforeach() @@ -228,18 +185,19 @@ endforeach() # std team H # ------------------------------------------ set(STDALGO_TEAM_SOURCES_H) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamCopy - StdAlgorithmsTeamCopy_n - StdAlgorithmsTeamCopyBackward - StdAlgorithmsTeamCopyIf - StdAlgorithmsTeamUniqueCopy - StdAlgorithmsTeamRemove - StdAlgorithmsTeamRemoveIf - StdAlgorithmsTeamRemoveCopy - StdAlgorithmsTeamRemoveCopyIf - ) +foreach( + Name + StdAlgorithmsCommon + StdAlgorithmsTeamCopy + StdAlgorithmsTeamCopy_n + StdAlgorithmsTeamCopyBackward + StdAlgorithmsTeamCopyIf + StdAlgorithmsTeamUniqueCopy + StdAlgorithmsTeamRemove + StdAlgorithmsTeamRemoveIf + StdAlgorithmsTeamRemoveCopy + StdAlgorithmsTeamRemoveCopyIf +) list(APPEND STDALGO_TEAM_SOURCES_H Test${Name}.cpp) endforeach() @@ -247,13 +205,9 @@ endforeach() # std team G # ------------------------------------------ set(STDALGO_TEAM_SOURCES_G) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamMove - StdAlgorithmsTeamMoveBackward - StdAlgorithmsTeamShiftLeft - StdAlgorithmsTeamShiftRight - ) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamMove StdAlgorithmsTeamMoveBackward StdAlgorithmsTeamShiftLeft + StdAlgorithmsTeamShiftRight +) list(APPEND STDALGO_TEAM_SOURCES_G Test${Name}.cpp) endforeach() @@ -261,13 +215,9 @@ endforeach() # std team F # ------------------------------------------ set(STDALGO_TEAM_SOURCES_F) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamReverse - StdAlgorithmsTeamReverseCopy - StdAlgorithmsTeamRotate - StdAlgorithmsTeamRotateCopy - ) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamReverse StdAlgorithmsTeamReverseCopy StdAlgorithmsTeamRotate + StdAlgorithmsTeamRotateCopy +) list(APPEND STDALGO_TEAM_SOURCES_F Test${Name}.cpp) endforeach() @@ -275,15 +225,16 @@ endforeach() # std team E # ------------------------------------------ set(STDALGO_TEAM_SOURCES_E) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamFill - StdAlgorithmsTeamFill_n - StdAlgorithmsTeamReplace - StdAlgorithmsTeamReplaceIf - StdAlgorithmsTeamReplaceCopy - StdAlgorithmsTeamReplaceCopyIf - ) +foreach( + Name + StdAlgorithmsCommon + StdAlgorithmsTeamFill + StdAlgorithmsTeamFill_n + StdAlgorithmsTeamReplace + StdAlgorithmsTeamReplaceIf + StdAlgorithmsTeamReplaceCopy + StdAlgorithmsTeamReplaceCopyIf +) list(APPEND STDALGO_TEAM_SOURCES_E Test${Name}.cpp) endforeach() @@ -291,12 +242,7 @@ endforeach() # std team D # ------------------------------------------ set(STDALGO_TEAM_SOURCES_D) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamMinElement - StdAlgorithmsTeamMaxElement - StdAlgorithmsTeamMinMaxElement - ) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamMinElement StdAlgorithmsTeamMaxElement StdAlgorithmsTeamMinMaxElement) list(APPEND STDALGO_TEAM_SOURCES_D Test${Name}.cpp) endforeach() @@ -304,16 +250,17 @@ endforeach() # std team C # ------------------------------------------ set(STDALGO_TEAM_SOURCES_C) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamFind - StdAlgorithmsTeamFindIf - StdAlgorithmsTeamFindIfNot - StdAlgorithmsTeamAllOf - StdAlgorithmsTeamAnyOf - StdAlgorithmsTeamNoneOf - StdAlgorithmsTeamSearchN - ) +foreach( + Name + StdAlgorithmsCommon + StdAlgorithmsTeamFind + StdAlgorithmsTeamFindIf + StdAlgorithmsTeamFindIfNot + StdAlgorithmsTeamAllOf + StdAlgorithmsTeamAnyOf + StdAlgorithmsTeamNoneOf + StdAlgorithmsTeamSearchN +) list(APPEND STDALGO_TEAM_SOURCES_C Test${Name}.cpp) endforeach() @@ -321,13 +268,9 @@ endforeach() # std team B # ------------------------------------------ set(STDALGO_TEAM_SOURCES_B) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamEqual - StdAlgorithmsTeamSearch - StdAlgorithmsTeamFindEnd - StdAlgorithmsTeamFindFirstOf - ) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamEqual StdAlgorithmsTeamSearch StdAlgorithmsTeamFindEnd + StdAlgorithmsTeamFindFirstOf +) list(APPEND STDALGO_TEAM_SOURCES_B Test${Name}.cpp) endforeach() @@ -335,34 +278,33 @@ endforeach() # std team A # ------------------------------------------ set(STDALGO_TEAM_SOURCES_A) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamAdjacentFind - StdAlgorithmsTeamCount - StdAlgorithmsTeamCountIf - StdAlgorithmsTeamForEach - StdAlgorithmsTeamForEachN - StdAlgorithmsTeamLexicographicalCompare - StdAlgorithmsTeamMismatch - ) +foreach( + Name + StdAlgorithmsCommon + StdAlgorithmsTeamAdjacentFind + StdAlgorithmsTeamCount + StdAlgorithmsTeamCountIf + StdAlgorithmsTeamForEach + StdAlgorithmsTeamForEachN + StdAlgorithmsTeamLexicographicalCompare + StdAlgorithmsTeamMismatch +) list(APPEND STDALGO_TEAM_SOURCES_A Test${Name}.cpp) endforeach() # FIXME_OPENMPTARGET - remove sort test as it leads to ICE with clang/16 and above at compile time. -if(KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL "Clang" AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 16.0.0) - list(REMOVE_ITEM ALGO_SORT_SOURCES - TestSort.cpp - ) +if(KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL "Clang" AND KOKKOS_CXX_COMPILER_VERSION + VERSION_GREATER_EQUAL 16.0.0 +) + list(REMOVE_ITEM ALGO_SORT_SOURCES TestSort.cpp) endif() # FIXME_OPENMPTARGET remove tests for OpenMPTarget because in these cases # the impl needs to use either Kokkos or tailored reducers # which results in runtime memory errors. if(KOKKOS_ENABLE_OPENMPTARGET) - list(REMOVE_ITEM STDALGO_TEAM_SOURCES_L - TestStdAlgorithmsTeamIsPartitioned.cpp - TestStdAlgorithmsTeamPartitionPoint.cpp - TestStdAlgorithmsTeamPartitionCopy.cpp + list(REMOVE_ITEM STDALGO_TEAM_SOURCES_L TestStdAlgorithmsTeamIsPartitioned.cpp + TestStdAlgorithmsTeamPartitionPoint.cpp TestStdAlgorithmsTeamPartitionCopy.cpp ) endif() @@ -370,7 +312,9 @@ endif() # in these cases the impl needs to use either Kokkos or # tailored reducers which results in runtime memory errors. if(KOKKOS_ENABLE_OPENMPTARGET) - list(REMOVE_ITEM STDALGO_TEAM_SOURCES_C + list( + REMOVE_ITEM + STDALGO_TEAM_SOURCES_C TestStdAlgorithmsTeamFind.cpp TestStdAlgorithmsTeamFindIf.cpp TestStdAlgorithmsTeamFindIfNot.cpp @@ -386,35 +330,20 @@ endif() # FRIZZI: 04/26/2023: not sure if the compilation error is still applicable # but we conservatively leave this guard on if(NOT (KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM)) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - UnitTest_Sort - SOURCES - UnitTestMain.cpp - TestStdAlgorithmsCommon.cpp - ${ALGO_SORT_SOURCES} + kokkos_add_executable_and_test( + UnitTest_Sort SOURCES UnitTestMain.cpp TestStdAlgorithmsCommon.cpp ${ALGO_SORT_SOURCES} ) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - UnitTest_Random - SOURCES - UnitTestMain.cpp - ${ALGO_RANDOM_SOURCES} - ) + kokkos_add_executable_and_test(UnitTest_Random SOURCES UnitTestMain.cpp ${ALGO_RANDOM_SOURCES}) endif() # FIXME_OPENMPTARGET: These tests cause internal compiler errors as of 09/01/22 # when compiling for Intel's Xe-HP GPUs. if(KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM) - list(REMOVE_ITEM STDALGO_SOURCES_D - TestStdAlgorithmsCopyIf.cpp - TestStdAlgorithmsRemoveCopy.cpp - TestStdAlgorithmsUnique.cpp - TestStdAlgorithmsUniqueCopy.cpp - ) - list(REMOVE_ITEM STDALGO_SOURCES_E - TestStdAlgorithmsExclusiveScan.cpp - TestStdAlgorithmsInclusiveScan.cpp + list(REMOVE_ITEM STDALGO_SOURCES_D TestStdAlgorithmsCopyIf.cpp TestStdAlgorithmsRemoveCopy.cpp + TestStdAlgorithmsUnique.cpp TestStdAlgorithmsUniqueCopy.cpp ) + list(REMOVE_ITEM STDALGO_SOURCES_E TestStdAlgorithmsExclusiveScan.cpp TestStdAlgorithmsInclusiveScan.cpp) endif() # FIXME_OPENMPTARGET remove tests for OpenMPTarget @@ -422,48 +351,31 @@ endif() if(KOKKOS_ENABLE_OPENMPTARGET) # the following use either Kokkos or tailored reducers # which results in runtime memory errors. - list(REMOVE_ITEM STDALGO_TEAM_SOURCES_B - TestStdAlgorithmsTeamFindEnd.cpp - TestStdAlgorithmsTeamFindFirstOf.cpp - TestStdAlgorithmsTeamSearch.cpp + list(REMOVE_ITEM STDALGO_TEAM_SOURCES_B TestStdAlgorithmsTeamFindEnd.cpp TestStdAlgorithmsTeamFindFirstOf.cpp + TestStdAlgorithmsTeamSearch.cpp ) - list(REMOVE_ITEM STDALGO_TEAM_SOURCES_A - TestStdAlgorithmsTeamAdjacentFind.cpp - TestStdAlgorithmsTeamLexicographicalCompare.cpp - TestStdAlgorithmsTeamMismatch.cpp + list(REMOVE_ITEM STDALGO_TEAM_SOURCES_A TestStdAlgorithmsTeamAdjacentFind.cpp + TestStdAlgorithmsTeamLexicographicalCompare.cpp TestStdAlgorithmsTeamMismatch.cpp ) # this causes an illegal memory access if team_members_have_matching_result # is called - list(REMOVE_ITEM STDALGO_TEAM_SOURCES_M - TestStdAlgorithmsTeamTransformBinaryOp.cpp - ) + list(REMOVE_ITEM STDALGO_TEAM_SOURCES_M TestStdAlgorithmsTeamTransformBinaryOp.cpp) endif() foreach(ID A;B;C;D;E) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - AlgorithmsUnitTest_StdSet_${ID} - SOURCES - UnitTestMain.cpp - ${STDALGO_SOURCES_${ID}} - ) + kokkos_add_executable_and_test(AlgorithmsUnitTest_StdSet_${ID} SOURCES UnitTestMain.cpp ${STDALGO_SOURCES_${ID}}) endforeach() foreach(ID A;B;C;D;E;F;G;H;I;L;M;P;Q) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - AlgorithmsUnitTest_StdSet_Team_${ID} - SOURCES - UnitTestMain.cpp - ${STDALGO_TEAM_SOURCES_${ID}} - ) + kokkos_add_executable_and_test( + AlgorithmsUnitTest_StdSet_Team_${ID} SOURCES UnitTestMain.cpp ${STDALGO_TEAM_SOURCES_${ID}} + ) endforeach() # FIXME_OPENMPTARGET This test causes internal compiler errors as of 09/01/22 # when compiling for Intel's Xe-HP GPUs. if(NOT (KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM)) - KOKKOS_ADD_EXECUTABLE( - AlgorithmsUnitTest_StdAlgoCompileOnly - SOURCES TestStdAlgorithmsCompileOnly.cpp - ) + kokkos_add_executable(AlgorithmsUnitTest_StdAlgoCompileOnly SOURCES TestStdAlgorithmsCompileOnly.cpp) endif() diff --git a/packages/kokkos/algorithms/unit_tests/TestBinSortA.hpp b/packages/kokkos/algorithms/unit_tests/TestBinSortA.hpp index dd3569e6715a..bb074f248034 100644 --- a/packages/kokkos/algorithms/unit_tests/TestBinSortA.hpp +++ b/packages/kokkos/algorithms/unit_tests/TestBinSortA.hpp @@ -31,13 +31,13 @@ struct bin3d_is_sorted_struct { using value_type = unsigned int; using execution_space = ExecutionSpace; - Kokkos::View keys; + Kokkos::View keys; int max_bins; Scalar min; Scalar max; - bin3d_is_sorted_struct(Kokkos::View keys_, + bin3d_is_sorted_struct(Kokkos::View keys_, int max_bins_, Scalar min_, Scalar max_) : keys(keys_), max_bins(max_bins_), min(min_), max(max_) {} KOKKOS_INLINE_FUNCTION @@ -65,9 +65,9 @@ struct sum3D { using value_type = double; using execution_space = ExecutionSpace; - Kokkos::View keys; + Kokkos::View keys; - sum3D(Kokkos::View keys_) : keys(keys_) {} + sum3D(Kokkos::View keys_) : keys(keys_) {} KOKKOS_INLINE_FUNCTION void operator()(int i, double& count) const { count += keys(i, 0); @@ -77,8 +77,8 @@ struct sum3D { }; template -void test_3D_sort_impl(unsigned int n) { - using KeyViewType = Kokkos::View; +void test_3D_sort_impl(size_t n) { + using KeyViewType = Kokkos::View; KeyViewType keys("Keys", n * n * n); @@ -207,7 +207,7 @@ void test_sort_integer_overflow() { // array with two extrema in reverse order to expose integer overflow bug in // bin calculation T a[2] = {Kokkos::Experimental::finite_max::value, - Kokkos::Experimental::finite_min::value}; + Kokkos::Experimental::finite_min::value}; auto vd = Kokkos::create_mirror_view_and_copy( ExecutionSpace(), Kokkos::View(a)); Kokkos::sort(vd); @@ -219,6 +219,10 @@ void test_sort_integer_overflow() { } // namespace BinSortSetA TEST(TEST_CATEGORY, BinSortGenericTests) { + // FIXME_OPENMPTARGET - causes runtime failure with CrayClang compiler +#if defined(KOKKOS_COMPILER_CRAY_LLVM) && defined(KOKKOS_ENABLE_OPENMPTARGET) + GTEST_SKIP() << "known to fail with OpenMPTarget+Cray LLVM"; +#endif using ExecutionSpace = TEST_EXECSPACE; using key_type = unsigned; constexpr int N = 171; @@ -246,11 +250,11 @@ TEST(TEST_CATEGORY, BinSortEmptyView) { // does not matter if we use int or something else Kokkos::View v("v", 0); - // test all exposed public sort methods - ASSERT_NO_THROW(Sorter.sort(ExecutionSpace(), v, 0, 0)); - ASSERT_NO_THROW(Sorter.sort(v, 0, 0)); - ASSERT_NO_THROW(Sorter.sort(ExecutionSpace(), v)); - ASSERT_NO_THROW(Sorter.sort(v)); + // test all exposed public sort methods are callable and do not throw + Sorter.sort(ExecutionSpace(), v, 0, 0); + Sorter.sort(v, 0, 0); + Sorter.sort(ExecutionSpace(), v); + Sorter.sort(v); } TEST(TEST_CATEGORY, BinSortEmptyKeysView) { @@ -263,7 +267,26 @@ TEST(TEST_CATEGORY, BinSortEmptyKeysView) { BinOp_t binOp(5, 0, 10); Kokkos::BinSort Sorter(ExecutionSpace{}, kv, binOp); - ASSERT_NO_THROW(Sorter.create_permute_vector(ExecutionSpace{})); + Sorter.create_permute_vector(ExecutionSpace{}); // does not throw +} + +// BinSort may delegate sorting within bins to std::sort when running on host +// and having a sufficiently large number of items within a single bin (10 by +// default). Test that this is done without undefined behavior when accessing +// the boundaries of the bin. Should be used in conjunction with a memory +// sanitizer or bounds check. +TEST(TEST_CATEGORY, BinSort_issue_7221) { + using ExecutionSpace = TEST_EXECSPACE; + + using KeyViewType = Kokkos::View; + KeyViewType kv("kv", 11); + + using BinOp_t = Kokkos::BinOp1D; + BinOp_t binOp(1, -10, 10); + Kokkos::BinSort Sorter(ExecutionSpace{}, kv, binOp, + /*sort_within_bins*/ true); + + Sorter.create_permute_vector(ExecutionSpace{}); // does not throw } } // namespace Test diff --git a/packages/kokkos/algorithms/unit_tests/TestBinSortB.hpp b/packages/kokkos/algorithms/unit_tests/TestBinSortB.hpp index a90224bf3158..d11b53a9a61f 100644 --- a/packages/kokkos/algorithms/unit_tests/TestBinSortB.hpp +++ b/packages/kokkos/algorithms/unit_tests/TestBinSortB.hpp @@ -185,6 +185,10 @@ void run_for_rank2() { } // namespace BinSortSetB TEST(TEST_CATEGORY, BinSortUnsignedKeyLayoutStrideValues) { + // FIXME_OPENMPTARGET - causes runtime failure with CrayClang compiler +#if defined(KOKKOS_COMPILER_CRAY_LLVM) && defined(KOKKOS_ENABLE_OPENMPTARGET) + GTEST_SKIP() << "known to fail with OpenMPTarget+Cray LLVM"; +#endif using ExeSpace = TEST_EXECSPACE; using key_type = unsigned; BinSortSetB::run_for_rank1(); diff --git a/packages/kokkos/algorithms/unit_tests/TestNestedSort.hpp b/packages/kokkos/algorithms/unit_tests/TestNestedSort.hpp index 1b7a3f48fc52..cd57fd23ecfa 100644 --- a/packages/kokkos/algorithms/unit_tests/TestNestedSort.hpp +++ b/packages/kokkos/algorithms/unit_tests/TestNestedSort.hpp @@ -386,6 +386,11 @@ void test_nested_sort_by_key(unsigned int N, KeyType minKey, KeyType maxKey, } // namespace NestedSortImpl TEST(TEST_CATEGORY, NestedSort) { + // FIXME_OPENMPTARGET - causes runtime failure with CrayClang compiler +#if defined(KOKKOS_COMPILER_CRAY_LLVM) && defined(KOKKOS_ENABLE_OPENMPTARGET) + GTEST_SKIP() << "known to fail with OpenMPTarget+Cray LLVM"; +#endif + using ExecutionSpace = TEST_EXECSPACE; NestedSortImpl::test_nested_sort(171, 0U, UINT_MAX); NestedSortImpl::test_nested_sort(42, -1e6f, 1e6f); @@ -394,6 +399,11 @@ TEST(TEST_CATEGORY, NestedSort) { } TEST(TEST_CATEGORY, NestedSortByKey) { + // FIXME_OPENMPTARGET - causes runtime failure with CrayClang compiler +#if defined(KOKKOS_COMPILER_CRAY_LLVM) && defined(KOKKOS_ENABLE_OPENMPTARGET) + GTEST_SKIP() << "known to fail with OpenMPTarget+Cray LLVM"; +#endif + using ExecutionSpace = TEST_EXECSPACE; // Second/third template arguments are key and value respectively. diff --git a/packages/kokkos/algorithms/unit_tests/TestRandom.hpp b/packages/kokkos/algorithms/unit_tests/TestRandom.hpp index 472af1403b2d..6960b912d0e3 100644 --- a/packages/kokkos/algorithms/unit_tests/TestRandom.hpp +++ b/packages/kokkos/algorithms/unit_tests/TestRandom.hpp @@ -542,6 +542,11 @@ void test_duplicate_stream() { } // namespace AlgoRandomImpl TEST(TEST_CATEGORY, Random_XorShift64) { + // FIXME_OPENMPTARGET - causes runtime failure with CrayClang compiler +#if defined(KOKKOS_COMPILER_CRAY_LLVM) && defined(KOKKOS_ENABLE_OPENMPTARGET) + GTEST_SKIP() << "known to fail with OpenMPTarget+Cray LLVM"; +#endif + using ExecutionSpace = TEST_EXECSPACE; #if defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_CUDA) || \ @@ -562,6 +567,10 @@ TEST(TEST_CATEGORY, Random_XorShift64) { TEST(TEST_CATEGORY, Random_XorShift1024_0) { using ExecutionSpace = TEST_EXECSPACE; + // FIXME_OPENMPTARGET - causes runtime failure with CrayClang compiler +#if defined(KOKKOS_COMPILER_CRAY_LLVM) && defined(KOKKOS_ENABLE_OPENMPTARGET) + GTEST_SKIP() << "known to fail with OpenMPTarget+Cray LLVM"; +#endif #if defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_CUDA) || \ defined(KOKKOS_ENABLE_HIP) @@ -589,7 +598,7 @@ TEST(TEST_CATEGORY, Multi_streams) { #endif #if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { GTEST_SKIP() << "Failing on NVIDIA GPUs"; // FIXME_SYCL } #endif diff --git a/packages/kokkos/algorithms/unit_tests/TestRandomAccessIterator.cpp b/packages/kokkos/algorithms/unit_tests/TestRandomAccessIterator.cpp index 7d484136b6dd..5ab348cb1933 100644 --- a/packages/kokkos/algorithms/unit_tests/TestRandomAccessIterator.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestRandomAccessIterator.cpp @@ -23,7 +23,7 @@ namespace stdalgos { struct random_access_iterator_test : std_algorithms_test { public: - virtual void SetUp() { + void SetUp() override { Kokkos::parallel_for(m_static_view.extent(0), AssignIndexFunctor(m_static_view)); @@ -264,6 +264,37 @@ TEST_F(random_access_iterator_test, traits_helpers) { static_assert(KE::Impl::are_iterators_v); static_assert(KE::Impl::are_random_access_iterators_v); static_assert(!KE::Impl::are_iterators_v); + + static_assert(std::is_same_v); + static_assert(std::is_same_v); + static_assert(std::is_same_v); + + static_assert( + std::is_same_v); + static_assert(std::is_same_v); + static_assert(std::is_same_v); + + static_assert( + std::is_same_v); + static_assert( + std::is_same_v); + static_assert( + std::is_same_v); + + static_assert(std::is_same_v); + static_assert(std::is_same_v); + static_assert(std::is_same_v); } } // namespace stdalgos diff --git a/packages/kokkos/algorithms/unit_tests/TestSort.hpp b/packages/kokkos/algorithms/unit_tests/TestSort.hpp index 968fb8950b74..5ea88ae5d628 100644 --- a/packages/kokkos/algorithms/unit_tests/TestSort.hpp +++ b/packages/kokkos/algorithms/unit_tests/TestSort.hpp @@ -197,7 +197,7 @@ void test_sort_integer_overflow() { // array with two extrema in reverse order to expose integer overflow bug in // bin calculation T a[2] = {Kokkos::Experimental::finite_max::value, - Kokkos::Experimental::finite_min::value}; + Kokkos::Experimental::finite_min::value}; auto vd = Kokkos::create_mirror_view_and_copy( ExecutionSpace(), Kokkos::View(a)); Kokkos::sort(vd); @@ -209,6 +209,10 @@ void test_sort_integer_overflow() { } // namespace SortImpl TEST(TEST_CATEGORY, SortUnsignedValueType) { + // FIXME_OPENMPTARGET - causes runtime failure with CrayClang compiler +#if defined(KOKKOS_COMPILER_CRAY_LLVM) && defined(KOKKOS_ENABLE_OPENMPTARGET) + GTEST_SKIP() << "known to fail with OpenMPTarget+Cray LLVM"; +#endif using ExecutionSpace = TEST_EXECSPACE; using key_type = unsigned; constexpr int N = 171; @@ -224,14 +228,19 @@ TEST(TEST_CATEGORY, SortUnsignedValueType) { } TEST(TEST_CATEGORY, SortEmptyView) { + // FIXME_OPENMPTARGET - causes runtime failure with CrayClang compiler +#if defined(KOKKOS_COMPILER_CRAY_LLVM) && defined(KOKKOS_ENABLE_OPENMPTARGET) + GTEST_SKIP() << "known to fail with OpenMPTarget+Cray LLVM"; +#endif using ExecutionSpace = TEST_EXECSPACE; // does not matter if we use int or something else Kokkos::View v("v", 0); + // checking that it does not throw // TODO check the synchronous behavior of the calls below - ASSERT_NO_THROW(Kokkos::sort(ExecutionSpace(), v)); - ASSERT_NO_THROW(Kokkos::sort(v)); + Kokkos::sort(ExecutionSpace(), v); + Kokkos::sort(v); } } // namespace Test diff --git a/packages/kokkos/algorithms/unit_tests/TestSortByKey.hpp b/packages/kokkos/algorithms/unit_tests/TestSortByKey.hpp index 9e5bd4a57487..44abe4e73a4b 100644 --- a/packages/kokkos/algorithms/unit_tests/TestSortByKey.hpp +++ b/packages/kokkos/algorithms/unit_tests/TestSortByKey.hpp @@ -83,8 +83,8 @@ TEST(TEST_CATEGORY, SortByKeyEmptyView) { Kokkos::View keys("keys", 0); Kokkos::View values("values", 0); - ASSERT_NO_THROW( - Kokkos::Experimental::sort_by_key(ExecutionSpace(), keys, values)); + // checking that it does not throw + Kokkos::Experimental::sort_by_key(ExecutionSpace(), keys, values); } // Test #7036 @@ -95,8 +95,8 @@ TEST(TEST_CATEGORY, SortByKeyEmptyViewHost) { Kokkos::View keys("keys", 0); Kokkos::View values("values", 0); - ASSERT_NO_THROW( - Kokkos::Experimental::sort_by_key(ExecutionSpace(), keys, values)); + // checking that it does not throw + Kokkos::Experimental::sort_by_key(ExecutionSpace(), keys, values); } TEST(TEST_CATEGORY, SortByKey) { @@ -183,12 +183,12 @@ TEST(TEST_CATEGORY, SortByKeyStaticExtents) { Kokkos::View keys("keys"); Kokkos::View values_static("values_static"); - ASSERT_NO_THROW( - Kokkos::Experimental::sort_by_key(space, keys, values_static)); + // checking that it does not throw + Kokkos::Experimental::sort_by_key(space, keys, values_static); Kokkos::View values_dynamic("values_dynamic", 10); - ASSERT_NO_THROW( - Kokkos::Experimental::sort_by_key(space, keys, values_dynamic)); + // checking that it does not throw + Kokkos::Experimental::sort_by_key(space, keys, values_dynamic); } template @@ -234,7 +234,9 @@ TEST(TEST_CATEGORY, SortByKeyWithStrides) { ASSERT_EQ(sort_fails, 0u); } -TEST(TEST_CATEGORY, SortByKeyKeysLargerThanValues) { +TEST(TEST_CATEGORY_DEATH, SortByKeyKeysLargerThanValues) { + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + using ExecutionSpace = TEST_EXECSPACE; // does not matter if we use int or something else diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsAdjacentDifference.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsAdjacentDifference.cpp index 75ad533f6ee4..208b46b15f27 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsAdjacentDifference.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsAdjacentDifference.cpp @@ -96,7 +96,7 @@ void fill_view(DestViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, aux_v_h); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsAdjacentFind.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsAdjacentFind.cpp index fa4ff48dbef8..d8b80675c9d8 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsAdjacentFind.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsAdjacentFind.cpp @@ -173,7 +173,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); @@ -243,7 +243,7 @@ void run_single_scenario(const InfoType& scenario_info, Args... args) { { auto res_it = KE::adjacent_find(exespace(), KE::cbegin(view), - KE::cend(view), args...); + KE::cend(view), args...); const auto my_diff = res_it - KE::cbegin(view); verify(my_diff, view, args...); } diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp index 67052e2f9d4d..dadce2d4748a 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp @@ -534,10 +534,10 @@ void fill_views_inc(ViewType view, ViewHostType host_view) { } template -std::enable_if_t::value> +std::enable_if_t> verify_values(ValueType expected, const ViewType view) { - static_assert(std::is_same::value, + static_assert(std::is_same_v, "Non-matching value types of view and reference value"); auto view_h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), view); for (std::size_t i = 0; i < view_h.extent(0); i++) { @@ -546,10 +546,10 @@ verify_values(ValueType expected, const ViewType view) { } template -std::enable_if_t::value> +std::enable_if_t> verify_values(ValueType expected, const ViewType view) { - static_assert(std::is_same::value, + static_assert(std::is_same_v, "Non-matching value types of view and reference value"); using non_strided_view_t = Kokkos::View; @@ -566,11 +566,11 @@ verify_values(ValueType expected, const ViewType view) { } template -std::enable_if_t::value> +std::enable_if_t> compare_views(ViewType1 expected, const ViewType2 actual) { - static_assert(std::is_same::value, + static_assert(std::is_same_v, "Non-matching value types of expected and actual view"); auto expected_h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), expected); @@ -583,11 +583,11 @@ compare_views(ViewType1 expected, const ViewType2 actual) { } template -std::enable_if_t::value> +std::enable_if_t> compare_views(ViewType1 expected, const ViewType2 actual) { - static_assert(std::is_same::value, + static_assert(std::is_same_v, "Non-matching value types of expected and actual view"); using non_strided_view_t = Kokkos::View; diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsConstraints.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsConstraints.cpp index 2a4525a8c332..923ea970f91d 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsConstraints.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsConstraints.cpp @@ -81,7 +81,7 @@ TEST(std_algorithms, is_admissible_to_std_algorithms) { strided_view_3d_t>::value); } -TEST(std_algorithms, expect_no_overlap) { +TEST(std_algorithms_DeathTest, expect_no_overlap) { namespace KE = Kokkos::Experimental; using value_type = double; @@ -104,6 +104,8 @@ TEST(std_algorithms, expect_no_overlap) { // Overlapping because iterators are identical #if defined(KOKKOS_ENABLE_DEBUG) + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + auto first_s = KE::begin(static_view_1d); auto last_s = first_s + extent0; EXPECT_DEATH({ KE::Impl::expect_no_overlap(first_s, last_s, first_s); }, @@ -148,8 +150,7 @@ TEST(std_algorithms, expect_no_overlap) { auto last_st0 = first_st0 + strided_view_1d_0.extent(0); auto first_st1 = KE::begin(strided_view_1d_1); // [3, 15) // Does not overlap since offset (=3) is not divisible by stride (=2) - EXPECT_NO_THROW( - { KE::Impl::expect_no_overlap(first_st0, last_st0, first_st1); }); + KE::Impl::expect_no_overlap(first_st0, last_st0, first_st1); // Iterating over the same range without overlapping Kokkos::View static_view_2d{ @@ -160,9 +161,7 @@ TEST(std_algorithms, expect_no_overlap) { auto sub_last_s0 = sub_first_s0 + sub_static_view_1d_0.extent(0); auto sub_first_s1 = KE::begin(sub_static_view_1d_1); // 1, 3, 5, ... - EXPECT_NO_THROW({ - KE::Impl::expect_no_overlap(sub_first_s0, sub_last_s0, sub_first_s1); - }); + KE::Impl::expect_no_overlap(sub_first_s0, sub_last_s0, sub_first_s1); Kokkos::View dynamic_view_2d{ "std-algo-test-2d-contiguous-view-dynamic", 2, extent0}; @@ -172,9 +171,7 @@ TEST(std_algorithms, expect_no_overlap) { auto sub_last_d0 = sub_first_d0 + sub_dynamic_view_1d_0.extent(0); auto sub_first_d1 = KE::begin(sub_dynamic_view_1d_1); // 1, 3, 5, ... - EXPECT_NO_THROW({ - KE::Impl::expect_no_overlap(sub_first_d0, sub_last_d0, sub_first_d1); - }); + KE::Impl::expect_no_overlap(sub_first_d0, sub_last_d0, sub_first_d1); Kokkos::LayoutStride layout2d{2, 3, extent0, 2 * 3}; Kokkos::View strided_view_2d{ @@ -185,9 +182,7 @@ TEST(std_algorithms, expect_no_overlap) { auto sub_last_st0 = sub_first_st0 + sub_strided_view_1d_0.extent(0); auto sub_first_st1 = KE::begin(sub_strided_view_1d_1); // 1, 7, 13, ... - EXPECT_NO_THROW({ - KE::Impl::expect_no_overlap(sub_first_st0, sub_last_st0, sub_first_st1); - }); + KE::Impl::expect_no_overlap(sub_first_st0, sub_last_st0, sub_first_st1); } } // namespace stdalgos diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsCopyIf.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsCopyIf.cpp index 5778e37be04d..7c9e8f84bfa4 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsCopyIf.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsCopyIf.cpp @@ -107,7 +107,7 @@ std::size_t fill_view(ViewType dest_view, const std::string& name, } else { - throw std::runtime_error("invalid choice"); + Kokkos::abort("invalid choice"); } Kokkos::deep_copy(aux_view, v_h); @@ -202,7 +202,7 @@ void verify_data(const std::string& name, ViewTypeFrom view_from, } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } } @@ -224,7 +224,7 @@ void run_single_scenario(const InfoType& scenario_info) { auto n = fill_view(view_from, name, pred); auto view_dest = create_view(Tag{}, view_ext, "copy_if_dest"); auto rit = KE::copy_if(exespace(), KE::cbegin(view_from), - KE::cend(view_from), KE::begin(view_dest), pred); + KE::cend(view_from), KE::begin(view_dest), pred); verify_data(name, view_from, view_dest, pred); ASSERT_EQ(rit, (KE::begin(view_dest) + n)); } @@ -233,7 +233,7 @@ void run_single_scenario(const InfoType& scenario_info) { auto n = fill_view(view_from, name, pred); auto view_dest = create_view(Tag{}, view_ext, "copy_if_dest"); auto rit = KE::copy_if("label", exespace(), KE::cbegin(view_from), - KE::cend(view_from), KE::begin(view_dest), pred); + KE::cend(view_from), KE::begin(view_dest), pred); verify_data(name, view_from, view_dest, pred); ASSERT_EQ(rit, (KE::begin(view_dest) + n)); } diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp index b364c53a8882..a85e63fe3454 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp @@ -110,7 +110,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsForEach.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsForEach.cpp index 793b98a67f16..b24730ff0094 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsForEach.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsForEach.cpp @@ -55,7 +55,6 @@ void test_for_each(const ViewType view) { std::for_each(KE::begin(expected), KE::end(expected), non_mod_functor); compare_views(expected, view); -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) const auto mod_lambda = KOKKOS_LAMBDA(value_t & i) { ++i; }; // pass view, lambda takes non-const ref @@ -79,7 +78,6 @@ void test_for_each(const ViewType view) { KE::for_each(exespace(), KE::cbegin(view), KE::cend(view), non_mod_lambda); std::for_each(KE::cbegin(expected), KE::cend(expected), non_mod_lambda); compare_views(expected, view); -#endif } // std::for_each_n is C++17, so we cannot compare results directly diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsHelperFunctors.hpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsHelperFunctors.hpp index 8dbd6cd7e30b..2b3361743e4d 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsHelperFunctors.hpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsHelperFunctors.hpp @@ -104,7 +104,7 @@ struct AssignIndexFunctor { template struct IsEvenFunctor { - static_assert(std::is_integral::value, + static_assert(std::is_integral_v, "IsEvenFunctor uses operator%, so ValueType must be int"); KOKKOS_INLINE_FUNCTION diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsInclusiveScan.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsInclusiveScan.cpp index a08a73721088..b4f40b4651d6 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsInclusiveScan.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsInclusiveScan.cpp @@ -110,7 +110,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSorted.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSorted.cpp index 75d4f0afebce..18928a352669 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSorted.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSorted.cpp @@ -92,7 +92,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); @@ -122,7 +122,8 @@ bool compute_gold(const std::string& name) { } else if (name == "large-b") { return false; } else { - throw std::runtime_error("invalid choice"); + Kokkos::abort("invalid choice"); + return false; // unreachable } } @@ -154,7 +155,7 @@ void run_single_scenario(const InfoType& scenario_info) { resultsB[0] = KE::is_sorted(exespace(), KE::cbegin(view), KE::cend(view), comp); resultsB[1] = KE::is_sorted("label", exespace(), KE::cbegin(view), - KE::cend(view), comp); + KE::cend(view), comp); resultsB[2] = KE::is_sorted(exespace(), view, comp); resultsB[3] = KE::is_sorted("label", exespace(), view, comp); const auto allB = std::all_of(resultsB.cbegin(), resultsB.cend(), diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSortedUntil.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSortedUntil.cpp index 29ac7cc9bc12..8327bfe13c0b 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSortedUntil.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSortedUntil.cpp @@ -92,7 +92,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); @@ -123,7 +123,8 @@ auto compute_gold(ViewType view, const std::string& name) { } else if (name == "large-b") { return KE::begin(view) + 156; } else { - throw std::runtime_error("invalid choice"); + Kokkos::abort("invalid choice"); + return KE::end(view); // unreachable } } diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsMismatch.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsMismatch.cpp index f3b3e269c446..df5df756d2ae 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsMismatch.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsMismatch.cpp @@ -86,7 +86,7 @@ void run_single_scenario(ViewType view1, ViewType view2, v2_h(ext2 / 2) = -5; } } else { - throw std::runtime_error("Kokkos: stdalgo: test: mismatch: Invalid string"); + FAIL() << "Kokkos: stdalgo: test: mismatch: Invalid string"; } Kokkos::deep_copy(aux_view1, v1_h); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsModOps.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsModOps.cpp index 1b1a02f39c4f..6918185bc083 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsModOps.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsModOps.cpp @@ -48,7 +48,7 @@ struct MyMovableType { TEST(std_algorithms_mod_ops_test, move) { MyMovableType a; using move_t = decltype(std::move(a)); - static_assert(std::is_rvalue_reference::value); + static_assert(std::is_rvalue_reference_v); // move constr MyMovableType b(std::move(a)); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsModSeqOps.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsModSeqOps.cpp index f80f30797e43..42a17d737796 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsModSeqOps.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsModSeqOps.cpp @@ -23,7 +23,7 @@ namespace stdalgos { struct std_algorithms_mod_seq_ops_test : std_algorithms_test { public: - virtual void SetUp() { + void SetUp() override { Kokkos::parallel_for(m_static_view.extent(0), AssignIndexFunctor(m_static_view)); } diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsMoveBackward.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsMoveBackward.cpp index b201ab95c1a6..88e2a68ff17d 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsMoveBackward.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsMoveBackward.cpp @@ -56,7 +56,7 @@ void run_single_scenario(const InfoType& scenario_info, int apiId) { ASSERT_EQ(dist, 5); } else if (apiId == 1) { auto rit = KE::move_backward("mylabel", exespace(), KE::begin(v), - KE::end(v), KE::end(v2)); + KE::end(v), KE::end(v2)); const int dist = KE::distance(KE::begin(v2), rit); ASSERT_EQ(dist, 5); } else if (apiId == 2) { diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsPartitionCopy.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsPartitionCopy.cpp index a36c9db2b9eb..e47cacdd7d9c 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsPartitionCopy.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsPartitionCopy.cpp @@ -95,7 +95,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); @@ -110,9 +110,9 @@ void verify_data(const std::string& name, ResultType my_result, ViewTypeDestFalse view_dest_false, PredType pred) { using value_type = typename ViewTypeFrom::value_type; static_assert( - std::is_same::value); + std::is_same_v); static_assert( - std::is_same::value); + std::is_same_v); const std::size_t ext = view_from.extent(0); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemove.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemove.cpp index c35fc5c24b20..f897e9b65749 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemove.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemove.cpp @@ -99,7 +99,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); @@ -147,7 +147,7 @@ void run_single_scenario(const InfoType& scenario_info) { // make host copy BEFORE running algo auto data_h = create_host_space_copy(view); auto rit = KE::remove(exespace(), KE::begin(view), KE::end(view), - (ValueType)match_value); + (ValueType)match_value); verify_data(data_h, view, rit); } diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveCopy.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveCopy.cpp index 3d7c52108be0..3137880ea813 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveCopy.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveCopy.cpp @@ -110,7 +110,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveCopyIf.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveCopyIf.cpp index cb699aa92356..d88ab5473de6 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveCopyIf.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveCopyIf.cpp @@ -93,7 +93,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveIf.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveIf.cpp index f06f2234eedb..e42788799e47 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveIf.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveIf.cpp @@ -93,7 +93,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); @@ -144,7 +144,7 @@ void run_single_scenario(const InfoType& scenario_info) { // make host copy BEFORE running algo auto data_h = create_host_space_copy(view); auto rit = KE::remove_if(exespace(), KE::begin(view), KE::end(view), - remove_if_even); + remove_if_even); verify_data(data_h, view, rit, remove_if_even); } @@ -154,7 +154,7 @@ void run_single_scenario(const InfoType& scenario_info) { // make host copy BEFORE running algo auto data_h = create_host_space_copy(view); auto rit = KE::remove_if("label", exespace(), KE::begin(view), - KE::end(view), remove_if_even); + KE::end(view), remove_if_even); verify_data(data_h, view, rit, remove_if_even); } diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplace.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplace.cpp index a22ab32d764a..4596726cf3ce 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplace.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplace.cpp @@ -84,7 +84,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); @@ -153,7 +153,7 @@ void verify_data(const std::string& name, ViewType1 test_view, } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } } diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceCopy.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceCopy.cpp index a964ec8e173e..b18c859af593 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceCopy.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceCopy.cpp @@ -84,7 +84,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); @@ -175,7 +175,7 @@ void verify_data(const std::string& name, ViewTypeFrom view_from, } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } } diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceCopyIf.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceCopyIf.cpp index ceeba8897119..82f859bac124 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceCopyIf.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceCopyIf.cpp @@ -84,7 +84,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); @@ -175,7 +175,7 @@ void verify_data(const std::string& name, ViewTypeFrom view_from, } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } } diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceIf.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceIf.cpp index 802c0093c5cc..5ae2ff427853 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceIf.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceIf.cpp @@ -96,7 +96,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReverse.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReverse.cpp index 6e6ca7278300..3c934d64850c 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReverse.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReverse.cpp @@ -62,7 +62,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRotate.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRotate.cpp index 5638cbee4a62..bf5c2ee7828b 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRotate.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRotate.cpp @@ -117,7 +117,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRotateCopy.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRotateCopy.cpp index d0caca7cea3f..1a860c58cee8 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRotateCopy.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRotateCopy.cpp @@ -117,7 +117,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); @@ -175,7 +175,7 @@ void run_single_scenario(const InfoType& scenario_info, create_view(Tag{}, view_ext, "rotate_copy_dest"); auto n_it = KE::cbegin(view_from) + rotation_point; auto rit = KE::rotate_copy(exespace(), KE::cbegin(view_from), n_it, - KE::cend(view_from), KE::begin(view_dest)); + KE::cend(view_from), KE::begin(view_dest)); verify_data(view_from, view_dest, rotation_point); ASSERT_EQ(rit, (KE::begin(view_dest) + view_ext)); } diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsSearch.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsSearch.cpp index 021609c444d2..195f88a0b737 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsSearch.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsSearch.cpp @@ -256,7 +256,7 @@ void run_single_scenario(const InfoType& scenario_info, std::size_t seq_ext, { auto myrit = KE::search(exespace(), KE::cbegin(view), KE::cend(view), - KE::cbegin(s_view), KE::cend(s_view), args...); + KE::cbegin(s_view), KE::cend(s_view), args...); const auto mydiff = myrit - KE::cbegin(view); const auto stddiff = stdrit - KE::cbegin(view_h); ASSERT_EQ(mydiff, stddiff); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsSearch_n.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsSearch_n.cpp index 53ad8daa2ec9..79d88bec23f7 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsSearch_n.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsSearch_n.cpp @@ -154,7 +154,7 @@ void fill_view(ViewType dest_view, ValueType value, std::size_t count, } else { - throw std::runtime_error("Kokkos: test: search_n: this should not happen"); + FAIL() << "Kokkos: test: search_n: this should not happen"; } Kokkos::deep_copy(aux_view, v_h); @@ -208,7 +208,7 @@ void run_single_scenario(const InfoType& scenario_info, std::size_t count, { auto myrit = KE::search_n("label", exespace(), KE::cbegin(view), - KE::cend(view), count, value, args...); + KE::cend(view), count, value, args...); const auto mydiff = myrit - KE::cbegin(view); ASSERT_EQ(mydiff, stddiff); } diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsShiftLeft.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsShiftLeft.cpp index 0b5fe9216eac..12835d5a2f7c 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsShiftLeft.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsShiftLeft.cpp @@ -150,7 +150,7 @@ void run_single_scenario(const InfoType& scenario_info, // create host copy BEFORE shift_left or view will be modified auto view_h = create_host_space_copy(view); auto rit = KE::shift_left("label", exespace(), KE::begin(view), - KE::end(view), shift_value); + KE::end(view), shift_value); verify_data(rit, view, view_h, shift_value); } diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsShiftRight.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsShiftRight.cpp index 8e4ae9437590..3e350cf3b384 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsShiftRight.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsShiftRight.cpp @@ -141,7 +141,7 @@ void run_single_scenario(const InfoType& scenario_info, // create host copy BEFORE shift_right or view will be modified auto view_h = create_host_space_copy(view); auto rit = KE::shift_right(exespace(), KE::begin(view), KE::end(view), - shift_value); + shift_value); verify_data(rit, view, view_h, shift_value); } @@ -152,7 +152,7 @@ void run_single_scenario(const InfoType& scenario_info, // create host copy BEFORE shift_right or view will be modified auto view_h = create_host_space_copy(view); auto rit = KE::shift_right("label", exespace(), KE::begin(view), - KE::end(view), shift_value); + KE::end(view), shift_value); verify_data(rit, view, view_h, shift_value); } diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamAdjacentDifference.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamAdjacentDifference.cpp index c388cadc9bba..5a2c04693945 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamAdjacentDifference.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamAdjacentDifference.cpp @@ -62,8 +62,8 @@ struct TestFunctorA { switch (m_apiPick) { case 0: { auto it = KE::adjacent_difference(member, KE::cbegin(myRowViewFrom), - KE::cend(myRowViewFrom), - KE::begin(myRowViewDest)); + KE::cend(myRowViewFrom), + KE::begin(myRowViewDest)); resultDist = KE::distance(KE::begin(myRowViewDest), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; @@ -73,8 +73,8 @@ struct TestFunctorA { case 1: { auto it = KE::adjacent_difference(member, KE::cbegin(myRowViewFrom), - KE::cend(myRowViewFrom), - KE::begin(myRowViewDest), m_binaryOp); + KE::cend(myRowViewFrom), + KE::begin(myRowViewDest), m_binaryOp); resultDist = KE::distance(KE::begin(myRowViewDest), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopy.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopy.cpp index e24ac37bf012..071ecd5a9a80 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopy.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopy.cpp @@ -50,7 +50,7 @@ struct TestFunctorA { if (m_apiPick == 0) { auto it = KE::copy(member, KE::begin(myRowViewFrom), - KE::end(myRowViewFrom), KE::begin(myRowViewDest)); + KE::end(myRowViewFrom), KE::begin(myRowViewDest)); resultDist = KE::distance(KE::begin(myRowViewDest), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopyIf.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopyIf.cpp index 7c3c465dc8d0..3f83ac7404fe 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopyIf.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopyIf.cpp @@ -144,7 +144,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { auto rowFrom = Kokkos::subview(sourceViewBeforeOp_h, i, Kokkos::ALL()); auto rowDest = Kokkos::subview(stdDestView, i, Kokkos::ALL()); auto it = std::copy_if(KE::cbegin(rowFrom), KE::cend(rowFrom), - KE::begin(rowDest), predicate); + KE::begin(rowDest), predicate); const std::size_t stdDistance = KE::distance(KE::begin(rowDest), it); ASSERT_EQ(stdDistance, distancesView_h(i)); ASSERT_TRUE(intraTeamSentinelView_h(i)); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopy_n.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopy_n.cpp index 7cbc788f8e3c..9b509af55bf2 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopy_n.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopy_n.cpp @@ -53,7 +53,7 @@ struct TestFunctorA { if (m_apiPick == 0) { auto it = KE::copy_n(member, KE::begin(myRowViewFrom), m_copyCount, - KE::begin(myRowViewDest)); + KE::begin(myRowViewDest)); resultDist = KE::distance(KE::begin(myRowViewDest), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCount.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCount.cpp index 922424afbd98..38df5c30cec8 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCount.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCount.cpp @@ -111,7 +111,7 @@ void test_A(const bool searched_value_exist, std::size_t numTeams, using rand_pool = Kokkos::Random_XorShift64_Pool; - rand_pool pool(lowerBound * upperBound); + rand_pool pool(static_cast(lowerBound) * upperBound); if (searched_value_exist) { Kokkos::View randomIndices( diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamExclusiveScan.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamExclusiveScan.cpp index 7cb9851087a1..0c35c5e59934 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamExclusiveScan.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamExclusiveScan.cpp @@ -67,8 +67,8 @@ struct TestFunctorA { switch (m_apiPick) { case 0: { auto it = KE::exclusive_scan(member, KE::cbegin(rowViewSrc), - KE::cend(rowViewSrc), - KE::begin(rowViewDest), initVal); + KE::cend(rowViewSrc), + KE::begin(rowViewDest), initVal); resultDist = KE::distance(KE::begin(rowViewDest), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this] { m_distancesView(rowIndex) = resultDist; }); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFind.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFind.cpp index 430e4917e06e..88c5e21f312f 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFind.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFind.cpp @@ -51,7 +51,7 @@ struct TestFunctorA { switch (m_apiPick) { case 0: { auto it = KE::find(member, KE::cbegin(myRowViewFrom), - KE::cend(myRowViewFrom), searchedValue); + KE::cend(myRowViewFrom), searchedValue); resultDist = KE::distance(KE::cbegin(myRowViewFrom), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindEnd.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindEnd.cpp index 83eca33569e1..d350bc62cdb3 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindEnd.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindEnd.cpp @@ -86,9 +86,9 @@ struct TestFunctorA { case 2: { auto it = KE::find_end(member, KE::cbegin(myRowViewFrom), - KE::cend(myRowViewFrom), - KE::cbegin(myRowSearchedSeqView), - KE::cend(myRowSearchedSeqView), m_binaryPred); + KE::cend(myRowViewFrom), + KE::cbegin(myRowSearchedSeqView), + KE::cend(myRowSearchedSeqView), m_binaryPred); resultDist = KE::distance(KE::cbegin(myRowViewFrom), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; @@ -99,7 +99,7 @@ struct TestFunctorA { case 3: { auto it = KE::find_end(member, myRowViewFrom, myRowSearchedSeqView, - m_binaryPred); + m_binaryPred); resultDist = KE::distance(KE::begin(myRowViewFrom), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindIf.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindIf.cpp index ee4bbed7a30d..70f2be77f632 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindIf.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindIf.cpp @@ -70,7 +70,7 @@ struct TestFunctorA { switch (m_apiPick) { case 0: { auto it = KE::find_if(member, KE::cbegin(myRowViewFrom), - KE::cend(myRowViewFrom), unaryPred); + KE::cend(myRowViewFrom), unaryPred); resultDist = KE::distance(KE::cbegin(myRowViewFrom), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindIfNot.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindIfNot.cpp index b9448c1a3e68..873e8faf4cad 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindIfNot.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindIfNot.cpp @@ -70,7 +70,7 @@ struct TestFunctorA { switch (m_apiPick) { case 0: { auto it = KE::find_if_not(member, KE::cbegin(myRowViewFrom), - KE::cend(myRowViewFrom), unaryPred); + KE::cend(myRowViewFrom), unaryPred); resultDist = KE::distance(KE::cbegin(myRowViewFrom), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamGenerate_n.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamGenerate_n.cpp index 4b66dd9131fa..265cdf474616 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamGenerate_n.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamGenerate_n.cpp @@ -63,7 +63,7 @@ struct TestFunctorA { }); } else if (m_apiPick == 1) { auto it = KE::generate_n(member, myRowView, m_count, - GenerateFunctor()); + GenerateFunctor()); resultDist = KE::distance(KE::begin(myRowView), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsSorted.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsSorted.cpp index 850e80dde1e0..f76a595b3f42 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsSorted.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsSorted.cpp @@ -62,7 +62,7 @@ struct TestFunctorA { } else if (m_apiPick == 3) { using value_type = typename ViewType::value_type; result = KE::is_sorted(member, myRowView, - CustomLessThanComparator{}); + CustomLessThanComparator{}); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_returnsView(myRowIndex) = result; }); } diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsSortedUntil.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsSortedUntil.cpp index e3b95527c77f..5bc49e46007e 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsSortedUntil.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsSortedUntil.cpp @@ -61,7 +61,7 @@ struct TestFunctorA { if (m_apiPick == 0) { auto it = KE::is_sorted_until(member, KE::cbegin(myRowView), - KE::cend(myRowView)); + KE::cend(myRowView)); resultDist = KE::distance(KE::cbegin(myRowView), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; @@ -77,8 +77,8 @@ struct TestFunctorA { else if (m_apiPick == 2) { using value_type = typename ViewType::value_type; auto it = KE::is_sorted_until(member, KE::cbegin(myRowView), - KE::cend(myRowView), - CustomLessThanComparator{}); + KE::cend(myRowView), + CustomLessThanComparator{}); resultDist = KE::distance(KE::cbegin(myRowView), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; @@ -88,7 +88,7 @@ struct TestFunctorA { else if (m_apiPick == 3) { using value_type = typename ViewType::value_type; auto it = KE::is_sorted_until(member, myRowView, - CustomLessThanComparator{}); + CustomLessThanComparator{}); resultDist = KE::distance(KE::begin(myRowView), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; @@ -210,7 +210,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId, stdDistance = KE::distance(KE::cbegin(myRow), it); } else { auto it = std::is_sorted_until(KE::cbegin(myRow), KE::cend(myRow), - CustomLessThanComparator{}); + CustomLessThanComparator{}); stdDistance = KE::distance(KE::cbegin(myRow), it); } ASSERT_EQ(stdDistance, distancesView_h(i)); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMaxElement.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMaxElement.cpp index 283525dbd10f..452a48df2161 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMaxElement.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMaxElement.cpp @@ -74,7 +74,7 @@ struct TestFunctorA { else if (m_apiPick == 3) { using value_type = typename ViewType::value_type; auto it = KE::max_element(member, myRowView, - CustomLessThanComparator{}); + CustomLessThanComparator{}); resultDist = KE::distance(KE::begin(myRowView), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; @@ -144,7 +144,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { stdDistance = KE::distance(KE::cbegin(myRow), it); } else { auto it = std::max_element(KE::cbegin(myRow), KE::cend(myRow), - CustomLessThanComparator{}); + CustomLessThanComparator{}); stdDistance = KE::distance(KE::cbegin(myRow), it); } diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMinElement.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMinElement.cpp index 8579b48315d8..2c79370b926e 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMinElement.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMinElement.cpp @@ -74,7 +74,7 @@ struct TestFunctorA { else if (m_apiPick == 3) { using value_type = typename ViewType::value_type; auto it = KE::min_element(member, myRowView, - CustomLessThanComparator{}); + CustomLessThanComparator{}); resultDist = KE::distance(KE::begin(myRowView), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; @@ -144,7 +144,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { stdDistance = KE::distance(KE::cbegin(myRow), it); } else { auto it = std::min_element(KE::cbegin(myRow), KE::cend(myRow), - CustomLessThanComparator{}); + CustomLessThanComparator{}); stdDistance = KE::distance(KE::cbegin(myRow), it); } ASSERT_EQ(stdDistance, distancesView_h(i)); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMinMaxElement.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMinMaxElement.cpp index 51010fdff59b..25a4487855b6 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMinMaxElement.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMinMaxElement.cpp @@ -84,7 +84,7 @@ struct TestFunctorA { else if (m_apiPick == 3) { using value_type = typename ViewType::value_type; auto itPair = KE::minmax_element(member, myRowView, - CustomLessThanComparator{}); + CustomLessThanComparator{}); resultDist1 = KE::distance(KE::begin(myRowView), itPair.first); resultDist2 = KE::distance(KE::begin(myRowView), itPair.second); @@ -160,7 +160,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { stdDistance[1] = KE::distance(KE::cbegin(myRow), itPair.second); } else { auto itPair = std::minmax_element(KE::cbegin(myRow), KE::cend(myRow), - CustomLessThanComparator{}); + CustomLessThanComparator{}); stdDistance[0] = KE::distance(KE::cbegin(myRow), itPair.first); stdDistance[1] = KE::distance(KE::cbegin(myRow), itPair.second); } diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMove.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMove.cpp index 1122d6d554ac..2c445dacf8e7 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMove.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMove.cpp @@ -50,7 +50,7 @@ struct TestFunctorA { if (m_apiPick == 0) { auto it = KE::move(member, KE::begin(myRowViewFrom), - KE::end(myRowViewFrom), KE::begin(myRowViewDest)); + KE::end(myRowViewFrom), KE::begin(myRowViewDest)); resultDist = KE::distance(KE::begin(myRowViewDest), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemove.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemove.cpp index fb9c70391b3d..2defa1dc6fc8 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemove.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemove.cpp @@ -63,7 +63,7 @@ struct TestFunctorA { if (m_apiPick == 0) { auto it = KE::remove(member, KE::begin(myRowView), KE::end(myRowView), - m_targetValue); + m_targetValue); resultDist = KE::distance(KE::begin(myRowView), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopy.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopy.cpp index 6bb0d249988d..71a50e39e3ef 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopy.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopy.cpp @@ -67,8 +67,8 @@ struct TestFunctorA { if (m_apiPick == 0) { auto it = KE::remove_copy(member, KE::cbegin(myRowViewFrom), - KE::cend(myRowViewFrom), - KE::begin(myRowViewDest), m_targetValue); + KE::cend(myRowViewFrom), + KE::begin(myRowViewDest), m_targetValue); resultDist = KE::distance(KE::begin(myRowViewDest), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopyIf.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopyIf.cpp index cff9aa178a29..d5b5304f6315 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopyIf.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopyIf.cpp @@ -65,8 +65,8 @@ struct TestFunctorA { GreaterThanValueFunctor predicate(m_threshold); if (m_apiPick == 0) { auto it = KE::remove_copy_if(member, KE::cbegin(myRowViewFrom), - KE::cend(myRowViewFrom), - KE::begin(myRowViewDest), predicate); + KE::cend(myRowViewFrom), + KE::begin(myRowViewDest), predicate); resultDist = KE::distance(KE::begin(myRowViewDest), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceCopy.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceCopy.cpp index 70dbf10574b8..64f172e401cc 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceCopy.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceCopy.cpp @@ -78,7 +78,7 @@ struct TestFunctorA { }); } else if (m_apiPick == 1) { auto it = KE::replace_copy(member, myRowViewFrom, myRowViewDest, - m_targetValue, m_newValue); + m_targetValue, m_newValue); resultDist = KE::distance(KE::begin(myRowViewDest), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; @@ -172,7 +172,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { auto rowFrom = Kokkos::subview(sourceView_dc_h, i, Kokkos::ALL()); auto rowDest = Kokkos::subview(stdDestView, i, Kokkos::ALL()); auto it = std::replace_copy(KE::cbegin(rowFrom), KE::cend(rowFrom), - KE::begin(rowDest), targetVal, newVal); + KE::begin(rowDest), targetVal, newVal); const std::size_t stdDistance = KE::distance(KE::begin(rowDest), it); ASSERT_EQ(stdDistance, distancesView_h(i)); ASSERT_TRUE(intraTeamSentinelView_h(i)); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceCopyIf.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceCopyIf.cpp index d0217aed7a8e..9c3699320d8c 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceCopyIf.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceCopyIf.cpp @@ -76,7 +76,7 @@ struct TestFunctorA { }); } else if (m_apiPick == 1) { auto it = KE::replace_copy_if(member, myRowViewFrom, myRowViewDest, - predicate, m_newValue); + predicate, m_newValue); resultDist = KE::distance(KE::begin(myRowViewDest), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; @@ -151,7 +151,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { Kokkos::subview(cloneOfSourceViewBeforeOp_h, i, Kokkos::ALL()); auto rowDest = Kokkos::subview(stdDestView, i, Kokkos::ALL()); auto it = std::replace_copy_if(KE::cbegin(rowFrom), KE::cend(rowFrom), - KE::begin(rowDest), predicate, newVal); + KE::begin(rowDest), predicate, newVal); const std::size_t stdDistance = KE::distance(KE::begin(rowDest), it); ASSERT_EQ(stdDistance, distancesView_h(i)); ASSERT_TRUE(intraTeamSentinelView_h(i)); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRotateCopy.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRotateCopy.cpp index e865b998f600..51f600fabad6 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRotateCopy.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRotateCopy.cpp @@ -136,7 +136,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, std::size_t pivotShift, auto pivot = KE::cbegin(myRowFrom) + pivotShift; auto it = std::rotate_copy(KE::cbegin(myRowFrom), pivot, - KE::cend(myRowFrom), KE::begin(myRowDest)); + KE::cend(myRowFrom), KE::begin(myRowDest)); const std::size_t stdDistance = KE::distance(KE::begin(myRowDest), it); ASSERT_EQ(stdDistance, distancesView_h(i)); ASSERT_TRUE(intraTeamSentinelView_h(i)); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamShiftRight.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamShiftRight.cpp index 00a80c5ef070..08ff8fbbca62 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamShiftRight.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamShiftRight.cpp @@ -47,7 +47,7 @@ struct TestFunctorA { if (m_apiPick == 0) { auto it = KE::shift_right(member, KE::begin(myRowView), - KE::end(myRowView), m_shift); + KE::end(myRowView), m_shift); resultDist = KE::distance(KE::begin(myRowView), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamSwapRanges.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamSwapRanges.cpp index 5fc9612caa7b..60cb3f083779 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamSwapRanges.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamSwapRanges.cpp @@ -49,7 +49,7 @@ struct TestFunctorA { if (m_apiPick == 0) { auto it = KE::swap_ranges(member, KE::begin(myRowView1), - KE::end(myRowView1), KE::begin(myRowView2)); + KE::end(myRowView1), KE::begin(myRowView2)); resultDist = KE::distance(KE::begin(myRowView2), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformInclusiveScan.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformInclusiveScan.cpp index 0b0d798fd801..78a21c443055 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformInclusiveScan.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformInclusiveScan.cpp @@ -91,7 +91,7 @@ struct TestFunctorA { case 1: { auto it = KE::transform_inclusive_scan(member, srcRow, destRow, - m_binaryOp, m_unaryOp); + m_binaryOp, m_unaryOp); resultDist = KE::distance(firstDest, it); Kokkos::single(Kokkos::PerTeam(member), [=, *this] { m_distancesView(rowIndex) = resultDist; }); @@ -111,7 +111,7 @@ struct TestFunctorA { case 3: { auto it = KE::transform_inclusive_scan(member, srcRow, destRow, - m_binaryOp, m_unaryOp, initVal); + m_binaryOp, m_unaryOp, initVal); resultDist = KE::distance(firstDest, it); Kokkos::single(Kokkos::PerTeam(member), [=, *this] { m_distancesView(rowIndex) = resultDist; }); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamUnique.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamUnique.cpp index c46146e0a8f6..cef0f7c13d07 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamUnique.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamUnique.cpp @@ -58,7 +58,7 @@ struct TestFunctorA { } else if (m_apiPick == 2) { using value_type = typename ViewType::value_type; auto it = KE::unique(member, KE::begin(myRowView), KE::end(myRowView), - CustomEqualityComparator{}); + CustomEqualityComparator{}); resultDist = KE::distance(KE::begin(myRowView), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; @@ -138,7 +138,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { stdDistance = KE::distance(KE::begin(myRow), it); } else { auto it = std::unique(KE::begin(myRow), KE::end(myRow), - CustomEqualityComparator{}); + CustomEqualityComparator{}); stdDistance = KE::distance(KE::begin(myRow), it); } ASSERT_EQ(stdDistance, distancesView_h(i)); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamUniqueCopy.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamUniqueCopy.cpp index 0d3289e196f0..89ea8154c7ec 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamUniqueCopy.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamUniqueCopy.cpp @@ -72,8 +72,8 @@ struct TestFunctorA { using comparator_t = CustomEqualityComparator; auto it = KE::unique_copy(member, KE::begin(myRowViewFrom), - KE::end(myRowViewFrom), - KE::begin(myRowViewDest), comparator_t()); + KE::end(myRowViewFrom), + KE::begin(myRowViewDest), comparator_t()); resultDist = KE::distance(KE::begin(myRowViewDest), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; @@ -159,12 +159,12 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { std::size_t stdDistance = 0; if (apiId <= 1) { auto it = std::unique_copy(KE::cbegin(myRowFrom), KE::cend(myRowFrom), - KE::begin(myRowDest)); + KE::begin(myRowDest)); stdDistance = KE::distance(KE::begin(myRowDest), it); } else { auto it = std::unique_copy(KE::cbegin(myRowFrom), KE::cend(myRowFrom), - KE::begin(myRowDest), - CustomEqualityComparator{}); + KE::begin(myRowDest), + CustomEqualityComparator{}); stdDistance = KE::distance(KE::begin(myRowDest), it); } ASSERT_EQ(stdDistance, distancesView_h(i)); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp index fa2804256ac2..365ca21688b4 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp @@ -115,7 +115,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); @@ -161,7 +161,7 @@ void verify_data(ViewType1 data_view, // contains data create_mirror_view_and_copy(Kokkos::HostSpace(), test_view_dc); if (test_view_h.extent(0) > 0) { for (std::size_t i = 0; i < test_view_h.extent(0); ++i) { - if (std::is_same::value) { + if (std::is_same_v) { ASSERT_EQ(gold_h(i), test_view_h(i)); } else { const auto error = std::abs(gold_h(i) - test_view_h(i)); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp index fb81ae91b049..cc8726214786 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp @@ -115,7 +115,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); @@ -173,7 +173,7 @@ void verify_data(ViewType1 data_view, // contains data create_mirror_view_and_copy(Kokkos::HostSpace(), test_view_dc); if (test_view_h.extent(0) > 0) { for (std::size_t i = 0; i < test_view_h.extent(0); ++i) { - if (std::is_same::value) { + if (std::is_same_v) { ASSERT_EQ(gold_h(i), test_view_h(i)); } else { const auto error = std::abs(gold_h(i) - test_view_h(i)); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsUnique.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsUnique.cpp index 9c5ae0cf8a1e..6ee93e3d5fa0 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsUnique.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsUnique.cpp @@ -138,7 +138,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsUniqueCopy.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsUniqueCopy.cpp index 3cf43ad4db8f..e3e969645839 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsUniqueCopy.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsUniqueCopy.cpp @@ -146,7 +146,7 @@ std::size_t fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + Kokkos::abort("invalid choice"); } Kokkos::deep_copy(aux_view, v_h); @@ -235,7 +235,7 @@ void verify_data(const std::string& name, ViewTypeFrom view_from, } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } } diff --git a/packages/kokkos/algorithms/unit_tests/TestStdReducers.cpp b/packages/kokkos/algorithms/unit_tests/TestStdReducers.cpp index c05006a1617c..0044b935587f 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdReducers.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdReducers.cpp @@ -72,7 +72,7 @@ auto create_host_view_with_reduction_order_indices( result(8) = 7; result(9) = 5; } else { - throw std::runtime_error("test: Invalid enum"); + Kokkos::abort("test: Invalid enum"); } return result; @@ -80,7 +80,7 @@ auto create_host_view_with_reduction_order_indices( template auto run_min_or_max_test(ViewType view, StdReducersTestEnumOrder enValue) { - static_assert(std::is_same::value, + static_assert(std::is_same_v, "test is only enabled for HostSpace"); using view_value_type = typename ViewType::value_type; @@ -191,7 +191,7 @@ template void run_min_max_test(ViewType view, StdReducersTestEnumOrder enValue, const ValuesPair gold_values, const IndexPair gold_locs) { - static_assert(std::is_same::value, + static_assert(std::is_same_v, "test is only enabled for HostSpace"); using view_value_type = typename ViewType::value_type; diff --git a/packages/kokkos/benchmarks/CMakeLists.txt b/packages/kokkos/benchmarks/CMakeLists.txt index 529ef393d994..968c8ae3bf59 100644 --- a/packages/kokkos/benchmarks/CMakeLists.txt +++ b/packages/kokkos/benchmarks/CMakeLists.txt @@ -1,12 +1,12 @@ #FIXME_OPENMPTARGET - compiling in debug mode causes ICE. -KOKKOS_ADD_BENCHMARK_DIRECTORIES(atomic) -KOKKOS_ADD_BENCHMARK_DIRECTORIES(gather) -KOKKOS_ADD_BENCHMARK_DIRECTORIES(gups) -KOKKOS_ADD_BENCHMARK_DIRECTORIES(launch_latency) -KOKKOS_ADD_BENCHMARK_DIRECTORIES(stream) -KOKKOS_ADD_BENCHMARK_DIRECTORIES(view_copy_constructor) +kokkos_add_benchmark_directories(atomic) +kokkos_add_benchmark_directories(gather) +kokkos_add_benchmark_directories(gups) +kokkos_add_benchmark_directories(launch_latency) +kokkos_add_benchmark_directories(stream) +kokkos_add_benchmark_directories(view_copy_constructor) #FIXME_OPENMPTARGET - These two benchmarks cause ICE. Commenting them for now but a deeper analysis on the cause and a possible fix will follow. -IF(NOT Kokkos_ENABLE_OPENMPTARGET) - KOKKOS_ADD_BENCHMARK_DIRECTORIES(policy_performance) - KOKKOS_ADD_BENCHMARK_DIRECTORIES(bytes_and_flops) -ENDIF() +if(NOT Kokkos_ENABLE_OPENMPTARGET) + kokkos_add_benchmark_directories(policy_performance) + kokkos_add_benchmark_directories(bytes_and_flops) +endif() diff --git a/packages/kokkos/benchmarks/atomic/CMakeLists.txt b/packages/kokkos/benchmarks/atomic/CMakeLists.txt index 85f7412f492f..7fda2bf6f6a4 100644 --- a/packages/kokkos/benchmarks/atomic/CMakeLists.txt +++ b/packages/kokkos/benchmarks/atomic/CMakeLists.txt @@ -1,4 +1 @@ -KOKKOS_ADD_EXECUTABLE( - atomic - SOURCES main.cpp -) +kokkos_add_executable(atomic SOURCES main.cpp) diff --git a/packages/kokkos/benchmarks/bytes_and_flops/CMakeLists.txt b/packages/kokkos/benchmarks/bytes_and_flops/CMakeLists.txt index 0ce44a6f1a8e..9c65d06ce28e 100644 --- a/packages/kokkos/benchmarks/bytes_and_flops/CMakeLists.txt +++ b/packages/kokkos/benchmarks/bytes_and_flops/CMakeLists.txt @@ -1,4 +1,9 @@ -KOKKOS_ADD_EXECUTABLE( +kokkos_add_executable( bytes_and_flops - SOURCES bench_double.cpp bench_float.cpp bench_int32_t.cpp bench_int64_t.cpp main.cpp + SOURCES + bench_double.cpp + bench_float.cpp + bench_int32_t.cpp + bench_int64_t.cpp + main.cpp ) diff --git a/packages/kokkos/benchmarks/bytes_and_flops/bench_unroll_stride.hpp b/packages/kokkos/benchmarks/bytes_and_flops/bench_unroll_stride.hpp index 78cfd48effec..762cc988f14e 100644 --- a/packages/kokkos/benchmarks/bytes_and_flops/bench_unroll_stride.hpp +++ b/packages/kokkos/benchmarks/bytes_and_flops/bench_unroll_stride.hpp @@ -17,9 +17,9 @@ template struct Run { static void run(int N, int K, int R, int F, int T, int S, int Ba, int I) { - Kokkos::View A("A", N, K); - Kokkos::View B("B", N, K); - Kokkos::View C("C", N, K); + Kokkos::View A("A", N, K); + Kokkos::View B("B", N, K); + Kokkos::View C("C", N, K); Kokkos::deep_copy(A, Scalar(1.5)); Kokkos::deep_copy(B, Scalar(2.5)); diff --git a/packages/kokkos/benchmarks/gather/CMakeLists.txt b/packages/kokkos/benchmarks/gather/CMakeLists.txt index 24c706277259..2de1ce85e637 100644 --- a/packages/kokkos/benchmarks/gather/CMakeLists.txt +++ b/packages/kokkos/benchmarks/gather/CMakeLists.txt @@ -1,4 +1 @@ -KOKKOS_ADD_EXECUTABLE( - gather - SOURCES main.cpp -) +kokkos_add_executable(gather SOURCES main.cpp) diff --git a/packages/kokkos/benchmarks/gups/CMakeLists.txt b/packages/kokkos/benchmarks/gups/CMakeLists.txt index 8de5b73cc67f..dc7074702925 100644 --- a/packages/kokkos/benchmarks/gups/CMakeLists.txt +++ b/packages/kokkos/benchmarks/gups/CMakeLists.txt @@ -1,4 +1 @@ -KOKKOS_ADD_EXECUTABLE( - gups - SOURCES gups.cpp -) +kokkos_add_executable(gups SOURCES gups.cpp) diff --git a/packages/kokkos/benchmarks/gups/gups.cpp b/packages/kokkos/benchmarks/gups/gups.cpp index 369052321d7b..e00f87968bde 100644 --- a/packages/kokkos/benchmarks/gups/gups.cpp +++ b/packages/kokkos/benchmarks/gups/gups.cpp @@ -140,7 +140,7 @@ int run_benchmark(const Index indicesCount, const Index dataCount, break; } default: { - throw std::runtime_error("unexpected mode"); + Kokkos::abort("unexpected mode"); } } diff --git a/packages/kokkos/benchmarks/launch_latency/CMakeLists.txt b/packages/kokkos/benchmarks/launch_latency/CMakeLists.txt index bb14da749d12..4775bf2261e0 100644 --- a/packages/kokkos/benchmarks/launch_latency/CMakeLists.txt +++ b/packages/kokkos/benchmarks/launch_latency/CMakeLists.txt @@ -1,4 +1 @@ -KOKKOS_ADD_EXECUTABLE( - launch_latency - SOURCES launch_latency.cpp -) +kokkos_add_executable(launch_latency SOURCES launch_latency.cpp) diff --git a/packages/kokkos/benchmarks/launch_latency/launch_latency.cpp b/packages/kokkos/benchmarks/launch_latency/launch_latency.cpp index 73b176ab8dd7..156c29af09e6 100644 --- a/packages/kokkos/benchmarks/launch_latency/launch_latency.cpp +++ b/packages/kokkos/benchmarks/launch_latency/launch_latency.cpp @@ -254,7 +254,7 @@ int main(int argc, char* argv[]) { else if (i == 3) K = atoi(arg.data()); else { - throw std::runtime_error("unexpected argument!"); + Kokkos::abort("unexpected argument!"); } } else if (arg == "--no-parallel-for") { opts.par_for = false; @@ -265,7 +265,7 @@ int main(int argc, char* argv[]) { } else { std::stringstream ss; ss << "unexpected argument \"" << arg << "\" at position " << i; - throw std::runtime_error(ss.str()); + Kokkos::abort(ss.str().c_str()); } } diff --git a/packages/kokkos/benchmarks/policy_performance/CMakeLists.txt b/packages/kokkos/benchmarks/policy_performance/CMakeLists.txt index 929b9c970237..4a939775c0bc 100644 --- a/packages/kokkos/benchmarks/policy_performance/CMakeLists.txt +++ b/packages/kokkos/benchmarks/policy_performance/CMakeLists.txt @@ -1,4 +1 @@ -KOKKOS_ADD_EXECUTABLE( - policy_performance - SOURCES main.cpp -) +kokkos_add_executable(policy_performance SOURCES main.cpp) diff --git a/packages/kokkos/benchmarks/stream/CMakeLists.txt b/packages/kokkos/benchmarks/stream/CMakeLists.txt index 0dded6e3a541..b096976c486f 100644 --- a/packages/kokkos/benchmarks/stream/CMakeLists.txt +++ b/packages/kokkos/benchmarks/stream/CMakeLists.txt @@ -1,4 +1 @@ -KOKKOS_ADD_EXECUTABLE( - stream - SOURCES stream-kokkos.cpp -) +kokkos_add_executable(stream SOURCES stream-kokkos.cpp) diff --git a/packages/kokkos/benchmarks/view_copy_constructor/CMakeLists.txt b/packages/kokkos/benchmarks/view_copy_constructor/CMakeLists.txt index 50a331b2b354..f7bbc13b6ec5 100644 --- a/packages/kokkos/benchmarks/view_copy_constructor/CMakeLists.txt +++ b/packages/kokkos/benchmarks/view_copy_constructor/CMakeLists.txt @@ -1,4 +1 @@ -KOKKOS_ADD_EXECUTABLE( - view_copy_constructor - SOURCES view_copy_constructor.cpp -) +kokkos_add_executable(view_copy_constructor SOURCES view_copy_constructor.cpp) diff --git a/packages/kokkos/bin/kokkos_launch_compiler b/packages/kokkos/bin/kokkos_launch_compiler index d1f8896f91b1..ee3c29e96d3a 100755 --- a/packages/kokkos/bin/kokkos_launch_compiler +++ b/packages/kokkos/bin/kokkos_launch_compiler @@ -62,7 +62,7 @@ KOKKOS_COMPILER=${1} shift # store the expected C++ compiler -CXX_COMPILER=${1} +CXX_COMPILER=$(which "${1}") # remove the expected C++ compiler from the arguments shift @@ -84,7 +84,7 @@ shift # kokkos_launch_compiler ${KOKKOS_COMPILER} g++ g++ -c file.cpp -o file.o # results in this command being executed: # ${KOKKOS_COMPILER} -c file.cpp -o file.o -if [[ "${KOKKOS_DEPENDENCE}" -eq "0" || "${CXX_COMPILER}" != "${1}" ]]; then +if [[ "${KOKKOS_DEPENDENCE}" -eq "0" || "${CXX_COMPILER}" != $(which "${1}") ]]; then debug-message "$@" # the command does not depend on Kokkos so just execute the command w/o re-directing to ${KOKKOS_COMPILER} exec "$@" diff --git a/packages/kokkos/cmake/Dependencies.cmake b/packages/kokkos/cmake/Dependencies.cmake index fb1e73b5799c..2f70c2f038c1 100644 --- a/packages/kokkos/cmake/Dependencies.cmake +++ b/packages/kokkos/cmake/Dependencies.cmake @@ -1,5 +1,3 @@ -TRIBITS_PACKAGE_DEFINE_DEPENDENCIES( - LIB_OPTIONAL_TPLS Pthread CUDA HWLOC DLlib - ) +tribits_package_define_dependencies(LIB_OPTIONAL_TPLS Pthread CUDA HWLOC DLlib) -TRIBITS_TPL_TENTATIVELY_ENABLE(DLlib) +tribits_tpl_tentatively_enable(DLlib) diff --git a/packages/kokkos/cmake/KokkosCore_config.h.in b/packages/kokkos/cmake/KokkosCore_config.h.in index a93007ff83f6..44f81bb8cea2 100644 --- a/packages/kokkos/cmake/KokkosCore_config.h.in +++ b/packages/kokkos/cmake/KokkosCore_config.h.in @@ -24,7 +24,6 @@ #cmakedefine KOKKOS_ENABLE_HIP #cmakedefine KOKKOS_ENABLE_HPX #cmakedefine KOKKOS_ENABLE_SYCL -#cmakedefine KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED /* General Settings */ #cmakedefine KOKKOS_ENABLE_CXX17 @@ -40,7 +39,10 @@ #cmakedefine KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY #cmakedefine KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE #cmakedefine KOKKOS_ENABLE_HIP_MULTIPLE_KERNEL_INSTANTIATIONS -#cmakedefine KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY +#cmakedefine KOKKOS_ENABLE_IMPL_HIP_MALLOC_ASYNC +#cmakedefine KOKKOS_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE +#cmakedefine KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED +#cmakedefine KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE #cmakedefine KOKKOS_ENABLE_IMPL_HPX_ASYNC_DISPATCH #cmakedefine KOKKOS_ENABLE_DEBUG #cmakedefine KOKKOS_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK @@ -80,6 +82,7 @@ #cmakedefine KOKKOS_ARCH_POWER8 #cmakedefine KOKKOS_ARCH_POWER9 #cmakedefine KOKKOS_ARCH_RISCV_SG2042 +#cmakedefine KOKKOS_ARCH_RISCV_RVA22V #cmakedefine KOKKOS_ARCH_INTEL_GEN #cmakedefine KOKKOS_ARCH_INTEL_DG1 #cmakedefine KOKKOS_ARCH_INTEL_GEN9 @@ -118,9 +121,11 @@ #cmakedefine KOKKOS_ARCH_AMD_GFX90A #cmakedefine KOKKOS_ARCH_AMD_GFX940 #cmakedefine KOKKOS_ARCH_AMD_GFX942 +#cmakedefine KOKKOS_ARCH_AMD_GFX942_APU #cmakedefine KOKKOS_ARCH_AMD_GFX1030 #cmakedefine KOKKOS_ARCH_AMD_GFX1100 -#cmakedefine KOKKOS_ARCH_AMD_GPU +#cmakedefine KOKKOS_ARCH_AMD_GFX1103 +#cmakedefine KOKKOS_ARCH_AMD_GPU "@KOKKOS_ARCH_AMD_GPU@" #cmakedefine KOKKOS_ARCH_VEGA // deprecated #cmakedefine KOKKOS_ARCH_VEGA906 // deprecated #cmakedefine KOKKOS_ARCH_VEGA908 // deprecated diff --git a/packages/kokkos/cmake/KokkosTrilinosConfig.cmake.in b/packages/kokkos/cmake/KokkosTrilinosConfig.cmake.in deleted file mode 100644 index 626ef5a8ebef..000000000000 --- a/packages/kokkos/cmake/KokkosTrilinosConfig.cmake.in +++ /dev/null @@ -1,17 +0,0 @@ -IF (NOT TARGET Kokkos::kokkos) - # Compute the installation prefix relative to this file. - get_filename_component(KOKKOS_IMPORT_PREFIX "${CMAKE_CURRENT_LIST_FILE}" PATH) - get_filename_component(KOKKOS_IMPORT_PREFIX "${KOKKOS_IMPORT_PREFIX}" PATH) - get_filename_component(KOKKOS_IMPORT_PREFIX "${KOKKOS_IMPORT_PREFIX}" PATH) - get_filename_component(KOKKOS_IMPORT_PREFIX "${KOKKOS_IMPORT_PREFIX}" PATH) - if(KOKKOS_IMPORT_PREFIX STREQUAL "/") - set(KOKKOS_IMPORT_PREFIX "") - endif() - add_library(Kokkos::kokkos INTERFACE IMPORTED) - set_target_properties(Kokkos::kokkos PROPERTIES - INTERFACE_LINK_LIBRARIES "@Kokkos_LIBRARIES@;@KOKKOS_LINK_OPTIONS@" - INTERFACE_COMPILE_FEATURES "@KOKKOS_CXX_STANDARD_FEATURE@" - INTERFACE_COMPILE_OPTIONS "@KOKKOS_ALL_COMPILE_OPTIONS@" - INTERFACE_INCLUDE_DIRECTORIES "${KOKKOS_IMPORT_PREFIX}/include" - ) -ENDIF() diff --git a/packages/kokkos/cmake/Modules/CudaToolkit.cmake b/packages/kokkos/cmake/Modules/CudaToolkit.cmake index eda5541f7c06..b8ac2048b5fc 100644 --- a/packages/kokkos/cmake/Modules/CudaToolkit.cmake +++ b/packages/kokkos/cmake/Modules/CudaToolkit.cmake @@ -483,38 +483,40 @@ endif() # Try language- or user-provided path first. if(CUDAToolkit_BIN_DIR) - find_program(CUDAToolkit_NVCC_EXECUTABLE + find_program( + CUDAToolkit_NVCC_EXECUTABLE NAMES nvcc nvcc.exe PATHS ${CUDAToolkit_BIN_DIR} NO_DEFAULT_PATH - ) + ) endif() # Search using CUDAToolkit_ROOT -find_program(CUDAToolkit_NVCC_EXECUTABLE +find_program( + CUDAToolkit_NVCC_EXECUTABLE NAMES nvcc nvcc.exe PATHS ENV CUDA_PATH PATH_SUFFIXES bin ) # If the user specified CUDAToolkit_ROOT but nvcc could not be found, this is an error. -if (NOT CUDAToolkit_NVCC_EXECUTABLE AND (DEFINED CUDAToolkit_ROOT OR DEFINED ENV{CUDAToolkit_ROOT})) +if(NOT CUDAToolkit_NVCC_EXECUTABLE AND (DEFINED CUDAToolkit_ROOT OR DEFINED ENV{CUDAToolkit_ROOT})) # Declare error messages now, print later depending on find_package args. set(fail_base "Could not find nvcc executable in path specified by") set(cuda_root_fail "${fail_base} CUDAToolkit_ROOT=${CUDAToolkit_ROOT}") set(env_cuda_root_fail "${fail_base} environment variable CUDAToolkit_ROOT=$ENV{CUDAToolkit_ROOT}") - if (CUDAToolkit_FIND_REQUIRED) - if (DEFINED CUDAToolkit_ROOT) + if(CUDAToolkit_FIND_REQUIRED) + if(DEFINED CUDAToolkit_ROOT) message(FATAL_ERROR ${cuda_root_fail}) - elseif (DEFINED ENV{CUDAToolkit_ROOT}) + elseif(DEFINED ENV{CUDAToolkit_ROOT}) message(FATAL_ERROR ${env_cuda_root_fail}) endif() else() - if (NOT CUDAToolkit_FIND_QUIETLY) - if (DEFINED CUDAToolkit_ROOT) + if(NOT CUDAToolkit_FIND_QUIETLY) + if(DEFINED CUDAToolkit_ROOT) message(STATUS ${cuda_root_fail}) - elseif (DEFINED ENV{CUDAToolkit_ROOT}) + elseif(DEFINED ENV{CUDAToolkit_ROOT}) message(STATUS ${env_cuda_root_fail}) endif() endif() @@ -535,9 +537,9 @@ endif() # We will also search the default symlink location /usr/local/cuda first since # if CUDAToolkit_ROOT is not specified, it is assumed that the symlinked # directory is the desired location. -if (NOT CUDAToolkit_NVCC_EXECUTABLE) - if (UNIX) - if (NOT APPLE) +if(NOT CUDAToolkit_NVCC_EXECUTABLE) + if(UNIX) + if(NOT APPLE) set(platform_base "/usr/local/cuda-") else() set(platform_base "/Developer/NVIDIA/CUDA-") @@ -550,10 +552,10 @@ if (NOT CUDAToolkit_NVCC_EXECUTABLE) file(GLOB possible_paths "${platform_base}*") # Iterate the glob results and create a descending list. set(possible_versions) - foreach (p ${possible_paths}) + foreach(p ${possible_paths}) # Extract version number from end of string string(REGEX MATCH "[0-9][0-9]?\\.[0-9]$" p_version ${p}) - if (IS_DIRECTORY ${p} AND p_version) + if(IS_DIRECTORY ${p} AND p_version) list(APPEND possible_versions ${p_version}) endif() endforeach() @@ -563,10 +565,10 @@ if (NOT CUDAToolkit_NVCC_EXECUTABLE) # every possible version of CUDA installed, this wouldn't create any # significant overhead. set(versions) - foreach (v ${possible_versions}) + foreach(v ${possible_versions}) list(LENGTH versions num_versions) # First version, nothing to compare with so just append. - if (num_versions EQUAL 0) + if(num_versions EQUAL 0) list(APPEND versions ${v}) else() # Loop through list. Insert at an index when comparison is @@ -574,9 +576,9 @@ if (NOT CUDAToolkit_NVCC_EXECUTABLE) # happen since this came from a glob list of directories. set(i 0) set(early_terminate FALSE) - while (i LESS num_versions) + while(i LESS num_versions) list(GET versions ${i} curr) - if (v VERSION_GREATER curr) + if(v VERSION_GREATER curr) list(INSERT versions ${i} ${v}) set(early_terminate TRUE) break() @@ -584,7 +586,7 @@ if (NOT CUDAToolkit_NVCC_EXECUTABLE) math(EXPR i "${i} + 1") endwhile() # If it did not get inserted, place it at the end. - if (NOT early_terminate) + if(NOT early_terminate) list(APPEND versions ${v}) endif() endif() @@ -592,17 +594,18 @@ if (NOT CUDAToolkit_NVCC_EXECUTABLE) # With a descending list of versions, populate possible paths to search. set(search_paths) - foreach (v ${versions}) + foreach(v ${versions}) list(APPEND search_paths "${platform_base}${v}") endforeach() # Force the global default /usr/local/cuda to the front on Unix. - if (UNIX) + if(UNIX) list(INSERT search_paths 0 "/usr/local/cuda") endif() # Now search for nvcc again using the platform default search paths. - find_program(CUDAToolkit_NVCC_EXECUTABLE + find_program( + CUDAToolkit_NVCC_EXECUTABLE NAMES nvcc nvcc.exe PATHS ${search_paths} PATH_SUFFIXES bin @@ -617,8 +620,8 @@ if (NOT CUDAToolkit_NVCC_EXECUTABLE) unset(early_terminate) unset(search_paths) - if (NOT CUDAToolkit_NVCC_EXECUTABLE) - if (CUDAToolkit_FIND_REQUIRED) + if(NOT CUDAToolkit_NVCC_EXECUTABLE) + if(CUDAToolkit_FIND_REQUIRED) message(FATAL_ERROR "Could not find nvcc, please set CUDAToolkit_ROOT.") elseif(NOT CUDAToolkit_FIND_QUIETLY) message(STATUS "Could not find nvcc, please set CUDAToolkit_ROOT.") @@ -636,8 +639,7 @@ if(NOT CUDAToolkit_BIN_DIR AND CUDAToolkit_NVCC_EXECUTABLE) unset(cuda_dir) endif() -if(CUDAToolkit_NVCC_EXECUTABLE AND - CUDAToolkit_NVCC_EXECUTABLE STREQUAL CMAKE_CUDA_COMPILER) +if(CUDAToolkit_NVCC_EXECUTABLE AND CUDAToolkit_NVCC_EXECUTABLE STREQUAL CMAKE_CUDA_COMPILER) # Need to set these based off the already computed CMAKE_CUDA_COMPILER_VERSION value # This if statement will always match, but is used to provide variables for MATCH 1,2,3... if(CMAKE_CUDA_COMPILER_VERSION MATCHES [=[([0-9]+)\.([0-9]+)\.([0-9]+)]=]) @@ -648,39 +650,38 @@ if(CUDAToolkit_NVCC_EXECUTABLE AND endif() else() # Compute the version by invoking nvcc - execute_process (COMMAND ${CUDAToolkit_NVCC_EXECUTABLE} "--version" OUTPUT_VARIABLE NVCC_OUT) + execute_process(COMMAND ${CUDAToolkit_NVCC_EXECUTABLE} "--version" OUTPUT_VARIABLE NVCC_OUT) if(NVCC_OUT MATCHES [=[ V([0-9]+)\.([0-9]+)\.([0-9]+)]=]) set(CUDAToolkit_VERSION_MAJOR "${CMAKE_MATCH_1}") set(CUDAToolkit_VERSION_MINOR "${CMAKE_MATCH_2}") set(CUDAToolkit_VERSION_PATCH "${CMAKE_MATCH_3}") - set(CUDAToolkit_VERSION "${CMAKE_MATCH_1}.${CMAKE_MATCH_2}.${CMAKE_MATCH_3}") + set(CUDAToolkit_VERSION "${CMAKE_MATCH_1}.${CMAKE_MATCH_2}.${CMAKE_MATCH_3}") endif() unset(NVCC_OUT) endif() - get_filename_component(CUDAToolkit_ROOT_DIR ${CUDAToolkit_BIN_DIR} DIRECTORY ABSOLUTE) # Handle cross compilation if(CMAKE_CROSSCOMPILING) if(CMAKE_SYSTEM_PROCESSOR STREQUAL "armv7-a") # Support for NVPACK - set (CUDAToolkit_TARGET_NAME "armv7-linux-androideabi") + set(CUDAToolkit_TARGET_NAME "armv7-linux-androideabi") elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm") # Support for arm cross compilation set(CUDAToolkit_TARGET_NAME "armv7-linux-gnueabihf") elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") # Support for aarch64 cross compilation - if (ANDROID_ARCH_NAME STREQUAL "arm64") + if(ANDROID_ARCH_NAME STREQUAL "arm64") set(CUDAToolkit_TARGET_NAME "aarch64-linux-androideabi") else() set(CUDAToolkit_TARGET_NAME "aarch64-linux") - endif (ANDROID_ARCH_NAME STREQUAL "arm64") + endif(ANDROID_ARCH_NAME STREQUAL "arm64") elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") - set(CUDAToolkit_TARGET_NAME "x86_64-linux") + set(CUDAToolkit_TARGET_NAME "x86_64-linux") endif() - if (EXISTS "${CUDAToolkit_ROOT_DIR}/targets/${CUDAToolkit_TARGET_NAME}") + if(EXISTS "${CUDAToolkit_ROOT_DIR}/targets/${CUDAToolkit_TARGET_NAME}") set(CUDAToolkit_TARGET_DIR "${CUDAToolkit_ROOT_DIR}/targets/${CUDAToolkit_TARGET_NAME}") # add known CUDA target root path to the set of directories we search for programs, libraries and headers list(PREPEND CMAKE_FIND_ROOT_PATH "${CUDAToolkit_TARGET_DIR}") @@ -702,25 +703,16 @@ else() set(_CUDAToolkit_Pop_Prefix True) endif() - # Find the include/ directory -find_path(CUDAToolkit_INCLUDE_DIR - NAMES cuda_runtime.h -) +find_path(CUDAToolkit_INCLUDE_DIR NAMES cuda_runtime.h) # And find the CUDA Runtime Library libcudart -find_library(CUDA_CUDART - NAMES cudart - PATH_SUFFIXES lib64 lib/x64 -) -if (NOT CUDA_CUDART) - find_library(CUDA_CUDART - NAMES cudart - PATH_SUFFIXES lib64/stubs lib/x64/stubs - ) +find_library(CUDA_CUDART NAMES cudart PATH_SUFFIXES lib64 lib/x64) +if(NOT CUDA_CUDART) + find_library(CUDA_CUDART NAMES cudart PATH_SUFFIXES lib64/stubs lib/x64/stubs) endif() -if (NOT CUDA_CUDART AND NOT CUDAToolkit_FIND_QUIETLY) +if(NOT CUDA_CUDART AND NOT CUDAToolkit_FIND_QUIETLY) message(STATUS "Unable to find cudart library.") endif() @@ -733,24 +725,17 @@ endif() #----------------------------------------------------------------------------- # Perform version comparison and validate all required variables are set. include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(CUDAToolkit - REQUIRED_VARS - CUDAToolkit_INCLUDE_DIR - CUDA_CUDART - CUDAToolkit_NVCC_EXECUTABLE - VERSION_VAR - CUDAToolkit_VERSION +find_package_handle_standard_args( + CUDAToolkit REQUIRED_VARS CUDAToolkit_INCLUDE_DIR CUDA_CUDART CUDAToolkit_NVCC_EXECUTABLE + VERSION_VAR CUDAToolkit_VERSION ) -mark_as_advanced(CUDA_CUDART - CUDAToolkit_INCLUDE_DIR - CUDAToolkit_NVCC_EXECUTABLE - ) +mark_as_advanced(CUDA_CUDART CUDAToolkit_INCLUDE_DIR CUDAToolkit_NVCC_EXECUTABLE) #----------------------------------------------------------------------------- # Construct result variables if(CUDAToolkit_FOUND) - set(CUDAToolkit_INCLUDE_DIRS ${CUDAToolkit_INCLUDE_DIR}) - get_filename_component(CUDAToolkit_LIBRARY_DIR ${CUDA_CUDART} DIRECTORY ABSOLUTE) + set(CUDAToolkit_INCLUDE_DIRS ${CUDAToolkit_INCLUDE_DIR}) + get_filename_component(CUDAToolkit_LIBRARY_DIR ${CUDA_CUDART} DIRECTORY ABSOLUTE) endif() #----------------------------------------------------------------------------- @@ -762,27 +747,26 @@ if(CUDAToolkit_FOUND) set(search_names ${lib_name} ${arg_ALT}) - find_library(CUDA_${lib_name}_LIBRARY + find_library( + CUDA_${lib_name}_LIBRARY NAMES ${search_names} - HINTS ${CUDAToolkit_LIBRARY_DIR} - ENV CUDA_PATH - PATH_SUFFIXES nvidia/current lib64 lib/x64 lib - ${arg_EXTRA_PATH_SUFFIXES} + HINTS ${CUDAToolkit_LIBRARY_DIR} ENV CUDA_PATH + PATH_SUFFIXES nvidia/current lib64 lib/x64 lib ${arg_EXTRA_PATH_SUFFIXES} ) # Don't try any stub directories intil we have exhausted all other # search locations. if(NOT CUDA_${lib_name}_LIBRARY) - find_library(CUDA_${lib_name}_LIBRARY + find_library( + CUDA_${lib_name}_LIBRARY NAMES ${search_names} - HINTS ${CUDAToolkit_LIBRARY_DIR} - ENV CUDA_PATH + HINTS ${CUDAToolkit_LIBRARY_DIR} ENV CUDA_PATH PATH_SUFFIXES lib64/stubs lib/x64/stubs lib/stubs stubs ) endif() mark_as_advanced(CUDA_${lib_name}_LIBRARY) - if (NOT TARGET CUDA::${lib_name} AND CUDA_${lib_name}_LIBRARY) + if(NOT TARGET CUDA::${lib_name} AND CUDA_${lib_name}_LIBRARY) add_library(CUDA::${lib_name} IMPORTED INTERFACE) target_include_directories(CUDA::${lib_name} SYSTEM INTERFACE "${CUDAToolkit_INCLUDE_DIRS}") target_link_libraries(CUDA::${lib_name} INTERFACE "${CUDA_${lib_name}_LIBRARY}") @@ -800,16 +784,15 @@ if(CUDAToolkit_FOUND) target_link_directories(CUDA::toolkit INTERFACE "${CUDAToolkit_LIBRARY_DIR}") endif() - _CUDAToolkit_find_and_add_import_lib(cuda_driver ALT cuda) + _cudatoolkit_find_and_add_import_lib(cuda_driver ALT cuda) - _CUDAToolkit_find_and_add_import_lib(cudart) - _CUDAToolkit_find_and_add_import_lib(cudart_static) + _cudatoolkit_find_and_add_import_lib(cudart) + _cudatoolkit_find_and_add_import_lib(cudart_static) # setup dependencies that are required for cudart_static when building # on linux. These are generally only required when using the CUDA toolkit # when CUDA language is disabled - if(NOT TARGET CUDA::cudart_static_deps - AND TARGET CUDA::cudart_static) + if(NOT TARGET CUDA::cudart_static_deps AND TARGET CUDA::cudart_static) add_library(CUDA::cudart_static_deps IMPORTED INTERFACE) target_link_libraries(CUDA::cudart_static INTERFACE CUDA::cudart_static_deps) @@ -831,55 +814,64 @@ if(CUDAToolkit_FOUND) endif() endif() - _CUDAToolkit_find_and_add_import_lib(culibos) # it's a static library - foreach (cuda_lib cublas cufft curand cusparse nppc nvjpeg) - _CUDAToolkit_find_and_add_import_lib(${cuda_lib}) - _CUDAToolkit_find_and_add_import_lib(${cuda_lib}_static DEPS culibos) + _cudatoolkit_find_and_add_import_lib(culibos) # it's a static library + foreach(cuda_lib cublas cufft curand cusparse nppc nvjpeg) + _cudatoolkit_find_and_add_import_lib(${cuda_lib}) + _cudatoolkit_find_and_add_import_lib(${cuda_lib}_static DEPS culibos) endforeach() # cuFFTW depends on cuFFT - _CUDAToolkit_find_and_add_import_lib(cufftw DEPS cufft) - _CUDAToolkit_find_and_add_import_lib(cufftw DEPS cufft_static) + _cudatoolkit_find_and_add_import_lib(cufftw DEPS cufft) + _cudatoolkit_find_and_add_import_lib(cufftw DEPS cufft_static) # cuSOLVER depends on cuBLAS, and cuSPARSE - _CUDAToolkit_find_and_add_import_lib(cusolver DEPS cublas cusparse) - _CUDAToolkit_find_and_add_import_lib(cusolver_static DEPS cublas_static cusparse_static culibos) + _cudatoolkit_find_and_add_import_lib(cusolver DEPS cublas cusparse) + _cudatoolkit_find_and_add_import_lib(cusolver_static DEPS cublas_static cusparse_static culibos) # nvGRAPH depends on cuRAND, and cuSOLVER. - _CUDAToolkit_find_and_add_import_lib(nvgraph DEPS curand cusolver) - _CUDAToolkit_find_and_add_import_lib(nvgraph_static DEPS curand_static cusolver_static) + _cudatoolkit_find_and_add_import_lib(nvgraph DEPS curand cusolver) + _cudatoolkit_find_and_add_import_lib(nvgraph_static DEPS curand_static cusolver_static) # Process the majority of the NPP libraries. - foreach (cuda_lib nppial nppicc nppidei nppif nppig nppim nppist nppitc npps nppicom nppisu) - _CUDAToolkit_find_and_add_import_lib(${cuda_lib} DEPS nppc) - _CUDAToolkit_find_and_add_import_lib(${cuda_lib}_static DEPS nppc_static) + foreach( + cuda_lib + nppial + nppicc + nppidei + nppif + nppig + nppim + nppist + nppitc + npps + nppicom + nppisu + ) + _cudatoolkit_find_and_add_import_lib(${cuda_lib} DEPS nppc) + _cudatoolkit_find_and_add_import_lib(${cuda_lib}_static DEPS nppc_static) endforeach() - _CUDAToolkit_find_and_add_import_lib(cupti - EXTRA_PATH_SUFFIXES ../extras/CUPTI/lib64/ - ../extras/CUPTI/lib/) - _CUDAToolkit_find_and_add_import_lib(cupti_static - EXTRA_PATH_SUFFIXES ../extras/CUPTI/lib64/ - ../extras/CUPTI/lib/) + _cudatoolkit_find_and_add_import_lib(cupti EXTRA_PATH_SUFFIXES ../extras/CUPTI/lib64/ ../extras/CUPTI/lib/) + _cudatoolkit_find_and_add_import_lib(cupti_static EXTRA_PATH_SUFFIXES ../extras/CUPTI/lib64/ ../extras/CUPTI/lib/) - _CUDAToolkit_find_and_add_import_lib(nvrtc DEPS cuda_driver) + _cudatoolkit_find_and_add_import_lib(nvrtc DEPS cuda_driver) - _CUDAToolkit_find_and_add_import_lib(nvml ALT nvidia-ml nvml) + _cudatoolkit_find_and_add_import_lib(nvml ALT nvidia-ml nvml) if(WIN32) # nvtools can be installed outside the CUDA toolkit directory # so prefer the NVTOOLSEXT_PATH windows only environment variable # In addition on windows the most common name is nvToolsExt64_1 - find_library(CUDA_nvToolsExt_LIBRARY + find_library( + CUDA_nvToolsExt_LIBRARY NAMES nvToolsExt64_1 nvToolsExt64 nvToolsExt - PATHS ENV NVTOOLSEXT_PATH - ENV CUDA_PATH + PATHS ENV NVTOOLSEXT_PATH ENV CUDA_PATH PATH_SUFFIXES lib/x64 lib ) endif() - _CUDAToolkit_find_and_add_import_lib(nvToolsExt ALT nvToolsExt64) + _cudatoolkit_find_and_add_import_lib(nvToolsExt ALT nvToolsExt64) - _CUDAToolkit_find_and_add_import_lib(OpenCL) + _cudatoolkit_find_and_add_import_lib(OpenCL) endif() if(_CUDAToolkit_Pop_ROOT_PATH) diff --git a/packages/kokkos/cmake/Modules/FindTPLCUDA.cmake b/packages/kokkos/cmake/Modules/FindTPLCUDA.cmake index 445f4e93a592..3a6a826197ec 100644 --- a/packages/kokkos/cmake/Modules/FindTPLCUDA.cmake +++ b/packages/kokkos/cmake/Modules/FindTPLCUDA.cmake @@ -1,44 +1,40 @@ -IF (NOT CUDAToolkit_ROOT) - IF (NOT CUDA_ROOT) - SET(CUDA_ROOT $ENV{CUDA_ROOT}) - ENDIF() - IF(CUDA_ROOT) - SET(CUDAToolkit_ROOT ${CUDA_ROOT}) - ENDIF() -ENDIF() +if(NOT CUDAToolkit_ROOT) + if(NOT CUDA_ROOT) + set(CUDA_ROOT $ENV{CUDA_ROOT}) + endif() + if(CUDA_ROOT) + set(CUDAToolkit_ROOT ${CUDA_ROOT}) + endif() +endif() -IF(KOKKOS_CXX_HOST_COMPILER_ID STREQUAL NVHPC AND CMAKE_VERSION VERSION_LESS "3.20.1") - MESSAGE(FATAL_ERROR "Using NVHPC as host compiler requires at least CMake 3.20.1") -ENDIF() +if(KOKKOS_CXX_HOST_COMPILER_ID STREQUAL NVHPC AND CMAKE_VERSION VERSION_LESS "3.20.1") + message(FATAL_ERROR "Using NVHPC as host compiler requires at least CMake 3.20.1") +endif() -IF(CMAKE_VERSION VERSION_GREATER_EQUAL "3.17.0") +if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.17.0") find_package(CUDAToolkit REQUIRED) - KOKKOS_CREATE_IMPORTED_TPL(CUDA INTERFACE - LINK_LIBRARIES CUDA::cuda_driver CUDA::cudart - ) - KOKKOS_EXPORT_CMAKE_TPL(CUDAToolkit REQUIRED) -ELSE() + kokkos_create_imported_tpl(CUDA INTERFACE LINK_LIBRARIES CUDA::cuda_driver CUDA::cudart) + kokkos_export_cmake_tpl(CUDAToolkit REQUIRED) +else() include(${CMAKE_CURRENT_LIST_DIR}/CudaToolkit.cmake) - IF (TARGET CUDA::cudart) - SET(FOUND_CUDART TRUE) - KOKKOS_EXPORT_IMPORTED_TPL(CUDA::cudart) - ELSE() - SET(FOUND_CUDART FALSE) - ENDIF() + if(TARGET CUDA::cudart) + set(FOUND_CUDART TRUE) + kokkos_export_imported_tpl(CUDA::cudart) + else() + set(FOUND_CUDART FALSE) + endif() - IF (TARGET CUDA::cuda_driver) - SET(FOUND_CUDA_DRIVER TRUE) - KOKKOS_EXPORT_IMPORTED_TPL(CUDA::cuda_driver) - ELSE() - SET(FOUND_CUDA_DRIVER FALSE) - ENDIF() + if(TARGET CUDA::cuda_driver) + set(FOUND_CUDA_DRIVER TRUE) + kokkos_export_imported_tpl(CUDA::cuda_driver) + else() + set(FOUND_CUDA_DRIVER FALSE) + endif() include(FindPackageHandleStandardArgs) - FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLCUDA ${DEFAULT_MSG} FOUND_CUDART FOUND_CUDA_DRIVER) - IF (FOUND_CUDA_DRIVER AND FOUND_CUDART) - KOKKOS_CREATE_IMPORTED_TPL(CUDA INTERFACE - LINK_LIBRARIES CUDA::cuda_driver CUDA::cudart - ) - ENDIF() -ENDIF() + find_package_handle_standard_args(TPLCUDA ${DEFAULT_MSG} FOUND_CUDART FOUND_CUDA_DRIVER) + if(FOUND_CUDA_DRIVER AND FOUND_CUDART) + kokkos_create_imported_tpl(CUDA INTERFACE LINK_LIBRARIES CUDA::cuda_driver CUDA::cudart) + endif() +endif() diff --git a/packages/kokkos/cmake/Modules/FindTPLHPX.cmake b/packages/kokkos/cmake/Modules/FindTPLHPX.cmake index d7b54fb9c9ab..e3c199b7c5de 100644 --- a/packages/kokkos/cmake/Modules/FindTPLHPX.cmake +++ b/packages/kokkos/cmake/Modules/FindTPLHPX.cmake @@ -1,15 +1,10 @@ - -FIND_PACKAGE(HPX REQUIRED 1.8.0) +find_package(HPX REQUIRED 1.8.0) #as of right now, HPX doesn't export correctly #so let's convert it to an interface target -KOKKOS_CREATE_IMPORTED_TPL(HPX INTERFACE - LINK_LIBRARIES ${HPX_LIBRARIES} - INCLUDES ${HPX_INCLUDE_DIRS} -) +kokkos_create_imported_tpl(HPX INTERFACE LINK_LIBRARIES ${HPX_LIBRARIES} INCLUDES ${HPX_INCLUDE_DIRS}) #this is a bit funky since this is a CMake target #but HPX doesn't export itself correctly -KOKKOS_EXPORT_CMAKE_TPL(HPX) +kokkos_export_cmake_tpl(HPX) #I would prefer all of this gets replaced with #KOKKOS_IMPORT_CMAKE_TPL(HPX) - diff --git a/packages/kokkos/cmake/Modules/FindTPLHWLOC.cmake b/packages/kokkos/cmake/Modules/FindTPLHWLOC.cmake index cf763b7e5bb5..77ce8c71f730 100644 --- a/packages/kokkos/cmake/Modules/FindTPLHWLOC.cmake +++ b/packages/kokkos/cmake/Modules/FindTPLHWLOC.cmake @@ -1 +1 @@ -KOKKOS_FIND_IMPORTED(HWLOC HEADER hwloc.h LIBRARY hwloc) +kokkos_find_imported(HWLOC HEADER hwloc.h LIBRARY hwloc) diff --git a/packages/kokkos/cmake/Modules/FindTPLLIBDL.cmake b/packages/kokkos/cmake/Modules/FindTPLLIBDL.cmake index 8adcdcdbb8e3..85ae0b82244c 100644 --- a/packages/kokkos/cmake/Modules/FindTPLLIBDL.cmake +++ b/packages/kokkos/cmake/Modules/FindTPLLIBDL.cmake @@ -1 +1 @@ -KOKKOS_FIND_IMPORTED(LIBDL HEADER dlfcn.h INTERFACE LIBRARIES ${CMAKE_DL_LIBS}) +kokkos_find_imported(LIBDL HEADER dlfcn.h INTERFACE LIBRARIES ${CMAKE_DL_LIBS}) diff --git a/packages/kokkos/cmake/Modules/FindTPLLIBQUADMATH.cmake b/packages/kokkos/cmake/Modules/FindTPLLIBQUADMATH.cmake index 70e0d6c454ad..ce428b0aeec6 100644 --- a/packages/kokkos/cmake/Modules/FindTPLLIBQUADMATH.cmake +++ b/packages/kokkos/cmake/Modules/FindTPLLIBQUADMATH.cmake @@ -2,17 +2,19 @@ # (which would not be contained in CMake's search paths anyway). # Hence, try if the compiler supports libquadmath natively first before doing # the standard package search. -SET(CMAKE_REQUIRED_LIBRARIES "quadmath") -INCLUDE(CheckCXXSourceCompiles) -CHECK_CXX_SOURCE_COMPILES(" +set(CMAKE_REQUIRED_LIBRARIES "quadmath") +include(CheckCXXSourceCompiles) +check_cxx_source_compiles( + " #include int main(void){ __float128 foo = ::sqrtq(123.456); return foo; }" - KOKKOS_QUADMATH_COMPILER_SUPPORT) -IF (KOKKOS_QUADMATH_COMPILER_SUPPORT) - KOKKOS_CREATE_IMPORTED_TPL(LIBQUADMATH INTERFACE LINK_LIBRARIES quadmath) -ELSE() - KOKKOS_FIND_IMPORTED(LIBQUADMATH HEADER quadmath.h LIBRARY quadmath) -ENDIF() + KOKKOS_QUADMATH_COMPILER_SUPPORT +) +if(KOKKOS_QUADMATH_COMPILER_SUPPORT) + kokkos_create_imported_tpl(LIBQUADMATH INTERFACE LINK_LIBRARIES quadmath) +else() + kokkos_find_imported(LIBQUADMATH HEADER quadmath.h LIBRARY quadmath) +endif() diff --git a/packages/kokkos/cmake/Modules/FindTPLONEDPL.cmake b/packages/kokkos/cmake/Modules/FindTPLONEDPL.cmake index 603510c315e4..68de942a6983 100644 --- a/packages/kokkos/cmake/Modules/FindTPLONEDPL.cmake +++ b/packages/kokkos/cmake/Modules/FindTPLONEDPL.cmake @@ -1,9 +1,10 @@ -INCLUDE(CheckIncludeFileCXX) -CHECK_INCLUDE_FILE_CXX(oneapi/dpl/execution KOKKOS_COMPILER_HAS_ONEDPL_EXECUTION_HEADER) -CHECK_INCLUDE_FILE_CXX(oneapi/dpl/algorithm KOKKOS_COMPILER_HAS_ONEDPL_ALGORITHM_HEADER) +include(CheckIncludeFileCXX) +check_include_file_cxx(oneapi/dpl/execution KOKKOS_COMPILER_HAS_ONEDPL_EXECUTION_HEADER) +check_include_file_cxx(oneapi/dpl/algorithm KOKKOS_COMPILER_HAS_ONEDPL_ALGORITHM_HEADER) -INCLUDE(CheckCXXSourceCompiles) -CHECK_CXX_SOURCE_COMPILES(" +include(CheckCXXSourceCompiles) +check_cxx_source_compiles( + " #include int main() @@ -13,37 +14,40 @@ CHECK_CXX_SOURCE_COMPILES(" #endif return 0; }" - KOKKOS_NO_TBB_CONFLICT) + KOKKOS_NO_TBB_CONFLICT +) -IF (KOKKOS_COMPILER_HAS_ONEDPL_EXECUTION_HEADER AND KOKKOS_COMPILER_HAS_ONEDPL_ALGORITHM_HEADER) - IF(KOKKOS_NO_TBB_CONFLICT) - KOKKOS_CREATE_IMPORTED_TPL( - ONEDPL INTERFACE - ) - ELSE() - KOKKOS_CREATE_IMPORTED_TPL( - ONEDPL INTERFACE +if(KOKKOS_COMPILER_HAS_ONEDPL_EXECUTION_HEADER AND KOKKOS_COMPILER_HAS_ONEDPL_ALGORITHM_HEADER) + if(KOKKOS_NO_TBB_CONFLICT) + kokkos_create_imported_tpl(ONEDPL INTERFACE) + else() + kokkos_create_imported_tpl( + ONEDPL + INTERFACE # https://stackoverflow.com/questions/67923287/how-to-resolve-no-member-named-task-in-namespace-tbb-error-when-using-oned/ - COMPILE_DEFINITIONS PSTL_USE_PARALLEL_POLICIES=0 _GLIBCXX_USE_TBB_PAR_BACKEND=0 + COMPILE_DEFINITIONS + PSTL_USE_PARALLEL_POLICIES=0 + _GLIBCXX_USE_TBB_PAR_BACKEND=0 ) - ENDIF() -ELSE() - FIND_PACKAGE(oneDPL REQUIRED) + endif() +else() + find_package(oneDPL REQUIRED) - IF(KOKKOS_NO_TBB_CONFLICT) - KOKKOS_CREATE_IMPORTED_TPL( - ONEDPL INTERFACE - LINK_LIBRARIES oneDPL - ) - ELSE() - KOKKOS_CREATE_IMPORTED_TPL( - ONEDPL INTERFACE - LINK_LIBRARIES oneDPL + if(KOKKOS_NO_TBB_CONFLICT) + kokkos_create_imported_tpl(ONEDPL INTERFACE LINK_LIBRARIES oneDPL) + else() + kokkos_create_imported_tpl( + ONEDPL + INTERFACE + LINK_LIBRARIES + oneDPL # https://stackoverflow.com/questions/67923287/how-to-resolve-no-member-named-task-in-namespace-tbb-error-when-using-oned/ - COMPILE_DEFINITIONS PSTL_USE_PARALLEL_POLICIES=0 _GLIBCXX_USE_TBB_PAR_BACKEND=0 + COMPILE_DEFINITIONS + PSTL_USE_PARALLEL_POLICIES=0 + _GLIBCXX_USE_TBB_PAR_BACKEND=0 ) - ENDIF() + endif() # Export oneDPL as a Kokkos dependency - KOKKOS_EXPORT_CMAKE_TPL(oneDPL) -ENDIF() + kokkos_export_cmake_tpl(oneDPL) +endif() diff --git a/packages/kokkos/cmake/Modules/FindTPLROCM.cmake b/packages/kokkos/cmake/Modules/FindTPLROCM.cmake index f796737f5b29..9673af0b9d90 100644 --- a/packages/kokkos/cmake/Modules/FindTPLROCM.cmake +++ b/packages/kokkos/cmake/Modules/FindTPLROCM.cmake @@ -1,7 +1,7 @@ include(FindPackageHandleStandardArgs) -FIND_LIBRARY(AMD_HIP_LIBRARY amdhip64 PATHS ENV ROCM_PATH PATH_SUFFIXES lib) -FIND_LIBRARY(HSA_RUNTIME_LIBRARY hsa-runtime64 PATHS ENV ROCM_PATH PATH_SUFFIXES lib) +find_library(AMD_HIP_LIBRARY amdhip64 PATHS ENV ROCM_PATH PATH_SUFFIXES lib) +find_library(HSA_RUNTIME_LIBRARY hsa-runtime64 PATHS ENV ROCM_PATH PATH_SUFFIXES lib) # FIXME_HIP Starting with ROCm 5.5 it is not necessary to link againt clang_rt. # We keep the code as is for now because it is hard to find the version of ROCM @@ -16,18 +16,24 @@ execute_process( COMMAND ${CMAKE_CXX_COMPILER} -print-libgcc-file-name --rtlib=compiler-rt OUTPUT_VARIABLE CLANG_RT_LIBRARY OUTPUT_STRIP_TRAILING_WHITESPACE - RESULT_VARIABLE CLANG_RT_CHECK) + RESULT_VARIABLE CLANG_RT_CHECK +) -if( NOT "${CLANG_RT_CHECK}" STREQUAL "0" ) +if(NOT "${CLANG_RT_CHECK}" STREQUAL "0") # if the above failed, we delete CLANG_RT_LIBRARY to make the args check # below fail unset(CLANG_RT_LIBRARY) endif() - find_package_handle_standard_args(TPLROCM DEFAULT_MSG AMD_HIP_LIBRARY HSA_RUNTIME_LIBRARY CLANG_RT_LIBRARY) -kokkos_create_imported_tpl(ROCM INTERFACE - LINK_LIBRARIES ${HSA_RUNTIME_LIBRARY} ${AMD_HIP_LIBRARY} ${CLANG_RT_LIBRARY} - COMPILE_DEFINITIONS __HIP_ROCclr__ +kokkos_create_imported_tpl( + ROCM + INTERFACE + LINK_LIBRARIES + ${HSA_RUNTIME_LIBRARY} + ${AMD_HIP_LIBRARY} + ${CLANG_RT_LIBRARY} + COMPILE_DEFINITIONS + __HIP_ROCclr__ ) diff --git a/packages/kokkos/cmake/Modules/FindTPLROCTHRUST.cmake b/packages/kokkos/cmake/Modules/FindTPLROCTHRUST.cmake index dae7dc3c9520..b4b905795dd0 100644 --- a/packages/kokkos/cmake/Modules/FindTPLROCTHRUST.cmake +++ b/packages/kokkos/cmake/Modules/FindTPLROCTHRUST.cmake @@ -6,10 +6,10 @@ # behavior of ROCm 5.7 and later for earlier version of ROCm we set # AMDGPU_TARGETS and GPU_TARGETS to empty and set the values in the cache. If # the values are not cached, FIND_PACKAGE(rocthrust) will overwrite them. -SET(AMDGPU_TARGETS "" CACHE STRING "AMD GPU targets to compile for") -SET(GPU_TARGETS "" CACHE STRING "GPU targets to compile for") -FIND_PACKAGE(rocthrust REQUIRED) -KOKKOS_CREATE_IMPORTED_TPL(ROCTHRUST INTERFACE LINK_LIBRARIES roc::rocthrust) +set(AMDGPU_TARGETS "" CACHE STRING "AMD GPU targets to compile for") +set(GPU_TARGETS "" CACHE STRING "GPU targets to compile for") +find_package(rocthrust REQUIRED) +kokkos_create_imported_tpl(ROCTHRUST INTERFACE LINK_LIBRARIES roc::rocthrust) # Export ROCTHRUST as a Kokkos dependency -KOKKOS_EXPORT_CMAKE_TPL(rocthrust) +kokkos_export_cmake_tpl(rocthrust) diff --git a/packages/kokkos/cmake/Modules/FindTPLTHREADS.cmake b/packages/kokkos/cmake/Modules/FindTPLTHREADS.cmake index ff0db5123f8e..280b8641da15 100644 --- a/packages/kokkos/cmake/Modules/FindTPLTHREADS.cmake +++ b/packages/kokkos/cmake/Modules/FindTPLTHREADS.cmake @@ -1,15 +1,14 @@ -INCLUDE(FindPackageHandleStandardArgs) -FIND_PACKAGE(Threads) +include(FindPackageHandleStandardArgs) +find_package(Threads) -IF (TARGET Threads::Threads) - SET(FOUND_THREADS TRUE) -ELSE() - SET(FOUND_THREADS FALSE) -ENDIF() +if(TARGET Threads::Threads) + set(FOUND_THREADS TRUE) +else() + set(FOUND_THREADS FALSE) +endif() -FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLTHREADS DEFAULT_MSG FOUND_THREADS) +find_package_handle_standard_args(TPLTHREADS DEFAULT_MSG FOUND_THREADS) #Only create the TPL if we succeed -IF (FOUND_THREADS) - KOKKOS_CREATE_IMPORTED_TPL(THREADS INTERFACE LINK_OPTIONS - ${CMAKE_THREAD_LIBS_INIT}) -ENDIF() +if(FOUND_THREADS) + kokkos_create_imported_tpl(THREADS INTERFACE LINK_OPTIONS ${CMAKE_THREAD_LIBS_INIT}) +endif() diff --git a/packages/kokkos/cmake/README.md b/packages/kokkos/cmake/README.md index 385bbfcd5d5a..0548e89a90e7 100644 --- a/packages/kokkos/cmake/README.md +++ b/packages/kokkos/cmake/README.md @@ -310,20 +310,6 @@ When Kokkos is loaded by a downstream project, this TPL must be loaded. Calling this function simply appends text recording the location where the TPL was found and adding a `find_dependency(...)` call that will reload the CMake target. -### The Great TriBITS Compromise - -TriBITS was a masterpiece of CMake version 2 before the modern CMake idioms of building and using. -TriBITS greatly limited verbosity of CMake files, handled complicated dependency trees between packages, and handled automatically setting up include and linker paths for dependent libraries. - -Kokkos is now used by numerous projects that don't (and won't) depend on TriBITS for their build systems. -Kokkos has to work outside of TriBITS and provide a standard CMake 3+ build system. -At the same time, Kokkos is used by numerous projects that depend on TriBITS and don't (and won't) switch to a standard CMake 3+ build system. - -Instead of calling functions `TRIBITS_X(...)`, the CMake calls wrapper functions `KOKKOS_X(...)`. -If TriBITS is available (as in Trilinos), `KOKKOS_X` will just be a thin wrapper around `TRIBITS_X`. -If TriBITS is not available, Kokkos maps `KOKKOS_X` calls to native CMake that complies with CMake 3 idioms. -For the time being, this seems the most sensible way to handle the competing requirements of a standalone modern CMake and TriBITS build system. - ##### [LICENSE](https://github.com/kokkos/kokkos/blob/devel/LICENSE) [![License](https://img.shields.io/badge/License-BSD%203--Clause-blue.svg)](https://opensource.org/licenses/BSD-3-Clause) diff --git a/packages/kokkos/cmake/build_env_info.cmake b/packages/kokkos/cmake/build_env_info.cmake index 0eeb6372455b..ac28b2d8503a 100644 --- a/packages/kokkos/cmake/build_env_info.cmake +++ b/packages/kokkos/cmake/build_env_info.cmake @@ -2,111 +2,108 @@ find_package(Git QUIET) -SET(CURRENT_LIST_DIR ${CMAKE_CURRENT_LIST_DIR}) -SET(pre_configure_dir ${CMAKE_CURRENT_LIST_DIR}) -SET(post_configure_dir ${CMAKE_BINARY_DIR}/generated) +set(CURRENT_LIST_DIR ${CMAKE_CURRENT_LIST_DIR}) +set(pre_configure_dir ${CMAKE_CURRENT_LIST_DIR}) +set(post_configure_dir ${CMAKE_BINARY_DIR}/generated) -SET(pre_configure_file ${pre_configure_dir}/Kokkos_Version_Info.cpp.in) -SET(post_configure_file ${post_configure_dir}/Kokkos_Version_Info.cpp) +set(pre_configure_file ${pre_configure_dir}/Kokkos_Version_Info.cpp.in) +set(post_configure_file ${post_configure_dir}/Kokkos_Version_Info.cpp) -FUNCTION(check_git_write git_hash git_clean_status) - FILE( - WRITE - ${CMAKE_BINARY_DIR}/git-state.txt - "${git_hash}-${git_clean_status}") -ENDFUNCTION() +function(check_git_write git_hash git_clean_status) + file(WRITE ${CMAKE_BINARY_DIR}/git-state.txt "${git_hash}-${git_clean_status}") +endfunction() -FUNCTION(check_git_read git_hash) - IF(EXISTS ${CMAKE_BINARY_DIR}/git-state.txt) - FILE(STRINGS ${CMAKE_BINARY_DIR}/git-state.txt CONTENT) - LIST(GET CONTENT 0 var) +function(check_git_read git_hash) + if(EXISTS ${CMAKE_BINARY_DIR}/git-state.txt) + file(STRINGS ${CMAKE_BINARY_DIR}/git-state.txt CONTENT) + list(GET CONTENT 0 var) message(DEBUG "Cached Git hash: ${var}") - SET(${git_hash} ${var} PARENT_SCOPE) + set(${git_hash} ${var} PARENT_SCOPE) else() - SET(${git_hash} "INVALID" PARENT_SCOPE) - ENDIF() -ENDFUNCTION() - -FUNCTION(check_git_version) - IF(NOT EXISTS ${post_configure_dir}/Kokkos_Version_Info.hpp) - FILE( - COPY ${pre_configure_dir}/Kokkos_Version_Info.hpp - DESTINATION ${post_configure_dir}) - ENDIF() - - IF(NOT Git_FOUND OR NOT EXISTS ${KOKKOS_SOURCE_DIR}/.git) + set(${git_hash} "INVALID" PARENT_SCOPE) + endif() +endfunction() + +function(check_git_version) + if(NOT EXISTS ${post_configure_dir}/Kokkos_Version_Info.hpp) + file(COPY ${pre_configure_dir}/Kokkos_Version_Info.hpp DESTINATION ${post_configure_dir}) + endif() + + if(NOT Git_FOUND OR NOT EXISTS ${KOKKOS_SOURCE_DIR}/.git) configure_file(${pre_configure_file} ${post_configure_file} @ONLY) return() - ENDIF() + endif() # Get the current working branch execute_process( COMMAND ${GIT_EXECUTABLE} rev-parse --abbrev-ref HEAD WORKING_DIRECTORY ${KOKKOS_SOURCE_DIR} OUTPUT_VARIABLE GIT_BRANCH - OUTPUT_STRIP_TRAILING_WHITESPACE) + OUTPUT_STRIP_TRAILING_WHITESPACE + ) # Get the latest commit description execute_process( COMMAND ${GIT_EXECUTABLE} show -s --format=%s WORKING_DIRECTORY ${KOKKOS_SOURCE_DIR} OUTPUT_VARIABLE GIT_COMMIT_DESCRIPTION - OUTPUT_STRIP_TRAILING_WHITESPACE) + OUTPUT_STRIP_TRAILING_WHITESPACE + ) # Get the latest commit date execute_process( COMMAND ${GIT_EXECUTABLE} log -1 --format=%cI WORKING_DIRECTORY ${KOKKOS_SOURCE_DIR} OUTPUT_VARIABLE GIT_COMMIT_DATE - OUTPUT_STRIP_TRAILING_WHITESPACE) + OUTPUT_STRIP_TRAILING_WHITESPACE + ) # Check if repo is dirty / clean execute_process( COMMAND ${GIT_EXECUTABLE} diff-index --quiet HEAD -- WORKING_DIRECTORY ${KOKKOS_SOURCE_DIR} RESULT_VARIABLE IS_DIRTY - OUTPUT_STRIP_TRAILING_WHITESPACE) + OUTPUT_STRIP_TRAILING_WHITESPACE + ) - IF(IS_DIRTY EQUAL 0) - SET(GIT_CLEAN_STATUS "CLEAN") + if(IS_DIRTY EQUAL 0) + set(GIT_CLEAN_STATUS "CLEAN") else() - SET(GIT_CLEAN_STATUS "DIRTY") - ENDIF() + set(GIT_CLEAN_STATUS "DIRTY") + endif() # Get the latest abbreviated commit hash of the working branch execute_process( COMMAND ${GIT_EXECUTABLE} log -1 --format=%h WORKING_DIRECTORY ${KOKKOS_SOURCE_DIR} OUTPUT_VARIABLE GIT_COMMIT_HASH - OUTPUT_STRIP_TRAILING_WHITESPACE) + OUTPUT_STRIP_TRAILING_WHITESPACE + ) check_git_read(GIT_HASH_CACHE) - IF(NOT EXISTS ${post_configure_dir}) + if(NOT EXISTS ${post_configure_dir}) file(MAKE_DIRECTORY ${post_configure_dir}) - ENDIF() + endif() # Only update the git_version.cpp if the hash has changed. This will # prevent us from rebuilding the project more than we need to. - IF(NOT "${GIT_COMMIT_HASH}-${GIT_CLEAN_STATUS}" STREQUAL ${GIT_HASH_CACHE} - OR NOT EXISTS ${post_configure_file}) + if(NOT "${GIT_COMMIT_HASH}-${GIT_CLEAN_STATUS}" STREQUAL ${GIT_HASH_CACHE} OR NOT EXISTS ${post_configure_file}) # Set the GIT_HASH_CACHE variable so the next build won't have # to regenerate the source file. check_git_write(${GIT_COMMIT_HASH} ${GIT_CLEAN_STATUS}) configure_file(${pre_configure_file} ${post_configure_file} @ONLY) message(STATUS "Configured git information in ${post_configure_file}") - ENDIF() -ENDFUNCTION() + endif() +endfunction() -FUNCTION(check_git_setup) +function(check_git_setup) add_custom_target( - AlwaysCheckGit COMMAND ${CMAKE_COMMAND} - -DRUN_CHECK_GIT_VERSION=1 - -DKOKKOS_SOURCE_DIR=${Kokkos_SOURCE_DIR} - -P ${CURRENT_LIST_DIR}/build_env_info.cmake - BYPRODUCTS ${post_configure_file}) + AlwaysCheckGit COMMAND ${CMAKE_COMMAND} -DRUN_CHECK_GIT_VERSION=1 -DKOKKOS_SOURCE_DIR=${Kokkos_SOURCE_DIR} -P + ${CURRENT_LIST_DIR}/build_env_info.cmake BYPRODUCTS ${post_configure_file} + ) add_library(impl_git_version ${CMAKE_BINARY_DIR}/generated/Kokkos_Version_Info.cpp) target_include_directories(impl_git_version PUBLIC ${CMAKE_BINARY_DIR}/generated) @@ -114,9 +111,9 @@ FUNCTION(check_git_setup) add_dependencies(impl_git_version AlwaysCheckGit) check_git_version() -ENDFUNCTION() +endfunction() # This is used to run this function from an external cmake process. -IF(RUN_CHECK_GIT_VERSION) +if(RUN_CHECK_GIT_VERSION) check_git_version() -ENDIF() +endif() diff --git a/packages/kokkos/cmake/compile_tests/amd_apu.cc b/packages/kokkos/cmake/compile_tests/amd_apu.cc new file mode 100644 index 000000000000..a9c1edbd57b0 --- /dev/null +++ b/packages/kokkos/cmake/compile_tests/amd_apu.cc @@ -0,0 +1,38 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include +#include + +int main() { + hipDeviceProp_t hipProp; + hipError_t error = hipGetDeviceProperties(&hipProp, 0); + + if (error != hipSuccess) { + std::cout << hipGetErrorString(error) << '\n'; + return error; + } + + if (hipProp.integrated == 1) { + // We detected an APU + std::cout << "ON"; + } else { + // We detected a discrete GPU + std::cout << "OFF"; + } + + return 0; +} diff --git a/packages/kokkos/cmake/cray.cmake b/packages/kokkos/cmake/cray.cmake index 08912f5130f9..4ce5352bda26 100644 --- a/packages/kokkos/cmake/cray.cmake +++ b/packages/kokkos/cmake/cray.cmake @@ -1,9 +1,6 @@ - - function(kokkos_set_cray_flags full_standard int_standard) - STRING(TOLOWER ${full_standard} FULL_LC_STANDARD) - STRING(TOLOWER ${int_standard} INT_LC_STANDARD) - SET(KOKKOS_CXX_STANDARD_FLAG "-hstd=c++${FULL_LC_STANDARD}", PARENT_SCOPE) - SET(KOKKOS_CXX_INTERMDIATE_STANDARD_FLAG "-hstd=c++${INT_LC_STANDARD}" PARENT_SCOPE) + string(TOLOWER ${full_standard} FULL_LC_STANDARD) + string(TOLOWER ${int_standard} INT_LC_STANDARD) + set(KOKKOS_CXX_STANDARD_FLAG "-hstd=c++${FULL_LC_STANDARD}", PARENT_SCOPE) + set(KOKKOS_CXX_INTERMDIATE_STANDARD_FLAG "-hstd=c++${INT_LC_STANDARD}" PARENT_SCOPE) endfunction() - diff --git a/packages/kokkos/cmake/deps/CUDA.cmake b/packages/kokkos/cmake/deps/CUDA.cmake index 5b6afd61512d..49eaf883a46f 100644 --- a/packages/kokkos/cmake/deps/CUDA.cmake +++ b/packages/kokkos/cmake/deps/CUDA.cmake @@ -17,24 +17,24 @@ # Check for CUDA support -SET(_CUDA_FAILURE OFF) +set(_CUDA_FAILURE OFF) # Have CMake find CUDA -IF(NOT _CUDA_FAILURE) - FIND_PACKAGE(CUDA 3.2) - IF (NOT CUDA_FOUND) - SET(_CUDA_FAILURE ON) - ENDIF() -ENDIF() +if(NOT _CUDA_FAILURE) + find_package(CUDA 3.2) + if(NOT CUDA_FOUND) + set(_CUDA_FAILURE ON) + endif() +endif() -IF(NOT _CUDA_FAILURE) +if(NOT _CUDA_FAILURE) # if we haven't met failure macro(PACKAGE_ADD_CUDA_LIBRARY cuda_target) - TRIBITS_ADD_LIBRARY(${cuda_target} ${ARGN} CUDALIBRARY) + tribits_add_library(${cuda_target} ${ARGN} CUDALIBRARY) endmacro() - GLOBAL_SET(TPL_CUDA_LIBRARY_DIRS) - GLOBAL_SET(TPL_CUDA_INCLUDE_DIRS ${CUDA_TOOLKIT_INCLUDE}) - GLOBAL_SET(TPL_CUDA_LIBRARIES ${CUDA_CUDART_LIBRARY} ${CUDA_cublas_LIBRARY} ${CUDA_cufft_LIBRARY}) -ELSE() - SET(TPL_ENABLE_CUDA OFF) -ENDIF() + global_set(TPL_CUDA_LIBRARY_DIRS) + global_set(TPL_CUDA_INCLUDE_DIRS ${CUDA_TOOLKIT_INCLUDE}) + global_set(TPL_CUDA_LIBRARIES ${CUDA_CUDART_LIBRARY} ${CUDA_cublas_LIBRARY} ${CUDA_cufft_LIBRARY}) +else() + set(TPL_ENABLE_CUDA OFF) +endif() diff --git a/packages/kokkos/cmake/deps/HWLOC.cmake b/packages/kokkos/cmake/deps/HWLOC.cmake index 77d5a9b83a64..52d8368d0419 100644 --- a/packages/kokkos/cmake/deps/HWLOC.cmake +++ b/packages/kokkos/cmake/deps/HWLOC.cmake @@ -15,7 +15,6 @@ # ************************************************************************ # @HEADER - #----------------------------------------------------------------------------- # Hardware locality detection and control library. # @@ -26,7 +25,4 @@ # Version: 1.3 # -KOKKOS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( HWLOC - REQUIRED_HEADERS hwloc.h - REQUIRED_LIBS_NAMES "hwloc" - ) +kokkos_tpl_find_include_dirs_and_libraries(HWLOC REQUIRED_HEADERS hwloc.h REQUIRED_LIBS_NAMES "hwloc") diff --git a/packages/kokkos/cmake/deps/Pthread.cmake b/packages/kokkos/cmake/deps/Pthread.cmake index e879bff3741d..b811f850841d 100644 --- a/packages/kokkos/cmake/deps/Pthread.cmake +++ b/packages/kokkos/cmake/deps/Pthread.cmake @@ -15,31 +15,27 @@ # ************************************************************************ # @HEADER +set(USE_THREADS FALSE) -SET(USE_THREADS FALSE) - -IF(NOT TPL_Pthread_INCLUDE_DIRS AND NOT TPL_Pthread_LIBRARY_DIRS AND NOT TPL_Pthread_LIBRARIES) +if(NOT TPL_Pthread_INCLUDE_DIRS AND NOT TPL_Pthread_LIBRARY_DIRS AND NOT TPL_Pthread_LIBRARIES) # Use CMake's Thread finder since it is a bit smarter in determining # whether pthreads is already built into the compiler and doesn't need # a library to link. - FIND_PACKAGE(Threads) + find_package(Threads) #If Threads found a copy of pthreads make sure it is one of the cases the tribits #tpl system cannot handle. - IF(Threads_FOUND AND CMAKE_USE_PTHREADS_INIT) - IF(CMAKE_THREAD_LIBS_INIT STREQUAL "" OR CMAKE_THREAD_LIBS_INIT STREQUAL "-pthread") - SET(USE_THREADS TRUE) - ENDIF() - ENDIF() -ENDIF() + if(Threads_FOUND AND CMAKE_USE_PTHREADS_INIT) + if(CMAKE_THREAD_LIBS_INIT STREQUAL "" OR CMAKE_THREAD_LIBS_INIT STREQUAL "-pthread") + set(USE_THREADS TRUE) + endif() + endif() +endif() -IF(USE_THREADS) - SET(TPL_Pthread_INCLUDE_DIRS "") - SET(TPL_Pthread_LIBRARIES "${CMAKE_THREAD_LIBS_INIT}") - SET(TPL_Pthread_LIBRARY_DIRS "") - KOKKOS_CREATE_IMPORTED_TPL_LIBRARY(Pthread) -ELSE() - KOKKOS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( Pthread - REQUIRED_HEADERS pthread.h - REQUIRED_LIBS_NAMES pthread - ) -ENDIF() +if(USE_THREADS) + set(TPL_Pthread_INCLUDE_DIRS "") + set(TPL_Pthread_LIBRARIES "${CMAKE_THREAD_LIBS_INIT}") + set(TPL_Pthread_LIBRARY_DIRS "") + kokkos_create_imported_tpl_library(Pthread) +else() + kokkos_tpl_find_include_dirs_and_libraries(Pthread REQUIRED_HEADERS pthread.h REQUIRED_LIBS_NAMES pthread) +endif() diff --git a/packages/kokkos/cmake/deps/quadmath.cmake b/packages/kokkos/cmake/deps/quadmath.cmake index 6aef08e8812f..9006d0cb9efb 100644 --- a/packages/kokkos/cmake/deps/quadmath.cmake +++ b/packages/kokkos/cmake/deps/quadmath.cmake @@ -15,7 +15,4 @@ # ************************************************************************ # @HEADER -KOKKOS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( quadmath - REQUIRED_HEADERS quadmath.h - REQUIRED_LIBS_NAMES quadmath -) +kokkos_tpl_find_include_dirs_and_libraries(quadmath REQUIRED_HEADERS quadmath.h REQUIRED_LIBS_NAMES quadmath) diff --git a/packages/kokkos/cmake/fake_tribits.cmake b/packages/kokkos/cmake/fake_tribits.cmake index a18d2ac518a6..d3fe1e6e2f62 100644 --- a/packages/kokkos/cmake/fake_tribits.cmake +++ b/packages/kokkos/cmake/fake_tribits.cmake @@ -1,288 +1,213 @@ #These are tribits wrappers used by all projects in the Kokkos ecosystem -INCLUDE(CMakeParseArguments) -INCLUDE(CTest) +include(CMakeParseArguments) +include(CTest) -FUNCTION(ASSERT_DEFINED VARS) - FOREACH(VAR ${VARS}) - IF(NOT DEFINED ${VAR}) - MESSAGE(SEND_ERROR "Error, the variable ${VAR} is not defined!") - ENDIF() - ENDFOREACH() -ENDFUNCTION() - -IF(NOT KOKKOS_HAS_TRILINOS) -MACRO(APPEND_GLOB VAR) - FILE(GLOB LOCAL_TMP_VAR ${ARGN}) - LIST(APPEND ${VAR} ${LOCAL_TMP_VAR}) -ENDMACRO() - -MACRO(GLOBAL_SET VARNAME) - SET(${VARNAME} ${ARGN} CACHE INTERNAL "" FORCE) -ENDMACRO() - -MACRO(PREPEND_GLOBAL_SET VARNAME) - ASSERT_DEFINED(${VARNAME}) - GLOBAL_SET(${VARNAME} ${ARGN} ${${VARNAME}}) -ENDMACRO() -ENDIF() - -MACRO(ADD_INTERFACE_LIBRARY LIB_NAME) - FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/dummy.cpp "") - ADD_LIBRARY(${LIB_NAME} STATIC ${CMAKE_CURRENT_BINARY_DIR}/dummy.cpp) - SET_TARGET_PROPERTIES(${LIB_NAME} PROPERTIES INTERFACE TRUE) -ENDMACRO() - -FUNCTION(KOKKOS_ADD_TEST) - if (KOKKOS_HAS_TRILINOS) - CMAKE_PARSE_ARGUMENTS(TEST - "SKIP_TRIBITS" - "EXE;NAME;TOOL" - "ARGS" - ${ARGN}) - - IF(TEST_SKIP_TRIBITS) - MESSAGE(STATUS "Skipping test ${TEST_NAME} in TriBits") - RETURN() - ENDIF() - - IF(TEST_EXE) - SET(EXE_ROOT ${TEST_EXE}) - ELSE() - SET(EXE_ROOT ${TEST_NAME}) - ENDIF() - - TRIBITS_ADD_TEST( - ${EXE_ROOT} - NAME ${TEST_NAME} - COMM serial mpi - NUM_MPI_PROCS 1 - ARGS ${TEST_ARGS} - ${TEST_UNPARSED_ARGUMENTS} - ADDED_TESTS_NAMES_OUT ALL_TESTS_ADDED - ) - - # We will get prepended package name here - SET(TEST_NAME ${PACKAGE_NAME}_${TEST_NAME}) - SET(EXE ${PACKAGE_NAME}_${EXE_ROOT}) - - # The function TRIBITS_ADD_TEST() has a CATEGORIES argument that defaults - # to BASIC. If a project elects to only enable tests marked as PERFORMANCE, - # the test won't actually be added and attempting to set a property on it below - # will yield an error. - if(TARGET ${EXE}) - if(TEST_TOOL) - add_dependencies(${EXE} ${TEST_TOOL}) #make sure the exe has to build the tool - foreach(TEST_ADDED ${ALL_TESTS_ADDED}) - set_property(TEST ${TEST_ADDED} APPEND PROPERTY ENVIRONMENT "KOKKOS_TOOLS_LIBS=$") - endforeach() - endif() +function(ASSERT_DEFINED VARS) + foreach(VAR ${VARS}) + if(NOT DEFINED ${VAR}) + message(SEND_ERROR "Error, the variable ${VAR} is not defined!") endif() + endforeach() +endfunction() + +macro(APPEND_GLOB VAR) + file(GLOB LOCAL_TMP_VAR ${ARGN}) + list(APPEND ${VAR} ${LOCAL_TMP_VAR}) +endmacro() + +macro(GLOBAL_SET VARNAME) + set(${VARNAME} ${ARGN} CACHE INTERNAL "" FORCE) +endmacro() + +macro(PREPEND_GLOBAL_SET VARNAME) + assert_defined(${VARNAME}) + global_set(${VARNAME} ${ARGN} ${${VARNAME}}) +endmacro() + +macro(ADD_INTERFACE_LIBRARY LIB_NAME) + file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/dummy.cpp "") + add_library(${LIB_NAME} STATIC ${CMAKE_CURRENT_BINARY_DIR}/dummy.cpp) + set_target_properties(${LIB_NAME} PROPERTIES INTERFACE TRUE) +endmacro() + +function(KOKKOS_ADD_TEST) + cmake_parse_arguments( + TEST "WILL_FAIL;SKIP_TRIBITS" "FAIL_REGULAR_EXPRESSION;PASS_REGULAR_EXPRESSION;EXE;NAME;TOOL" "CATEGORIES;ARGS" + ${ARGN} + ) + # To match Tribits, we should always be receiving + # the root names of exes/libs + if(TEST_EXE) + set(EXE_ROOT ${TEST_EXE}) else() - CMAKE_PARSE_ARGUMENTS(TEST - "WILL_FAIL;SKIP_TRIBITS" - "FAIL_REGULAR_EXPRESSION;PASS_REGULAR_EXPRESSION;EXE;NAME;TOOL" - "CATEGORIES;ARGS" - ${ARGN}) - # To match Tribits, we should always be receiving - # the root names of exes/libs - IF(TEST_EXE) - SET(EXE_ROOT ${TEST_EXE}) - ELSE() - SET(EXE_ROOT ${TEST_NAME}) - ENDIF() - # Prepend package name to the test name - # These should be the full target name - SET(TEST_NAME ${PACKAGE_NAME}_${TEST_NAME}) - SET(EXE ${PACKAGE_NAME}_${EXE_ROOT}) - IF(WIN32) - ADD_TEST(NAME ${TEST_NAME} WORKING_DIRECTORY ${LIBRARY_OUTPUT_PATH} - COMMAND ${EXE}${CMAKE_EXECUTABLE_SUFFIX} ${TEST_ARGS}) - ELSE() - ADD_TEST(NAME ${TEST_NAME} COMMAND ${EXE} ${TEST_ARGS}) - ENDIF() - IF(TEST_WILL_FAIL) - SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES WILL_FAIL ${TEST_WILL_FAIL}) - ENDIF() - IF(TEST_FAIL_REGULAR_EXPRESSION) - SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES FAIL_REGULAR_EXPRESSION ${TEST_FAIL_REGULAR_EXPRESSION}) - ENDIF() - IF(TEST_PASS_REGULAR_EXPRESSION) - SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES PASS_REGULAR_EXPRESSION ${TEST_PASS_REGULAR_EXPRESSION}) - ENDIF() - IF(TEST_TOOL) - ADD_DEPENDENCIES(${EXE} ${TEST_TOOL}) #make sure the exe has to build the tool - SET_PROPERTY(TEST ${TEST_NAME} APPEND_STRING PROPERTY ENVIRONMENT "KOKKOS_PROFILE_LIBRARY=$") - ENDIF() - VERIFY_EMPTY(KOKKOS_ADD_TEST ${TEST_UNPARSED_ARGUMENTS}) - ENDIF() -ENDFUNCTION() - -MACRO(KOKKOS_CREATE_IMPORTED_TPL_LIBRARY TPL_NAME) - ADD_INTERFACE_LIBRARY(TPL_LIB_${TPL_NAME}) - TARGET_LINK_LIBRARIES(TPL_LIB_${TPL_NAME} LINK_PUBLIC ${TPL_${TPL_NAME}_LIBRARIES}) - TARGET_INCLUDE_DIRECTORIES(TPL_LIB_${TPL_NAME} INTERFACE ${TPL_${TPL_NAME}_INCLUDE_DIRS}) -ENDMACRO() + set(EXE_ROOT ${TEST_NAME}) + endif() + # Prepend package name to the test name + # These should be the full target name + set(TEST_NAME ${PACKAGE_NAME}_${TEST_NAME}) + + # For compatibility with Trilinos testing, we support: + # * `-D _DISABLE=ON` + # * `-D _EXTRA_ARGS=";;;..."` + # * `-D _SET_RUN_SERIAL=ON` + if(${TEST_NAME}_DISABLE) + return() + endif() -FUNCTION(KOKKOS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES TPL_NAME) - if (KOKKOS_HAS_TRILINOS) - TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES(${TPL_NAME} ${ARGN}) + set(EXE ${PACKAGE_NAME}_${EXE_ROOT}) + if(WIN32) + add_test(NAME ${TEST_NAME} WORKING_DIRECTORY ${LIBRARY_OUTPUT_PATH} COMMAND ${EXE}${CMAKE_EXECUTABLE_SUFFIX} + ${TEST_ARGS} ${${TEST_NAME}_EXTRA_ARGS} + ) else() - CMAKE_PARSE_ARGUMENTS(PARSE - "" - "" - "REQUIRED_HEADERS;REQUIRED_LIBS_NAMES" - ${ARGN}) - - SET(_${TPL_NAME}_ENABLE_SUCCESS TRUE) - IF (PARSE_REQUIRED_LIBS_NAMES) - FIND_LIBRARY(TPL_${TPL_NAME}_LIBRARIES NAMES ${PARSE_REQUIRED_LIBS_NAMES}) - IF(NOT TPL_${TPL_NAME}_LIBRARIES) - SET(_${TPL_NAME}_ENABLE_SUCCESS FALSE) - ENDIF() - ENDIF() - IF (PARSE_REQUIRED_HEADERS) - FIND_PATH(TPL_${TPL_NAME}_INCLUDE_DIRS NAMES ${PARSE_REQUIRED_HEADERS}) - IF(NOT TPL_${TPL_NAME}_INCLUDE_DIRS) - SET(_${TPL_NAME}_ENABLE_SUCCESS FALSE) - ENDIF() - ENDIF() - IF (_${TPL_NAME}_ENABLE_SUCCESS) - KOKKOS_CREATE_IMPORTED_TPL_LIBRARY(${TPL_NAME}) - ENDIF() - VERIFY_EMPTY(KOKKOS_CREATE_IMPORTED_TPL_LIBRARY ${PARSE_UNPARSED_ARGUMENTS}) + add_test(NAME ${TEST_NAME} COMMAND ${EXE} ${TEST_ARGS} ${${TEST_NAME}_EXTRA_ARGS}) endif() -ENDFUNCTION() - -MACRO(KOKKOS_TARGET_COMPILE_OPTIONS TARGET) -if(KOKKOS_HAS_TRILINOS) - TARGET_COMPILE_OPTIONS(${TARGET} ${ARGN}) -else() - TARGET_COMPILE_OPTIONS(${TARGET} ${ARGN}) -endif() -ENDMACRO() - -FUNCTION(KOKKOS_LIB_TYPE LIB RET) -GET_TARGET_PROPERTY(PROP ${LIB} TYPE) -IF (${PROP} STREQUAL "INTERFACE_LIBRARY") - SET(${RET} "INTERFACE" PARENT_SCOPE) -ELSE() - SET(${RET} "PUBLIC" PARENT_SCOPE) -ENDIF() -ENDFUNCTION() - -FUNCTION(KOKKOS_TARGET_INCLUDE_DIRECTORIES TARGET) -IF(KOKKOS_HAS_TRILINOS) - KOKKOS_LIB_TYPE(${TARGET} INCTYPE) - #don't trust tribits to do this correctly - but need to add package name - TARGET_INCLUDE_DIRECTORIES(${TARGET} ${INCTYPE} ${ARGN}) -ELSEIF(TARGET ${TARGET}) - #the target actually exists - this means we are doing separate libs - #or this a test library - KOKKOS_LIB_TYPE(${TARGET} INCTYPE) - TARGET_INCLUDE_DIRECTORIES(${TARGET} ${INCTYPE} ${ARGN}) -ELSE() - GET_PROPERTY(LIBS GLOBAL PROPERTY KOKKOS_LIBRARIES_NAMES) - IF (${TARGET} IN_LIST LIBS) - SET_PROPERTY(GLOBAL APPEND PROPERTY KOKKOS_LIBRARY_INCLUDES ${ARGN}) - ELSE() - MESSAGE(FATAL_ERROR "Trying to set include directories on unknown target ${TARGET}") - ENDIF() -ENDIF() -ENDFUNCTION() - -FUNCTION(KOKKOS_LINK_INTERNAL_LIBRARY TARGET DEPLIB) -IF(KOKKOS_HAS_TRILINOS) - #do nothing -ELSE() - SET(options INTERFACE) - SET(oneValueArgs) - SET(multiValueArgs) - CMAKE_PARSE_ARGUMENTS(PARSE - "INTERFACE" - "" - "" - ${ARGN}) - SET(LINK_TYPE) - IF(PARSE_INTERFACE) - SET(LINK_TYPE INTERFACE) - ELSE() - SET(LINK_TYPE PUBLIC) - ENDIF() - TARGET_LINK_LIBRARIES(${TARGET} ${LINK_TYPE} ${DEPLIB}) - VERIFY_EMPTY(KOKKOS_LINK_INTERNAL_LIBRARY ${PARSE_UNPARSED_ARGUMENTS}) -ENDIF() -ENDFUNCTION() - -FUNCTION(KOKKOS_ADD_TEST_LIBRARY NAME) -IF (KOKKOS_HAS_TRILINOS) - TRIBITS_ADD_LIBRARY(${NAME} ${ARGN} TESTONLY) -ELSE() - SET(oneValueArgs) - SET(multiValueArgs HEADERS SOURCES) - - CMAKE_PARSE_ARGUMENTS(PARSE - "STATIC;SHARED" - "" - "HEADERS;SOURCES;DEPLIBS" - ${ARGN}) - - SET(LIB_TYPE) - IF (PARSE_STATIC) - SET(LIB_TYPE STATIC) - ELSEIF (PARSE_SHARED) - SET(LIB_TYPE SHARED) - ENDIF() + # Trilinos testing benefits from labeling the tests as "Kokkos" tests + set_tests_properties(${TEST_NAME} PROPERTIES LABELS Kokkos) + if(${TEST_NAME}_SET_RUN_SERIAL) + set_tests_properties(${TEST_NAME} PROPERTIES RUN_SERIAL ON) + endif() + # TriBITS doesn't actually currently support `-D _ENVIRONMENT` + # but we decided to add it anyway + if(${TEST_NAME}_ENVIRONMENT) + set_tests_properties(${TEST_NAME} PROPERTIES ENVIRONMENT "${${TEST_NAME}_ENVIRONMENT}") + endif() + if(TEST_WILL_FAIL) + set_tests_properties(${TEST_NAME} PROPERTIES WILL_FAIL ${TEST_WILL_FAIL}) + endif() + if(TEST_FAIL_REGULAR_EXPRESSION) + set_tests_properties(${TEST_NAME} PROPERTIES FAIL_REGULAR_EXPRESSION ${TEST_FAIL_REGULAR_EXPRESSION}) + endif() + if(TEST_PASS_REGULAR_EXPRESSION) + set_tests_properties(${TEST_NAME} PROPERTIES PASS_REGULAR_EXPRESSION ${TEST_PASS_REGULAR_EXPRESSION}) + endif() + if(TEST_TOOL) + add_dependencies(${EXE} ${TEST_TOOL}) #make sure the exe has to build the tool + set_property( + TEST ${TEST_NAME} APPEND_STRING PROPERTY ENVIRONMENT "KOKKOS_PROFILE_LIBRARY=$" + ) + endif() + verify_empty(KOKKOS_ADD_TEST ${TEST_UNPARSED_ARGUMENTS}) +endfunction() + +macro(KOKKOS_CREATE_IMPORTED_TPL_LIBRARY TPL_NAME) + add_interface_library(TPL_LIB_${TPL_NAME}) + target_link_libraries(TPL_LIB_${TPL_NAME} LINK_PUBLIC ${TPL_${TPL_NAME}_LIBRARIES}) + target_include_directories(TPL_LIB_${TPL_NAME} INTERFACE ${TPL_${TPL_NAME}_INCLUDE_DIRS}) +endmacro() + +function(KOKKOS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES TPL_NAME) + cmake_parse_arguments(PARSE "" "" "REQUIRED_HEADERS;REQUIRED_LIBS_NAMES" ${ARGN}) + + set(_${TPL_NAME}_ENABLE_SUCCESS TRUE) + if(PARSE_REQUIRED_LIBS_NAMES) + find_library(TPL_${TPL_NAME}_LIBRARIES NAMES ${PARSE_REQUIRED_LIBS_NAMES}) + if(NOT TPL_${TPL_NAME}_LIBRARIES) + set(_${TPL_NAME}_ENABLE_SUCCESS FALSE) + endif() + endif() + if(PARSE_REQUIRED_HEADERS) + find_path(TPL_${TPL_NAME}_INCLUDE_DIRS NAMES ${PARSE_REQUIRED_HEADERS}) + if(NOT TPL_${TPL_NAME}_INCLUDE_DIRS) + set(_${TPL_NAME}_ENABLE_SUCCESS FALSE) + endif() + endif() + if(_${TPL_NAME}_ENABLE_SUCCESS) + kokkos_create_imported_tpl_library(${TPL_NAME}) + endif() + verify_empty(KOKKOS_CREATE_IMPORTED_TPL_LIBRARY ${PARSE_UNPARSED_ARGUMENTS}) +endfunction() - IF(PARSE_HEADERS) - LIST(REMOVE_DUPLICATES PARSE_HEADERS) - ENDIF() - IF(PARSE_SOURCES) - LIST(REMOVE_DUPLICATES PARSE_SOURCES) - ENDIF() - ADD_LIBRARY(${NAME} ${LIB_TYPE} ${PARSE_SOURCES}) - IF (PARSE_DEPLIBS) - TARGET_LINK_LIBRARIES(${NAME} PRIVATE ${PARSE_DEPLIBS}) - ENDIF() -ENDIF() -ENDFUNCTION() +function(KOKKOS_LIB_TYPE LIB RET) + get_target_property(PROP ${LIB} TYPE) + if(${PROP} STREQUAL "INTERFACE_LIBRARY") + set(${RET} "INTERFACE" PARENT_SCOPE) + else() + set(${RET} "PUBLIC" PARENT_SCOPE) + endif() +endfunction() + +function(KOKKOS_TARGET_INCLUDE_DIRECTORIES TARGET) + if(TARGET ${TARGET}) + #the target actually exists - this means we are doing separate libs + #or this a test library + kokkos_lib_type(${TARGET} INCTYPE) + target_include_directories(${TARGET} ${INCTYPE} ${ARGN}) + else() + get_property(LIBS GLOBAL PROPERTY KOKKOS_LIBRARIES_NAMES) + if(${TARGET} IN_LIST LIBS) + set_property(GLOBAL APPEND PROPERTY KOKKOS_LIBRARY_INCLUDES ${ARGN}) + else() + message(FATAL_ERROR "Trying to set include directories on unknown target ${TARGET}") + endif() + endif() +endfunction() + +function(KOKKOS_LINK_INTERNAL_LIBRARY TARGET DEPLIB) + set(options INTERFACE) + set(oneValueArgs) + set(multiValueArgs) + cmake_parse_arguments(PARSE "INTERFACE" "" "" ${ARGN}) + set(LINK_TYPE) + if(PARSE_INTERFACE) + set(LINK_TYPE INTERFACE) + else() + set(LINK_TYPE PUBLIC) + endif() + target_link_libraries(${TARGET} ${LINK_TYPE} ${DEPLIB}) + verify_empty(KOKKOS_LINK_INTERNAL_LIBRARY ${PARSE_UNPARSED_ARGUMENTS}) +endfunction() +function(KOKKOS_ADD_TEST_LIBRARY NAME) + set(oneValueArgs) + set(multiValueArgs HEADERS SOURCES) -FUNCTION(KOKKOS_INCLUDE_DIRECTORIES) -IF(KOKKOS_HAS_TRILINOS) - TRIBITS_INCLUDE_DIRECTORIES(${ARGN}) -ELSE() - CMAKE_PARSE_ARGUMENTS( - INC - "REQUIRED_DURING_INSTALLATION_TESTING" - "" - "" - ${ARGN} - ) - INCLUDE_DIRECTORIES(${INC_UNPARSED_ARGUMENTS}) -ENDIF() -ENDFUNCTION() + cmake_parse_arguments(PARSE "STATIC;SHARED" "" "HEADERS;SOURCES;DEPLIBS" ${ARGN}) + set(LIB_TYPE) + if(PARSE_STATIC) + set(LIB_TYPE STATIC) + elseif(PARSE_SHARED) + set(LIB_TYPE SHARED) + endif() -MACRO(PRINTALL match) -get_cmake_property(_variableNames VARIABLES) -list (SORT _variableNames) -foreach (_variableName ${_variableNames}) - if("${_variableName}" MATCHES "${match}") - message(STATUS "${_variableName}=${${_variableName}}") + if(PARSE_HEADERS) + list(REMOVE_DUPLICATES PARSE_HEADERS) endif() -endforeach() -ENDMACRO() + if(PARSE_SOURCES) + list(REMOVE_DUPLICATES PARSE_SOURCES) + endif() + add_library(${NAME} ${LIB_TYPE} ${PARSE_SOURCES}) + if(PARSE_DEPLIBS) + target_link_libraries(${NAME} PRIVATE ${PARSE_DEPLIBS}) + endif() +endfunction() + +function(KOKKOS_INCLUDE_DIRECTORIES) + cmake_parse_arguments(INC "REQUIRED_DURING_INSTALLATION_TESTING" "" "" ${ARGN}) + include_directories(${INC_UNPARSED_ARGUMENTS}) +endfunction() + +macro(PRINTALL match) + get_cmake_property(_variableNames VARIABLES) + list(SORT _variableNames) + foreach(_variableName ${_variableNames}) + if("${_variableName}" MATCHES "${match}") + message(STATUS "${_variableName}=${${_variableName}}") + endif() + endforeach() +endmacro() -MACRO(SET_GLOBAL_REPLACE SUBSTR VARNAME) - STRING(REPLACE ${SUBSTR} ${${VARNAME}} TEMP) - GLOBAL_SET(${VARNAME} ${TEMP}) -ENDMACRO() +macro(SET_GLOBAL_REPLACE SUBSTR VARNAME) + string(REPLACE ${SUBSTR} ${${VARNAME}} TEMP) + global_set(${VARNAME} ${TEMP}) +endmacro() -FUNCTION(GLOBAL_APPEND VARNAME) +function(GLOBAL_APPEND VARNAME) #We make this a function since we are setting variables #and want to use scope to avoid overwriting local variables - SET(TEMP ${${VARNAME}}) - LIST(APPEND TEMP ${ARGN}) - GLOBAL_SET(${VARNAME} ${TEMP}) -ENDFUNCTION() + set(TEMP ${${VARNAME}}) + list(APPEND TEMP ${ARGN}) + global_set(${VARNAME} ${TEMP}) +endfunction() diff --git a/packages/kokkos/cmake/gnu.cmake b/packages/kokkos/cmake/gnu.cmake index aa11fe87b111..e53b4a7becdd 100644 --- a/packages/kokkos/cmake/gnu.cmake +++ b/packages/kokkos/cmake/gnu.cmake @@ -1,23 +1,21 @@ - -FUNCTION(kokkos_set_gnu_flags full_standard int_standard) - STRING(TOLOWER ${full_standard} FULL_LC_STANDARD) - STRING(TOLOWER ${int_standard} INT_LC_STANDARD) +function(kokkos_set_gnu_flags full_standard int_standard) + string(TOLOWER ${full_standard} FULL_LC_STANDARD) + string(TOLOWER ${int_standard} INT_LC_STANDARD) # The following three blocks of code were copied from # /Modules/Compiler/Intel-CXX.cmake from CMake 3.7.2 and then modified. - IF(CMAKE_CXX_SIMULATE_ID STREQUAL MSVC) - SET(_std -Qstd) - SET(_ext c++) - ELSE() - SET(_std -std) - SET(_ext gnu++) - ENDIF() - - IF (CMAKE_CXX_EXTENSIONS) - SET(KOKKOS_CXX_STANDARD_FLAG "-std=gnu++${FULL_LC_STANDARD}" PARENT_SCOPE) - SET(KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG "-std=gnu++${INT_LC_STANDARD}" PARENT_SCOPE) - ELSE() - SET(KOKKOS_CXX_STANDARD_FLAG "-std=c++${FULL_LC_STANDARD}" PARENT_SCOPE) - SET(KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG "-std=c++${INT_LC_STANDARD}" PARENT_SCOPE) - ENDIF() -ENDFUNCTION() + if(CMAKE_CXX_SIMULATE_ID STREQUAL MSVC) + set(_std -Qstd) + set(_ext c++) + else() + set(_std -std) + set(_ext gnu++) + endif() + if(CMAKE_CXX_EXTENSIONS) + set(KOKKOS_CXX_STANDARD_FLAG "-std=gnu++${FULL_LC_STANDARD}" PARENT_SCOPE) + set(KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG "-std=gnu++${INT_LC_STANDARD}" PARENT_SCOPE) + else() + set(KOKKOS_CXX_STANDARD_FLAG "-std=c++${FULL_LC_STANDARD}" PARENT_SCOPE) + set(KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG "-std=c++${INT_LC_STANDARD}" PARENT_SCOPE) + endif() +endfunction() diff --git a/packages/kokkos/cmake/intel.cmake b/packages/kokkos/cmake/intel.cmake index 7e6ee3358c90..b7752caabdf8 100644 --- a/packages/kokkos/cmake/intel.cmake +++ b/packages/kokkos/cmake/intel.cmake @@ -1,18 +1,15 @@ - -FUNCTION(kokkos_set_intel_flags full_standard int_standard) - STRING(TOLOWER ${full_standard} FULL_LC_STANDARD) - STRING(TOLOWER ${int_standard} INT_LC_STANDARD) +function(kokkos_set_intel_flags full_standard int_standard) + string(TOLOWER ${full_standard} FULL_LC_STANDARD) + string(TOLOWER ${int_standard} INT_LC_STANDARD) # The following three blocks of code were copied from # /Modules/Compiler/Intel-CXX.cmake from CMake 3.18.1 and then modified. - IF(CMAKE_CXX_SIMULATE_ID STREQUAL MSVC) - SET(_std -Qstd) - SET(_ext c++) - ELSE() - SET(_std -std) - SET(_ext gnu++) - ENDIF() - SET(KOKKOS_CXX_STANDARD_FLAG "${_std}=c++${FULL_LC_STANDARD}" PARENT_SCOPE) - SET(KOKKOS_CXX_INTERMDIATE_STANDARD_FLAG "${_std}=${_ext}${INT_LC_STANDARD}" PARENT_SCOPE) -ENDFUNCTION() - - + if(CMAKE_CXX_SIMULATE_ID STREQUAL MSVC) + set(_std -Qstd) + set(_ext c++) + else() + set(_std -std) + set(_ext gnu++) + endif() + set(KOKKOS_CXX_STANDARD_FLAG "${_std}=c++${FULL_LC_STANDARD}" PARENT_SCOPE) + set(KOKKOS_CXX_INTERMDIATE_STANDARD_FLAG "${_std}=${_ext}${INT_LC_STANDARD}" PARENT_SCOPE) +endfunction() diff --git a/packages/kokkos/cmake/kokkos_arch.cmake b/packages/kokkos/cmake/kokkos_arch.cmake index a581d9f94571..ae45da806f73 100644 --- a/packages/kokkos/cmake/kokkos_arch.cmake +++ b/packages/kokkos/cmake/kokkos_arch.cmake @@ -1,611 +1,732 @@ - -FUNCTION(KOKKOS_ARCH_OPTION SUFFIX DEV_TYPE DESCRIPTION DEPENDENCY) +function(KOKKOS_ARCH_OPTION SUFFIX DEV_TYPE DESCRIPTION DEPENDENCY) #all optimizations off by default - KOKKOS_DEPENDENT_OPTION(ARCH_${SUFFIX} "Optimize for ${DESCRIPTION} (${DEV_TYPE})" OFF "${DEPENDENCY}" OFF) - SET(KOKKOS_ARCH_${SUFFIX} ${KOKKOS_ARCH_${SUFFIX}} PARENT_SCOPE) - SET(KOKKOS_OPTION_KEYS ${KOKKOS_OPTION_KEYS} PARENT_SCOPE) - SET(KOKKOS_OPTION_VALUES ${KOKKOS_OPTION_VALUES} PARENT_SCOPE) - SET(KOKKOS_OPTION_TYPES ${KOKKOS_OPTION_TYPES} PARENT_SCOPE) - IF(KOKKOS_ARCH_${SUFFIX}) - LIST(APPEND KOKKOS_ENABLED_ARCH_LIST ${SUFFIX}) - SET(KOKKOS_ENABLED_ARCH_LIST ${KOKKOS_ENABLED_ARCH_LIST} PARENT_SCOPE) - ENDIF() -ENDFUNCTION() - + kokkos_dependent_option(ARCH_${SUFFIX} "Optimize for ${DESCRIPTION} (${DEV_TYPE})" OFF "${DEPENDENCY}" OFF) + set(KOKKOS_ARCH_${SUFFIX} ${KOKKOS_ARCH_${SUFFIX}} PARENT_SCOPE) + set(KOKKOS_OPTION_KEYS ${KOKKOS_OPTION_KEYS} PARENT_SCOPE) + set(KOKKOS_OPTION_VALUES ${KOKKOS_OPTION_VALUES} PARENT_SCOPE) + set(KOKKOS_OPTION_TYPES ${KOKKOS_OPTION_TYPES} PARENT_SCOPE) + if(KOKKOS_ARCH_${SUFFIX}) + list(APPEND KOKKOS_ENABLED_ARCH_LIST ${SUFFIX}) + set(KOKKOS_ENABLED_ARCH_LIST ${KOKKOS_ENABLED_ARCH_LIST} PARENT_SCOPE) + endif() +endfunction() # Make sure devices and compiler ID are done -KOKKOS_CFG_DEPENDS(ARCH COMPILER_ID) -KOKKOS_CFG_DEPENDS(ARCH DEVICES) -KOKKOS_CFG_DEPENDS(ARCH OPTIONS) +kokkos_cfg_depends(ARCH COMPILER_ID) +kokkos_cfg_depends(ARCH DEVICES) +kokkos_cfg_depends(ARCH OPTIONS) -KOKKOS_CHECK_DEPRECATED_OPTIONS( - ARCH_EPYC "Please replace EPYC with ZEN or ZEN2, depending on your platform" - ARCH_RYZEN "Please replace RYZEN with ZEN or ZEN2, depending on your platform" +kokkos_check_deprecated_options( + ARCH_EPYC "Please replace EPYC with ZEN or ZEN2, depending on your platform" ARCH_RYZEN + "Please replace RYZEN with ZEN or ZEN2, depending on your platform" ) #------------------------------------------------------------------------------- # List of possible host architectures. #------------------------------------------------------------------------------- -SET(KOKKOS_ARCH_LIST) +set(KOKKOS_ARCH_LIST) include(CheckCXXCompilerFlag) -KOKKOS_DEPRECATED_LIST(ARCH ARCH) - -SET(HOST_ARCH_ALREADY_SPECIFIED "") -MACRO(DECLARE_AND_CHECK_HOST_ARCH ARCH LABEL) - KOKKOS_ARCH_OPTION(${ARCH} HOST "${LABEL}" TRUE) - IF(KOKKOS_ARCH_${ARCH}) - IF(HOST_ARCH_ALREADY_SPECIFIED) - MESSAGE(FATAL_ERROR "Multiple host architectures given! Already have ${HOST_ARCH_ALREADY_SPECIFIED}, but trying to add ${ARCH}. If you are re-running CMake, try clearing the cache and running again.") - ENDIF() - SET(HOST_ARCH_ALREADY_SPECIFIED ${ARCH}) - ENDIF() -ENDMACRO() - -DECLARE_AND_CHECK_HOST_ARCH(NATIVE "local machine") -DECLARE_AND_CHECK_HOST_ARCH(AMDAVX "AMD chip") -DECLARE_AND_CHECK_HOST_ARCH(ARMV80 "ARMv8.0 Compatible CPU") -DECLARE_AND_CHECK_HOST_ARCH(ARMV81 "ARMv8.1 Compatible CPU") -DECLARE_AND_CHECK_HOST_ARCH(ARMV8_THUNDERX "ARMv8 Cavium ThunderX CPU") -DECLARE_AND_CHECK_HOST_ARCH(ARMV8_THUNDERX2 "ARMv8 Cavium ThunderX2 CPU") -DECLARE_AND_CHECK_HOST_ARCH(A64FX "ARMv8.2 with SVE Support") -DECLARE_AND_CHECK_HOST_ARCH(ARMV9_GRACE "ARMv9 NVIDIA Grace CPU") -DECLARE_AND_CHECK_HOST_ARCH(SNB "Intel Sandy/Ivy Bridge CPUs") -DECLARE_AND_CHECK_HOST_ARCH(HSW "Intel Haswell CPUs") -DECLARE_AND_CHECK_HOST_ARCH(BDW "Intel Broadwell Xeon E-class CPUs") -DECLARE_AND_CHECK_HOST_ARCH(ICL "Intel Ice Lake Client CPUs (AVX512)") -DECLARE_AND_CHECK_HOST_ARCH(ICX "Intel Ice Lake Xeon Server CPUs (AVX512)") -DECLARE_AND_CHECK_HOST_ARCH(SKL "Intel Skylake Client CPUs") -DECLARE_AND_CHECK_HOST_ARCH(SKX "Intel Skylake Xeon Server CPUs (AVX512)") -DECLARE_AND_CHECK_HOST_ARCH(KNC "Intel Knights Corner Xeon Phi") -DECLARE_AND_CHECK_HOST_ARCH(KNL "Intel Knights Landing Xeon Phi") -DECLARE_AND_CHECK_HOST_ARCH(SPR "Intel Sapphire Rapids Xeon Server CPUs (AVX512)") -DECLARE_AND_CHECK_HOST_ARCH(POWER8 "IBM POWER8 CPUs") -DECLARE_AND_CHECK_HOST_ARCH(POWER9 "IBM POWER9 CPUs") -DECLARE_AND_CHECK_HOST_ARCH(ZEN "AMD Zen architecture") -DECLARE_AND_CHECK_HOST_ARCH(ZEN2 "AMD Zen2 architecture") -DECLARE_AND_CHECK_HOST_ARCH(ZEN3 "AMD Zen3 architecture") -DECLARE_AND_CHECK_HOST_ARCH(RISCV_SG2042 "SG2042 (RISC-V) CPUs") - -IF(Kokkos_ENABLE_CUDA OR Kokkos_ENABLE_OPENMPTARGET OR Kokkos_ENABLE_OPENACC OR Kokkos_ENABLE_SYCL) - SET(KOKKOS_SHOW_CUDA_ARCHS ON) -ENDIF() - -KOKKOS_ARCH_OPTION(KEPLER30 GPU "NVIDIA Kepler generation CC 3.0" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(KEPLER32 GPU "NVIDIA Kepler generation CC 3.2" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(KEPLER35 GPU "NVIDIA Kepler generation CC 3.5" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(KEPLER37 GPU "NVIDIA Kepler generation CC 3.7" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(MAXWELL50 GPU "NVIDIA Maxwell generation CC 5.0" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(MAXWELL52 GPU "NVIDIA Maxwell generation CC 5.2" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(MAXWELL53 GPU "NVIDIA Maxwell generation CC 5.3" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(PASCAL60 GPU "NVIDIA Pascal generation CC 6.0" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(PASCAL61 GPU "NVIDIA Pascal generation CC 6.1" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(VOLTA70 GPU "NVIDIA Volta generation CC 7.0" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(VOLTA72 GPU "NVIDIA Volta generation CC 7.2" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(TURING75 GPU "NVIDIA Turing generation CC 7.5" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(AMPERE80 GPU "NVIDIA Ampere generation CC 8.0" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(AMPERE86 GPU "NVIDIA Ampere generation CC 8.6" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(ADA89 GPU "NVIDIA Ada generation CC 8.9" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(HOPPER90 GPU "NVIDIA Hopper generation CC 9.0" "KOKKOS_SHOW_CUDA_ARCHS") - -IF(Kokkos_ENABLE_HIP OR Kokkos_ENABLE_OPENMPTARGET OR Kokkos_ENABLE_OPENACC OR Kokkos_ENABLE_SYCL) - SET(KOKKOS_SHOW_HIP_ARCHS ON) -ENDIF() +kokkos_deprecated_list(ARCH ARCH) + +set(HOST_ARCH_ALREADY_SPECIFIED "") +macro(DECLARE_AND_CHECK_HOST_ARCH ARCH LABEL) + kokkos_arch_option(${ARCH} HOST "${LABEL}" TRUE) + if(KOKKOS_ARCH_${ARCH}) + if(HOST_ARCH_ALREADY_SPECIFIED) + message( + FATAL_ERROR + "Multiple host architectures given! Already have ${HOST_ARCH_ALREADY_SPECIFIED}, but trying to add ${ARCH}. If you are re-running CMake, try clearing the cache and running again." + ) + endif() + set(HOST_ARCH_ALREADY_SPECIFIED ${ARCH}) + endif() +endmacro() + +declare_and_check_host_arch(NATIVE "local machine") +declare_and_check_host_arch(AMDAVX "AMD chip") +declare_and_check_host_arch(ARMV80 "ARMv8.0 Compatible CPU") +declare_and_check_host_arch(ARMV81 "ARMv8.1 Compatible CPU") +declare_and_check_host_arch(ARMV8_THUNDERX "ARMv8 Cavium ThunderX CPU") +declare_and_check_host_arch(ARMV8_THUNDERX2 "ARMv8 Cavium ThunderX2 CPU") +declare_and_check_host_arch(A64FX "ARMv8.2 with SVE Support") +declare_and_check_host_arch(ARMV9_GRACE "ARMv9 NVIDIA Grace CPU") +declare_and_check_host_arch(SNB "Intel Sandy/Ivy Bridge CPUs") +declare_and_check_host_arch(HSW "Intel Haswell CPUs") +declare_and_check_host_arch(BDW "Intel Broadwell Xeon E-class CPUs") +declare_and_check_host_arch(ICL "Intel Ice Lake Client CPUs (AVX512)") +declare_and_check_host_arch(ICX "Intel Ice Lake Xeon Server CPUs (AVX512)") +declare_and_check_host_arch(SKL "Intel Skylake Client CPUs") +declare_and_check_host_arch(SKX "Intel Skylake Xeon Server CPUs (AVX512)") +declare_and_check_host_arch(KNC "Intel Knights Corner Xeon Phi") +declare_and_check_host_arch(KNL "Intel Knights Landing Xeon Phi") +declare_and_check_host_arch(SPR "Intel Sapphire Rapids Xeon Server CPUs (AVX512)") +declare_and_check_host_arch(POWER8 "IBM POWER8 CPUs") +declare_and_check_host_arch(POWER9 "IBM POWER9 CPUs") +declare_and_check_host_arch(ZEN "AMD Zen architecture") +declare_and_check_host_arch(ZEN2 "AMD Zen2 architecture") +declare_and_check_host_arch(ZEN3 "AMD Zen3 architecture") +declare_and_check_host_arch(RISCV_SG2042 "SG2042 (RISC-V) CPUs") +declare_and_check_host_arch(RISCV_RVA22V "RVA22V (RISC-V) CPUs") + +if(Kokkos_ENABLE_CUDA + OR Kokkos_ENABLE_OPENMPTARGET + OR Kokkos_ENABLE_OPENACC + OR Kokkos_ENABLE_SYCL +) + set(KOKKOS_SHOW_CUDA_ARCHS ON) +endif() + +kokkos_arch_option(KEPLER30 GPU "NVIDIA Kepler generation CC 3.0" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(KEPLER32 GPU "NVIDIA Kepler generation CC 3.2" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(KEPLER35 GPU "NVIDIA Kepler generation CC 3.5" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(KEPLER37 GPU "NVIDIA Kepler generation CC 3.7" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(MAXWELL50 GPU "NVIDIA Maxwell generation CC 5.0" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(MAXWELL52 GPU "NVIDIA Maxwell generation CC 5.2" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(MAXWELL53 GPU "NVIDIA Maxwell generation CC 5.3" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(PASCAL60 GPU "NVIDIA Pascal generation CC 6.0" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(PASCAL61 GPU "NVIDIA Pascal generation CC 6.1" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(VOLTA70 GPU "NVIDIA Volta generation CC 7.0" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(VOLTA72 GPU "NVIDIA Volta generation CC 7.2" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(TURING75 GPU "NVIDIA Turing generation CC 7.5" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(AMPERE80 GPU "NVIDIA Ampere generation CC 8.0" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(AMPERE86 GPU "NVIDIA Ampere generation CC 8.6" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(ADA89 GPU "NVIDIA Ada generation CC 8.9" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(HOPPER90 GPU "NVIDIA Hopper generation CC 9.0" "KOKKOS_SHOW_CUDA_ARCHS") + +if(Kokkos_ENABLE_HIP + OR Kokkos_ENABLE_OPENMPTARGET + OR Kokkos_ENABLE_OPENACC + OR Kokkos_ENABLE_SYCL +) + set(KOKKOS_SHOW_HIP_ARCHS ON) +endif() # AMD archs ordered in decreasing priority of autodetection -LIST(APPEND SUPPORTED_AMD_GPUS MI300 MI300) -LIST(APPEND SUPPORTED_AMD_ARCHS AMD_GFX942 AMD_GFX940) -LIST(APPEND CORRESPONDING_AMD_FLAGS gfx942 gfx940) -LIST(APPEND SUPPORTED_AMD_GPUS MI200 MI200 MI100 MI100) -LIST(APPEND SUPPORTED_AMD_ARCHS VEGA90A AMD_GFX90A VEGA908 AMD_GFX908) -LIST(APPEND CORRESPONDING_AMD_FLAGS gfx90a gfx90a gfx908 gfx908) -LIST(APPEND SUPPORTED_AMD_GPUS MI50/60 MI50/60) -LIST(APPEND SUPPORTED_AMD_ARCHS VEGA906 AMD_GFX906) -LIST(APPEND CORRESPONDING_AMD_FLAGS gfx906 gfx906) -LIST(APPEND SUPPORTED_AMD_GPUS RX7900XTX RX7900XTX V620/W6800 V620/W6800) -LIST(APPEND SUPPORTED_AMD_ARCHS NAVI1100 AMD_GFX1100 NAVI1030 AMD_GFX1030) -LIST(APPEND CORRESPONDING_AMD_FLAGS gfx1100 gfx1100 gfx1030 gfx1030) +list(APPEND SUPPORTED_AMD_GPUS MI300 MI300A MI300) +list(APPEND SUPPORTED_AMD_ARCHS AMD_GFX942 AMD_GFX942_APU AMD_GFX940) +list(APPEND CORRESPONDING_AMD_FLAGS gfx942 gfx942 gfx940) +list(APPEND SUPPORTED_AMD_GPUS MI200 MI200 MI100 MI100) +list(APPEND SUPPORTED_AMD_ARCHS VEGA90A AMD_GFX90A VEGA908 AMD_GFX908) +list(APPEND CORRESPONDING_AMD_FLAGS gfx90a gfx90a gfx908 gfx908) +list(APPEND SUPPORTED_AMD_GPUS MI50/60 MI50/60) +list(APPEND SUPPORTED_AMD_ARCHS VEGA906 AMD_GFX906) +list(APPEND CORRESPONDING_AMD_FLAGS gfx906 gfx906) +list(APPEND SUPPORTED_AMD_GPUS PHOENIX RX7900XTX V620/W6800 V620/W6800) +list(APPEND SUPPORTED_AMD_ARCHS AMD_GFX1103 AMD_GFX1100 NAVI1030 AMD_GFX1030) +list(APPEND CORRESPONDING_AMD_FLAGS gfx1103 gfx1100 gfx1030 gfx1030) #FIXME CAN BE REPLACED WITH LIST_ZIP IN CMAKE 3.17 -FOREACH(ARCH IN LISTS SUPPORTED_AMD_ARCHS) - LIST(FIND SUPPORTED_AMD_ARCHS ${ARCH} LIST_INDEX) - LIST(GET SUPPORTED_AMD_GPUS ${LIST_INDEX} GPU) - LIST(GET CORRESPONDING_AMD_FLAGS ${LIST_INDEX} FLAG) - KOKKOS_ARCH_OPTION(${ARCH} GPU "AMD GPU ${GPU} ${FLAG}" "KOKKOS_SHOW_HIP_ARCHS") -ENDFOREACH() - -IF(Kokkos_ENABLE_SYCL OR Kokkos_ENABLE_OPENMPTARGET) - SET(KOKKOS_SHOW_SYCL_ARCHS ON) -ENDIF() - -KOKKOS_ARCH_OPTION(INTEL_GEN GPU "SPIR64-based devices, e.g. Intel GPUs, using JIT" "KOKKOS_SHOW_SYCL_ARCHS") -KOKKOS_ARCH_OPTION(INTEL_DG1 GPU "Intel Iris XeMAX GPU" "KOKKOS_SHOW_SYCL_ARCHS") -KOKKOS_ARCH_OPTION(INTEL_GEN9 GPU "Intel GPU Gen9" "KOKKOS_SHOW_SYCL_ARCHS") -KOKKOS_ARCH_OPTION(INTEL_GEN11 GPU "Intel GPU Gen11" "KOKKOS_SHOW_SYCL_ARCHS") -KOKKOS_ARCH_OPTION(INTEL_GEN12LP GPU "Intel GPU Gen12LP" "KOKKOS_SHOW_SYCL_ARCHS") -KOKKOS_ARCH_OPTION(INTEL_XEHP GPU "Intel GPU Xe-HP" "KOKKOS_SHOW_SYCL_ARCHS") -KOKKOS_ARCH_OPTION(INTEL_PVC GPU "Intel GPU Ponte Vecchio" "KOKKOS_SHOW_SYCL_ARCHS") - -IF(KOKKOS_ENABLE_COMPILER_WARNINGS) - SET(COMMON_WARNINGS - "-Wall" "-Wextra" "-Wunused-parameter" "-Wshadow" "-pedantic" - "-Wsign-compare" "-Wtype-limits" "-Wuninitialized") +foreach(ARCH IN LISTS SUPPORTED_AMD_ARCHS) + list(FIND SUPPORTED_AMD_ARCHS ${ARCH} LIST_INDEX) + list(GET SUPPORTED_AMD_GPUS ${LIST_INDEX} GPU) + list(GET CORRESPONDING_AMD_FLAGS ${LIST_INDEX} FLAG) + kokkos_arch_option(${ARCH} GPU "AMD GPU ${GPU} ${FLAG}" "KOKKOS_SHOW_HIP_ARCHS") +endforeach() + +if(Kokkos_ENABLE_SYCL OR Kokkos_ENABLE_OPENMPTARGET) + set(KOKKOS_SHOW_SYCL_ARCHS ON) +endif() + +kokkos_arch_option(INTEL_GEN GPU "SPIR64-based devices, e.g. Intel GPUs, using JIT" "KOKKOS_SHOW_SYCL_ARCHS") +kokkos_arch_option(INTEL_DG1 GPU "Intel Iris XeMAX GPU" "KOKKOS_SHOW_SYCL_ARCHS") +kokkos_arch_option(INTEL_GEN9 GPU "Intel GPU Gen9" "KOKKOS_SHOW_SYCL_ARCHS") +kokkos_arch_option(INTEL_GEN11 GPU "Intel GPU Gen11" "KOKKOS_SHOW_SYCL_ARCHS") +kokkos_arch_option(INTEL_GEN12LP GPU "Intel GPU Gen12LP" "KOKKOS_SHOW_SYCL_ARCHS") +kokkos_arch_option(INTEL_XEHP GPU "Intel GPU Xe-HP" "KOKKOS_SHOW_SYCL_ARCHS") +kokkos_arch_option(INTEL_PVC GPU "Intel GPU Ponte Vecchio" "KOKKOS_SHOW_SYCL_ARCHS") + +if(KOKKOS_ENABLE_COMPILER_WARNINGS) + set(COMMON_WARNINGS + "-Wall" + "-Wextra" + "-Wunused-parameter" + "-Wshadow" + "-pedantic" + "-Wsign-compare" + "-Wtype-limits" + "-Wuninitialized" + "-Wsuggest-override" + ) # NOTE KOKKOS_ prefixed variable (all uppercase) is not set yet because TPLs are processed after ARCH - IF(Kokkos_ENABLE_LIBQUADMATH) + if(Kokkos_ENABLE_LIBQUADMATH) # warning: non-standard suffix on floating constant [-Wpedantic] - LIST(REMOVE_ITEM COMMON_WARNINGS "-pedantic") - ENDIF() + list(REMOVE_ITEM COMMON_WARNINGS "-pedantic") + endif() # NVHPC compiler does not support -Wtype-limits. - IF(KOKKOS_ENABLE_OPENACC) - IF(KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) - LIST(REMOVE_ITEM COMMON_WARNINGS "-Wtype-limits") - ENDIF() - ENDIF() - - IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang) - LIST(APPEND COMMON_WARNINGS "-Wimplicit-fallthrough") - ENDIF() - - SET(GNU_WARNINGS "-Wempty-body" "-Wclobbered" "-Wignored-qualifiers" - ${COMMON_WARNINGS}) - IF(KOKKOS_CXX_COMPILER_ID STREQUAL GNU) - LIST(APPEND GNU_WARNINGS "-Wimplicit-fallthrough") - ENDIF() + if(KOKKOS_ENABLE_OPENACC) + if(KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) + list(REMOVE_ITEM COMMON_WARNINGS "-Wtype-limits") + endif() + endif() + + # ICPC doesn't support -Wsuggest-override + if(KOKKOS_CXX_COMPILER_ID STREQUAL Intel) + list(REMOVE_ITEM COMMON_WARNINGS "-Wsuggest-override") + endif() + + if(KOKKOS_CXX_COMPILER_ID STREQUAL Clang) + list(APPEND COMMON_WARNINGS "-Wimplicit-fallthrough") + endif() + + set(GNU_WARNINGS "-Wempty-body" "-Wclobbered" "-Wignored-qualifiers" ${COMMON_WARNINGS}) + if(KOKKOS_CXX_COMPILER_ID STREQUAL GNU) + list(APPEND GNU_WARNINGS "-Wimplicit-fallthrough") + endif() # Not using COMPILER_SPECIFIC_FLAGS function so the warning flags are not passed downstream - IF(CMAKE_CXX_COMPILER_ID STREQUAL GNU) - STRING(REPLACE ";" " " WARNING_FLAGS "${GNU_WARNINGS}") - ELSEIF(CMAKE_CXX_COMPILER_ID STREQUAL NVHPC) + if(CMAKE_CXX_COMPILER_ID STREQUAL GNU) + string(REPLACE ";" " " WARNING_FLAGS "${GNU_WARNINGS}") + elseif(CMAKE_CXX_COMPILER_ID STREQUAL NVHPC) # FIXME_NVHPC - ELSE() - STRING(REPLACE ";" " " WARNING_FLAGS "${COMMON_WARNINGS}") - ENDIF() - SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${WARNING_FLAGS}") -ENDIF() - + else() + string(REPLACE ";" " " WARNING_FLAGS "${COMMON_WARNINGS}") + endif() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${WARNING_FLAGS}") +endif() #------------------------------- KOKKOS_CUDA_OPTIONS --------------------------- #clear anything that might be in the cache -GLOBAL_SET(KOKKOS_CUDA_OPTIONS) +global_set(KOKKOS_CUDA_OPTIONS) # Construct the Makefile options -IF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) - GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS "-extended-lambda") - GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS "-Wext-lambda-captures-this") -ENDIF() - -IF (KOKKOS_ENABLE_CUDA_CONSTEXPR) - IF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) - GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS "-expt-relaxed-constexpr") - ENDIF() -ENDIF() - -IF (KOKKOS_CXX_COMPILER_ID STREQUAL Clang) - SET(CUDA_ARCH_FLAG "--cuda-gpu-arch") - GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS -x cuda) +if(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + global_append(KOKKOS_CUDA_OPTIONS "-extended-lambda") + global_append(KOKKOS_CUDA_OPTIONS "-Wext-lambda-captures-this") +endif() + +if(KOKKOS_ENABLE_CUDA_CONSTEXPR) + if(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + global_append(KOKKOS_CUDA_OPTIONS "-expt-relaxed-constexpr") + endif() +endif() + +if(KOKKOS_CXX_COMPILER_ID STREQUAL Clang) + set(CUDA_ARCH_FLAG "--cuda-gpu-arch") + global_append(KOKKOS_CUDA_OPTIONS -x cuda) # Kokkos_CUDA_DIR has priority over CUDAToolkit_BIN_DIR - IF (Kokkos_CUDA_DIR) - GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS --cuda-path=${Kokkos_CUDA_DIR}) - ELSEIF(CUDAToolkit_BIN_DIR) - GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS --cuda-path=${CUDAToolkit_BIN_DIR}/..) - ENDIF() -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) - SET(CUDA_ARCH_FLAG "-arch") -ENDIF() - -IF (KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) - STRING(TOUPPER "${CMAKE_BUILD_TYPE}" _UPPERCASE_CMAKE_BUILD_TYPE) - IF (KOKKOS_ENABLE_DEBUG OR _UPPERCASE_CMAKE_BUILD_TYPE STREQUAL "DEBUG") - GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS -lineinfo) - ENDIF() - UNSET(_UPPERCASE_CMAKE_BUILD_TYPE) -ENDIF() - + if(Kokkos_CUDA_DIR) + global_append(KOKKOS_CUDA_OPTIONS --cuda-path=${Kokkos_CUDA_DIR}) + elseif(CUDAToolkit_BIN_DIR) + global_append(KOKKOS_CUDA_OPTIONS --cuda-path=${CUDAToolkit_BIN_DIR}/..) + endif() +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + set(CUDA_ARCH_FLAG "-arch") +endif() + +if(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + string(TOUPPER "${CMAKE_BUILD_TYPE}" _UPPERCASE_CMAKE_BUILD_TYPE) + if(KOKKOS_ENABLE_DEBUG OR _UPPERCASE_CMAKE_BUILD_TYPE STREQUAL "DEBUG") + global_append(KOKKOS_CUDA_OPTIONS -lineinfo) + endif() + unset(_UPPERCASE_CMAKE_BUILD_TYPE) +endif() #------------------------------- KOKKOS_HIP_OPTIONS --------------------------- -KOKKOS_OPTION(IMPL_AMDGPU_FLAGS "" STRING "Set compiler flags for AMD GPUs") -KOKKOS_OPTION(IMPL_AMDGPU_LINK "" STRING "Set linker flags for AMD GPUs") -MARK_AS_ADVANCED(Kokkos_IMPL_AMDGPU_FLAGS) -MARK_AS_ADVANCED(Kokkos_IMPL_AMDGPU_LINK) +kokkos_option(IMPL_AMDGPU_FLAGS "" STRING "Set compiler flags for AMD GPUs") +kokkos_option(IMPL_AMDGPU_LINK "" STRING "Set linker flags for AMD GPUs") +mark_as_advanced(Kokkos_IMPL_AMDGPU_FLAGS) +mark_as_advanced(Kokkos_IMPL_AMDGPU_LINK) #clear anything that might be in the cache -GLOBAL_SET(KOKKOS_AMDGPU_OPTIONS) -IF(KOKKOS_ENABLE_HIP) - SET(AMDGPU_ARCH_FLAG "--offload-arch") - IF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC) - IF (NOT CMAKE_CXX_STANDARD) - MESSAGE(FATAL_ERROR "Kokkos requires CMAKE_CXX_STANDARD to set to 17 or higher") - ENDIF() - GLOBAL_APPEND(KOKKOS_AMDGPU_OPTIONS -xhip) - IF(DEFINED ENV{ROCM_PATH}) - GLOBAL_APPEND(KOKKOS_AMDGPU_OPTIONS --rocm-path=$ENV{ROCM_PATH}) - ENDIF() - ENDIF() -ENDIF() - - -IF(KOKKOS_ARCH_NATIVE) - IF(KOKKOS_CXX_HOST_COMPILER_ID STREQUAL "MSVC") - MESSAGE(FATAL_ERROR "MSVC doesn't support ARCH_NATIVE!") - ENDIF() - - STRING(TOUPPER "${CMAKE_SYSTEM_PROCESSOR}" KOKKOS_UC_SYSTEM_PROCESSOR) - IF(KOKKOS_UC_SYSTEM_PROCESSOR MATCHES "(X86)|(AMD64)") - SET(KOKKOS_NATIVE_FLAGS "-march=native;-mtune=native") - ELSE() - SET(KOKKOS_NATIVE_FLAGS "-mcpu=native") - ENDIF() - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - NVHPC -tp=native - DEFAULT ${KOKKOS_NATIVE_FLAGS} +global_set(KOKKOS_AMDGPU_OPTIONS) +if(KOKKOS_ENABLE_HIP) + set(AMDGPU_ARCH_FLAG "--offload-arch") + if(NOT KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC) + if(NOT CMAKE_CXX_STANDARD) + message(FATAL_ERROR "Kokkos requires CMAKE_CXX_STANDARD to set to 17 or higher") + endif() + global_append(KOKKOS_AMDGPU_OPTIONS -xhip) + if(DEFINED ENV{ROCM_PATH}) + global_append(KOKKOS_AMDGPU_OPTIONS --rocm-path=$ENV{ROCM_PATH}) + endif() + endif() +endif() + +if(KOKKOS_ARCH_NATIVE) + if(KOKKOS_CXX_HOST_COMPILER_ID STREQUAL "MSVC") + message(FATAL_ERROR "MSVC doesn't support ARCH_NATIVE!") + endif() + + string(TOUPPER "${CMAKE_SYSTEM_PROCESSOR}" KOKKOS_UC_SYSTEM_PROCESSOR) + if(KOKKOS_UC_SYSTEM_PROCESSOR MATCHES "(X86)|(AMD64)") + set(KOKKOS_NATIVE_FLAGS "-march=native;-mtune=native") + else() + set(KOKKOS_NATIVE_FLAGS "-mcpu=native") + endif() + compiler_specific_flags(COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID NVHPC -tp=native DEFAULT ${KOKKOS_NATIVE_FLAGS}) +endif() + +if(KOKKOS_ARCH_ARMV80) + set(KOKKOS_ARCH_ARM_NEON ON) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Cray + NO-VALUE-SPECIFIED + MSVC + /arch:armv8.0 + NVHPC + NO-VALUE-SPECIFIED + DEFAULT + -march=armv8-a ) -ENDIF() - -IF (KOKKOS_ARCH_ARMV80) - SET(KOKKOS_ARCH_ARM_NEON ON) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Cray NO-VALUE-SPECIFIED - MSVC /arch:armv8.0 - NVHPC NO-VALUE-SPECIFIED - DEFAULT -march=armv8-a +endif() + +if(KOKKOS_ARCH_ARMV81) + set(KOKKOS_ARCH_ARM_NEON ON) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Cray + NO-VALUE-SPECIFIED + MSVC + /arch:armv8.1 + NVHPC + NO-VALUE-SPECIFIED + DEFAULT + -march=armv8.1-a ) -ENDIF() - -IF (KOKKOS_ARCH_ARMV81) - SET(KOKKOS_ARCH_ARM_NEON ON) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Cray NO-VALUE-SPECIFIED - MSVC /arch:armv8.1 - NVHPC NO-VALUE-SPECIFIED - DEFAULT -march=armv8.1-a +endif() + +if(KOKKOS_ARCH_ARMV8_THUNDERX) + set(KOKKOS_ARCH_ARM_NEON ON) + set(KOKKOS_ARCH_ARMV80 ON) #Not a cache variable + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Cray + NO-VALUE-SPECIFIED + MSVC + /arch:armv8.0 + NVHPC + NO-VALUE-SPECIFIED + DEFAULT + -march=armv8-a + -mtune=thunderx ) -ENDIF() - -IF (KOKKOS_ARCH_ARMV8_THUNDERX) - SET(KOKKOS_ARCH_ARM_NEON ON) - SET(KOKKOS_ARCH_ARMV80 ON) #Not a cache variable - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Cray NO-VALUE-SPECIFIED - MSVC /arch:armv8.0 - NVHPC NO-VALUE-SPECIFIED - DEFAULT -march=armv8-a -mtune=thunderx +endif() + +if(KOKKOS_ARCH_ARMV8_THUNDERX2) + set(KOKKOS_ARCH_ARM_NEON ON) + set(KOKKOS_ARCH_ARMV81 ON) #Not a cache variable + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Cray + NO-VALUE-SPECIFIED + MSVC + /arch:armv8.1 + NVHPC + NO-VALUE-SPECIFIED + DEFAULT + -mcpu=thunderx2t99 + -mtune=thunderx2t99 ) -ENDIF() - -IF (KOKKOS_ARCH_ARMV8_THUNDERX2) - SET(KOKKOS_ARCH_ARM_NEON ON) - SET(KOKKOS_ARCH_ARMV81 ON) #Not a cache variable - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Cray NO-VALUE-SPECIFIED - MSVC /arch:armv8.1 - NVHPC NO-VALUE-SPECIFIED - DEFAULT -mcpu=thunderx2t99 -mtune=thunderx2t99 +endif() + +if(KOKKOS_ARCH_A64FX) + set(KOKKOS_ARCH_ARM_NEON ON) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Clang + -march=armv8.2-a+sve + -msve-vector-bits=512 + GNU + -march=armv8.2-a+sve + -msve-vector-bits=512 + MSVC + NO-VALUE-SPECIFIED + NVHPC + NO-VALUE-SPECIFIED + DEFAULT + -march=armv8.2-a+sve ) -ENDIF() - -IF (KOKKOS_ARCH_A64FX) - SET(KOKKOS_ARCH_ARM_NEON ON) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Clang -march=armv8.2-a+sve -msve-vector-bits=512 - GNU -march=armv8.2-a+sve -msve-vector-bits=512 - MSVC NO-VALUE-SPECIFIED - NVHPC NO-VALUE-SPECIFIED - DEFAULT -march=armv8.2-a+sve - ) -ENDIF() +endif() -IF (KOKKOS_ARCH_ARMV9_GRACE) - SET(KOKKOS_ARCH_ARM_NEON ON) +if(KOKKOS_ARCH_ARMV9_GRACE) + set(KOKKOS_ARCH_ARM_NEON ON) check_cxx_compiler_flag("-mcpu=neoverse-n2" COMPILER_SUPPORTS_NEOVERSE_N2) check_cxx_compiler_flag("-msve-vector-bits=128" COMPILER_SUPPORTS_SVE_VECTOR_BITS) - IF (COMPILER_SUPPORTS_NEOVERSE_N2 AND COMPILER_SUPPORTS_SVE_VECTOR_BITS) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - DEFAULT -mcpu=neoverse-n2 -msve-vector-bits=128 - ) - ELSE() - MESSAGE(WARNING "Compiler does not support ARMv9 Grace architecture") - ENDIF() -ENDIF() - -IF (KOKKOS_ARCH_ZEN) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Intel -mavx2 - MSVC /arch:AVX2 - NVHPC -tp=zen - DEFAULT -march=znver1 -mtune=znver1 + if(COMPILER_SUPPORTS_NEOVERSE_N2 AND COMPILER_SUPPORTS_SVE_VECTOR_BITS) + compiler_specific_flags(COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID DEFAULT -mcpu=neoverse-n2 -msve-vector-bits=128) + else() + message(WARNING "Compiler does not support ARMv9 Grace architecture") + endif() +endif() + +if(KOKKOS_ARCH_ZEN) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Intel + -mavx2 + MSVC + /arch:AVX2 + NVHPC + -tp=zen + DEFAULT + -march=znver1 + -mtune=znver1 ) - SET(KOKKOS_ARCH_AMD_ZEN ON) - SET(KOKKOS_ARCH_AVX2 ON) -ENDIF() - -IF (KOKKOS_ARCH_ZEN2) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Intel -mavx2 - MSVC /arch:AVX2 - NVHPC -tp=zen2 - DEFAULT -march=znver2 -mtune=znver2 + set(KOKKOS_ARCH_AMD_ZEN ON) + set(KOKKOS_ARCH_AVX2 ON) +endif() + +if(KOKKOS_ARCH_ZEN2) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Intel + -mavx2 + MSVC + /arch:AVX2 + NVHPC + -tp=zen2 + DEFAULT + -march=znver2 + -mtune=znver2 ) - SET(KOKKOS_ARCH_AMD_ZEN2 ON) - SET(KOKKOS_ARCH_AVX2 ON) -ENDIF() - -IF (KOKKOS_ARCH_ZEN3) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Intel -mavx2 - MSVC /arch:AVX2 - NVHPC -tp=zen2 - DEFAULT -march=znver3 -mtune=znver3 + set(KOKKOS_ARCH_AMD_ZEN2 ON) + set(KOKKOS_ARCH_AVX2 ON) +endif() + +if(KOKKOS_ARCH_ZEN3) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Intel + -mavx2 + MSVC + /arch:AVX2 + NVHPC + -tp=zen2 + DEFAULT + -march=znver3 + -mtune=znver3 ) - SET(KOKKOS_ARCH_AMD_ZEN3 ON) - SET(KOKKOS_ARCH_AVX2 ON) -ENDIF() - -IF (KOKKOS_ARCH_SNB OR KOKKOS_ARCH_AMDAVX) - SET(KOKKOS_ARCH_AVX ON) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Cray NO-VALUE-SPECIFIED - Intel -mavx - MSVC /arch:AVX - NVHPC -tp=sandybridge - DEFAULT -mavx + set(KOKKOS_ARCH_AMD_ZEN3 ON) + set(KOKKOS_ARCH_AVX2 ON) +endif() + +if(KOKKOS_ARCH_SNB OR KOKKOS_ARCH_AMDAVX) + set(KOKKOS_ARCH_AVX ON) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Cray + NO-VALUE-SPECIFIED + Intel + -mavx + MSVC + /arch:AVX + NVHPC + -tp=sandybridge + DEFAULT + -mavx ) -ENDIF() - -IF (KOKKOS_ARCH_HSW) - SET(KOKKOS_ARCH_AVX2 ON) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Cray NO-VALUE-SPECIFIED - Intel -xCORE-AVX2 - MSVC /arch:AVX2 - NVHPC -tp=haswell - DEFAULT -march=core-avx2 -mtune=core-avx2 +endif() + +if(KOKKOS_ARCH_HSW) + set(KOKKOS_ARCH_AVX2 ON) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Cray + NO-VALUE-SPECIFIED + Intel + -xCORE-AVX2 + MSVC + /arch:AVX2 + NVHPC + -tp=haswell + DEFAULT + -march=core-avx2 + -mtune=core-avx2 ) -ENDIF() - -IF (KOKKOS_ARCH_RISCV_SG2042) - IF(NOT - (KOKKOS_CXX_COMPILER_ID STREQUAL GNU - AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12) - OR - (KOKKOS_CXX_COMPILER_ID STREQUAL Clang - AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 14) +endif() + +if(KOKKOS_ARCH_RISCV_SG2042) + if(NOT (KOKKOS_CXX_COMPILER_ID STREQUAL GNU AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12) + OR (KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 14) ) - MESSAGE(SEND_ERROR "Only gcc >= 12 and clang >= 14 support RISC-V.") - ENDIF() - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - DEFAULT -march=rv64imafdcv - ) -ENDIF() - - -IF (KOKKOS_ARCH_BDW) - SET(KOKKOS_ARCH_AVX2 ON) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Cray NO-VALUE-SPECIFIED - Intel -xCORE-AVX2 - MSVC /arch:AVX2 - NVHPC -tp=haswell - DEFAULT -march=core-avx2 -mtune=core-avx2 -mrtm + message(SEND_ERROR "Only gcc >= 12 and clang >= 14 support RISC-V.") + endif() + compiler_specific_flags(COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID DEFAULT -march=rv64imafdcv) +endif() + +if(KOKKOS_ARCH_RISCV_RVA22V) + if(NOT (KOKKOS_CXX_COMPILER_ID STREQUAL GNU AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12) + OR (KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 14) ) -ENDIF() - -IF (KOKKOS_ARCH_KNL) - #avx512-mic - SET(KOKKOS_ARCH_AVX512MIC ON) #not a cache variable - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Cray NO-VALUE-SPECIFIED - Intel -xMIC-AVX512 - MSVC /arch:AVX512 - NVHPC -tp=knl - DEFAULT -march=knl -mtune=knl + message(SEND_ERROR "Only gcc >= 12 and clang >= 14 support RISC-V.") + endif() + compiler_specific_flags( + COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID DEFAULT + -march=rv64imafdcv_sscofpmf_sstc_svpbmt_zicbom_zicboz_zicbop_zihintpause + ) +endif() + +if(KOKKOS_ARCH_BDW) + set(KOKKOS_ARCH_AVX2 ON) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Cray + NO-VALUE-SPECIFIED + Intel + -xCORE-AVX2 + MSVC + /arch:AVX2 + NVHPC + -tp=haswell + DEFAULT + -march=core-avx2 + -mtune=core-avx2 + -mrtm ) -ENDIF() +endif() -IF (KOKKOS_ARCH_KNC) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - MSVC NO-VALUE-SPECIFIED - DEFAULT -mmic +if(KOKKOS_ARCH_KNL) + #avx512-mic + set(KOKKOS_ARCH_AVX512MIC ON) #not a cache variable + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Cray + NO-VALUE-SPECIFIED + Intel + -xMIC-AVX512 + MSVC + /arch:AVX512 + NVHPC + -tp=knl + DEFAULT + -march=knl + -mtune=knl ) -ENDIF() - -IF (KOKKOS_ARCH_SKL) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Cray NO-VALUE-SPECIFIED - Intel -xSKYLAKE - MSVC /arch:AVX2 - NVHPC -tp=skylake - DEFAULT -march=skylake -mtune=skylake +endif() + +if(KOKKOS_ARCH_KNC) + compiler_specific_flags(COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID MSVC NO-VALUE-SPECIFIED DEFAULT -mmic) +endif() + +if(KOKKOS_ARCH_SKL) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Cray + NO-VALUE-SPECIFIED + Intel + -xSKYLAKE + MSVC + /arch:AVX2 + NVHPC + -tp=skylake + DEFAULT + -march=skylake + -mtune=skylake ) -ENDIF() - -IF (KOKKOS_ARCH_SKX) - SET(KOKKOS_ARCH_AVX512XEON ON) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Cray NO-VALUE-SPECIFIED - Intel -xCORE-AVX512 - MSVC /arch:AVX512 - NVHPC -tp=skylake - DEFAULT -march=skylake-avx512 -mtune=skylake-avx512 +endif() + +if(KOKKOS_ARCH_SKX) + set(KOKKOS_ARCH_AVX512XEON ON) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Cray + NO-VALUE-SPECIFIED + Intel + -xCORE-AVX512 + MSVC + /arch:AVX512 + NVHPC + -tp=skylake + DEFAULT + -march=skylake-avx512 + -mtune=skylake-avx512 ) -ENDIF() - -IF (KOKKOS_ARCH_ICL) - SET(KOKKOS_ARCH_AVX512XEON ON) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - MSVC /arch:AVX512 - DEFAULT -march=icelake-client -mtune=icelake-client +endif() + +if(KOKKOS_ARCH_ICL) + set(KOKKOS_ARCH_AVX512XEON ON) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + MSVC + /arch:AVX512 + DEFAULT + -march=icelake-client + -mtune=icelake-client ) -ENDIF() - -IF (KOKKOS_ARCH_ICX) - SET(KOKKOS_ARCH_AVX512XEON ON) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - MSVC /arch:AVX512 - DEFAULT -march=icelake-server -mtune=icelake-server +endif() + +if(KOKKOS_ARCH_ICX) + set(KOKKOS_ARCH_AVX512XEON ON) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + MSVC + /arch:AVX512 + DEFAULT + -march=icelake-server + -mtune=icelake-server ) -ENDIF() - -IF (KOKKOS_ARCH_SPR) - SET(KOKKOS_ARCH_AVX512XEON ON) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - MSVC /arch:AVX512 - DEFAULT -march=sapphirerapids -mtune=sapphirerapids +endif() + +if(KOKKOS_ARCH_SPR) + set(KOKKOS_ARCH_AVX512XEON ON) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + MSVC + /arch:AVX512 + DEFAULT + -march=sapphirerapids + -mtune=sapphirerapids ) -ENDIF() - -IF (KOKKOS_ARCH_POWER7) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - MSVC NO-VALUE-SPECIFIED - NVHPC NO-VALUE-SPECIFIED - DEFAULT -mcpu=power7 -mtune=power7 +endif() + +if(KOKKOS_ARCH_POWER7) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + MSVC + NO-VALUE-SPECIFIED + NVHPC + NO-VALUE-SPECIFIED + DEFAULT + -mcpu=power7 + -mtune=power7 ) -ENDIF() - -IF (KOKKOS_ARCH_POWER8) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - MSVC NO-VALUE-SPECIFIED - NVHPC -tp=pwr8 - DEFAULT -mcpu=power8 -mtune=power8 +endif() + +if(KOKKOS_ARCH_POWER8) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + MSVC + NO-VALUE-SPECIFIED + NVHPC + -tp=pwr8 + DEFAULT + -mcpu=power8 + -mtune=power8 ) -ENDIF() - -IF (KOKKOS_ARCH_POWER9) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - MSVC NO-VALUE-SPECIFIED - NVHPC -tp=pwr9 - DEFAULT -mcpu=power9 -mtune=power9 +endif() + +if(KOKKOS_ARCH_POWER9) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + MSVC + NO-VALUE-SPECIFIED + NVHPC + -tp=pwr9 + DEFAULT + -mcpu=power9 + -mtune=power9 ) -ENDIF() +endif() # If Kokkos_ARCH_NATIVE is enabled, we are trying to autodetect # the SIMD capabilities based on compiler macros. -IF (KOKKOS_ARCH_NATIVE) +if(KOKKOS_ARCH_NATIVE) # Make sure to rerun the checks if compile options have changed - IF(NOT "${KOKKOS_COMPILE_OPTIONS}" STREQUAL "${KOKKOS_COMPILE_OPTIONS_SAVED}") - SET(KOKKOS_COMPILE_OPTIONS_SAVED "${KOKKOS_COMPILE_OPTIONS}" CACHE INTERNAL "") - - SET(CMAKE_REQUIRED_QUIET ON) - SET(CMAKE_REQUIRED_FLAGS "${KOKKOS_COMPILE_OPTIONS}") - INCLUDE(CheckCXXSymbolExists) - - UNSET(KOKKOS_COMPILER_HAS_AVX512 CACHE) - CHECK_CXX_SYMBOL_EXISTS(__AVX512F__ "" KOKKOS_COMPILER_HAS_AVX512) - UNSET(KOKKOS_COMPILER_HAS_AVX2 CACHE) - CHECK_CXX_SYMBOL_EXISTS(__AVX2__ "" KOKKOS_COMPILER_HAS_AVX2) - UNSET(KOKKOS_COMPILER_HAS_ARM_NEON CACHE) - CHECK_CXX_SYMBOL_EXISTS(__ARM_NEON "" KOKKOS_COMPILER_HAS_ARM_NEON) - UNSET(KOKKOS_COMPILER_HAS_AVX CACHE) - CHECK_CXX_SYMBOL_EXISTS(__AVX__ "" KOKKOS_COMPILER_HAS_AVX) - SET(CMAKE_REQUIRED_FLAGS "${KOKKOS_COMPILE_OPTIONS}") - - UNSET(CMAKE_REQUIRED_QUIET) - UNSET(CMAKE_REQUIRED_FLAGS) - ENDIF() + if(NOT "${KOKKOS_COMPILE_OPTIONS}" STREQUAL "${KOKKOS_COMPILE_OPTIONS_SAVED}") + set(KOKKOS_COMPILE_OPTIONS_SAVED "${KOKKOS_COMPILE_OPTIONS}" CACHE INTERNAL "") + + set(CMAKE_REQUIRED_QUIET ON) + set(CMAKE_REQUIRED_FLAGS "${KOKKOS_COMPILE_OPTIONS}") + include(CheckCXXSymbolExists) + + unset(KOKKOS_COMPILER_HAS_AVX512 CACHE) + check_cxx_symbol_exists(__AVX512F__ "" KOKKOS_COMPILER_HAS_AVX512) + unset(KOKKOS_COMPILER_HAS_AVX2 CACHE) + check_cxx_symbol_exists(__AVX2__ "" KOKKOS_COMPILER_HAS_AVX2) + unset(KOKKOS_COMPILER_HAS_ARM_NEON CACHE) + check_cxx_symbol_exists(__ARM_NEON "" KOKKOS_COMPILER_HAS_ARM_NEON) + unset(KOKKOS_COMPILER_HAS_AVX CACHE) + check_cxx_symbol_exists(__AVX__ "" KOKKOS_COMPILER_HAS_AVX) + set(CMAKE_REQUIRED_FLAGS "${KOKKOS_COMPILE_OPTIONS}") + + unset(CMAKE_REQUIRED_QUIET) + unset(CMAKE_REQUIRED_FLAGS) + endif() # Only define one of these macros for now # to be uniform with what we are doing for other architectures. - IF(KOKKOS_COMPILER_HAS_AVX512) - MESSAGE(STATUS "SIMD: AVX512 detected") - SET(KOKKOS_ARCH_AVX512XEON ON) - ELSEIF(KOKKOS_COMPILER_HAS_AVX2) - MESSAGE(STATUS "SIMD: AVX2 detected") - SET(KOKKOS_ARCH_AVX2 ON) - ELSEIF(KOKKOS_COMPILER_HAS_ARM_NEON) - MESSAGE(STATUS "SIMD: ARM_NEON detected") - SET(KOKKOS_ARCH_ARM_NEON ON) - ELSEIF(KOKKOS_COMPILER_HAS_AVX) - MESSAGE(STATUS "SIMD: AVX detected") - SET(KOKKOS_ARCH_AVX ON) - ENDIF() -ENDIF() + if(KOKKOS_COMPILER_HAS_AVX512) + message(STATUS "SIMD: AVX512 detected") + set(KOKKOS_ARCH_AVX512XEON ON) + elseif(KOKKOS_COMPILER_HAS_AVX2) + message(STATUS "SIMD: AVX2 detected") + set(KOKKOS_ARCH_AVX2 ON) + elseif(KOKKOS_COMPILER_HAS_ARM_NEON) + message(STATUS "SIMD: ARM_NEON detected") + set(KOKKOS_ARCH_ARM_NEON ON) + elseif(KOKKOS_COMPILER_HAS_AVX) + message(STATUS "SIMD: AVX detected") + set(KOKKOS_ARCH_AVX ON) + endif() +endif() # FIXME_NVHPC nvc++ doesn't seem to support AVX512. -IF (KOKKOS_CXX_HOST_COMPILER_ID STREQUAL NVHPC) - SET(KOKKOS_ARCH_AVX512XEON OFF) -ENDIF() +if(KOKKOS_CXX_HOST_COMPILER_ID STREQUAL NVHPC) + set(KOKKOS_ARCH_AVX512XEON OFF) +endif() # FIXME_NVCC nvcc doesn't seem to support Arm Neon. -IF(KOKKOS_ARCH_ARM_NEON AND KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) - UNSET(KOKKOS_ARCH_ARM_NEON) -ENDIF() - -IF (NOT KOKKOS_COMPILE_LANGUAGE STREQUAL CUDA) - IF (KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE) - COMPILER_SPECIFIC_FLAGS( - Clang -fcuda-rdc - NVIDIA --relocatable-device-code=true - ) - ENDIF() -ENDIF() +if(KOKKOS_ARCH_ARM_NEON AND KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + unset(KOKKOS_ARCH_ARM_NEON) +endif() + +if(NOT KOKKOS_COMPILE_LANGUAGE STREQUAL CUDA) + if(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE) + compiler_specific_flags(Clang -fcuda-rdc NVIDIA --relocatable-device-code=true) + endif() +endif() # Clang needs mcx16 option enabled for Windows atomic functions -IF (CMAKE_CXX_COMPILER_ID STREQUAL Clang AND WIN32) - COMPILER_SPECIFIC_OPTIONS( - Clang -mcx16 - ) -ENDIF() +if(CMAKE_CXX_COMPILER_ID STREQUAL Clang AND WIN32) + compiler_specific_options(Clang -mcx16) +endif() # MSVC ABI has many deprecation warnings, so ignore them -IF (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC" OR "x${CMAKE_CXX_SIMULATE_ID}" STREQUAL "xMSVC") - COMPILER_SPECIFIC_DEFS( - Clang _CRT_SECURE_NO_WARNINGS - ) -ENDIF() - +if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC" OR "x${CMAKE_CXX_SIMULATE_ID}" STREQUAL "xMSVC") + compiler_specific_defs(Clang _CRT_SECURE_NO_WARNINGS) +endif() #Right now we cannot get the compiler ID when cross-compiling, so just check #that HIP is enabled -IF (KOKKOS_ENABLE_HIP) - IF (KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE) - COMPILER_SPECIFIC_FLAGS( - DEFAULT -fgpu-rdc - ) - IF (NOT KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC AND NOT KOKKOS_IMPL_AMDGPU_FLAGS) - COMPILER_SPECIFIC_LINK_OPTIONS( - DEFAULT --hip-link - ) - ENDIF() - ELSE() - COMPILER_SPECIFIC_FLAGS( - DEFAULT -fno-gpu-rdc - ) - ENDIF() -ENDIF() - -IF (KOKKOS_ENABLE_SYCL) - COMPILER_SPECIFIC_FLAGS( - DEFAULT -fsycl -fno-sycl-id-queries-fit-in-int -fsycl-dead-args-optimization - ) - COMPILER_SPECIFIC_OPTIONS( - DEFAULT -fsycl-unnamed-lambda - ) -ENDIF() +if(KOKKOS_ENABLE_HIP) + if(KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE) + compiler_specific_flags(DEFAULT -fgpu-rdc) + if(NOT KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC AND NOT KOKKOS_IMPL_AMDGPU_FLAGS) + compiler_specific_link_options(DEFAULT --hip-link) + endif() + else() + compiler_specific_flags(DEFAULT -fno-gpu-rdc) + endif() +endif() + +if(KOKKOS_ENABLE_SYCL) + compiler_specific_flags(DEFAULT -fsycl -fno-sycl-id-queries-fit-in-int -fsycl-dead-args-optimization) + compiler_specific_options(DEFAULT -fsycl-unnamed-lambda) + if(KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM AND KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 2024.1.0) + # Before oneAPI 2024.1.0 passing -fno-sycl didn't work properly + if(NOT KOKKOS_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE) + message(FATAL_ERROR "Kokkos_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE=OFF requires oneAPI 2024.1.0 or later") + endif() + elseif(KOKKOS_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE) + compiler_specific_options(DEFAULT -fsycl-rdc) + else() + compiler_specific_options(DEFAULT -fno-sycl-rdc) + endif() +endif() # Check support for device_global variables # FIXME_SYCL If SYCL_EXT_ONEAPI_DEVICE_GLOBAL is defined, we can use device @@ -613,17 +734,18 @@ ENDIF() # implementation. Otherwise, the feature is not supported when building shared # libraries. Thus, we don't even check for support if shared libraries are # requested and SYCL_EXT_ONEAPI_DEVICE_GLOBAL is not defined. -IF(KOKKOS_ENABLE_SYCL) - STRING(REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${KOKKOS_COMPILE_OPTIONS}") - INCLUDE(CheckCXXSymbolExists) - CHECK_CXX_SYMBOL_EXISTS(SYCL_EXT_ONEAPI_DEVICE_GLOBAL "sycl/sycl.hpp" KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_DEVICE_GLOBAL) - IF (KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_DEVICE_GLOBAL) - SET(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED ON) +if(KOKKOS_ENABLE_SYCL) + string(REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${KOKKOS_COMPILE_OPTIONS}") + include(CheckCXXSymbolExists) + check_cxx_symbol_exists(SYCL_EXT_ONEAPI_DEVICE_GLOBAL "sycl/sycl.hpp" KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_DEVICE_GLOBAL) + if(KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_DEVICE_GLOBAL) + set(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED ON) # Use the non-separable compilation implementation to support shared libraries as well. - COMPILER_SPECIFIC_FLAGS(DEFAULT -DDESUL_SYCL_DEVICE_GLOBAL_SUPPORTED) - ELSEIF(NOT BUILD_SHARED_LIBS) - INCLUDE(CheckCXXSourceCompiles) - CHECK_CXX_SOURCE_COMPILES(" + compiler_specific_flags(DEFAULT -DDESUL_SYCL_DEVICE_GLOBAL_SUPPORTED) + elseif(NOT BUILD_SHARED_LIBS AND KOKKOS_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE) + include(CheckCXXSourceCompiles) + check_cxx_source_compiles( + " #include using namespace sycl::ext::oneapi::experimental; using namespace sycl; @@ -638,548 +760,617 @@ IF(KOKKOS_ENABLE_SYCL) int main(){ return 0; } " - KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED) + KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED + ) - IF(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED) + if(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED) # Only the separable compilation implementation is supported. - COMPILER_SPECIFIC_FLAGS( - DEFAULT -fsycl-device-code-split=off -DDESUL_SYCL_DEVICE_GLOBAL_SUPPORTED + compiler_specific_flags(DEFAULT -fsycl-device-code-split=off -DDESUL_SYCL_DEVICE_GLOBAL_SUPPORTED) + endif() + endif() + + check_cxx_symbol_exists(SYCL_EXT_ONEAPI_GRAPH "sycl/sycl.hpp" KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_GRAPH) +endif() + +set(CUDA_ARCH_ALREADY_SPECIFIED "") +function(CHECK_CUDA_ARCH ARCH FLAG) + if(KOKKOS_ARCH_${ARCH}) + if(CUDA_ARCH_ALREADY_SPECIFIED) + message( + FATAL_ERROR + "Multiple GPU architectures given! Already have ${CUDA_ARCH_ALREADY_SPECIFIED}, but trying to add ${ARCH}. If you are re-running CMake, try clearing the cache and running again." ) - ENDIF() - ENDIF() -ENDIF() - -SET(CUDA_ARCH_ALREADY_SPECIFIED "") -FUNCTION(CHECK_CUDA_ARCH ARCH FLAG) - IF(KOKKOS_ARCH_${ARCH}) - IF(CUDA_ARCH_ALREADY_SPECIFIED) - MESSAGE(FATAL_ERROR "Multiple GPU architectures given! Already have ${CUDA_ARCH_ALREADY_SPECIFIED}, but trying to add ${ARCH}. If you are re-running CMake, try clearing the cache and running again.") - ENDIF() - SET(CUDA_ARCH_ALREADY_SPECIFIED ${ARCH} PARENT_SCOPE) - IF (NOT KOKKOS_ENABLE_CUDA AND NOT KOKKOS_ENABLE_OPENMPTARGET AND NOT KOKKOS_ENABLE_SYCL AND NOT KOKKOS_ENABLE_OPENACC) - MESSAGE(WARNING "Given CUDA arch ${ARCH}, but Kokkos_ENABLE_CUDA, Kokkos_ENABLE_SYCL, Kokkos_ENABLE_OPENACC, and Kokkos_ENABLE_OPENMPTARGET are OFF. Option will be ignored.") - UNSET(KOKKOS_ARCH_${ARCH} PARENT_SCOPE) - ELSE() - IF(KOKKOS_ENABLE_CUDA) - STRING(REPLACE "sm_" "" CMAKE_ARCH ${FLAG}) - SET(KOKKOS_CUDA_ARCHITECTURES ${CMAKE_ARCH}) - SET(KOKKOS_CUDA_ARCHITECTURES ${CMAKE_ARCH} PARENT_SCOPE) - ENDIF() - SET(KOKKOS_CUDA_ARCH_FLAG ${FLAG} PARENT_SCOPE) - IF(KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) - SET(CMAKE_CUDA_ARCHITECTURES ${KOKKOS_CUDA_ARCHITECTURES} PARENT_SCOPE) - ELSE() - GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS "${CUDA_ARCH_FLAG}=${FLAG}") - IF(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE OR KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) - GLOBAL_APPEND(KOKKOS_LINK_OPTIONS "${CUDA_ARCH_FLAG}=${FLAG}") - ENDIF() - ENDIF() - ENDIF() - ENDIF() - LIST(APPEND KOKKOS_CUDA_ARCH_FLAGS ${FLAG}) - SET(KOKKOS_CUDA_ARCH_FLAGS ${KOKKOS_CUDA_ARCH_FLAGS} PARENT_SCOPE) - LIST(APPEND KOKKOS_CUDA_ARCH_LIST ${ARCH}) - SET(KOKKOS_CUDA_ARCH_LIST ${KOKKOS_CUDA_ARCH_LIST} PARENT_SCOPE) -ENDFUNCTION() - + endif() + set(CUDA_ARCH_ALREADY_SPECIFIED ${ARCH} PARENT_SCOPE) + if(NOT KOKKOS_ENABLE_CUDA + AND NOT KOKKOS_ENABLE_OPENMPTARGET + AND NOT KOKKOS_ENABLE_SYCL + AND NOT KOKKOS_ENABLE_OPENACC + ) + message( + WARNING + "Given CUDA arch ${ARCH}, but Kokkos_ENABLE_CUDA, Kokkos_ENABLE_SYCL, Kokkos_ENABLE_OPENACC, and Kokkos_ENABLE_OPENMPTARGET are OFF. Option will be ignored." + ) + unset(KOKKOS_ARCH_${ARCH} PARENT_SCOPE) + else() + if(KOKKOS_ENABLE_CUDA) + string(REPLACE "sm_" "" CMAKE_ARCH ${FLAG}) + set(KOKKOS_CUDA_ARCHITECTURES ${CMAKE_ARCH}) + set(KOKKOS_CUDA_ARCHITECTURES ${CMAKE_ARCH} PARENT_SCOPE) + endif() + set(KOKKOS_CUDA_ARCH_FLAG ${FLAG} PARENT_SCOPE) + if(KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) + set(CMAKE_CUDA_ARCHITECTURES ${KOKKOS_CUDA_ARCHITECTURES} PARENT_SCOPE) + else() + global_append(KOKKOS_CUDA_OPTIONS "${CUDA_ARCH_FLAG}=${FLAG}") + if(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE OR KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + global_append(KOKKOS_LINK_OPTIONS "${CUDA_ARCH_FLAG}=${FLAG}") + endif() + endif() + endif() + endif() + list(APPEND KOKKOS_CUDA_ARCH_FLAGS ${FLAG}) + set(KOKKOS_CUDA_ARCH_FLAGS ${KOKKOS_CUDA_ARCH_FLAGS} PARENT_SCOPE) + list(APPEND KOKKOS_CUDA_ARCH_LIST ${ARCH}) + set(KOKKOS_CUDA_ARCH_LIST ${KOKKOS_CUDA_ARCH_LIST} PARENT_SCOPE) +endfunction() #These will define KOKKOS_CUDA_ARCH_FLAG #to the corresponding flag name if ON -CHECK_CUDA_ARCH(KEPLER30 sm_30) -CHECK_CUDA_ARCH(KEPLER32 sm_32) -CHECK_CUDA_ARCH(KEPLER35 sm_35) -CHECK_CUDA_ARCH(KEPLER37 sm_37) -CHECK_CUDA_ARCH(MAXWELL50 sm_50) -CHECK_CUDA_ARCH(MAXWELL52 sm_52) -CHECK_CUDA_ARCH(MAXWELL53 sm_53) -CHECK_CUDA_ARCH(PASCAL60 sm_60) -CHECK_CUDA_ARCH(PASCAL61 sm_61) -CHECK_CUDA_ARCH(VOLTA70 sm_70) -CHECK_CUDA_ARCH(VOLTA72 sm_72) -CHECK_CUDA_ARCH(TURING75 sm_75) -CHECK_CUDA_ARCH(AMPERE80 sm_80) -CHECK_CUDA_ARCH(AMPERE86 sm_86) -CHECK_CUDA_ARCH(ADA89 sm_89) -CHECK_CUDA_ARCH(HOPPER90 sm_90) - -SET(AMDGPU_ARCH_ALREADY_SPECIFIED "") -FUNCTION(CHECK_AMDGPU_ARCH ARCH FLAG) - IF(KOKKOS_ARCH_${ARCH}) - IF(AMDGPU_ARCH_ALREADY_SPECIFIED) - MESSAGE(FATAL_ERROR "Multiple GPU architectures given! Already have ${AMDGPU_ARCH_ALREADY_SPECIFIED}, but trying to add ${ARCH}. If you are re-running CMake, try clearing the cache and running again.") - ENDIF() - SET(AMDGPU_ARCH_ALREADY_SPECIFIED ${ARCH} PARENT_SCOPE) - IF (NOT KOKKOS_ENABLE_HIP AND NOT KOKKOS_ENABLE_OPENMPTARGET AND NOT KOKKOS_ENABLE_OPENACC AND NOT KOKKOS_ENABLE_SYCL) - MESSAGE(WARNING "Given AMD GPU architecture ${ARCH}, but Kokkos_ENABLE_HIP, Kokkos_ENABLE_SYCL, Kokkos_ENABLE_OPENACC, and Kokkos_ENABLE_OPENMPTARGET are OFF. Option will be ignored.") - UNSET(KOKKOS_ARCH_${ARCH} PARENT_SCOPE) - ELSE() - IF(KOKKOS_ENABLE_HIP) - SET(KOKKOS_HIP_ARCHITECTURES ${FLAG} PARENT_SCOPE) - ENDIF() - IF(NOT KOKKOS_IMPL_AMDGPU_FLAGS) - SET(KOKKOS_AMDGPU_ARCH_FLAG ${FLAG} PARENT_SCOPE) - GLOBAL_APPEND(KOKKOS_AMDGPU_OPTIONS "${AMDGPU_ARCH_FLAG}=${FLAG}") - ENDIF() - IF(KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE) - GLOBAL_APPEND(KOKKOS_LINK_OPTIONS "${AMDGPU_ARCH_FLAG}=${FLAG}") - ENDIF() - ENDIF() - ENDIF() -ENDFUNCTION() +check_cuda_arch(KEPLER30 sm_30) +check_cuda_arch(KEPLER32 sm_32) +check_cuda_arch(KEPLER35 sm_35) +check_cuda_arch(KEPLER37 sm_37) +check_cuda_arch(MAXWELL50 sm_50) +check_cuda_arch(MAXWELL52 sm_52) +check_cuda_arch(MAXWELL53 sm_53) +check_cuda_arch(PASCAL60 sm_60) +check_cuda_arch(PASCAL61 sm_61) +check_cuda_arch(VOLTA70 sm_70) +check_cuda_arch(VOLTA72 sm_72) +check_cuda_arch(TURING75 sm_75) +check_cuda_arch(AMPERE80 sm_80) +check_cuda_arch(AMPERE86 sm_86) +check_cuda_arch(ADA89 sm_89) +check_cuda_arch(HOPPER90 sm_90) + +set(AMDGPU_ARCH_ALREADY_SPECIFIED "") +function(CHECK_AMDGPU_ARCH ARCH FLAG) + if(KOKKOS_ARCH_${ARCH}) + if(AMDGPU_ARCH_ALREADY_SPECIFIED) + message( + FATAL_ERROR + "Multiple GPU architectures given! Already have ${AMDGPU_ARCH_ALREADY_SPECIFIED}, but trying to add ${ARCH}. If you are re-running CMake, try clearing the cache and running again." + ) + endif() + set(AMDGPU_ARCH_ALREADY_SPECIFIED ${ARCH} PARENT_SCOPE) + if(NOT KOKKOS_ENABLE_HIP + AND NOT KOKKOS_ENABLE_OPENMPTARGET + AND NOT KOKKOS_ENABLE_OPENACC + AND NOT KOKKOS_ENABLE_SYCL + ) + message( + WARNING + "Given AMD GPU architecture ${ARCH}, but Kokkos_ENABLE_HIP, Kokkos_ENABLE_SYCL, Kokkos_ENABLE_OPENACC, and Kokkos_ENABLE_OPENMPTARGET are OFF. Option will be ignored." + ) + unset(KOKKOS_ARCH_${ARCH} PARENT_SCOPE) + else() + if(KOKKOS_ENABLE_HIP) + set(KOKKOS_HIP_ARCHITECTURES ${FLAG} PARENT_SCOPE) + endif() + if(NOT KOKKOS_IMPL_AMDGPU_FLAGS) + set(KOKKOS_AMDGPU_ARCH_FLAG ${FLAG} PARENT_SCOPE) + global_append(KOKKOS_AMDGPU_OPTIONS "${AMDGPU_ARCH_FLAG}=${FLAG}") + endif() + if(KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE) + global_append(KOKKOS_LINK_OPTIONS "${AMDGPU_ARCH_FLAG}=${FLAG}") + endif() + endif() + endif() +endfunction() #These will define KOKKOS_AMDGPU_ARCH_FLAG #to the corresponding flag name if ON -FOREACH(ARCH IN LISTS SUPPORTED_AMD_ARCHS) - LIST(FIND SUPPORTED_AMD_ARCHS ${ARCH} LIST_INDEX) - LIST(GET CORRESPONDING_AMD_FLAGS ${LIST_INDEX} FLAG) - CHECK_AMDGPU_ARCH(${ARCH} ${FLAG}) -ENDFOREACH() - -IF(KOKKOS_IMPL_AMDGPU_FLAGS) - IF (NOT AMDGPU_ARCH_ALREADY_SPECIFIED) - MESSAGE(FATAL_ERROR "When IMPL_AMDGPU_FLAGS is set the architecture autodectection is disabled. " - "Please explicitly set the GPU architecture.") - ENDIF() - GLOBAL_APPEND(KOKKOS_AMDGPU_OPTIONS "${KOKKOS_IMPL_AMDGPU_FLAGS}") - GLOBAL_APPEND(KOKKOS_LINK_OPTIONS "${KOKKOS_IMPL_AMDGPU_LINK}") -ENDIF() - -MACRO(SET_AND_CHECK_AMD_ARCH ARCH FLAG) - KOKKOS_SET_OPTION(ARCH_${ARCH} ON) - CHECK_AMDGPU_ARCH(${ARCH} ${FLAG}) - LIST(APPEND KOKKOS_ENABLED_ARCH_LIST ${ARCH}) -ENDMACRO() - -MACRO(CHECK_MULTIPLE_INTEL_ARCH) - IF(KOKKOS_ARCH_INTEL_GPU) - MESSAGE(FATAL_ERROR "Specifying multiple Intel GPU architectures is not allowed!") - ENDIF() - SET(KOKKOS_ARCH_INTEL_GPU ON) -ENDMACRO() - -IF(KOKKOS_ARCH_INTEL_GEN) - CHECK_MULTIPLE_INTEL_ARCH() -ENDIF() -IF(KOKKOS_ARCH_INTEL_DG1) - CHECK_MULTIPLE_INTEL_ARCH() -ENDIF() -IF(KOKKOS_ARCH_INTEL_GEN9) - CHECK_MULTIPLE_INTEL_ARCH() -ENDIF() -IF(KOKKOS_ARCH_INTEL_GEN11) - CHECK_MULTIPLE_INTEL_ARCH() -ENDIF() -IF(KOKKOS_ARCH_INTEL_GEN12LP) - CHECK_MULTIPLE_INTEL_ARCH() -ENDIF() -IF(KOKKOS_ARCH_INTEL_XEHP) - CHECK_MULTIPLE_INTEL_ARCH() -ENDIF() -IF(KOKKOS_ARCH_INTEL_PVC) - CHECK_MULTIPLE_INTEL_ARCH() -ENDIF() - -IF (KOKKOS_ENABLE_OPENMPTARGET) - SET(CLANG_CUDA_ARCH ${KOKKOS_CUDA_ARCH_FLAG}) - IF (CLANG_CUDA_ARCH) - IF(KOKKOS_CLANG_IS_CRAY) - COMPILER_SPECIFIC_FLAGS( - Cray -fopenmp - ) - ELSE() - STRING(REPLACE "sm_" "cc" NVHPC_CUDA_ARCH ${CLANG_CUDA_ARCH}) - COMPILER_SPECIFIC_FLAGS( - Clang -Xopenmp-target -march=${CLANG_CUDA_ARCH} -fopenmp-targets=nvptx64 - NVHPC -gpu=${NVHPC_CUDA_ARCH} +foreach(ARCH IN LISTS SUPPORTED_AMD_ARCHS) + list(FIND SUPPORTED_AMD_ARCHS ${ARCH} LIST_INDEX) + list(GET CORRESPONDING_AMD_FLAGS ${LIST_INDEX} FLAG) + check_amdgpu_arch(${ARCH} ${FLAG}) +endforeach() + +if(KOKKOS_IMPL_AMDGPU_FLAGS) + if(NOT AMDGPU_ARCH_ALREADY_SPECIFIED) + message(FATAL_ERROR "When IMPL_AMDGPU_FLAGS is set the architecture autodectection is disabled. " + "Please explicitly set the GPU architecture." + ) + endif() + global_append(KOKKOS_AMDGPU_OPTIONS "${KOKKOS_IMPL_AMDGPU_FLAGS}") + global_append(KOKKOS_LINK_OPTIONS "${KOKKOS_IMPL_AMDGPU_LINK}") +endif() + +macro(SET_AND_CHECK_AMD_ARCH ARCH FLAG) + kokkos_set_option(ARCH_${ARCH} ON) + check_amdgpu_arch(${ARCH} ${FLAG}) + list(APPEND KOKKOS_ENABLED_ARCH_LIST ${ARCH}) +endmacro() + +macro(CHECK_MULTIPLE_INTEL_ARCH) + if(KOKKOS_ARCH_INTEL_GPU) + message(FATAL_ERROR "Specifying multiple Intel GPU architectures is not allowed!") + endif() + set(KOKKOS_ARCH_INTEL_GPU ON) +endmacro() + +if(KOKKOS_ARCH_INTEL_GEN) + check_multiple_intel_arch() +endif() +if(KOKKOS_ARCH_INTEL_DG1) + check_multiple_intel_arch() +endif() +if(KOKKOS_ARCH_INTEL_GEN9) + check_multiple_intel_arch() +endif() +if(KOKKOS_ARCH_INTEL_GEN11) + check_multiple_intel_arch() +endif() +if(KOKKOS_ARCH_INTEL_GEN12LP) + check_multiple_intel_arch() +endif() +if(KOKKOS_ARCH_INTEL_XEHP) + check_multiple_intel_arch() +endif() +if(KOKKOS_ARCH_INTEL_PVC) + check_multiple_intel_arch() +endif() + +if(KOKKOS_ENABLE_OPENMP) + compiler_specific_link_options(CrayClang -fopenmp) +endif() + +if(KOKKOS_ENABLE_OPENMPTARGET) + set(CLANG_CUDA_ARCH ${KOKKOS_CUDA_ARCH_FLAG}) + if(CLANG_CUDA_ARCH) + if(KOKKOS_CLANG_IS_CRAY) + compiler_specific_flags(Cray -fopenmp) + else() + string(REPLACE "sm_" "cc" NVHPC_CUDA_ARCH ${CLANG_CUDA_ARCH}) + compiler_specific_flags( + Clang -Xopenmp-target -march=${CLANG_CUDA_ARCH} -fopenmp-targets=nvptx64 NVHPC -gpu=${NVHPC_CUDA_ARCH} ) - ENDIF() - ENDIF() - SET(CLANG_AMDGPU_ARCH ${KOKKOS_AMDGPU_ARCH_FLAG}) - IF (CLANG_AMDGPU_ARCH) - COMPILER_SPECIFIC_FLAGS( + endif() + endif() + set(CLANG_AMDGPU_ARCH ${KOKKOS_AMDGPU_ARCH_FLAG}) + if(CLANG_AMDGPU_ARCH) + compiler_specific_flags( Clang -Xopenmp-target=amdgcn-amd-amdhsa -march=${CLANG_AMDGPU_ARCH} -fopenmp-targets=amdgcn-amd-amdhsa ) - ENDIF() - IF (KOKKOS_ARCH_INTEL_GEN) - COMPILER_SPECIFIC_FLAGS( - IntelLLVM -fopenmp-targets=spir64 -D__STRICT_ANSI__ - ) - ELSE() - COMPILER_SPECIFIC_OPTIONS( - IntelLLVM -fopenmp-targets=spir64_gen -D__STRICT_ANSI__ - ) - IF(KOKKOS_ARCH_INTEL_GEN9) - COMPILER_SPECIFIC_LINK_OPTIONS( - IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device gen9" - ) - ELSEIF(KOKKOS_ARCH_INTEL_GEN11) - COMPILER_SPECIFIC_LINK_OPTIONS( - IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device gen11" + endif() + if(KOKKOS_ARCH_INTEL_GEN) + compiler_specific_flags(IntelLLVM -fopenmp-targets=spir64 -D__STRICT_ANSI__) + else() + compiler_specific_options(IntelLLVM -fopenmp-targets=spir64_gen -D__STRICT_ANSI__) + if(KOKKOS_ARCH_INTEL_GEN9) + compiler_specific_link_options(IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device gen9") + elseif(KOKKOS_ARCH_INTEL_GEN11) + compiler_specific_link_options(IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device gen11") + elseif(KOKKOS_ARCH_INTEL_GEN12LP) + compiler_specific_link_options(IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device gen12lp") + elseif(KOKKOS_ARCH_INTEL_DG1) + compiler_specific_link_options(IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device dg1") + elseif(KOKKOS_ARCH_INTEL_XEHP) + compiler_specific_link_options(IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device 12.50.4") + elseif(KOKKOS_ARCH_INTEL_PVC) + compiler_specific_link_options(IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device 12.60.7") + endif() + endif() +endif() + +if(KOKKOS_ENABLE_OPENACC) + if(KOKKOS_CUDA_ARCH_FLAG) + if(KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE) + message( + FATAL_ERROR + "If a GPU architecture is specified, Kokkos_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE option cannot be used. Disable the Kokkos_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE option." ) - ELSEIF(KOKKOS_ARCH_INTEL_GEN12LP) - COMPILER_SPECIFIC_LINK_OPTIONS( - IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device gen12lp" - ) - ELSEIF(KOKKOS_ARCH_INTEL_DG1) - COMPILER_SPECIFIC_LINK_OPTIONS( - IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device dg1" - ) - ELSEIF(KOKKOS_ARCH_INTEL_XEHP) - COMPILER_SPECIFIC_LINK_OPTIONS( - IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device 12.50.4" - ) - ELSEIF(KOKKOS_ARCH_INTEL_PVC) - COMPILER_SPECIFIC_LINK_OPTIONS( - IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device 12.60.7" - ) - ENDIF() - ENDIF() -ENDIF() - -IF (KOKKOS_ENABLE_OPENACC) - IF(KOKKOS_CUDA_ARCH_FLAG) - SET(CLANG_CUDA_ARCH ${KOKKOS_CUDA_ARCH_FLAG}) - STRING(REPLACE "sm_" "cc" NVHPC_CUDA_ARCH ${KOKKOS_CUDA_ARCH_FLAG}) - COMPILER_SPECIFIC_FLAGS( - NVHPC -acc -gpu=${NVHPC_CUDA_ARCH} - Clang -Xopenmp-target=nvptx64-nvidia-cuda -march=${CLANG_CUDA_ARCH} - -fopenmp-targets=nvptx64-nvidia-cuda + endif() + set(CLANG_CUDA_ARCH ${KOKKOS_CUDA_ARCH_FLAG}) + string(REPLACE "sm_" "cc" NVHPC_CUDA_ARCH ${KOKKOS_CUDA_ARCH_FLAG}) + compiler_specific_flags( + NVHPC + -acc + -gpu=${NVHPC_CUDA_ARCH} + Clang + -Xopenmp-target=nvptx64-nvidia-cuda + -march=${CLANG_CUDA_ARCH} + -fopenmp-targets=nvptx64-nvidia-cuda ) - ELSEIF(KOKKOS_AMDGPU_ARCH_FLAG) - COMPILER_SPECIFIC_FLAGS( - Clang -Xopenmp-target=amdgcn-amd-amdhsa -march=${KOKKOS_AMDGPU_ARCH_FLAG} - -fopenmp-targets=amdgcn-amd-amdhsa - ) - ELSE() - COMPILER_SPECIFIC_FLAGS( - NVHPC -acc - ) - ENDIF() -ENDIF() - -IF (KOKKOS_ENABLE_SYCL) - IF(CUDA_ARCH_ALREADY_SPECIFIED) - IF(KOKKOS_ENABLE_UNSUPPORTED_ARCHS) - COMPILER_SPECIFIC_FLAGS( - DEFAULT -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=${KOKKOS_CUDA_ARCH_FLAG} + if(DEFINED ENV{CUDA_PATH}) + compiler_specific_link_options(Clang -L$ENV{CUDA_PATH}/lib64) + endif() + compiler_specific_libs(Clang -lcudart NVHPC -cuda) + elseif(KOKKOS_AMDGPU_ARCH_FLAG) + if(KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE) + message( + FATAL_ERROR + "If a GPU architecture is specified, Kokkos_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE option cannot be used. Disable the Kokkos_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE option." ) - ELSE() - MESSAGE(SEND_ERROR "Setting a CUDA architecture for SYCL is only allowed with Kokkos_ENABLE_UNSUPPORTED_ARCHS=ON!") - ENDIF() - ELSEIF(AMDGPU_ARCH_ALREADY_SPECIFIED) - IF(KOKKOS_ENABLE_UNSUPPORTED_ARCHS) - COMPILER_SPECIFIC_FLAGS( - DEFAULT -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=${KOKKOS_AMDGPU_ARCH_FLAG} - ) - ELSE() - MESSAGE(SEND_ERROR "Setting a AMDGPU architecture for SYCL is only allowed with Kokkos_ENABLE_UNSUPPORTED_ARCHS=ON!") - ENDIF() - ELSEIF(KOKKOS_ARCH_INTEL_GEN) - COMPILER_SPECIFIC_FLAGS( - DEFAULT -fsycl-targets=spir64 + endif() + compiler_specific_flags( + Clang -Xopenmp-target=amdgcn-amd-amdhsa -march=${KOKKOS_AMDGPU_ARCH_FLAG} -fopenmp-targets=amdgcn-amd-amdhsa ) - ELSE() - COMPILER_SPECIFIC_OPTIONS( - DEFAULT -fsycl-targets=spir64_gen + if(DEFINED ENV{ROCM_PATH}) + compiler_specific_flags(Clang -I$ENV{ROCM_PATH}/include) + compiler_specific_link_options(Clang -L$ENV{ROCM_PATH}/lib) + endif() + compiler_specific_libs(Clang -lamdhip64) + elseif(KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE) + # Compile for kernel execution on the host. In that case, + # memory is shared between the OpenACC space and the host space. + compiler_specific_flags(NVHPC -acc=multicore) + else() + # Automatic fallback mode; try to offload any available GPU, and fall back + # to the host CPU if no available GPU is found. + compiler_specific_flags(NVHPC -acc=gpu,multicore) + message( + STATUS + "No OpenACC target device is specificed; the OpenACC backend will be executed in an automatic fallback mode." ) - IF(KOKKOS_ARCH_INTEL_GEN9) - COMPILER_SPECIFIC_LINK_OPTIONS( - DEFAULT -fsycl-targets=spir64_gen -Xsycl-target-backend "-device gen9" - ) - ELSEIF(KOKKOS_ARCH_INTEL_GEN11) - COMPILER_SPECIFIC_LINK_OPTIONS( - DEFAULT -fsycl-targets=spir64_gen -Xsycl-target-backend "-device gen11" + endif() +endif() + +if(KOKKOS_ENABLE_SYCL) + if(CUDA_ARCH_ALREADY_SPECIFIED) + if(KOKKOS_ENABLE_UNSUPPORTED_ARCHS) + compiler_specific_flags( + DEFAULT -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend=nvptx64-nvidia-cuda + --cuda-gpu-arch=${KOKKOS_CUDA_ARCH_FLAG} ) - ELSEIF(KOKKOS_ARCH_INTEL_GEN12LP) - COMPILER_SPECIFIC_LINK_OPTIONS( - DEFAULT -fsycl-targets=spir64_gen -Xsycl-target-backend "-device gen12lp" + else() + message( + SEND_ERROR "Setting a CUDA architecture for SYCL is only allowed with Kokkos_ENABLE_UNSUPPORTED_ARCHS=ON!" ) - ELSEIF(KOKKOS_ARCH_INTEL_DG1) - COMPILER_SPECIFIC_LINK_OPTIONS( - DEFAULT -fsycl-targets=spir64_gen -Xsycl-target-backend "-device dg1" - ) - ELSEIF(KOKKOS_ARCH_INTEL_XEHP) - COMPILER_SPECIFIC_LINK_OPTIONS( - DEFAULT -fsycl-targets=spir64_gen -Xsycl-target-backend "-device 12.50.4" + endif() + elseif(AMDGPU_ARCH_ALREADY_SPECIFIED) + if(KOKKOS_ENABLE_UNSUPPORTED_ARCHS) + compiler_specific_flags( + DEFAULT -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=${KOKKOS_AMDGPU_ARCH_FLAG} ) - ELSEIF(KOKKOS_ARCH_INTEL_PVC) - COMPILER_SPECIFIC_LINK_OPTIONS( - DEFAULT -fsycl-targets=spir64_gen -Xsycl-target-backend "-device 12.60.7" + else() + message( + SEND_ERROR "Setting a AMDGPU architecture for SYCL is only allowed with Kokkos_ENABLE_UNSUPPORTED_ARCHS=ON!" ) - ENDIF() - ENDIF() -ENDIF() - -IF(KOKKOS_ENABLE_CUDA AND NOT CUDA_ARCH_ALREADY_SPECIFIED) + endif() + elseif(KOKKOS_ARCH_INTEL_GEN) + compiler_specific_flags(DEFAULT -fsycl-targets=spir64) + elseif(KOKKOS_ARCH_INTEL_GPU) + set(SYCL_TARGET_FLAG -fsycl-targets=spir64_gen) + + if(KOKKOS_ARCH_INTEL_GEN9) + set(SYCL_TARGET_BACKEND_FLAG -Xsycl-target-backend "-device gen9") + elseif(KOKKOS_ARCH_INTEL_GEN11) + set(SYCL_TARGET_BACKEND_FLAG -Xsycl-target-backend "-device gen11") + elseif(KOKKOS_ARCH_INTEL_GEN12LP) + set(SYCL_TARGET_BACKEND_FLAG -Xsycl-target-backend "-device gen12lp") + elseif(KOKKOS_ARCH_INTEL_DG1) + set(SYCL_TARGET_BACKEND_FLAG -Xsycl-target-backend "-device dg1") + elseif(KOKKOS_ARCH_INTEL_XEHP) + set(SYCL_TARGET_BACKEND_FLAG -Xsycl-target-backend "-device 12.50.4") + elseif(KOKKOS_ARCH_INTEL_PVC) + set(SYCL_TARGET_BACKEND_FLAG -Xsycl-target-backend "-device 12.60.7") + endif() + + if(Kokkos_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE) + compiler_specific_options(DEFAULT ${SYCL_TARGET_FLAG}) + compiler_specific_link_options(DEFAULT ${SYCL_TARGET_FLAG} ${SYCL_TARGET_BACKEND_FLAG}) + else() + compiler_specific_options(DEFAULT ${SYCL_TARGET_FLAG} ${SYCL_TARGET_BACKEND_FLAG}) + endif() + endif() +endif() + +if(KOKKOS_ENABLE_CUDA AND NOT CUDA_ARCH_ALREADY_SPECIFIED) # Try to autodetect the CUDA Compute Capability by asking the device - SET(_BINARY_TEST_DIR ${CMAKE_CURRENT_BINARY_DIR}/cmake/compile_tests/CUDAComputeCapabilityWorkdir) - FILE(REMOVE_RECURSE ${_BINARY_TEST_DIR}) - FILE(MAKE_DIRECTORY ${_BINARY_TEST_DIR}) - - TRY_RUN( - _RESULT - _COMPILE_RESULT - ${_BINARY_TEST_DIR} - ${CMAKE_CURRENT_SOURCE_DIR}/cmake/compile_tests/cuda_compute_capability.cc - COMPILE_DEFINITIONS -DSM_ONLY - RUN_OUTPUT_VARIABLE _CUDA_COMPUTE_CAPABILITY) + set(_BINARY_TEST_DIR ${CMAKE_CURRENT_BINARY_DIR}/cmake/compile_tests/CUDAComputeCapabilityWorkdir) + file(REMOVE_RECURSE ${_BINARY_TEST_DIR}) + file(MAKE_DIRECTORY ${_BINARY_TEST_DIR}) + + try_run(_RESULT _COMPILE_RESULT ${_BINARY_TEST_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/cmake/compile_tests/cuda_compute_capability.cc COMPILE_DEFINITIONS -DSM_ONLY + RUN_OUTPUT_VARIABLE _CUDA_COMPUTE_CAPABILITY + ) # if user is using kokkos_compiler_launcher, above will fail. - IF(NOT _COMPILE_RESULT OR NOT _RESULT EQUAL 0) + if(NOT _COMPILE_RESULT OR NOT _RESULT EQUAL 0) # check to see if CUDA is not already enabled (may happen when Kokkos is subproject) - GET_PROPERTY(_ENABLED_LANGUAGES GLOBAL PROPERTY ENABLED_LANGUAGES) + get_property(_ENABLED_LANGUAGES GLOBAL PROPERTY ENABLED_LANGUAGES) # language has to be fully enabled, just checking for CMAKE_CUDA_COMPILER isn't enough - IF(NOT "CUDA" IN_LIST _ENABLED_LANGUAGES) + if(NOT "CUDA" IN_LIST _ENABLED_LANGUAGES) # make sure the user knows that we aren't using CUDA compiler for anything else - MESSAGE(STATUS "CUDA auto-detection of architecture failed with ${CMAKE_CXX_COMPILER}. Enabling CUDA language ONLY to auto-detect architecture...") - INCLUDE(CheckLanguage) - CHECK_LANGUAGE(CUDA) - IF(CMAKE_CUDA_COMPILER) - ENABLE_LANGUAGE(CUDA) - ELSE() - MESSAGE(STATUS "CUDA language could not be enabled") - ENDIF() - ENDIF() + message( + STATUS + "CUDA auto-detection of architecture failed with ${CMAKE_CXX_COMPILER}. Enabling CUDA language ONLY to auto-detect architecture..." + ) + include(CheckLanguage) + check_language(CUDA) + if(CMAKE_CUDA_COMPILER) + enable_language(CUDA) + else() + message(STATUS "CUDA language could not be enabled") + endif() + endif() # if CUDA was enabled, this will be defined - IF(CMAKE_CUDA_COMPILER) + if(CMAKE_CUDA_COMPILER) # copy our test to .cu so cmake compiles as CUDA - CONFIGURE_FILE( + configure_file( ${CMAKE_CURRENT_SOURCE_DIR}/cmake/compile_tests/cuda_compute_capability.cc - ${CMAKE_CURRENT_BINARY_DIR}/compile_tests/cuda_compute_capability.cu - COPYONLY + ${CMAKE_CURRENT_BINARY_DIR}/compile_tests/cuda_compute_capability.cu COPYONLY ) # run test again - TRY_RUN( - _RESULT - _COMPILE_RESULT - ${_BINARY_TEST_DIR} - ${CMAKE_CURRENT_BINARY_DIR}/compile_tests/cuda_compute_capability.cu - COMPILE_DEFINITIONS -DSM_ONLY - RUN_OUTPUT_VARIABLE _CUDA_COMPUTE_CAPABILITY) - ENDIF() - ENDIF() - - LIST(FIND KOKKOS_CUDA_ARCH_FLAGS sm_${_CUDA_COMPUTE_CAPABILITY} FLAG_INDEX) - IF(_COMPILE_RESULT AND _RESULT EQUAL 0 AND NOT FLAG_INDEX EQUAL -1) - MESSAGE(STATUS "Detected CUDA Compute Capability ${_CUDA_COMPUTE_CAPABILITY}") - LIST(GET KOKKOS_CUDA_ARCH_LIST ${FLAG_INDEX} ARCHITECTURE) - KOKKOS_SET_OPTION(ARCH_${ARCHITECTURE} ON) - CHECK_CUDA_ARCH(${ARCHITECTURE} sm_${_CUDA_COMPUTE_CAPABILITY}) - LIST(APPEND KOKKOS_ENABLED_ARCH_LIST ${ARCHITECTURE}) - ELSE() - MESSAGE(SEND_ERROR "CUDA enabled but no NVIDIA GPU architecture currently enabled and auto-detection failed. " - "Please give one -DKokkos_ARCH_{..}=ON' to enable an NVIDIA GPU architecture.\n" - "You can yourself try to compile ${CMAKE_CURRENT_SOURCE_DIR}/cmake/compile_tests/cuda_compute_capability.cc and run the executable. " - "If you are cross-compiling, you should try to do this on a compute node.") - ENDIF() -ENDIF() + try_run(_RESULT _COMPILE_RESULT ${_BINARY_TEST_DIR} + ${CMAKE_CURRENT_BINARY_DIR}/compile_tests/cuda_compute_capability.cu COMPILE_DEFINITIONS -DSM_ONLY + RUN_OUTPUT_VARIABLE _CUDA_COMPUTE_CAPABILITY + ) + endif() + endif() + + list(FIND KOKKOS_CUDA_ARCH_FLAGS sm_${_CUDA_COMPUTE_CAPABILITY} FLAG_INDEX) + if(_COMPILE_RESULT AND _RESULT EQUAL 0 AND NOT FLAG_INDEX EQUAL -1) + message(STATUS "Detected CUDA Compute Capability ${_CUDA_COMPUTE_CAPABILITY}") + list(GET KOKKOS_CUDA_ARCH_LIST ${FLAG_INDEX} ARCHITECTURE) + kokkos_set_option(ARCH_${ARCHITECTURE} ON) + check_cuda_arch(${ARCHITECTURE} sm_${_CUDA_COMPUTE_CAPABILITY}) + list(APPEND KOKKOS_ENABLED_ARCH_LIST ${ARCHITECTURE}) + else() + message( + SEND_ERROR + "CUDA enabled but no NVIDIA GPU architecture currently enabled and auto-detection failed. " + "Please give one -DKokkos_ARCH_{..}=ON' to enable an NVIDIA GPU architecture.\n" + "You can yourself try to compile ${CMAKE_CURRENT_SOURCE_DIR}/cmake/compile_tests/cuda_compute_capability.cc and run the executable. " + "If you are cross-compiling, you should try to do this on a compute node." + ) + endif() +endif() #Regardless of version, make sure we define the general architecture name -IF (KOKKOS_ARCH_KEPLER30 OR KOKKOS_ARCH_KEPLER32 OR KOKKOS_ARCH_KEPLER35 OR KOKKOS_ARCH_KEPLER37) - SET(KOKKOS_ARCH_KEPLER ON) -ENDIF() +if(KOKKOS_ARCH_KEPLER30 + OR KOKKOS_ARCH_KEPLER32 + OR KOKKOS_ARCH_KEPLER35 + OR KOKKOS_ARCH_KEPLER37 +) + set(KOKKOS_ARCH_KEPLER ON) +endif() #Regardless of version, make sure we define the general architecture name -IF (KOKKOS_ARCH_MAXWELL50 OR KOKKOS_ARCH_MAXWELL52 OR KOKKOS_ARCH_MAXWELL53) - SET(KOKKOS_ARCH_MAXWELL ON) -ENDIF() +if(KOKKOS_ARCH_MAXWELL50 OR KOKKOS_ARCH_MAXWELL52 OR KOKKOS_ARCH_MAXWELL53) + set(KOKKOS_ARCH_MAXWELL ON) +endif() #Regardless of version, make sure we define the general architecture name -IF (KOKKOS_ARCH_PASCAL60 OR KOKKOS_ARCH_PASCAL61) - SET(KOKKOS_ARCH_PASCAL ON) -ENDIF() +if(KOKKOS_ARCH_PASCAL60 OR KOKKOS_ARCH_PASCAL61) + set(KOKKOS_ARCH_PASCAL ON) +endif() #Regardless of version, make sure we define the general architecture name -IF (KOKKOS_ARCH_VOLTA70 OR KOKKOS_ARCH_VOLTA72) - SET(KOKKOS_ARCH_VOLTA ON) -ENDIF() +if(KOKKOS_ARCH_VOLTA70 OR KOKKOS_ARCH_VOLTA72) + set(KOKKOS_ARCH_VOLTA ON) +endif() -IF (KOKKOS_ARCH_AMPERE80 OR KOKKOS_ARCH_AMPERE86) - SET(KOKKOS_ARCH_AMPERE ON) -ENDIF() +if(KOKKOS_ARCH_AMPERE80 OR KOKKOS_ARCH_AMPERE86) + set(KOKKOS_ARCH_AMPERE ON) +endif() -IF (KOKKOS_ARCH_HOPPER90) - SET(KOKKOS_ARCH_HOPPER ON) -ENDIF() +if(KOKKOS_ARCH_HOPPER90) + set(KOKKOS_ARCH_HOPPER ON) +endif() + +function(CHECK_AMD_APU ARCH) + set(BINARY_TEST_DIR ${CMAKE_CURRENT_BINARY_DIR}/cmake/compile_tests/AmdApuWorkdir) + file(REMOVE_RECURSE ${BINARY_TEST_DIR}) + file(MAKE_DIRECTORY ${BINARY_TEST_DIR}) + + try_run(RESULT COMPILE_RESULT ${BINARY_TEST_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/cmake/compile_tests/amd_apu.cc + RUN_OUTPUT_VARIABLE AMD_APU + ) + + if(NOT COMPILE_RESULT OR NOT RESULT EQUAL 0) + message(SEND_ERROR "Autodetection of AMD APU failed." + "Please manually specify one AMD GPU architecture via -DKokkos_ARCH_{..}=ON'." + ) + endif() + + if(AMD_APU) + set(${ARCH} AMD_GFX942_APU PARENT_SCOPE) + endif() +endfunction() #HIP detection of gpu arch -IF(KOKKOS_ENABLE_HIP AND NOT AMDGPU_ARCH_ALREADY_SPECIFIED AND NOT KOKKOS_IMPL_AMDGPU_FLAGS) - FIND_PROGRAM(ROCM_ENUMERATOR rocm_agent_enumerator) - IF(NOT ROCM_ENUMERATOR) - MESSAGE(FATAL_ERROR "Autodetection of AMD GPU architecture not possible as " - "rocm_agent_enumerator could not be found. " - "Please specify an arch manually via -DKokkos_ARCH_{..}=ON") - ELSE() - EXECUTE_PROCESS(COMMAND ${ROCM_ENUMERATOR} OUTPUT_VARIABLE GPU_ARCHS) - STRING(LENGTH "${GPU_ARCHS}" len_str) +if(KOKKOS_ENABLE_HIP AND NOT AMDGPU_ARCH_ALREADY_SPECIFIED AND NOT KOKKOS_IMPL_AMDGPU_FLAGS) + find_program(ROCM_ENUMERATOR rocm_agent_enumerator) + if(NOT ROCM_ENUMERATOR) + message( + FATAL_ERROR "Autodetection of AMD GPU architecture not possible as " "rocm_agent_enumerator could not be found. " + "Please specify an arch manually via -DKokkos_ARCH_{..}=ON" + ) + else() + execute_process(COMMAND ${ROCM_ENUMERATOR} OUTPUT_VARIABLE GPU_ARCHS) + string(LENGTH "${GPU_ARCHS}" len_str) # enumerator always output gfx000 as the first line - IF(${len_str} LESS 8) - MESSAGE(SEND_ERROR "HIP enabled but no AMD GPU architecture could be automatically detected. " - "Please manually specify one AMD GPU architecture via -DKokkos_ARCH_{..}=ON'.") - # check for known gpu archs, otherwise error out - ELSE() - SET(AMD_ARCH_DETECTED "") - FOREACH(ARCH IN LISTS SUPPORTED_AMD_ARCHS) - LIST(FIND SUPPORTED_AMD_ARCHS ${ARCH} LIST_INDEX) - LIST(GET CORRESPONDING_AMD_FLAGS ${LIST_INDEX} FLAG) - STRING(REGEX MATCH "(${FLAG})" DETECTED_GPU_ARCH ${GPU_ARCHS}) - IF("${DETECTED_GPU_ARCH}" STREQUAL "${FLAG}") - SET_AND_CHECK_AMD_ARCH(${ARCH} ${FLAG}) - SET(AMD_ARCH_DETECTED ${ARCH}) - BREAK() - ENDIF() - ENDFOREACH() - IF("${AMD_ARCH_DETECTED}" STREQUAL "") - MESSAGE(FATAL_ERROR "HIP enabled but no automatically detected AMD GPU architecture " - "is supported. " - "Please manually specify one AMD GPU architecture via -DKokkos_ARCH_{..}=ON'.") - ENDIF() - ENDIF() - ENDIF() -ENDIF() - -FOREACH(ARCH IN LISTS SUPPORTED_AMD_ARCHS) - IF (KOKKOS_ARCH_${ARCH}) - STRING(REGEX MATCH "90A" IS_90A ${ARCH}) - IF(IS_90A) - SET(KOKKOS_ARCH_AMD_GFX90A ON) - SET(KOKKOS_ARCH_VEGA90A ON) - BREAK() - ENDIF() - STRING(REGEX MATCH "908" IS_908 ${ARCH}) - IF(IS_908) - SET(KOKKOS_ARCH_AMD_GFX908 ON) - SET(KOKKOS_ARCH_VEGA908 ON) - BREAK() - ENDIF() - STRING(REGEX MATCH "906" IS_906 ${ARCH}) - IF(IS_906) - SET(KOKKOS_ARCH_AMD_GFX906 ON) - SET(KOKKOS_ARCH_VEGA906 ON) - BREAK() - ENDIF() - STRING(REGEX MATCH "1100" IS_1100 ${ARCH}) - IF(IS_1100) - SET(KOKKOS_ARCH_AMD_GFX1100 ON) - SET(KOKKOS_ARCH_NAVI1100 ON) - BREAK() - ENDIF() - STRING(REGEX MATCH "1030" IS_1030 ${ARCH}) - IF(IS_1030) - SET(KOKKOS_ARCH_AMD_GFX1030 ON) - SET(KOKKOS_ARCH_NAVI1030 ON) - BREAK() - ENDIF() - ENDIF() -ENDFOREACH() + if(${len_str} LESS 8) + message(SEND_ERROR "HIP enabled but no AMD GPU architecture could be automatically detected. " + "Please manually specify one AMD GPU architecture via -DKokkos_ARCH_{..}=ON'." + ) + # check for known gpu archs, otherwise error out + else() + set(AMD_ARCH_DETECTED "") + foreach(ARCH IN LISTS SUPPORTED_AMD_ARCHS) + list(FIND SUPPORTED_AMD_ARCHS ${ARCH} LIST_INDEX) + list(GET CORRESPONDING_AMD_FLAGS ${LIST_INDEX} FLAG) + string(REGEX MATCH "(${FLAG})" DETECTED_GPU_ARCH ${GPU_ARCHS}) + if("${DETECTED_GPU_ARCH}" STREQUAL "${FLAG}") + # If we detected gfx942, we need to discriminate between APU and discrete GPU + if(FLAG STREQUAL "gfx942") + check_amd_apu(ARCH) + endif() + set_and_check_amd_arch(${ARCH} ${FLAG}) + set(AMD_ARCH_DETECTED ${ARCH}) + break() + endif() + endforeach() + if("${AMD_ARCH_DETECTED}" STREQUAL "") + message(FATAL_ERROR "HIP enabled but no automatically detected AMD GPU architecture " "is supported. " + "Please manually specify one AMD GPU architecture via -DKokkos_ARCH_{..}=ON'." + ) + endif() + endif() + endif() +endif() + +foreach(ARCH IN LISTS SUPPORTED_AMD_ARCHS) + if(KOKKOS_ARCH_${ARCH}) + string(REGEX MATCH "90A" IS_90A ${ARCH}) + if(IS_90A) + set(KOKKOS_ARCH_AMD_GFX90A ON) + set(KOKKOS_ARCH_VEGA90A ON) + break() + endif() + string(REGEX MATCH "908" IS_908 ${ARCH}) + if(IS_908) + set(KOKKOS_ARCH_AMD_GFX908 ON) + set(KOKKOS_ARCH_VEGA908 ON) + break() + endif() + string(REGEX MATCH "906" IS_906 ${ARCH}) + if(IS_906) + set(KOKKOS_ARCH_AMD_GFX906 ON) + set(KOKKOS_ARCH_VEGA906 ON) + break() + endif() + string(REGEX MATCH "1100" IS_1100 ${ARCH}) + if(IS_1100) + set(KOKKOS_ARCH_AMD_GFX1100 ON) + set(KOKKOS_ARCH_NAVI1100 ON) + break() + endif() + string(REGEX MATCH "1030" IS_1030 ${ARCH}) + if(IS_1030) + set(KOKKOS_ARCH_AMD_GFX1030 ON) + set(KOKKOS_ARCH_NAVI1030 ON) + break() + endif() + endif() +endforeach() #Regardless of version, make sure we define the general architecture name -FOREACH(ARCH IN LISTS SUPPORTED_AMD_ARCHS) - IF (KOKKOS_ARCH_${ARCH}) - SET(KOKKOS_ARCH_AMD_GPU ON) - STRING(REGEX MATCH "(VEGA)" IS_VEGA ${ARCH}) - IF(IS_VEGA) - SET(KOKKOS_ARCH_VEGA ON) - BREAK() - ENDIF() - STRING(REGEX MATCH "(NAVI)" IS_NAVI ${ARCH}) - IF(IS_NAVI) - SET(KOKKOS_ARCH_NAVI ON) - BREAK() - ENDIF() - ENDIF() -ENDFOREACH() +foreach(ARCH IN LISTS SUPPORTED_AMD_ARCHS) + if(KOKKOS_ARCH_${ARCH}) + list(FIND SUPPORTED_AMD_ARCHS ${ARCH} LIST_INDEX) + list(GET CORRESPONDING_AMD_FLAGS ${LIST_INDEX} FLAG) + set(KOKKOS_ARCH_AMD_GPU "${FLAG}") + string(REGEX MATCH "(VEGA)" IS_VEGA ${ARCH}) + if(IS_VEGA) + set(KOKKOS_ARCH_VEGA ON) + break() + endif() + string(REGEX MATCH "(NAVI)" IS_NAVI ${ARCH}) + if(IS_NAVI) + set(KOKKOS_ARCH_NAVI ON) + break() + endif() + endif() +endforeach() #CMake verbose is kind of pointless #Let's just always print things -MESSAGE(STATUS "Built-in Execution Spaces:") - -FOREACH (_BACKEND Cuda OpenMPTarget HIP SYCL OpenACC) - STRING(TOUPPER ${_BACKEND} UC_BACKEND) - IF(KOKKOS_ENABLE_${UC_BACKEND}) - IF(_DEVICE_PARALLEL) - MESSAGE(FATAL_ERROR "Multiple device parallel execution spaces are not allowed! " - "Trying to enable execution space ${_BACKEND}, " - "but execution space ${_DEVICE_PARALLEL} is already enabled. " - "Remove the CMakeCache.txt file and re-configure.") - ENDIF() - IF (${_BACKEND} STREQUAL "Cuda") - IF(KOKKOS_ENABLE_CUDA_UVM) - MESSAGE(DEPRECATION "Setting Kokkos_ENABLE_CUDA_UVM is deprecated - use the portable Kokkos::SharedSpace as an explicit memory space in your code instead") - IF(KOKKOS_ENABLE_DEPRECATED_CODE_4) - SET(_DEFAULT_DEVICE_MEMSPACE "Kokkos::${_BACKEND}UVMSpace") - ELSE() - MESSAGE(FATAL_ERROR "Kokkos_ENABLE_DEPRECATED_CODE_4 must be set to use Kokkos_ENABLE_CUDA_UVM") - ENDIF() - ELSE() - SET(_DEFAULT_DEVICE_MEMSPACE "Kokkos::${_BACKEND}Space") - ENDIF() - SET(_DEVICE_PARALLEL "Kokkos::${_BACKEND}") - ELSEIF(${_BACKEND} STREQUAL "HIP") - SET(_DEFAULT_DEVICE_MEMSPACE "Kokkos::${_BACKEND}Space") - SET(_DEVICE_PARALLEL "Kokkos::${_BACKEND}") - ELSE() - SET(_DEFAULT_DEVICE_MEMSPACE "Kokkos::Experimental::${_BACKEND}Space") - SET(_DEVICE_PARALLEL "Kokkos::Experimental::${_BACKEND}") - ENDIF() - ENDIF() -ENDFOREACH() -IF(NOT _DEVICE_PARALLEL) - SET(_DEVICE_PARALLEL "NoTypeDefined") - SET(_DEFAULT_DEVICE_MEMSPACE "NoTypeDefined") -ENDIF() -MESSAGE(STATUS " Device Parallel: ${_DEVICE_PARALLEL}") - -FOREACH (_BACKEND OpenMP Threads HPX) - STRING(TOUPPER ${_BACKEND} UC_BACKEND) - IF(KOKKOS_ENABLE_${UC_BACKEND}) - IF(_HOST_PARALLEL) - MESSAGE(FATAL_ERROR "Multiple host parallel execution spaces are not allowed! " - "Trying to enable execution space ${_BACKEND}, " - "but execution space ${_HOST_PARALLEL} is already enabled. " - "Remove the CMakeCache.txt file and re-configure.") - ENDIF() - IF (${_BACKEND} STREQUAL "HPX") - SET(_HOST_PARALLEL "Kokkos::Experimental::${_BACKEND}") - ELSE() - SET(_HOST_PARALLEL "Kokkos::${_BACKEND}") - ENDIF() - ENDIF() -ENDFOREACH() - -IF(NOT _HOST_PARALLEL AND NOT KOKKOS_ENABLE_SERIAL) - MESSAGE(FATAL_ERROR "At least one host execution space must be enabled, " - "but no host parallel execution space was requested " - "and Kokkos_ENABLE_SERIAL=OFF.") -ENDIF() - -IF(_HOST_PARALLEL) -MESSAGE(STATUS " Host Parallel: ${_HOST_PARALLEL}") -ELSE() - SET(_HOST_PARALLEL "NoTypeDefined") - MESSAGE(STATUS " Host Parallel: NoTypeDefined") -ENDIF() - -IF(KOKKOS_ENABLE_SERIAL) - MESSAGE(STATUS " Host Serial: SERIAL") -ELSE() - MESSAGE(STATUS " Host Serial: NONE") -ENDIF() - -MESSAGE(STATUS "") -MESSAGE(STATUS "Architectures:") -FOREACH(Arch ${KOKKOS_ENABLED_ARCH_LIST}) - MESSAGE(STATUS " ${Arch}") -ENDFOREACH() - - -IF(KOKKOS_ENABLE_ATOMICS_BYPASS) - IF(NOT _HOST_PARALLEL STREQUAL "NoTypeDefined" OR NOT _DEVICE_PARALLEL STREQUAL "NoTypeDefined") - MESSAGE(FATAL_ERROR "Not allowed to disable atomics (via -DKokkos_ENABLE_AROMICS_BYPASS=ON) if neither a host parallel nor a device backend is enabled!") - ENDIF() - IF(NOT KOKKOS_ENABLE_SERIAL) - MESSAGE(FATAL_ERROR "Implementation bug") # safeguard - ENDIF() - MESSAGE(STATUS "Atomics: **DISABLED**") -ENDIF() +message(STATUS "Built-in Execution Spaces:") + +foreach(_BACKEND Cuda OpenMPTarget HIP SYCL OpenACC) + string(TOUPPER ${_BACKEND} UC_BACKEND) + if(KOKKOS_ENABLE_${UC_BACKEND}) + if(_DEVICE_PARALLEL) + message( + FATAL_ERROR + "Multiple device parallel execution spaces are not allowed! " + "Trying to enable execution space ${_BACKEND}, " + "but execution space ${_DEVICE_PARALLEL} is already enabled. " + "Remove the CMakeCache.txt file and re-configure." + ) + endif() + if(${_BACKEND} STREQUAL "Cuda") + if(KOKKOS_ENABLE_CUDA_UVM) + message( + DEPRECATION + "Setting Kokkos_ENABLE_CUDA_UVM is deprecated - use the portable Kokkos::SharedSpace as an explicit memory space in your code instead" + ) + if(NOT KOKKOS_ENABLE_DEPRECATED_CODE_4) + message(FATAL_ERROR "Kokkos_ENABLE_DEPRECATED_CODE_4 must be set to use Kokkos_ENABLE_CUDA_UVM") + endif() + endif() + set(_DEVICE_PARALLEL "Kokkos::${_BACKEND}") + elseif(${_BACKEND} STREQUAL "HIP" OR ${_BACKEND} STREQUAL "SYCL") + set(_DEVICE_PARALLEL "Kokkos::${_BACKEND}") + else() + set(_DEVICE_PARALLEL "Kokkos::Experimental::${_BACKEND}") + endif() + endif() +endforeach() +if(NOT _DEVICE_PARALLEL) + set(_DEVICE_PARALLEL "NoTypeDefined") +endif() +message(STATUS " Device Parallel: ${_DEVICE_PARALLEL}") + +foreach(_BACKEND OpenMP Threads HPX) + string(TOUPPER ${_BACKEND} UC_BACKEND) + if(KOKKOS_ENABLE_${UC_BACKEND}) + if(_HOST_PARALLEL) + message( + FATAL_ERROR + "Multiple host parallel execution spaces are not allowed! " "Trying to enable execution space ${_BACKEND}, " + "but execution space ${_HOST_PARALLEL} is already enabled. " + "Remove the CMakeCache.txt file and re-configure." + ) + endif() + if(${_BACKEND} STREQUAL "HPX") + set(_HOST_PARALLEL "Kokkos::Experimental::${_BACKEND}") + else() + set(_HOST_PARALLEL "Kokkos::${_BACKEND}") + endif() + endif() +endforeach() + +if(NOT _HOST_PARALLEL AND NOT KOKKOS_ENABLE_SERIAL) + message(FATAL_ERROR "At least one host execution space must be enabled, " + "but no host parallel execution space was requested " "and Kokkos_ENABLE_SERIAL=OFF." + ) +endif() + +if(_HOST_PARALLEL) + message(STATUS " Host Parallel: ${_HOST_PARALLEL}") +else() + set(_HOST_PARALLEL "NoTypeDefined") + message(STATUS " Host Parallel: NoTypeDefined") +endif() + +if(KOKKOS_ENABLE_SERIAL) + message(STATUS " Host Serial: SERIAL") +else() + message(STATUS " Host Serial: NONE") +endif() + +message(STATUS "") +message(STATUS "Architectures:") +foreach(Arch ${KOKKOS_ENABLED_ARCH_LIST}) + message(STATUS " ${Arch}") +endforeach() + +if(KOKKOS_ENABLE_ATOMICS_BYPASS) + if(NOT _HOST_PARALLEL STREQUAL "NoTypeDefined" OR NOT _DEVICE_PARALLEL STREQUAL "NoTypeDefined") + message( + FATAL_ERROR + "Disabling atomics (via -DKokkos_ENABLE_ATOMICS_BYPASS=ON) is not allowed if a host parallel or a device backend is enabled!" + ) + endif() + if(NOT KOKKOS_ENABLE_SERIAL) + message(FATAL_ERROR "Implementation bug") # safeguard + endif() + message(STATUS "Atomics: **DISABLED**") +endif() diff --git a/packages/kokkos/cmake/kokkos_check_env.cmake b/packages/kokkos/cmake/kokkos_check_env.cmake index a455a403b9d5..f1a309ff8579 100644 --- a/packages/kokkos/cmake/kokkos_check_env.cmake +++ b/packages/kokkos/cmake/kokkos_check_env.cmake @@ -1,12 +1,15 @@ -SET(CRAYPE_VERSION $ENV{CRAYPE_VERSION}) -IF (CRAYPE_VERSION) - SET(KOKKOS_IS_CRAYPE TRUE) - SET(CRAYPE_LINK_TYPE $ENV{CRAYPE_LINK_TYPE}) - IF (CRAYPE_LINK_TYPE) - IF (NOT CRAYPE_LINK_TYPE STREQUAL "dynamic") - MESSAGE(WARNING "CRAYPE_LINK_TYPE is set to ${CRAYPE_LINK_TYPE}. Linking is likely to fail unless this is set to 'dynamic'") - ENDIF() - ELSE() - MESSAGE(WARNING "CRAYPE_LINK_TYPE is not set. Linking is likely to fail unless this is set to 'dynamic'") - ENDIF() -ENDIF() +set(CRAYPE_VERSION $ENV{CRAYPE_VERSION}) +if(CRAYPE_VERSION) + set(KOKKOS_IS_CRAYPE TRUE) + set(CRAYPE_LINK_TYPE $ENV{CRAYPE_LINK_TYPE}) + if(CRAYPE_LINK_TYPE) + if(NOT CRAYPE_LINK_TYPE STREQUAL "dynamic") + message( + WARNING + "CRAYPE_LINK_TYPE is set to ${CRAYPE_LINK_TYPE}. Linking is likely to fail unless this is set to 'dynamic'" + ) + endif() + else() + message(WARNING "CRAYPE_LINK_TYPE is not set. Linking is likely to fail unless this is set to 'dynamic'") + endif() +endif() diff --git a/packages/kokkos/cmake/kokkos_compiler_id.cmake b/packages/kokkos/cmake/kokkos_compiler_id.cmake index e8bfadb64ebe..010ed33ede89 100644 --- a/packages/kokkos/cmake/kokkos_compiler_id.cmake +++ b/packages/kokkos/cmake/kokkos_compiler_id.cmake @@ -1,262 +1,273 @@ -KOKKOS_CFG_DEPENDS(COMPILER_ID NONE) +kokkos_cfg_depends(COMPILER_ID NONE) -SET(KOKKOS_CXX_COMPILER ${CMAKE_CXX_COMPILER}) -SET(KOKKOS_CXX_COMPILER_ID ${CMAKE_CXX_COMPILER_ID}) -SET(KOKKOS_CXX_COMPILER_VERSION ${CMAKE_CXX_COMPILER_VERSION}) +set(KOKKOS_CXX_COMPILER ${CMAKE_CXX_COMPILER}) +set(KOKKOS_CXX_COMPILER_ID ${CMAKE_CXX_COMPILER_ID}) +set(KOKKOS_CXX_COMPILER_VERSION ${CMAKE_CXX_COMPILER_VERSION}) -MACRO(kokkos_internal_have_compiler_nvcc) +macro(kokkos_internal_have_compiler_nvcc) # Check if the compiler is nvcc (which really means nvcc_wrapper). - EXECUTE_PROCESS(COMMAND ${ARGN} --version - OUTPUT_VARIABLE INTERNAL_COMPILER_VERSION - OUTPUT_STRIP_TRAILING_WHITESPACE) - STRING(REPLACE "\n" " - " INTERNAL_COMPILER_VERSION_ONE_LINE ${INTERNAL_COMPILER_VERSION} ) - STRING(FIND ${INTERNAL_COMPILER_VERSION_ONE_LINE} "nvcc" INTERNAL_COMPILER_VERSION_CONTAINS_NVCC) - STRING(REGEX REPLACE "^ +" "" INTERNAL_HAVE_COMPILER_NVCC "${INTERNAL_HAVE_COMPILER_NVCC}") - IF(${INTERNAL_COMPILER_VERSION_CONTAINS_NVCC} GREATER -1) - SET(INTERNAL_HAVE_COMPILER_NVCC true) - ELSE() - SET(INTERNAL_HAVE_COMPILER_NVCC false) - ENDIF() -ENDMACRO() + execute_process(COMMAND ${ARGN} --version OUTPUT_VARIABLE INTERNAL_COMPILER_VERSION OUTPUT_STRIP_TRAILING_WHITESPACE) + string(REPLACE "\n" " - " INTERNAL_COMPILER_VERSION_ONE_LINE ${INTERNAL_COMPILER_VERSION}) + string(FIND ${INTERNAL_COMPILER_VERSION_ONE_LINE} "nvcc" INTERNAL_COMPILER_VERSION_CONTAINS_NVCC) + string(REGEX REPLACE "^ +" "" INTERNAL_HAVE_COMPILER_NVCC "${INTERNAL_HAVE_COMPILER_NVCC}") + if(${INTERNAL_COMPILER_VERSION_CONTAINS_NVCC} GREATER -1) + set(INTERNAL_HAVE_COMPILER_NVCC true) + else() + set(INTERNAL_HAVE_COMPILER_NVCC false) + endif() +endmacro() -IF(Kokkos_ENABLE_CUDA) +if(Kokkos_ENABLE_CUDA) # kokkos_enable_options is not yet called so use lower case here - IF(Kokkos_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) + if(Kokkos_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) kokkos_internal_have_compiler_nvcc(${CMAKE_CUDA_COMPILER}) - ELSE() + else() # find kokkos_launch_compiler - FIND_PROGRAM(Kokkos_COMPILE_LAUNCHER - NAMES kokkos_launch_compiler - HINTS ${PROJECT_SOURCE_DIR} - PATHS ${PROJECT_SOURCE_DIR} - PATH_SUFFIXES bin) + find_program( + Kokkos_COMPILE_LAUNCHER + NAMES kokkos_launch_compiler + HINTS ${PROJECT_SOURCE_DIR} + PATHS ${PROJECT_SOURCE_DIR} + PATH_SUFFIXES bin + ) - FIND_PROGRAM(Kokkos_NVCC_WRAPPER - NAMES nvcc_wrapper - HINTS ${PROJECT_SOURCE_DIR} - PATHS ${PROJECT_SOURCE_DIR} - PATH_SUFFIXES bin) + find_program( + Kokkos_NVCC_WRAPPER + NAMES nvcc_wrapper + HINTS ${PROJECT_SOURCE_DIR} + PATHS ${PROJECT_SOURCE_DIR} + PATH_SUFFIXES bin + ) # Check if compiler was set to nvcc_wrapper kokkos_internal_have_compiler_nvcc(${CMAKE_CXX_COMPILER}) # If launcher was found and nvcc_wrapper was not specified as # compiler and `CMAKE_CXX_COMPILIER_LAUNCHER` is not set, set to use launcher. # Will ensure CMAKE_CXX_COMPILER is replaced by nvcc_wrapper - IF(Kokkos_COMPILE_LAUNCHER AND NOT INTERNAL_HAVE_COMPILER_NVCC AND NOT KOKKOS_CXX_COMPILER_ID STREQUAL Clang) - IF(CMAKE_CXX_COMPILER_LAUNCHER) - MESSAGE(FATAL_ERROR "Cannot use CMAKE_CXX_COMPILER_LAUNCHER if the CMAKE_CXX_COMPILER is not able to compile CUDA code, i.e. nvcc_wrapper or clang++!") - ENDIF() + if(Kokkos_COMPILE_LAUNCHER AND NOT INTERNAL_HAVE_COMPILER_NVCC AND NOT KOKKOS_CXX_COMPILER_ID STREQUAL Clang) + if(CMAKE_CXX_COMPILER_LAUNCHER) + message( + FATAL_ERROR + "Cannot use CMAKE_CXX_COMPILER_LAUNCHER if the CMAKE_CXX_COMPILER is not able to compile CUDA code, i.e. nvcc_wrapper or clang++!" + ) + endif() # the first argument to launcher is always the C++ compiler defined by cmake # if the second argument matches the C++ compiler, it forwards the rest of the # args to nvcc_wrapper kokkos_internal_have_compiler_nvcc( - ${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_COMPILER} -DKOKKOS_DEPENDENCE) - SET(INTERNAL_USE_COMPILER_LAUNCHER true) - ENDIF() - ENDIF() -ENDIF() + ${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_COMPILER} + -DKOKKOS_DEPENDENCE + ) + set(INTERNAL_USE_COMPILER_LAUNCHER true) + endif() + endif() +endif() -IF(INTERNAL_HAVE_COMPILER_NVCC) +if(INTERNAL_HAVE_COMPILER_NVCC) # Save the host compiler id before overwriting it. - SET(KOKKOS_CXX_HOST_COMPILER_ID ${KOKKOS_CXX_COMPILER_ID}) + set(KOKKOS_CXX_HOST_COMPILER_ID ${KOKKOS_CXX_COMPILER_ID}) # SET the compiler id to nvcc. We use the value used by CMake 3.8. - SET(KOKKOS_CXX_COMPILER_ID NVIDIA CACHE STRING INTERNAL FORCE) + set(KOKKOS_CXX_COMPILER_ID NVIDIA CACHE STRING INTERNAL FORCE) - STRING(REGEX MATCH "V[0-9]+\\.[0-9]+\\.[0-9]+" - TEMP_CXX_COMPILER_VERSION ${INTERNAL_COMPILER_VERSION_ONE_LINE}) - STRING(SUBSTRING ${TEMP_CXX_COMPILER_VERSION} 1 -1 TEMP_CXX_COMPILER_VERSION) - SET(KOKKOS_CXX_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION} CACHE STRING INTERNAL FORCE) - MESSAGE(STATUS "Compiler Version: ${KOKKOS_CXX_COMPILER_VERSION}") - IF(INTERNAL_USE_COMPILER_LAUNCHER) - MESSAGE(STATUS "kokkos_launch_compiler (${Kokkos_COMPILE_LAUNCHER}) is enabled...") + string(REGEX MATCH "V[0-9]+\\.[0-9]+\\.[0-9]+" TEMP_CXX_COMPILER_VERSION ${INTERNAL_COMPILER_VERSION_ONE_LINE}) + string(SUBSTRING ${TEMP_CXX_COMPILER_VERSION} 1 -1 TEMP_CXX_COMPILER_VERSION) + set(KOKKOS_CXX_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION} CACHE STRING INTERNAL FORCE) + message(STATUS "Compiler Version: ${KOKKOS_CXX_COMPILER_VERSION}") + if(INTERNAL_USE_COMPILER_LAUNCHER) + message(STATUS "kokkos_launch_compiler (${Kokkos_COMPILE_LAUNCHER}) is enabled...") kokkos_compilation(GLOBAL) - ENDIF() -ENDIF() + endif() +endif() -IF(Kokkos_ENABLE_HIP) +if(Kokkos_ENABLE_HIP) # get HIP version - EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} --version - OUTPUT_VARIABLE INTERNAL_COMPILER_VERSION - OUTPUT_STRIP_TRAILING_WHITESPACE) + execute_process( + COMMAND ${CMAKE_CXX_COMPILER} --version OUTPUT_VARIABLE INTERNAL_COMPILER_VERSION OUTPUT_STRIP_TRAILING_WHITESPACE + ) - STRING(REPLACE "\n" " - " INTERNAL_COMPILER_VERSION_ONE_LINE ${INTERNAL_COMPILER_VERSION} ) + string(REPLACE "\n" " - " INTERNAL_COMPILER_VERSION_ONE_LINE ${INTERNAL_COMPILER_VERSION}) - STRING(FIND ${INTERNAL_COMPILER_VERSION_ONE_LINE} "HIP version" INTERNAL_COMPILER_VERSION_CONTAINS_HIP) - IF(INTERNAL_COMPILER_VERSION_CONTAINS_HIP GREATER -1) - SET(KOKKOS_CXX_COMPILER_ID HIPCC CACHE STRING INTERNAL FORCE) - ENDIF() + string(FIND ${INTERNAL_COMPILER_VERSION_ONE_LINE} "HIP version" INTERNAL_COMPILER_VERSION_CONTAINS_HIP) + if(INTERNAL_COMPILER_VERSION_CONTAINS_HIP GREATER -1) + set(KOKKOS_CXX_COMPILER_ID HIPCC CACHE STRING INTERNAL FORCE) + endif() - STRING(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" - TEMP_CXX_COMPILER_VERSION ${INTERNAL_COMPILER_VERSION_ONE_LINE}) - SET(KOKKOS_CXX_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION} CACHE STRING INTERNAL FORCE) - MESSAGE(STATUS "Compiler Version: ${KOKKOS_CXX_COMPILER_VERSION}") -ENDIF() + string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" TEMP_CXX_COMPILER_VERSION ${INTERNAL_COMPILER_VERSION_ONE_LINE}) + set(KOKKOS_CXX_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION} CACHE STRING INTERNAL FORCE) + message(STATUS "Compiler Version: ${KOKKOS_CXX_COMPILER_VERSION}") +endif() -IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang) +if(KOKKOS_CXX_COMPILER_ID STREQUAL Clang) # The Cray compiler reports as Clang to most versions of CMake - EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} --version - COMMAND grep -c Cray - OUTPUT_VARIABLE INTERNAL_HAVE_CRAY_COMPILER - OUTPUT_STRIP_TRAILING_WHITESPACE) - IF (INTERNAL_HAVE_CRAY_COMPILER) #not actually Clang - SET(KOKKOS_CLANG_IS_CRAY TRUE) - ENDIF() + execute_process( + COMMAND ${CMAKE_CXX_COMPILER} --version + COMMAND grep -c Cray + OUTPUT_VARIABLE INTERNAL_HAVE_CRAY_COMPILER + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + if(INTERNAL_HAVE_CRAY_COMPILER) #not actually Clang + set(KOKKOS_CLANG_IS_CRAY TRUE) + set(KOKKOS_CXX_COMPILER_ID CrayClang) + endif() # The clang based Intel compiler reports as Clang to most versions of CMake - EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} --version - COMMAND grep -c "DPC++\\|icpx" - OUTPUT_VARIABLE INTERNAL_HAVE_INTEL_COMPILER - OUTPUT_STRIP_TRAILING_WHITESPACE) - IF (INTERNAL_HAVE_INTEL_COMPILER) #not actually Clang - SET(KOKKOS_CLANG_IS_INTEL TRUE) - SET(KOKKOS_CXX_COMPILER_ID IntelLLVM CACHE STRING INTERNAL FORCE) - EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} --version - OUTPUT_VARIABLE INTERNAL_CXX_COMPILER_VERSION - OUTPUT_STRIP_TRAILING_WHITESPACE) - STRING(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" - KOKKOS_CXX_COMPILER_VERSION ${INTERNAL_CXX_COMPILER_VERSION}) - ENDIF() -ENDIF() + execute_process( + COMMAND ${CMAKE_CXX_COMPILER} --version + COMMAND grep -c "DPC++\\|icpx" + OUTPUT_VARIABLE INTERNAL_HAVE_INTEL_COMPILER + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + if(INTERNAL_HAVE_INTEL_COMPILER) #not actually Clang + set(KOKKOS_CLANG_IS_INTEL TRUE) + set(KOKKOS_CXX_COMPILER_ID IntelLLVM CACHE STRING INTERNAL FORCE) + execute_process( + COMMAND ${CMAKE_CXX_COMPILER} --version OUTPUT_VARIABLE INTERNAL_CXX_COMPILER_VERSION + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" KOKKOS_CXX_COMPILER_VERSION ${INTERNAL_CXX_COMPILER_VERSION}) + endif() +endif() -IF(KOKKOS_CXX_COMPILER_ID STREQUAL Cray OR KOKKOS_CLANG_IS_CRAY) +if(KOKKOS_CXX_COMPILER_ID STREQUAL Cray OR KOKKOS_CLANG_IS_CRAY) # SET Cray's compiler version. - EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} --version - OUTPUT_VARIABLE INTERNAL_CXX_COMPILER_VERSION - OUTPUT_STRIP_TRAILING_WHITESPACE) + execute_process( + COMMAND ${CMAKE_CXX_COMPILER} --version OUTPUT_VARIABLE INTERNAL_CXX_COMPILER_VERSION + OUTPUT_STRIP_TRAILING_WHITESPACE + ) - STRING(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" - TEMP_CXX_COMPILER_VERSION ${INTERNAL_CXX_COMPILER_VERSION}) - IF (KOKKOS_CLANG_IS_CRAY) - SET(KOKKOS_CLANG_CRAY_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION}) - ELSE() - SET(KOKKOS_CXX_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION} CACHE STRING INTERNAL FORCE) - ENDIF() -ENDIF() + string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" TEMP_CXX_COMPILER_VERSION ${INTERNAL_CXX_COMPILER_VERSION}) + if(KOKKOS_CLANG_IS_CRAY) + set(KOKKOS_CLANG_CRAY_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION}) + else() + set(KOKKOS_CXX_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION} CACHE STRING INTERNAL FORCE) + endif() +endif() -IF(KOKKOS_CXX_COMPILER_ID STREQUAL Fujitsu) +if(KOKKOS_CXX_COMPILER_ID STREQUAL Fujitsu) # SET Fujitsus compiler version which is not detected by CMake - EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} --version - OUTPUT_VARIABLE INTERNAL_CXX_COMPILER_VERSION - OUTPUT_STRIP_TRAILING_WHITESPACE) + execute_process( + COMMAND ${CMAKE_CXX_COMPILER} --version OUTPUT_VARIABLE INTERNAL_CXX_COMPILER_VERSION + OUTPUT_STRIP_TRAILING_WHITESPACE + ) - STRING(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" - TEMP_CXX_COMPILER_VERSION ${INTERNAL_CXX_COMPILER_VERSION}) - SET(KOKKOS_CXX_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION} CACHE STRING INTERNAL FORCE) -ENDIF() + string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" TEMP_CXX_COMPILER_VERSION ${INTERNAL_CXX_COMPILER_VERSION}) + set(KOKKOS_CXX_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION} CACHE STRING INTERNAL FORCE) +endif() # Enforce the minimum compilers supported by Kokkos. -IF(NOT CMAKE_CXX_STANDARD) - SET(CMAKE_CXX_STANDARD 17) -ENDIF() -IF(CMAKE_CXX_STANDARD EQUAL 17) - SET(KOKKOS_CLANG_CPU_MINIMUM 8.0.0) - SET(KOKKOS_CLANG_CUDA_MINIMUM 10.0.0) - SET(KOKKOS_CLANG_OPENMPTARGET_MINIMUM 15.0.0) - SET(KOKKOS_GCC_MINIMUM 8.2.0) - SET(KOKKOS_INTEL_MINIMUM 19.0.5) - SET(KOKKOS_INTEL_LLVM_CPU_MINIMUM 2021.1.1) - SET(KOKKOS_INTEL_LLVM_SYCL_MINIMUM 2023.0.0) - SET(KOKKOS_NVCC_MINIMUM 11.0.0) - SET(KOKKOS_HIPCC_MINIMUM 5.2.0) - SET(KOKKOS_NVHPC_MINIMUM 22.3) - SET(KOKKOS_MSVC_MINIMUM 19.29) -ELSE() - SET(KOKKOS_CLANG_CPU_MINIMUM 14.0.0) - SET(KOKKOS_CLANG_CUDA_MINIMUM 14.0.0) - SET(KOKKOS_CLANG_OPENMPTARGET_MINIMUM 15.0.0) - SET(KOKKOS_GCC_MINIMUM 10.1.0) - SET(KOKKOS_INTEL_MINIMUM "not supported") - SET(KOKKOS_INTEL_LLVM_CPU_MINIMUM 2022.0.0) - SET(KOKKOS_INTEL_LLVM_SYCL_MINIMUM 2023.0.0) - SET(KOKKOS_NVCC_MINIMUM 12.0.0) - SET(KOKKOS_HIPCC_MINIMUM 5.2.0) - SET(KOKKOS_NVHPC_MINIMUM 22.3) - SET(KOKKOS_MSVC_MINIMUM 19.30) -ENDIF() +if(NOT CMAKE_CXX_STANDARD) + set(CMAKE_CXX_STANDARD 17) +endif() +if(CMAKE_CXX_STANDARD EQUAL 17) + set(KOKKOS_CLANG_CPU_MINIMUM 8.0.0) + set(KOKKOS_CLANG_CUDA_MINIMUM 10.0.0) + set(KOKKOS_CLANG_OPENMPTARGET_MINIMUM 15.0.0) + set(KOKKOS_GCC_MINIMUM 8.2.0) + set(KOKKOS_INTEL_MINIMUM 19.0.5) + set(KOKKOS_INTEL_LLVM_CPU_MINIMUM 2021.1.1) + set(KOKKOS_INTEL_LLVM_SYCL_MINIMUM 2023.0.0) + set(KOKKOS_NVCC_MINIMUM 11.0.0) + set(KOKKOS_HIPCC_MINIMUM 5.2.0) + set(KOKKOS_NVHPC_MINIMUM 22.3) + set(KOKKOS_MSVC_MINIMUM 19.29) +else() + set(KOKKOS_CLANG_CPU_MINIMUM 14.0.0) + set(KOKKOS_CLANG_CUDA_MINIMUM 14.0.0) + set(KOKKOS_CLANG_OPENMPTARGET_MINIMUM 15.0.0) + set(KOKKOS_GCC_MINIMUM 10.1.0) + set(KOKKOS_INTEL_MINIMUM "not supported") + set(KOKKOS_INTEL_LLVM_CPU_MINIMUM 2022.0.0) + set(KOKKOS_INTEL_LLVM_SYCL_MINIMUM 2023.0.0) + set(KOKKOS_NVCC_MINIMUM 12.0.0) + set(KOKKOS_HIPCC_MINIMUM 5.2.0) + set(KOKKOS_NVHPC_MINIMUM 22.3) + set(KOKKOS_MSVC_MINIMUM 19.30) +endif() -SET(KOKKOS_MESSAGE_TEXT "Compiler not supported by Kokkos for C++${CMAKE_CXX_STANDARD}. Required minimum compiler versions:") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Clang(CPU) ${KOKKOS_CLANG_CPU_MINIMUM}") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Clang(CUDA) ${KOKKOS_CLANG_CUDA_MINIMUM}") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Clang(OpenMPTarget) ${KOKKOS_CLANG_OPENMPTARGET_MINIMUM}") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n GCC ${KOKKOS_GCC_MINIMUM}") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Intel ${KOKKOS_INTEL_MINIMUM}") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n IntelLLVM(CPU) ${KOKKOS_INTEL_LLVM_CPU_MINIMUM}") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n IntelLLVM(SYCL) ${KOKKOS_INTEL_LLVM_SYCL_MINIMUM}") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n NVCC ${KOKKOS_NVCC_MINIMUM}") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n HIPCC ${KOKKOS_HIPCC_MINIMUM}") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n NVHPC/PGI ${KOKKOS_NVHPC_MINIMUM}") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n MSVC ${KOKKOS_MSVC_MINIMUM}") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n XL/XLClang not supported") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\nCompiler: ${KOKKOS_CXX_COMPILER_ID} ${KOKKOS_CXX_COMPILER_VERSION}\n") +set(KOKKOS_MESSAGE_TEXT + "Compiler not supported by Kokkos for C++${CMAKE_CXX_STANDARD}. Required minimum compiler versions:" +) +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Clang(CPU) ${KOKKOS_CLANG_CPU_MINIMUM}") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Clang(CUDA) ${KOKKOS_CLANG_CUDA_MINIMUM}") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Clang(OpenMPTarget) ${KOKKOS_CLANG_OPENMPTARGET_MINIMUM}") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n GCC ${KOKKOS_GCC_MINIMUM}") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Intel ${KOKKOS_INTEL_MINIMUM}") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n IntelLLVM(CPU) ${KOKKOS_INTEL_LLVM_CPU_MINIMUM}") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n IntelLLVM(SYCL) ${KOKKOS_INTEL_LLVM_SYCL_MINIMUM}") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n NVCC ${KOKKOS_NVCC_MINIMUM}") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n HIPCC ${KOKKOS_HIPCC_MINIMUM}") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n NVHPC/PGI ${KOKKOS_NVHPC_MINIMUM}") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n MSVC ${KOKKOS_MSVC_MINIMUM}") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n XL/XLClang not supported") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\nCompiler: ${KOKKOS_CXX_COMPILER_ID} ${KOKKOS_CXX_COMPILER_VERSION}\n") -IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND NOT Kokkos_ENABLE_CUDA) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_CLANG_CPU_MINIMUM}) - MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") - ENDIF() -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND Kokkos_ENABLE_CUDA) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_CLANG_CUDA_MINIMUM}) - MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") - ENDIF() -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL GNU) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_GCC_MINIMUM}) - MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") - ENDIF() -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL Intel) - IF((NOT CMAKE_CXX_STANDARD EQUAL 17) OR (KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_INTEL_MINIMUM})) - MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") - ENDIF() -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM AND NOT Kokkos_ENABLE_SYCL) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_INTEL_LLVM_CPU_MINIMUM}) - MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") - ENDIF() -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM AND Kokkos_ENABLE_SYCL) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_INTEL_LLVM_SYCL_MINIMUM}) - MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") - ENDIF() -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_NVCC_MINIMUM}) - MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") - ENDIF() - SET(CMAKE_CXX_EXTENSIONS OFF CACHE BOOL "Kokkos turns off CXX extensions" FORCE) -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_HIPCC_MINIMUM}) - MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") - ENDIF() -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL PGI OR KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_NVHPC_MINIMUM}) - MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") - ENDIF() +if(KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND NOT Kokkos_ENABLE_CUDA) + if(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_CLANG_CPU_MINIMUM}) + message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + endif() +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND Kokkos_ENABLE_CUDA) + if(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_CLANG_CUDA_MINIMUM}) + message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + endif() +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL GNU) + if(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_GCC_MINIMUM}) + message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + endif() +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL Intel) + if((NOT CMAKE_CXX_STANDARD EQUAL 17) OR (KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_INTEL_MINIMUM})) + message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + endif() +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM AND NOT Kokkos_ENABLE_SYCL) + if(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_INTEL_LLVM_CPU_MINIMUM}) + message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + endif() +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM AND Kokkos_ENABLE_SYCL) + if(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_INTEL_LLVM_SYCL_MINIMUM}) + message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + endif() +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + if(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_NVCC_MINIMUM}) + message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + endif() + set(CMAKE_CXX_EXTENSIONS OFF CACHE BOOL "Kokkos turns off CXX extensions" FORCE) +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC) + if(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_HIPCC_MINIMUM}) + message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + endif() +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL PGI OR KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) + if(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_NVHPC_MINIMUM}) + message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + endif() # Treat PGI internally as NVHPC to simplify handling both compilers. # Before CMake 3.20 NVHPC was identified as PGI, nvc++ is # backward-compatible to pgc++. - SET(KOKKOS_CXX_COMPILER_ID NVHPC CACHE STRING INTERNAL FORCE) -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL "MSVC") - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_MSVC_MINIMUM}) - MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") - ENDIF() -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL XL OR KOKKOS_CXX_COMPILER_ID STREQUAL XLClang) - MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND Kokkos_ENABLE_OPENMPTARGET) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS KOKKOS_CLANG_OPENMPTARGET_MINIMUM) - MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") - ENDIF() -ENDIF() + set(KOKKOS_CXX_COMPILER_ID NVHPC CACHE STRING INTERNAL FORCE) +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL "MSVC") + if(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_MSVC_MINIMUM}) + message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + endif() +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL XL OR KOKKOS_CXX_COMPILER_ID STREQUAL XLClang) + message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND Kokkos_ENABLE_OPENMPTARGET) + if(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS KOKKOS_CLANG_OPENMPTARGET_MINIMUM) + message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + endif() +endif() -IF(NOT DEFINED KOKKOS_CXX_HOST_COMPILER_ID) - SET(KOKKOS_CXX_HOST_COMPILER_ID ${KOKKOS_CXX_COMPILER_ID}) -ELSEIF(KOKKOS_CXX_HOST_COMPILER_ID STREQUAL PGI) - SET(KOKKOS_CXX_HOST_COMPILER_ID NVHPC CACHE STRING INTERNAL FORCE) -ENDIF() +if(NOT DEFINED KOKKOS_CXX_HOST_COMPILER_ID) + set(KOKKOS_CXX_HOST_COMPILER_ID ${KOKKOS_CXX_COMPILER_ID}) +elseif(KOKKOS_CXX_HOST_COMPILER_ID STREQUAL PGI) + set(KOKKOS_CXX_HOST_COMPILER_ID NVHPC CACHE STRING INTERNAL FORCE) +endif() -STRING(REPLACE "." ";" VERSION_LIST ${KOKKOS_CXX_COMPILER_VERSION}) -LIST(GET VERSION_LIST 0 KOKKOS_COMPILER_VERSION_MAJOR) -LIST(GET VERSION_LIST 1 KOKKOS_COMPILER_VERSION_MINOR) -LIST(LENGTH VERSION_LIST LIST_LENGTH) +string(REPLACE "." ";" VERSION_LIST ${KOKKOS_CXX_COMPILER_VERSION}) +list(GET VERSION_LIST 0 KOKKOS_COMPILER_VERSION_MAJOR) +list(GET VERSION_LIST 1 KOKKOS_COMPILER_VERSION_MINOR) +list(LENGTH VERSION_LIST LIST_LENGTH) # On Android, the compiler doesn't have a patch version, just a major/minor -IF(LIST_LENGTH GREATER 2) - LIST(GET VERSION_LIST 2 KOKKOS_COMPILER_VERSION_PATCH) -ELSE() - SET(KOKKOS_COMPILER_VERSION_PATCH 0) -ENDIF() - +if(LIST_LENGTH GREATER 2) + list(GET VERSION_LIST 2 KOKKOS_COMPILER_VERSION_PATCH) +else() + set(KOKKOS_COMPILER_VERSION_PATCH 0) +endif() diff --git a/packages/kokkos/cmake/kokkos_configure_trilinos.cmake b/packages/kokkos/cmake/kokkos_configure_trilinos.cmake new file mode 100644 index 000000000000..5aeef61e7b32 --- /dev/null +++ b/packages/kokkos/cmake/kokkos_configure_trilinos.cmake @@ -0,0 +1,38 @@ +if(CMAKE_PROJECT_NAME STREQUAL "Trilinos") + set(Kokkos_ENABLE_SERIAL ON CACHE BOOL "Whether to build Serial backend" FORCE) + + if(NOT ${Trilinos_ENABLE_OpenMP} STREQUAL "") + set(Kokkos_ENABLE_OPENMP ${Trilinos_ENABLE_OpenMP} CACHE BOOL "Whether to build OpenMP backend" FORCE) + else() + set(Kokkos_ENABLE_OPENMP OFF CACHE BOOL "Whether to build OpenMP backend" FORCE) + endif() + + if(NOT ${TPL_ENABLE_CUDA} STREQUAL "") + set(Kokkos_ENABLE_CUDA ${TPL_ENABLE_CUDA} CACHE BOOL "Whether to build CUDA backend" FORCE) + else() + set(Kokkos_ENABLE_CUDA OFF CACHE BOOL "Whether to build CUDA backend" FORCE) + endif() + + if(NOT ${TPL_ENABLE_HPX} STREQUAL "") + set(Kokkos_ENABLE_HPX ${TPL_ENABLE_HPX} CACHE BOOL "Whether to build HPX backend" FORCE) + else() + set(Kokkos_ENABLE_HPX OFF CACHE BOOL "Whether to build HPX backend" FORCE) + endif() + + if(NOT ${TPL_ENABLE_quadmath} STREQUAL "") + set(Kokkos_ENABLE_LIBQUADMATH ${TPL_ENABLE_quadmath} CACHE BOOL "Whether to enable the LIBQUADMATH library" FORCE) + else() + set(Kokkos_ENABLE_LIBQUADMATH OFF CACHE BOOL "Whether to enable the LIBQUADMATH library" FORCE) + endif() + + if(NOT ${TPL_ENABLE_DLlib} STREQUAL "") + set(Kokkos_ENABLE_LIBDL ${TPL_ENABLE_DLlib} CACHE BOOL "Whether to enable the LIBDL library" FORCE) + else() + set(Kokkos_ENABLE_LIBDL OFF CACHE BOOL "Whether to enable the LIBDL library" FORCE) + endif() + + set(Kokkos_ENABLE_COMPLEX_ALIGN OFF CACHE BOOL "Whether to align Kokkos::complex to 2*alignof(RealType)") + + # FIXME_TRILINOS We run into problems when trying to use an external GTest in Trilinos CI + set(CMAKE_DISABLE_FIND_PACKAGE_GTest ON) +endif() diff --git a/packages/kokkos/cmake/kokkos_corner_cases.cmake b/packages/kokkos/cmake/kokkos_corner_cases.cmake index ede2b4e0caf8..530e9e8fd8e0 100644 --- a/packages/kokkos/cmake/kokkos_corner_cases.cmake +++ b/packages/kokkos/cmake/kokkos_corner_cases.cmake @@ -1,4 +1,8 @@ -IF (KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA AND KOKKOS_ENABLE_CUDA_CONSTEXPR AND KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 11.2) - MESSAGE(WARNING "You have requested -DKokkos_ENABLE_CUDA_CONSTEXPR=ON for NVCC ${KOKKOS_CXX_COMPILER_VERSION} which is known to trigger compiler bugs before NVCC version 11.2. See https://github.com/kokkos/kokkos/issues/3496") -ENDIF() - +if(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA AND KOKKOS_ENABLE_CUDA_CONSTEXPR AND KOKKOS_CXX_COMPILER_VERSION VERSION_LESS + 11.2 +) + message( + WARNING + "You have requested -DKokkos_ENABLE_CUDA_CONSTEXPR=ON for NVCC ${KOKKOS_CXX_COMPILER_VERSION} which is known to trigger compiler bugs before NVCC version 11.2. See https://github.com/kokkos/kokkos/issues/3496" + ) +endif() diff --git a/packages/kokkos/cmake/kokkos_enable_devices.cmake b/packages/kokkos/cmake/kokkos_enable_devices.cmake index c7d189285c58..40c2d3ea8afb 100644 --- a/packages/kokkos/cmake/kokkos_enable_devices.cmake +++ b/packages/kokkos/cmake/kokkos_enable_devices.cmake @@ -1,128 +1,132 @@ - -FUNCTION(KOKKOS_DEVICE_OPTION SUFFIX DEFAULT DEV_TYPE DOCSTRING) - KOKKOS_OPTION(ENABLE_${SUFFIX} ${DEFAULT} BOOL ${DOCSTRING}) - STRING(TOUPPER ${SUFFIX} UC_NAME) - IF (KOKKOS_ENABLE_${UC_NAME}) - LIST(APPEND KOKKOS_ENABLED_DEVICES ${SUFFIX}) +function(KOKKOS_DEVICE_OPTION SUFFIX DEFAULT DEV_TYPE DOCSTRING) + kokkos_option(ENABLE_${SUFFIX} ${DEFAULT} BOOL ${DOCSTRING}) + string(TOUPPER ${SUFFIX} UC_NAME) + if(KOKKOS_ENABLE_${UC_NAME}) + list(APPEND KOKKOS_ENABLED_DEVICES ${SUFFIX}) #I hate that CMake makes me do this - SET(KOKKOS_ENABLED_DEVICES ${KOKKOS_ENABLED_DEVICES} PARENT_SCOPE) - ENDIF() - SET(KOKKOS_ENABLE_${UC_NAME} ${KOKKOS_ENABLE_${UC_NAME}} PARENT_SCOPE) - IF (KOKKOS_ENABLE_${UC_NAME} AND DEV_TYPE STREQUAL "HOST") - SET(KOKKOS_HAS_HOST ON PARENT_SCOPE) - ENDIF() -ENDFUNCTION() + set(KOKKOS_ENABLED_DEVICES ${KOKKOS_ENABLED_DEVICES} PARENT_SCOPE) + endif() + set(KOKKOS_ENABLE_${UC_NAME} ${KOKKOS_ENABLE_${UC_NAME}} PARENT_SCOPE) + if(KOKKOS_ENABLE_${UC_NAME} AND DEV_TYPE STREQUAL "HOST") + set(KOKKOS_HAS_HOST ON PARENT_SCOPE) + endif() +endfunction() -KOKKOS_CFG_DEPENDS(DEVICES NONE) +kokkos_cfg_depends(DEVICES NONE) # Put a check in just in case people are using this option -KOKKOS_DEPRECATED_LIST(DEVICES ENABLE) - +kokkos_deprecated_list(DEVICES ENABLE) -KOKKOS_DEVICE_OPTION(THREADS OFF HOST "Whether to build C++ threads backend") +kokkos_device_option(THREADS OFF HOST "Whether to build C++ threads backend") # detect clang++ / cl / clang-cl clashes -IF (CMAKE_CXX_COMPILER_ID STREQUAL Clang AND "x${CMAKE_CXX_SIMULATE_ID}" STREQUAL "xMSVC") +if(CMAKE_CXX_COMPILER_ID STREQUAL Clang AND "x${CMAKE_CXX_SIMULATE_ID}" STREQUAL "xMSVC") # this specific test requires CMake >= 3.15 - IF ("x${CMAKE_CXX_COMPILER_FRONTEND_VARIANT}" STREQUAL "xGNU") + if("x${CMAKE_CXX_COMPILER_FRONTEND_VARIANT}" STREQUAL "xGNU") # use pure clang++ instead of clang-cl - SET(KOKKOS_COMPILER_CLANG_MSVC OFF) - ELSE() + set(KOKKOS_COMPILER_CLANG_MSVC OFF) + else() # it defaults to clang-cl - SET(KOKKOS_COMPILER_CLANG_MSVC ON) - ENDIF() -ENDIF() - -IF(Trilinos_ENABLE_Kokkos AND Trilinos_ENABLE_OpenMP) - SET(OMP_DEFAULT ON) -ELSE() - SET(OMP_DEFAULT OFF) -ENDIF() -KOKKOS_DEVICE_OPTION(OPENMP ${OMP_DEFAULT} HOST "Whether to build OpenMP backend") + set(KOKKOS_COMPILER_CLANG_MSVC ON) + endif() +endif() +if(Trilinos_ENABLE_Kokkos AND Trilinos_ENABLE_OpenMP) + set(OMP_DEFAULT ON) +else() + set(OMP_DEFAULT OFF) +endif() +kokkos_device_option(OPENMP ${OMP_DEFAULT} HOST "Whether to build OpenMP backend") # We want this to default to OFF for cache reasons, but if no # host space is given, then activate serial -IF (KOKKOS_HAS_TRILINOS) - #However, Trilinos always wants Serial ON - SET(SERIAL_DEFAULT ON) -ELSEIF (KOKKOS_HAS_HOST) - SET(SERIAL_DEFAULT OFF) -ELSE() - SET(SERIAL_DEFAULT ON) - IF (NOT DEFINED Kokkos_ENABLE_SERIAL) - MESSAGE(STATUS "SERIAL backend is being turned on to ensure there is at least one Host space. To change this, you must enable another host execution space and configure with -DKokkos_ENABLE_SERIAL=OFF or change CMakeCache.txt") - ENDIF() -ENDIF() -KOKKOS_DEVICE_OPTION(SERIAL ${SERIAL_DEFAULT} HOST "Whether to build serial backend") - -KOKKOS_DEVICE_OPTION(HPX OFF HOST "Whether to build HPX backend (experimental)") +if(KOKKOS_HAS_HOST) + set(SERIAL_DEFAULT OFF) +else() + set(SERIAL_DEFAULT ON) + if(NOT DEFINED Kokkos_ENABLE_SERIAL) + message( + STATUS + "SERIAL backend is being turned on to ensure there is at least one Host space. To change this, you must enable another host execution space and configure with -DKokkos_ENABLE_SERIAL=OFF or change CMakeCache.txt" + ) + endif() +endif() +kokkos_device_option(SERIAL ${SERIAL_DEFAULT} HOST "Whether to build serial backend") + +kokkos_device_option(HPX OFF HOST "Whether to build HPX backend (experimental)") # Device backends have to come after host backends for header include order reasons # Without this we can't make e.g. CudaSpace accessible by HostSpace -KOKKOS_DEVICE_OPTION(OPENACC OFF DEVICE "Whether to build the OpenACC backend") -IF (KOKKOS_ENABLE_OPENACC) - COMPILER_SPECIFIC_FLAGS( - Clang -fopenacc -fopenacc-fake-async-wait - -Wno-openacc-and-cxx -Wno-openmp-mapping -Wno-unknown-cuda-version - -Wno-pass-failed - ) - COMPILER_SPECIFIC_DEFS( - Clang KOKKOS_WORKAROUND_OPENMPTARGET_CLANG - ) -ENDIF() - -KOKKOS_DEVICE_OPTION(OPENMPTARGET OFF DEVICE "Whether to build the OpenMP target backend") -IF (KOKKOS_ENABLE_OPENMPTARGET) - SET(ClangOpenMPFlag -fopenmp=libomp) - IF(KOKKOS_CLANG_IS_CRAY) - SET(ClangOpenMPFlag -fopenmp) - ENDIF() - - COMPILER_SPECIFIC_FLAGS( - Clang ${ClangOpenMPFlag} -Wno-openmp-mapping - IntelLLVM -fiopenmp -Wno-openmp-mapping - NVHPC -mp=gpu - DEFAULT -fopenmp +kokkos_device_option(OPENACC OFF DEVICE "Whether to build the OpenACC backend") +if(KOKKOS_ENABLE_OPENACC) + compiler_specific_flags( + Clang + -fopenacc + -fopenacc-fake-async-wait + -fopenacc-implicit-worker=vector + -Wno-openacc-and-cxx + -Wno-openmp-mapping + -Wno-unknown-cuda-version + -Wno-pass-failed ) - COMPILER_SPECIFIC_DEFS( - Clang KOKKOS_WORKAROUND_OPENMPTARGET_CLANG + compiler_specific_defs(Clang KOKKOS_WORKAROUND_OPENMPTARGET_CLANG) +endif() + +kokkos_device_option(OPENMPTARGET OFF DEVICE "Whether to build the OpenMP target backend") +if(KOKKOS_ENABLE_OPENMPTARGET) + set(ClangOpenMPFlag -fopenmp=libomp) + if(KOKKOS_CLANG_IS_CRAY) + set(ClangOpenMPFlag -fopenmp) + endif() + + compiler_specific_flags( + Clang + ${ClangOpenMPFlag} + -Wno-openmp-mapping + IntelLLVM + -fiopenmp + -Wno-openmp-mapping + NVHPC + -mp=gpu + DEFAULT + -fopenmp ) -# Are there compilers which identify as Clang and need this library? -# COMPILER_SPECIFIC_LIBS( -# Clang -lopenmptarget -# ) - IF(KOKKOS_CXX_STANDARD LESS 17) - MESSAGE(FATAL_ERROR "OpenMPTarget backend requires C++17 or newer") - ENDIF() -ENDIF() - -IF(Trilinos_ENABLE_Kokkos AND TPL_ENABLE_CUDA) - SET(CUDA_DEFAULT ON) -ELSE() - SET(CUDA_DEFAULT OFF) -ENDIF() -KOKKOS_DEVICE_OPTION(CUDA ${CUDA_DEFAULT} DEVICE "Whether to build CUDA backend") - -IF (KOKKOS_ENABLE_CUDA) - GLOBAL_SET(KOKKOS_DONT_ALLOW_EXTENSIONS "CUDA enabled") -## Cuda has extra setup requirements, turn on Kokkos_Setup_Cuda.hpp in macros - LIST(APPEND DEVICE_SETUP_LIST Cuda) -ENDIF() - -KOKKOS_DEVICE_OPTION(HIP OFF DEVICE "Whether to build HIP backend") + compiler_specific_defs(Clang KOKKOS_WORKAROUND_OPENMPTARGET_CLANG) + # Are there compilers which identify as Clang and need this library? + # COMPILER_SPECIFIC_LIBS( + # Clang -lopenmptarget + # ) + if(KOKKOS_CXX_STANDARD LESS 17) + message(FATAL_ERROR "OpenMPTarget backend requires C++17 or newer") + endif() +endif() + +if(Trilinos_ENABLE_Kokkos AND TPL_ENABLE_CUDA) + set(CUDA_DEFAULT ON) +else() + set(CUDA_DEFAULT OFF) +endif() +kokkos_device_option(CUDA ${CUDA_DEFAULT} DEVICE "Whether to build CUDA backend") + +if(KOKKOS_ENABLE_CUDA) + global_set(KOKKOS_DONT_ALLOW_EXTENSIONS "CUDA enabled") + ## Cuda has extra setup requirements, turn on Kokkos_Setup_Cuda.hpp in macros + list(APPEND DEVICE_SETUP_LIST Cuda) +endif() + +kokkos_device_option(HIP OFF DEVICE "Whether to build HIP backend") ## HIP has extra setup requirements, turn on Kokkos_Setup_HIP.hpp in macros -IF (KOKKOS_ENABLE_HIP) - LIST(APPEND DEVICE_SETUP_LIST HIP) -ENDIF() +if(KOKKOS_ENABLE_HIP) + list(APPEND DEVICE_SETUP_LIST HIP) +endif() -KOKKOS_DEVICE_OPTION(SYCL OFF DEVICE "Whether to build SYCL backend") +kokkos_device_option(SYCL OFF DEVICE "Whether to build SYCL backend") ## SYCL has extra setup requirements, turn on Kokkos_Setup_SYCL.hpp in macros -IF (KOKKOS_ENABLE_SYCL) - IF(KOKKOS_CXX_STANDARD LESS 17) - MESSAGE(FATAL_ERROR "SYCL backend requires C++17 or newer!") - ENDIF() - LIST(APPEND DEVICE_SETUP_LIST SYCL) -ENDIF() +if(KOKKOS_ENABLE_SYCL) + if(KOKKOS_CXX_STANDARD LESS 17) + message(FATAL_ERROR "SYCL backend requires C++17 or newer!") + endif() + list(APPEND DEVICE_SETUP_LIST SYCL) +endif() diff --git a/packages/kokkos/cmake/kokkos_enable_options.cmake b/packages/kokkos/cmake/kokkos_enable_options.cmake index 53764b0c6848..a5d6fdfe4edd 100644 --- a/packages/kokkos/cmake/kokkos_enable_options.cmake +++ b/packages/kokkos/cmake/kokkos_enable_options.cmake @@ -1,198 +1,236 @@ ########################## NOTES ############################################### # List the options for configuring kokkos using CMake method of doing it. -# These options then get mapped onto KOKKOS_SETTINGS environment variable by -# kokkos_settings.cmake. It is separate to allow other packages to override -# these variables (e.g., TriBITS). ########################## AVAILABLE OPTIONS ################################### # Use lists for documentation, verification, and programming convenience - -FUNCTION(KOKKOS_ENABLE_OPTION SUFFIX DEFAULT DOCSTRING) - KOKKOS_OPTION(ENABLE_${SUFFIX} ${DEFAULT} BOOL ${DOCSTRING}) - STRING(TOUPPER ${SUFFIX} UC_NAME) - IF (KOKKOS_ENABLE_${UC_NAME} AND NOT "Kokkos_ENABLE_${UC_NAME}" IN_LIST Kokkos_OPTIONS_NOT_TO_EXPORT) - LIST(APPEND KOKKOS_ENABLED_OPTIONS ${UC_NAME}) +function(KOKKOS_ENABLE_OPTION SUFFIX DEFAULT DOCSTRING) + kokkos_option(ENABLE_${SUFFIX} ${DEFAULT} BOOL ${DOCSTRING}) + string(TOUPPER ${SUFFIX} UC_NAME) + if(KOKKOS_ENABLE_${UC_NAME} AND NOT "Kokkos_ENABLE_${UC_NAME}" IN_LIST Kokkos_OPTIONS_NOT_TO_EXPORT) + list(APPEND KOKKOS_ENABLED_OPTIONS ${UC_NAME}) #I hate that CMake makes me do this - SET(KOKKOS_ENABLED_OPTIONS ${KOKKOS_ENABLED_OPTIONS} PARENT_SCOPE) - ENDIF() - SET(KOKKOS_ENABLE_${UC_NAME} ${KOKKOS_ENABLE_${UC_NAME}} PARENT_SCOPE) -ENDFUNCTION() + set(KOKKOS_ENABLED_OPTIONS ${KOKKOS_ENABLED_OPTIONS} PARENT_SCOPE) + endif() + set(KOKKOS_ENABLE_${UC_NAME} ${KOKKOS_ENABLE_${UC_NAME}} PARENT_SCOPE) +endfunction() # Certain defaults will depend on knowing the enabled devices -KOKKOS_CFG_DEPENDS(OPTIONS DEVICES) -KOKKOS_CFG_DEPENDS(OPTIONS COMPILER_ID) +kokkos_cfg_depends(OPTIONS DEVICES) +kokkos_cfg_depends(OPTIONS COMPILER_ID) # Put a check in just in case people are using this option -KOKKOS_DEPRECATED_LIST(OPTIONS ENABLE) +kokkos_deprecated_list(OPTIONS ENABLE) -KOKKOS_ENABLE_OPTION(CUDA_RELOCATABLE_DEVICE_CODE OFF "Whether to enable relocatable device code (RDC) for CUDA") -KOKKOS_ENABLE_OPTION(CUDA_UVM OFF "Whether to use unified memory (UM) for CUDA by default") -KOKKOS_ENABLE_OPTION(CUDA_LDG_INTRINSIC OFF "Whether to use CUDA LDG intrinsics") +kokkos_enable_option(CUDA_RELOCATABLE_DEVICE_CODE OFF "Whether to enable relocatable device code (RDC) for CUDA") +kokkos_enable_option(CUDA_UVM OFF "Whether to use unified memory (UM) for CUDA by default") +kokkos_enable_option(CUDA_LDG_INTRINSIC OFF "Whether to use CUDA LDG intrinsics") # In contrast to other CUDA-dependent, options CUDA_LAMBDA is ON by default. # That is problematic when CUDA is not enabled because this not only yields a # bogus warning, but also exports the Kokkos_ENABLE_CUDA_LAMBDA variable and -# sets it to ON. This if-clause is a crutch that delays the refactoring of the -# way we declare all options until after we get rid of TriBITS. -IF (Trilinos_ENABLE_Kokkos AND TPL_ENABLE_CUDA) - SET(CUDA_LAMBDA_DEFAULT ON) -ELSEIF (KOKKOS_ENABLE_CUDA) - SET(CUDA_LAMBDA_DEFAULT ON) -ELSE() - SET(CUDA_LAMBDA_DEFAULT OFF) -ENDIF() -KOKKOS_ENABLE_OPTION(CUDA_LAMBDA ${CUDA_LAMBDA_DEFAULT} "Whether to allow lambda expressions on the device with NVCC **DEPRECATED**") - -# May be used to disable our use of CudaMallocAsync. It had caused issues in -# the past when UCX was used as MPI communication layer. We expect it is -# resolved but we keep the option around a bit longer to be safe. -KOKKOS_ENABLE_OPTION(IMPL_CUDA_MALLOC_ASYNC ON "Whether to enable CudaMallocAsync (requires CUDA Toolkit 11.2)") -KOKKOS_ENABLE_OPTION(IMPL_NVHPC_AS_DEVICE_COMPILER OFF "Whether to allow nvc++ as Cuda device compiler") -KOKKOS_ENABLE_OPTION(IMPL_CUDA_UNIFIED_MEMORY OFF "Whether to leverage unified memory architectures for CUDA") - -KOKKOS_ENABLE_OPTION(DEPRECATED_CODE_4 ON "Whether code deprecated in major release 4 is available" ) -KOKKOS_ENABLE_OPTION(DEPRECATION_WARNINGS ON "Whether to emit deprecation warnings" ) -KOKKOS_ENABLE_OPTION(HIP_RELOCATABLE_DEVICE_CODE OFF "Whether to enable relocatable device code (RDC) for HIP") -KOKKOS_ENABLE_OPTION(TESTS OFF "Whether to build the unit tests") -KOKKOS_ENABLE_OPTION(BENCHMARKS OFF "Whether to build the benchmarks") -KOKKOS_ENABLE_OPTION(EXAMPLES OFF "Whether to build the examples") -STRING(TOUPPER "${CMAKE_BUILD_TYPE}" UPPERCASE_CMAKE_BUILD_TYPE) -IF(UPPERCASE_CMAKE_BUILD_TYPE STREQUAL "DEBUG") - KOKKOS_ENABLE_OPTION(DEBUG ON "Whether to activate extra debug features - may increase compile times") - KOKKOS_ENABLE_OPTION(DEBUG_DUALVIEW_MODIFY_CHECK ON "Debug check on dual views") -ELSE() - KOKKOS_ENABLE_OPTION(DEBUG OFF "Whether to activate extra debug features - may increase compile times") - KOKKOS_ENABLE_OPTION(DEBUG_DUALVIEW_MODIFY_CHECK OFF "Debug check on dual views") -ENDIF() -UNSET(_UPPERCASE_CMAKE_BUILD_TYPE) -KOKKOS_ENABLE_OPTION(LARGE_MEM_TESTS OFF "Whether to perform extra large memory tests") -KOKKOS_ENABLE_OPTION(DEBUG_BOUNDS_CHECK OFF "Whether to use bounds checking - will increase runtime") -KOKKOS_ENABLE_OPTION(COMPILER_WARNINGS OFF "Whether to print all compiler warnings") -KOKKOS_ENABLE_OPTION(TUNING OFF "Whether to create bindings for tuning tools") -KOKKOS_ENABLE_OPTION(AGGRESSIVE_VECTORIZATION OFF "Whether to aggressively vectorize loops") -KOKKOS_ENABLE_OPTION(COMPILE_AS_CMAKE_LANGUAGE OFF "Whether to use native cmake language support") -KOKKOS_ENABLE_OPTION(HIP_MULTIPLE_KERNEL_INSTANTIATIONS OFF "Whether multiple kernels are instantiated at compile time - improve performance but increase compile time") -KOKKOS_ENABLE_OPTION(IMPL_HIP_UNIFIED_MEMORY OFF "Whether to leverage unified memory architectures for HIP") +# sets it to ON. +kokkos_enable_option( + CUDA_LAMBDA ${KOKKOS_ENABLE_CUDA} "Whether to allow lambda expressions on the device with NVCC **DEPRECATED**" +) + +# As of 09/2024, cudaMallocAsync causes issues with ICP and older version of UCX +# as MPI communication layer. +kokkos_enable_option(IMPL_CUDA_MALLOC_ASYNC OFF "Whether to enable CudaMallocAsync (requires CUDA Toolkit 11.2)") +kokkos_enable_option(IMPL_NVHPC_AS_DEVICE_COMPILER OFF "Whether to allow nvc++ as Cuda device compiler") +kokkos_enable_option(IMPL_CUDA_UNIFIED_MEMORY OFF "Whether to leverage unified memory architectures for CUDA") + +kokkos_enable_option(DEPRECATED_CODE_4 ON "Whether code deprecated in major release 4 is available") +kokkos_enable_option(DEPRECATION_WARNINGS ON "Whether to emit deprecation warnings") +kokkos_enable_option(HIP_RELOCATABLE_DEVICE_CODE OFF "Whether to enable relocatable device code (RDC) for HIP") + +# Disabling RDC only works properly since oneAPI 2024.1.0 +if(KOKKOS_ENABLE_SYCL AND KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM AND KOKKOS_CXX_COMPILER_VERSION VERSION_LESS + 2024.1.0 +) + set(SYCL_RDC_DEFAULT ON) +else() + set(SYCL_RDC_DEFAULT OFF) +endif() +kokkos_enable_option( + SYCL_RELOCATABLE_DEVICE_CODE ${SYCL_RDC_DEFAULT} "Whether to enable relocatable device code (RDC) for SYCL" +) +kokkos_enable_option(TESTS OFF "Whether to build the unit tests") +kokkos_enable_option(BENCHMARKS OFF "Whether to build the benchmarks") +kokkos_enable_option(EXAMPLES OFF "Whether to build the examples") +string(TOUPPER "${CMAKE_BUILD_TYPE}" UPPERCASE_CMAKE_BUILD_TYPE) +if(UPPERCASE_CMAKE_BUILD_TYPE STREQUAL "DEBUG") + kokkos_enable_option(DEBUG ON "Whether to activate extra debug features - may increase compile times") + kokkos_enable_option(DEBUG_DUALVIEW_MODIFY_CHECK ON "Debug check on dual views") +else() + kokkos_enable_option(DEBUG OFF "Whether to activate extra debug features - may increase compile times") + kokkos_enable_option(DEBUG_DUALVIEW_MODIFY_CHECK OFF "Debug check on dual views") +endif() +unset(_UPPERCASE_CMAKE_BUILD_TYPE) +kokkos_enable_option(LARGE_MEM_TESTS OFF "Whether to perform extra large memory tests") +kokkos_enable_option(DEBUG_BOUNDS_CHECK OFF "Whether to use bounds checking - will increase runtime") +kokkos_enable_option(COMPILER_WARNINGS OFF "Whether to print all compiler warnings") +kokkos_enable_option(TUNING OFF "Whether to create bindings for tuning tools") +kokkos_enable_option(AGGRESSIVE_VECTORIZATION OFF "Whether to aggressively vectorize loops") +kokkos_enable_option(COMPILE_AS_CMAKE_LANGUAGE OFF "Whether to use native cmake language support") +kokkos_enable_option( + HIP_MULTIPLE_KERNEL_INSTANTIATIONS OFF + "Whether multiple kernels are instantiated at compile time - improve performance but increase compile time" +) +kokkos_enable_option(IMPL_HIP_MALLOC_ASYNC OFF "Whether to enable hipMallocAsync") +kokkos_enable_option(OPENACC_FORCE_HOST_AS_DEVICE OFF "Whether to force to use host as a target device for OpenACC") # This option will go away eventually, but allows fallback to old implementation when needed. -KOKKOS_ENABLE_OPTION(DESUL_ATOMICS_EXTERNAL OFF "Whether to use an external desul installation") -KOKKOS_ENABLE_OPTION(ATOMICS_BYPASS OFF "**NOT RECOMMENDED** Whether to make atomics non-atomic for non-threaded MPI-only use cases") -KOKKOS_ENABLE_OPTION(IMPL_REF_COUNT_BRANCH_UNLIKELY ON "Whether to use the C++20 `[[unlikely]]` attribute in the view reference counting") +kokkos_enable_option(DESUL_ATOMICS_EXTERNAL OFF "Whether to use an external desul installation") +kokkos_enable_option( + ATOMICS_BYPASS OFF "**NOT RECOMMENDED** Whether to make atomics non-atomic for non-threaded MPI-only use cases" +) +kokkos_enable_option( + IMPL_REF_COUNT_BRANCH_UNLIKELY ON "Whether to use the C++20 `[[unlikely]]` attribute in the view reference counting" +) mark_as_advanced(Kokkos_ENABLE_IMPL_REF_COUNT_BRANCH_UNLIKELY) -KOKKOS_ENABLE_OPTION(IMPL_VIEW_OF_VIEWS_DESTRUCTOR_PRECONDITION_VIOLATION_WORKAROUND OFF "Whether to enable a workaround for invalid use of View of Views that causes program hang on destruction.") +kokkos_enable_option( + IMPL_VIEW_OF_VIEWS_DESTRUCTOR_PRECONDITION_VIOLATION_WORKAROUND OFF + "Whether to enable a workaround for invalid use of View of Views that causes program hang on destruction." +) mark_as_advanced(Kokkos_ENABLE_IMPL_VIEW_OF_VIEWS_DESTRUCTOR_PRECONDITION_VIOLATION_WORKAROUND) -KOKKOS_ENABLE_OPTION(IMPL_MDSPAN ON "Whether to enable experimental mdspan support") -KOKKOS_ENABLE_OPTION(MDSPAN_EXTERNAL OFF BOOL "Whether to use an external version of mdspan") -KOKKOS_ENABLE_OPTION(IMPL_SKIP_COMPILER_MDSPAN ON BOOL "Whether to use an internal version of mdspan even if the compiler supports mdspan") +kokkos_enable_option(IMPL_MDSPAN ON "Whether to enable experimental mdspan support") +kokkos_enable_option(MDSPAN_EXTERNAL OFF BOOL "Whether to use an external version of mdspan") +kokkos_enable_option( + IMPL_SKIP_COMPILER_MDSPAN ON BOOL "Whether to use an internal version of mdspan even if the compiler supports mdspan" +) mark_as_advanced(Kokkos_ENABLE_IMPL_MDSPAN) mark_as_advanced(Kokkos_ENABLE_MDSPAN_EXTERNAL) mark_as_advanced(Kokkos_ENABLE_IMPL_SKIP_COMPILER_MDSPAN) -IF (Trilinos_ENABLE_Kokkos) - SET(COMPLEX_ALIGN_DEFAULT OFF) -ELSE() - SET(COMPLEX_ALIGN_DEFAULT ON) -ENDIF() -KOKKOS_ENABLE_OPTION(COMPLEX_ALIGN ${COMPLEX_ALIGN_DEFAULT} "Whether to align Kokkos::complex to 2*alignof(RealType)") - -IF (KOKKOS_ENABLE_TESTS) - SET(HEADER_SELF_CONTAINMENT_TESTS_DEFAULT ON) -ELSE() - SET(HEADER_SELF_CONTAINMENT_TESTS_DEFAULT OFF) -ENDIF() -KOKKOS_ENABLE_OPTION(HEADER_SELF_CONTAINMENT_TESTS ${HEADER_SELF_CONTAINMENT_TESTS_DEFAULT} "Enable header self-containment unit tests") -IF (NOT KOKKOS_ENABLE_TESTS AND KOKKOS_ENABLE_HEADER_SELF_CONTAINMENT_TESTS) - MESSAGE(WARNING "Kokkos_ENABLE_HEADER_SELF_CONTAINMENT_TESTS is ON but Kokkos_ENABLE_TESTS is OFF. Option will be ignored.") -ENDIF() - -IF (KOKKOS_ENABLE_CUDA AND (KOKKOS_CXX_COMPILER_ID STREQUAL Clang)) - SET(CUDA_CONSTEXPR_DEFAULT ON) -ELSE() - SET(CUDA_CONSTEXPR_DEFAULT OFF) -ENDIF() -KOKKOS_ENABLE_OPTION(CUDA_CONSTEXPR ${CUDA_CONSTEXPR_DEFAULT} "Whether to activate experimental relaxed constexpr functions") - -IF (KOKKOS_ENABLE_HPX) - SET(HPX_ASYNC_DISPATCH_DEFAULT ON) -ELSE() - SET(HPX_ASYNC_DISPATCH_DEFAULT OFF) -ENDIF() -KOKKOS_ENABLE_OPTION(IMPL_HPX_ASYNC_DISPATCH ${HPX_ASYNC_DISPATCH_DEFAULT} "Whether HPX supports asynchronous dispatch") - -Kokkos_ENABLE_OPTION(UNSUPPORTED_ARCHS OFF "Whether to allow architectures in backends Kokkos doesn't optimize for") - -FUNCTION(check_device_specific_options) - CMAKE_PARSE_ARGUMENTS(SOME "" "DEVICE" "OPTIONS" ${ARGN}) - IF(NOT KOKKOS_ENABLE_${SOME_DEVICE}) - FOREACH(OPTION ${SOME_OPTIONS}) - IF(NOT DEFINED CACHE{Kokkos_ENABLE_${OPTION}} OR NOT DEFINED CACHE{Kokkos_ENABLE_${SOME_DEVICE}}) - MESSAGE(FATAL_ERROR "Internal logic error: option '${OPTION}' or device '${SOME_DEVICE}' not recognized.") - ENDIF() - IF(KOKKOS_ENABLE_${OPTION}) - MESSAGE(WARNING "Kokkos_ENABLE_${OPTION} is ON but ${SOME_DEVICE} backend is not enabled. Option will be ignored.") - UNSET(KOKKOS_ENABLE_${OPTION} PARENT_SCOPE) - ENDIF() - ENDFOREACH() - ENDIF() -ENDFUNCTION() - -CHECK_DEVICE_SPECIFIC_OPTIONS(DEVICE CUDA OPTIONS CUDA_UVM CUDA_RELOCATABLE_DEVICE_CODE CUDA_LAMBDA CUDA_CONSTEXPR CUDA_LDG_INTRINSIC IMPL_CUDA_UNIFIED_MEMORY) -CHECK_DEVICE_SPECIFIC_OPTIONS(DEVICE HIP OPTIONS HIP_RELOCATABLE_DEVICE_CODE) -CHECK_DEVICE_SPECIFIC_OPTIONS(DEVICE HPX OPTIONS IMPL_HPX_ASYNC_DISPATCH) +kokkos_enable_option(COMPLEX_ALIGN ON "Whether to align Kokkos::complex to 2*alignof(RealType)") + +if(KOKKOS_ENABLE_TESTS) + set(HEADER_SELF_CONTAINMENT_TESTS_DEFAULT ON) +else() + set(HEADER_SELF_CONTAINMENT_TESTS_DEFAULT OFF) +endif() +kokkos_enable_option( + HEADER_SELF_CONTAINMENT_TESTS ${HEADER_SELF_CONTAINMENT_TESTS_DEFAULT} "Enable header self-containment unit tests" +) +if(NOT KOKKOS_ENABLE_TESTS AND KOKKOS_ENABLE_HEADER_SELF_CONTAINMENT_TESTS) + message( + WARNING "Kokkos_ENABLE_HEADER_SELF_CONTAINMENT_TESTS is ON but Kokkos_ENABLE_TESTS is OFF. Option will be ignored." + ) +endif() + +if(KOKKOS_ENABLE_CUDA AND (KOKKOS_CXX_COMPILER_ID STREQUAL Clang)) + set(CUDA_CONSTEXPR_DEFAULT ON) +else() + set(CUDA_CONSTEXPR_DEFAULT OFF) +endif() +kokkos_enable_option( + CUDA_CONSTEXPR ${CUDA_CONSTEXPR_DEFAULT} "Whether to activate experimental relaxed constexpr functions" +) + +if(KOKKOS_ENABLE_HPX) + set(HPX_ASYNC_DISPATCH_DEFAULT ON) +else() + set(HPX_ASYNC_DISPATCH_DEFAULT OFF) +endif() +kokkos_enable_option(IMPL_HPX_ASYNC_DISPATCH ${HPX_ASYNC_DISPATCH_DEFAULT} "Whether HPX supports asynchronous dispatch") + +kokkos_enable_option(UNSUPPORTED_ARCHS OFF "Whether to allow architectures in backends Kokkos doesn't optimize for") + +function(check_device_specific_options) + cmake_parse_arguments(SOME "" "DEVICE" "OPTIONS" ${ARGN}) + if(NOT KOKKOS_ENABLE_${SOME_DEVICE}) + foreach(OPTION ${SOME_OPTIONS}) + if(NOT DEFINED CACHE{Kokkos_ENABLE_${OPTION}} OR NOT DEFINED CACHE{Kokkos_ENABLE_${SOME_DEVICE}}) + message(FATAL_ERROR "Internal logic error: option '${OPTION}' or device '${SOME_DEVICE}' not recognized.") + endif() + if(KOKKOS_ENABLE_${OPTION}) + message( + WARNING "Kokkos_ENABLE_${OPTION} is ON but ${SOME_DEVICE} backend is not enabled. Option will be ignored." + ) + unset(KOKKOS_ENABLE_${OPTION} PARENT_SCOPE) + endif() + endforeach() + endif() +endfunction() + +check_device_specific_options( + DEVICE + CUDA + OPTIONS + CUDA_UVM + CUDA_RELOCATABLE_DEVICE_CODE + CUDA_LAMBDA + CUDA_CONSTEXPR + CUDA_LDG_INTRINSIC + IMPL_CUDA_MALLOC_ASYNC + IMPL_CUDA_UNIFIED_MEMORY +) +check_device_specific_options( + DEVICE HIP OPTIONS HIP_RELOCATABLE_DEVICE_CODE HIP_MULTIPLE_KERNEL_INSTANTIATIONS IMPL_HIP_MALLOC_ASYNC +) +check_device_specific_options(DEVICE HPX OPTIONS IMPL_HPX_ASYNC_DISPATCH) +check_device_specific_options(DEVICE OPENACC OPTIONS OPENACC_FORCE_HOST_AS_DEVICE) # Needed due to change from deprecated name to new header define name -IF (KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) - SET(KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION ON) -ENDIF() +if(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) + set(KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION ON) +endif() # Force consistency of KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE # and CMAKE_CUDA_SEPARABLE_COMPILATION when we are compiling # using the CMake CUDA language support. # Either one being on will turn the other one on. -IF (KOKKOS_COMPILE_LANGUAGE STREQUAL CUDA) - IF (KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE) - IF (NOT CMAKE_CUDA_SEPARABLE_COMPILATION) - MESSAGE(STATUS "Setting CMAKE_CUDA_SEPARABLE_COMPILATION=ON since Kokkos_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE is true. When compiling Kokkos with CMake language CUDA, please use CMAKE_CUDA_SEPARABLE_COMPILATION to control RDC support") - SET(CMAKE_CUDA_SEPARABLE_COMPILATION ON) - ENDIF() - ELSE() - IF (CMAKE_CUDA_SEPARABLE_COMPILATION) - SET(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE ON) - ENDIF() - ENDIF() -ENDIF() +if(KOKKOS_COMPILE_LANGUAGE STREQUAL CUDA) + if(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE) + if(NOT CMAKE_CUDA_SEPARABLE_COMPILATION) + message( + STATUS + "Setting CMAKE_CUDA_SEPARABLE_COMPILATION=ON since Kokkos_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE is true. When compiling Kokkos with CMake language CUDA, please use CMAKE_CUDA_SEPARABLE_COMPILATION to control RDC support" + ) + set(CMAKE_CUDA_SEPARABLE_COMPILATION ON) + endif() + else() + if(CMAKE_CUDA_SEPARABLE_COMPILATION) + set(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE ON) + endif() + endif() +endif() # This is known to occur with Clang 9. We would need to use nvcc as the linker # http://lists.llvm.org/pipermail/cfe-dev/2018-June/058296.html # TODO: Through great effort we can use a different linker by hacking # CMAKE_CXX_LINK_EXECUTABLE in a future release -IF (KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE AND KOKKOS_CXX_COMPILER_ID STREQUAL Clang) - MESSAGE(FATAL_ERROR "Relocatable device code is currently not supported with Clang - must use nvcc_wrapper or turn off RDC") -ENDIF() - -IF (KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE AND BUILD_SHARED_LIBS) - MESSAGE(FATAL_ERROR "Relocatable device code requires static libraries.") -ENDIF() - -IF(Kokkos_ENABLE_CUDA_LDG_INTRINSIC) - IF(KOKKOS_ENABLE_DEPRECATED_CODE_4) - MESSAGE(DEPRECATION "Setting Kokkos_ENABLE_CUDA_LDG_INTRINSIC is deprecated. LDG intrinsics are always enabled.") - ELSE() - MESSAGE(FATAL_ERROR "Kokkos_ENABLE_CUDA_LDG_INTRINSIC has been removed. LDG intrinsics are always enabled.") - ENDIF() -ENDIF() -IF(Kokkos_ENABLE_CUDA AND NOT Kokkos_ENABLE_CUDA_LAMBDA) - IF(KOKKOS_ENABLE_DEPRECATED_CODE_4) - MESSAGE(DEPRECATION "Setting Kokkos_ENABLE_CUDA_LAMBDA is deprecated. Lambda expressions in device code are always enabled. Forcing -DKokkos_ENABLE_CUDA_LAMBDA=ON") +if(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE AND KOKKOS_CXX_COMPILER_ID STREQUAL Clang) + message( + FATAL_ERROR "Relocatable device code is currently not supported with Clang - must use nvcc_wrapper or turn off RDC" + ) +endif() + +if(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE AND BUILD_SHARED_LIBS) + message(FATAL_ERROR "Relocatable device code requires static libraries.") +endif() + +if(Kokkos_ENABLE_CUDA_LDG_INTRINSIC) + if(KOKKOS_ENABLE_DEPRECATED_CODE_4) + message(DEPRECATION "Setting Kokkos_ENABLE_CUDA_LDG_INTRINSIC is deprecated. LDG intrinsics are always enabled.") + else() + message(FATAL_ERROR "Kokkos_ENABLE_CUDA_LDG_INTRINSIC has been removed. LDG intrinsics are always enabled.") + endif() +endif() +if(Kokkos_ENABLE_CUDA AND NOT Kokkos_ENABLE_CUDA_LAMBDA) + if(KOKKOS_ENABLE_DEPRECATED_CODE_4) + message( + DEPRECATION + "Setting Kokkos_ENABLE_CUDA_LAMBDA is deprecated. Lambda expressions in device code are always enabled. Forcing -DKokkos_ENABLE_CUDA_LAMBDA=ON" + ) set(Kokkos_ENABLE_CUDA_LAMBDA ON CACHE BOOL "Kokkos turned Cuda lambda support ON!" FORCE) set(KOKKOS_ENABLE_CUDA_LAMBDA ON) - ELSE() - MESSAGE(FATAL_ERROR "Kokkos_ENABLE_CUDA_LAMBDA has been removed. Lambda expressions in device code always enabled.") - ENDIF() -ENDIF() - - -IF(DEFINED Kokkos_ENABLE_IMPL_DESUL_ATOMICS) - MESSAGE(WARNING "Kokkos_ENABLE_IMPL_DESUL_ATOMICS option has been removed. Desul atomics cannot be disabled.") -ENDIF() + else() + message(FATAL_ERROR "Kokkos_ENABLE_CUDA_LAMBDA has been removed. Lambda expressions in device code always enabled.") + endif() +endif() + +if(DEFINED Kokkos_ENABLE_IMPL_DESUL_ATOMICS) + message(WARNING "Kokkos_ENABLE_IMPL_DESUL_ATOMICS option has been removed. Desul atomics cannot be disabled.") +endif() diff --git a/packages/kokkos/cmake/kokkos_functions.cmake b/packages/kokkos/cmake/kokkos_functions.cmake index d1f1e0d7a785..38eedd8362c5 100644 --- a/packages/kokkos/cmake/kokkos_functions.cmake +++ b/packages/kokkos/cmake/kokkos_functions.cmake @@ -5,12 +5,8 @@ # Validate options are given with correct case and define an internal # upper-case version for use within -set(Kokkos_OPTIONS_NOT_TO_EXPORT - Kokkos_ENABLE_BENCHMARKS - Kokkos_ENABLE_EXAMPLES - Kokkos_ENABLE_TESTS - Kokkos_ENABLE_HEADER_SELF_CONTAINMENT_TESTS - Kokkos_ENABLE_COMPILER_WARNINGS +set(Kokkos_OPTIONS_NOT_TO_EXPORT Kokkos_ENABLE_BENCHMARKS Kokkos_ENABLE_EXAMPLES Kokkos_ENABLE_TESTS + Kokkos_ENABLE_HEADER_SELF_CONTAINMENT_TESTS Kokkos_ENABLE_COMPILER_WARNINGS ) # @@ -22,139 +18,122 @@ set(Kokkos_OPTIONS_NOT_TO_EXPORT # It attempts to print a helpful message about updating the options for the new CMake. # Kokkos_${SUFFIX} is the name of the option (like Kokkos_ARCH) being checked. # Kokkos_${PREFIX}_X is the name of new option to be defined from a list X,Y,Z,... -FUNCTION(kokkos_deprecated_list SUFFIX PREFIX) - SET(CAMEL_NAME Kokkos_${SUFFIX}) - STRING(TOUPPER ${CAMEL_NAME} UC_NAME) +function(kokkos_deprecated_list SUFFIX PREFIX) + set(CAMEL_NAME Kokkos_${SUFFIX}) + string(TOUPPER ${CAMEL_NAME} UC_NAME) #I don't love doing it this way but better to be safe - FOREACH(opt ${KOKKOS_GIVEN_VARIABLES}) - STRING(TOUPPER ${opt} OPT_UC) - IF ("${OPT_UC}" STREQUAL "${UC_NAME}") - STRING(REPLACE "," ";" optlist "${${opt}}") - SET(ERROR_MSG "Given deprecated option list ${opt}. This must now be given as separate -D options, which assuming you spelled options correctly would be:") - FOREACH(entry ${optlist}) - STRING(TOUPPER ${entry} ENTRY_UC) - STRING(APPEND ERROR_MSG "\n -DKokkos_${PREFIX}_${ENTRY_UC}=ON") - ENDFOREACH() - STRING(APPEND ERROR_MSG "\nRemove CMakeCache.txt and re-run. For a list of valid options, refer to BUILD.md or even look at CMakeCache.txt (before deleting it).") - IF (KOKKOS_HAS_TRILINOS) - MESSAGE(WARNING ${ERROR_MSG}) - FOREACH(entry ${optlist}) - STRING(TOUPPER ${entry} ENTRY_UC) - SET(${CAMEL_NAME}_${ENTRY_UC} ON CACHE BOOL "Deprecated Trilinos translation") - ENDFOREACH() - UNSET(${opt} CACHE) - ELSE() - MESSAGE(SEND_ERROR ${ERROR_MSG}) - ENDIF() - ENDIF() - ENDFOREACH() -ENDFUNCTION() - -FUNCTION(kokkos_option CAMEL_SUFFIX DEFAULT TYPE DOCSTRING) - SET(CAMEL_NAME Kokkos_${CAMEL_SUFFIX}) - STRING(TOUPPER ${CAMEL_NAME} UC_NAME) - - LIST(APPEND KOKKOS_OPTION_KEYS ${CAMEL_SUFFIX}) - SET(KOKKOS_OPTION_KEYS ${KOKKOS_OPTION_KEYS} PARENT_SCOPE) - LIST(APPEND KOKKOS_OPTION_VALUES "${DOCSTRING}") - SET(KOKKOS_OPTION_VALUES ${KOKKOS_OPTION_VALUES} PARENT_SCOPE) - LIST(APPEND KOKKOS_OPTION_TYPES ${TYPE}) - SET(KOKKOS_OPTION_TYPES ${KOKKOS_OPTION_TYPES} PARENT_SCOPE) + foreach(opt ${KOKKOS_GIVEN_VARIABLES}) + string(TOUPPER ${opt} OPT_UC) + if("${OPT_UC}" STREQUAL "${UC_NAME}") + string(REPLACE "," ";" optlist "${${opt}}") + set(ERROR_MSG + "Given deprecated option list ${opt}. This must now be given as separate -D options, which assuming you spelled options correctly would be:" + ) + foreach(entry ${optlist}) + string(TOUPPER ${entry} ENTRY_UC) + string(APPEND ERROR_MSG "\n -DKokkos_${PREFIX}_${ENTRY_UC}=ON") + endforeach() + string( + APPEND + ERROR_MSG + "\nRemove CMakeCache.txt and re-run. For a list of valid options, refer to BUILD.md or even look at CMakeCache.txt (before deleting it)." + ) + message(SEND_ERROR ${ERROR_MSG}) + endif() + endforeach() +endfunction() + +function(kokkos_option CAMEL_SUFFIX DEFAULT TYPE DOCSTRING) + set(CAMEL_NAME Kokkos_${CAMEL_SUFFIX}) + string(TOUPPER ${CAMEL_NAME} UC_NAME) + + list(APPEND KOKKOS_OPTION_KEYS ${CAMEL_SUFFIX}) + set(KOKKOS_OPTION_KEYS ${KOKKOS_OPTION_KEYS} PARENT_SCOPE) + list(APPEND KOKKOS_OPTION_VALUES "${DOCSTRING}") + set(KOKKOS_OPTION_VALUES ${KOKKOS_OPTION_VALUES} PARENT_SCOPE) + list(APPEND KOKKOS_OPTION_TYPES ${TYPE}) + set(KOKKOS_OPTION_TYPES ${KOKKOS_OPTION_TYPES} PARENT_SCOPE) # Make sure this appears in the cache with the appropriate DOCSTRING - SET(${CAMEL_NAME} ${DEFAULT} CACHE ${TYPE} ${DOCSTRING}) - - IF (KOKKOS_HAS_TRILINOS) - IF (NOT CAMEL_NAME IN_LIST Kokkos_OPTIONS_NOT_TO_EXPORT) - TRIBITS_PKG_EXPORT_CACHE_VAR(${CAMEL_NAME}) - ENDIF() - ENDIF() + set(${CAMEL_NAME} ${DEFAULT} CACHE ${TYPE} ${DOCSTRING}) #I don't love doing it this way because it's N^2 in number options, but c'est la vie - FOREACH(opt ${KOKKOS_GIVEN_VARIABLES}) - STRING(TOUPPER ${opt} OPT_UC) - IF ("${OPT_UC}" STREQUAL "${UC_NAME}") - IF (NOT "${opt}" STREQUAL "${CAMEL_NAME}") - IF (KOKKOS_HAS_TRILINOS) - #Allow this for now if Trilinos... we need to bootstrap our way to integration - MESSAGE(WARNING "Deprecated option ${opt} found - please change spelling to ${CAMEL_NAME}") - SET(${CAMEL_NAME} "${${opt}}" CACHE ${TYPE} ${DOCSTRING} FORCE) - UNSET(${opt} CACHE) - ELSE() - MESSAGE(FATAL_ERROR "Matching option found for ${CAMEL_NAME} with the wrong case ${opt}. Please delete your CMakeCache.txt and change option to -D${CAMEL_NAME}=${${opt}}. This is now enforced to avoid hard-to-debug CMake cache inconsistencies.") - ENDIF() - ENDIF() - ENDIF() - ENDFOREACH() + foreach(opt ${KOKKOS_GIVEN_VARIABLES}) + string(TOUPPER ${opt} OPT_UC) + if("${OPT_UC}" STREQUAL "${UC_NAME}") + if(NOT "${opt}" STREQUAL "${CAMEL_NAME}") + message( + FATAL_ERROR + "Matching option found for ${CAMEL_NAME} with the wrong case ${opt}. Please delete your CMakeCache.txt and change option to -D${CAMEL_NAME}=${${opt}}. This is now enforced to avoid hard-to-debug CMake cache inconsistencies." + ) + endif() + endif() + endforeach() #okay, great, we passed the validation test - use the default - IF (DEFINED ${CAMEL_NAME}) - SET(${UC_NAME} ${${CAMEL_NAME}} PARENT_SCOPE) - ELSE() - SET(${UC_NAME} ${DEFAULT} PARENT_SCOPE) - ENDIF() -ENDFUNCTION() - -INCLUDE (CMakeDependentOption) -FUNCTION(kokkos_dependent_option CAMEL_SUFFIX DOCSTRING DEFAULT DEPENDENCY FORCE) - SET(CAMEL_NAME Kokkos_${CAMEL_SUFFIX}) - STRING(TOUPPER ${CAMEL_NAME} UC_NAME) - - LIST(APPEND KOKKOS_OPTION_KEYS ${CAMEL_SUFFIX}) - SET(KOKKOS_OPTION_KEYS ${KOKKOS_OPTION_KEYS} PARENT_SCOPE) - LIST(APPEND KOKKOS_OPTION_VALUES "${DOCSTRING}") - SET(KOKKOS_OPTION_VALUES ${KOKKOS_OPTION_VALUES} PARENT_SCOPE) - LIST(APPEND KOKKOS_OPTION_TYPES BOOL) - SET(KOKKOS_OPTION_TYPES ${KOKKOS_OPTION_TYPES} PARENT_SCOPE) - - CMAKE_DEPENDENT_OPTION(${CAMEL_NAME} ${DOCSTRING} ${DEFAULT} "${DEPENDENCY}" ${FORCE}) + if(DEFINED ${CAMEL_NAME}) + set(${UC_NAME} ${${CAMEL_NAME}} PARENT_SCOPE) + else() + set(${UC_NAME} ${DEFAULT} PARENT_SCOPE) + endif() +endfunction() + +include(CMakeDependentOption) +function(kokkos_dependent_option CAMEL_SUFFIX DOCSTRING DEFAULT DEPENDENCY FORCE) + set(CAMEL_NAME Kokkos_${CAMEL_SUFFIX}) + string(TOUPPER ${CAMEL_NAME} UC_NAME) + + list(APPEND KOKKOS_OPTION_KEYS ${CAMEL_SUFFIX}) + set(KOKKOS_OPTION_KEYS ${KOKKOS_OPTION_KEYS} PARENT_SCOPE) + list(APPEND KOKKOS_OPTION_VALUES "${DOCSTRING}") + set(KOKKOS_OPTION_VALUES ${KOKKOS_OPTION_VALUES} PARENT_SCOPE) + list(APPEND KOKKOS_OPTION_TYPES BOOL) + set(KOKKOS_OPTION_TYPES ${KOKKOS_OPTION_TYPES} PARENT_SCOPE) + + cmake_dependent_option(${CAMEL_NAME} ${DOCSTRING} ${DEFAULT} "${DEPENDENCY}" ${FORCE}) #I don't love doing it this way because it's N^2 in number options, but c'est la vie - FOREACH(opt ${KOKKOS_GIVEN_VARIABLES}) - STRING(TOUPPER ${opt} OPT_UC) - IF ("${OPT_UC}" STREQUAL "${UC_NAME}") - IF (NOT "${opt}" STREQUAL "${CAMEL_NAME}") - IF (KOKKOS_HAS_TRILINOS) - #Allow this for now if Trilinos... we need to bootstrap our way to integration - MESSAGE(WARNING "Deprecated option ${opt} found - please change spelling to ${CAMEL_NAME}") - SET(${CAMEL_NAME} "${${opt}}" CACHE ${TYPE} ${DOCSTRING} FORCE) - UNSET(${opt} CACHE) - ELSE() - MESSAGE(FATAL_ERROR "Matching option found for ${CAMEL_NAME} with the wrong case ${opt}. Please delete your CMakeCache.txt and change option to -D${CAMEL_NAME}=${${opt}}. This is now enforced to avoid hard-to-debug CMake cache inconsistencies.") - ENDIF() - ENDIF() - ENDIF() - ENDFOREACH() + foreach(opt ${KOKKOS_GIVEN_VARIABLES}) + string(TOUPPER ${opt} OPT_UC) + if("${OPT_UC}" STREQUAL "${UC_NAME}") + if(NOT "${opt}" STREQUAL "${CAMEL_NAME}") + message( + FATAL_ERROR + "Matching option found for ${CAMEL_NAME} with the wrong case ${opt}. Please delete your CMakeCache.txt and change option to -D${CAMEL_NAME}=${${opt}}. This is now enforced to avoid hard-to-debug CMake cache inconsistencies." + ) + endif() + endif() + endforeach() #okay, great, we passed the validation test - use the default - IF (DEFINED ${CAMEL_NAME}) - SET(${UC_NAME} ${${CAMEL_NAME}} PARENT_SCOPE) - ELSE() - SET(${UC_NAME} ${DEFAULT} PARENT_SCOPE) - ENDIF() -ENDFUNCTION() - -FUNCTION(kokkos_set_option CAMEL_SUFFIX VALUE) - LIST(FIND KOKKOS_OPTION_KEYS ${CAMEL_SUFFIX} OPTION_INDEX) - IF(OPTION_INDEX EQUAL -1) - MESSAGE(FATAL_ERROR "Couldn't set value for Kokkos_${CAMEL_SUFFIX}") - ENDIF() - SET(CAMEL_NAME Kokkos_${CAMEL_SUFFIX}) - STRING(TOUPPER ${CAMEL_NAME} UC_NAME) - - LIST(GET KOKKOS_OPTION_VALUES ${OPTION_INDEX} DOCSTRING) - LIST(GET KOKKOS_OPTION_TYPES ${OPTION_INDEX} TYPE) - SET(${CAMEL_NAME} ${VALUE} CACHE ${TYPE} ${DOCSTRING} FORCE) - MESSAGE(STATUS "Setting ${CAMEL_NAME}=${VALUE}") - SET(${UC_NAME} ${VALUE} PARENT_SCOPE) -ENDFUNCTION() - -FUNCTION(kokkos_append_config_line LINE) - GLOBAL_APPEND(KOKKOS_TPL_EXPORTS "${LINE}") -ENDFUNCTION() - -MACRO(kokkos_export_cmake_tpl NAME) + if(DEFINED ${CAMEL_NAME}) + set(${UC_NAME} ${${CAMEL_NAME}} PARENT_SCOPE) + else() + set(${UC_NAME} ${DEFAULT} PARENT_SCOPE) + endif() +endfunction() + +function(kokkos_set_option CAMEL_SUFFIX VALUE) + list(FIND KOKKOS_OPTION_KEYS ${CAMEL_SUFFIX} OPTION_INDEX) + if(OPTION_INDEX EQUAL -1) + message(FATAL_ERROR "Couldn't set value for Kokkos_${CAMEL_SUFFIX}") + endif() + set(CAMEL_NAME Kokkos_${CAMEL_SUFFIX}) + string(TOUPPER ${CAMEL_NAME} UC_NAME) + + list(GET KOKKOS_OPTION_VALUES ${OPTION_INDEX} DOCSTRING) + list(GET KOKKOS_OPTION_TYPES ${OPTION_INDEX} TYPE) + set(${CAMEL_NAME} ${VALUE} CACHE ${TYPE} ${DOCSTRING} FORCE) + message(STATUS "Setting ${CAMEL_NAME}=${VALUE}") + set(${UC_NAME} ${VALUE} PARENT_SCOPE) +endfunction() + +function(kokkos_append_config_line LINE) + global_append(KOKKOS_TPL_EXPORTS "${LINE}") +endfunction() + +macro(kokkos_export_cmake_tpl NAME) cmake_parse_arguments(KOKKOS_EXTRA_ARG "REQUIRED" "" "COMPONENTS" ${ARGN}) #CMake TPLs are located with a call to find_package @@ -163,91 +142,88 @@ MACRO(kokkos_export_cmake_tpl NAME) #If Kokkos was configured to find the TPL through a _DIR variable #make sure thar DIR variable is available to downstream packages - IF (DEFINED ${NAME}_DIR) + if(DEFINED ${NAME}_DIR) #The downstream project may override the TPL location that Kokkos used #Check if the downstream project chose its own TPL location #If not, make the Kokkos found location available - KOKKOS_APPEND_CONFIG_LINE("IF(NOT DEFINED ${NAME}_DIR)") - KOKKOS_APPEND_CONFIG_LINE(" SET(${NAME}_DIR ${${NAME}_DIR})") - KOKKOS_APPEND_CONFIG_LINE("ENDIF()") - ENDIF() + kokkos_append_config_line("IF(NOT DEFINED ${NAME}_DIR)") + kokkos_append_config_line(" SET(${NAME}_DIR ${${NAME}_DIR})") + kokkos_append_config_line("ENDIF()") + endif() - IF (DEFINED ${NAME}_ROOT) + if(DEFINED ${NAME}_ROOT) #The downstream project may override the TPL location that Kokkos used #Check if the downstream project chose its own TPL location #If not, make the Kokkos found location available - KOKKOS_APPEND_CONFIG_LINE("IF(NOT DEFINED ${NAME}_ROOT)") - KOKKOS_APPEND_CONFIG_LINE(" SET(${NAME}_ROOT ${${NAME}_ROOT})") - KOKKOS_APPEND_CONFIG_LINE("ENDIF()") - ENDIF() - SET(KOKKOS_CONFIG_STRING "FIND_DEPENDENCY(${NAME}") - - IF(KOKKOS_EXTRA_ARG_REQUIRED) - STRING(APPEND KOKKOS_CONFIG_STRING " REQUIRED") - ENDIF() - IF(KOKKOS_EXTRA_ARG_COMPONENTS) - STRING(APPEND KOKKOS_CONFIG_STRING " COMPONENTS ${KOKKOS_EXTRA_ARG_COMPONENTS}") - ENDIF() - STRING(APPEND KOKKOS_CONFIG_STRING ")") - KOKKOS_APPEND_CONFIG_LINE(${KOKKOS_CONFIG_STRING}) -ENDMACRO() - -MACRO(kokkos_export_imported_tpl NAME) - IF (NOT KOKKOS_HAS_TRILINOS) - GET_TARGET_PROPERTY(LIB_IMPORTED ${NAME} IMPORTED) - IF (NOT LIB_IMPORTED) - # This is not an imported target - # This an interface library that we created - INSTALL( - TARGETS ${NAME} - EXPORT KokkosTargets - RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} - LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} - ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} - ) - ELSE() - #make sure this also gets "exported" in the config file - KOKKOS_APPEND_CONFIG_LINE("IF(NOT TARGET ${NAME})") - - GET_TARGET_PROPERTY(LIB_TYPE ${NAME} TYPE) - IF (${LIB_TYPE} STREQUAL "INTERFACE_LIBRARY") - KOKKOS_APPEND_CONFIG_LINE("ADD_LIBRARY(${NAME} INTERFACE IMPORTED)") - KOKKOS_APPEND_CONFIG_LINE("SET_TARGET_PROPERTIES(${NAME} PROPERTIES") - ELSE() - KOKKOS_APPEND_CONFIG_LINE("ADD_LIBRARY(${NAME} UNKNOWN IMPORTED)") - KOKKOS_APPEND_CONFIG_LINE("SET_TARGET_PROPERTIES(${NAME} PROPERTIES") - GET_TARGET_PROPERTY(TPL_LIBRARY ${NAME} IMPORTED_LOCATION) - IF(TPL_LIBRARY) - KOKKOS_APPEND_CONFIG_LINE("IMPORTED_LOCATION \"${TPL_LIBRARY}\"") - ENDIF() - ENDIF() - - GET_TARGET_PROPERTY(TPL_INCLUDES ${NAME} INTERFACE_INCLUDE_DIRECTORIES) - IF(TPL_INCLUDES) - KOKKOS_APPEND_CONFIG_LINE("INTERFACE_INCLUDE_DIRECTORIES \"${TPL_INCLUDES}\"") - ENDIF() - - GET_TARGET_PROPERTY(TPL_COMPILE_OPTIONS ${NAME} INTERFACE_COMPILE_OPTIONS) - IF(TPL_COMPILE_OPTIONS) - KOKKOS_APPEND_CONFIG_LINE("INTERFACE_COMPILE_OPTIONS ${TPL_COMPILE_OPTIONS}") - ENDIF() - - SET(TPL_LINK_OPTIONS) - GET_TARGET_PROPERTY(TPL_LINK_OPTIONS ${NAME} INTERFACE_LINK_OPTIONS) - IF(TPL_LINK_OPTIONS) - KOKKOS_APPEND_CONFIG_LINE("INTERFACE_LINK_OPTIONS ${TPL_LINK_OPTIONS}") - ENDIF() - - GET_TARGET_PROPERTY(TPL_LINK_LIBRARIES ${NAME} INTERFACE_LINK_LIBRARIES) - IF(TPL_LINK_LIBRARIES) - KOKKOS_APPEND_CONFIG_LINE("INTERFACE_LINK_LIBRARIES \"${TPL_LINK_LIBRARIES}\"") - ENDIF() - KOKKOS_APPEND_CONFIG_LINE(")") - KOKKOS_APPEND_CONFIG_LINE("ENDIF()") - ENDIF() - ENDIF() -ENDMACRO() - + kokkos_append_config_line("IF(NOT DEFINED ${NAME}_ROOT)") + kokkos_append_config_line(" SET(${NAME}_ROOT ${${NAME}_ROOT})") + kokkos_append_config_line("ENDIF()") + endif() + set(KOKKOS_CONFIG_STRING "FIND_DEPENDENCY(${NAME}") + + if(KOKKOS_EXTRA_ARG_REQUIRED) + string(APPEND KOKKOS_CONFIG_STRING " REQUIRED") + endif() + if(KOKKOS_EXTRA_ARG_COMPONENTS) + string(APPEND KOKKOS_CONFIG_STRING " COMPONENTS ${KOKKOS_EXTRA_ARG_COMPONENTS}") + endif() + string(APPEND KOKKOS_CONFIG_STRING ")") + kokkos_append_config_line(${KOKKOS_CONFIG_STRING}) +endmacro() + +macro(kokkos_export_imported_tpl NAME) + get_target_property(LIB_IMPORTED ${NAME} IMPORTED) + if(NOT LIB_IMPORTED) + # This is not an imported target + # This an interface library that we created + install( + TARGETS ${NAME} + EXPORT KokkosTargets + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + ) + else() + #make sure this also gets "exported" in the config file + kokkos_append_config_line("IF(NOT TARGET ${NAME})") + + get_target_property(LIB_TYPE ${NAME} TYPE) + if(${LIB_TYPE} STREQUAL "INTERFACE_LIBRARY") + kokkos_append_config_line("ADD_LIBRARY(${NAME} INTERFACE IMPORTED)") + kokkos_append_config_line("SET_TARGET_PROPERTIES(${NAME} PROPERTIES") + else() + kokkos_append_config_line("ADD_LIBRARY(${NAME} UNKNOWN IMPORTED)") + kokkos_append_config_line("SET_TARGET_PROPERTIES(${NAME} PROPERTIES") + get_target_property(TPL_LIBRARY ${NAME} IMPORTED_LOCATION) + if(TPL_LIBRARY) + kokkos_append_config_line("IMPORTED_LOCATION \"${TPL_LIBRARY}\"") + endif() + endif() + + get_target_property(TPL_INCLUDES ${NAME} INTERFACE_INCLUDE_DIRECTORIES) + if(TPL_INCLUDES) + kokkos_append_config_line("INTERFACE_INCLUDE_DIRECTORIES \"${TPL_INCLUDES}\"") + endif() + + get_target_property(TPL_COMPILE_OPTIONS ${NAME} INTERFACE_COMPILE_OPTIONS) + if(TPL_COMPILE_OPTIONS) + kokkos_append_config_line("INTERFACE_COMPILE_OPTIONS ${TPL_COMPILE_OPTIONS}") + endif() + + set(TPL_LINK_OPTIONS) + get_target_property(TPL_LINK_OPTIONS ${NAME} INTERFACE_LINK_OPTIONS) + if(TPL_LINK_OPTIONS) + kokkos_append_config_line("INTERFACE_LINK_OPTIONS ${TPL_LINK_OPTIONS}") + endif() + + get_target_property(TPL_LINK_LIBRARIES ${NAME} INTERFACE_LINK_LIBRARIES) + if(TPL_LINK_LIBRARIES) + kokkos_append_config_line("INTERFACE_LINK_LIBRARIES \"${TPL_LINK_LIBRARIES}\"") + endif() + kokkos_append_config_line(")") + kokkos_append_config_line("ENDIF()") + endif() +endmacro() # # @MACRO: KOKKOS_IMPORT_TPL() @@ -271,57 +247,43 @@ ENDMACRO() # # If specified, this TPL will build an INTERFACE library rather than an # IMPORTED target -IF (KOKKOS_HAS_TRILINOS) -MACRO(kokkos_import_tpl NAME) - #do nothing -ENDMACRO() -ELSE() -MACRO(kokkos_import_tpl NAME) - CMAKE_PARSE_ARGUMENTS(TPL - "NO_EXPORT;INTERFACE" - "" - "" - ${ARGN}) - IF (TPL_INTERFACE) - SET(TPL_IMPORTED_NAME ${NAME}) - ELSE() - SET(TPL_IMPORTED_NAME Kokkos::${NAME}) - ENDIF() - - IF (KOKKOS_ENABLE_${NAME}) +macro(kokkos_import_tpl NAME) + cmake_parse_arguments(TPL "NO_EXPORT;INTERFACE" "" "" ${ARGN}) + if(TPL_INTERFACE) + set(TPL_IMPORTED_NAME ${NAME}) + else() + set(TPL_IMPORTED_NAME Kokkos::${NAME}) + endif() + + if(KOKKOS_ENABLE_${NAME}) #Tack on a TPL here to make sure we avoid using anyone else's find - FIND_PACKAGE(TPL${NAME} REQUIRED MODULE) - IF(NOT TARGET ${TPL_IMPORTED_NAME}) - MESSAGE(FATAL_ERROR "Find module succeeded for ${NAME}, but did not produce valid target ${TPL_IMPORTED_NAME}") - ENDIF() - IF(NOT TPL_NO_EXPORT) - GET_TARGET_PROPERTY(TPL_ORIGINAL_NAME ${TPL_IMPORTED_NAME} ALIASED_TARGET) - IF (NOT TPL_ORIGINAL_NAME) - SET(TPL_ORIGINAL_NAME ${TPL_IMPORTED_NAME}) - ENDIF() - KOKKOS_EXPORT_IMPORTED_TPL(${TPL_ORIGINAL_NAME}) - ENDIF() - LIST(APPEND KOKKOS_ENABLED_TPLS ${NAME}) - ENDIF() -ENDMACRO(kokkos_import_tpl) -ENDIF() - -MACRO(kokkos_import_cmake_tpl MODULE_NAME) + find_package(TPL${NAME} REQUIRED MODULE) + if(NOT TARGET ${TPL_IMPORTED_NAME}) + message(FATAL_ERROR "Find module succeeded for ${NAME}, but did not produce valid target ${TPL_IMPORTED_NAME}") + endif() + if(NOT TPL_NO_EXPORT) + get_target_property(TPL_ORIGINAL_NAME ${TPL_IMPORTED_NAME} ALIASED_TARGET) + if(NOT TPL_ORIGINAL_NAME) + set(TPL_ORIGINAL_NAME ${TPL_IMPORTED_NAME}) + endif() + kokkos_export_imported_tpl(${TPL_ORIGINAL_NAME}) + endif() + list(APPEND KOKKOS_ENABLED_TPLS ${NAME}) + endif() +endmacro(kokkos_import_tpl) + +macro(kokkos_import_cmake_tpl MODULE_NAME) kokkos_import_tpl(${MODULE_NAME} ${ARGN} NO_EXPORT) - CMAKE_PARSE_ARGUMENTS(TPL - "NO_EXPORT" - "OPTION_NAME" - "" - ${ARGN}) + cmake_parse_arguments(TPL "NO_EXPORT" "OPTION_NAME" "" ${ARGN}) - IF (NOT TPL_OPTION_NAME) - SET(TPL_OPTION_NAME ${MODULE_NAME}) - ENDIF() + if(NOT TPL_OPTION_NAME) + set(TPL_OPTION_NAME ${MODULE_NAME}) + endif() - IF (NOT TPL_NO_EXPORT) - KOKKOS_EXPORT_CMAKE_TPL(${MODULE_NAME}) - ENDIF() -ENDMACRO() + if(NOT TPL_NO_EXPORT) + kokkos_export_cmake_tpl(${MODULE_NAME}) + endif() +endmacro() # # @MACRO: KOKKOS_CREATE_IMPORTED_TPL() @@ -368,68 +330,57 @@ ENDMACRO() # # If specified, this gives a list of linker flags that must be used # for using this library. -MACRO(kokkos_create_imported_tpl NAME) - CMAKE_PARSE_ARGUMENTS(TPL - "INTERFACE" - "LIBRARY" - "LINK_LIBRARIES;INCLUDES;COMPILE_DEFINITIONS;COMPILE_OPTIONS;LINK_OPTIONS" - ${ARGN}) - - - IF (KOKKOS_HAS_TRILINOS) - #TODO: we need to set a bunch of cache variables here - ELSEIF (TPL_INTERFACE) - ADD_LIBRARY(${NAME} INTERFACE) +macro(kokkos_create_imported_tpl NAME) + cmake_parse_arguments( + TPL "INTERFACE" "LIBRARY" "LINK_LIBRARIES;INCLUDES;COMPILE_DEFINITIONS;COMPILE_OPTIONS;LINK_OPTIONS" ${ARGN} + ) + + if(TPL_INTERFACE) + add_library(${NAME} INTERFACE) #Give this an importy-looking name - ADD_LIBRARY(Kokkos::${NAME} ALIAS ${NAME}) - IF (TPL_LIBRARY) - MESSAGE(SEND_ERROR "TPL Interface library ${NAME} should not have an IMPORTED_LOCATION") - ENDIF() + add_library(Kokkos::${NAME} ALIAS ${NAME}) + if(TPL_LIBRARY) + message(SEND_ERROR "TPL Interface library ${NAME} should not have an IMPORTED_LOCATION") + endif() #Things have to go in quoted in case we have multiple list entries - IF(TPL_LINK_LIBRARIES) - TARGET_LINK_LIBRARIES(${NAME} INTERFACE ${TPL_LINK_LIBRARIES}) - ENDIF() - IF(TPL_INCLUDES) - TARGET_INCLUDE_DIRECTORIES(${NAME} INTERFACE ${TPL_INCLUDES}) - ENDIF() - IF(TPL_COMPILE_DEFINITIONS) - TARGET_COMPILE_DEFINITIONS(${NAME} INTERFACE ${TPL_COMPILE_DEFINITIONS}) - ENDIF() - IF(TPL_COMPILE_OPTIONS) - TARGET_COMPILE_OPTIONS(${NAME} INTERFACE ${TPL_COMPILE_OPTIONS}) - ENDIF() - IF(TPL_LINK_OPTIONS) - TARGET_LINK_LIBRARIES(${NAME} INTERFACE ${TPL_LINK_OPTIONS}) - ENDIF() - ELSE() - ADD_LIBRARY(${NAME} UNKNOWN IMPORTED) - IF(TPL_LIBRARY) - SET_TARGET_PROPERTIES(${NAME} PROPERTIES - IMPORTED_LOCATION ${TPL_LIBRARY}) - ENDIF() + if(TPL_LINK_LIBRARIES) + target_link_libraries(${NAME} INTERFACE ${TPL_LINK_LIBRARIES}) + endif() + if(TPL_INCLUDES) + target_include_directories(${NAME} INTERFACE ${TPL_INCLUDES}) + endif() + if(TPL_COMPILE_DEFINITIONS) + target_compile_definitions(${NAME} INTERFACE ${TPL_COMPILE_DEFINITIONS}) + endif() + if(TPL_COMPILE_OPTIONS) + target_compile_options(${NAME} INTERFACE ${TPL_COMPILE_OPTIONS}) + endif() + if(TPL_LINK_OPTIONS) + target_link_libraries(${NAME} INTERFACE ${TPL_LINK_OPTIONS}) + endif() + else() + add_library(${NAME} UNKNOWN IMPORTED) + if(TPL_LIBRARY) + set_target_properties(${NAME} PROPERTIES IMPORTED_LOCATION ${TPL_LIBRARY}) + endif() #Things have to go in quoted in case we have multiple list entries - IF(TPL_LINK_LIBRARIES) - SET_TARGET_PROPERTIES(${NAME} PROPERTIES - INTERFACE_LINK_LIBRARIES "${TPL_LINK_LIBRARIES}") - ENDIF() - IF(TPL_INCLUDES) - SET_TARGET_PROPERTIES(${NAME} PROPERTIES - INTERFACE_INCLUDE_DIRECTORIES "${TPL_INCLUDES}") - ENDIF() - IF(TPL_COMPILE_DEFINITIONS) - SET_TARGET_PROPERTIES(${NAME} PROPERTIES - INTERFACE_COMPILE_DEFINITIONS "${TPL_COMPILE_DEFINITIONS}") - ENDIF() - IF(TPL_COMPILE_OPTIONS) - SET_TARGET_PROPERTIES(${NAME} PROPERTIES - INTERFACE_COMPILE_OPTIONS "${TPL_COMPILE_OPTIONS}") - ENDIF() - IF(TPL_LINK_OPTIONS) - SET_TARGET_PROPERTIES(${NAME} PROPERTIES - INTERFACE_LINK_LIBRARIES "${TPL_LINK_OPTIONS}") - ENDIF() - ENDIF() -ENDMACRO() + if(TPL_LINK_LIBRARIES) + set_target_properties(${NAME} PROPERTIES INTERFACE_LINK_LIBRARIES "${TPL_LINK_LIBRARIES}") + endif() + if(TPL_INCLUDES) + set_target_properties(${NAME} PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${TPL_INCLUDES}") + endif() + if(TPL_COMPILE_DEFINITIONS) + set_target_properties(${NAME} PROPERTIES INTERFACE_COMPILE_DEFINITIONS "${TPL_COMPILE_DEFINITIONS}") + endif() + if(TPL_COMPILE_OPTIONS) + set_target_properties(${NAME} PROPERTIES INTERFACE_COMPILE_OPTIONS "${TPL_COMPILE_OPTIONS}") + endif() + if(TPL_LINK_OPTIONS) + set_target_properties(${NAME} PROPERTIES INTERFACE_LINK_LIBRARIES "${TPL_LINK_OPTIONS}") + endif() + endif() +endmacro() # # @MACRO: KOKKOS_FIND_HEADER @@ -479,37 +430,32 @@ ENDMACRO() # # Custom paths to search for the header # -MACRO(kokkos_find_header VAR_NAME HEADER TPL_NAME) - CMAKE_PARSE_ARGUMENTS(TPL - "ALLOW_SYSTEM_PATH_FALLBACK" - "" - "PATHS" - ${ARGN}) - - SET(${VAR_NAME} "${VARNAME}-NOTFOUND") - SET(HAVE_CUSTOM_PATHS FALSE) - - IF(DEFINED ${TPL_NAME}_ROOT OR - DEFINED ENV{${TPL_NAME}_ROOT} OR - DEFINED KOKKOS_${TPL_NAME}_DIR OR - TPL_PATHS) - FIND_PATH(${VAR_NAME} ${HEADER} - PATHS - ${${TPL_NAME}_ROOT} - $ENV{${TPL_NAME}_ROOT} - ${KOKKOS_${TPL_NAME}_DIR} - ${TPL_PATHS} +macro(kokkos_find_header VAR_NAME HEADER TPL_NAME) + cmake_parse_arguments(TPL "ALLOW_SYSTEM_PATH_FALLBACK" "" "PATHS" ${ARGN}) + + set(${VAR_NAME} "${VARNAME}-NOTFOUND") + set(HAVE_CUSTOM_PATHS FALSE) + + if(DEFINED ${TPL_NAME}_ROOT + OR DEFINED ENV{${TPL_NAME}_ROOT} + OR DEFINED KOKKOS_${TPL_NAME}_DIR + OR TPL_PATHS + ) + find_path( + ${VAR_NAME} ${HEADER} + PATHS ${${TPL_NAME}_ROOT} $ENV{${TPL_NAME}_ROOT} ${KOKKOS_${TPL_NAME}_DIR} ${TPL_PATHS} PATH_SUFFIXES include - NO_DEFAULT_PATH) - SET(HAVE_CUSTOM_PATHS TRUE) - ENDIF() + NO_DEFAULT_PATH + ) + set(HAVE_CUSTOM_PATHS TRUE) + endif() - IF(NOT HAVE_CUSTOM_PATHS OR TPL_ALLOW_SYSTEM_PATH_FALLBACK) + if(NOT HAVE_CUSTOM_PATHS OR TPL_ALLOW_SYSTEM_PATH_FALLBACK) #No-op if ${VAR_NAME} set by previous call - FIND_PATH(${VAR_NAME} ${HEADER}) - ENDIF() + find_path(${VAR_NAME} ${HEADER}) + endif() -ENDMACRO() +endmacro() # # @MACRO: KOKKOS_FIND_LIBRARY @@ -565,42 +511,36 @@ ENDMACRO() # Suffixes appended to PATHS when attempting to locate # the library. Defaults to {lib, lib64}. # -MACRO(kokkos_find_library VAR_NAME LIB TPL_NAME) - CMAKE_PARSE_ARGUMENTS(TPL - "ALLOW_SYSTEM_PATH_FALLBACK" - "" - "PATHS;SUFFIXES" - ${ARGN}) - - IF(NOT TPL_SUFFIXES) - SET(TPL_SUFFIXES lib lib64) - ENDIF() - - SET(${VAR_NAME} "${VARNAME}-NOTFOUND") - SET(HAVE_CUSTOM_PATHS FALSE) - - IF(DEFINED ${TPL_NAME}_ROOT OR - DEFINED ENV{${TPL_NAME}_ROOT} OR - DEFINED KOKKOS_${TPL_NAME}_DIR OR - TPL_PATHS) - FIND_LIBRARY(${VAR_NAME} ${LIB} - PATHS - ${${TPL_NAME}_ROOT} - $ENV{${TPL_NAME}_ROOT} - ${KOKKOS_${TPL_NAME}_DIR} - ${TPL_PATHS} - PATH_SUFFIXES - ${TPL_SUFFIXES} - NO_DEFAULT_PATH) - SET(HAVE_CUSTOM_PATHS TRUE) - ENDIF() - - IF(NOT HAVE_CUSTOM_PATHS OR TPL_ALLOW_SYSTEM_PATH_FALLBACK) +macro(kokkos_find_library VAR_NAME LIB TPL_NAME) + cmake_parse_arguments(TPL "ALLOW_SYSTEM_PATH_FALLBACK" "" "PATHS;SUFFIXES" ${ARGN}) + + if(NOT TPL_SUFFIXES) + set(TPL_SUFFIXES lib lib64) + endif() + + set(${VAR_NAME} "${VARNAME}-NOTFOUND") + set(HAVE_CUSTOM_PATHS FALSE) + + if(DEFINED ${TPL_NAME}_ROOT + OR DEFINED ENV{${TPL_NAME}_ROOT} + OR DEFINED KOKKOS_${TPL_NAME}_DIR + OR TPL_PATHS + ) + find_library( + ${VAR_NAME} ${LIB} + PATHS ${${TPL_NAME}_ROOT} $ENV{${TPL_NAME}_ROOT} ${KOKKOS_${TPL_NAME}_DIR} ${TPL_PATHS} + PATH_SUFFIXES ${TPL_SUFFIXES} + NO_DEFAULT_PATH + ) + set(HAVE_CUSTOM_PATHS TRUE) + endif() + + if(NOT HAVE_CUSTOM_PATHS OR TPL_ALLOW_SYSTEM_PATH_FALLBACK) #No-op if ${VAR_NAME} set by previous call - FIND_LIBRARY(${VAR_NAME} ${LIB} PATH_SUFFIXES ${TPL_SUFFIXES}) - ENDIF() + find_library(${VAR_NAME} ${LIB} PATH_SUFFIXES ${TPL_SUFFIXES}) + endif() -ENDMACRO() +endmacro() # # @MACRO: KOKKOS_FIND_IMPORTED @@ -683,111 +623,127 @@ ENDMACRO() # If specified, this gives a list of paths to search for the headers # If not given, _ROOT/include and _ROOT/include will be searched. # -MACRO(kokkos_find_imported NAME) - CMAKE_PARSE_ARGUMENTS(TPL - "INTERFACE;ALLOW_SYSTEM_PATH_FALLBACK" - "IMPORTED_NAME;MODULE_NAME;LIBRARY;HEADER" - "LIBRARIES;LIBRARY_PATHS;LIBRARY_SUFFIXES;HEADERS;HEADER_PATHS" - ${ARGN}) - - IF(NOT TPL_MODULE_NAME) - SET(TPL_MODULE_NAME TPL${NAME}) - ENDIF() - - IF (TPL_ALLOW_SYSTEM_PATH_FALLBACK) - SET(ALLOW_PATH_FALLBACK_OPT ALLOW_SYSTEM_PATH_FALLBACK) - ELSE() - SET(ALLOW_PATH_FALLBACK_OPT) - ENDIF() - - IF (NOT TPL_IMPORTED_NAME) - IF (TPL_INTERFACE) - SET(TPL_IMPORTED_NAME ${NAME}) - ELSE() - SET(TPL_IMPORTED_NAME Kokkos::${NAME}) - ENDIF() - ENDIF() - - IF (NOT TPL_LIBRARY_SUFFIXES) - SET(TPL_LIBRARY_SUFFIXES lib) - IF(KOKKOS_IMPL_32BIT) - LIST(APPEND TPL_LIBRARY_SUFFIXES lib32) - ELSE() - LIST(APPEND TPL_LIBRARY_SUFFIXES lib64) - ENDIF() - ENDIF() - - SET(${NAME}_INCLUDE_DIRS) - IF (TPL_HEADER) - KOKKOS_FIND_HEADER(${NAME}_INCLUDE_DIRS ${TPL_HEADER} ${NAME} ${ALLOW_PATH_FALLBACK_OPT} PATHS ${TPL_HEADER_PATHS}) - ENDIF() - - FOREACH(HEADER ${TPL_HEADERS}) - KOKKOS_FIND_HEADER(HEADER_FIND_TEMP ${HEADER} ${NAME} ${ALLOW_PATH_FALLBACK_OPT} PATHS ${TPL_HEADER_PATHS}) - IF(HEADER_FIND_TEMP) - LIST(APPEND ${NAME}_INCLUDE_DIRS ${HEADER_FIND_TEMP}) - ENDIF() - ENDFOREACH() - - SET(${NAME}_LIBRARY) - IF(TPL_LIBRARY) - KOKKOS_FIND_LIBRARY(${NAME}_LIBRARY ${TPL_LIBRARY} ${NAME} +macro(kokkos_find_imported NAME) + cmake_parse_arguments( + TPL "INTERFACE;ALLOW_SYSTEM_PATH_FALLBACK" "IMPORTED_NAME;MODULE_NAME;LIBRARY;HEADER" + "LIBRARIES;LIBRARY_PATHS;LIBRARY_SUFFIXES;HEADERS;HEADER_PATHS" ${ARGN} + ) + + if(NOT TPL_MODULE_NAME) + set(TPL_MODULE_NAME TPL${NAME}) + endif() + + if(TPL_ALLOW_SYSTEM_PATH_FALLBACK) + set(ALLOW_PATH_FALLBACK_OPT ALLOW_SYSTEM_PATH_FALLBACK) + else() + set(ALLOW_PATH_FALLBACK_OPT) + endif() + + if(NOT TPL_IMPORTED_NAME) + if(TPL_INTERFACE) + set(TPL_IMPORTED_NAME ${NAME}) + else() + set(TPL_IMPORTED_NAME Kokkos::${NAME}) + endif() + endif() + + if(NOT TPL_LIBRARY_SUFFIXES) + set(TPL_LIBRARY_SUFFIXES lib) + if(KOKKOS_IMPL_32BIT) + list(APPEND TPL_LIBRARY_SUFFIXES lib32) + else() + list(APPEND TPL_LIBRARY_SUFFIXES lib64) + endif() + endif() + + set(${NAME}_INCLUDE_DIRS) + if(TPL_HEADER) + kokkos_find_header(${NAME}_INCLUDE_DIRS ${TPL_HEADER} ${NAME} ${ALLOW_PATH_FALLBACK_OPT} PATHS ${TPL_HEADER_PATHS}) + endif() + + foreach(HEADER ${TPL_HEADERS}) + kokkos_find_header(HEADER_FIND_TEMP ${HEADER} ${NAME} ${ALLOW_PATH_FALLBACK_OPT} PATHS ${TPL_HEADER_PATHS}) + if(HEADER_FIND_TEMP) + list(APPEND ${NAME}_INCLUDE_DIRS ${HEADER_FIND_TEMP}) + endif() + endforeach() + + set(${NAME}_LIBRARY) + if(TPL_LIBRARY) + kokkos_find_library( + ${NAME}_LIBRARY + ${TPL_LIBRARY} + ${NAME} ${ALLOW_PATH_FALLBACK_OPT} - PATHS ${TPL_LIBRARY_PATHS} - SUFFIXES ${TPL_LIBRARY_SUFFIXES}) - ENDIF() - - SET(${NAME}_FOUND_LIBRARIES) - FOREACH(LIB ${TPL_LIBRARIES}) - KOKKOS_FIND_LIBRARY(${LIB}_LOCATION ${LIB} ${NAME} + PATHS + ${TPL_LIBRARY_PATHS} + SUFFIXES + ${TPL_LIBRARY_SUFFIXES} + ) + endif() + + set(${NAME}_FOUND_LIBRARIES) + foreach(LIB ${TPL_LIBRARIES}) + kokkos_find_library( + ${LIB}_LOCATION + ${LIB} + ${NAME} ${ALLOW_PATH_FALLBACK_OPT} - PATHS ${TPL_LIBRARY_PATHS} - SUFFIXES ${TPL_LIBRARY_SUFFIXES}) - IF(${LIB}_LOCATION) - LIST(APPEND ${NAME}_FOUND_LIBRARIES ${${LIB}_LOCATION}) - ELSE() - SET(${NAME}_FOUND_LIBRARIES ${${LIB}_LOCATION}) - BREAK() - ENDIF() - ENDFOREACH() - - INCLUDE(FindPackageHandleStandardArgs) + PATHS + ${TPL_LIBRARY_PATHS} + SUFFIXES + ${TPL_LIBRARY_SUFFIXES} + ) + if(${LIB}_LOCATION) + list(APPEND ${NAME}_FOUND_LIBRARIES ${${LIB}_LOCATION}) + else() + set(${NAME}_FOUND_LIBRARIES ${${LIB}_LOCATION}) + break() + endif() + endforeach() + + include(FindPackageHandleStandardArgs) #Collect all the variables we need to be valid for #find_package to have succeeded - SET(TPL_VARS_NEEDED) - IF (TPL_LIBRARY) - LIST(APPEND TPL_VARS_NEEDED ${NAME}_LIBRARY) - ENDIF() - IF(TPL_HEADER) - LIST(APPEND TPL_VARS_NEEDED ${NAME}_INCLUDE_DIRS) - ENDIF() - IF(TPL_LIBRARIES) - LIST(APPEND TPL_VARS_NEEDED ${NAME}_FOUND_LIBRARIES) - ENDIF() - FIND_PACKAGE_HANDLE_STANDARD_ARGS(${TPL_MODULE_NAME} REQUIRED_VARS ${TPL_VARS_NEEDED}) - - MARK_AS_ADVANCED(${NAME}_INCLUDE_DIRS ${NAME}_FOUND_LIBRARIES ${NAME}_LIBRARY) + set(TPL_VARS_NEEDED) + if(TPL_LIBRARY) + list(APPEND TPL_VARS_NEEDED ${NAME}_LIBRARY) + endif() + if(TPL_HEADER) + list(APPEND TPL_VARS_NEEDED ${NAME}_INCLUDE_DIRS) + endif() + if(TPL_LIBRARIES) + list(APPEND TPL_VARS_NEEDED ${NAME}_FOUND_LIBRARIES) + endif() + find_package_handle_standard_args(${TPL_MODULE_NAME} REQUIRED_VARS ${TPL_VARS_NEEDED}) + + mark_as_advanced(${NAME}_INCLUDE_DIRS ${NAME}_FOUND_LIBRARIES ${NAME}_LIBRARY) #this is so much fun on a Cray system #/usr/include should never be added as a -isystem include #this freaks out the compiler include search order - IF (KOKKOS_IS_CRAYPE) - LIST(REMOVE_ITEM ${NAME}_INCLUDE_DIRS "/usr/include") - ENDIF() - - IF (${TPL_MODULE_NAME}_FOUND) - SET(IMPORT_TYPE) - IF (TPL_INTERFACE) - SET(IMPORT_TYPE "INTERFACE") - SET(${NAME}_FOUND_LIBRARIES ${TPL_LIBRARIES}) - ENDIF() - KOKKOS_CREATE_IMPORTED_TPL(${TPL_IMPORTED_NAME} + if(KOKKOS_IS_CRAYPE) + list(REMOVE_ITEM ${NAME}_INCLUDE_DIRS "/usr/include") + endif() + + if(${TPL_MODULE_NAME}_FOUND) + set(IMPORT_TYPE) + if(TPL_INTERFACE) + set(IMPORT_TYPE "INTERFACE") + set(${NAME}_FOUND_LIBRARIES ${TPL_LIBRARIES}) + endif() + kokkos_create_imported_tpl( + ${TPL_IMPORTED_NAME} ${IMPORT_TYPE} - INCLUDES "${${NAME}_INCLUDE_DIRS}" - LIBRARY "${${NAME}_LIBRARY}" - LINK_LIBRARIES "${${NAME}_FOUND_LIBRARIES}") - ENDIF() -ENDMACRO(kokkos_find_imported) + INCLUDES + "${${NAME}_INCLUDE_DIRS}" + LIBRARY + "${${NAME}_LIBRARY}" + LINK_LIBRARIES + "${${NAME}_FOUND_LIBRARIES}" + ) + endif() +endmacro(kokkos_find_imported) # # @MACRO: KOKKOS_LINK_TPL() @@ -817,109 +773,114 @@ ENDMACRO(kokkos_find_imported) # If specified, this gives the exact name of the target to link against # target_link_libraries( ) # -FUNCTION(kokkos_link_tpl TARGET) - CMAKE_PARSE_ARGUMENTS(TPL - "PUBLIC;PRIVATE;INTERFACE" - "IMPORTED_NAME" - "" - ${ARGN}) +function(kokkos_link_tpl TARGET) + cmake_parse_arguments(TPL "PUBLIC;PRIVATE;INTERFACE" "IMPORTED_NAME" "" ${ARGN}) #the name of the TPL - SET(TPL ${TPL_UNPARSED_ARGUMENTS}) - IF (KOKKOS_HAS_TRILINOS) - #Do nothing, they will have already been linked - ELSE() - IF (NOT TPL_IMPORTED_NAME) - SET(TPL_IMPORTED_NAME Kokkos::${TPL}) - ENDIF() - IF (KOKKOS_ENABLE_${TPL}) - IF (TPL_PUBLIC) - TARGET_LINK_LIBRARIES(${TARGET} PUBLIC ${TPL_IMPORTED_NAME}) - ELSEIF (TPL_PRIVATE) - TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ${TPL_IMPORTED_NAME}) - ELSEIF (TPL_INTERFACE) - TARGET_LINK_LIBRARIES(${TARGET} INTERFACE ${TPL_IMPORTED_NAME}) - ELSE() - TARGET_LINK_LIBRARIES(${TARGET} ${TPL_IMPORTED_NAME}) - ENDIF() - ENDIF() - ENDIF() -ENDFUNCTION() - -FUNCTION(COMPILER_SPECIFIC_OPTIONS_HELPER) - SET(COMPILERS NVIDIA NVHPC DEFAULT Cray Intel Clang AppleClang IntelLLVM GNU HIPCC Fujitsu MSVC) - CMAKE_PARSE_ARGUMENTS( - PARSE - "LINK_OPTIONS;COMPILE_OPTIONS;COMPILE_DEFINITIONS;LINK_LIBRARIES" - "COMPILER_ID" - "${COMPILERS}" - ${ARGN}) - IF(PARSE_UNPARSED_ARGUMENTS) - MESSAGE(SEND_ERROR "'${PARSE_UNPARSED_ARGUMENTS}' argument(s) not recognized when providing compiler specific options") - ENDIF() - - IF(PARSE_COMPILER_ID) - SET(COMPILER ${${PARSE_COMPILER_ID}}) - ELSE() - SET(COMPILER ${KOKKOS_CXX_COMPILER_ID}) - ENDIF() - - SET(COMPILER_SPECIFIC_FLAGS_TMP ${PARSE_DEFAULT}) - FOREACH(COMP ${COMPILERS}) - IF (COMPILER STREQUAL "${COMP}") - IF (PARSE_${COMPILER}) - IF ("${PARSE_${COMPILER}}" STREQUAL "NO-VALUE-SPECIFIED") - SET(COMPILER_SPECIFIC_FLAGS_TMP "") - ELSE() - SET(COMPILER_SPECIFIC_FLAGS_TMP ${PARSE_${COMPILER}}) - ENDIF() - ENDIF() - ENDIF() - ENDFOREACH() - - IF (PARSE_COMPILE_OPTIONS) + set(TPL ${TPL_UNPARSED_ARGUMENTS}) + if(NOT TPL_IMPORTED_NAME) + set(TPL_IMPORTED_NAME Kokkos::${TPL}) + endif() + if(KOKKOS_ENABLE_${TPL}) + if(TPL_PUBLIC) + target_link_libraries(${TARGET} PUBLIC ${TPL_IMPORTED_NAME}) + elseif(TPL_PRIVATE) + target_link_libraries(${TARGET} PRIVATE ${TPL_IMPORTED_NAME}) + elseif(TPL_INTERFACE) + target_link_libraries(${TARGET} INTERFACE ${TPL_IMPORTED_NAME}) + else() + target_link_libraries(${TARGET} ${TPL_IMPORTED_NAME}) + endif() + endif() +endfunction() + +function(COMPILER_SPECIFIC_OPTIONS_HELPER) + set(COMPILERS + NVIDIA + NVHPC + DEFAULT + Cray + Intel + Clang + AppleClang + IntelLLVM + GNU + HIPCC + Fujitsu + MSVC + CrayClang + ) + cmake_parse_arguments( + PARSE "LINK_OPTIONS;COMPILE_OPTIONS;COMPILE_DEFINITIONS;LINK_LIBRARIES" "COMPILER_ID" "${COMPILERS}" ${ARGN} + ) + if(PARSE_UNPARSED_ARGUMENTS) + message( + SEND_ERROR "'${PARSE_UNPARSED_ARGUMENTS}' argument(s) not recognized when providing compiler specific options" + ) + endif() + + if(PARSE_COMPILER_ID) + set(COMPILER ${${PARSE_COMPILER_ID}}) + else() + set(COMPILER ${KOKKOS_CXX_COMPILER_ID}) + endif() + + set(COMPILER_SPECIFIC_FLAGS_TMP ${PARSE_DEFAULT}) + foreach(COMP ${COMPILERS}) + if(COMPILER STREQUAL "${COMP}") + if(PARSE_${COMPILER}) + if("${PARSE_${COMPILER}}" STREQUAL "NO-VALUE-SPECIFIED") + set(COMPILER_SPECIFIC_FLAGS_TMP "") + else() + set(COMPILER_SPECIFIC_FLAGS_TMP ${PARSE_${COMPILER}}) + endif() + endif() + endif() + endforeach() + + if(PARSE_COMPILE_OPTIONS) # The funky logic here is for future handling of argument deduplication # If we naively pass multiple -Xcompiler flags to target_compile_options # -Xcompiler will get deduplicated and break the build - IF ("-Xcompiler" IN_LIST COMPILER_SPECIFIC_FLAGS_TMP) - LIST(REMOVE_ITEM COMPILER_SPECIFIC_FLAGS_TMP "-Xcompiler") - GLOBAL_APPEND(KOKKOS_XCOMPILER_OPTIONS ${COMPILER_SPECIFIC_FLAGS_TMP}) - ELSE() - GLOBAL_APPEND(KOKKOS_COMPILE_OPTIONS ${COMPILER_SPECIFIC_FLAGS_TMP}) - ENDIF() - ENDIF() - - IF (PARSE_LINK_OPTIONS) - GLOBAL_APPEND(KOKKOS_LINK_OPTIONS ${COMPILER_SPECIFIC_FLAGS_TMP}) - ENDIF() - - IF (PARSE_COMPILE_DEFINITIONS) - GLOBAL_APPEND(KOKKOS_COMPILE_DEFINITIONS ${COMPILER_SPECIFIC_FLAGS_TMP}) - ENDIF() - - IF (PARSE_LINK_LIBRARIES) - GLOBAL_APPEND(KOKKOS_LINK_LIBRARIES ${COMPILER_SPECIFIC_FLAGS_TMP}) - ENDIF() -ENDFUNCTION(COMPILER_SPECIFIC_OPTIONS_HELPER) - -FUNCTION(COMPILER_SPECIFIC_FLAGS) - COMPILER_SPECIFIC_OPTIONS_HELPER(${ARGN} COMPILE_OPTIONS LINK_OPTIONS) -ENDFUNCTION(COMPILER_SPECIFIC_FLAGS) - -FUNCTION(COMPILER_SPECIFIC_OPTIONS) - COMPILER_SPECIFIC_OPTIONS_HELPER(${ARGN} COMPILE_OPTIONS) -ENDFUNCTION(COMPILER_SPECIFIC_OPTIONS) - -FUNCTION(COMPILER_SPECIFIC_LINK_OPTIONS) - COMPILER_SPECIFIC_OPTIONS_HELPER(${ARGN} LINK_OPTIONS) -ENDFUNCTION(COMPILER_SPECIFIC_LINK_OPTIONS) - -FUNCTION(COMPILER_SPECIFIC_DEFS) - COMPILER_SPECIFIC_OPTIONS_HELPER(${ARGN} COMPILE_DEFINITIONS) -ENDFUNCTION(COMPILER_SPECIFIC_DEFS) - -FUNCTION(COMPILER_SPECIFIC_LIBS) - COMPILER_SPECIFIC_OPTIONS_HELPER(${ARGN} LINK_LIBRARIES) -ENDFUNCTION(COMPILER_SPECIFIC_LIBS) + if("-Xcompiler" IN_LIST COMPILER_SPECIFIC_FLAGS_TMP) + list(REMOVE_ITEM COMPILER_SPECIFIC_FLAGS_TMP "-Xcompiler") + global_append(KOKKOS_XCOMPILER_OPTIONS ${COMPILER_SPECIFIC_FLAGS_TMP}) + else() + global_append(KOKKOS_COMPILE_OPTIONS ${COMPILER_SPECIFIC_FLAGS_TMP}) + endif() + endif() + + if(PARSE_LINK_OPTIONS) + global_append(KOKKOS_LINK_OPTIONS ${COMPILER_SPECIFIC_FLAGS_TMP}) + endif() + + if(PARSE_COMPILE_DEFINITIONS) + global_append(KOKKOS_COMPILE_DEFINITIONS ${COMPILER_SPECIFIC_FLAGS_TMP}) + endif() + + if(PARSE_LINK_LIBRARIES) + global_append(KOKKOS_LINK_LIBRARIES ${COMPILER_SPECIFIC_FLAGS_TMP}) + endif() +endfunction(COMPILER_SPECIFIC_OPTIONS_HELPER) + +function(COMPILER_SPECIFIC_FLAGS) + compiler_specific_options_helper(${ARGN} COMPILE_OPTIONS LINK_OPTIONS) +endfunction(COMPILER_SPECIFIC_FLAGS) + +function(COMPILER_SPECIFIC_OPTIONS) + compiler_specific_options_helper(${ARGN} COMPILE_OPTIONS) +endfunction(COMPILER_SPECIFIC_OPTIONS) + +function(COMPILER_SPECIFIC_LINK_OPTIONS) + compiler_specific_options_helper(${ARGN} LINK_OPTIONS) +endfunction(COMPILER_SPECIFIC_LINK_OPTIONS) + +function(COMPILER_SPECIFIC_DEFS) + compiler_specific_options_helper(${ARGN} COMPILE_DEFINITIONS) +endfunction(COMPILER_SPECIFIC_DEFS) + +function(COMPILER_SPECIFIC_LIBS) + compiler_specific_options_helper(${ARGN} LINK_LIBRARIES) +endfunction(COMPILER_SPECIFIC_LIBS) # Given a list of the form # key1;value1;key2;value2,... # Create a list of all keys in a variable named ${KEY_LIST_NAME} @@ -927,41 +888,42 @@ ENDFUNCTION(COMPILER_SPECIFIC_LIBS) # kokkos_key_value_map(ARCH ALL_ARCHES key1;value1;key2;value2) # would produce a list variable ALL_ARCHES=key1;key2 # and individual variables ARCHkey1=value1 and ARCHkey2=value2 -MACRO(KOKKOS_KEY_VALUE_MAP VAR_PREFIX KEY_LIST_NAME) - SET(PARSE_KEY ON) - SET(${KEY_LIST_NAME}) - FOREACH(ENTRY ${ARGN}) - IF(PARSE_KEY) - SET(CURRENT_KEY ${ENTRY}) - SET(PARSE_KEY OFF) - LIST(APPEND ${KEY_LIST_NAME} ${CURRENT_KEY}) - ELSE() - SET(${VAR_PREFIX}${CURRENT_KEY} ${ENTRY}) - SET(PARSE_KEY ON) - ENDIF() - ENDFOREACH() -ENDMACRO() - -FUNCTION(KOKKOS_CHECK_DEPRECATED_OPTIONS) - KOKKOS_KEY_VALUE_MAP(DEPRECATED_MSG_ DEPRECATED_LIST ${ARGN}) - FOREACH(OPTION_SUFFIX ${DEPRECATED_LIST}) - SET(OPTION_NAME Kokkos_${OPTION_SUFFIX}) - SET(OPTION_MESSAGE ${DEPRECATED_MSG_${OPTION_SUFFIX}}) - IF(DEFINED ${OPTION_NAME}) # This variable has been given by the user as on or off - MESSAGE(SEND_ERROR "Removed option ${OPTION_NAME} has been given with value ${${OPTION_NAME}}. ${OPT_MESSAGE}") - ENDIF() - ENDFOREACH() -ENDFUNCTION() +macro(KOKKOS_KEY_VALUE_MAP VAR_PREFIX KEY_LIST_NAME) + set(PARSE_KEY ON) + set(${KEY_LIST_NAME}) + foreach(ENTRY ${ARGN}) + if(PARSE_KEY) + set(CURRENT_KEY ${ENTRY}) + set(PARSE_KEY OFF) + list(APPEND ${KEY_LIST_NAME} ${CURRENT_KEY}) + else() + set(${VAR_PREFIX}${CURRENT_KEY} ${ENTRY}) + set(PARSE_KEY ON) + endif() + endforeach() +endmacro() + +function(KOKKOS_CHECK_DEPRECATED_OPTIONS) + kokkos_key_value_map(DEPRECATED_MSG_ DEPRECATED_LIST ${ARGN}) + foreach(OPTION_SUFFIX ${DEPRECATED_LIST}) + set(OPTION_NAME Kokkos_${OPTION_SUFFIX}) + set(OPTION_MESSAGE ${DEPRECATED_MSG_${OPTION_SUFFIX}}) + if(DEFINED ${OPTION_NAME}) # This variable has been given by the user as on or off + message(SEND_ERROR "Removed option ${OPTION_NAME} has been given with value ${${OPTION_NAME}}. ${OPT_MESSAGE}") + endif() + endforeach() +endfunction() # this function checks whether the current CXX compiler supports building CUDA -FUNCTION(kokkos_cxx_compiler_cuda_test _VAR) - # don't run this test every time - IF(DEFINED ${_VAR}) - RETURN() - ENDIF() - - FILE(WRITE ${PROJECT_BINARY_DIR}/compile_tests/compiles_cuda.cpp -" +function(kokkos_cxx_compiler_cuda_test _VAR) + # don't run this test every time + if(DEFINED ${_VAR}) + return() + endif() + + file( + WRITE ${PROJECT_BINARY_DIR}/compile_tests/compiles_cuda.cpp + " #include #include @@ -985,14 +947,13 @@ int main() cudaDeviceSynchronize(); return EXIT_SUCCESS; } -") +" + ) - TRY_COMPILE(_RET - ${PROJECT_BINARY_DIR}/compile_tests - SOURCES ${PROJECT_BINARY_DIR}/compile_tests/compiles_cuda.cpp) + try_compile(_RET ${PROJECT_BINARY_DIR}/compile_tests SOURCES ${PROJECT_BINARY_DIR}/compile_tests/compiles_cuda.cpp) - SET(${_VAR} ${_RET} CACHE STRING "CXX compiler supports building CUDA") -ENDFUNCTION() + set(${_VAR} ${_RET} CACHE STRING "CXX compiler supports building CUDA") +endfunction() # this function is provided to easily select which files use nvcc_wrapper: # @@ -1005,58 +966,77 @@ ENDFUNCTION() # NOTE: this is VERY DIFFERENT than the version in KokkosConfigCommon.cmake.in. # This version explicitly uses nvcc_wrapper. # -FUNCTION(kokkos_compilation) - # check whether the compiler already supports building CUDA - KOKKOS_CXX_COMPILER_CUDA_TEST(Kokkos_CXX_COMPILER_COMPILES_CUDA) - # if CUDA compile test has already been performed, just return - IF(Kokkos_CXX_COMPILER_COMPILES_CUDA) - RETURN() - ENDIF() - - CMAKE_PARSE_ARGUMENTS(COMP "GLOBAL;PROJECT" "" "DIRECTORY;TARGET;SOURCE" ${ARGN}) - - # find kokkos_launch_compiler - FIND_PROGRAM(Kokkos_COMPILE_LAUNCHER - NAMES kokkos_launch_compiler - HINTS ${PROJECT_SOURCE_DIR} - PATHS ${PROJECT_SOURCE_DIR} - PATH_SUFFIXES bin) - - IF(NOT Kokkos_COMPILE_LAUNCHER) - MESSAGE(FATAL_ERROR "Kokkos could not find 'kokkos_launch_compiler'. Please set '-DKokkos_COMPILE_LAUNCHER=/path/to/launcher'") - ENDIF() - - # find nvcc_wrapper - FIND_PROGRAM(Kokkos_NVCC_WRAPPER - NAMES nvcc_wrapper - HINTS ${PROJECT_SOURCE_DIR} - PATHS ${PROJECT_SOURCE_DIR} - PATH_SUFFIXES bin) - - IF(NOT Kokkos_COMPILE_LAUNCHER) - MESSAGE(FATAL_ERROR "Kokkos could not find 'nvcc_wrapper'. Please set '-DKokkos_COMPILE_LAUNCHER=/path/to/nvcc_wrapper'") - ENDIF() - - IF(COMP_GLOBAL) - # if global, don't bother setting others - SET_PROPERTY(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER}") - SET_PROPERTY(GLOBAL PROPERTY RULE_LAUNCH_LINK "${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER}") - ELSE() - FOREACH(_TYPE PROJECT DIRECTORY TARGET SOURCE) - # make project/subproject scoping easy, e.g. KokkosCompilation(PROJECT) after project(...) - IF("${_TYPE}" STREQUAL "PROJECT" AND COMP_${_TYPE}) - LIST(APPEND COMP_DIRECTORY ${PROJECT_SOURCE_DIR}) - UNSET(COMP_${_TYPE}) - ENDIF() - # set the properties if defined - IF(COMP_${_TYPE}) - # MESSAGE(STATUS "Using nvcc_wrapper :: ${_TYPE} :: ${COMP_${_TYPE}}") - SET_PROPERTY(${_TYPE} ${COMP_${_TYPE}} PROPERTY RULE_LAUNCH_COMPILE "${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER}") - SET_PROPERTY(${_TYPE} ${COMP_${_TYPE}} PROPERTY RULE_LAUNCH_LINK "${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER}") - ENDIF() - ENDFOREACH() - ENDIF() -ENDFUNCTION() +function(kokkos_compilation) + # check whether the compiler already supports building CUDA + kokkos_cxx_compiler_cuda_test(Kokkos_CXX_COMPILER_COMPILES_CUDA) + # if CUDA compile test has already been performed, just return + if(Kokkos_CXX_COMPILER_COMPILES_CUDA) + return() + endif() + + cmake_parse_arguments(COMP "GLOBAL;PROJECT" "" "DIRECTORY;TARGET;SOURCE" ${ARGN}) + + # find kokkos_launch_compiler + find_program( + Kokkos_COMPILE_LAUNCHER + NAMES kokkos_launch_compiler + HINTS ${PROJECT_SOURCE_DIR} + PATHS ${PROJECT_SOURCE_DIR} + PATH_SUFFIXES bin + ) + + if(NOT Kokkos_COMPILE_LAUNCHER) + message( + FATAL_ERROR + "Kokkos could not find 'kokkos_launch_compiler'. Please set '-DKokkos_COMPILE_LAUNCHER=/path/to/launcher'" + ) + endif() + + # find nvcc_wrapper + find_program( + Kokkos_NVCC_WRAPPER + NAMES nvcc_wrapper + HINTS ${PROJECT_SOURCE_DIR} + PATHS ${PROJECT_SOURCE_DIR} + PATH_SUFFIXES bin + ) + + if(NOT Kokkos_COMPILE_LAUNCHER) + message( + FATAL_ERROR "Kokkos could not find 'nvcc_wrapper'. Please set '-DKokkos_COMPILE_LAUNCHER=/path/to/nvcc_wrapper'" + ) + endif() + + if(COMP_GLOBAL) + # if global, don't bother setting others + set_property( + GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER}" + ) + set_property( + GLOBAL PROPERTY RULE_LAUNCH_LINK "${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER}" + ) + else() + foreach(_TYPE PROJECT DIRECTORY TARGET SOURCE) + # make project/subproject scoping easy, e.g. KokkosCompilation(PROJECT) after project(...) + if("${_TYPE}" STREQUAL "PROJECT" AND COMP_${_TYPE}) + list(APPEND COMP_DIRECTORY ${PROJECT_SOURCE_DIR}) + unset(COMP_${_TYPE}) + endif() + # set the properties if defined + if(COMP_${_TYPE}) + # MESSAGE(STATUS "Using nvcc_wrapper :: ${_TYPE} :: ${COMP_${_TYPE}}") + set_property( + ${_TYPE} ${COMP_${_TYPE}} PROPERTY RULE_LAUNCH_COMPILE + "${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER}" + ) + set_property( + ${_TYPE} ${COMP_${_TYPE}} PROPERTY RULE_LAUNCH_LINK + "${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER}" + ) + endif() + endforeach() + endif() +endfunction() ## KOKKOS_CONFIG_HEADER - parse the data list which is a list of backend names ## and create output config header file...used for ## creating dynamic include files based on enabled backends @@ -1066,14 +1046,15 @@ ENDFUNCTION() ## HEADER_GUARD TEXT used with include header guard ## HEADER_PREFIX prefix used with include (i.e. fwd, decl, setup) ## DATA_LIST list of backends to include in generated file -FUNCTION(KOKKOS_CONFIG_HEADER SRC_FILE TARGET_FILE HEADER_GUARD HEADER_PREFIX DATA_LIST) - SET(HEADER_GUARD_TAG "${HEADER_GUARD}_HPP_") - CONFIGURE_FILE(cmake/${SRC_FILE} ${PROJECT_BINARY_DIR}/temp/${TARGET_FILE}.work COPYONLY) - FOREACH( BACKEND_NAME ${DATA_LIST} ) - SET(INCLUDE_NEXT_FILE "#include <${HEADER_PREFIX}_${BACKEND_NAME}.hpp> -\@INCLUDE_NEXT_FILE\@") - CONFIGURE_FILE(${PROJECT_BINARY_DIR}/temp/${TARGET_FILE}.work ${PROJECT_BINARY_DIR}/temp/${TARGET_FILE}.work @ONLY) - ENDFOREACH() - SET(INCLUDE_NEXT_FILE "" ) - CONFIGURE_FILE(${PROJECT_BINARY_DIR}/temp/${TARGET_FILE}.work ${TARGET_FILE} @ONLY) -ENDFUNCTION() +function(KOKKOS_CONFIG_HEADER SRC_FILE TARGET_FILE HEADER_GUARD HEADER_PREFIX DATA_LIST) + set(HEADER_GUARD_TAG "${HEADER_GUARD}_HPP_") + configure_file(cmake/${SRC_FILE} ${PROJECT_BINARY_DIR}/temp/${TARGET_FILE}.work COPYONLY) + foreach(BACKEND_NAME ${DATA_LIST}) + set(INCLUDE_NEXT_FILE "#include <${HEADER_PREFIX}_${BACKEND_NAME}.hpp> +\@INCLUDE_NEXT_FILE\@" + ) + configure_file(${PROJECT_BINARY_DIR}/temp/${TARGET_FILE}.work ${PROJECT_BINARY_DIR}/temp/${TARGET_FILE}.work @ONLY) + endforeach() + set(INCLUDE_NEXT_FILE "") + configure_file(${PROJECT_BINARY_DIR}/temp/${TARGET_FILE}.work ${TARGET_FILE} @ONLY) +endfunction() diff --git a/packages/kokkos/cmake/kokkos_install.cmake b/packages/kokkos/cmake/kokkos_install.cmake index f818dfa24485..3ae7570ffea5 100644 --- a/packages/kokkos/cmake/kokkos_install.cmake +++ b/packages/kokkos/cmake/kokkos_install.cmake @@ -1,57 +1,51 @@ -INCLUDE(CMakePackageConfigHelpers) -IF (NOT KOKKOS_HAS_TRILINOS AND NOT Kokkos_INSTALL_TESTING) - INCLUDE(GNUInstallDirs) +include(CMakePackageConfigHelpers) +if(NOT Kokkos_INSTALL_TESTING) + include(GNUInstallDirs) #Set all the variables needed for KokkosConfig.cmake - GET_PROPERTY(KOKKOS_PROP_LIBS GLOBAL PROPERTY KOKKOS_LIBRARIES_NAMES) - SET(KOKKOS_LIBRARIES ${KOKKOS_PROP_LIBS}) + get_property(KOKKOS_PROP_LIBS GLOBAL PROPERTY KOKKOS_LIBRARIES_NAMES) + set(KOKKOS_LIBRARIES ${KOKKOS_PROP_LIBS}) - INCLUDE(CMakePackageConfigHelpers) - CONFIGURE_PACKAGE_CONFIG_FILE( - cmake/KokkosConfig.cmake.in - "${Kokkos_BINARY_DIR}/KokkosConfig.cmake" - INSTALL_DESTINATION ${CMAKE_INSTALL_FULL_LIBDIR}/cmake) + include(CMakePackageConfigHelpers) + configure_package_config_file( + cmake/KokkosConfig.cmake.in "${Kokkos_BINARY_DIR}/KokkosConfig.cmake" + INSTALL_DESTINATION ${CMAKE_INSTALL_FULL_LIBDIR}/cmake + ) - CONFIGURE_PACKAGE_CONFIG_FILE( - cmake/KokkosConfigCommon.cmake.in - "${Kokkos_BINARY_DIR}/KokkosConfigCommon.cmake" - INSTALL_DESTINATION ${CMAKE_INSTALL_FULL_LIBDIR}/cmake) + configure_package_config_file( + cmake/KokkosConfigCommon.cmake.in "${Kokkos_BINARY_DIR}/KokkosConfigCommon.cmake" + INSTALL_DESTINATION ${CMAKE_INSTALL_FULL_LIBDIR}/cmake + ) - WRITE_BASIC_PACKAGE_VERSION_FILE("${Kokkos_BINARY_DIR}/KokkosConfigVersion.cmake" - VERSION "${Kokkos_VERSION}" - COMPATIBILITY AnyNewerVersion) + write_basic_package_version_file( + "${Kokkos_BINARY_DIR}/KokkosConfigVersion.cmake" VERSION "${Kokkos_VERSION}" COMPATIBILITY AnyNewerVersion + ) # Install the KokkosConfig*.cmake files - install(FILES - "${Kokkos_BINARY_DIR}/KokkosConfig.cmake" - "${Kokkos_BINARY_DIR}/KokkosConfigCommon.cmake" - "${Kokkos_BINARY_DIR}/KokkosConfigVersion.cmake" - DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Kokkos) + install(FILES "${Kokkos_BINARY_DIR}/KokkosConfig.cmake" "${Kokkos_BINARY_DIR}/KokkosConfigCommon.cmake" + "${Kokkos_BINARY_DIR}/KokkosConfigVersion.cmake" DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Kokkos + ) install(EXPORT KokkosTargets NAMESPACE Kokkos:: DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Kokkos) export(EXPORT KokkosTargets NAMESPACE Kokkos:: FILE ${Kokkos_BINARY_DIR}/KokkosTargets.cmake) # Required to be a TriBITS-compliant external package file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/cmake_packages/Kokkos) - file(COPY ${Kokkos_BINARY_DIR}/KokkosConfig.cmake - ${Kokkos_BINARY_DIR}/KokkosConfigCommon.cmake - ${Kokkos_BINARY_DIR}/KokkosConfigVersion.cmake - DESTINATION ${CMAKE_BINARY_DIR}/cmake_packages/Kokkos) - export(EXPORT KokkosTargets NAMESPACE Kokkos:: FILE ${CMAKE_BINARY_DIR}/cmake_packages/Kokkos/KokkosTargets.cmake) -ELSE() - CONFIGURE_FILE(cmake/KokkosConfigCommon.cmake.in ${Kokkos_BINARY_DIR}/KokkosConfigCommon.cmake @ONLY) - file(READ ${Kokkos_BINARY_DIR}/KokkosConfigCommon.cmake KOKKOS_CONFIG_COMMON) - file(APPEND "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/KokkosConfig_install.cmake" "${KOKKOS_CONFIG_COMMON}") - CONFIGURE_FILE(cmake/KokkosTrilinosConfig.cmake.in ${Kokkos_BINARY_DIR}/KokkosTrilinosConfig.cmake @ONLY) - file(READ ${Kokkos_BINARY_DIR}/KokkosTrilinosConfig.cmake KOKKOS_TRILINOS_CONFIG) - file(APPEND "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/KokkosConfig_install.cmake" "${KOKKOS_TRILINOS_CONFIG}") - - WRITE_BASIC_PACKAGE_VERSION_FILE("${CMAKE_CURRENT_BINARY_DIR}/KokkosConfigVersion.cmake" - VERSION "${Kokkos_VERSION}" - COMPATIBILITY AnyNewerVersion) + file(COPY ${Kokkos_BINARY_DIR}/KokkosConfig.cmake ${Kokkos_BINARY_DIR}/KokkosConfigCommon.cmake + ${Kokkos_BINARY_DIR}/KokkosConfigVersion.cmake DESTINATION ${CMAKE_BINARY_DIR}/cmake_packages/Kokkos + ) + file(WRITE ${CMAKE_BINARY_DIR}/cmake_packages/Kokkos/KokkosTargets.cmake + "include(${Kokkos_BINARY_DIR}/KokkosTargets.cmake)" + ) +else() + configure_file(cmake/KokkosConfigCommon.cmake.in ${Kokkos_BINARY_DIR}/KokkosConfigCommon.cmake @ONLY) + + write_basic_package_version_file( + "${CMAKE_CURRENT_BINARY_DIR}/KokkosConfigVersion.cmake" VERSION "${Kokkos_VERSION}" COMPATIBILITY AnyNewerVersion + ) install(FILES ${CMAKE_CURRENT_BINARY_DIR}/KokkosConfigVersion.cmake - DESTINATION "${${PROJECT_NAME}_INSTALL_LIB_DIR}/cmake/${PACKAGE_NAME}") -ENDIF() - -INSTALL(FILES ${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_config.h DESTINATION ${KOKKOS_HEADER_DIR}) + DESTINATION "${${PROJECT_NAME}_INSTALL_LIB_DIR}/cmake/Kokkos" + ) +endif() +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_config.h DESTINATION ${KOKKOS_HEADER_DIR}) diff --git a/packages/kokkos/cmake/kokkos_pick_cxx_std.cmake b/packages/kokkos/cmake/kokkos_pick_cxx_std.cmake index ae14a10d531f..0d31e6d131f7 100644 --- a/packages/kokkos/cmake/kokkos_pick_cxx_std.cmake +++ b/packages/kokkos/cmake/kokkos_pick_cxx_std.cmake @@ -1,20 +1,28 @@ # From CMake 3.10 documentation #This can run at any time -KOKKOS_OPTION(CXX_STANDARD "" STRING "[[DEPRECATED - USE CMAKE_CXX_STANDARD INSTEAD]] The C++ standard for Kokkos to use: 17 or 20. If empty, this will default to CMAKE_CXX_STANDARD. If both CMAKE_CXX_STANDARD and Kokkos_CXX_STANDARD are empty, this will default to 17") +kokkos_option( + CXX_STANDARD + "" + STRING + "[[DEPRECATED - USE CMAKE_CXX_STANDARD INSTEAD]] The C++ standard for Kokkos to use: 17 or 20. If empty, this will default to CMAKE_CXX_STANDARD. If both CMAKE_CXX_STANDARD and Kokkos_CXX_STANDARD are empty, this will default to 17" +) # Set CXX standard flags -SET(KOKKOS_ENABLE_CXX17 OFF) -SET(KOKKOS_ENABLE_CXX20 OFF) -SET(KOKKOS_ENABLE_CXX23 OFF) -SET(KOKKOS_ENABLE_CXX26 OFF) -IF (KOKKOS_CXX_STANDARD) - MESSAGE(FATAL_ERROR "Setting the variable Kokkos_CXX_STANDARD in configuration is deprecated - set CMAKE_CXX_STANDARD directly instead") -ENDIF() +set(KOKKOS_ENABLE_CXX17 OFF) +set(KOKKOS_ENABLE_CXX20 OFF) +set(KOKKOS_ENABLE_CXX23 OFF) +set(KOKKOS_ENABLE_CXX26 OFF) +if(KOKKOS_CXX_STANDARD) + message( + FATAL_ERROR + "Setting the variable Kokkos_CXX_STANDARD in configuration is deprecated - set CMAKE_CXX_STANDARD directly instead" + ) +endif() -IF (NOT CMAKE_CXX_STANDARD) - SET(KOKKOS_CXX_STANDARD "17") -ELSE() - SET(KOKKOS_CXX_STANDARD ${CMAKE_CXX_STANDARD}) -ENDIF() -MESSAGE(STATUS "Setting default Kokkos CXX standard to ${KOKKOS_CXX_STANDARD}") +if(NOT CMAKE_CXX_STANDARD) + set(KOKKOS_CXX_STANDARD "17") +else() + set(KOKKOS_CXX_STANDARD ${CMAKE_CXX_STANDARD}) +endif() +message(STATUS "Setting default Kokkos CXX standard to ${KOKKOS_CXX_STANDARD}") diff --git a/packages/kokkos/cmake/kokkos_test_cxx_std.cmake b/packages/kokkos/cmake/kokkos_test_cxx_std.cmake index 5b45674e0570..a84e714064df 100644 --- a/packages/kokkos/cmake/kokkos_test_cxx_std.cmake +++ b/packages/kokkos/cmake/kokkos_test_cxx_std.cmake @@ -1,101 +1,112 @@ -KOKKOS_CFG_DEPENDS(CXX_STD COMPILER_ID) +kokkos_cfg_depends(CXX_STD COMPILER_ID) -FUNCTION(kokkos_set_cxx_standard_feature standard) - SET(EXTENSION_NAME CMAKE_CXX${standard}_EXTENSION_COMPILE_OPTION) - SET(STANDARD_NAME CMAKE_CXX${standard}_STANDARD_COMPILE_OPTION) - SET(FEATURE_NAME cxx_std_${standard}) +function(kokkos_set_cxx_standard_feature standard) + set(EXTENSION_NAME CMAKE_CXX${standard}_EXTENSION_COMPILE_OPTION) + set(STANDARD_NAME CMAKE_CXX${standard}_STANDARD_COMPILE_OPTION) + set(FEATURE_NAME cxx_std_${standard}) #CMake's way of telling us that the standard (or extension) #flags are supported is the extension/standard variables - IF (NOT DEFINED CMAKE_CXX_EXTENSIONS) - IF(KOKKOS_DONT_ALLOW_EXTENSIONS) - GLOBAL_SET(KOKKOS_USE_CXX_EXTENSIONS OFF) - ELSE() - GLOBAL_SET(KOKKOS_USE_CXX_EXTENSIONS ON) - ENDIF() - ELSEIF(CMAKE_CXX_EXTENSIONS) - IF(KOKKOS_DONT_ALLOW_EXTENSIONS) - MESSAGE(FATAL_ERROR "The chosen configuration does not support CXX extensions flags: ${KOKKOS_DONT_ALLOW_EXTENSIONS}. Must set CMAKE_CXX_EXTENSIONS=OFF to continue") - ELSE() - GLOBAL_SET(KOKKOS_USE_CXX_EXTENSIONS ON) - ENDIF() - ELSE() - #For trilinos, we need to make sure downstream projects - GLOBAL_SET(KOKKOS_USE_CXX_EXTENSIONS OFF) - ENDIF() + if(NOT DEFINED CMAKE_CXX_EXTENSIONS) + if(KOKKOS_DONT_ALLOW_EXTENSIONS) + global_set(KOKKOS_USE_CXX_EXTENSIONS OFF) + else() + global_set(KOKKOS_USE_CXX_EXTENSIONS ON) + endif() + elseif(CMAKE_CXX_EXTENSIONS) + if(KOKKOS_DONT_ALLOW_EXTENSIONS) + message( + FATAL_ERROR + "The chosen configuration does not support CXX extensions flags: ${KOKKOS_DONT_ALLOW_EXTENSIONS}. Must set CMAKE_CXX_EXTENSIONS=OFF to continue" + ) + else() + global_set(KOKKOS_USE_CXX_EXTENSIONS ON) + endif() + endif() - IF (KOKKOS_USE_CXX_EXTENSIONS AND ${EXTENSION_NAME}) - MESSAGE(STATUS "Using ${${EXTENSION_NAME}} for C++${standard} extensions as feature") - GLOBAL_SET(KOKKOS_CXX_STANDARD_FEATURE ${FEATURE_NAME}) - ELSEIF(NOT KOKKOS_USE_CXX_EXTENSIONS AND ${STANDARD_NAME}) - MESSAGE(STATUS "Using ${${STANDARD_NAME}} for C++${standard} standard as feature") - IF (KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA AND (KOKKOS_CXX_HOST_COMPILER_ID STREQUAL GNU OR KOKKOS_CXX_HOST_COMPILER_ID STREQUAL Clang)) - IF(${KOKKOS_CXX_COMPILER_VERSION} VERSION_LESS 12.0.0) - SET(SUPPORTED_NVCC_FLAGS "-std=c++17") - ELSE() - SET(SUPPORTED_NVCC_FLAGS "-std=c++17" "-std=c++20") - ENDIF() - IF (NOT ${${STANDARD_NAME}} IN_LIST SUPPORTED_NVCC_FLAGS) - MESSAGE(FATAL_ERROR "CMake wants to use ${${STANDARD_NAME}} which is not supported by NVCC. Using a more recent host compiler or a more recent CMake version might help.") - ENDIF() - ENDIF() - GLOBAL_SET(KOKKOS_CXX_STANDARD_FEATURE ${FEATURE_NAME}) - ELSEIF (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC" OR "x${CMAKE_CXX_SIMULATE_ID}" STREQUAL "xMSVC") + if(KOKKOS_USE_CXX_EXTENSIONS AND ${EXTENSION_NAME}) + message(STATUS "Using ${${EXTENSION_NAME}} for C++${standard} extensions as feature") + global_set(KOKKOS_CXX_STANDARD_FEATURE ${FEATURE_NAME}) + elseif(NOT KOKKOS_USE_CXX_EXTENSIONS AND ${STANDARD_NAME}) + message(STATUS "Using ${${STANDARD_NAME}} for C++${standard} standard as feature") + if(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA AND (KOKKOS_CXX_HOST_COMPILER_ID STREQUAL GNU + OR KOKKOS_CXX_HOST_COMPILER_ID STREQUAL Clang) + ) + if(${KOKKOS_CXX_COMPILER_VERSION} VERSION_LESS 12.0.0) + set(SUPPORTED_NVCC_FLAGS "-std=c++17") + else() + set(SUPPORTED_NVCC_FLAGS "-std=c++17" "-std=c++20") + endif() + if(NOT ${${STANDARD_NAME}} IN_LIST SUPPORTED_NVCC_FLAGS) + message( + FATAL_ERROR + "CMake wants to use ${${STANDARD_NAME}} which is not supported by NVCC. Using a more recent host compiler or a more recent CMake version might help." + ) + endif() + endif() + global_set(KOKKOS_CXX_STANDARD_FEATURE ${FEATURE_NAME}) + elseif(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC" OR "x${CMAKE_CXX_SIMULATE_ID}" STREQUAL "xMSVC") #MSVC doesn't need a command line flag, that doesn't mean it has no support - MESSAGE(STATUS "Using no flag for C++${standard} standard as feature") - GLOBAL_SET(KOKKOS_CXX_STANDARD_FEATURE ${FEATURE_NAME}) - ELSEIF((KOKKOS_CXX_COMPILER_ID STREQUAL "NVIDIA") AND WIN32) - MESSAGE(STATUS "Using no flag for C++${standard} standard as feature") - GLOBAL_SET(KOKKOS_CXX_STANDARD_FEATURE "") - ELSEIF((KOKKOS_CXX_COMPILER_ID STREQUAL "Fujitsu")) - MESSAGE(STATUS "Using no flag for C++${standard} standard as feature") - GLOBAL_SET(KOKKOS_CXX_STANDARD_FEATURE "") - ELSE() + message(STATUS "Using no flag for C++${standard} standard as feature") + global_set(KOKKOS_CXX_STANDARD_FEATURE ${FEATURE_NAME}) + elseif((KOKKOS_CXX_COMPILER_ID STREQUAL "NVIDIA") AND WIN32) + message(STATUS "Using no flag for C++${standard} standard as feature") + global_set(KOKKOS_CXX_STANDARD_FEATURE "") + elseif(KOKKOS_CXX_COMPILER_ID STREQUAL "Fujitsu") + message(STATUS "Using no flag for C++${standard} standard as feature") + global_set(KOKKOS_CXX_STANDARD_FEATURE "") + else() #nope, we can't do anything here - MESSAGE(WARNING "C++${standard} is not supported as a compiler feature. We will choose custom flags for now, but this behavior has been deprecated. Please open an issue at https://github.com/kokkos/kokkos/issues reporting that ${KOKKOS_CXX_COMPILER_ID} ${KOKKOS_CXX_COMPILER_VERSION} failed for ${KOKKOS_CXX_STANDARD}, preferably including your CMake command.") - GLOBAL_SET(KOKKOS_CXX_STANDARD_FEATURE "") - ENDIF() + message( + WARNING + "C++${standard} is not supported as a compiler feature. We will choose custom flags for now, but this behavior has been deprecated. Please open an issue at https://github.com/kokkos/kokkos/issues reporting that ${KOKKOS_CXX_COMPILER_ID} ${KOKKOS_CXX_COMPILER_VERSION} failed for ${KOKKOS_CXX_STANDARD}, preferably including your CMake command." + ) + global_set(KOKKOS_CXX_STANDARD_FEATURE "") + endif() - IF((NOT WIN32) AND (NOT ("${KOKKOS_CXX_COMPILER_ID}" STREQUAL "Fujitsu"))) - IF(NOT ${FEATURE_NAME} IN_LIST CMAKE_CXX_COMPILE_FEATURES) - MESSAGE(FATAL_ERROR "Compiler ${KOKKOS_CXX_COMPILER_ID} should support ${FEATURE_NAME}, but CMake reports feature not supported") - ENDIF() - ENDIF() -ENDFUNCTION() + if((NOT WIN32) AND (NOT ("${KOKKOS_CXX_COMPILER_ID}" STREQUAL "Fujitsu"))) + if(NOT ${FEATURE_NAME} IN_LIST CMAKE_CXX_COMPILE_FEATURES) + message( + FATAL_ERROR + "Compiler ${KOKKOS_CXX_COMPILER_ID} should support ${FEATURE_NAME}, but CMake reports feature not supported" + ) + endif() + endif() +endfunction() -IF(KOKKOS_CXX_STANDARD STREQUAL "17") +if(KOKKOS_CXX_STANDARD STREQUAL "17") kokkos_set_cxx_standard_feature(17) - SET(KOKKOS_CXX_INTERMEDIATE_STANDARD "1Z") - SET(KOKKOS_ENABLE_CXX17 ON) -ELSEIF(KOKKOS_CXX_STANDARD STREQUAL "20") + set(KOKKOS_CXX_INTERMEDIATE_STANDARD "1Z") + set(KOKKOS_ENABLE_CXX17 ON) +elseif(KOKKOS_CXX_STANDARD STREQUAL "20") kokkos_set_cxx_standard_feature(20) - SET(KOKKOS_CXX_INTERMEDIATE_STANDARD "2A") - SET(KOKKOS_ENABLE_CXX20 ON) -ELSEIF(KOKKOS_CXX_STANDARD STREQUAL "23") + set(KOKKOS_CXX_INTERMEDIATE_STANDARD "2A") + set(KOKKOS_ENABLE_CXX20 ON) +elseif(KOKKOS_CXX_STANDARD STREQUAL "23") kokkos_set_cxx_standard_feature(23) - SET(KOKKOS_CXX_INTERMEDIATE_STANDARD "2B") - SET(KOKKOS_ENABLE_CXX23 ON) -ELSEIF(KOKKOS_CXX_STANDARD STREQUAL "26") + set(KOKKOS_CXX_INTERMEDIATE_STANDARD "2B") + set(KOKKOS_ENABLE_CXX23 ON) +elseif(KOKKOS_CXX_STANDARD STREQUAL "26") kokkos_set_cxx_standard_feature(26) - SET(KOKKOS_CXX_INTERMEDIATE_STANDARD "2C") - SET(KOKKOS_ENABLE_CXX26 ON) -ELSE() - MESSAGE(FATAL_ERROR "Kokkos requires C++17 or newer but requested ${KOKKOS_CXX_STANDARD}!") -ENDIF() + set(KOKKOS_CXX_INTERMEDIATE_STANDARD "2C") + set(KOKKOS_ENABLE_CXX26 ON) +else() + message(FATAL_ERROR "Kokkos requires C++17 or newer but requested ${KOKKOS_CXX_STANDARD}!") +endif() # Enforce that we can compile a simple C++17 program -TRY_COMPILE(CAN_COMPILE_CPP17 - ${KOKKOS_TOP_BUILD_DIR}/corner_cases - ${KOKKOS_SOURCE_DIR}/cmake/compile_tests/cplusplus17.cpp - OUTPUT_VARIABLE ERROR_MESSAGE - CXX_STANDARD 17 +try_compile( + CAN_COMPILE_CPP17 ${KOKKOS_TOP_BUILD_DIR}/corner_cases ${KOKKOS_SOURCE_DIR}/cmake/compile_tests/cplusplus17.cpp + OUTPUT_VARIABLE ERROR_MESSAGE CXX_STANDARD 17 ) -if (NOT CAN_COMPILE_CPP17) - UNSET(CAN_COMPILE_CPP17 CACHE) #make sure CMake always re-runs this - MESSAGE(FATAL_ERROR "C++${KOKKOS_CXX_STANDARD}-compliant compiler detected, but unable to compile C++17 or later program. Verify that ${CMAKE_CXX_COMPILER_ID}:${CMAKE_CXX_COMPILER_VERSION} is set up correctly (e.g., check that correct library headers are being used).\nFailing output:\n ${ERROR_MESSAGE}") -ENDIF() -UNSET(CAN_COMPILE_CPP17 CACHE) #make sure CMake always re-runs this - +if(NOT CAN_COMPILE_CPP17) + unset(CAN_COMPILE_CPP17 CACHE) #make sure CMake always re-runs this + message( + FATAL_ERROR + "C++${KOKKOS_CXX_STANDARD}-compliant compiler detected, but unable to compile C++17 or later program. Verify that ${CMAKE_CXX_COMPILER_ID}:${CMAKE_CXX_COMPILER_VERSION} is set up correctly (e.g., check that correct library headers are being used).\nFailing output:\n ${ERROR_MESSAGE}" + ) +endif() +unset(CAN_COMPILE_CPP17 CACHE) #make sure CMake always re-runs this # Enforce that extensions are turned off for nvcc_wrapper. # For compiling CUDA code using nvcc_wrapper, we will use the host compiler's @@ -105,66 +116,70 @@ UNSET(CAN_COMPILE_CPP17 CACHE) #make sure CMake always re-runs this # that we can only use host compilers for CUDA builds that use those flags. # It also means that extensions (gnu++17) can't be turned on for CUDA builds. -IF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) - IF(NOT DEFINED CMAKE_CXX_EXTENSIONS) - SET(CMAKE_CXX_EXTENSIONS OFF) - ELSEIF(CMAKE_CXX_EXTENSIONS) - MESSAGE(FATAL_ERROR "NVCC doesn't support C++ extensions. Set -DCMAKE_CXX_EXTENSIONS=OFF") - ENDIF() -ENDIF() +if(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + if(NOT DEFINED CMAKE_CXX_EXTENSIONS) + set(CMAKE_CXX_EXTENSIONS OFF) + elseif(CMAKE_CXX_EXTENSIONS) + message(FATAL_ERROR "NVCC doesn't support C++ extensions. Set -DCMAKE_CXX_EXTENSIONS=OFF") + endif() +endif() -IF(KOKKOS_ENABLE_CUDA) +if(KOKKOS_ENABLE_CUDA) # ENFORCE that the compiler can compile CUDA code. - IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 4.0.0) - MESSAGE(FATAL_ERROR "Compiling CUDA code directly with Clang requires version 4.0.0 or higher.") - ENDIF() - IF(NOT DEFINED CMAKE_CXX_EXTENSIONS) - SET(CMAKE_CXX_EXTENSIONS OFF) - ELSEIF(CMAKE_CXX_EXTENSIONS) - MESSAGE(FATAL_ERROR "Compiling CUDA code with clang doesn't support C++ extensions. Set -DCMAKE_CXX_EXTENSIONS=OFF") - ENDIF() - ELSEIF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) - MESSAGE(FATAL_ERROR "Invalid compiler for CUDA. The compiler must be nvcc_wrapper or Clang or use kokkos_launch_compiler, but compiler ID was ${KOKKOS_CXX_COMPILER_ID}") - ENDIF() -ENDIF() + if(KOKKOS_CXX_COMPILER_ID STREQUAL Clang) + if(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 4.0.0) + message(FATAL_ERROR "Compiling CUDA code directly with Clang requires version 4.0.0 or higher.") + endif() + if(NOT DEFINED CMAKE_CXX_EXTENSIONS) + set(CMAKE_CXX_EXTENSIONS OFF) + elseif(CMAKE_CXX_EXTENSIONS) + message( + FATAL_ERROR "Compiling CUDA code with clang doesn't support C++ extensions. Set -DCMAKE_CXX_EXTENSIONS=OFF" + ) + endif() + elseif(NOT KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + message( + FATAL_ERROR + "Invalid compiler for CUDA. The compiler must be nvcc_wrapper or Clang or use kokkos_launch_compiler, but compiler ID was ${KOKKOS_CXX_COMPILER_ID}" + ) + endif() +endif() -IF (NOT KOKKOS_CXX_STANDARD_FEATURE) +if(NOT KOKKOS_CXX_STANDARD_FEATURE) #we need to pick the C++ flags ourselves - UNSET(CMAKE_CXX_STANDARD) - UNSET(CMAKE_CXX_STANDARD CACHE) - IF(KOKKOS_CXX_COMPILER_ID STREQUAL Cray) - INCLUDE(${KOKKOS_SRC_PATH}/cmake/cray.cmake) + unset(CMAKE_CXX_STANDARD) + unset(CMAKE_CXX_STANDARD CACHE) + if(KOKKOS_CXX_COMPILER_ID STREQUAL Cray) + include(${KOKKOS_SRC_PATH}/cmake/cray.cmake) kokkos_set_cray_flags(${KOKKOS_CXX_STANDARD} ${KOKKOS_CXX_INTERMEDIATE_STANDARD}) - ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) - INCLUDE(${KOKKOS_SRC_PATH}/cmake/pgi.cmake) + elseif(KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) + include(${KOKKOS_SRC_PATH}/cmake/pgi.cmake) kokkos_set_pgi_flags(${KOKKOS_CXX_STANDARD} ${KOKKOS_CXX_INTERMEDIATE_STANDARD}) - ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL Intel) - INCLUDE(${KOKKOS_SRC_PATH}/cmake/intel.cmake) + elseif(KOKKOS_CXX_COMPILER_ID STREQUAL Intel) + include(${KOKKOS_SRC_PATH}/cmake/intel.cmake) kokkos_set_intel_flags(${KOKKOS_CXX_STANDARD} ${KOKKOS_CXX_INTERMEDIATE_STANDARD}) - ELSEIF((KOKKOS_CXX_COMPILER_ID STREQUAL "MSVC") OR ((KOKKOS_CXX_COMPILER_ID STREQUAL "NVIDIA") AND WIN32)) - INCLUDE(${KOKKOS_SRC_PATH}/cmake/msvc.cmake) + elseif((KOKKOS_CXX_COMPILER_ID STREQUAL "MSVC") OR ((KOKKOS_CXX_COMPILER_ID STREQUAL "NVIDIA") AND WIN32)) + include(${KOKKOS_SRC_PATH}/cmake/msvc.cmake) kokkos_set_msvc_flags(${KOKKOS_CXX_STANDARD} ${KOKKOS_CXX_INTERMEDIATE_STANDARD}) - ELSE() - INCLUDE(${KOKKOS_SRC_PATH}/cmake/gnu.cmake) + else() + include(${KOKKOS_SRC_PATH}/cmake/gnu.cmake) kokkos_set_gnu_flags(${KOKKOS_CXX_STANDARD} ${KOKKOS_CXX_INTERMEDIATE_STANDARD}) - ENDIF() + endif() #check that the compiler accepts the C++ standard flag - INCLUDE(CheckCXXCompilerFlag) - IF (DEFINED CXX_STD_FLAGS_ACCEPTED) - UNSET(CXX_STD_FLAGS_ACCEPTED CACHE) - ENDIF() - CHECK_CXX_COMPILER_FLAG("${KOKKOS_CXX_STANDARD_FLAG}" CXX_STD_FLAGS_ACCEPTED) - IF (NOT CXX_STD_FLAGS_ACCEPTED) - CHECK_CXX_COMPILER_FLAG("${KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG}" CXX_INT_STD_FLAGS_ACCEPTED) - IF (NOT CXX_INT_STD_FLAGS_ACCEPTED) - MESSAGE(FATAL_ERROR "${KOKKOS_CXX_COMPILER_ID} did not accept ${KOKKOS_CXX_STANDARD_FLAG} or ${KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG}. You likely need to reduce the level of the C++ standard from ${KOKKOS_CXX_STANDARD}") - ENDIF() - SET(KOKKOS_CXX_STANDARD_FLAG ${KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG}) - ENDIF() - MESSAGE(STATUS "Compiler features not supported, but ${KOKKOS_CXX_COMPILER_ID} accepts ${KOKKOS_CXX_STANDARD_FLAG}") -ENDIF() - - - - + include(CheckCXXCompilerFlag) + if(DEFINED CXX_STD_FLAGS_ACCEPTED) + unset(CXX_STD_FLAGS_ACCEPTED CACHE) + endif() + check_cxx_compiler_flag("${KOKKOS_CXX_STANDARD_FLAG}" CXX_STD_FLAGS_ACCEPTED) + if(NOT CXX_STD_FLAGS_ACCEPTED) + check_cxx_compiler_flag("${KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG}" CXX_INT_STD_FLAGS_ACCEPTED) + if(NOT CXX_INT_STD_FLAGS_ACCEPTED) + message( + FATAL_ERROR + "${KOKKOS_CXX_COMPILER_ID} did not accept ${KOKKOS_CXX_STANDARD_FLAG} or ${KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG}. You likely need to reduce the level of the C++ standard from ${KOKKOS_CXX_STANDARD}" + ) + endif() + set(KOKKOS_CXX_STANDARD_FLAG ${KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG}) + endif() + message(STATUS "Compiler features not supported, but ${KOKKOS_CXX_COMPILER_ID} accepts ${KOKKOS_CXX_STANDARD_FLAG}") +endif() diff --git a/packages/kokkos/cmake/kokkos_tpls.cmake b/packages/kokkos/cmake/kokkos_tpls.cmake index cda9e0d6004a..f43aff4d1f08 100644 --- a/packages/kokkos/cmake/kokkos_tpls.cmake +++ b/packages/kokkos/cmake/kokkos_tpls.cmake @@ -1,126 +1,120 @@ -KOKKOS_CFG_DEPENDS(TPLS OPTIONS) -KOKKOS_CFG_DEPENDS(TPLS DEVICES) -KOKKOS_CFG_DEPENDS(TPLS COMPILER_ID) +kokkos_cfg_depends(TPLS OPTIONS) +kokkos_cfg_depends(TPLS DEVICES) +kokkos_cfg_depends(TPLS COMPILER_ID) -FUNCTION(KOKKOS_TPL_OPTION PKG DEFAULT) - CMAKE_PARSE_ARGUMENTS(PARSED - "" - "TRIBITS" - "" - ${ARGN}) +function(KOKKOS_TPL_OPTION PKG DEFAULT) + cmake_parse_arguments(PARSED "" "TRIBITS" "" ${ARGN}) - IF (PARSED_TRIBITS) + if(PARSED_TRIBITS) #this is also a TPL option you can activate with Tribits - IF (NOT "${TPL_ENABLE_${PARSED_TRIBITS}}" STREQUAL "") + if(NOT "${TPL_ENABLE_${PARSED_TRIBITS}}" STREQUAL "") #Tribits brought its own default that should take precedence - SET(DEFAULT ${TPL_ENABLE_${PARSED_TRIBITS}}) - ENDIF() - ENDIF() + set(DEFAULT ${TPL_ENABLE_${PARSED_TRIBITS}}) + endif() + endif() - KOKKOS_ENABLE_OPTION(${PKG} ${DEFAULT} "Whether to enable the ${PKG} library") - KOKKOS_OPTION(${PKG}_DIR "" PATH "Location of ${PKG} library") - SET(KOKKOS_ENABLE_${PKG} ${KOKKOS_ENABLE_${PKG}} PARENT_SCOPE) - SET(KOKKOS_${PKG}_DIR ${KOKKOS_${PKG}_DIR} PARENT_SCOPE) + kokkos_enable_option(${PKG} ${DEFAULT} "Whether to enable the ${PKG} library") + kokkos_option(${PKG}_DIR "" PATH "Location of ${PKG} library") + set(KOKKOS_ENABLE_${PKG} ${KOKKOS_ENABLE_${PKG}} PARENT_SCOPE) + set(KOKKOS_${PKG}_DIR ${KOKKOS_${PKG}_DIR} PARENT_SCOPE) +endfunction() - IF (KOKKOS_HAS_TRILINOS - AND KOKKOS_ENABLE_${PKG} - AND NOT PARSED_TRIBITS) - #this TPL was enabled, but it is not valid to use inside of TriBITS - MESSAGE(FATAL_ERROR "Enabled TPL ${PKG} inside TriBITS build, " - "but this can only be enabled in a standalone build") - ENDIF() -ENDFUNCTION() - -KOKKOS_TPL_OPTION(HWLOC Off TRIBITS HWLOC) -KOKKOS_TPL_OPTION(CUDA ${Kokkos_ENABLE_CUDA} TRIBITS CUDA) -IF(KOKKOS_ENABLE_HIP AND NOT KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC AND NOT - KOKKOS_HAS_TRILINOS) - SET(ROCM_DEFAULT ON) -ELSE() - SET(ROCM_DEFAULT OFF) -ENDIF() -IF(KOKKOS_ENABLE_HIP AND NOT KOKKOS_HAS_TRILINOS) - SET(ROCTHRUST_DEFAULT ON) -ELSE() - SET(ROCTHRUST_DEFAULT OFF) -ENDIF() -KOKKOS_TPL_OPTION(ROCM ${ROCM_DEFAULT}) -KOKKOS_TPL_OPTION(ROCTHRUST ${ROCTHRUST_DEFAULT}) +kokkos_tpl_option(HWLOC Off TRIBITS HWLOC) +kokkos_tpl_option(CUDA ${Kokkos_ENABLE_CUDA} TRIBITS CUDA) +if(KOKKOS_ENABLE_HIP AND NOT KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC) + set(ROCM_DEFAULT ON) +else() + set(ROCM_DEFAULT OFF) +endif() +if(KOKKOS_ENABLE_HIP) + set(ROCTHRUST_DEFAULT ON) +else() + set(ROCTHRUST_DEFAULT OFF) +endif() +kokkos_tpl_option(ROCM ${ROCM_DEFAULT}) +kokkos_tpl_option(ROCTHRUST ${ROCTHRUST_DEFAULT}) +if(Kokkos_ENABLE_ROCTHRUST) + include(CheckCXXSourceCompiles) + check_cxx_source_compiles( + " + #include + int main() { + static_assert(_GLIBCXX_RELEASE < 9); + return 0; + } + " + Kokkos_ENABLE_IMPL_SKIP_NO_RTTI_FLAG + ) +endif() -IF(KOKKOS_ENABLE_SYCL AND NOT KOKKOS_HAS_TRILINOS) - SET(ONEDPL_DEFAULT ON) -ELSE() - SET(ONEDPL_DEFAULT OFF) -ENDIF() -KOKKOS_TPL_OPTION(ONEDPL ${ONEDPL_DEFAULT}) +if(KOKKOS_ENABLE_SYCL) + set(ONEDPL_DEFAULT ON) +else() + set(ONEDPL_DEFAULT OFF) +endif() +kokkos_tpl_option(ONEDPL ${ONEDPL_DEFAULT}) -IF (WIN32) - SET(LIBDL_DEFAULT Off) -ELSE() - SET(LIBDL_DEFAULT On) -ENDIF() -KOKKOS_TPL_OPTION(LIBDL ${LIBDL_DEFAULT} TRIBITS DLlib) +if(WIN32) + set(LIBDL_DEFAULT Off) +else() + set(LIBDL_DEFAULT On) +endif() +kokkos_tpl_option(LIBDL ${LIBDL_DEFAULT} TRIBITS DLlib) -IF(Trilinos_ENABLE_Kokkos AND TPL_ENABLE_HPX) -SET(HPX_DEFAULT ON) -ELSE() -SET(HPX_DEFAULT OFF) -ENDIF() -KOKKOS_TPL_OPTION(HPX ${HPX_DEFAULT}) +if(Trilinos_ENABLE_Kokkos AND TPL_ENABLE_HPX) + set(HPX_DEFAULT ON) +else() + set(HPX_DEFAULT OFF) +endif() +kokkos_tpl_option(HPX ${HPX_DEFAULT}) -KOKKOS_TPL_OPTION(THREADS ${Kokkos_ENABLE_THREADS} TRIBITS Pthread) +kokkos_tpl_option(THREADS ${Kokkos_ENABLE_THREADS} TRIBITS Pthread) -IF(Trilinos_ENABLE_Kokkos AND TPL_ENABLE_quadmath) - SET(LIBQUADMATH_DEFAULT ON) -ELSE() - SET(LIBQUADMATH_DEFAULT OFF) -ENDIF() -KOKKOS_TPL_OPTION(LIBQUADMATH ${LIBQUADMATH_DEFAULT} TRIBITS quadmath) +if(Trilinos_ENABLE_Kokkos AND TPL_ENABLE_quadmath) + set(LIBQUADMATH_DEFAULT ON) +else() + set(LIBQUADMATH_DEFAULT OFF) +endif() +kokkos_tpl_option(LIBQUADMATH ${LIBQUADMATH_DEFAULT} TRIBITS quadmath) #Make sure we use our local FindKokkosCuda.cmake -KOKKOS_IMPORT_TPL(HPX INTERFACE) -KOKKOS_IMPORT_TPL(CUDA INTERFACE) -KOKKOS_IMPORT_TPL(HWLOC) -KOKKOS_IMPORT_TPL(LIBDL) -IF (NOT WIN32) - KOKKOS_IMPORT_TPL(THREADS INTERFACE) -ENDIF() -IF (NOT KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) - KOKKOS_IMPORT_TPL(ROCM INTERFACE) -ENDIF() -KOKKOS_IMPORT_TPL(ONEDPL INTERFACE) -KOKKOS_IMPORT_TPL(LIBQUADMATH) -KOKKOS_IMPORT_TPL(ROCTHRUST) +kokkos_import_tpl(HPX INTERFACE) +kokkos_import_tpl(CUDA INTERFACE) +kokkos_import_tpl(HWLOC) +kokkos_import_tpl(LIBDL) +if(NOT WIN32) + kokkos_import_tpl(THREADS INTERFACE) +endif() +if(NOT KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) + kokkos_import_tpl(ROCM INTERFACE) +endif() +kokkos_import_tpl(ONEDPL INTERFACE) +kokkos_import_tpl(LIBQUADMATH) +kokkos_import_tpl(ROCTHRUST) -IF (Kokkos_ENABLE_DESUL_ATOMICS_EXTERNAL) +if(Kokkos_ENABLE_DESUL_ATOMICS_EXTERNAL) find_package(desul REQUIRED COMPONENTS atomics) - KOKKOS_EXPORT_CMAKE_TPL(desul REQUIRED COMPONENTS atomics) -ENDIF() + kokkos_export_cmake_tpl(desul REQUIRED COMPONENTS atomics) +endif() -if (Kokkos_ENABLE_IMPL_MDSPAN AND Kokkos_ENABLE_MDSPAN_EXTERNAL) +if(Kokkos_ENABLE_IMPL_MDSPAN AND Kokkos_ENABLE_MDSPAN_EXTERNAL) find_package(mdspan REQUIRED) - KOKKOS_EXPORT_CMAKE_TPL(mdspan REQUIRED) + kokkos_export_cmake_tpl(mdspan REQUIRED) endif() -IF (Kokkos_ENABLE_OPENMP) - find_package(OpenMP REQUIRED COMPONENTS CXX) - # FIXME_TRILINOS Trilinos doesn't allow for Kokkos to use find_dependency - # so we just append the flags here instead of linking with the OpenMP target. - IF(KOKKOS_HAS_TRILINOS) - COMPILER_SPECIFIC_FLAGS(DEFAULT ${OpenMP_CXX_FLAGS}) - ELSE() - KOKKOS_EXPORT_CMAKE_TPL(OpenMP REQUIRED COMPONENTS CXX) - ENDIF() - IF(Kokkos_ENABLE_HIP AND KOKKOS_COMPILE_LANGUAGE STREQUAL HIP) - GLOBAL_APPEND(KOKKOS_AMDGPU_OPTIONS ${OpenMP_CXX_FLAGS}) - ENDIF() - IF(Kokkos_ENABLE_CUDA AND KOKKOS_COMPILE_LANGUAGE STREQUAL CUDA) - GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS -Xcompiler ${OpenMP_CXX_FLAGS}) - ENDIF() -ENDIF() +if(Kokkos_ENABLE_OPENMP) + find_package(OpenMP 3.0 REQUIRED COMPONENTS CXX) + kokkos_export_cmake_tpl(OpenMP REQUIRED COMPONENTS CXX) + if(Kokkos_ENABLE_HIP AND KOKKOS_COMPILE_LANGUAGE STREQUAL HIP) + global_append(KOKKOS_AMDGPU_OPTIONS ${OpenMP_CXX_FLAGS}) + endif() + if(Kokkos_ENABLE_CUDA AND KOKKOS_COMPILE_LANGUAGE STREQUAL CUDA) + global_append(KOKKOS_CUDA_OPTIONS -Xcompiler ${OpenMP_CXX_FLAGS}) + endif() +endif() #Convert list to newlines (which CMake doesn't always like in cache variables) -STRING(REPLACE ";" "\n" KOKKOS_TPL_EXPORT_TEMP "${KOKKOS_TPL_EXPORTS}") +string(REPLACE ";" "\n" KOKKOS_TPL_EXPORT_TEMP "${KOKKOS_TPL_EXPORTS}") #Convert to a regular variable -UNSET(KOKKOS_TPL_EXPORTS CACHE) -SET(KOKKOS_TPL_EXPORTS ${KOKKOS_TPL_EXPORT_TEMP}) +unset(KOKKOS_TPL_EXPORTS CACHE) +set(KOKKOS_TPL_EXPORTS ${KOKKOS_TPL_EXPORT_TEMP}) diff --git a/packages/kokkos/cmake/kokkos_tribits.cmake b/packages/kokkos/cmake/kokkos_tribits.cmake index 6da543a2c85b..2fda803b1181 100644 --- a/packages/kokkos/cmake/kokkos_tribits.cmake +++ b/packages/kokkos/cmake/kokkos_tribits.cmake @@ -1,82 +1,47 @@ #These are tribits wrappers only ever called by Kokkos itself -INCLUDE(CMakeParseArguments) -INCLUDE(CTest) -INCLUDE(GNUInstallDirs) +include(CMakeParseArguments) +include(CTest) +include(GNUInstallDirs) -MESSAGE(STATUS "The project name is: ${PROJECT_NAME}") +message(STATUS "The project name is: ${PROJECT_NAME}") -IF(GTest_FOUND) - SET(KOKKOS_GTEST_LIB GTest::gtest) - MESSAGE(STATUS "Using gtest found in ${GTest_DIR}") -ELSE() # fallback to internal gtest - SET(KOKKOS_GTEST_LIB kokkos_gtest) - MESSAGE(STATUS "Using internal gtest for testing") -ENDIF() +if(GTest_FOUND) + set(KOKKOS_GTEST_LIB GTest::gtest) + message(STATUS "Using gtest found in ${GTest_DIR}") +else() # fallback to internal gtest + set(KOKKOS_GTEST_LIB kokkos_gtest) + message(STATUS "Using internal gtest for testing") +endif() -FUNCTION(VERIFY_EMPTY CONTEXT) +function(VERIFY_EMPTY CONTEXT) if(${ARGN}) - MESSAGE(FATAL_ERROR "Kokkos does not support all of Tribits. Unhandled arguments in ${CONTEXT}:\n${ARGN}") + message(FATAL_ERROR "Kokkos does not support all of Tribits. Unhandled arguments in ${CONTEXT}:\n${ARGN}") endif() -ENDFUNCTION() - -#Leave this here for now - but only do for tribits -#This breaks the standalone CMake -IF (KOKKOS_HAS_TRILINOS) - IF(NOT DEFINED ${PROJECT_NAME}_ENABLE_OpenMP) - SET(${PROJECT_NAME}_ENABLE_OpenMP OFF) - ENDIF() - - IF(NOT DEFINED ${PROJECT_NAME}_ENABLE_HPX) - SET(${PROJECT_NAME}_ENABLE_HPX OFF) - ENDIF() - - IF(NOT DEFINED ${PROJECT_NAME}_ENABLE_DEBUG) - SET(${PROJECT_NAME}_ENABLE_DEBUG OFF) - ENDIF() - - IF(NOT DEFINED ${PROJECT_NAME}_ENABLE_TESTS) - SET(${PROJECT_NAME}_ENABLE_TESTS OFF) - ENDIF() - - IF(NOT DEFINED TPL_ENABLE_Pthread) - SET(TPL_ENABLE_Pthread OFF) - ENDIF() -ENDIF() - -MACRO(KOKKOS_PROCESS_SUBPACKAGES) - ADD_SUBDIRECTORY(core) - ADD_SUBDIRECTORY(containers) - ADD_SUBDIRECTORY(algorithms) - ADD_SUBDIRECTORY(simd) - if (NOT KOKKOS_HAS_TRILINOS) - ADD_SUBDIRECTORY(example) - ADD_SUBDIRECTORY(benchmarks) - endif() -ENDMACRO() - -MACRO(KOKKOS_PACKAGE_DEF) - if (KOKKOS_HAS_TRILINOS) - TRIBITS_PACKAGE_DEF() - else() - #do nothing - endif() -ENDMACRO() - -MACRO(KOKKOS_INTERNAL_ADD_LIBRARY_INSTALL LIBRARY_NAME) - KOKKOS_LIB_TYPE(${LIBRARY_NAME} INCTYPE) - TARGET_INCLUDE_DIRECTORIES(${LIBRARY_NAME} ${INCTYPE} $) - - INSTALL( +endfunction() + +macro(KOKKOS_PROCESS_SUBPACKAGES) + add_subdirectory(core) + add_subdirectory(containers) + add_subdirectory(algorithms) + add_subdirectory(simd) + add_subdirectory(example) + add_subdirectory(benchmarks) +endmacro() + +macro(KOKKOS_INTERNAL_ADD_LIBRARY_INSTALL LIBRARY_NAME) + kokkos_lib_type(${LIBRARY_NAME} INCTYPE) + target_include_directories(${LIBRARY_NAME} ${INCTYPE} $) + + install( TARGETS ${LIBRARY_NAME} EXPORT ${PROJECT_NAME} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} - ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} - COMPONENT ${PACKAGE_NAME} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT ${PACKAGE_NAME} ) - INSTALL( + install( TARGETS ${LIBRARY_NAME} EXPORT KokkosTargets RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} @@ -84,157 +49,131 @@ MACRO(KOKKOS_INTERNAL_ADD_LIBRARY_INSTALL LIBRARY_NAME) ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} ) - VERIFY_EMPTY(KOKKOS_ADD_LIBRARY ${PARSE_UNPARSED_ARGUMENTS}) -ENDMACRO() + verify_empty(KOKKOS_ADD_LIBRARY ${PARSE_UNPARSED_ARGUMENTS}) +endmacro() + +function(KOKKOS_ADD_EXECUTABLE ROOT_NAME) + cmake_parse_arguments(PARSE "TESTONLY" "" "SOURCES;TESTONLYLIBS" ${ARGN}) -FUNCTION(KOKKOS_ADD_EXECUTABLE ROOT_NAME) - if (KOKKOS_HAS_TRILINOS) - TRIBITS_ADD_EXECUTABLE(${ROOT_NAME} ${ARGN}) + set_source_files_properties(${PARSE_SOURCES} PROPERTIES LANGUAGE ${KOKKOS_COMPILE_LANGUAGE}) + + set(EXE_NAME ${PACKAGE_NAME}_${ROOT_NAME}) + add_executable(${EXE_NAME} ${PARSE_SOURCES}) + if(PARSE_TESTONLYLIBS) + target_link_libraries(${EXE_NAME} PRIVATE ${PARSE_TESTONLYLIBS}) + endif() + verify_empty(KOKKOS_ADD_EXECUTABLE ${PARSE_UNPARSED_ARGUMENTS}) + #All executables must link to all the kokkos targets + #This is just private linkage because exe is final + target_link_libraries(${EXE_NAME} PRIVATE Kokkos::kokkos) +endfunction() + +function(KOKKOS_ADD_EXECUTABLE_AND_TEST ROOT_NAME) + cmake_parse_arguments(PARSE "" "" "SOURCES;CATEGORIES;ARGS" ${ARGN}) + verify_empty(KOKKOS_ADD_EXECUTABLE_AND_TEST ${PARSE_UNPARSED_ARGUMENTS}) + + kokkos_add_test_executable(${ROOT_NAME} SOURCES ${PARSE_SOURCES}) + if(PARSE_ARGS) + set(TEST_NUMBER 0) + foreach(ARG_STR ${PARSE_ARGS}) + # This is passed as a single string blob to match TriBITS behavior + # We need this to be turned into a list + string(REPLACE " " ";" ARG_STR_LIST ${ARG_STR}) + list(APPEND TEST_NAME "${ROOT_NAME}${TEST_NUMBER}") + math(EXPR TEST_NUMBER "${TEST_NUMBER} + 1") + kokkos_add_test( + NAME + ${TEST_NAME} + EXE + ${ROOT_NAME} + FAIL_REGULAR_EXPRESSION + " FAILED " + ARGS + ${ARG_STR_LIST} + ) + endforeach() else() - CMAKE_PARSE_ARGUMENTS(PARSE - "TESTONLY" - "" - "SOURCES;TESTONLYLIBS" - ${ARGN}) - - SET_SOURCE_FILES_PROPERTIES(${PARSE_SOURCES} PROPERTIES LANGUAGE ${KOKKOS_COMPILE_LANGUAGE}) - - SET(EXE_NAME ${PACKAGE_NAME}_${ROOT_NAME}) - ADD_EXECUTABLE(${EXE_NAME} ${PARSE_SOURCES}) - IF (PARSE_TESTONLYLIBS) - TARGET_LINK_LIBRARIES(${EXE_NAME} PRIVATE ${PARSE_TESTONLYLIBS}) - ENDIF() - VERIFY_EMPTY(KOKKOS_ADD_EXECUTABLE ${PARSE_UNPARSED_ARGUMENTS}) - #All executables must link to all the kokkos targets - #This is just private linkage because exe is final - TARGET_LINK_LIBRARIES(${EXE_NAME} PRIVATE Kokkos::kokkos) + kokkos_add_test(NAME ${ROOT_NAME} EXE ${ROOT_NAME} FAIL_REGULAR_EXPRESSION " FAILED ") + endif() + # We noticed problems with -fvisibility=hidden for inline static variables + # if Kokkos was built as shared library. + if(BUILD_SHARED_LIBS AND NOT ${TEST_NAME}_DISABLE) + set_property(TARGET ${EXE_NAME} PROPERTY VISIBILITY_INLINES_HIDDEN ON) + set_property(TARGET ${EXE_NAME} PROPERTY CXX_VISIBILITY_PRESET hidden) + endif() + if(NOT + (Kokkos_INSTALL_TESTING + OR Kokkos_ENABLE_SYCL + OR Kokkos_ENABLE_HPX + OR Kokkos_ENABLE_IMPL_SKIP_NO_RTTI_FLAG + OR (KOKKOS_CXX_COMPILER_ID STREQUAL "Intel" AND KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 2021.2.0) + OR (KOKKOS_CXX_COMPILER_ID STREQUAL "NVIDIA" AND KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 11.3.0) + OR (KOKKOS_CXX_COMPILER_ID STREQUAL "NVIDIA" AND KOKKOS_CXX_HOST_COMPILER_ID STREQUAL "MSVC")) + ) + if(MSVC) + target_compile_options(${PACKAGE_NAME}_${ROOT_NAME} PRIVATE "/GR-") + else() + target_compile_options(${PACKAGE_NAME}_${ROOT_NAME} PRIVATE "-fno-rtti") + endif() + endif() +endfunction() + +function(KOKKOS_SET_EXE_PROPERTY ROOT_NAME) + set(TARGET_NAME ${PACKAGE_NAME}_${ROOT_NAME}) + if(NOT TARGET ${TARGET_NAME}) + message(SEND_ERROR "No target ${TARGET_NAME} exists - cannot set target properties") endif() -ENDFUNCTION() - -FUNCTION(KOKKOS_ADD_EXECUTABLE_AND_TEST ROOT_NAME) - CMAKE_PARSE_ARGUMENTS(PARSE - "" - "" - "SOURCES;CATEGORIES;ARGS" - ${ARGN}) - VERIFY_EMPTY(KOKKOS_ADD_EXECUTABLE_AND_TEST ${PARSE_UNPARSED_ARGUMENTS}) - - IF (KOKKOS_HAS_TRILINOS) - IF(DEFINED PARSE_ARGS) - STRING(REPLACE ";" " " PARSE_ARGS "${PARSE_ARGS}") - ENDIF() - TRIBITS_ADD_EXECUTABLE_AND_TEST( - ${ROOT_NAME} - SOURCES ${PARSE_SOURCES} - TESTONLYLIBS ${KOKKOS_GTEST_LIB} - NUM_MPI_PROCS 1 - COMM serial mpi - ARGS ${PARSE_ARGS} - CATEGORIES ${PARSE_CATEGORIES} - SOURCES ${PARSE_SOURCES} - FAIL_REGULAR_EXPRESSION " FAILED " - ARGS ${PARSE_ARGS} - ) - ELSE() - KOKKOS_ADD_TEST_EXECUTABLE(${ROOT_NAME} - SOURCES ${PARSE_SOURCES} - ) - IF (PARSE_ARGS) - SET(TEST_NUMBER 0) - FOREACH (ARG_STR ${PARSE_ARGS}) - # This is passed as a single string blob to match TriBITS behavior - # We need this to be turned into a list - STRING(REPLACE " " ";" ARG_STR_LIST ${ARG_STR}) - LIST(APPEND TEST_NAME "${ROOT_NAME}${TEST_NUMBER}") - MATH(EXPR TEST_NUMBER "${TEST_NUMBER} + 1") - KOKKOS_ADD_TEST(NAME ${TEST_NAME} - EXE ${ROOT_NAME} - FAIL_REGULAR_EXPRESSION " FAILED " - ARGS ${ARG_STR_LIST} - ) - ENDFOREACH() - ELSE() - KOKKOS_ADD_TEST(NAME ${ROOT_NAME} - EXE ${ROOT_NAME} - FAIL_REGULAR_EXPRESSION " FAILED " - ) - ENDIF() - ENDIF() - # We noticed problems with -fvisibility=hidden for inline static variables - # if Kokkos was built as shared library. - IF(BUILD_SHARED_LIBS) - SET_PROPERTY(TARGET ${PACKAGE_NAME}_${ROOT_NAME} PROPERTY VISIBILITY_INLINES_HIDDEN ON) - SET_PROPERTY(TARGET ${PACKAGE_NAME}_${ROOT_NAME} PROPERTY CXX_VISIBILITY_PRESET hidden) - ENDIF() -ENDFUNCTION() - -FUNCTION(KOKKOS_SET_EXE_PROPERTY ROOT_NAME) - SET(TARGET_NAME ${PACKAGE_NAME}_${ROOT_NAME}) - IF (NOT TARGET ${TARGET_NAME}) - MESSAGE(SEND_ERROR "No target ${TARGET_NAME} exists - cannot set target properties") - ENDIF() - SET_PROPERTY(TARGET ${TARGET_NAME} PROPERTY ${ARGN}) -ENDFUNCTION() - -MACRO(KOKKOS_SETUP_BUILD_ENVIRONMENT) + set_property(TARGET ${TARGET_NAME} PROPERTY ${ARGN}) +endfunction() + +macro(KOKKOS_SETUP_BUILD_ENVIRONMENT) # This is needed for both regular build and install tests - INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_compiler_id.cmake) + include(${KOKKOS_SRC_PATH}/cmake/kokkos_compiler_id.cmake) #set an internal option, if not already set - SET(Kokkos_INSTALL_TESTING OFF CACHE INTERNAL "Whether to build tests and examples against installation") - IF (Kokkos_INSTALL_TESTING) - SET(KOKKOS_ENABLE_TESTS ON) - SET(KOKKOS_ENABLE_BENCHMARKS ON) - SET(KOKKOS_ENABLE_EXAMPLES ON) + set(Kokkos_INSTALL_TESTING OFF CACHE INTERNAL "Whether to build tests and examples against installation") + if(Kokkos_INSTALL_TESTING) + set(KOKKOS_ENABLE_TESTS ON) + set(KOKKOS_ENABLE_BENCHMARKS ON) + set(KOKKOS_ENABLE_EXAMPLES ON) # This looks a little weird, but what we are doing # is to NOT build Kokkos but instead look for an # installed Kokkos - then build examples and tests # against that installed Kokkos - FIND_PACKAGE(Kokkos REQUIRED) + find_package(Kokkos REQUIRED) # Just grab the configuration from the installation - FOREACH(DEV ${Kokkos_DEVICES}) - SET(KOKKOS_ENABLE_${DEV} ON) - ENDFOREACH() - FOREACH(OPT ${Kokkos_OPTIONS}) - SET(KOKKOS_ENABLE_${OPT} ON) - ENDFOREACH() - FOREACH(TPL ${Kokkos_TPLS}) - SET(KOKKOS_ENABLE_${TPL} ON) - ENDFOREACH() - FOREACH(ARCH ${Kokkos_ARCH}) - SET(KOKKOS_ARCH_${ARCH} ON) - ENDFOREACH() - ELSE() - INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_enable_devices.cmake) - INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_enable_options.cmake) - INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_test_cxx_std.cmake) - INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_arch.cmake) - IF (NOT KOKKOS_HAS_TRILINOS) - SET(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${Kokkos_SOURCE_DIR}/cmake/Modules/") - ENDIF() - INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_tpls.cmake) - INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_corner_cases.cmake) - ENDIF() -ENDMACRO() - -MACRO(KOKKOS_ADD_TEST_EXECUTABLE ROOT_NAME) - CMAKE_PARSE_ARGUMENTS(PARSE - "" - "" - "SOURCES" - ${ARGN}) - KOKKOS_ADD_EXECUTABLE(${ROOT_NAME} - SOURCES ${PARSE_SOURCES} - ${PARSE_UNPARSED_ARGUMENTS} - TESTONLYLIBS ${KOKKOS_GTEST_LIB} - ) - SET(EXE_NAME ${PACKAGE_NAME}_${ROOT_NAME}) -ENDMACRO() - -MACRO(KOKKOS_PACKAGE_POSTPROCESS) - if (KOKKOS_HAS_TRILINOS) - TRIBITS_PACKAGE_POSTPROCESS() + foreach(DEV ${Kokkos_DEVICES}) + set(KOKKOS_ENABLE_${DEV} ON) + endforeach() + foreach(OPT ${Kokkos_OPTIONS}) + set(KOKKOS_ENABLE_${OPT} ON) + endforeach() + foreach(TPL ${Kokkos_TPLS}) + set(KOKKOS_ENABLE_${TPL} ON) + endforeach() + foreach(ARCH ${Kokkos_ARCH}) + set(KOKKOS_ARCH_${ARCH} ON) + endforeach() + else() + include(${KOKKOS_SRC_PATH}/cmake/kokkos_enable_devices.cmake) + include(${KOKKOS_SRC_PATH}/cmake/kokkos_enable_options.cmake) + include(${KOKKOS_SRC_PATH}/cmake/kokkos_test_cxx_std.cmake) + include(${KOKKOS_SRC_PATH}/cmake/kokkos_arch.cmake) + set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${Kokkos_SOURCE_DIR}/cmake/Modules/") + include(${KOKKOS_SRC_PATH}/cmake/kokkos_tpls.cmake) + include(${KOKKOS_SRC_PATH}/cmake/kokkos_corner_cases.cmake) endif() -ENDMACRO() +endmacro() + +macro(KOKKOS_ADD_TEST_EXECUTABLE ROOT_NAME) + cmake_parse_arguments(PARSE "" "" "SOURCES" ${ARGN}) + # Don't do anything if the user disabled the test + if(NOT ${PACKAGE_NAME}_${ROOT_NAME}_DISABLE) + kokkos_add_executable( + ${ROOT_NAME} SOURCES ${PARSE_SOURCES} ${PARSE_UNPARSED_ARGUMENTS} TESTONLYLIBS ${KOKKOS_GTEST_LIB} + ) + set(EXE_NAME ${PACKAGE_NAME}_${ROOT_NAME}) + endif() +endmacro() ## KOKKOS_CONFIGURE_CORE Configure/Generate header files for core content based ## on enabled backends. @@ -242,265 +181,214 @@ ENDMACRO() ## KOKKOS_SETUP is included in Kokkos_Macros.hpp and include prefix includes/defines ## KOKKOS_DECLARE is the declaration set ## KOKKOS_POST_INCLUDE is included at the end of Kokkos_Core.hpp -MACRO(KOKKOS_CONFIGURE_CORE) - MESSAGE(STATUS "Kokkos Backends: ${KOKKOS_ENABLED_DEVICES}") - KOKKOS_CONFIG_HEADER( KokkosCore_Config_HeaderSet.in KokkosCore_Config_FwdBackend.hpp "KOKKOS_FWD" "fwd/Kokkos_Fwd" "${KOKKOS_ENABLED_DEVICES}") - KOKKOS_CONFIG_HEADER( KokkosCore_Config_HeaderSet.in KokkosCore_Config_SetupBackend.hpp "KOKKOS_SETUP" "setup/Kokkos_Setup" "${DEVICE_SETUP_LIST}") - KOKKOS_CONFIG_HEADER( KokkosCore_Config_HeaderSet.in KokkosCore_Config_DeclareBackend.hpp "KOKKOS_DECLARE" "decl/Kokkos_Declare" "${KOKKOS_ENABLED_DEVICES}") - CONFIGURE_FILE(cmake/KokkosCore_config.h.in KokkosCore_config.h @ONLY) -ENDMACRO() +macro(KOKKOS_CONFIGURE_CORE) + message(STATUS "Kokkos Backends: ${KOKKOS_ENABLED_DEVICES}") + kokkos_config_header( + KokkosCore_Config_HeaderSet.in KokkosCore_Config_FwdBackend.hpp "KOKKOS_FWD" "fwd/Kokkos_Fwd" + "${KOKKOS_ENABLED_DEVICES}" + ) + kokkos_config_header( + KokkosCore_Config_HeaderSet.in KokkosCore_Config_SetupBackend.hpp "KOKKOS_SETUP" "setup/Kokkos_Setup" + "${DEVICE_SETUP_LIST}" + ) + kokkos_config_header( + KokkosCore_Config_HeaderSet.in KokkosCore_Config_DeclareBackend.hpp "KOKKOS_DECLARE" "decl/Kokkos_Declare" + "${KOKKOS_ENABLED_DEVICES}" + ) + configure_file(cmake/KokkosCore_config.h.in KokkosCore_config.h @ONLY) +endmacro() ## KOKKOS_INSTALL_ADDITIONAL_FILES - instruct cmake to install files in target destination. ## Includes generated header files, scripts such as nvcc_wrapper and hpcbind, ## as well as other files provided through plugins. -MACRO(KOKKOS_INSTALL_ADDITIONAL_FILES) +macro(KOKKOS_INSTALL_ADDITIONAL_FILES) # kokkos_launch_compiler is used by Kokkos to prefix compiler commands so that they forward to original kokkos compiler # if nvcc_wrapper was not used as CMAKE_CXX_COMPILER, configure the original compiler into kokkos_launch_compiler - IF(NOT "${CMAKE_CXX_COMPILER}" MATCHES "nvcc_wrapper") - SET(NVCC_WRAPPER_DEFAULT_COMPILER "${CMAKE_CXX_COMPILER}") - ELSE() - IF(NOT "$ENV{NVCC_WRAPPER_DEFAULT_COMPILER}" STREQUAL "") - SET(NVCC_WRAPPER_DEFAULT_COMPILER "$ENV{NVCC_WRAPPER_DEFAULT_COMPILER}") - ENDIF() - ENDIF() - - CONFIGURE_FILE(${CMAKE_CURRENT_SOURCE_DIR}/bin/kokkos_launch_compiler - ${PROJECT_BINARY_DIR}/temp/kokkos_launch_compiler - @ONLY) - - INSTALL(PROGRAMS - "${CMAKE_CURRENT_SOURCE_DIR}/bin/nvcc_wrapper" - "${CMAKE_CURRENT_SOURCE_DIR}/bin/hpcbind" - "${PROJECT_BINARY_DIR}/temp/kokkos_launch_compiler" - DESTINATION ${CMAKE_INSTALL_BINDIR}) - INSTALL(FILES - "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_config.h" + if(NOT "${CMAKE_CXX_COMPILER}" MATCHES "nvcc_wrapper") + set(NVCC_WRAPPER_DEFAULT_COMPILER "${CMAKE_CXX_COMPILER}") + else() + if(NOT "$ENV{NVCC_WRAPPER_DEFAULT_COMPILER}" STREQUAL "") + set(NVCC_WRAPPER_DEFAULT_COMPILER "$ENV{NVCC_WRAPPER_DEFAULT_COMPILER}") + endif() + endif() + + configure_file( + ${CMAKE_CURRENT_SOURCE_DIR}/bin/kokkos_launch_compiler ${PROJECT_BINARY_DIR}/temp/kokkos_launch_compiler @ONLY + ) + + install(PROGRAMS "${CMAKE_CURRENT_SOURCE_DIR}/bin/nvcc_wrapper" "${CMAKE_CURRENT_SOURCE_DIR}/bin/hpcbind" + "${PROJECT_BINARY_DIR}/temp/kokkos_launch_compiler" DESTINATION ${CMAKE_INSTALL_BINDIR} + ) + install( + FILES "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_config.h" "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_Config_FwdBackend.hpp" "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_Config_SetupBackend.hpp" "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_Config_DeclareBackend.hpp" - DESTINATION ${KOKKOS_HEADER_DIR}) -ENDMACRO() - + DESTINATION ${KOKKOS_HEADER_DIR} + ) +endmacro() -FUNCTION(KOKKOS_SET_LIBRARY_PROPERTIES LIBRARY_NAME) - CMAKE_PARSE_ARGUMENTS(PARSE - "PLAIN_STYLE" - "" - "" - ${ARGN}) +function(KOKKOS_SET_LIBRARY_PROPERTIES LIBRARY_NAME) + cmake_parse_arguments(PARSE "PLAIN_STYLE" "" "" ${ARGN}) - IF((NOT KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) AND (${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.18")) + if((NOT KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) AND (${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.18")) #I can use link options #check for CXX linkage using the simple 3.18 way - TARGET_LINK_OPTIONS( - ${LIBRARY_NAME} PUBLIC - $<$:${KOKKOS_LINK_OPTIONS}> - ) - ELSE() + target_link_options(${LIBRARY_NAME} PUBLIC $<$:${KOKKOS_LINK_OPTIONS}>) + else() #I can use link options #just assume CXX linkage - TARGET_LINK_OPTIONS( - ${LIBRARY_NAME} PUBLIC ${KOKKOS_LINK_OPTIONS} - ) - ENDIF() + target_link_options(${LIBRARY_NAME} PUBLIC ${KOKKOS_LINK_OPTIONS}) + endif() - TARGET_COMPILE_OPTIONS( - ${LIBRARY_NAME} PUBLIC - $<$:${KOKKOS_COMPILE_OPTIONS}> + target_compile_options( + ${LIBRARY_NAME} PUBLIC $<$:${KOKKOS_COMPILE_OPTIONS}> ) - TARGET_COMPILE_DEFINITIONS( - ${LIBRARY_NAME} PUBLIC - $<$:${KOKKOS_COMPILE_DEFINITIONS}> + target_compile_definitions( + ${LIBRARY_NAME} PUBLIC $<$:${KOKKOS_COMPILE_DEFINITIONS}> ) - TARGET_LINK_LIBRARIES( - ${LIBRARY_NAME} PUBLIC ${KOKKOS_LINK_LIBRARIES} - ) + target_link_libraries(${LIBRARY_NAME} PUBLIC ${KOKKOS_LINK_LIBRARIES}) - IF (KOKKOS_ENABLE_CUDA) - TARGET_COMPILE_OPTIONS( - ${LIBRARY_NAME} - PUBLIC $<$:${KOKKOS_CUDA_OPTIONS}> + if(KOKKOS_ENABLE_CUDA) + target_compile_options( + ${LIBRARY_NAME} PUBLIC $<$:${KOKKOS_CUDA_OPTIONS}> ) - SET(NODEDUP_CUDAFE_OPTIONS) - FOREACH(OPT ${KOKKOS_CUDAFE_OPTIONS}) - LIST(APPEND NODEDUP_CUDAFE_OPTIONS -Xcudafe ${OPT}) - ENDFOREACH() - TARGET_COMPILE_OPTIONS( - ${LIBRARY_NAME} - PUBLIC $<$:${NODEDUP_CUDAFE_OPTIONS}> + set(NODEDUP_CUDAFE_OPTIONS) + foreach(OPT ${KOKKOS_CUDAFE_OPTIONS}) + list(APPEND NODEDUP_CUDAFE_OPTIONS -Xcudafe ${OPT}) + endforeach() + target_compile_options( + ${LIBRARY_NAME} PUBLIC $<$:${NODEDUP_CUDAFE_OPTIONS}> ) - ENDIF() + endif() - IF (KOKKOS_ENABLE_HIP) - TARGET_COMPILE_OPTIONS( - ${LIBRARY_NAME} - PUBLIC $<$:${KOKKOS_AMDGPU_OPTIONS}> + if(KOKKOS_ENABLE_HIP) + target_compile_options( + ${LIBRARY_NAME} PUBLIC $<$:${KOKKOS_AMDGPU_OPTIONS}> ) - ENDIF() - - LIST(LENGTH KOKKOS_XCOMPILER_OPTIONS XOPT_LENGTH) - IF (XOPT_LENGTH GREATER 1) - MESSAGE(FATAL_ERROR "CMake deduplication does not allow multiple -Xcompiler flags (${KOKKOS_XCOMPILER_OPTIONS}): will require Kokkos to upgrade to minimum 3.12") - ENDIF() - IF(KOKKOS_XCOMPILER_OPTIONS) - SET(NODEDUP_XCOMPILER_OPTIONS) - FOREACH(OPT ${KOKKOS_XCOMPILER_OPTIONS}) + endif() + + list(LENGTH KOKKOS_XCOMPILER_OPTIONS XOPT_LENGTH) + if(XOPT_LENGTH GREATER 1) + message( + FATAL_ERROR + "CMake deduplication does not allow multiple -Xcompiler flags (${KOKKOS_XCOMPILER_OPTIONS}): will require Kokkos to upgrade to minimum 3.12" + ) + endif() + if(KOKKOS_XCOMPILER_OPTIONS) + set(NODEDUP_XCOMPILER_OPTIONS) + foreach(OPT ${KOKKOS_XCOMPILER_OPTIONS}) #I have to do this for now because we can't guarantee 3.12 support #I really should do this with the shell option - LIST(APPEND NODEDUP_XCOMPILER_OPTIONS -Xcompiler) - LIST(APPEND NODEDUP_XCOMPILER_OPTIONS ${OPT}) - ENDFOREACH() - TARGET_COMPILE_OPTIONS( - ${LIBRARY_NAME} - PUBLIC $<$:${NODEDUP_XCOMPILER_OPTIONS}> + list(APPEND NODEDUP_XCOMPILER_OPTIONS -Xcompiler) + list(APPEND NODEDUP_XCOMPILER_OPTIONS ${OPT}) + endforeach() + target_compile_options( + ${LIBRARY_NAME} PUBLIC $<$:${NODEDUP_XCOMPILER_OPTIONS}> ) - ENDIF() + endif() - IF (KOKKOS_CXX_STANDARD_FEATURE) + if(KOKKOS_CXX_STANDARD_FEATURE) #GREAT! I can do this the right way - TARGET_COMPILE_FEATURES(${LIBRARY_NAME} PUBLIC ${KOKKOS_CXX_STANDARD_FEATURE}) - IF (NOT KOKKOS_USE_CXX_EXTENSIONS) - SET_TARGET_PROPERTIES(${LIBRARY_NAME} PROPERTIES CXX_EXTENSIONS OFF) - ENDIF() - ELSE() + target_compile_features(${LIBRARY_NAME} PUBLIC ${KOKKOS_CXX_STANDARD_FEATURE}) + if(NOT KOKKOS_USE_CXX_EXTENSIONS) + set_target_properties(${LIBRARY_NAME} PROPERTIES CXX_EXTENSIONS OFF) + endif() + else() #OH, well, no choice but the wrong way - TARGET_COMPILE_OPTIONS(${LIBRARY_NAME} PUBLIC ${KOKKOS_CXX_STANDARD_FLAG}) - ENDIF() -ENDFUNCTION() - - -FUNCTION(KOKKOS_INTERNAL_ADD_LIBRARY LIBRARY_NAME) - CMAKE_PARSE_ARGUMENTS(PARSE - "STATIC;SHARED" - "" - "HEADERS;SOURCES" - ${ARGN}) - - IF(PARSE_HEADERS) - LIST(REMOVE_DUPLICATES PARSE_HEADERS) - ENDIF() - IF(PARSE_SOURCES) - LIST(REMOVE_DUPLICATES PARSE_SOURCES) - ENDIF() - FOREACH(source ${PARSE_SOURCES}) + target_compile_options(${LIBRARY_NAME} PUBLIC ${KOKKOS_CXX_STANDARD_FLAG}) + endif() +endfunction() + +function(KOKKOS_INTERNAL_ADD_LIBRARY LIBRARY_NAME) + cmake_parse_arguments(PARSE "STATIC;SHARED" "" "HEADERS;SOURCES" ${ARGN}) + + if(PARSE_HEADERS) + list(REMOVE_DUPLICATES PARSE_HEADERS) + endif() + if(PARSE_SOURCES) + list(REMOVE_DUPLICATES PARSE_SOURCES) + endif() + foreach(source ${PARSE_SOURCES}) set_source_files_properties(${source} PROPERTIES LANGUAGE ${KOKKOS_COMPILE_LANGUAGE}) - ENDFOREACH() + endforeach() - IF(PARSE_STATIC) - SET(LINK_TYPE STATIC) - ENDIF() + if(PARSE_STATIC) + set(LINK_TYPE STATIC) + endif() - IF(PARSE_SHARED) - SET(LINK_TYPE SHARED) - ENDIF() + if(PARSE_SHARED) + set(LINK_TYPE SHARED) + endif() # MSVC and other platforms want to have # the headers included as source files # for better dependency detection - ADD_LIBRARY( - ${LIBRARY_NAME} - ${LINK_TYPE} - ${PARSE_HEADERS} - ${PARSE_SOURCES} - ) + add_library(${LIBRARY_NAME} ${LINK_TYPE} ${PARSE_HEADERS} ${PARSE_SOURCES}) - IF(PARSE_SHARED OR BUILD_SHARED_LIBS) - SET_TARGET_PROPERTIES(${LIBRARY_NAME} PROPERTIES - VERSION ${Kokkos_VERSION} - SOVERSION ${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR} + if(PARSE_SHARED OR BUILD_SHARED_LIBS) + set_target_properties( + ${LIBRARY_NAME} PROPERTIES VERSION ${Kokkos_VERSION} SOVERSION ${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR} ) - ENDIF() + endif() - KOKKOS_INTERNAL_ADD_LIBRARY_INSTALL(${LIBRARY_NAME}) + kokkos_internal_add_library_install(${LIBRARY_NAME}) #In case we are building in-tree, add an alias name #that matches the install Kokkos:: name - ADD_LIBRARY(Kokkos::${LIBRARY_NAME} ALIAS ${LIBRARY_NAME}) -ENDFUNCTION() - -FUNCTION(KOKKOS_ADD_LIBRARY LIBRARY_NAME) - CMAKE_PARSE_ARGUMENTS(PARSE - "ADD_BUILD_OPTIONS" - "" - "HEADERS" - ${ARGN} - ) - IF (KOKKOS_HAS_TRILINOS) - # We do not pass headers to trilinos. They would get installed - # to the default include folder, but we want headers installed - # preserving the directory structure, e.g. impl - # If headers got installed in both locations, it breaks some - # downstream packages - TRIBITS_ADD_LIBRARY(${LIBRARY_NAME} ${PARSE_UNPARSED_ARGUMENTS} - ADDED_LIB_TARGET_NAME_OUT ${LIBRARY_NAME}_TARGET_NAME ) - IF (PARSE_ADD_BUILD_OPTIONS) - KOKKOS_SET_LIBRARY_PROPERTIES(${${LIBRARY_NAME}_TARGET_NAME}) - ENDIF() - ELSE() - # Forward the headers, we want to know about all headers - # to make sure they appear correctly in IDEs - KOKKOS_INTERNAL_ADD_LIBRARY( - ${LIBRARY_NAME} ${PARSE_UNPARSED_ARGUMENTS} HEADERS ${PARSE_HEADERS}) - IF (PARSE_ADD_BUILD_OPTIONS) - KOKKOS_SET_LIBRARY_PROPERTIES(${LIBRARY_NAME}) - ENDIF() - ENDIF() -ENDFUNCTION() - - -FUNCTION(KOKKOS_ADD_INTERFACE_LIBRARY NAME) - IF (KOKKOS_HAS_TRILINOS) - TRIBITS_ADD_LIBRARY(${NAME} ${ARGN}) - ELSE() - ADD_LIBRARY(${NAME} INTERFACE) - KOKKOS_INTERNAL_ADD_LIBRARY_INSTALL(${NAME}) - ENDIF() -ENDFUNCTION() - - -FUNCTION(KOKKOS_LIB_INCLUDE_DIRECTORIES TARGET) - KOKKOS_LIB_TYPE(${TARGET} INCTYPE) - FOREACH(DIR ${ARGN}) - TARGET_INCLUDE_DIRECTORIES(${TARGET} ${INCTYPE} $) - ENDFOREACH() -ENDFUNCTION() - -FUNCTION(KOKKOS_LIB_COMPILE_OPTIONS TARGET) - KOKKOS_LIB_TYPE(${TARGET} INCTYPE) - KOKKOS_TARGET_COMPILE_OPTIONS(${${PROJECT_NAME}_LIBRARY_NAME_PREFIX}${TARGET} ${INCTYPE} ${ARGN}) -ENDFUNCTION() - -MACRO(KOKKOS_ADD_TEST_DIRECTORIES) - IF (KOKKOS_HAS_TRILINOS) - TRIBITS_ADD_TEST_DIRECTORIES(${ARGN}) - ELSE() - IF(KOKKOS_ENABLE_TESTS) - FOREACH(TEST_DIR ${ARGN}) - ADD_SUBDIRECTORY(${TEST_DIR}) - ENDFOREACH() - ENDIF() - ENDIF() -ENDMACRO() - -MACRO(KOKKOS_ADD_EXAMPLE_DIRECTORIES) - if (KOKKOS_HAS_TRILINOS) - TRIBITS_ADD_EXAMPLE_DIRECTORIES(${ARGN}) - else() - IF(KOKKOS_ENABLE_EXAMPLES) - FOREACH(EXAMPLE_DIR ${ARGN}) - ADD_SUBDIRECTORY(${EXAMPLE_DIR}) - ENDFOREACH() - ENDIF() + add_library(Kokkos::${LIBRARY_NAME} ALIAS ${LIBRARY_NAME}) +endfunction() + +function(KOKKOS_ADD_LIBRARY LIBRARY_NAME) + cmake_parse_arguments(PARSE "ADD_BUILD_OPTIONS" "" "HEADERS" ${ARGN}) + # Forward the headers, we want to know about all headers + # to make sure they appear correctly in IDEs + kokkos_internal_add_library(${LIBRARY_NAME} ${PARSE_UNPARSED_ARGUMENTS} HEADERS ${PARSE_HEADERS}) + if(PARSE_ADD_BUILD_OPTIONS) + kokkos_set_library_properties(${LIBRARY_NAME}) + endif() +endfunction() + +function(KOKKOS_ADD_INTERFACE_LIBRARY NAME) + add_library(${NAME} INTERFACE) + kokkos_internal_add_library_install(${NAME}) +endfunction() + +function(KOKKOS_LIB_INCLUDE_DIRECTORIES TARGET) + kokkos_lib_type(${TARGET} INCTYPE) + foreach(DIR ${ARGN}) + target_include_directories(${TARGET} ${INCTYPE} $) + endforeach() +endfunction() + +function(KOKKOS_LIB_COMPILE_OPTIONS TARGET) + kokkos_lib_type(${TARGET} INCTYPE) + target_compile_options(${${PROJECT_NAME}_LIBRARY_NAME_PREFIX}${TARGET} ${INCTYPE} ${ARGN}) +endfunction() + +macro(KOKKOS_ADD_TEST_DIRECTORIES) + if(KOKKOS_ENABLE_TESTS) + foreach(TEST_DIR ${ARGN}) + add_subdirectory(${TEST_DIR}) + endforeach() + endif() +endmacro() + +macro(KOKKOS_ADD_EXAMPLE_DIRECTORIES) + if(KOKKOS_ENABLE_EXAMPLES) + foreach(EXAMPLE_DIR ${ARGN}) + add_subdirectory(${EXAMPLE_DIR}) + endforeach() + endif() +endmacro() + +macro(KOKKOS_ADD_BENCHMARK_DIRECTORIES) + if(KOKKOS_ENABLE_BENCHMARKS) + foreach(BENCHMARK_DIR ${ARGN}) + add_subdirectory(${BENCHMARK_DIR}) + endforeach() endif() -ENDMACRO() - -MACRO(KOKKOS_ADD_BENCHMARK_DIRECTORIES) - IF(KOKKOS_ENABLE_BENCHMARKS) - FOREACH(BENCHMARK_DIR ${ARGN}) - ADD_SUBDIRECTORY(${BENCHMARK_DIR}) - ENDFOREACH() - ENDIF() -ENDMACRO() +endmacro() diff --git a/packages/kokkos/cmake/msvc.cmake b/packages/kokkos/cmake/msvc.cmake index 85421bdbaaa4..1de13585c730 100644 --- a/packages/kokkos/cmake/msvc.cmake +++ b/packages/kokkos/cmake/msvc.cmake @@ -1,11 +1,9 @@ - -FUNCTION(kokkos_set_msvc_flags full_standard int_standard) - IF (CMAKE_CXX_EXTENSIONS) - SET(KOKKOS_CXX_STANDARD_FLAG "" PARENT_SCOPE) - SET(KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG "" PARENT_SCOPE) - ELSE() - SET(KOKKOS_CXX_STANDARD_FLAG "" PARENT_SCOPE) - SET(KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG "" PARENT_SCOPE) - ENDIF() -ENDFUNCTION() - +function(kokkos_set_msvc_flags full_standard int_standard) + if(CMAKE_CXX_EXTENSIONS) + set(KOKKOS_CXX_STANDARD_FLAG "" PARENT_SCOPE) + set(KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG "" PARENT_SCOPE) + else() + set(KOKKOS_CXX_STANDARD_FLAG "" PARENT_SCOPE) + set(KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG "" PARENT_SCOPE) + endif() +endfunction() diff --git a/packages/kokkos/cmake/pgi.cmake b/packages/kokkos/cmake/pgi.cmake index e98e84955888..45f59dcd10bf 100644 --- a/packages/kokkos/cmake/pgi.cmake +++ b/packages/kokkos/cmake/pgi.cmake @@ -1,8 +1,6 @@ - function(kokkos_set_pgi_flags full_standard int_standard) - STRING(TOLOWER ${full_standard} FULL_LC_STANDARD) - STRING(TOLOWER ${int_standard} INT_LC_STANDARD) - SET(KOKKOS_CXX_STANDARD_FLAG "--c++${FULL_LC_STANDARD}" PARENT_SCOPE) - SET(KOKKOS_CXX_INTERMDIATE_STANDARD_FLAG "--c++${INT_LC_STANDARD}" PARENT_SCOPE) + string(TOLOWER ${full_standard} FULL_LC_STANDARD) + string(TOLOWER ${int_standard} INT_LC_STANDARD) + set(KOKKOS_CXX_STANDARD_FLAG "--c++${FULL_LC_STANDARD}" PARENT_SCOPE) + set(KOKKOS_CXX_INTERMDIATE_STANDARD_FLAG "--c++${INT_LC_STANDARD}" PARENT_SCOPE) endfunction() - diff --git a/packages/kokkos/cmake/tpls/FindTPLHWLOC.cmake b/packages/kokkos/cmake/tpls/FindTPLHWLOC.cmake index 4e05d2253489..52d8368d0419 100644 --- a/packages/kokkos/cmake/tpls/FindTPLHWLOC.cmake +++ b/packages/kokkos/cmake/tpls/FindTPLHWLOC.cmake @@ -15,7 +15,6 @@ # ************************************************************************ # @HEADER - #----------------------------------------------------------------------------- # Hardware locality detection and control library. # @@ -26,8 +25,4 @@ # Version: 1.3 # -KOKKOS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( HWLOC - REQUIRED_HEADERS hwloc.h - REQUIRED_LIBS_NAMES "hwloc" - ) - +kokkos_tpl_find_include_dirs_and_libraries(HWLOC REQUIRED_HEADERS hwloc.h REQUIRED_LIBS_NAMES "hwloc") diff --git a/packages/kokkos/cmake/tpls/FindTPLPthread.cmake b/packages/kokkos/cmake/tpls/FindTPLPthread.cmake index 3d5b03805d4d..f51bce5d64d7 100644 --- a/packages/kokkos/cmake/tpls/FindTPLPthread.cmake +++ b/packages/kokkos/cmake/tpls/FindTPLPthread.cmake @@ -15,29 +15,26 @@ # ************************************************************************ # @HEADER -SET(USE_THREADS FALSE) +set(USE_THREADS FALSE) -IF(NOT TPL_Pthread_INCLUDE_DIRS AND NOT TPL_Pthread_LIBRARY_DIRS AND NOT TPL_Pthread_LIBRARIES) +if(NOT TPL_Pthread_INCLUDE_DIRS AND NOT TPL_Pthread_LIBRARY_DIRS AND NOT TPL_Pthread_LIBRARIES) # Use CMake's Thread finder since it is a bit smarter in determining # whether pthreads is already built into the compiler and doesn't need # a library to link. - FIND_PACKAGE(Threads) + find_package(Threads) #If Threads found a copy of pthreads make sure it is one of the cases the tribits #tpl system cannot handle. - IF(Threads_FOUND AND CMAKE_USE_PTHREADS_INIT) - IF(CMAKE_THREAD_LIBS_INIT STREQUAL "" OR CMAKE_THREAD_LIBS_INIT STREQUAL "-pthread") - SET(USE_THREADS TRUE) - ENDIF() - ENDIF() -ENDIF() + if(Threads_FOUND AND CMAKE_USE_PTHREADS_INIT) + if(CMAKE_THREAD_LIBS_INIT STREQUAL "" OR CMAKE_THREAD_LIBS_INIT STREQUAL "-pthread") + set(USE_THREADS TRUE) + endif() + endif() +endif() -IF(USE_THREADS) - SET(TPL_Pthread_INCLUDE_DIRS "") - SET(TPL_Pthread_LIBRARIES "${CMAKE_THREAD_LIBS_INIT}") - SET(TPL_Pthread_LIBRARY_DIRS "") -ELSE() - KOKKOS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( Pthread - REQUIRED_HEADERS pthread.h - REQUIRED_LIBS_NAMES pthread - ) -ENDIF() +if(USE_THREADS) + set(TPL_Pthread_INCLUDE_DIRS "") + set(TPL_Pthread_LIBRARIES "${CMAKE_THREAD_LIBS_INIT}") + set(TPL_Pthread_LIBRARY_DIRS "") +else() + kokkos_tpl_find_include_dirs_and_libraries(Pthread REQUIRED_HEADERS pthread.h REQUIRED_LIBS_NAMES pthread) +endif() diff --git a/packages/kokkos/cmake/tpls/FindTPLquadmath.cmake b/packages/kokkos/cmake/tpls/FindTPLquadmath.cmake index 8560ec60f1b5..b449f45135aa 100644 --- a/packages/kokkos/cmake/tpls/FindTPLquadmath.cmake +++ b/packages/kokkos/cmake/tpls/FindTPLquadmath.cmake @@ -15,7 +15,4 @@ # ************************************************************************ # @HEADER -TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( quadmath - REQUIRED_HEADERS quadmath.h - REQUIRED_LIBS_NAMES quadmath -) +tribits_tpl_find_include_dirs_and_libraries(quadmath REQUIRED_HEADERS quadmath.h REQUIRED_LIBS_NAMES quadmath) diff --git a/packages/kokkos/containers/CMakeLists.txt b/packages/kokkos/containers/CMakeLists.txt index 0857d7007b44..8ee8bb41a28a 100644 --- a/packages/kokkos/containers/CMakeLists.txt +++ b/packages/kokkos/containers/CMakeLists.txt @@ -1,9 +1,9 @@ -IF (NOT Kokkos_INSTALL_TESTING) - ADD_SUBDIRECTORY(src) -ENDIF() +if(NOT Kokkos_INSTALL_TESTING) + add_subdirectory(src) +endif() # FIXME_OPENACC: temporarily disabled due to unimplemented features -IF(NOT KOKKOS_ENABLE_OPENACC) -KOKKOS_ADD_TEST_DIRECTORIES(unit_tests) -KOKKOS_ADD_TEST_DIRECTORIES(performance_tests) -ENDIF() +if(NOT KOKKOS_ENABLE_OPENACC) + kokkos_add_test_directories(unit_tests) + kokkos_add_test_directories(performance_tests) +endif() diff --git a/packages/kokkos/containers/performance_tests/CMakeLists.txt b/packages/kokkos/containers/performance_tests/CMakeLists.txt index e325e45e85dc..8d4d605b0871 100644 --- a/packages/kokkos/containers/performance_tests/CMakeLists.txt +++ b/packages/kokkos/containers/performance_tests/CMakeLists.txt @@ -1,7 +1,6 @@ - -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) -KOKKOS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../src ) +kokkos_include_directories(${CMAKE_CURRENT_BINARY_DIR}) +kokkos_include_directories(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) +kokkos_include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../src) foreach(Tag Threads;OpenMP;Cuda;HPX;HIP) string(TOUPPER ${Tag} DEVICE) @@ -10,14 +9,8 @@ foreach(Tag Threads;OpenMP;Cuda;HPX;HIP) if(Kokkos_ENABLE_${DEVICE}) message(STATUS "Sources Test${Tag}.cpp") - set(SOURCES - TestMain.cpp - Test${Tag}.cpp - ) + set(SOURCES TestMain.cpp Test${Tag}.cpp) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - ContainersPerformanceTest_${Tag} - SOURCES ${SOURCES} - ) + kokkos_add_executable_and_test(ContainersPerformanceTest_${Tag} SOURCES ${SOURCES}) endif() endforeach() diff --git a/packages/kokkos/containers/performance_tests/TestScatterView.hpp b/packages/kokkos/containers/performance_tests/TestScatterView.hpp index a74f833b9f52..953b8bff6e59 100644 --- a/packages/kokkos/containers/performance_tests/TestScatterView.hpp +++ b/packages/kokkos/containers/performance_tests/TestScatterView.hpp @@ -25,8 +25,8 @@ namespace Perf { template void test_scatter_view(int m, int n) { - Kokkos::View original_view("original_view", - n); + Kokkos::View original_view("original_view", + n); { auto scatter_view = Kokkos::Experimental::create_scatter_view< Kokkos::Experimental::ScatterSum, Duplication, Contribution>( @@ -40,8 +40,8 @@ void test_scatter_view(int m, int n) { { auto num_threads = unique_token.size(); std::cout << "num_threads " << num_threads << '\n'; - Kokkos::View - hand_coded_duplicate_view("hand_coded_duplicate", num_threads, n); + Kokkos::View hand_coded_duplicate_view( + "hand_coded_duplicate", num_threads, n); auto f2 = KOKKOS_LAMBDA(int i) { auto thread_id = unique_token.acquire(); for (int j = 0; j < 10; ++j) { diff --git a/packages/kokkos/containers/src/CMakeLists.txt b/packages/kokkos/containers/src/CMakeLists.txt index b7d85ebf11d7..b386fbe67505 100644 --- a/packages/kokkos/containers/src/CMakeLists.txt +++ b/packages/kokkos/containers/src/CMakeLists.txt @@ -1,33 +1,27 @@ #need these here for now -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) +kokkos_include_directories(${CMAKE_CURRENT_BINARY_DIR}) +kokkos_include_directories(${CMAKE_CURRENT_SOURCE_DIR}) #----------------------------------------------------------------------------- -SET(KOKKOS_CONTAINERS_SRCS) -APPEND_GLOB(KOKKOS_CONTAINERS_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/impl/*.cpp) -SET(KOKKOS_CONTAINER_HEADERS) -APPEND_GLOB(KOKKOS_CONTAINERS_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/impl/*.hpp) -APPEND_GLOB(KOKKOS_CONTAINERS_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/*.hpp) +set(KOKKOS_CONTAINERS_SRCS) +append_glob(KOKKOS_CONTAINERS_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/impl/*.cpp) +set(KOKKOS_CONTAINER_HEADERS) +append_glob(KOKKOS_CONTAINERS_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/impl/*.hpp) +append_glob(KOKKOS_CONTAINERS_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/*.hpp) - -INSTALL ( +install( DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/" DESTINATION ${KOKKOS_HEADER_DIR} - FILES_MATCHING PATTERN "*.hpp" + FILES_MATCHING + PATTERN "*.hpp" ) -KOKKOS_ADD_LIBRARY( - kokkoscontainers - SOURCES ${KOKKOS_CONTAINERS_SRCS} - HEADERS ${KOKKOS_CONTAINERS_HEADERS} -) +kokkos_add_library(kokkoscontainers SOURCES ${KOKKOS_CONTAINERS_SRCS} HEADERS ${KOKKOS_CONTAINERS_HEADERS}) -KOKKOS_LIB_INCLUDE_DIRECTORIES(kokkoscontainers - ${KOKKOS_TOP_BUILD_DIR} - ${CMAKE_CURRENT_BINARY_DIR} - ${CMAKE_CURRENT_SOURCE_DIR} +kokkos_lib_include_directories( + kokkoscontainers ${KOKKOS_TOP_BUILD_DIR} ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR} ) -KOKKOS_LINK_INTERNAL_LIBRARY(kokkoscontainers kokkoscore) +kokkos_link_internal_library(kokkoscontainers kokkoscore) #----------------------------------------------------------------------------- diff --git a/packages/kokkos/containers/src/Kokkos_Bitset.hpp b/packages/kokkos/containers/src/Kokkos_Bitset.hpp index f50ab0a0f7e9..409260f0218d 100644 --- a/packages/kokkos/containers/src/Kokkos_Bitset.hpp +++ b/packages/kokkos/containers/src/Kokkos_Bitset.hpp @@ -271,7 +271,7 @@ class Bitset { offset = !(scan_direction & BIT_SCAN_REVERSE) ? offset : (offset + block_mask) & block_mask; - block = Impl::rotate_right(block, offset); + block = Impl::rotate_right(block, offset); return (((!(scan_direction & BIT_SCAN_REVERSE) ? Impl::bit_scan_forward(block) : Impl::int_log2(block)) + diff --git a/packages/kokkos/containers/src/Kokkos_DualView.hpp b/packages/kokkos/containers/src/Kokkos_DualView.hpp index a37a2bdcebd9..6a2e6f73a15e 100644 --- a/packages/kokkos/containers/src/Kokkos_DualView.hpp +++ b/packages/kokkos/containers/src/Kokkos_DualView.hpp @@ -275,14 +275,29 @@ class DualView : public ViewTraits { const size_t n5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) - : modified_flags(t_modified_flags("DualView::modified_flags")), - d_view(arg_prop, n0, n1, n2, n3, n4, n5, n6, n7) { - // without UVM, host View mirrors - if constexpr (Kokkos::Impl::has_type::value) - h_view = Kokkos::create_mirror_view(Kokkos::WithoutInitializing, d_view); - else - h_view = Kokkos::create_mirror_view(d_view); + : modified_flags(t_modified_flags("DualView::modified_flags")) { + if constexpr (Impl::ViewCtorProp::sequential_host_init) { + h_view = t_host(arg_prop, n0, n1, n2, n3, n4, n5, n6, n7); + static_assert(Impl::ViewCtorProp::initialize, + "DualView: SequentialHostInit isn't compatible with " + "WithoutInitializing!"); + static_assert(!Impl::ViewCtorProp::has_execution_space, + "DualView: SequentialHostInit isn't compatible with " + "providing an execution space instance!"); + + d_view = Kokkos::create_mirror_view_and_copy( + typename traits::memory_space{}, h_view); + } else { + d_view = t_dev(arg_prop, n0, n1, n2, n3, n4, n5, n6, n7); + + // without UVM, host View mirrors + if constexpr (Kokkos::Impl::has_type::value) + h_view = + Kokkos::create_mirror_view(Kokkos::WithoutInitializing, d_view); + else + h_view = Kokkos::create_mirror_view(d_view); + } } //! Copy constructor (shallow copy) @@ -338,23 +353,21 @@ class DualView : public ViewTraits { // does the DualView have only one device struct impl_dualview_is_single_device { enum : bool { - value = std::is_same::value + value = std::is_same_v }; }; // does the given device match the device of t_dev? template struct impl_device_matches_tdev_device { - enum : bool { - value = std::is_same::value - }; + enum : bool { value = std::is_same_v }; }; // does the given device match the device of t_host? template struct impl_device_matches_thost_device { enum : bool { - value = std::is_same::value + value = std::is_same_v }; }; @@ -362,7 +375,7 @@ class DualView : public ViewTraits { template struct impl_device_matches_thost_exec { enum : bool { - value = std::is_same::value + value = std::is_same_v }; }; @@ -370,7 +383,7 @@ class DualView : public ViewTraits { template struct impl_device_matches_tdev_exec { enum : bool { - value = std::is_same::value + value = std::is_same_v }; }; @@ -378,8 +391,8 @@ class DualView : public ViewTraits { template struct impl_device_matches_tdev_memory_space { enum : bool { - value = std::is_same::value + value = std::is_same_v }; }; @@ -389,11 +402,6 @@ class DualView : public ViewTraits { /// \brief Return a View on a specific device \c Device. /// - /// Please don't be afraid of the nested if_c expressions in the return - /// value's type. That just tells the method what the return type - /// should be: t_dev if the \c Device template parameter matches - /// this DualView's device type, else t_host. - /// /// For example, suppose you create a DualView on Cuda, like this: /// \code /// using dual_view_type = @@ -410,56 +418,47 @@ class DualView : public ViewTraits { /// typename dual_view_type::t_host hostView = DV.view (); /// \endcode template - KOKKOS_INLINE_FUNCTION const typename std::conditional_t< - impl_device_matches_tdev_device::value, t_dev, - typename std::conditional_t< - impl_device_matches_thost_device::value, t_host, - typename std::conditional_t< - impl_device_matches_thost_exec::value, t_host, - typename std::conditional_t< - impl_device_matches_tdev_exec::value, t_dev, - typename std::conditional_t< - impl_device_matches_tdev_memory_space::value, - t_dev, t_host>>>>> - view() const { - constexpr bool device_is_memspace = - std::is_same::value; - constexpr bool device_is_execspace = - std::is_same::value; - constexpr bool device_exec_is_t_dev_exec = - std::is_same::value; - constexpr bool device_mem_is_t_dev_mem = - std::is_same::value; - constexpr bool device_exec_is_t_host_exec = - std::is_same::value; - constexpr bool device_mem_is_t_host_mem = - std::is_same::value; - constexpr bool device_is_t_host_device = - std::is_same::value; - constexpr bool device_is_t_dev_device = - std::is_same::value; - - static_assert( - device_is_t_dev_device || device_is_t_host_device || - (device_is_memspace && - (device_mem_is_t_dev_mem || device_mem_is_t_host_mem)) || - (device_is_execspace && - (device_exec_is_t_dev_exec || device_exec_is_t_host_exec)) || - ((!device_is_execspace && !device_is_memspace) && - ((device_mem_is_t_dev_mem || device_mem_is_t_host_mem) || - (device_exec_is_t_dev_exec || device_exec_is_t_host_exec))), - "Template parameter to .view() must exactly match one of the " - "DualView's device types or one of the execution or memory spaces"); - - return Impl::if_c::value, - t_dev, t_host>::select(d_view, h_view); + KOKKOS_FUNCTION auto view() const { + if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { + return d_view; + } else { + static_assert(std::is_same_v, + "The template argument is a memory space but doesn't " + "match either of DualView's memory spaces!"); + return h_view; + } + } else { + if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { + return d_view; + } else { + static_assert(std::is_same_v, + "The template argument is an execution space but " + "doesn't match either of DualView's execution spaces!"); + return h_view; + } + } else { + static_assert(std::is_same_v, + "The template argument is neither a memory space, " + "execution space, or device!"); + if constexpr (std::is_same_v) + return d_view; + else { + static_assert(std::is_same_v, + "The template argument is a device but " + "doesn't match either of DualView's devices!"); + return h_view; + } + } + } +#ifdef KOKKOS_COMPILER_INTEL + __builtin_unreachable(); +#endif } KOKKOS_INLINE_FUNCTION @@ -475,27 +474,27 @@ class DualView : public ViewTraits { template static int get_device_side() { constexpr bool device_is_memspace = - std::is_same::value; + std::is_same_v; constexpr bool device_is_execspace = - std::is_same::value; + std::is_same_v; constexpr bool device_exec_is_t_dev_exec = - std::is_same::value; + std::is_same_v; constexpr bool device_mem_is_t_dev_mem = - std::is_same::value; + std::is_same_v; constexpr bool device_exec_is_t_host_exec = - std::is_same::value; + std::is_same_v; constexpr bool device_mem_is_t_host_mem = - std::is_same::value; + std::is_same_v; constexpr bool device_is_t_host_device = - std::is_same::value; + std::is_same_v; constexpr bool device_is_t_dev_device = - std::is_same::value; + std::is_same_v; static_assert( device_is_t_dev_device || device_is_t_host_device || @@ -627,9 +626,9 @@ class DualView : public ViewTraits { template void sync(const std::enable_if_t< - (std::is_same::value) || - (std::is_same::value), + (std::is_same_v) || + (std::is_same_v), int>& = 0) { sync_impl(std::true_type{}); } @@ -637,9 +636,9 @@ class DualView : public ViewTraits { template void sync(const ExecutionSpace& exec, const std::enable_if_t< - (std::is_same::value) || - (std::is_same::value), + (std::is_same_v) || + (std::is_same_v), int>& = 0) { sync_impl(std::true_type{}, exec); } @@ -669,18 +668,18 @@ class DualView : public ViewTraits { template void sync(const std::enable_if_t< - (!std::is_same::value) || - (std::is_same::value), + (!std::is_same_v) || + (std::is_same_v), int>& = 0) { sync_impl(std::false_type{}); } template void sync(const ExecutionSpace& exec, const std::enable_if_t< - (!std::is_same::value) || - (std::is_same::value), + (!std::is_same_v) || + (std::is_same_v), int>& = 0) { sync_impl(std::false_type{}, exec); } @@ -943,12 +942,21 @@ class DualView : public ViewTraits { Impl::size_mismatch(h_view, h_view.rank_dynamic, new_extents); if (sizeMismatch) { - ::Kokkos::realloc(arg_prop, d_view, n0, n1, n2, n3, n4, n5, n6, n7); - if constexpr (alloc_prop_input::initialize) { - h_view = create_mirror_view(typename t_host::memory_space(), d_view); + if constexpr (alloc_prop_input::sequential_host_init) { + static_assert(alloc_prop_input::initialize, + "DualView: SequentialHostInit isn't compatible with " + "WithoutInitializing!"); + ::Kokkos::realloc(arg_prop, h_view, n0, n1, n2, n3, n4, n5, n6, n7); + d_view = + create_mirror_view_and_copy(typename t_dev::memory_space(), h_view); } else { - h_view = create_mirror_view(Kokkos::WithoutInitializing, - typename t_host::memory_space(), d_view); + ::Kokkos::realloc(arg_prop, d_view, n0, n1, n2, n3, n4, n5, n6, n7); + if constexpr (alloc_prop_input::initialize) { + h_view = create_mirror_view(typename t_host::memory_space(), d_view); + } else { + h_view = create_mirror_view(Kokkos::WithoutInitializing, + typename t_host::memory_space(), d_view); + } } } else if constexpr (alloc_prop_input::initialize) { if constexpr (alloc_prop_input::has_execution_space) { @@ -1062,9 +1070,22 @@ class DualView : public ViewTraits { } }; - constexpr bool has_execution_space = alloc_prop_input::has_execution_space; + if constexpr (alloc_prop_input::sequential_host_init) { + static_assert(alloc_prop_input::initialize, + "DualView: SequentialHostInit isn't compatible with " + "WithoutInitializing!"); + static_assert(!alloc_prop_input::has_execution_space, + "DualView: SequentialHostInit isn't compatible with " + "providing an execution space instance!"); - if constexpr (has_execution_space) { + if (sizeMismatch) { + sync(); + ::Kokkos::resize(arg_prop, h_view, n0, n1, n2, n3, n4, n5, n6, n7); + d_view = + create_mirror_view_and_copy(typename t_dev::memory_space(), h_view); + } + return; + } else if constexpr (alloc_prop_input::has_execution_space) { using ExecSpace = typename alloc_prop_input::execution_space; const auto& exec_space = Impl::get_property(arg_prop); @@ -1182,15 +1203,15 @@ class DualView : public ViewTraits { } template - KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t< - std::is_integral::value, size_t> + KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t, + size_t> extent(const iType& r) const { return d_view.extent(r); } template - KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t< - std::is_integral::value, int> + KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t, + int> extent_int(const iType& r) const { return static_cast(d_view.extent(r)); } diff --git a/packages/kokkos/containers/src/Kokkos_DynRankView.hpp b/packages/kokkos/containers/src/Kokkos_DynRankView.hpp index 5f7fcaf69e7f..2f2f4433e7ca 100644 --- a/packages/kokkos/containers/src/Kokkos_DynRankView.hpp +++ b/packages/kokkos/containers/src/Kokkos_DynRankView.hpp @@ -38,6 +38,23 @@ class DynRankView; // forward declare namespace Impl { +template +struct ViewDataTypeFromRank { + using type = typename ViewDataTypeFromRank::type*; +}; + +template +struct ViewDataTypeFromRank { + using type = T; +}; + +template +KOKKOS_FUNCTION View::type, Args...> +as_view_of_rank_n( + DynRankView v, + std::enable_if_t::specialize, + void>>* = nullptr); + template struct DynRankDimTraits { enum : size_t { unspecified = KOKKOS_INVALID_INDEX }; @@ -91,54 +108,59 @@ struct DynRankDimTraits { } // Create the layout for the rank-7 view. + // Because the underlying View is rank-7, preserve "unspecified" for + // dimension 8. + // Non-strided Layout template KOKKOS_INLINE_FUNCTION static std::enable_if_t< - (std::is_same::value || - std::is_same::value), + (std::is_same_v || + std::is_same_v), Layout> createLayout(const Layout& layout) { - return Layout(layout.dimension[0] != unspecified ? layout.dimension[0] : 1, - layout.dimension[1] != unspecified ? layout.dimension[1] : 1, - layout.dimension[2] != unspecified ? layout.dimension[2] : 1, - layout.dimension[3] != unspecified ? layout.dimension[3] : 1, - layout.dimension[4] != unspecified ? layout.dimension[4] : 1, - layout.dimension[5] != unspecified ? layout.dimension[5] : 1, - layout.dimension[6] != unspecified ? layout.dimension[6] : 1, - layout.dimension[7] != unspecified ? layout.dimension[7] : 1); + Layout new_layout( + layout.dimension[0] != unspecified ? layout.dimension[0] : 1, + layout.dimension[1] != unspecified ? layout.dimension[1] : 1, + layout.dimension[2] != unspecified ? layout.dimension[2] : 1, + layout.dimension[3] != unspecified ? layout.dimension[3] : 1, + layout.dimension[4] != unspecified ? layout.dimension[4] : 1, + layout.dimension[5] != unspecified ? layout.dimension[5] : 1, + layout.dimension[6] != unspecified ? layout.dimension[6] : 1, + layout.dimension[7] != unspecified ? layout.dimension[7] : unspecified); + new_layout.stride = layout.stride; + return new_layout; } // LayoutStride template KOKKOS_INLINE_FUNCTION static std::enable_if_t< - (std::is_same::value), Layout> + (std::is_same_v), Layout> createLayout(const Layout& layout) { - return Layout(layout.dimension[0] != unspecified ? layout.dimension[0] : 1, - layout.stride[0], - layout.dimension[1] != unspecified ? layout.dimension[1] : 1, - layout.stride[1], - layout.dimension[2] != unspecified ? layout.dimension[2] : 1, - layout.stride[2], - layout.dimension[3] != unspecified ? layout.dimension[3] : 1, - layout.stride[3], - layout.dimension[4] != unspecified ? layout.dimension[4] : 1, - layout.stride[4], - layout.dimension[5] != unspecified ? layout.dimension[5] : 1, - layout.stride[5], - layout.dimension[6] != unspecified ? layout.dimension[6] : 1, - layout.stride[6], - layout.dimension[7] != unspecified ? layout.dimension[7] : 1, - layout.stride[7]); + return Layout( + layout.dimension[0] != unspecified ? layout.dimension[0] : 1, + layout.stride[0], + layout.dimension[1] != unspecified ? layout.dimension[1] : 1, + layout.stride[1], + layout.dimension[2] != unspecified ? layout.dimension[2] : 1, + layout.stride[2], + layout.dimension[3] != unspecified ? layout.dimension[3] : 1, + layout.stride[3], + layout.dimension[4] != unspecified ? layout.dimension[4] : 1, + layout.stride[4], + layout.dimension[5] != unspecified ? layout.dimension[5] : 1, + layout.stride[5], + layout.dimension[6] != unspecified ? layout.dimension[6] : 1, + layout.stride[6], + layout.dimension[7] != unspecified ? layout.dimension[7] : unspecified, + layout.stride[7]); } // Extra overload to match that for specialize types template KOKKOS_INLINE_FUNCTION static std::enable_if_t< - (std::is_same::value || - std::is_same::value || - std::is_same::value), + (std::is_same_v || + std::is_same_v || + std::is_same_v), typename Traits::array_layout> createLayout(const Kokkos::Impl::ViewCtorProp& /* prop */, const typename Traits::array_layout& layout) { @@ -164,9 +186,8 @@ struct DynRankDimTraits { // Non-strided Layout template KOKKOS_INLINE_FUNCTION static std::enable_if_t< - (std::is_same::value || - std::is_same::value) && - std::is_integral::value, + (std::is_same_v || + std::is_same_v)&&std::is_integral_v, Layout> reconstructLayout(const Layout& layout, iType dynrank) { return Layout(dynrank > 0 ? layout.dimension[0] : KOKKOS_INVALID_INDEX, @@ -182,8 +203,7 @@ reconstructLayout(const Layout& layout, iType dynrank) { // LayoutStride template KOKKOS_INLINE_FUNCTION static std::enable_if_t< - (std::is_same::value) && - std::is_integral::value, + (std::is_same_v)&&std::is_integral_v, Layout> reconstructLayout(const Layout& layout, iType dynrank) { return Layout(dynrank > 0 ? layout.dimension[0] : KOKKOS_INVALID_INDEX, @@ -284,40 +304,43 @@ namespace Impl { template class ViewMapping< DstTraits, SrcTraits, - std::enable_if_t<(std::is_same::value && - std::is_void::value && - std::is_void::value && - (std::is_same::value || - ((std::is_same::value || - std::is_same::value || - std::is_same::value) && - (std::is_same::value || - std::is_same::value || - std::is_same::value)))), - Kokkos::Impl::ViewToDynRankViewTag>> { + std::enable_if_t< + (std::is_same_v && + std::is_void_v && + std::is_void_v && + (std::is_same_v || + ((std::is_same_v || + std::is_same_v || + std::is_same_v< + typename DstTraits::array_layout, + Kokkos::LayoutStride>)&&(std::is_same_v || + std::is_same_v< + typename SrcTraits::array_layout, + Kokkos::LayoutRight> || + std::is_same_v< + typename SrcTraits::array_layout, + Kokkos::LayoutStride>)))), + Kokkos::Impl::ViewToDynRankViewTag>> { private: enum { is_assignable_value_type = - std::is_same::value || - std::is_same::value + std::is_same_v || + std::is_same_v }; enum { is_assignable_layout = - std::is_same::value || - std::is_same::value + std::is_same_v || + std::is_same_v }; public: @@ -345,7 +368,7 @@ class ViewMapping< src.layout()); // Check this for integer input1 for padding, etc dst.m_map.m_impl_handle = Kokkos::Impl::ViewDataHandle::assign( src.m_map.m_impl_handle, src.m_track.m_tracker); - dst.m_track.assign(src.m_track.m_tracker, DstTraits::is_managed); + dst.m_track.m_tracker.assign(src.m_track.m_tracker, DstTraits::is_managed); dst.m_rank = Kokkos::View::rank(); } }; @@ -378,10 +401,11 @@ struct is_dyn_rank_view> : public std::true_type { template inline constexpr bool is_dyn_rank_view_v = is_dyn_rank_view::value; +// Inherit privately from View, this way we don't import anything funky +// for example the rank member vs the rank() function of DynRankView template -class DynRankView : public ViewTraits { - static_assert(!std::is_array::value && - !std::is_pointer::value, +class DynRankView : private View { + static_assert(!std::is_array_v && !std::is_pointer_v, "Cannot template DynRankView with array or pointer datatype - " "must be pod"); @@ -391,28 +415,66 @@ class DynRankView : public ViewTraits { template friend class Kokkos::Impl::ViewMapping; + size_t m_rank{}; + public: using drvtraits = ViewTraits; using view_type = View; - using traits = ViewTraits; - private: - using map_type = - Kokkos::Impl::ViewMapping; - using track_type = Kokkos::Impl::SharedAllocationTracker; - - track_type m_track; - map_type m_map; - unsigned m_rank; + using drdtraits = Impl::DynRankDimTraits; public: - KOKKOS_INLINE_FUNCTION + // typedefs from ViewTraits, overriden + using data_type = typename drvtraits::data_type; + using const_data_type = typename drvtraits::const_data_type; + using non_const_data_type = typename drvtraits::non_const_data_type; + + // typedefs from ViewTraits not overriden + using value_type = typename view_type::value_type; + using const_value_type = typename view_type::const_value_type; + using non_const_value_type = typename view_type::non_const_value_type; + using traits = typename view_type::traits; + using array_layout = typename view_type::array_layout; + + using execution_space = typename view_type::execution_space; + using memory_space = typename view_type::memory_space; + using device_type = typename view_type::device_type; + + using memory_traits = typename view_type::memory_traits; + using host_mirror_space = typename view_type::host_mirror_space; + using size_type = typename view_type::size_type; + + using reference_type = typename view_type::reference_type; + using pointer_type = typename view_type::pointer_type; + + using scalar_array_type = value_type; + using const_scalar_array_type = const_value_type; + using non_const_scalar_array_type = non_const_value_type; + using specialize = typename view_type::specialize; + + // typedefs in View for mdspan compatibility + // cause issues with MSVC+CUDA + // using layout_type = typename view_type::layout_type; + using index_type = typename view_type::index_type; + using element_type = typename view_type::element_type; + using rank_type = typename view_type::rank_type; + using reference = reference_type; + using data_handle_type = pointer_type; + + KOKKOS_FUNCTION view_type& DownCast() const { return (view_type&)(*this); } - KOKKOS_INLINE_FUNCTION + + // FIXME: this function make NO sense, the above one already is marked const + // Maybe one would want to get back a view of const?? + KOKKOS_FUNCTION const view_type& ConstDownCast() const { return (const view_type&)(*this); } + // FIXME: deprecate DownCast in favor of to_view + // KOKKOS_FUNCTION + // view_type to_view() const { return *this; } + // Types below - at least the HostMirror requires the value_type, NOT the rank // 7 data_type of the traits @@ -436,113 +498,32 @@ class DynRankView : public ViewTraits { typename drvtraits::array_layout, typename drvtraits::host_mirror_space>; + using host_mirror_type = HostMirror; //---------------------------------------- // Domain rank and extents // enum { Rank = map_type::Rank }; //Will be dyn rank of 7 always, keep the // enum? - template - KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t< - std::is_integral::value, size_t> - extent(const iType& r) const { - return m_map.extent(r); - } - - template - KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t< - std::is_integral::value, int> - extent_int(const iType& r) const { - return static_cast(m_map.extent(r)); - } - - KOKKOS_INLINE_FUNCTION constexpr typename traits::array_layout layout() const; - //---------------------------------------- /* Deprecate all 'dimension' functions in favor of * ISO/C++ vocabulary 'extent'. */ - KOKKOS_INLINE_FUNCTION constexpr size_t size() const { - return m_map.extent(0) * m_map.extent(1) * m_map.extent(2) * - m_map.extent(3) * m_map.extent(4) * m_map.extent(5) * - m_map.extent(6) * m_map.extent(7); - } - - KOKKOS_INLINE_FUNCTION constexpr size_t stride_0() const { - return m_map.stride_0(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_1() const { - return m_map.stride_1(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_2() const { - return m_map.stride_2(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_3() const { - return m_map.stride_3(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_4() const { - return m_map.stride_4(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_5() const { - return m_map.stride_5(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_6() const { - return m_map.stride_6(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_7() const { - return m_map.stride_7(); - } - - template - KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { - m_map.stride(s); - } - - //---------------------------------------- - // Range span is the span which contains all members. - - using reference_type = typename map_type::reference_type; - using pointer_type = typename map_type::pointer_type; - - enum { - reference_type_is_lvalue_reference = - std::is_lvalue_reference::value - }; - - KOKKOS_INLINE_FUNCTION constexpr size_t span() const { return m_map.span(); } - KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const { - return m_map.span_is_contiguous(); - } - KOKKOS_INLINE_FUNCTION constexpr pointer_type data() const { - return m_map.data(); - } - KOKKOS_INLINE_FUNCTION constexpr bool is_allocated() const { - return (m_map.data() != nullptr); - } - - //---------------------------------------- - // Allow specializations to query their specialized map - KOKKOS_INLINE_FUNCTION - const Kokkos::Impl::ViewMapping& - impl_map() const { - return m_map; - } - //---------------------------------------- private: enum { is_layout_left = - std::is_same::value, + std::is_same_v, is_layout_right = - std::is_same::value, + std::is_same_v, - is_layout_stride = std::is_same::value, + is_layout_stride = + std::is_same_v, - is_default_map = std::is_void::value && + is_default_map = std::is_void_v && (is_layout_left || is_layout_right || is_layout_stride) }; @@ -570,476 +551,150 @@ class DynRankView : public ViewTraits { #endif public: - KOKKOS_INLINE_FUNCTION + KOKKOS_FUNCTION constexpr unsigned rank() const { return m_rank; } - // operators () - // Rank 0 - KOKKOS_INLINE_FUNCTION - reference_type operator()() const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((0, this->rank(), m_track, m_map)) - return impl_map().reference(); - // return m_map.reference(0,0,0,0,0,0,0); - } - - // Rank 1 - // This assumes a contiguous underlying memory (i.e. no padding, no - // striding...) - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - std::is_same::value && - std::is_integral::value, - reference_type> - operator[](const iType& i0) const { - // Phalanx is violating this, since they use the operator to access ALL - // elements in the allocation KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (1 , - // this->rank(), m_track, m_map) ) - return data()[i0]; - } - - // This assumes a contiguous underlying memory (i.e. no padding, no - // striding... AND a Trilinos/Sacado scalar type ) - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - !std::is_same::value && - std::is_integral::value, - reference_type> - operator[](const iType& i0) const { - // auto map = impl_map(); - const size_t dim_scalar = m_map.dimension_scalar(); - const size_t bytes = this->span() / dim_scalar; - - using tmp_view_type = Kokkos::View< - DataType*, typename traits::array_layout, typename traits::device_type, - Kokkos::MemoryTraits>; - tmp_view_type rankone_view(this->data(), bytes, dim_scalar); - return rankone_view(i0); - } - - // Rank 1 parenthesis - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t<(std::is_void::value && - std::is_integral::value), - reference_type> - operator()(const iType& i0) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((1, this->rank(), m_track, m_map, i0)) - return m_map.reference(i0); - } - - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t::value && - std::is_integral::value), - reference_type> - operator()(const iType& i0) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((1, this->rank(), m_track, m_map, i0)) - return m_map.reference(i0, 0, 0, 0, 0, 0, 0); - } - - // Rank 2 - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_void::value && - std::is_integral::value && std::is_integral::value), - reference_type> - operator()(const iType0& i0, const iType1& i1) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((2, this->rank(), m_track, m_map, i0, i1)) - return m_map.reference(i0, i1); - } - - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t::value && - std::is_integral::value), - reference_type> - operator()(const iType0& i0, const iType1& i1) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((2, this->rank(), m_track, m_map, i0, i1)) - return m_map.reference(i0, i1, 0, 0, 0, 0, 0); - } - - // Rank 3 - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_void::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value), - reference_type> - operator()(const iType0& i0, const iType1& i1, const iType2& i2) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (3, this->rank(), m_track, m_map, i0, i1, i2)) - return m_map.reference(i0, i1, i2); - } - - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t::value && - std::is_integral::value), - reference_type> - operator()(const iType0& i0, const iType1& i1, const iType2& i2) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (3, this->rank(), m_track, m_map, i0, i1, i2)) - return m_map.reference(i0, i1, i2, 0, 0, 0, 0); - } - - // Rank 4 - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_void::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value && std::is_integral::value), - reference_type> - operator()(const iType0& i0, const iType1& i1, const iType2& i2, - const iType3& i3) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (4, this->rank(), m_track, m_map, i0, i1, i2, i3)) - return m_map.reference(i0, i1, i2, i3); - } - - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t::value && - std::is_integral::value), - reference_type> - operator()(const iType0& i0, const iType1& i1, const iType2& i2, - const iType3& i3) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (4, this->rank(), m_track, m_map, i0, i1, i2, i3)) - return m_map.reference(i0, i1, i2, i3, 0, 0, 0); - } - - // Rank 5 - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_void::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value), - reference_type> - operator()(const iType0& i0, const iType1& i1, const iType2& i2, - const iType3& i3, const iType4& i4) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (5, this->rank(), m_track, m_map, i0, i1, i2, i3, i4)) - return m_map.reference(i0, i1, i2, i3, i4); - } - - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t::value && - std::is_integral::value), - reference_type> - operator()(const iType0& i0, const iType1& i1, const iType2& i2, - const iType3& i3, const iType4& i4) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (5, this->rank(), m_track, m_map, i0, i1, i2, i3, i4)) - return m_map.reference(i0, i1, i2, i3, i4, 0, 0); - } - - // Rank 6 - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_void::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value && std::is_integral::value), - reference_type> - operator()(const iType0& i0, const iType1& i1, const iType2& i2, - const iType3& i3, const iType4& i4, const iType5& i5) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (6, this->rank(), m_track, m_map, i0, i1, i2, i3, i4, i5)) - return m_map.reference(i0, i1, i2, i3, i4, i5); - } - - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t::value && - std::is_integral::value), - reference_type> - operator()(const iType0& i0, const iType1& i1, const iType2& i2, - const iType3& i3, const iType4& i4, const iType5& i5) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (6, this->rank(), m_track, m_map, i0, i1, i2, i3, i4, i5)) - return m_map.reference(i0, i1, i2, i3, i4, i5, 0); - } - - // Rank 7 - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_integral::value && std::is_integral::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value), - reference_type> - operator()(const iType0& i0, const iType1& i1, const iType2& i2, - const iType3& i3, const iType4& i4, const iType5& i5, - const iType6& i6) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (7, this->rank(), m_track, m_map, i0, i1, i2, i3, i4, i5, i6)) - return m_map.reference(i0, i1, i2, i3, i4, i5, i6); - } - - // Rank 0 - KOKKOS_INLINE_FUNCTION - reference_type access() const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((0, this->rank(), m_track, m_map)) - return impl_map().reference(); - // return m_map.reference(0,0,0,0,0,0,0); - } - - // Rank 1 - // Rank 1 parenthesis - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t<(std::is_void::value && - std::is_integral::value), - reference_type> - access(const iType& i0) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((1, this->rank(), m_track, m_map, i0)) - return m_map.reference(i0); - } - - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t::value && - std::is_integral::value), - reference_type> - access(const iType& i0) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((1, this->rank(), m_track, m_map, i0)) - return m_map.reference(i0, 0, 0, 0, 0, 0, 0); - } - - // Rank 2 - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_void::value && - std::is_integral::value && std::is_integral::value), - reference_type> - access(const iType0& i0, const iType1& i1) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((2, this->rank(), m_track, m_map, i0, i1)) - return m_map.reference(i0, i1); - } - - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t::value && - std::is_integral::value), - reference_type> - access(const iType0& i0, const iType1& i1) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((2, this->rank(), m_track, m_map, i0, i1)) - return m_map.reference(i0, i1, 0, 0, 0, 0, 0); - } - - // Rank 3 - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_void::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value), - reference_type> - access(const iType0& i0, const iType1& i1, const iType2& i2) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (3, this->rank(), m_track, m_map, i0, i1, i2)) - return m_map.reference(i0, i1, i2); - } - - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t::value && - std::is_integral::value), - reference_type> - access(const iType0& i0, const iType1& i1, const iType2& i2) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (3, this->rank(), m_track, m_map, i0, i1, i2)) - return m_map.reference(i0, i1, i2, 0, 0, 0, 0); - } - - // Rank 4 - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_void::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value && std::is_integral::value), - reference_type> - access(const iType0& i0, const iType1& i1, const iType2& i2, - const iType3& i3) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (4, this->rank(), m_track, m_map, i0, i1, i2, i3)) - return m_map.reference(i0, i1, i2, i3); - } - - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t::value && - std::is_integral::value), - reference_type> - access(const iType0& i0, const iType1& i1, const iType2& i2, - const iType3& i3) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (4, this->rank(), m_track, m_map, i0, i1, i2, i3)) - return m_map.reference(i0, i1, i2, i3, 0, 0, 0); - } - - // Rank 5 - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_void::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value), - reference_type> - access(const iType0& i0, const iType1& i1, const iType2& i2, const iType3& i3, - const iType4& i4) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (5, this->rank(), m_track, m_map, i0, i1, i2, i3, i4)) - return m_map.reference(i0, i1, i2, i3, i4); - } - - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t::value && - std::is_integral::value), - reference_type> - access(const iType0& i0, const iType1& i1, const iType2& i2, - const iType3& i3, const iType4& i4) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (5, this->rank(), m_track, m_map, i0, i1, i2, i3, i4)) - return m_map.reference(i0, i1, i2, i3, i4, 0, 0); - } - - // Rank 6 - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_void::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value && std::is_integral::value), - reference_type> - access(const iType0& i0, const iType1& i1, const iType2& i2, const iType3& i3, - const iType4& i4, const iType5& i5) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (6, this->rank(), m_track, m_map, i0, i1, i2, i3, i4, i5)) - return m_map.reference(i0, i1, i2, i3, i4, i5); + using view_type::data; + using view_type::extent; + using view_type::extent_int; // FIXME: not tested + using view_type::impl_map; // FIXME: not tested + using view_type::is_allocated; + using view_type::label; + using view_type::size; + using view_type::span; + using view_type::span_is_contiguous; // FIXME: not tested + using view_type::stride; // FIXME: not tested + using view_type::stride_0; // FIXME: not tested + using view_type::stride_1; // FIXME: not tested + using view_type::stride_2; // FIXME: not tested + using view_type::stride_3; // FIXME: not tested + using view_type::stride_4; // FIXME: not tested + using view_type::stride_5; // FIXME: not tested + using view_type::stride_6; // FIXME: not tested + using view_type::stride_7; // FIXME: not tested + using view_type::use_count; + + KOKKOS_FUNCTION reference_type + operator()(index_type i0 = 0, index_type i1 = 0, index_type i2 = 0, + index_type i3 = 0, index_type i4 = 0, index_type i5 = 0, + index_type i6 = 0) const { + return view_type::operator()(i0, i1, i2, i3, i4, i5, i6); + } + +// This is an accomodation for Phalanx, that is usint the operator[] to access +// all elements in a linear fashion even when the rank is not 1 +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_FUNCTION reference_type operator[](index_type i0) const { + if constexpr (std::is_same_v) { + return view_type::data()[i0]; + } else { + const size_t dim_scalar = view_type::impl_map().dimension_scalar(); + const size_t bytes = view_type::span() / dim_scalar; + + using tmp_view_type = + Kokkos::View>; + tmp_view_type rankone_view(view_type::data(), bytes, dim_scalar); + return rankone_view(i0); + } } - - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t::value && - std::is_integral::value), - reference_type> - access(const iType0& i0, const iType1& i1, const iType2& i2, - const iType3& i3, const iType4& i4, const iType5& i5) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (6, this->rank(), m_track, m_map, i0, i1, i2, i3, i4, i5)) - return m_map.reference(i0, i1, i2, i3, i4, i5, 0); +#else + KOKKOS_FUNCTION reference_type operator[](index_type i0) const { +#ifdef KOKKOS_ENABLE_DEBUG + if (rank() != 1u) + Kokkos::abort("DynRankView operator[] can only be used for rank-1"); +#endif + return view_type::operator()(i0, 0, 0, 0, 0, 0, 0); } +#endif - // Rank 7 - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_integral::value && std::is_integral::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value), - reference_type> - access(const iType0& i0, const iType1& i1, const iType2& i2, const iType3& i3, - const iType4& i4, const iType5& i5, const iType6& i6) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (7, this->rank(), m_track, m_map, i0, i1, i2, i3, i4, i5, i6)) - return m_map.reference(i0, i1, i2, i3, i4, i5, i6); + KOKKOS_FUNCTION reference_type access(index_type i0 = 0, index_type i1 = 0, + index_type i2 = 0, index_type i3 = 0, + index_type i4 = 0, index_type i5 = 0, + index_type i6 = 0) const { + return view_type::operator()(i0, i1, i2, i3, i4, i5, i6); } -#undef KOKKOS_IMPL_VIEW_OPERATOR_VERIFY - //---------------------------------------- // Standard constructor, destructor, and assignment operators... KOKKOS_DEFAULTED_FUNCTION ~DynRankView() = default; - KOKKOS_INLINE_FUNCTION - DynRankView() : m_track(), m_map(), m_rank() {} // Default ctor + KOKKOS_DEFAULTED_FUNCTION DynRankView() = default; - KOKKOS_INLINE_FUNCTION - DynRankView(const DynRankView& rhs) - : m_track(rhs.m_track), m_map(rhs.m_map), m_rank(rhs.m_rank) {} - - KOKKOS_INLINE_FUNCTION - DynRankView(DynRankView&& rhs) - : m_track(rhs.m_track), m_map(rhs.m_map), m_rank(rhs.m_rank) {} + //---------------------------------------- + // Compatible view copy constructor and assignment + // may assign unmanaged from managed. + // Make this conditionally explicit? + template + KOKKOS_FUNCTION DynRankView(const DynRankView& rhs) + : view_type(rhs), m_rank(rhs.m_rank) {} - KOKKOS_INLINE_FUNCTION - DynRankView& operator=(const DynRankView& rhs) { - m_track = rhs.m_track; - m_map = rhs.m_map; - m_rank = rhs.m_rank; + template + KOKKOS_FUNCTION DynRankView& operator=(const DynRankView& rhs) { + view_type::operator=(rhs); + m_rank = rhs.m_rank; return *this; } - KOKKOS_INLINE_FUNCTION - DynRankView& operator=(DynRankView&& rhs) { - m_track = rhs.m_track; - m_map = rhs.m_map; - m_rank = rhs.m_rank; - return *this; +#if 0 // TODO: this will later be swapped in depending on whether the new View + // impl is active + private: + template + KOKKOS_FUNCTION typename view_type::extents_type create_rank7_extents( + const Ext& ext) { + return typename view_type::extents_type( + ext.rank() > 0 ? ext.extent(0) : 1, ext.rank() > 1 ? ext.extent(1) : 1, + ext.rank() > 2 ? ext.extent(2) : 1, ext.rank() > 3 ? ext.extent(3) : 1, + ext.rank() > 4 ? ext.extent(4) : 1, ext.rank() > 5 ? ext.extent(5) : 1, + ext.rank() > 6 ? ext.extent(6) : 1); } - //---------------------------------------- - // Compatible view copy constructor and assignment - // may assign unmanaged from managed. + public: + // Copy/Assign View to DynRankView template - KOKKOS_INLINE_FUNCTION DynRankView(const DynRankView& rhs) - : m_track(rhs.m_track, traits::is_managed), m_map(), m_rank(rhs.m_rank) { - using SrcTraits = typename DynRankView::traits; - using Mapping = Kokkos::Impl::ViewMapping; - static_assert(Mapping::is_assignable, - "Incompatible DynRankView copy construction"); - Mapping::assign(m_map, rhs.m_map, rhs.m_track); + KOKKOS_INLINE_FUNCTION DynRankView(const View& rhs, + size_t new_rank) + : view_type(rhs.data_handle(), drdtraits::createLayout(rhs.layout())), + m_rank(new_rank) { + if (new_rank > rhs.rank()) + Kokkos::abort( + "Attempting to construct DynRankView from View and new rank, with " + "the new rank being too large."); } template - KOKKOS_INLINE_FUNCTION DynRankView& operator=( - const DynRankView& rhs) { - using SrcTraits = typename DynRankView::traits; - using Mapping = Kokkos::Impl::ViewMapping; - static_assert(Mapping::is_assignable, - "Incompatible DynRankView copy construction"); - Mapping::assign(m_map, rhs.m_map, rhs.m_track); - m_track.assign(rhs.m_track, traits::is_managed); + KOKKOS_INLINE_FUNCTION DynRankView& operator=(const View& rhs) { + view_type::operator=(view_type( + rhs.data_handle(), + typename view_type::mapping_type(create_rank7_extents(rhs.extents())), + rhs.accessor())); m_rank = rhs.rank(); return *this; } - - // Copy/Assign View to DynRankView +#else template - KOKKOS_INLINE_FUNCTION DynRankView(const View& rhs) - : m_track(), m_map(), m_rank(View::rank()) { + KOKKOS_FUNCTION DynRankView(const View& rhs, size_t new_rank) { using SrcTraits = typename View::traits; using Mapping = Kokkos::Impl::ViewMapping; static_assert(Mapping::is_assignable, - "Incompatible View to DynRankView copy construction"); + "Incompatible View to DynRankView copy assignment"); + if (new_rank > View::rank()) + Kokkos::abort( + "Attempting to construct DynRankView from View and new rank, with " + "the new rank being too large."); Mapping::assign(*this, rhs); + m_rank = new_rank; } template - KOKKOS_INLINE_FUNCTION DynRankView& operator=(const View& rhs) { + KOKKOS_FUNCTION DynRankView& operator=(const View& rhs) { using SrcTraits = typename View::traits; using Mapping = Kokkos::Impl::ViewMapping { static_assert(Mapping::is_assignable, "Incompatible View to DynRankView copy assignment"); Mapping::assign(*this, rhs); + m_rank = View::rank(); return *this; } +#endif + + template + KOKKOS_FUNCTION DynRankView(const View& rhs) + : DynRankView(rhs, View::rank()) {} //---------------------------------------- // Allocation tracking properties - KOKKOS_INLINE_FUNCTION - int use_count() const { return m_track.use_count(); } - - inline const std::string label() const { - return m_track.template get_label(); - } - //---------------------------------------- // Allocation according to allocation properties and array layout // unused arg_layout dimensions must be set to KOKKOS_INVALID_INDEX so that // rank deduction can properly take place + // We need two variants to avoid calling host function from host device + // function warnings template - explicit inline DynRankView( + explicit KOKKOS_FUNCTION DynRankView( const Kokkos::Impl::ViewCtorProp& arg_prop, - std::enable_if_t::has_pointer, - typename traits::array_layout> const& arg_layout) - : m_track(), - m_map(), - m_rank(Impl::DynRankDimTraits:: - template computeRank( - arg_prop, arg_layout)) { - // Copy the input allocation properties with possibly defaulted properties - auto prop_copy = Impl::with_properties_if_unset( - arg_prop, std::string{}, typename traits::device_type::memory_space{}, - typename traits::device_type::execution_space{}); - using alloc_prop = decltype(prop_copy); - - static_assert(traits::is_managed, - "View allocation constructor requires managed memory"); - - if (alloc_prop::initialize && - !alloc_prop::execution_space::impl_is_initialized()) { - // If initializing view data then - // the execution space must be initialized. - Kokkos::Impl::throw_runtime_exception( - "Constructing DynRankView and initializing data with uninitialized " - "execution space"); - } - - Kokkos::Impl::SharedAllocationRecord<>* record = m_map.allocate_shared( - prop_copy, - Impl::DynRankDimTraits:: - template createLayout(arg_prop, arg_layout), - Impl::ViewCtorProp::has_execution_space); - - // Setup and initialization complete, start tracking - m_track.assign_allocated_record_to_uninitialized(record); - } + std::enable_if_t::has_pointer, + typename traits::array_layout const&> + arg_layout) + : view_type(arg_prop, drdtraits::template createLayout( + arg_prop, arg_layout)), + m_rank(drdtraits::computeRank(arg_prop, arg_layout)) {} - // Wrappers template - explicit KOKKOS_INLINE_FUNCTION DynRankView( + explicit DynRankView( const Kokkos::Impl::ViewCtorProp& arg_prop, - std::enable_if_t::has_pointer, - typename traits::array_layout> const& arg_layout) - : m_track() // No memory tracking - , - m_map(arg_prop, - Impl::DynRankDimTraits:: - template createLayout(arg_prop, arg_layout)), - m_rank(Impl::DynRankDimTraits:: - template computeRank( - arg_prop, arg_layout)) { - static_assert( - std::is_same::pointer_type>::value, - "Constructing DynRankView to wrap user memory must supply matching " - "pointer type"); - } + std::enable_if_t::has_pointer, + typename traits::array_layout const&> + arg_layout) + : view_type(arg_prop, drdtraits::template createLayout( + arg_prop, arg_layout)), + m_rank(drdtraits::computeRank(arg_prop, arg_layout)) {} //---------------------------------------- // Constructor(s) // Simple dimension-only layout + // We need two variants to avoid calling host function from host device + // function warnings template - explicit inline DynRankView( + explicit KOKKOS_FUNCTION DynRankView( const Kokkos::Impl::ViewCtorProp& arg_prop, - std::enable_if_t::has_pointer, - size_t> const arg_N0 = KOKKOS_INVALID_INDEX, - const size_t arg_N1 = KOKKOS_INVALID_INDEX, - const size_t arg_N2 = KOKKOS_INVALID_INDEX, - const size_t arg_N3 = KOKKOS_INVALID_INDEX, - const size_t arg_N4 = KOKKOS_INVALID_INDEX, - const size_t arg_N5 = KOKKOS_INVALID_INDEX, - const size_t arg_N6 = KOKKOS_INVALID_INDEX, - const size_t arg_N7 = KOKKOS_INVALID_INDEX) + std::enable_if_t::has_pointer, + const size_t> + arg_N0 = KOKKOS_INVALID_INDEX, + const size_t arg_N1 = KOKKOS_INVALID_INDEX, + const size_t arg_N2 = KOKKOS_INVALID_INDEX, + const size_t arg_N3 = KOKKOS_INVALID_INDEX, + const size_t arg_N4 = KOKKOS_INVALID_INDEX, + const size_t arg_N5 = KOKKOS_INVALID_INDEX, + const size_t arg_N6 = KOKKOS_INVALID_INDEX, + const size_t arg_N7 = KOKKOS_INVALID_INDEX) : DynRankView(arg_prop, typename traits::array_layout( arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7)) {} template - explicit KOKKOS_INLINE_FUNCTION DynRankView( + explicit DynRankView( const Kokkos::Impl::ViewCtorProp& arg_prop, - std::enable_if_t::has_pointer, - size_t> const arg_N0 = KOKKOS_INVALID_INDEX, - const size_t arg_N1 = KOKKOS_INVALID_INDEX, - const size_t arg_N2 = KOKKOS_INVALID_INDEX, - const size_t arg_N3 = KOKKOS_INVALID_INDEX, - const size_t arg_N4 = KOKKOS_INVALID_INDEX, - const size_t arg_N5 = KOKKOS_INVALID_INDEX, - const size_t arg_N6 = KOKKOS_INVALID_INDEX, - const size_t arg_N7 = KOKKOS_INVALID_INDEX) + std::enable_if_t::has_pointer, + const size_t> + arg_N0 = KOKKOS_INVALID_INDEX, + const size_t arg_N1 = KOKKOS_INVALID_INDEX, + const size_t arg_N2 = KOKKOS_INVALID_INDEX, + const size_t arg_N3 = KOKKOS_INVALID_INDEX, + const size_t arg_N4 = KOKKOS_INVALID_INDEX, + const size_t arg_N5 = KOKKOS_INVALID_INDEX, + const size_t arg_N6 = KOKKOS_INVALID_INDEX, + const size_t arg_N7 = KOKKOS_INVALID_INDEX) : DynRankView(arg_prop, typename traits::array_layout( arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7)) {} @@ -1188,16 +809,20 @@ class DynRankView : public ViewTraits { //---------------------------------------- // Memory span required to wrap these dimensions. + // FIXME: this function needs to be tested static constexpr size_t required_allocation_size( - const size_t arg_N0 = 0, const size_t arg_N1 = 0, const size_t arg_N2 = 0, - const size_t arg_N3 = 0, const size_t arg_N4 = 0, const size_t arg_N5 = 0, - const size_t arg_N6 = 0, const size_t arg_N7 = 0) { - return map_type::memory_span(typename traits::array_layout( - arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7)); + const size_t arg_N0 = 1, const size_t arg_N1 = 1, const size_t arg_N2 = 1, + const size_t arg_N3 = 1, const size_t arg_N4 = 1, const size_t arg_N5 = 1, + const size_t arg_N6 = 1, + [[maybe_unused]] const size_t arg_N7 = KOKKOS_INVALID_INDEX) { + // FIXME: check that arg_N7 is not set by user (in debug mode) + return view_type::required_allocation_size(arg_N0, arg_N1, arg_N2, arg_N3, + arg_N4, arg_N5, arg_N6); } - explicit KOKKOS_INLINE_FUNCTION DynRankView( - pointer_type arg_ptr, const size_t arg_N0 = KOKKOS_INVALID_INDEX, + explicit KOKKOS_FUNCTION DynRankView( + typename view_type::pointer_type arg_ptr, + const size_t arg_N0 = KOKKOS_INVALID_INDEX, const size_t arg_N1 = KOKKOS_INVALID_INDEX, const size_t arg_N2 = KOKKOS_INVALID_INDEX, const size_t arg_N3 = KOKKOS_INVALID_INDEX, @@ -1205,55 +830,38 @@ class DynRankView : public ViewTraits { const size_t arg_N5 = KOKKOS_INVALID_INDEX, const size_t arg_N6 = KOKKOS_INVALID_INDEX, const size_t arg_N7 = KOKKOS_INVALID_INDEX) - : DynRankView(Kokkos::Impl::ViewCtorProp(arg_ptr), arg_N0, - arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7) {} + : DynRankView( + Kokkos::Impl::ViewCtorProp( + arg_ptr), + arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7) {} - explicit KOKKOS_INLINE_FUNCTION DynRankView( - pointer_type arg_ptr, typename traits::array_layout& arg_layout) - : DynRankView(Kokkos::Impl::ViewCtorProp(arg_ptr), - arg_layout) {} + explicit KOKKOS_FUNCTION DynRankView( + typename view_type::pointer_type arg_ptr, + typename traits::array_layout& arg_layout) + : DynRankView( + Kokkos::Impl::ViewCtorProp( + arg_ptr), + arg_layout) {} //---------------------------------------- // Shared scratch memory constructor - static inline size_t shmem_size(const size_t arg_N0 = KOKKOS_INVALID_INDEX, - const size_t arg_N1 = KOKKOS_INVALID_INDEX, - const size_t arg_N2 = KOKKOS_INVALID_INDEX, - const size_t arg_N3 = KOKKOS_INVALID_INDEX, - const size_t arg_N4 = KOKKOS_INVALID_INDEX, - const size_t arg_N5 = KOKKOS_INVALID_INDEX, - const size_t arg_N6 = KOKKOS_INVALID_INDEX, - const size_t arg_N7 = KOKKOS_INVALID_INDEX) { - const size_t num_passed_args = - (arg_N0 != KOKKOS_INVALID_INDEX) + (arg_N1 != KOKKOS_INVALID_INDEX) + - (arg_N2 != KOKKOS_INVALID_INDEX) + (arg_N3 != KOKKOS_INVALID_INDEX) + - (arg_N4 != KOKKOS_INVALID_INDEX) + (arg_N5 != KOKKOS_INVALID_INDEX) + - (arg_N6 != KOKKOS_INVALID_INDEX) + (arg_N7 != KOKKOS_INVALID_INDEX); - - if (std::is_void::value && - num_passed_args != traits::rank_dynamic) { - Kokkos::abort( - "Kokkos::View::shmem_size() rank_dynamic != number of arguments.\n"); - } - {} - - return map_type::memory_span(typename traits::array_layout( - arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7)); + // Note: We must pass 7 valid args since view_type is rank 7 + static inline size_t shmem_size( + const size_t arg_N0 = 1, const size_t arg_N1 = 1, const size_t arg_N2 = 1, + const size_t arg_N3 = 1, const size_t arg_N4 = 1, const size_t arg_N5 = 1, + const size_t arg_N6 = 1, const size_t arg_N7 = KOKKOS_INVALID_INDEX) { + return view_type::shmem_size(arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, + arg_N6, arg_N7); } - explicit KOKKOS_INLINE_FUNCTION DynRankView( + explicit KOKKOS_FUNCTION DynRankView( const typename traits::execution_space::scratch_memory_space& arg_space, const typename traits::array_layout& arg_layout) - : DynRankView( - Kokkos::Impl::ViewCtorProp( - reinterpret_cast( - arg_space.get_shmem(map_type::memory_span( - Impl::DynRankDimTraits:: - createLayout(arg_layout) // is this correct? - )))), - arg_layout) {} + : view_type(arg_space, drdtraits::createLayout(arg_layout)), + m_rank(drdtraits::computeRank(arg_layout)) {} - explicit KOKKOS_INLINE_FUNCTION DynRankView( + explicit KOKKOS_FUNCTION DynRankView( const typename traits::execution_space::scratch_memory_space& arg_space, const size_t arg_N0 = KOKKOS_INVALID_INDEX, const size_t arg_N1 = KOKKOS_INVALID_INDEX, @@ -1264,21 +872,38 @@ class DynRankView : public ViewTraits { const size_t arg_N6 = KOKKOS_INVALID_INDEX, const size_t arg_N7 = KOKKOS_INVALID_INDEX) - : DynRankView( - Kokkos::Impl::ViewCtorProp( - reinterpret_cast( - arg_space.get_shmem(map_type::memory_span( - Impl::DynRankDimTraits:: - createLayout(typename traits::array_layout( - arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, - arg_N6, arg_N7)))))), - typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, - arg_N4, arg_N5, arg_N6, arg_N7)) {} + : DynRankView(arg_space, typename traits::array_layout( + arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, + arg_N5, arg_N6, arg_N7)) {} + + KOKKOS_FUNCTION constexpr auto layout() const { + switch (rank()) { + case 0: return Impl::as_view_of_rank_n<0>(*this).layout(); + case 1: return Impl::as_view_of_rank_n<1>(*this).layout(); + case 2: return Impl::as_view_of_rank_n<2>(*this).layout(); + case 3: return Impl::as_view_of_rank_n<3>(*this).layout(); + case 4: return Impl::as_view_of_rank_n<4>(*this).layout(); + case 5: return Impl::as_view_of_rank_n<5>(*this).layout(); + case 6: return Impl::as_view_of_rank_n<6>(*this).layout(); + case 7: return Impl::as_view_of_rank_n<7>(*this).layout(); + default: + KOKKOS_IF_ON_HOST( + Kokkos::abort( + std::string( + "Calling DynRankView::layout on DRV of unexpected rank " + + std::to_string(rank())) + .c_str());) + KOKKOS_IF_ON_DEVICE( + Kokkos::abort( + "Calling DynRankView::layout on DRV of unexpected rank");) + } + // control flow should never reach here + return view_type::layout(); + } }; template -KOKKOS_INLINE_FUNCTION constexpr unsigned rank( - const DynRankView& DRV) { +KOKKOS_FUNCTION constexpr unsigned rank(const DynRankView& DRV) { return DRV.rank(); } // needed for transition to common constexpr method in view and dynrankview // to return rank @@ -1293,181 +918,46 @@ struct DynRankSubviewTag {}; } // namespace Impl -namespace Impl { - -template -class ViewMapping< - std::enable_if_t<(std::is_void::value && - (std::is_same::value || - std::is_same::value || - std::is_same::value)), - Kokkos::Impl::DynRankSubviewTag>, - SrcTraits, Args...> { - private: - enum { - RZ = false, - R0 = bool(is_integral_extent<0, Args...>::value), - R1 = bool(is_integral_extent<1, Args...>::value), - R2 = bool(is_integral_extent<2, Args...>::value), - R3 = bool(is_integral_extent<3, Args...>::value), - R4 = bool(is_integral_extent<4, Args...>::value), - R5 = bool(is_integral_extent<5, Args...>::value), - R6 = bool(is_integral_extent<6, Args...>::value) - }; - - enum { - rank = unsigned(R0) + unsigned(R1) + unsigned(R2) + unsigned(R3) + - unsigned(R4) + unsigned(R5) + unsigned(R6) - }; - - using array_layout = Kokkos::LayoutStride; - - using value_type = typename SrcTraits::value_type; - - using data_type = value_type*******; - - public: - using traits_type = Kokkos::ViewTraits; - - using type = - Kokkos::View; - - template - struct apply { - static_assert(Kokkos::is_memory_traits::value); - - using traits_type = - Kokkos::ViewTraits; - - using type = Kokkos::View; - }; - - using dimension = typename SrcTraits::dimension; - - template - struct ExtentGenerator { - KOKKOS_INLINE_FUNCTION - static SubviewExtents<7, rank> generator( - const dimension& dim, Arg0 arg0 = Arg0(), Arg1 arg1 = Arg1(), - Arg2 arg2 = Arg2(), Arg3 arg3 = Arg3(), Arg4 arg4 = Arg4(), - Arg5 arg5 = Arg5(), Arg6 arg6 = Arg6()) { - return SubviewExtents<7, rank>(dim, arg0, arg1, arg2, arg3, arg4, arg5, - arg6); - } - }; - - using ret_type = Kokkos::DynRankView; - - template - KOKKOS_INLINE_FUNCTION static ret_type subview( - const unsigned src_rank, Kokkos::DynRankView const& src, - Args... args) { - using DstType = ViewMapping; - - using DstDimType = std::conditional_t< - (rank == 0), ViewDimension<>, - std::conditional_t< - (rank == 1), ViewDimension<0>, - std::conditional_t< - (rank == 2), ViewDimension<0, 0>, - std::conditional_t< - (rank == 3), ViewDimension<0, 0, 0>, - std::conditional_t< - (rank == 4), ViewDimension<0, 0, 0, 0>, - std::conditional_t< - (rank == 5), ViewDimension<0, 0, 0, 0, 0>, - std::conditional_t< - (rank == 6), ViewDimension<0, 0, 0, 0, 0, 0>, - ViewDimension<0, 0, 0, 0, 0, 0, 0>>>>>>>>; - - using dst_offset_type = ViewOffset; - using dst_handle_type = typename DstType::handle_type; - - ret_type dst; - - const SubviewExtents<7, rank> extents = ExtentGenerator::generator( - src.m_map.m_impl_offset.m_dim, args...); - - dst_offset_type tempdst(src.m_map.m_impl_offset, extents); - - dst.m_track = src.m_track; - - dst.m_map.m_impl_offset.m_dim.N0 = tempdst.m_dim.N0; - dst.m_map.m_impl_offset.m_dim.N1 = tempdst.m_dim.N1; - dst.m_map.m_impl_offset.m_dim.N2 = tempdst.m_dim.N2; - dst.m_map.m_impl_offset.m_dim.N3 = tempdst.m_dim.N3; - dst.m_map.m_impl_offset.m_dim.N4 = tempdst.m_dim.N4; - dst.m_map.m_impl_offset.m_dim.N5 = tempdst.m_dim.N5; - dst.m_map.m_impl_offset.m_dim.N6 = tempdst.m_dim.N6; - - dst.m_map.m_impl_offset.m_stride.S0 = tempdst.m_stride.S0; - dst.m_map.m_impl_offset.m_stride.S1 = tempdst.m_stride.S1; - dst.m_map.m_impl_offset.m_stride.S2 = tempdst.m_stride.S2; - dst.m_map.m_impl_offset.m_stride.S3 = tempdst.m_stride.S3; - dst.m_map.m_impl_offset.m_stride.S4 = tempdst.m_stride.S4; - dst.m_map.m_impl_offset.m_stride.S5 = tempdst.m_stride.S5; - dst.m_map.m_impl_offset.m_stride.S6 = tempdst.m_stride.S6; - - dst.m_map.m_impl_handle = - dst_handle_type(src.m_map.m_impl_handle + - src.m_map.m_impl_offset( - extents.domain_offset(0), extents.domain_offset(1), - extents.domain_offset(2), extents.domain_offset(3), - extents.domain_offset(4), extents.domain_offset(5), - extents.domain_offset(6))); - - dst.m_rank = - (src_rank > 0 ? unsigned(R0) : 0) + (src_rank > 1 ? unsigned(R1) : 0) + - (src_rank > 2 ? unsigned(R2) : 0) + (src_rank > 3 ? unsigned(R3) : 0) + - (src_rank > 4 ? unsigned(R4) : 0) + (src_rank > 5 ? unsigned(R5) : 0) + - (src_rank > 6 ? unsigned(R6) : 0); - - return dst; - } -}; - -} // namespace Impl - template using Subdynrankview = typename Kokkos::Impl::ViewMapping::ret_type; -template -KOKKOS_INLINE_FUNCTION Subdynrankview, Args...> -subdynrankview(const Kokkos::DynRankView& src, Args... args) { - if (src.rank() > sizeof...(Args)) // allow sizeof...(Args) >= src.rank(), - // ignore the remaining args - { - Kokkos::abort( - "subdynrankview: num of args must be >= rank of the source " - "DynRankView"); - } - - using metafcn = - Kokkos::Impl::ViewMapping, Args...>; - - return metafcn::subview(src.rank(), src, args...); +template +KOKKOS_INLINE_FUNCTION auto subdynrankview( + const DynRankView& drv, SubArg0 arg0 = SubArg0{}, + SubArg1 arg1 = SubArg1{}, SubArg2 arg2 = SubArg2{}, + SubArg3 arg3 = SubArg3{}, SubArg4 arg4 = SubArg4{}, + SubArg5 arg5 = SubArg5{}, SubArg6 arg6 = SubArg6{}) { + auto sub = subview(drv.DownCast(), arg0, arg1, arg2, arg3, arg4, arg5, arg6); + using sub_t = decltype(sub); + size_t new_rank = (drv.rank() > 0 && !std::is_integral_v ? 1 : 0) + + (drv.rank() > 1 && !std::is_integral_v ? 1 : 0) + + (drv.rank() > 2 && !std::is_integral_v ? 1 : 0) + + (drv.rank() > 3 && !std::is_integral_v ? 1 : 0) + + (drv.rank() > 4 && !std::is_integral_v ? 1 : 0) + + (drv.rank() > 5 && !std::is_integral_v ? 1 : 0) + + (drv.rank() > 6 && !std::is_integral_v ? 1 : 0); + + using return_type = + DynRankView; + return static_cast( + DynRankView( + sub, new_rank)); } - -// Wrapper to allow subview function name -template -KOKKOS_INLINE_FUNCTION Subdynrankview, Args...> -subview(const Kokkos::DynRankView& src, Args... args) { - return subdynrankview(src, args...); +template +KOKKOS_INLINE_FUNCTION auto subview( + const DynRankView& drv, SubArg0 arg0 = SubArg0{}, + SubArg1 arg1 = SubArg1{}, SubArg2 arg2 = SubArg2{}, + SubArg3 arg3 = SubArg3{}, SubArg4 arg4 = SubArg4{}, + SubArg5 arg5 = SubArg5{}, SubArg6 arg6 = SubArg6{}) { + return subdynrankview(drv, arg0, arg1, arg2, arg3, arg4, arg5, arg6); } } // namespace Kokkos @@ -1482,12 +972,12 @@ KOKKOS_INLINE_FUNCTION bool operator==(const DynRankView& lhs, using lhs_traits = ViewTraits; using rhs_traits = ViewTraits; - return std::is_same::value && - std::is_same::value && - std::is_same::value && + return std::is_same_v && + std::is_same_v && + std::is_same_v && lhs.rank() == rhs.rank() && lhs.data() == rhs.data() && lhs.span() == rhs.span() && lhs.extent(0) == rhs.extent(0) && lhs.extent(1) == rhs.extent(1) && lhs.extent(2) == rhs.extent(2) && @@ -1638,11 +1128,11 @@ namespace Impl { underlying memory, to facilitate implementation of deep_copy() and other routines that are defined on View */ template -KOKKOS_FUNCTION auto as_view_of_rank_n( +KOKKOS_FUNCTION View::type, Args...> +as_view_of_rank_n( DynRankView v, - typename std::enable_if::specialize, void>::value>::type* = - nullptr) { + std::enable_if_t< + std::is_same_v::specialize, void>>*) { if (v.rank() != N) { KOKKOS_IF_ON_HOST( const std::string message = @@ -1653,7 +1143,7 @@ KOKKOS_FUNCTION auto as_view_of_rank_n( Kokkos::abort("Converting DynRankView to a View of mis-matched rank!");) } - auto layout = v.impl_map().layout(); + auto layout = v.DownCast().layout(); if constexpr (std::is_same_v || std::is_same_v || @@ -1691,43 +1181,16 @@ void apply_to_view_of_static_rank(Function&& f, DynRankView a) { } // namespace Impl -template -KOKKOS_INLINE_FUNCTION constexpr auto DynRankView::layout() const -> - typename traits::array_layout { - switch (rank()) { - case 0: return Impl::as_view_of_rank_n<0>(*this).layout(); - case 1: return Impl::as_view_of_rank_n<1>(*this).layout(); - case 2: return Impl::as_view_of_rank_n<2>(*this).layout(); - case 3: return Impl::as_view_of_rank_n<3>(*this).layout(); - case 4: return Impl::as_view_of_rank_n<4>(*this).layout(); - case 5: return Impl::as_view_of_rank_n<5>(*this).layout(); - case 6: return Impl::as_view_of_rank_n<6>(*this).layout(); - case 7: return Impl::as_view_of_rank_n<7>(*this).layout(); - default: - KOKKOS_IF_ON_HOST( - Kokkos::abort( - std::string( - "Calling DynRankView::layout on DRV of unexpected rank " + - std::to_string(rank())) - .c_str());) - KOKKOS_IF_ON_DEVICE( - Kokkos::abort( - "Calling DynRankView::layout on DRV of unexpected rank");) - } - // control flow should never reach here - return m_map.layout(); -} - /** \brief Deep copy a value from Host memory into a view. */ template inline void deep_copy( const ExecSpace& e, const DynRankView& dst, typename ViewTraits::const_value_type& value, - std::enable_if_t::specialize, - void>::value>* = nullptr) { + std::enable_if_t::specialize, + void>>* = nullptr) { static_assert( - std::is_same::non_const_value_type, - typename ViewTraits::value_type>::value, + std::is_same_v::non_const_value_type, + typename ViewTraits::value_type>, "deep_copy requires non-const type"); Impl::apply_to_view_of_static_rank( @@ -1738,8 +1201,8 @@ template inline void deep_copy( const DynRankView& dst, typename ViewTraits::const_value_type& value, - std::enable_if_t::specialize, - void>::value>* = nullptr) { + std::enable_if_t::specialize, + void>>* = nullptr) { Impl::apply_to_view_of_static_rank([=](auto view) { deep_copy(view, value); }, dst); } @@ -1750,8 +1213,8 @@ inline void deep_copy( const ExecSpace& e, typename ViewTraits::non_const_value_type& dst, const DynRankView& src, - std::enable_if_t::specialize, - void>::value>* = 0) { + std::enable_if_t::specialize, + void>>* = 0) { deep_copy(e, dst, Impl::as_view_of_rank_n<0>(src)); } @@ -1759,8 +1222,8 @@ template inline void deep_copy( typename ViewTraits::non_const_value_type& dst, const DynRankView& src, - std::enable_if_t::specialize, - void>::value>* = 0) { + std::enable_if_t::specialize, + void>>* = 0) { deep_copy(dst, Impl::as_view_of_rank_n<0>(src)); } @@ -1773,15 +1236,13 @@ inline void deep_copy( template inline void deep_copy( const ExecSpace& exec_space, const DstType& dst, const SrcType& src, - std::enable_if_t< - (std::is_void::value && - std::is_void::value && - (Kokkos::is_dyn_rank_view::value || - Kokkos::is_dyn_rank_view::value))>* = nullptr) { - static_assert( - std::is_same::value, - "deep_copy requires non-const destination type"); + std::enable_if_t<(std::is_void_v && + std::is_void_v && + (Kokkos::is_dyn_rank_view::value || + Kokkos::is_dyn_rank_view::value))>* = nullptr) { + static_assert(std::is_same_v, + "deep_copy requires non-const destination type"); switch (rank(dst)) { case 0: @@ -1826,15 +1287,13 @@ inline void deep_copy( template inline void deep_copy( const DstType& dst, const SrcType& src, - std::enable_if_t< - (std::is_void::value && - std::is_void::value && - (Kokkos::is_dyn_rank_view::value || - Kokkos::is_dyn_rank_view::value))>* = nullptr) { - static_assert( - std::is_same::value, - "deep_copy requires non-const destination type"); + std::enable_if_t<(std::is_void_v && + std::is_void_v && + (Kokkos::is_dyn_rank_view::value || + Kokkos::is_dyn_rank_view::value))>* = nullptr) { + static_assert(std::is_same_v, + "deep_copy requires non-const destination type"); switch (rank(dst)) { case 0: @@ -1894,7 +1353,7 @@ struct MirrorDRViewType { // Check whether it is the same memory space enum { is_same_memspace = - std::is_same::value + std::is_same_v }; // The array_layout using array_layout = typename src_view_type::array_layout; @@ -1909,26 +1368,6 @@ struct MirrorDRViewType { std::conditional_t; }; -template -struct MirrorDRVType { - // The incoming view_type - using src_view_type = typename Kokkos::DynRankView; - // The memory space for the mirror view - using memory_space = typename Space::memory_space; - // Check whether it is the same memory space - enum { - is_same_memspace = - std::is_same::value - }; - // The array_layout - using array_layout = typename src_view_type::array_layout; - // The data type (we probably want it non-const since otherwise we can't even - // deep_copy to it. - using data_type = typename src_view_type::non_const_data_type; - // The destination view type if it is not the same memory space - using view_type = Kokkos::DynRankView; -}; - } // namespace Impl namespace Impl { @@ -1945,10 +1384,9 @@ inline auto create_mirror(const DynRankView& src, arg_prop, std::string(src.label()).append("_mirror")); if constexpr (Impl::ViewCtorProp::has_memory_space) { - using dst_type = typename Impl::MirrorDRVType< + using dst_type = typename Impl::MirrorDRViewType< typename Impl::ViewCtorProp::memory_space, T, - P...>::view_type; - + P...>::dest_view_type; return dst_type(prop_copy, Impl::reconstructLayout(src.layout(), src.rank())); } else { @@ -1989,7 +1427,8 @@ template ::value && std::is_void_v::specialize>>> -auto create_mirror(const Space&, const Kokkos::DynRankView& src) { +inline auto create_mirror(const Space&, + const Kokkos::DynRankView& src) { return Impl::create_mirror( src, Kokkos::view_alloc(typename Space::memory_space{})); } @@ -1999,8 +1438,8 @@ template ::value && std::is_void_v::specialize>>> -auto create_mirror(Kokkos::Impl::WithoutInitializing_t wi, const Space&, - const Kokkos::DynRankView& src) { +inline auto create_mirror(Kokkos::Impl::WithoutInitializing_t wi, const Space&, + const Kokkos::DynRankView& src) { return Impl::create_mirror( src, Kokkos::view_alloc(wi, typename Space::memory_space{})); } @@ -2026,12 +1465,12 @@ inline auto create_mirror_view( [[maybe_unused]] const typename Impl::ViewCtorProp& arg_prop) { if constexpr (!Impl::ViewCtorProp::has_memory_space) { - if constexpr (std::is_same::memory_space, - typename DynRankView< - T, P...>::HostMirror::memory_space>::value && - std::is_same::data_type, - typename DynRankView< - T, P...>::HostMirror::data_type>::value) { + if constexpr (std::is_same_v::memory_space, + typename DynRankView< + T, P...>::HostMirror::memory_space> && + std::is_same_v< + typename DynRankView::data_type, + typename DynRankView::HostMirror::data_type>) { return typename DynRankView::HostMirror(src); } else { return Kokkos::Impl::choose_create_mirror(src, arg_prop); @@ -2102,7 +1541,7 @@ inline auto create_mirror_view( // view_alloc template ::specialize>::value>> + std::is_void_v::specialize>>> auto create_mirror_view_and_copy( [[maybe_unused]] const Impl::ViewCtorProp& arg_prop, const Kokkos::DynRankView& src) { diff --git a/packages/kokkos/containers/src/Kokkos_DynamicView.hpp b/packages/kokkos/containers/src/Kokkos_DynamicView.hpp index a4b74e246e0d..caae3f791f04 100644 --- a/packages/kokkos/containers/src/Kokkos_DynamicView.hpp +++ b/packages/kokkos/containers/src/Kokkos_DynamicView.hpp @@ -40,10 +40,10 @@ struct ChunkedArrayManager { using pointer_type = ValueType*; using track_type = Kokkos::Impl::SharedAllocationTracker; - ChunkedArrayManager() = default; - ChunkedArrayManager(ChunkedArrayManager const&) = default; - ChunkedArrayManager(ChunkedArrayManager&&) = default; - ChunkedArrayManager& operator=(ChunkedArrayManager&&) = default; + ChunkedArrayManager() = default; + ChunkedArrayManager(ChunkedArrayManager const&) = default; + ChunkedArrayManager(ChunkedArrayManager&&) = default; + ChunkedArrayManager& operator=(ChunkedArrayManager&&) = default; ChunkedArrayManager& operator=(const ChunkedArrayManager&) = default; template @@ -129,10 +129,10 @@ struct ChunkedArrayManager { /// allocation template struct Destroy { - Destroy() = default; - Destroy(Destroy&&) = default; - Destroy(const Destroy&) = default; - Destroy& operator=(Destroy&&) = default; + Destroy() = default; + Destroy(Destroy&&) = default; + Destroy(const Destroy&) = default; + Destroy& operator=(Destroy&&) = default; Destroy& operator=(const Destroy&) = default; Destroy(std::string label, value_type** arg_chunk, @@ -250,7 +250,7 @@ class DynamicView : public Kokkos::ViewTraits { // It is assumed that the value_type is trivially copyable; // when this is not the case, potential problems can occur. - static_assert(std::is_void::value, + static_assert(std::is_void_v, "DynamicView only implemented for non-specialized View type"); private: @@ -363,7 +363,7 @@ class DynamicView : public Kokkos::ViewTraits { enum { reference_type_is_lvalue_reference = - std::is_lvalue_reference::value + std::is_lvalue_reference_v }; KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const { @@ -463,11 +463,11 @@ class DynamicView : public Kokkos::ViewTraits { //---------------------------------------------------------------------- - ~DynamicView() = default; - DynamicView() = default; - DynamicView(DynamicView&&) = default; - DynamicView(const DynamicView&) = default; - DynamicView& operator=(DynamicView&&) = default; + ~DynamicView() = default; + DynamicView() = default; + DynamicView(DynamicView&&) = default; + DynamicView(const DynamicView&) = default; + DynamicView& operator=(DynamicView&&) = default; DynamicView& operator=(const DynamicView&) = default; template @@ -572,7 +572,7 @@ struct MirrorDynamicViewType { // Check whether it is the same memory space enum { is_same_memspace = - std::is_same::value + std::is_same_v }; // The array_layout using array_layout = typename src_view_type::array_layout; @@ -665,9 +665,9 @@ template ::value && std::is_void_v::specialize>>> -typename Kokkos::Impl::MirrorDynamicViewType::view_type -create_mirror(Kokkos::Impl::WithoutInitializing_t wi, const Space&, - const Kokkos::Experimental::DynamicView& src) { +inline auto create_mirror( + Kokkos::Impl::WithoutInitializing_t wi, const Space&, + const Kokkos::Experimental::DynamicView& src) { return Impl::create_mirror( src, Kokkos::view_alloc(wi, typename Space::memory_space{})); } @@ -693,14 +693,14 @@ inline auto create_mirror_view( const Kokkos::Experimental::DynamicView& src, [[maybe_unused]] const Impl::ViewCtorProp& arg_prop) { if constexpr (!Impl::ViewCtorProp::has_memory_space) { - if constexpr (std::is_same::memory_space, - typename Kokkos::Experimental::DynamicView< - T, P...>::HostMirror::memory_space>::value && - std::is_same::data_type, - typename Kokkos::Experimental::DynamicView< - T, P...>::HostMirror::data_type>::value) { + if constexpr (std::is_same_v::memory_space, + typename Kokkos::Experimental::DynamicView< + T, P...>::HostMirror::memory_space> && + std::is_same_v::data_type, + typename Kokkos::Experimental::DynamicView< + T, P...>::HostMirror::data_type>) { return typename Kokkos::Experimental::DynamicView::HostMirror(src); } else { @@ -835,21 +835,17 @@ inline void deep_copy(const View& dst, using dst_execution_space = typename ViewTraits::execution_space; using src_memory_space = typename ViewTraits::memory_space; - enum { - DstExecCanAccessSrc = - Kokkos::SpaceAccessibility::accessible - }; + constexpr bool DstExecCanAccessSrc = + Kokkos::SpaceAccessibility::accessible; + static_assert( + DstExecCanAccessSrc, + "deep_copy given views that would require a temporary allocation"); - if (DstExecCanAccessSrc) { - // Copying data between views in accessible memory spaces and either - // non-contiguous or incompatible shape. - Kokkos::Impl::ViewRemap(dst, src); - Kokkos::fence("Kokkos::deep_copy(DynamicView)"); - } else { - Kokkos::Impl::throw_runtime_exception( - "deep_copy given views that would require a temporary allocation"); - } + // Copying data between views in accessible memory spaces and either + // non-contiguous or incompatible shape. + Kokkos::Impl::ViewRemap(dst, src); + Kokkos::fence("Kokkos::deep_copy(DynamicView)"); } template @@ -861,21 +857,17 @@ inline void deep_copy(const Kokkos::Experimental::DynamicView& dst, using dst_execution_space = typename ViewTraits::execution_space; using src_memory_space = typename ViewTraits::memory_space; - enum { - DstExecCanAccessSrc = - Kokkos::SpaceAccessibility::accessible - }; + constexpr bool DstExecCanAccessSrc = + Kokkos::SpaceAccessibility::accessible; + static_assert( + DstExecCanAccessSrc, + "deep_copy given views that would require a temporary allocation"); - if (DstExecCanAccessSrc) { - // Copying data between views in accessible memory spaces and either - // non-contiguous or incompatible shape. - Kokkos::Impl::ViewRemap(dst, src); - Kokkos::fence("Kokkos::deep_copy(DynamicView)"); - } else { - Kokkos::Impl::throw_runtime_exception( - "deep_copy given views that would require a temporary allocation"); - } + // Copying data between views in accessible memory spaces and either + // non-contiguous or incompatible shape. + Kokkos::Impl::ViewRemap(dst, src); + Kokkos::fence("Kokkos::deep_copy(DynamicView)"); } namespace Impl { @@ -964,7 +956,7 @@ struct ViewCopy, // view_alloc template ::specialize>::value>> + std::is_void_v::specialize>>> auto create_mirror_view_and_copy( [[maybe_unused]] const Impl::ViewCtorProp& arg_prop, const Kokkos::Experimental::DynamicView& src) { diff --git a/packages/kokkos/containers/src/Kokkos_OffsetView.hpp b/packages/kokkos/containers/src/Kokkos_OffsetView.hpp index 3adc70b19049..cf23c25b86bd 100644 --- a/packages/kokkos/containers/src/Kokkos_OffsetView.hpp +++ b/packages/kokkos/containers/src/Kokkos_OffsetView.hpp @@ -50,9 +50,9 @@ inline constexpr bool is_offset_view_v = is_offset_view::value; #define KOKKOS_INVALID_INDEX_RANGE \ { KOKKOS_INVALID_OFFSET, KOKKOS_INVALID_OFFSET } -template ::value && - std::is_signed::value, - iType> = 0> +template && std::is_signed_v, + iType> = 0> using IndexRange = Kokkos::Array; using index_list_type = std::initializer_list; @@ -118,11 +118,11 @@ KOKKOS_INLINE_FUNCTION void offsetview_verify_operator_bounds( (enum {LEN = 1024}; char buffer[LEN]; const std::string label = tracker.template get_label(); int n = snprintf(buffer, LEN, - "OffsetView bounds error of view labeled %s (", - label.c_str()); + "OffsetView bounds error of view labeled %s (", + label.c_str()); offsetview_error_operator_bounds<0>(buffer + n, LEN - n, map, begins, args...); - Kokkos::Impl::throw_runtime_exception(std::string(buffer));)) + Kokkos::abort(buffer);)) KOKKOS_IF_ON_DEVICE( (Kokkos::abort("OffsetView bounds error"); (void)tracker;)) @@ -180,44 +180,40 @@ void runtime_check_rank_device(const size_t rank_dynamic, const size_t rank, } // namespace Impl template -class OffsetView : public ViewTraits { - public: - using traits = ViewTraits; - +class OffsetView : public View { private: template friend class OffsetView; - template - friend class View; // FIXME delete this line - template - friend class Kokkos::Impl::ViewMapping; - using map_type = Kokkos::Impl::ViewMapping; - using track_type = Kokkos::Impl::SharedAllocationTracker; + using base_t = View; public: - enum { Rank = map_type::Rank }; - using begins_type = Kokkos::Array; + // typedefs to reduce typing base_t:: further down + using traits = typename base_t::traits; + // FIXME: should be base_t::index_type after refactor + using index_type = typename base_t::memory_space::size_type; + using pointer_type = typename base_t::pointer_type; + + using begins_type = Kokkos::Array; template ::value, iType> = 0> + std::enable_if_t, iType> = 0> KOKKOS_FUNCTION int64_t begin(const iType local_dimension) const { - return local_dimension < Rank ? m_begins[local_dimension] - : KOKKOS_INVALID_OFFSET; + return static_cast(local_dimension) < base_t::rank() + ? m_begins[local_dimension] + : KOKKOS_INVALID_OFFSET; } KOKKOS_FUNCTION begins_type begins() const { return m_begins; } template ::value, iType> = 0> + std::enable_if_t, iType> = 0> KOKKOS_FUNCTION int64_t end(const iType local_dimension) const { - return begin(local_dimension) + m_map.extent(local_dimension); + return begin(local_dimension) + base_t::extent(local_dimension); } private: - track_type m_track; - map_type m_map; begins_type m_begins; public: @@ -245,529 +241,60 @@ class OffsetView : public ViewTraits { typename traits::array_layout, typename traits::host_mirror_space>; - //---------------------------------------- - // Domain rank and extents - - /** \brief rank() to be implemented - */ - // KOKKOS_FUNCTION - // static - // constexpr unsigned rank() { return map_type::Rank; } - - template - KOKKOS_FUNCTION constexpr std::enable_if_t::value, - size_t> - extent(const iType& r) const { - return m_map.extent(r); - } - - template - KOKKOS_FUNCTION constexpr std::enable_if_t::value, - int> - extent_int(const iType& r) const { - return static_cast(m_map.extent(r)); - } - - KOKKOS_FUNCTION constexpr typename traits::array_layout layout() const { - return m_map.layout(); - } - - KOKKOS_FUNCTION constexpr size_t size() const { - return m_map.dimension_0() * m_map.dimension_1() * m_map.dimension_2() * - m_map.dimension_3() * m_map.dimension_4() * m_map.dimension_5() * - m_map.dimension_6() * m_map.dimension_7(); - } - - KOKKOS_FUNCTION constexpr size_t stride_0() const { return m_map.stride_0(); } - KOKKOS_FUNCTION constexpr size_t stride_1() const { return m_map.stride_1(); } - KOKKOS_FUNCTION constexpr size_t stride_2() const { return m_map.stride_2(); } - KOKKOS_FUNCTION constexpr size_t stride_3() const { return m_map.stride_3(); } - KOKKOS_FUNCTION constexpr size_t stride_4() const { return m_map.stride_4(); } - KOKKOS_FUNCTION constexpr size_t stride_5() const { return m_map.stride_5(); } - KOKKOS_FUNCTION constexpr size_t stride_6() const { return m_map.stride_6(); } - KOKKOS_FUNCTION constexpr size_t stride_7() const { return m_map.stride_7(); } - - template - KOKKOS_FUNCTION constexpr std::enable_if_t::value, - size_t> - stride(iType r) const { - return ( - r == 0 - ? m_map.stride_0() - : (r == 1 - ? m_map.stride_1() - : (r == 2 - ? m_map.stride_2() - : (r == 3 - ? m_map.stride_3() - : (r == 4 - ? m_map.stride_4() - : (r == 5 - ? m_map.stride_5() - : (r == 6 - ? m_map.stride_6() - : m_map.stride_7()))))))); + template + KOKKOS_FUNCTION typename base_t::reference_type offset_operator( + std::integer_sequence, OtherIndexTypes... indices) const { + return base_t::operator()((indices - m_begins[I])...); } - template - KOKKOS_FUNCTION void stride(iType* const s) const { - m_map.stride(s); - } - - //---------------------------------------- - // Range span is the span which contains all members. - - using reference_type = typename map_type::reference_type; - using pointer_type = typename map_type::pointer_type; - - enum { - reference_type_is_lvalue_reference = - std::is_lvalue_reference::value - }; - - KOKKOS_FUNCTION constexpr size_t span() const { return m_map.span(); } - KOKKOS_FUNCTION bool span_is_contiguous() const { - return m_map.span_is_contiguous(); - } - KOKKOS_FUNCTION constexpr bool is_allocated() const { - return m_map.data() != nullptr; - } - KOKKOS_FUNCTION constexpr pointer_type data() const { return m_map.data(); } - - //---------------------------------------- - // Allow specializations to query their specialized map - - KOKKOS_FUNCTION - const Kokkos::Impl::ViewMapping& implementation_map() const { - return m_map; - } - - //---------------------------------------- - - private: - static constexpr bool is_layout_left = - std::is_same::value; - - static constexpr bool is_layout_right = - std::is_same::value; - - static constexpr bool is_layout_stride = - std::is_same::value; - - static constexpr bool is_default_map = - std::is_void::value && - (is_layout_left || is_layout_right || is_layout_stride); - -#if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK) - -#define KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY(ARG) \ - Kokkos::Impl::runtime_check_memory_access_violation< \ - typename traits::memory_space>( \ - "Kokkos::OffsetView ERROR: attempt to access inaccessible memory " \ - "space"); \ - Kokkos::Experimental::Impl::offsetview_verify_operator_bounds< \ - typename traits::memory_space> \ - ARG; - -#else - -#define KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY(ARG) \ - Kokkos::Impl::runtime_check_memory_access_violation< \ - typename traits::memory_space>( \ - "Kokkos::OffsetView ERROR: attempt to access inaccessible memory " \ - "space"); - + template +#ifndef KOKKOS_ENABLE_CXX17 + requires(std::is_convertible_v && + std::is_nothrow_constructible_v && + (base_t::rank() == 1)) #endif - public: - //------------------------------ - // Rank 0 operator() - - KOKKOS_FORCEINLINE_FUNCTION - reference_type operator()() const { return m_map.reference(); } - //------------------------------ - // Rank 1 operator() - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::are_integral::value && (1 == Rank) && !is_default_map), - reference_type> - operator()(const I0& i0) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0)) - const size_t j0 = i0 - m_begins[0]; - return m_map.reference(j0); - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::are_integral::value && (1 == Rank) && - is_default_map && !is_layout_stride), - reference_type> - operator()(const I0& i0) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0)) - const size_t j0 = i0 - m_begins[0]; - return m_map.m_impl_handle[j0]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::are_integral::value && (1 == Rank) && - is_default_map && is_layout_stride), - reference_type> - operator()(const I0& i0) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0)) - const size_t j0 = i0 - m_begins[0]; - return m_map.m_impl_handle[m_map.m_impl_offset.m_stride.S0 * j0]; - } - //------------------------------ - // Rank 1 operator[] - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::are_integral::value && (1 == Rank) && !is_default_map), - reference_type> - operator[](const I0& i0) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0)) - const size_t j0 = i0 - m_begins[0]; - return m_map.reference(j0); - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::are_integral::value && (1 == Rank) && - is_default_map && !is_layout_stride), - reference_type> - operator[](const I0& i0) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0)) - const size_t j0 = i0 - m_begins[0]; - return m_map.m_impl_handle[j0]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::are_integral::value && (1 == Rank) && - is_default_map && is_layout_stride), - reference_type> - operator[](const I0& i0) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0)) - const size_t j0 = i0 - m_begins[0]; - return m_map.m_impl_handle[m_map.m_impl_offset.m_stride.S0 * j0]; - } - - //------------------------------ - // Rank 2 - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::are_integral::value && - (2 == Rank) && !is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0, i1)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - return m_map.reference(j0, j1); - } - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::are_integral::value && (2 == Rank) && - is_default_map && - (is_layout_left || is_layout_right || is_layout_stride)), - reference_type> - operator()(const I0& i0, const I1& i1) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0, i1)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - if constexpr (is_layout_left) { - if constexpr (traits::rank_dynamic == 0) - return m_map.m_impl_handle[j0 + m_map.m_impl_offset.m_dim.N0 * j1]; - else - return m_map.m_impl_handle[j0 + m_map.m_impl_offset.m_stride * j1]; - } else if constexpr (is_layout_right) { - if constexpr (traits::rank_dynamic == 0) - return m_map.m_impl_handle[j1 + m_map.m_impl_offset.m_dim.N1 * j0]; - else - return m_map.m_impl_handle[j1 + m_map.m_impl_offset.m_stride * j0]; - } else { - static_assert(is_layout_stride); - return m_map.m_impl_handle[j0 * m_map.m_impl_offset.m_stride.S0 + - j1 * m_map.m_impl_offset.m_stride.S1]; - } -#if defined(KOKKOS_COMPILER_INTEL) - __builtin_unreachable(); + KOKKOS_FUNCTION constexpr typename base_t::reference_type operator[]( + const OtherIndexType& idx) const { +#ifdef KOKKOS_ENABLE_CXX17 + static_assert(std::is_convertible_v && + std::is_nothrow_constructible_v && + (base_t::rank() == 1)); #endif + return base_t::operator[](idx - m_begins[0]); } - //------------------------------ - // Rank 3 - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::are_integral::value && - (3 == Rank) && is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, const I2& i2) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, m_begins, i0, i1, i2)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - return m_map.m_impl_handle[m_map.m_impl_offset(j0, j1, j2)]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::are_integral::value && - (3 == Rank) && !is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, const I2& i2) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, m_begins, i0, i1, i2)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - return m_map.reference(j0, j1, j2); - } - - //------------------------------ - // Rank 4 - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::are_integral::value && - (4 == Rank) && is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, m_begins, i0, i1, i2, i3)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - const size_t j3 = i3 - m_begins[3]; - return m_map.m_impl_handle[m_map.m_impl_offset(j0, j1, j2, j3)]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::are_integral::value && - (4 == Rank) && !is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, m_begins, i0, i1, i2, i3)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - const size_t j3 = i3 - m_begins[3]; - return m_map.reference(j0, j1, j2, j3); - } - - //------------------------------ - // Rank 5 - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::are_integral::value && - (5 == Rank) && is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3, - const I4& i4) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, m_begins, i0, i1, i2, i3, i4)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - const size_t j3 = i3 - m_begins[3]; - const size_t j4 = i4 - m_begins[4]; - return m_map.m_impl_handle[m_map.m_impl_offset(j0, j1, j2, j3, j4)]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::are_integral::value && - (5 == Rank) && !is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3, - const I4& i4) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, m_begins, i0, i1, i2, i3, i4)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - const size_t j3 = i3 - m_begins[3]; - const size_t j4 = i4 - m_begins[4]; - return m_map.reference(j0, j1, j2, j3, j4); - } - - //------------------------------ - // Rank 6 - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::are_integral::value && - (6 == Rank) && is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3, - const I4& i4, const I5& i5) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, m_begins, i0, i1, i2, i3, i4, i5)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - const size_t j3 = i3 - m_begins[3]; - const size_t j4 = i4 - m_begins[4]; - const size_t j5 = i5 - m_begins[5]; - return m_map.m_impl_handle[m_map.m_impl_offset(j0, j1, j2, j3, j4, j5)]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::are_integral::value && - (6 == Rank) && !is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3, - const I4& i4, const I5& i5) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, m_begins, i0, i1, i2, i3, i4, i5)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - const size_t j3 = i3 - m_begins[3]; - const size_t j4 = i4 - m_begins[4]; - const size_t j5 = i5 - m_begins[5]; - return m_map.reference(j0, j1, j2, j3, j4, j5); - } - - //------------------------------ - // Rank 7 - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::are_integral::value && - (7 == Rank) && is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3, - const I4& i4, const I5& i5, const I6& i6) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, m_begins, i0, i1, i2, i3, i4, i5, i6)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - const size_t j3 = i3 - m_begins[3]; - const size_t j4 = i4 - m_begins[4]; - const size_t j5 = i5 - m_begins[5]; - const size_t j6 = i6 - m_begins[6]; - return m_map.m_impl_handle[m_map.m_impl_offset(j0, j1, j2, j3, j4, j5, j6)]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::are_integral::value && - (7 == Rank) && !is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3, - const I4& i4, const I5& i5, const I6& i6) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, m_begins, i0, i1, i2, i3, i4, i5, i6)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - const size_t j3 = i3 - m_begins[3]; - const size_t j4 = i4 - m_begins[4]; - const size_t j5 = i5 - m_begins[5]; - const size_t j6 = i6 - m_begins[6]; - return m_map.reference(j0, j1, j2, j3, j4, j5, j6); - } - - //------------------------------ - // Rank 8 - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::are_integral::value && - (8 == Rank) && is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3, - const I4& i4, const I5& i5, const I6& i6, const I7& i7) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, m_begins, i0, i1, i2, i3, i4, i5, i6, i7)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - const size_t j3 = i3 - m_begins[3]; - const size_t j4 = i4 - m_begins[4]; - const size_t j5 = i5 - m_begins[5]; - const size_t j6 = i6 - m_begins[6]; - const size_t j7 = i7 - m_begins[7]; - return m_map - .m_impl_handle[m_map.m_impl_offset(j0, j1, j2, j3, j4, j5, j6, j7)]; + template +#ifndef KOKKOS_ENABLE_CXX17 + requires((std::is_convertible_v && ...) && + (std::is_nothrow_constructible_v && + ...) && + (sizeof...(OtherIndexTypes) == base_t::rank())) +#endif + KOKKOS_FUNCTION constexpr typename base_t::reference_type operator()( + OtherIndexTypes... indices) const { +#ifdef KOKKOS_ENABLE_CXX17 + static_assert( + (std::is_convertible_v && ...) && + (std::is_nothrow_constructible_v && ...) && + (sizeof...(OtherIndexTypes) == base_t::rank())); +#endif + return offset_operator(std::make_index_sequence(), + indices...); } - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::are_integral::value && - (8 == Rank) && !is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3, - const I4& i4, const I5& i5, const I6& i6, const I7& i7) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, m_begins, i0, i1, i2, i3, i4, i5, i6, i7)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - const size_t j3 = i3 - m_begins[3]; - const size_t j4 = i4 - m_begins[4]; - const size_t j5 = i5 - m_begins[5]; - const size_t j6 = i6 - m_begins[6]; - const size_t j7 = i7 - m_begins[7]; - return m_map.reference(j0, j1, j2, j3, j4, j5, j6, j7); - } + template + KOKKOS_FUNCTION constexpr typename base_t::reference_type access( + OtherIndexTypes... args) const = delete; -#undef KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY + //---------------------------------------- //---------------------------------------- // Standard destructor, constructors, and assignment operators - KOKKOS_DEFAULTED_FUNCTION - ~OffsetView() = default; - KOKKOS_FUNCTION - OffsetView() : m_track(), m_map() { - for (size_t i = 0; i < Rank; ++i) m_begins[i] = KOKKOS_INVALID_OFFSET; - } - - KOKKOS_FUNCTION - OffsetView(const OffsetView& rhs) - : m_track(rhs.m_track, traits::is_managed), - m_map(rhs.m_map), - m_begins(rhs.m_begins) {} - - KOKKOS_FUNCTION - OffsetView(OffsetView&& rhs) - : m_track(std::move(rhs.m_track)), - m_map(std::move(rhs.m_map)), - m_begins(std::move(rhs.m_begins)) {} - - KOKKOS_FUNCTION - OffsetView& operator=(const OffsetView& rhs) { - m_track = rhs.m_track; - m_map = rhs.m_map; - m_begins = rhs.m_begins; - return *this; - } - - KOKKOS_FUNCTION - OffsetView& operator=(OffsetView&& rhs) { - m_track = std::move(rhs.m_track); - m_map = std::move(rhs.m_map); - m_begins = std::move(rhs.m_begins); - return *this; + OffsetView() : base_t() { + for (size_t i = 0; i < base_t::rank(); ++i) + m_begins[i] = KOKKOS_INVALID_OFFSET; } // interoperability with View @@ -778,20 +305,10 @@ class OffsetView : public ViewTraits { public: KOKKOS_FUNCTION - view_type view() const { - view_type v(m_track, m_map); - return v; - } + view_type view() const { return *this; } template - KOKKOS_FUNCTION OffsetView(const View& aview) - : m_track(aview.impl_track()), m_map() { - using SrcTraits = typename OffsetView::traits; - using Mapping = Kokkos::Impl::ViewMapping; - static_assert(Mapping::is_assignable, - "Incompatible OffsetView copy construction"); - Mapping::assign(m_map, aview.impl_map(), m_track); - + KOKKOS_FUNCTION OffsetView(const View& aview) : base_t(aview) { for (size_t i = 0; i < View::rank(); ++i) { m_begins[i] = 0; } @@ -800,19 +317,14 @@ class OffsetView : public ViewTraits { template KOKKOS_FUNCTION OffsetView(const View& aview, const index_list_type& minIndices) - : m_track(aview.impl_track()), m_map() { - using SrcTraits = typename OffsetView::traits; - using Mapping = Kokkos::Impl::ViewMapping; - static_assert(Mapping::is_assignable, - "Incompatible OffsetView copy construction"); - Mapping::assign(m_map, aview.impl_map(), m_track); - - KOKKOS_IF_ON_HOST((Kokkos::Experimental::Impl::runtime_check_rank_host( - traits::rank_dynamic, Rank, minIndices, label());)) - - KOKKOS_IF_ON_DEVICE((Kokkos::Experimental::Impl::runtime_check_rank_device( - traits::rank_dynamic, Rank, minIndices);)) + : base_t(aview) { + KOKKOS_IF_ON_HOST( + (Kokkos::Experimental::Impl::runtime_check_rank_host( + traits::rank_dynamic, base_t::rank(), minIndices, aview.label());)) + KOKKOS_IF_ON_DEVICE( + (Kokkos::Experimental::Impl::runtime_check_rank_device( + traits::rank_dynamic, base_t::rank(), minIndices);)) for (size_t i = 0; i < minIndices.size(); ++i) { m_begins[i] = minIndices.begin()[i]; } @@ -820,27 +332,13 @@ class OffsetView : public ViewTraits { template KOKKOS_FUNCTION OffsetView(const View& aview, const begins_type& beg) - : m_track(aview.impl_track()), m_map(), m_begins(beg) { - using SrcTraits = typename OffsetView::traits; - using Mapping = Kokkos::Impl::ViewMapping; - static_assert(Mapping::is_assignable, - "Incompatible OffsetView copy construction"); - Mapping::assign(m_map, aview.impl_map(), m_track); - } + : base_t(aview), m_begins(beg) {} // may assign unmanaged from managed. template KOKKOS_FUNCTION OffsetView(const OffsetView& rhs) - : m_track(rhs.m_track, traits::is_managed), - m_map(), - m_begins(rhs.m_begins) { - using SrcTraits = typename OffsetView::traits; - using Mapping = Kokkos::Impl::ViewMapping; - static_assert(Mapping::is_assignable, - "Incompatible OffsetView copy construction"); - Mapping::assign(m_map, rhs.m_map, rhs.m_track); // swb what about assign? - } + : base_t(rhs.view()), m_begins(rhs.m_begins) {} private: enum class subtraction_failure { @@ -879,7 +377,7 @@ class OffsetView : public ViewTraits { static subtraction_failure runtime_check_begins_ends_host(const B& begins, const E& ends) { std::string message; - if (begins.size() != Rank) + if (begins.size() != base_t::rank()) message += "begins.size() " "(" + @@ -887,19 +385,19 @@ class OffsetView : public ViewTraits { ")" " != Rank " "(" + - std::to_string(Rank) + + std::to_string(base_t::rank()) + ")" "\n"; - if (ends.size() != Rank) + if (ends.size() != base_t::rank()) message += "ends.size() " "(" + - std::to_string(begins.size()) + + std::to_string(ends.size()) + ")" " != Rank " "(" + - std::to_string(Rank) + + std::to_string(base_t::rank()) + ")" "\n"; @@ -941,7 +439,7 @@ class OffsetView : public ViewTraits { message = "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView\n" + message; - Kokkos::Impl::throw_runtime_exception(message); + Kokkos::abort(message.c_str()); } return subtraction_failure::none; @@ -951,11 +449,11 @@ class OffsetView : public ViewTraits { template KOKKOS_FUNCTION static subtraction_failure runtime_check_begins_ends_device( const B& begins, const E& ends) { - if (begins.size() != Rank) + if (begins.size() != base_t::rank()) Kokkos::abort( "Kokkos::Experimental::OffsetView ERROR: for unmanaged " "OffsetView: begins has bad Rank"); - if (ends.size() != Rank) + if (ends.size() != base_t::rank()) Kokkos::abort( "Kokkos::Experimental::OffsetView ERROR: for unmanaged " "OffsetView: ends has bad Rank"); @@ -993,20 +491,25 @@ class OffsetView : public ViewTraits { // Precondition: begins.size() == ends.size() == m_begins.size() == Rank template KOKKOS_FUNCTION OffsetView(const pointer_type& p, const B& begins_, - const E& ends_, - subtraction_failure) - : m_track() // no tracking - , - m_map(Kokkos::Impl::ViewCtorProp(p), - typename traits::array_layout( - Rank > 0 ? at(ends_, 0) - at(begins_, 0) : 0, - Rank > 1 ? at(ends_, 1) - at(begins_, 1) : 0, - Rank > 2 ? at(ends_, 2) - at(begins_, 2) : 0, - Rank > 3 ? at(ends_, 3) - at(begins_, 3) : 0, - Rank > 4 ? at(ends_, 4) - at(begins_, 4) : 0, - Rank > 5 ? at(ends_, 5) - at(begins_, 5) : 0, - Rank > 6 ? at(ends_, 6) - at(begins_, 6) : 0, - Rank > 7 ? at(ends_, 7) - at(begins_, 7) : 0)) { + const E& ends_, subtraction_failure) + : base_t(Kokkos::view_wrap(p), + typename traits::array_layout( + base_t::rank() > 0 ? at(ends_, 0) - at(begins_, 0) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + base_t::rank() > 1 ? at(ends_, 1) - at(begins_, 1) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + base_t::rank() > 2 ? at(ends_, 2) - at(begins_, 2) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + base_t::rank() > 3 ? at(ends_, 3) - at(begins_, 3) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + base_t::rank() > 4 ? at(ends_, 4) - at(begins_, 4) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + base_t::rank() > 5 ? at(ends_, 5) - at(begins_, 5) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + base_t::rank() > 6 ? at(ends_, 6) - at(begins_, 6) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + base_t::rank() > 7 ? at(ends_, 7) - at(begins_, 7) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG)) { for (size_t i = 0; i != m_begins.size(); ++i) { m_begins[i] = at(begins_, i); }; @@ -1040,15 +543,6 @@ class OffsetView : public ViewTraits { : OffsetView(p, begins_, ends_, runtime_check_begins_ends(begins_, ends_)) {} - //---------------------------------------- - // Allocation tracking properties - KOKKOS_FUNCTION - int use_count() const { return m_track.use_count(); } - - const std::string label() const { - return m_track.template get_label(); - } - // Choosing std::pair as type for the arguments allows constructing an // OffsetView using list initialization syntax, e.g., // OffsetView dummy("dummy", {-1, 3}, {-2,2}); @@ -1070,18 +564,34 @@ class OffsetView : public ViewTraits { const std::pair range7 = KOKKOS_INVALID_INDEX_RANGE ) - : OffsetView( - Kokkos::Impl::ViewCtorProp(arg_label), - typename traits::array_layout(range0.second - range0.first + 1, - range1.second - range1.first + 1, - range2.second - range2.first + 1, - range3.second - range3.first + 1, - range4.second - range4.first + 1, - range5.second - range5.first + 1, - range6.second - range6.first + 1, - range7.second - range7.first + 1), - {range0.first, range1.first, range2.first, range3.first, - range4.first, range5.first, range6.first, range7.first}) {} + : OffsetView(Kokkos::Impl::ViewCtorProp(arg_label), + typename traits::array_layout( + range0.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG - 1 + : range0.second - range0.first + 1, + range1.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range1.second - range1.first + 1, + range2.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range2.second - range2.first + 1, + range3.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range3.second - range3.first + 1, + range4.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range4.second - range4.first + 1, + range5.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range5.second - range5.first + 1, + range6.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range6.second - range6.first + 1, + range7.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range7.second - range7.first + 1), + {range0.first, range1.first, range2.first, range3.first, + range4.first, range5.first, range6.first, range7.first}) {} template explicit OffsetView( @@ -1094,18 +604,34 @@ class OffsetView : public ViewTraits { const std::pair range5 = KOKKOS_INVALID_INDEX_RANGE, const std::pair range6 = KOKKOS_INVALID_INDEX_RANGE, const std::pair range7 = KOKKOS_INVALID_INDEX_RANGE) - : OffsetView( - arg_prop, - typename traits::array_layout(range0.second - range0.first + 1, - range1.second - range1.first + 1, - range2.second - range2.first + 1, - range3.second - range3.first + 1, - range4.second - range4.first + 1, - range5.second - range5.first + 1, - range6.second - range6.first + 1, - range7.second - range7.first + 1), - {range0.first, range1.first, range2.first, range3.first, - range4.first, range5.first, range6.first, range7.first}) {} + : OffsetView(arg_prop, + typename traits::array_layout( + range0.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range0.second - range0.first + 1, + range1.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range1.second - range1.first + 1, + range2.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range2.second - range2.first + 1, + range3.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range3.second - range3.first + 1, + range4.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range4.second - range4.first + 1, + range5.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range5.second - range5.first + 1, + range6.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range6.second - range6.first + 1, + range7.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range7.second - range7.first + 1), + {range0.first, range1.first, range2.first, range3.first, + range4.first, range5.first, range6.first, range7.first}) {} template explicit KOKKOS_FUNCTION OffsetView( @@ -1113,9 +639,14 @@ class OffsetView : public ViewTraits { std::enable_if_t::has_pointer, typename traits::array_layout> const& arg_layout, const index_list_type minIndices) - : m_track() // No memory tracking - , - m_map(arg_prop, arg_layout) { + : base_t(arg_prop, arg_layout) { + KOKKOS_IF_ON_HOST((Kokkos::Experimental::Impl::runtime_check_rank_host( + traits::rank_dynamic, base_t::rank(), minIndices, + base_t::label());)) + + KOKKOS_IF_ON_DEVICE( + (Kokkos::Experimental::Impl::runtime_check_rank_device( + traits::rank_dynamic, base_t::rank(), minIndices);)) for (size_t i = 0; i < minIndices.size(); ++i) { m_begins[i] = minIndices.begin()[i]; } @@ -1132,42 +663,9 @@ class OffsetView : public ViewTraits { std::enable_if_t::has_pointer, typename traits::array_layout> const& arg_layout, const index_list_type minIndices) - : m_track(), - m_map() - - { - for (size_t i = 0; i < Rank; ++i) m_begins[i] = minIndices.begin()[i]; - - // Copy the input allocation properties with possibly defaulted properties - auto prop_copy = Kokkos::Impl::with_properties_if_unset( - arg_prop, std::string{}, typename traits::device_type::memory_space{}, - typename traits::device_type::execution_space{}); - using alloc_prop = decltype(prop_copy); - - static_assert(traits::is_managed, - "OffsetView allocation constructor requires managed memory"); - - if (alloc_prop::initialize && - !alloc_prop::execution_space::impl_is_initialized()) { - // If initializing view data then - // the execution space must be initialized. - Kokkos::Impl::throw_runtime_exception( - "Constructing OffsetView and initializing data with uninitialized " - "execution space"); - } - - Kokkos::Impl::SharedAllocationRecord<>* record = m_map.allocate_shared( - prop_copy, arg_layout, - Kokkos::Impl::ViewCtorProp::has_execution_space); - - // Setup and initialization complete, start tracking - m_track.assign_allocated_record_to_uninitialized(record); - - KOKKOS_IF_ON_HOST((Kokkos::Experimental::Impl::runtime_check_rank_host( - traits::rank_dynamic, Rank, minIndices, label());)) - - KOKKOS_IF_ON_DEVICE((Kokkos::Experimental::Impl::runtime_check_rank_device( - traits::rank_dynamic, Rank, minIndices);)) + : base_t(arg_prop, arg_layout) { + for (size_t i = 0; i < base_t::rank(); ++i) + m_begins[i] = minIndices.begin()[i]; } }; @@ -1177,7 +675,7 @@ class OffsetView : public ViewTraits { */ template KOKKOS_INLINE_FUNCTION constexpr unsigned rank(const OffsetView& V) { - return V.Rank; + return V.rank(); } // Temporary until added to view //---------------------------------------------------------------------------- @@ -1185,8 +683,8 @@ KOKKOS_INLINE_FUNCTION constexpr unsigned rank(const OffsetView& V) { namespace Impl { template -KOKKOS_INLINE_FUNCTION std::enable_if_t::value, T> -shift_input(const T arg, const int64_t offset) { +KOKKOS_INLINE_FUNCTION std::enable_if_t, T> shift_input( + const T arg, const int64_t offset) { return arg - offset; } @@ -1197,13 +695,13 @@ Kokkos::ALL_t shift_input(const Kokkos::ALL_t arg, const int64_t /*offset*/) { template KOKKOS_INLINE_FUNCTION - std::enable_if_t::value, Kokkos::pair> + std::enable_if_t, Kokkos::pair> shift_input(const Kokkos::pair arg, const int64_t offset) { return Kokkos::make_pair(arg.first - offset, arg.second - offset); } template -inline std::enable_if_t::value, std::pair> -shift_input(const std::pair arg, const int64_t offset) { +inline std::enable_if_t, std::pair> shift_input( + const std::pair arg, const int64_t offset) { return std::make_pair(arg.first - offset, arg.second - offset); } @@ -1212,7 +710,7 @@ KOKKOS_INLINE_FUNCTION void map_arg_to_new_begin( const size_t i, Kokkos::Array& subviewBegins, std::enable_if_t shiftedArg, const Arg arg, const A viewBegins, size_t& counter) { - if (!std::is_integral::value) { + if (!std::is_integral_v) { subviewBegins[counter] = shiftedArg == arg ? viewBegins[i] : 0; counter++; } @@ -1621,7 +1119,7 @@ KOKKOS_INLINE_FUNCTION ViewTraits, Args...>::type>::type subview(const OffsetView& src, Args... args) { static_assert( - OffsetView::Rank == sizeof...(Args), + OffsetView::rank() == sizeof...(Args), "subview requires one argument for each source OffsetView rank"); return Kokkos::Experimental::Impl::subview_offset(src, args...); @@ -1641,12 +1139,12 @@ KOKKOS_INLINE_FUNCTION bool operator==(const OffsetView& lhs, using lhs_traits = ViewTraits; using rhs_traits = ViewTraits; - return std::is_same::value && - std::is_same::value && - std::is_same::value && + return std::is_same_v && + std::is_same_v && + std::is_same_v && unsigned(lhs_traits::rank) == unsigned(rhs_traits::rank) && lhs.data() == rhs.data() && lhs.span() == rhs.span() && lhs.extent(0) == rhs.extent(0) && lhs.extent(1) == rhs.extent(1) && @@ -1672,12 +1170,12 @@ KOKKOS_INLINE_FUNCTION bool operator==(const View& lhs, using lhs_traits = ViewTraits; using rhs_traits = ViewTraits; - return std::is_same::value && - std::is_same::value && - std::is_same::value && + return std::is_same_v && + std::is_same_v && + std::is_same_v && unsigned(lhs_traits::rank) == unsigned(rhs_traits::rank) && lhs.data() == rhs.data() && lhs.span() == rhs.span() && lhs.extent(0) == rhs.extent(0) && lhs.extent(1) == rhs.extent(1) && @@ -1704,11 +1202,11 @@ template inline void deep_copy( const Experimental::OffsetView& dst, typename ViewTraits::const_value_type& value, - std::enable_if_t::specialize, - void>::value>* = nullptr) { + std::enable_if_t::specialize, + void>>* = nullptr) { static_assert( - std::is_same::non_const_value_type, - typename ViewTraits::value_type>::value, + std::is_same_v::non_const_value_type, + typename ViewTraits::value_type>, "deep_copy requires non-const type"); auto dstView = dst.view(); @@ -1719,11 +1217,11 @@ template inline void deep_copy( const Experimental::OffsetView& dst, const Experimental::OffsetView& value, - std::enable_if_t::specialize, - void>::value>* = nullptr) { + std::enable_if_t::specialize, + void>>* = nullptr) { static_assert( - std::is_same::value_type, - typename ViewTraits::non_const_value_type>::value, + std::is_same_v::value_type, + typename ViewTraits::non_const_value_type>, "deep_copy requires matching non-const destination type"); auto dstView = dst.view(); @@ -1733,11 +1231,11 @@ template inline void deep_copy( const Experimental::OffsetView& dst, const View& value, - std::enable_if_t::specialize, - void>::value>* = nullptr) { + std::enable_if_t::specialize, + void>>* = nullptr) { static_assert( - std::is_same::value_type, - typename ViewTraits::non_const_value_type>::value, + std::is_same_v::value_type, + typename ViewTraits::non_const_value_type>, "deep_copy requires matching non-const destination type"); auto dstView = dst.view(); @@ -1748,11 +1246,11 @@ template inline void deep_copy( const View& dst, const Experimental::OffsetView& value, - std::enable_if_t::specialize, - void>::value>* = nullptr) { + std::enable_if_t::specialize, + void>>* = nullptr) { static_assert( - std::is_same::value_type, - typename ViewTraits::non_const_value_type>::value, + std::is_same_v::value_type, + typename ViewTraits::non_const_value_type>, "deep_copy requires matching non-const destination type"); Kokkos::deep_copy(dst, value.view()); @@ -1770,7 +1268,7 @@ struct MirrorOffsetViewType { // Check whether it is the same memory space enum { is_same_memspace = - std::is_same::value + std::is_same_v }; // The array_layout using array_layout = typename src_view_type::array_layout; @@ -1786,27 +1284,6 @@ struct MirrorOffsetViewType { std::conditional_t; }; -template -struct MirrorOffsetType { - // The incoming view_type - using src_view_type = typename Kokkos::Experimental::OffsetView; - // The memory space for the mirror view - using memory_space = typename Space::memory_space; - // Check whether it is the same memory space - enum { - is_same_memspace = - std::is_same::value - }; - // The array_layout - using array_layout = typename src_view_type::array_layout; - // The data type (we probably want it non-const since otherwise we can't even - // deep_copy to it.) - using data_type = typename src_view_type::non_const_data_type; - // The destination view type if it is not the same memory space - using view_type = - Kokkos::Experimental::OffsetView; -}; - } // namespace Impl namespace Impl { @@ -1825,10 +1302,12 @@ inline auto create_mirror(const Kokkos::Experimental::OffsetView& src, auto prop_copy = Impl::with_properties_if_unset( arg_prop, std::string(src.label()).append("_mirror")); - return typename Kokkos::Impl::MirrorOffsetType::view_type( - prop_copy, src.layout(), - {src.begin(0), src.begin(1), src.begin(2), src.begin(3), src.begin(4), - src.begin(5), src.begin(6), src.begin(7)}); + return typename Kokkos::Impl::MirrorOffsetViewType< + Space, T, P...>::dest_view_type(prop_copy, src.layout(), + {src.begin(0), src.begin(1), + src.begin(2), src.begin(3), + src.begin(4), src.begin(5), + src.begin(6), src.begin(7)}); } else { return typename Kokkos::Experimental::OffsetView::HostMirror( Kokkos::create_mirror(arg_prop, src.view()), src.begins()); @@ -1877,9 +1356,9 @@ template ::value && std::is_void_v::specialize>>> -typename Kokkos::Impl::MirrorOffsetType::view_type -create_mirror(Kokkos::Impl::WithoutInitializing_t wi, const Space&, - const Kokkos::Experimental::OffsetView& src) { +inline auto create_mirror( + Kokkos::Impl::WithoutInitializing_t wi, const Space&, + const Kokkos::Experimental::OffsetView& src) { return Impl::create_mirror( src, Kokkos::view_alloc(typename Space::memory_space{}, wi)); } @@ -1905,14 +1384,14 @@ inline auto create_mirror_view( const Kokkos::Experimental::OffsetView& src, [[maybe_unused]] const Impl::ViewCtorProp& arg_prop) { if constexpr (!Impl::ViewCtorProp::has_memory_space) { - if constexpr (std::is_same::memory_space, - typename Kokkos::Experimental::OffsetView< - T, P...>::HostMirror::memory_space>::value && - std::is_same::data_type, - typename Kokkos::Experimental::OffsetView< - T, P...>::HostMirror::data_type>::value) { + if constexpr (std::is_same_v::memory_space, + typename Kokkos::Experimental::OffsetView< + T, P...>::HostMirror::memory_space> && + std::is_same_v::data_type, + typename Kokkos::Experimental::OffsetView< + T, P...>::HostMirror::data_type>) { return typename Kokkos::Experimental::OffsetView::HostMirror(src); } else { diff --git a/packages/kokkos/containers/src/Kokkos_ScatterView.hpp b/packages/kokkos/containers/src/Kokkos_ScatterView.hpp index 9d04cf6acd0e..52af567c61d2 100644 --- a/packages/kokkos/containers/src/Kokkos_ScatterView.hpp +++ b/packages/kokkos/containers/src/Kokkos_ScatterView.hpp @@ -184,16 +184,16 @@ struct DefaultContribution -struct DefaultDuplication { +struct DefaultDuplication { using type = Kokkos::Experimental::ScatterNonDuplicated; }; template <> -struct DefaultContribution { using type = Kokkos::Experimental::ScatterAtomic; }; template <> -struct DefaultContribution { using type = Kokkos::Experimental::ScatterAtomic; }; @@ -532,32 +532,56 @@ void args_to_array(size_t* array, int pos, T dim0, Dims... dims) { subview where the index specified is the largest-stride one. */ template struct Slice { - using next = Slice; - using value_type = typename next::value_type; - - static value_type get(V const& src, const size_t i, Args... args) { + using next = Slice; + static auto get(V const& src, const size_t i, Args... args) { return next::get(src, i, Kokkos::ALL, args...); } }; template struct Slice { - using value_type = - typename Kokkos::Impl::ViewMapping::type; - static value_type get(V const& src, const size_t i, Args... args) { + static auto get(V const& src, const size_t i, Args... args) { return Kokkos::subview(src, i, args...); } }; template struct Slice { - using value_type = - typename Kokkos::Impl::ViewMapping::type; - static value_type get(V const& src, const size_t i, Args... args) { + static auto get(V const& src, const size_t i, Args... args) { + return Kokkos::subview(src, args..., i); + } +}; + +#ifdef KOKKOS_ENABLE_IMPL_MDSPAN +template +struct Slice { + static auto get(V const& src, const size_t i, Args... args) { + return Kokkos::subview(src, i, args...); + } +}; + +template +struct Slice { + static auto get(V const& src, const size_t i, Args... args) { return Kokkos::subview(src, args..., i); } }; +template +struct Slice, 1, V, Args...> { + static auto get(V const& src, const size_t i, Args... args) { + return Kokkos::subview(src, i, args...); + } +}; + +template +struct Slice, 1, V, Args...> { + static auto get(V const& src, const size_t i, Args... args) { + return Kokkos::subview(src, args..., i); + } +}; +#endif + template struct ReduceDuplicates; @@ -905,7 +929,7 @@ class ScatterAccess KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - view_type::original_view_type::rank == 1 && std::is_integral::value, + std::is_integral_v && view_type::original_view_type::rank == 1, value_type> operator[](Arg arg) const { return view.at(arg); @@ -1028,10 +1052,7 @@ class ScatterView::value_type - subview() const { + auto subview() const { return Kokkos::Impl::Experimental::Slice< Kokkos::LayoutRight, internal_view_type::rank, internal_view_type>::get(internal_view, 0); @@ -1233,8 +1254,8 @@ class ScatterView::value_type - subview() const { + auto subview() const { return Kokkos::Impl::Experimental::Slice< Kokkos::LayoutLeft, internal_view_type::rank, internal_view_type>::get(internal_view, 0); @@ -1460,7 +1478,7 @@ class ScatterAccess KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - view_type::original_view_type::rank == 1 && std::is_integral::value, + std::is_integral_v && view_type::original_view_type::rank == 1, value_type> operator[](Arg arg) const { return view.at(thread_id, arg); @@ -1470,9 +1488,9 @@ class ScatterAccess::array_layout, typename ViewTraits::device_type, Op, std::conditional_t< - std::is_void::value, + std::is_void_v, typename Kokkos::Impl::Experimental::DefaultDuplication< typename ViewTraits::execution_space>::type, Duplication>, std::conditional_t< - std::is_void::value, + std::is_void_v, typename Kokkos::Impl::Experimental::DefaultContribution< typename ViewTraits::execution_space, typename std::conditional_t< - std::is_void::value, + std::is_void_v, typename Kokkos::Impl::Experimental::DefaultDuplication< typename ViewTraits::execution_space>::type, Duplication>>::type, diff --git a/packages/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp b/packages/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp index 8ce868cac217..ec1b8905c766 100644 --- a/packages/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp +++ b/packages/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp @@ -190,7 +190,7 @@ struct GraphRowViewConst { const typename GraphType::entries_type& colidx_in, const ordinal_type& stride, const ordinal_type& count, const OffsetType& idx, - const std::enable_if_t::value, int>& = 0) + const std::enable_if_t, int>& = 0) : colidx_(&colidx_in(idx)), stride_(stride), length(count) {} /// \brief Number of entries in the row. diff --git a/packages/kokkos/containers/src/Kokkos_UnorderedMap.hpp b/packages/kokkos/containers/src/Kokkos_UnorderedMap.hpp index c3a8b67df8df..4f47051a5c1c 100644 --- a/packages/kokkos/containers/src/Kokkos_UnorderedMap.hpp +++ b/packages/kokkos/containers/src/Kokkos_UnorderedMap.hpp @@ -34,7 +34,7 @@ #include #include -#include +#include #include @@ -746,7 +746,7 @@ class UnorderedMap { /// 'const value_type' via Cuda texture fetch must return by value. template KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - !std::is_void::value, // !is_set + !std::is_void_v, // !is_set std::conditional_t> value_at(size_type i) const { KOKKOS_EXPECTS(i < capacity()); @@ -808,8 +808,8 @@ class UnorderedMap { // Re-allocate the views of the calling UnorderedMap according to src // capacity, and deep copy the src data. template - std::enable_if_t, key_type>::value && - std::is_same, value_type>::value> + std::enable_if_t, key_type> && + std::is_same_v, value_type>> create_copy_view( UnorderedMap const &src) { if (m_hash_lists.data() != src.m_hash_lists.data()) { @@ -821,8 +821,8 @@ class UnorderedMap { // Allocate views of the calling UnorderedMap with the same capacity as the // src. template - std::enable_if_t, key_type>::value && - std::is_same, value_type>::value> + std::enable_if_t, key_type> && + std::is_same_v, value_type>> allocate_view( UnorderedMap const &src) { insertable_map_type tmp; @@ -852,8 +852,8 @@ class UnorderedMap { // Deep copy view data from src. This requires that the src capacity is // identical to the capacity of the calling UnorderedMap. template - std::enable_if_t, key_type>::value && - std::is_same, value_type>::value> + std::enable_if_t, key_type> && + std::is_same_v, value_type>> deep_copy_view( UnorderedMap const &src) { #ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4 diff --git a/packages/kokkos/containers/src/Kokkos_Vector.hpp b/packages/kokkos/containers/src/Kokkos_Vector.hpp index 88109fb0ba56..83ccfbf6305f 100644 --- a/packages/kokkos/containers/src/Kokkos_Vector.hpp +++ b/packages/kokkos/containers/src/Kokkos_Vector.hpp @@ -172,9 +172,8 @@ class KOKKOS_DEPRECATED vector private: template - struct impl_is_input_iterator - : /* TODO replace this */ std::bool_constant< - !std::is_convertible::value> {}; + struct impl_is_input_iterator : /* TODO replace this */ std::bool_constant< + !std::is_convertible_v> {}; public: // TODO: can use detection idiom to generate better error message here later diff --git a/packages/kokkos/containers/unit_tests/CMakeLists.txt b/packages/kokkos/containers/unit_tests/CMakeLists.txt index e69e46bb6a85..6255a86c4614 100644 --- a/packages/kokkos/containers/unit_tests/CMakeLists.txt +++ b/packages/kokkos/containers/unit_tests/CMakeLists.txt @@ -1,8 +1,7 @@ - -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) -KOKKOS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../src ) -KOKKOS_INCLUDE_DIRECTORIES(${KOKKOS_SOURCE_DIR}/core/unit_test/category_files) +kokkos_include_directories(${CMAKE_CURRENT_BINARY_DIR}) +kokkos_include_directories(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) +kokkos_include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../src) +kokkos_include_directories(${KOKKOS_SOURCE_DIR}/core/unit_test/category_files) foreach(Tag Threads;Serial;OpenMP;HPX;Cuda;HIP;SYCL) string(TOUPPER ${Tag} DEVICE) @@ -12,57 +11,49 @@ foreach(Tag Threads;Serial;OpenMP;HPX;Cuda;HIP;SYCL) set(UnitTestSources UnitTestMain.cpp) set(dir ${CMAKE_CURRENT_BINARY_DIR}/${dir}) file(MAKE_DIRECTORY ${dir}) - foreach(Name - Bitset - DualView - DynamicView - DynViewAPI_generic - DynViewAPI_rank12345 - DynViewAPI_rank67 - ErrorReporter - OffsetView - ScatterView - StaticCrsGraph - WithoutInitializing - UnorderedMap - Vector - ViewCtorPropEmbeddedDim - ) + foreach( + Name + Bitset + DualView + DynamicView + DynViewAPI_generic + DynViewAPI_rank12345 + DynViewAPI_rank67 + DynRankView_TeamScratch + ErrorReporter + OffsetView + ScatterView + StaticCrsGraph + WithoutInitializing + UnorderedMap + Vector + ViewCtorPropEmbeddedDim + ) if(NOT Kokkos_ENABLE_DEPRECATED_CODE_4 AND Name STREQUAL "Vector") continue() # skip Kokkos::vector test if deprecated code 4 is not enabled endif() # Write to a temporary intermediate file and call configure_file to avoid # updating timestamps triggering unnecessary rebuilds on subsequent cmake runs. set(file ${dir}/Test${Tag}_${Name}.cpp) - file(WRITE ${dir}/dummy.cpp - "#include \n" - "#include \n" - ) + file(WRITE ${dir}/dummy.cpp "#include \n" "#include \n") configure_file(${dir}/dummy.cpp ${file}) list(APPEND UnitTestSources ${file}) endforeach() #fatal error C1128: number of sections exceeded object file format limit: compile with /bigobj if(KOKKOS_ENABLE_CUDA AND WIN32) - LIST(REMOVE_ITEM UnitTestSources ${dir}/TestCuda_DynViewAPI_generic.cpp) + list(REMOVE_ITEM UnitTestSources ${dir}/TestCuda_DynViewAPI_generic.cpp) endif() # FIXME_NVHPC: NVC++-S-0000-Internal compiler error. extractor: bad opc 0 if(KOKKOS_ENABLE_CUDA AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) - LIST(REMOVE_ITEM UnitTestSources ${dir}/TestCuda_WithoutInitializing.cpp) + list(REMOVE_ITEM UnitTestSources ${dir}/TestCuda_WithoutInitializing.cpp) endif() - KOKKOS_ADD_EXECUTABLE_AND_TEST(ContainersUnitTest_${Tag} SOURCES ${UnitTestSources}) + kokkos_add_executable_and_test(ContainersUnitTest_${Tag} SOURCES ${UnitTestSources}) endif() endforeach() -SET(COMPILE_ONLY_SOURCES - TestCreateMirror.cpp - TestDualViewParameterPack.cpp - TestIsViewTrait.cpp -) -KOKKOS_ADD_EXECUTABLE( - ContainersTestCompileOnly - SOURCES - TestCompileMain.cpp - ${COMPILE_ONLY_SOURCES} +set(COMPILE_ONLY_SOURCES TestCreateMirror.cpp TestDualViewParameterPack.cpp TestIsViewTrait.cpp + TestDynRankViewTypedefs.cpp ) +kokkos_add_executable(ContainersTestCompileOnly SOURCES TestCompileMain.cpp ${COMPILE_ONLY_SOURCES}) diff --git a/packages/kokkos/containers/unit_tests/TestBitset.hpp b/packages/kokkos/containers/unit_tests/TestBitset.hpp index 9923453f72ce..91dc1710e5f8 100644 --- a/packages/kokkos/containers/unit_tests/TestBitset.hpp +++ b/packages/kokkos/containers/unit_tests/TestBitset.hpp @@ -39,7 +39,7 @@ struct TestBitset { TestBitset(bitset_type const& bitset) : m_bitset(bitset) {} - unsigned testit(unsigned collisions) { + unsigned testit(unsigned long long collisions) { execution_space().fence(); unsigned count = 0; diff --git a/packages/kokkos/containers/unit_tests/TestDualView.hpp b/packages/kokkos/containers/unit_tests/TestDualView.hpp index 2512cb5c4915..5d03e6202a89 100644 --- a/packages/kokkos/containers/unit_tests/TestDualView.hpp +++ b/packages/kokkos/containers/unit_tests/TestDualView.hpp @@ -71,7 +71,7 @@ struct test_dualview_copy_construction_and_assignment { using SrcViewType = Kokkos::DualView; using DstViewType = - Kokkos::DualView; + Kokkos::DualView; SrcViewType a("A", n, m); @@ -520,58 +520,26 @@ namespace { * that we keep the semantics of UVM DualViews intact. */ // modify if we have other UVM enabled backends -#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_SYCL) || \ - defined(KOKKOS_ENABLE_HIP) // OR other UVM builds -#define UVM_ENABLED_BUILD -#endif - -#ifdef UVM_ENABLED_BUILD -template -struct UVMSpaceFor; -#endif - -#ifdef KOKKOS_ENABLE_CUDA // specific to CUDA -template <> -struct UVMSpaceFor { - using type = Kokkos::CudaUVMSpace; -}; -#endif - -#ifdef KOKKOS_ENABLE_SYCL // specific to SYCL -template <> -struct UVMSpaceFor { - using type = Kokkos::Experimental::SYCLSharedUSMSpace; -}; -#endif -#ifdef KOKKOS_ENABLE_HIP // specific to HIP -template <> -struct UVMSpaceFor { - using type = Kokkos::HIPManagedSpace; -}; -#endif - -#ifdef UVM_ENABLED_BUILD -template <> -struct UVMSpaceFor { - using type = typename UVMSpaceFor::type; -}; +#ifdef KOKKOS_HAS_SHARED_SPACE +template +using TestSharedSpace = Kokkos::SharedSpace; #else -template -struct UVMSpaceFor { - using type = typename ExecSpace::memory_space; -}; +template +using TestSharedSpace = typename ExecutionSpace::memory_space; #endif using ExecSpace = Kokkos::DefaultExecutionSpace; -using MemSpace = typename UVMSpaceFor::type; +using MemSpace = TestSharedSpace; using DeviceType = Kokkos::Device; using DualViewType = Kokkos::DualView; -using d_device = DeviceType; -using h_device = Kokkos::Device< - Kokkos::DefaultHostExecutionSpace, - typename UVMSpaceFor::type>; +using ConstDualViewType = + Kokkos::DualView; +using d_device = DeviceType; +using h_device = + Kokkos::Device>; TEST(TEST_CATEGORY, dualview_device_correct_kokkos_device) { DualViewType dv("myView", 100); @@ -635,14 +603,69 @@ TEST(TEST_CATEGORY, dualview_template_views_return_correct_executionspace_views) { DualViewType dv("myView", 100); dv.clear_sync_state(); - using hvt = decltype(dv.view()); - using dvt = decltype(dv.view()); + using hvt = decltype(dv.view()); + using dvt = decltype(dv.view()); ASSERT_STREQ(Kokkos::DefaultExecutionSpace::name(), dvt::device_type::execution_space::name()); ASSERT_STREQ(Kokkos::DefaultHostExecutionSpace::name(), hvt::device_type::execution_space::name()); } +TEST(TEST_CATEGORY, + dualview_template_views_return_correct_views_from_const_dual_view) { + DualViewType dv("myView", 100); + ConstDualViewType const_dv = dv; + dv.clear_sync_state(); + ASSERT_EQ(dv.view(), + const_dv.view()); + ASSERT_EQ(dv.view(), + const_dv.view()); +} + +// User-defined types with a View data member, only host-constructible +template +class S { + V v_; + + public: + template + S(std::string label, Extents... extents) : v_(std::move(label), extents...) {} + S() : v_("v", 10) {} +}; + +template +auto initialize_view_of_views() { + Kokkos::DualView dv_v( + Kokkos::view_alloc("myView", Kokkos::SequentialHostInit), 3u); + + V v("v", 2); + V w("w", 2); + dv_v.h_view(0) = v; + dv_v.h_view(1) = w; + + dv_v.modify_host(); + dv_v.sync_device(); + + return dv_v; +} + +TEST(TEST_CATEGORY, dualview_sequential_host_init) { + auto dv_v = initialize_view_of_views>(); + dv_v.resize(Kokkos::view_alloc(Kokkos::SequentialHostInit), 2u); + ASSERT_EQ(dv_v.d_view.size(), 2u); + ASSERT_EQ(dv_v.h_view.size(), 2u); + + initialize_view_of_views>>(); + + Kokkos::DualView dv( + Kokkos::view_alloc("myView", Kokkos::SequentialHostInit), 1u); + dv.resize(Kokkos::view_alloc(Kokkos::SequentialHostInit), 2u); + ASSERT_EQ(dv.d_view.size(), 2u); + ASSERT_EQ(dv.h_view.size(), 2u); + dv.realloc(Kokkos::view_alloc(Kokkos::SequentialHostInit), 3u); + ASSERT_EQ(dv.d_view.size(), 3u); + ASSERT_EQ(dv.h_view.size(), 3u); +} } // anonymous namespace } // namespace Test diff --git a/packages/kokkos/containers/unit_tests/TestDynRankViewTypedefs.cpp b/packages/kokkos/containers/unit_tests/TestDynRankViewTypedefs.cpp new file mode 100644 index 000000000000..95117a22e6e8 --- /dev/null +++ b/packages/kokkos/containers/unit_tests/TestDynRankViewTypedefs.cpp @@ -0,0 +1,260 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include +#include + +namespace { + +// clang-format off +template +struct data_analysis { + using data_type = DataType; + using const_data_type = const DataType; + using runtime_data_type = DataType; + using runtime_const_data_type = const DataType; + using non_const_data_type = std::remove_const_t; +}; + +template +struct data_analysis { + using data_type = typename data_analysis::data_type*; + using const_data_type = typename data_analysis::const_data_type*; + using runtime_data_type = typename data_analysis::runtime_data_type*; + using runtime_const_data_type = typename data_analysis::runtime_const_data_type*; + using non_const_data_type = typename data_analysis::non_const_data_type*; +}; + +template +struct data_analysis { + using data_type = typename data_analysis::data_type[N]; + using const_data_type = typename data_analysis::const_data_type[N]; + using runtime_data_type = typename data_analysis::runtime_data_type*; + using runtime_const_data_type = typename data_analysis::runtime_const_data_type*; + using non_const_data_type = typename data_analysis::non_const_data_type[N]; +}; + +template +constexpr bool test_view_typedefs_impl() { + // ======================== + // inherited from ViewTraits + // ======================== + static_assert(std::is_same_v); + static_assert(std::is_same_v::const_data_type>); + static_assert(std::is_same_v::non_const_data_type>); + + // FIXME: these should be deprecated and for proper testing (I.e. where this is different from data_type) + // we would need ensemble types which use the hidden View dimension facility of View (i.e. which make + // "specialize" not void) + static_assert(std::is_same_v); + static_assert(std::is_same_v::const_data_type>); + static_assert(std::is_same_v::non_const_data_type>); + static_assert(std::is_same_v); + + // FIXME: value_type definition conflicts with mdspan value_type + static_assert(std::is_same_v); + static_assert(std::is_same_v); + static_assert(std::is_same_v>); + + // FIXME: should maybe be deprecated + static_assert(std::is_same_v); + + // FIXME: should be deprecated and is some complicated impl type + // static_assert(!std::is_void_v); + + static_assert(std::is_same_v); + static_assert(std::is_same_v); + static_assert(std::is_same_v>); + static_assert(std::is_same_v); + static_assert(std::is_same_v); + static_assert(std::is_same_v); + + // FIXME: should be deprecated in favor of reference + static_assert(std::is_same_v); + // FIXME: should be deprecated in favor of data_handle_type + static_assert(std::is_same_v); + + // ========================================= + // in Legacy View: some helper View variants + // ========================================= + + // FIXME: in contrast to View, hooks_policy is not propagated + static_assert(std::is_same_v); + static_assert(std::is_same_v>); + static_assert(std::is_same_v>); + static_assert(std::is_same_v>); + static_assert(std::is_same_v>); + +/* FIXME: these don't exist in DynRankView, should they? + using uniform_layout_type = std::conditional_t), + Kokkos::LayoutLeft, Layout>; + + // Uhm uniformtype removes all memorytraits? + static_assert(std::is_same_v>>); + static_assert(std::is_same_v>>); + static_assert(std::is_same_v::runtime_data_type, uniform_layout_type, + typename ViewType::device_type, Kokkos::MemoryTraits<0>>>); + static_assert(std::is_same_v::runtime_const_data_type, uniform_layout_type, + typename ViewType::device_type, Kokkos::MemoryTraits<0>>>); + + using anonymous_device_type = Kokkos::Device; + static_assert(std::is_same_v>>); + static_assert(std::is_same_v>>); + static_assert(std::is_same_v::runtime_data_type, uniform_layout_type, + anonymous_device_type, Kokkos::MemoryTraits<0>>>); + static_assert(std::is_same_v::runtime_const_data_type, uniform_layout_type, + anonymous_device_type, Kokkos::MemoryTraits<0>>>); +*/ + + // ================================== + // mdspan compatibility + // ================================== + + // FIXME: This typedef caused some weird issue with MSVC+NVCC + // static_assert(std::is_same_v); + // FIXME: Not supported yet + // static_assert(std::is_same_v); + // static_assert(std::is_same_v); + // static_assert(std::is_same_v); + + static_assert(std::is_same_v); + // FIXME: should be remove_const_t + static_assert(std::is_same_v); + // FIXME: should be extents_type::index_type + static_assert(std::is_same_v); + static_assert(std::is_same_v); + + // FIXME: should come from accessor_type + static_assert(std::is_same_v); + static_assert(std::is_same_v); + return true; +} + +// Helper function to unpack data type and other args from the View, and pass them on +template +struct ViewParams {}; + +template +constexpr bool test_view_typedefs(ViewParams) { + return test_view_typedefs_impl, Kokkos::ViewTraits, + T, L, S, M, HostMirrorSpace, ValueType, ReferenceType>(); +} + + +constexpr bool is_host_exec = std::is_same_v; + +#if defined(KOKKOS_ENABLE_CUDA_UVM) || defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY) || defined(KOKKOS_IMPL_HIP_UNIFIED_MEMORY) +constexpr bool has_unified_mem_space = true; +#else +constexpr bool has_unified_mem_space = false; +#endif + +// The test take explicit template arguments for: LayoutType, Space, MemoryTraits, HostMirrorSpace, ValueType, ReferenceType +// The ViewParams is just a type pack for the View template arguments + +// Kokkos::View +namespace TestInt { + using layout_type = Kokkos::DefaultExecutionSpace::array_layout; + using space = Kokkos::DefaultExecutionSpace; + using memory_traits = Kokkos::MemoryTraits<0>; + // HostMirrorSpace is a mess so: if the default exec is a host exec, that is it + using host_mirror_space = std::conditional_t>>; + static_assert(test_view_typedefs( + ViewParams{})); +} + +// Kokkos::View +namespace TestIntDefaultExecutionSpace { + using layout_type = Kokkos::DefaultExecutionSpace::array_layout; + using space = Kokkos::DefaultExecutionSpace; + using memory_traits = Kokkos::MemoryTraits<0>; + // HostMirrorSpace is a mess so: if the default exec is a host exec, it is HostSpace (note difference from View ...) + using host_mirror_space = std::conditional_t>; + static_assert(test_view_typedefs( + ViewParams{})); +} + +// Kokkos::View +namespace TestFloatPPHostSpace { + using layout_type = Kokkos::LayoutRight; + using space = Kokkos::HostSpace; + using memory_traits = Kokkos::MemoryTraits<0>; + using host_mirror_space = Kokkos::HostSpace; + static_assert(test_view_typedefs( + ViewParams{})); +} + +// Kokkos::View> +namespace TestFloatPPDeviceDefaultHostExecHostSpace { + using layout_type = Kokkos::LayoutRight; + using space = Kokkos::Device; + using memory_traits = Kokkos::MemoryTraits<0>; + using host_mirror_space = Kokkos::HostSpace; + static_assert(test_view_typedefs( + ViewParams>{})); +} + +// Kokkos::View> +namespace TestIntAtomic { + using layout_type = Kokkos::DefaultExecutionSpace::array_layout; + using space = Kokkos::DefaultExecutionSpace; + using memory_traits = Kokkos::MemoryTraits; + // HostMirrorSpace is a mess so: if the default exec is a host exec, that is it + using host_mirror_space = std::conditional_t>>; + static_assert(test_view_typedefs>>>( + ViewParams>{})); +} +// clang-format on +} // namespace diff --git a/packages/kokkos/containers/unit_tests/TestDynRankView_TeamScratch.hpp b/packages/kokkos/containers/unit_tests/TestDynRankView_TeamScratch.hpp new file mode 100644 index 000000000000..e5f8860de76c --- /dev/null +++ b/packages/kokkos/containers/unit_tests/TestDynRankView_TeamScratch.hpp @@ -0,0 +1,72 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include + +#include + +namespace { + +void test_dyn_rank_view_team_scratch() { + using execution_space = TEST_EXECSPACE; + using memory_space = execution_space::scratch_memory_space; + using drv_type = Kokkos::DynRankView; + using policy_type = Kokkos::TeamPolicy; + using team_type = policy_type::member_type; + + int N0 = 10, N1 = 4, N2 = 3; + size_t shmem_size = drv_type::shmem_size(N0, N1, N2); + ASSERT_GE(shmem_size, N0 * N1 * N2 * sizeof(int)); + + Kokkos::View> + errors("errors"); + auto policy = policy_type(1, Kokkos::AUTO) + .set_scratch_size(0, Kokkos::PerTeam(shmem_size)); + Kokkos::parallel_for( + policy, KOKKOS_LAMBDA(const team_type& team) { + drv_type scr(team.team_scratch(0), N0, N1, N2); + // Control that the code ran at all + if (scr.rank() != 3) errors() |= 1u; + if (scr.extent_int(0) != N0) errors() |= 2u; + if (scr.extent_int(1) != N1) errors() |= 4u; + if (scr.extent_int(2) != N2) errors() |= 8u; + Kokkos::parallel_for( + Kokkos::TeamThreadMDRange(team, N0, N1, N2), + [=](int i, int j, int k) { scr(i, j, k) = i * 100 + j * 10 + k; }); + team.team_barrier(); + Kokkos::parallel_for(Kokkos::TeamThreadMDRange(team, N0, N1, N2), + [=](int i, int j, int k) { + if (scr(i, j, k) != i * 100 + j * 10 + k) + errors() |= 16u; + }); + errors() |= 256u; + }); + unsigned h_errors = 0; + Kokkos::deep_copy(h_errors, errors); + + ASSERT_EQ((h_errors & 1u), 0u) << "Rank mismatch"; + ASSERT_EQ((h_errors & 2u), 0u) << "extent 0 mismatch"; + ASSERT_EQ((h_errors & 4u), 0u) << "extent 1 mismatch"; + ASSERT_EQ((h_errors & 8u), 0u) << "extent 2 mismatch"; + ASSERT_EQ((h_errors & 16u), 0u) << "data access incorrect"; + ASSERT_EQ(h_errors, 256u); +} + +TEST(TEST_CATEGORY, dyn_rank_view_team_scratch) { + test_dyn_rank_view_team_scratch(); +} + +} // namespace diff --git a/packages/kokkos/containers/unit_tests/TestDynViewAPI.hpp b/packages/kokkos/containers/unit_tests/TestDynViewAPI.hpp index 4ecb6cf25cc5..930c76c32c47 100644 --- a/packages/kokkos/containers/unit_tests/TestDynViewAPI.hpp +++ b/packages/kokkos/containers/unit_tests/TestDynViewAPI.hpp @@ -792,9 +792,8 @@ class TestDynViewAPI { int equal_ptr_h2_d = a_h2.data() == a_d.data() ? 1 : 0; int is_same_memspace = - std::is_same::value - ? 1 - : 0; + std::is_same_v ? 1 + : 0; ASSERT_EQ(equal_ptr_h_h2, 1); ASSERT_EQ(equal_ptr_h_d, is_same_memspace); ASSERT_EQ(equal_ptr_h2_d, is_same_memspace); @@ -817,9 +816,8 @@ class TestDynViewAPI { int equal_ptr_h2_d = a_h2.data() == a_d.data() ? 1 : 0; int is_same_memspace = - std::is_same::value - ? 1 - : 0; + std::is_same_v ? 1 + : 0; ASSERT_EQ(equal_ptr_h_h2, 1); ASSERT_EQ(equal_ptr_h_d, is_same_memspace); ASSERT_EQ(equal_ptr_h2_d, is_same_memspace); @@ -846,9 +844,8 @@ class TestDynViewAPI { int equal_ptr_h2_d = a_h2.data() == a_d.data() ? 1 : 0; int is_same_memspace = - std::is_same::value - ? 1 - : 0; + std::is_same_v ? 1 + : 0; ASSERT_EQ(equal_ptr_h_h2, 1); ASSERT_EQ(equal_ptr_h_d, is_same_memspace); ASSERT_EQ(equal_ptr_h2_d, is_same_memspace); @@ -879,8 +876,7 @@ class TestDynViewAPI { int equal_ptr_h3_d = a_h3.data() == a_d.data() ? 1 : 0; int is_same_memspace = - std::is_same::value + std::is_same_v ? 1 : 0; ASSERT_EQ(equal_ptr_h_h2, 1); @@ -915,8 +911,7 @@ class TestDynViewAPI { int equal_ptr_h3_d = a_h3.data() == a_d.data() ? 1 : 0; int is_same_memspace = - std::is_same::value + std::is_same_v ? 1 : 0; ASSERT_EQ(equal_ptr_h_h2, 1); @@ -943,8 +938,6 @@ class TestDynViewAPI { dView0 d("d"); -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) - // Rank 0 Kokkos::resize(d); @@ -1121,8 +1114,6 @@ class TestDynViewAPI { Kokkos::deep_copy(error_flag_host, error_flag); ASSERT_EQ(error_flag_host(), 0); #endif // MDRangePolict Rank < 7 - -#endif // defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) } static void run_test_scalar() { diff --git a/packages/kokkos/containers/unit_tests/TestDynamicView.hpp b/packages/kokkos/containers/unit_tests/TestDynamicView.hpp index c8f8fed3b8b3..94ccea86eb9b 100644 --- a/packages/kokkos/containers/unit_tests/TestDynamicView.hpp +++ b/packages/kokkos/containers/unit_tests/TestDynamicView.hpp @@ -71,7 +71,6 @@ struct TestDynamicView { da.resize_serial(da_size); ASSERT_EQ(da.size(), da_size); -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) Kokkos::parallel_for( Kokkos::RangePolicy(0, da_size), KOKKOS_LAMBDA(const int i) { da(i) = Scalar(i); }); @@ -85,7 +84,6 @@ struct TestDynamicView { result_sum); ASSERT_EQ(result_sum, (value_type)(da_size * (da_size - 1) / 2)); -#endif // add 3x more entries i.e. 4x larger than previous size // the first 1/4 should remain the same @@ -93,7 +91,6 @@ struct TestDynamicView { da.resize_serial(da_resize); ASSERT_EQ(da.size(), da_resize); -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) Kokkos::parallel_for( Kokkos::RangePolicy(da_size, da_resize), KOKKOS_LAMBDA(const int i) { da(i) = Scalar(i); }); @@ -108,7 +105,6 @@ struct TestDynamicView { ASSERT_EQ(new_result_sum + result_sum, (value_type)(da_resize * (da_resize - 1) / 2)); -#endif } // end scope // Test: Create DynamicView, initialize size (via resize), run through @@ -123,7 +119,6 @@ struct TestDynamicView { da.resize_serial(da_size); ASSERT_EQ(da.size(), da_size); -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) Kokkos::parallel_for( Kokkos::RangePolicy(0, da_size), KOKKOS_LAMBDA(const int i) { da(i) = Scalar(i); }); @@ -137,7 +132,6 @@ struct TestDynamicView { result_sum); ASSERT_EQ(result_sum, (value_type)(da_size * (da_size - 1) / 2)); -#endif // add 3x more entries i.e. 4x larger than previous size // the first 1/4 should remain the same @@ -145,7 +139,6 @@ struct TestDynamicView { da.resize_serial(da_resize); ASSERT_EQ(da.size(), da_resize); -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) Kokkos::parallel_for( Kokkos::RangePolicy(da_size, da_resize), KOKKOS_LAMBDA(const int i) { da(i) = Scalar(i); }); @@ -160,7 +153,6 @@ struct TestDynamicView { ASSERT_EQ(new_result_sum + result_sum, (value_type)(da_resize * (da_resize - 1) / 2)); -#endif } // end scope // Test: Create DynamicView, initialize size (via resize), run through @@ -175,7 +167,6 @@ struct TestDynamicView { da.resize_serial(da_size); ASSERT_EQ(da.size(), da_size); -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) Kokkos::parallel_for( Kokkos::RangePolicy(0, da_size), KOKKOS_LAMBDA(const int i) { da(i) = Scalar(i); }); @@ -189,14 +180,12 @@ struct TestDynamicView { result_sum); ASSERT_EQ(result_sum, (value_type)(da_size * (da_size - 1) / 2)); -#endif // remove the final 3/4 entries i.e. first 1/4 remain unsigned da_resize = arg_total_size / 8; da.resize_serial(da_resize); ASSERT_EQ(da.size(), da_resize); -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) Kokkos::parallel_for( Kokkos::RangePolicy(0, da_resize), KOKKOS_LAMBDA(const int i) { da(i) = Scalar(i); }); @@ -210,7 +199,6 @@ struct TestDynamicView { new_result_sum); ASSERT_EQ(new_result_sum, (value_type)(da_resize * (da_resize - 1) / 2)); -#endif } // end scope // Test: Reproducer to demonstrate compile-time error of deep_copy @@ -229,7 +217,6 @@ struct TestDynamicView { device_dynamic_view.resize_serial(da_size); // Use parallel_for to populate device_dynamic_view and verify values -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) Kokkos::parallel_for( Kokkos::RangePolicy(0, da_size), KOKKOS_LAMBDA(const int i) { device_dynamic_view(i) = Scalar(i); }); @@ -243,7 +230,6 @@ struct TestDynamicView { result_sum); ASSERT_EQ(result_sum, (value_type)(da_size * (da_size - 1) / 2)); -#endif // Use an on-device View as intermediate to deep_copy the // device_dynamic_view to host, zero out the device_dynamic_view, @@ -251,13 +237,11 @@ struct TestDynamicView { Kokkos::deep_copy(device_view, device_dynamic_view); Kokkos::deep_copy(host_view, device_view); Kokkos::deep_copy(device_view, host_view); -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) Kokkos::parallel_for( Kokkos::RangePolicy(0, da_size), KOKKOS_LAMBDA(const int i) { device_dynamic_view(i) = Scalar(0); }); -#endif Kokkos::deep_copy(device_dynamic_view, device_view); -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) + value_type new_result_sum = 0.0; Kokkos::parallel_reduce( Kokkos::RangePolicy(0, da_size), @@ -267,21 +251,6 @@ struct TestDynamicView { new_result_sum); ASSERT_EQ(new_result_sum, (value_type)(da_size * (da_size - 1) / 2)); -#endif - - // Try to deep_copy device_dynamic_view directly to/from host. - // host-to-device currently fails to compile because DP and SP are - // swapped in the deep_copy implementation. - // Once that's fixed, both deep_copy's will fail at runtime because the - // destination execution space cannot access the source memory space. - // Check if the memory spaces are different before testing the deep_copy. - if (!Kokkos::SpaceAccessibility::accessible) { - ASSERT_THROW(Kokkos::deep_copy(host_view, device_dynamic_view), - std::runtime_error); - ASSERT_THROW(Kokkos::deep_copy(device_dynamic_view, host_view), - std::runtime_error); - } } } }; diff --git a/packages/kokkos/containers/unit_tests/TestErrorReporter.hpp b/packages/kokkos/containers/unit_tests/TestErrorReporter.hpp index 0003a29468c5..4ebab889c78f 100644 --- a/packages/kokkos/containers/unit_tests/TestErrorReporter.hpp +++ b/packages/kokkos/containers/unit_tests/TestErrorReporter.hpp @@ -149,7 +149,6 @@ struct ErrorReporterDriver : public ErrorReporterDriverBase { } }; -#if !defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_CUDA_LAMBDA) template struct ErrorReporterDriverUseLambda : public ErrorReporterDriverBase { @@ -178,7 +177,6 @@ struct ErrorReporterDriverUseLambda driver_base::check_expectations(reporter_capacity, test_size); } }; -#endif #ifdef KOKKOS_ENABLE_OPENMP struct ErrorReporterDriverNativeOpenMP @@ -205,8 +203,7 @@ struct ErrorReporterDriverNativeOpenMP // FIXME_MSVC MSVC just gets confused when using the base class in the // KOKKOS_CLASS_LAMBDA -#if !defined(KOKKOS_COMPILER_MSVC) && \ - (!defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_CUDA_LAMBDA)) +#ifndef KOKKOS_COMPILER_MSVC TEST(TEST_CATEGORY, ErrorReporterViaLambda) { TestErrorReporter>(); } diff --git a/packages/kokkos/containers/unit_tests/TestOffsetView.hpp b/packages/kokkos/containers/unit_tests/TestOffsetView.hpp index c133922e3def..706b40fff386 100644 --- a/packages/kokkos/containers/unit_tests/TestOffsetView.hpp +++ b/packages/kokkos/containers/unit_tests/TestOffsetView.hpp @@ -56,7 +56,18 @@ void test_offsetview_construction() { offset_view_type ov("firstOV", range0, range1); ASSERT_EQ("firstOV", ov.label()); - ASSERT_EQ(2, ov.Rank); + +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS + KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() +#endif +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + ASSERT_EQ(2u, ov.Rank); +#endif +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS + KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() +#endif + + ASSERT_EQ(2u, ov.rank()); ASSERT_EQ(ov.begin(0), -1); ASSERT_EQ(ov.end(0), 4); @@ -67,7 +78,6 @@ void test_offsetview_construction() { ASSERT_EQ(ov.extent(0), 5u); ASSERT_EQ(ov.extent(1), 5u); -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) { Kokkos::Experimental::OffsetView offsetV1("OneDOffsetView", range0); @@ -149,7 +159,6 @@ void test_offsetview_construction() { } ASSERT_EQ(OVResult, answer) << "Bad data found in OffsetView"; -#endif { offset_view_type ovCopy(ov); @@ -184,7 +193,6 @@ void test_offsetview_construction() { range3_type rangePolicy3DZero(point3_type{{0, 0, 0}}, point3_type{{extent0, extent1, extent2}}); -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) int view3DSum = 0; Kokkos::parallel_reduce( rangePolicy3DZero, @@ -207,7 +215,6 @@ void test_offsetview_construction() { ASSERT_EQ(view3DSum, offsetView3DSum) << "construction of OffsetView from View and begins array broken."; -#endif } view_type viewFromOV = ov.view(); @@ -232,7 +239,6 @@ void test_offsetview_construction() { view_type aView("aView", ov.extent(0), ov.extent(1)); Kokkos::deep_copy(aView, ov); -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) int sum = 0; Kokkos::parallel_reduce( rangePolicy2D, @@ -242,7 +248,6 @@ void test_offsetview_construction() { sum); ASSERT_EQ(sum, 0) << "deep_copy(view, offsetView) broken."; -#endif } { // test view to offsetview deep copy @@ -251,7 +256,6 @@ void test_offsetview_construction() { Kokkos::deep_copy(aView, 99); Kokkos::deep_copy(ov, aView); -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) int sum = 0; Kokkos::parallel_reduce( rangePolicy2D, @@ -261,7 +265,6 @@ void test_offsetview_construction() { sum); ASSERT_EQ(sum, 0) << "deep_copy(offsetView, view) broken."; -#endif } } @@ -329,46 +332,131 @@ void test_offsetview_unmanaged_construction() { ASSERT_EQ(bb, ib); ASSERT_EQ(bb, ii); } +} + +template +void test_offsetview_unmanaged_construction_death() { + // Preallocated memory (Only need a valid address for this test) + Scalar s; + + // Regular expression syntax on Windows is a pain. `.` does not match `\n`. + // Feel free to make it work if you have time to spare. +#ifdef _WIN32 +#define SKIP_REGEX_ON_WINDOWS(REGEX) "" +#else +#define SKIP_REGEX_ON_WINDOWS(REGEX) REGEX +#endif { using offset_view_type = Kokkos::Experimental::OffsetView; // Range calculations must be positive - ASSERT_NO_THROW(offset_view_type(&s, {0}, {1})); - ASSERT_NO_THROW(offset_view_type(&s, {0}, {0})); - ASSERT_THROW(offset_view_type(&s, {0}, {-1}), std::runtime_error); + (void)offset_view_type(&s, {0}, {1}); + (void)offset_view_type(&s, {0}, {0}); + ASSERT_DEATH( + offset_view_type(&s, {0}, {-1}), + SKIP_REGEX_ON_WINDOWS( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView" + ".*" + "\\(ends\\[0\\] \\(-1\\) - begins\\[0\\] \\(0\\)\\) must be " + "non-negative")); } { using offset_view_type = Kokkos::Experimental::OffsetView; // Range calculations must not overflow - ASSERT_NO_THROW(offset_view_type(&s, {0}, {0x7fffffffffffffffl})); - ASSERT_THROW(offset_view_type(&s, {-1}, {0x7fffffffffffffffl}), - std::runtime_error); - ASSERT_THROW( + (void)offset_view_type(&s, {0}, {0x7fffffffffffffffl}); + ASSERT_DEATH( + offset_view_type(&s, {-1}, {0x7fffffffffffffffl}), + SKIP_REGEX_ON_WINDOWS( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView" + ".*" + "\\(ends\\[0\\] \\(9223372036854775807\\) - begins\\[0\\] " + "\\(-1\\)\\) " + "overflows")); + ASSERT_DEATH( offset_view_type(&s, {-0x7fffffffffffffffl - 1}, {0x7fffffffffffffffl}), - std::runtime_error); - ASSERT_THROW(offset_view_type(&s, {-0x7fffffffffffffffl - 1}, {0}), - std::runtime_error); + SKIP_REGEX_ON_WINDOWS( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView" + ".*" + "\\(ends\\[0\\] \\(9223372036854775807\\) - begins\\[0\\] " + "\\(-9223372036854775808\\)\\) " + "overflows")); + ASSERT_DEATH( + offset_view_type(&s, {-0x7fffffffffffffffl - 1}, {0}), + SKIP_REGEX_ON_WINDOWS( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView" + ".*" + "\\(ends\\[0\\] \\(0\\) - begins\\[0\\] " + "\\(-9223372036854775808\\)\\) " + "overflows")); } { using offset_view_type = Kokkos::Experimental::OffsetView; - // Should throw when the rank of begins and/or ends doesn't match that of - // OffsetView - ASSERT_THROW(offset_view_type(&s, {0}, {1}), std::runtime_error); - ASSERT_THROW(offset_view_type(&s, {0}, {1, 1}), std::runtime_error); - ASSERT_THROW(offset_view_type(&s, {0}, {1, 1, 1}), std::runtime_error); - ASSERT_THROW(offset_view_type(&s, {0, 0}, {1}), std::runtime_error); - ASSERT_NO_THROW(offset_view_type(&s, {0, 0}, {1, 1})); - ASSERT_THROW(offset_view_type(&s, {0, 0}, {1, 1, 1}), std::runtime_error); - ASSERT_THROW(offset_view_type(&s, {0, 0, 0}, {1}), std::runtime_error); - ASSERT_THROW(offset_view_type(&s, {0, 0, 0}, {1, 1}), std::runtime_error); - ASSERT_THROW(offset_view_type(&s, {0, 0, 0}, {1, 1, 1}), - std::runtime_error); + // Should throw when the rank of begins and/or ends doesn't match that + // of OffsetView + ASSERT_DEATH( + offset_view_type(&s, {0}, {1}), + SKIP_REGEX_ON_WINDOWS( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView" + ".*" + "begins\\.size\\(\\) \\(1\\) != Rank \\(2\\)" + ".*" + "ends\\.size\\(\\) \\(1\\) != Rank \\(2\\)")); + ASSERT_DEATH( + offset_view_type(&s, {0}, {1, 1}), + SKIP_REGEX_ON_WINDOWS( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView" + ".*" + "begins\\.size\\(\\) \\(1\\) != Rank \\(2\\)")); + ASSERT_DEATH( + offset_view_type(&s, {0}, {1, 1, 1}), + SKIP_REGEX_ON_WINDOWS( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView" + ".*" + "begins\\.size\\(\\) \\(1\\) != Rank \\(2\\)" + ".*" + "ends\\.size\\(\\) \\(3\\) != Rank \\(2\\)")); + ASSERT_DEATH( + offset_view_type(&s, {0, 0}, {1}), + SKIP_REGEX_ON_WINDOWS( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView" + ".*" + "ends\\.size\\(\\) \\(1\\) != Rank \\(2\\)")); + (void)offset_view_type(&s, {0, 0}, {1, 1}); + ASSERT_DEATH( + offset_view_type(&s, {0, 0}, {1, 1, 1}), + SKIP_REGEX_ON_WINDOWS( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView" + ".*" + "ends\\.size\\(\\) \\(3\\) != Rank \\(2\\)")); + ASSERT_DEATH( + offset_view_type(&s, {0, 0, 0}, {1}), + SKIP_REGEX_ON_WINDOWS( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView" + ".*" + "begins\\.size\\(\\) \\(3\\) != Rank \\(2\\)" + ".*" + "ends\\.size\\(\\) \\(1\\) != Rank \\(2\\)")); + ASSERT_DEATH( + offset_view_type(&s, {0, 0, 0}, {1, 1}), + SKIP_REGEX_ON_WINDOWS( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView" + ".*" + "begins\\.size\\(\\) \\(3\\) != Rank \\(2\\)")); + ASSERT_DEATH( + offset_view_type(&s, {0, 0, 0}, {1, 1, 1}), + SKIP_REGEX_ON_WINDOWS( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView" + ".*" + "begins\\.size\\(\\) \\(3\\) != Rank \\(2\\)" + ".*" + "ends\\.size\\(\\) \\(3\\) != Rank \\(2\\)")); } +#undef SKIP_REGEX_ON_WINDOWS } template @@ -377,8 +465,8 @@ void test_offsetview_subview() { Kokkos::Experimental::OffsetView sliceMe("offsetToSlice", {-10, 20}); { - auto offsetSubviewa = Kokkos::Experimental::subview(sliceMe, 0); - ASSERT_EQ(offsetSubviewa.Rank, 0) << "subview of offset is broken."; + auto offsetSubview = Kokkos::Experimental::subview(sliceMe, 0); + ASSERT_EQ(offsetSubview.rank(), 0u) << "subview of offset is broken."; } } { // test subview 2 @@ -387,13 +475,13 @@ void test_offsetview_subview() { { auto offsetSubview = Kokkos::Experimental::subview(sliceMe, Kokkos::ALL(), -2); - ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 1u) << "subview of offset is broken."; } { auto offsetSubview = Kokkos::Experimental::subview(sliceMe, 0, Kokkos::ALL()); - ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 1u) << "subview of offset is broken."; } } @@ -406,30 +494,29 @@ void test_offsetview_subview() { { auto offsetSubview = Kokkos::Experimental::subview(sliceMe, Kokkos::ALL(), Kokkos::ALL(), 0); - ASSERT_EQ(offsetSubview.Rank, 2) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 2u) << "subview of offset is broken."; } { auto offsetSubview = Kokkos::Experimental::subview(sliceMe, Kokkos::ALL(), 0, Kokkos::ALL()); - ASSERT_EQ(offsetSubview.Rank, 2) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 2u) << "subview of offset is broken."; } { auto offsetSubview = Kokkos::Experimental::subview( sliceMe, 0, Kokkos::ALL(), Kokkos::ALL()); - ASSERT_EQ(offsetSubview.Rank, 2) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 2u) << "subview of offset is broken."; } { auto offsetSubview = Kokkos::Experimental::subview( sliceMe, 0, Kokkos::ALL(), Kokkos::make_pair(-30, -21)); - ASSERT_EQ(offsetSubview.Rank, 2) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 2u) << "subview of offset is broken."; ASSERT_EQ(offsetSubview.begin(0), -20); ASSERT_EQ(offsetSubview.end(0), 31); ASSERT_EQ(offsetSubview.begin(1), 0); ASSERT_EQ(offsetSubview.end(1), 9); -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) using range_type = Kokkos::MDRangePolicy, Kokkos::IndexType >; using point_type = typename range_type::point_type; @@ -455,25 +542,24 @@ void test_offsetview_subview() { sum); ASSERT_EQ(sum, 6 * (e0 - b0) * (e1 - b1)); -#endif } // slice 2 { auto offsetSubview = Kokkos::Experimental::subview(sliceMe, Kokkos::ALL(), 0, 0); - ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 1u) << "subview of offset is broken."; } { auto offsetSubview = Kokkos::Experimental::subview(sliceMe, 0, 0, Kokkos::ALL()); - ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 1u) << "subview of offset is broken."; } { auto offsetSubview = Kokkos::Experimental::subview(sliceMe, 0, Kokkos::ALL(), 0); - ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 1u) << "subview of offset is broken."; } } @@ -486,73 +572,72 @@ void test_offsetview_subview() { { auto offsetSubview = Kokkos::Experimental::subview( sliceMe, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), 0); - ASSERT_EQ(offsetSubview.Rank, 3) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 3u) << "subview of offset is broken."; } { auto offsetSubview = Kokkos::Experimental::subview( sliceMe, Kokkos::ALL(), Kokkos::ALL(), 0, Kokkos::ALL()); - ASSERT_EQ(offsetSubview.Rank, 3) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 3u) << "subview of offset is broken."; } { auto offsetSubview = Kokkos::Experimental::subview( sliceMe, Kokkos::ALL(), 0, Kokkos::ALL(), Kokkos::ALL()); - ASSERT_EQ(offsetSubview.Rank, 3) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 3u) << "subview of offset is broken."; } { auto offsetSubview = Kokkos::Experimental::subview( sliceMe, 0, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); - ASSERT_EQ(offsetSubview.Rank, 3) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 3u) << "subview of offset is broken."; } // slice 2 auto offsetSubview2a = Kokkos::Experimental::subview(sliceMe, Kokkos::ALL(), Kokkos::ALL(), 0, 0); - ASSERT_EQ(offsetSubview2a.Rank, 2) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview2a.rank(), 2u) << "subview of offset is broken."; { auto offsetSubview2b = Kokkos::Experimental::subview( sliceMe, Kokkos::ALL(), 0, Kokkos::ALL(), 0); - ASSERT_EQ(offsetSubview2b.Rank, 2) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview2b.rank(), 2u) << "subview of offset is broken."; } { auto offsetSubview2b = Kokkos::Experimental::subview( sliceMe, Kokkos::ALL(), 0, 0, Kokkos::ALL()); - ASSERT_EQ(offsetSubview2b.Rank, 2) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview2b.rank(), 2u) << "subview of offset is broken."; } { auto offsetSubview2b = Kokkos::Experimental::subview( sliceMe, 0, Kokkos::ALL(), 0, Kokkos::ALL()); - ASSERT_EQ(offsetSubview2b.Rank, 2) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview2b.rank(), 2u) << "subview of offset is broken."; } { auto offsetSubview2b = Kokkos::Experimental::subview( sliceMe, 0, 0, Kokkos::ALL(), Kokkos::ALL()); - ASSERT_EQ(offsetSubview2b.Rank, 2) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview2b.rank(), 2u) << "subview of offset is broken."; } // slice 3 { auto offsetSubview = Kokkos::Experimental::subview(sliceMe, Kokkos::ALL(), 0, 0, 0); - ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 1u) << "subview of offset is broken."; } { auto offsetSubview = Kokkos::Experimental::subview(sliceMe, 0, Kokkos::ALL(), 0, 0); - ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 1u) << "subview of offset is broken."; } { auto offsetSubview = Kokkos::Experimental::subview(sliceMe, 0, 0, Kokkos::ALL(), 0); - ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 1u) << "subview of offset is broken."; } { auto offsetSubview = Kokkos::Experimental::subview(sliceMe, 0, 0, 0, Kokkos::ALL()); - ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 1u) << "subview of offset is broken."; } } } -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) template KOKKOS_INLINE_FUNCTION T std_accumulate(InputIt first, InputIt last, T init, BinaryOperation op) { @@ -586,6 +671,7 @@ void test_offsetview_offsets_rank1() { KOKKOS_LAMBDA(const int ii, int& lerrors) { offset_view_type ov(v, {ii}); lerrors += (ov(3) != element({3 - ii})); + lerrors += (ov[3] != element({3 - ii})); }, errors); @@ -655,7 +741,6 @@ void test_offsetview_offsets_rank3() { ASSERT_EQ(0, errors); } -#endif TEST(TEST_CATEGORY, offsetview_construction) { test_offsetview_construction(); @@ -665,11 +750,15 @@ TEST(TEST_CATEGORY, offsetview_unmanaged_construction) { test_offsetview_unmanaged_construction(); } +TEST(TEST_CATEGORY_DEATH, offsetview_unmanaged_construction) { + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + test_offsetview_unmanaged_construction_death(); +} + TEST(TEST_CATEGORY, offsetview_subview) { test_offsetview_subview(); } -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) TEST(TEST_CATEGORY, offsetview_offsets_rank1) { test_offsetview_offsets_rank1(); } @@ -681,7 +770,6 @@ TEST(TEST_CATEGORY, offsetview_offsets_rank2) { TEST(TEST_CATEGORY, offsetview_offsets_rank3) { test_offsetview_offsets_rank3(); } -#endif } // namespace Test diff --git a/packages/kokkos/containers/unit_tests/TestScatterView.hpp b/packages/kokkos/containers/unit_tests/TestScatterView.hpp index 733f43122ce9..72c1afbe96a7 100644 --- a/packages/kokkos/containers/unit_tests/TestScatterView.hpp +++ b/packages/kokkos/containers/unit_tests/TestScatterView.hpp @@ -33,11 +33,11 @@ struct test_scatter_view_impl_cls { public: using scatter_view_type = - Kokkos::Experimental::ScatterView; - using orig_view_type = Kokkos::View; + using orig_view_type = Kokkos::View; using size_type = typename Kokkos::HostSpace::size_type; @@ -134,11 +134,11 @@ struct test_scatter_view_impl_cls { public: using scatter_view_type = - Kokkos::Experimental::ScatterView; - using orig_view_type = Kokkos::View; + using orig_view_type = Kokkos::View; using size_type = typename Kokkos::HostSpace::size_type; @@ -235,11 +235,11 @@ struct test_scatter_view_impl_cls { public: using scatter_view_type = - Kokkos::Experimental::ScatterView; - using orig_view_type = Kokkos::View; + using orig_view_type = Kokkos::View; using size_type = typename Kokkos::HostSpace::size_type; @@ -335,11 +335,11 @@ struct test_scatter_view_impl_cls { public: using scatter_view_type = - Kokkos::Experimental::ScatterView; - using orig_view_type = Kokkos::View; + using orig_view_type = Kokkos::View; using size_type = typename Kokkos::HostSpace::size_type; @@ -714,7 +714,7 @@ void test_scatter_view(int64_t n) { test_sv_config.run_test(n); } #ifdef KOKKOS_ENABLE_SERIAL - if (!std::is_same::value) { + if (!std::is_same_v) { #endif test_scatter_view_config::value)); - ASSERT_TRUE((std::is_same::value)); - ASSERT_TRUE((std::is_same::value)); - ASSERT_TRUE((std::is_same::value)); + ASSERT_TRUE((std::is_same_v)); + ASSERT_TRUE((std::is_same_v)); + ASSERT_TRUE((std::is_same_v)); + ASSERT_TRUE((std::is_same_v)); } } /* namespace TestStaticCrsGraph */ diff --git a/packages/kokkos/containers/unit_tests/TestUnorderedMap.hpp b/packages/kokkos/containers/unit_tests/TestUnorderedMap.hpp index 4a7e826ecbe4..fc7435a75e56 100644 --- a/packages/kokkos/containers/unit_tests/TestUnorderedMap.hpp +++ b/packages/kokkos/containers/unit_tests/TestUnorderedMap.hpp @@ -460,7 +460,7 @@ struct UnorderedMapInsert { //! Insert multiple values. template - void insert(Args &&... args) const { + void insert(Args &&...args) const { static_assert(sizeof...(Args) > 1, "Prefer the single value version"); constexpr size_t size = sizeof...(Args); Kokkos::Array values{ @@ -534,8 +534,6 @@ TEST(TEST_CATEGORY, UnorderedMap_shallow_copyable_on_device) { ASSERT_EQ(1u, test_map_copy.m_map.size()); } -#if !defined(KOKKOS_ENABLE_CUDA) || \ - (defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_ENABLE_CUDA_LAMBDA)) void test_unordered_map_device_capture() { TestMapCopy::map_type map; @@ -549,7 +547,6 @@ void test_unordered_map_device_capture() { TEST(TEST_CATEGORY, UnorderedMap_lambda_capturable) { test_unordered_map_device_capture(); } -#endif /** * @test This test ensures that an @ref UnorderedMap can be built diff --git a/packages/kokkos/containers/unit_tests/TestViewCtorPropEmbeddedDim.hpp b/packages/kokkos/containers/unit_tests/TestViewCtorPropEmbeddedDim.hpp index 0246f11ddfe7..2edddcce34f4 100644 --- a/packages/kokkos/containers/unit_tests/TestViewCtorPropEmbeddedDim.hpp +++ b/packages/kokkos/containers/unit_tests/TestViewCtorPropEmbeddedDim.hpp @@ -48,7 +48,7 @@ struct TestViewCtorProp_EmbeddedDim { void operator()(const int i) const { v(i) = i; } }; - static void test_vcpt(const int N0, const int N1) { + static void test_vcpt(const size_t N0, const size_t N1) { // Create two views to test { using VIT = typename TestViewCtorProp_EmbeddedDim::ViewIntType; @@ -78,16 +78,16 @@ struct TestViewCtorProp_EmbeddedDim { HostCVT hcv1 = Kokkos::create_mirror_view(cv1); Kokkos::deep_copy(hcv1, cv1); - ASSERT_EQ((std::is_same::value), true); + ASSERT_EQ((std::is_same_v), true); #if 0 // debug output - for ( int i = 0; i < N0*N1; ++i ) { - printf(" Output check: hcv1(%d) = %lf\n ", i, hcv1(i) ); + for ( size_t i = 0; i < N0*N1; ++i ) { + printf(" Output check: hcv1(%zu) = %lf\n ", i, hcv1(i) ); } printf( " Common value type view: %s \n", typeid( CVT() ).name() ); printf( " Common value type: %s \n", typeid( CommonViewValueType() ).name() ); - if ( std::is_same< CommonViewValueType, double >::value == true ) { + if ( std::is_same_v< CommonViewValueType, double > == true ) { printf("Proper common value_type\n"); } else { @@ -115,7 +115,7 @@ struct TestViewCtorProp_EmbeddedDim { HostCVT hcv1 = Kokkos::create_mirror_view(cv1); Kokkos::deep_copy(hcv1, cv1); - ASSERT_EQ((std::is_same::value), true); + ASSERT_EQ((std::is_same_v), true); } } @@ -148,7 +148,7 @@ struct TestViewCtorProp_EmbeddedDim { HostCVT hcv1 = Kokkos::create_mirror_view(cv1); Kokkos::deep_copy(hcv1, cv1); - ASSERT_EQ((std::is_same::value), true); + ASSERT_EQ((std::is_same_v), true); } { @@ -169,7 +169,7 @@ struct TestViewCtorProp_EmbeddedDim { HostCVT hcv1 = Kokkos::create_mirror_view(cv1); Kokkos::deep_copy(hcv1, cv1); - ASSERT_EQ((std::is_same::value), true); + ASSERT_EQ((std::is_same_v), true); } } diff --git a/packages/kokkos/containers/unit_tests/TestWithoutInitializing.hpp b/packages/kokkos/containers/unit_tests/TestWithoutInitializing.hpp index e8558628dc84..2932898554c5 100644 --- a/packages/kokkos/containers/unit_tests/TestWithoutInitializing.hpp +++ b/packages/kokkos/containers/unit_tests/TestWithoutInitializing.hpp @@ -44,6 +44,12 @@ Kokkos::CudaSpace>) \ GTEST_SKIP() << "skipping since unified memory requires additional " \ "fences"; +#elif defined(KOKKOS_IMPL_HIP_UNIFIED_MEMORY) +#define GTEST_SKIP_IF_UNIFIED_MEMORY_SPACE \ + if constexpr (std::is_same_v) \ + GTEST_SKIP() << "skipping since unified memory requires additional " \ + "fences"; #else #define GTEST_SKIP_IF_UNIFIED_MEMORY_SPACE #endif @@ -51,8 +57,7 @@ TEST(TEST_CATEGORY, resize_realloc_no_init_dualview) { using namespace Kokkos::Test::Tools; listen_tool_events(Config::DisableAll(), Config::EnableKernels()); - Kokkos::DualView bla("bla", 5, 6, 7, - 8); + Kokkos::DualView bla("bla", 5, 6, 7, 8); auto success = validate_absence( [&]() { @@ -82,8 +87,7 @@ TEST(TEST_CATEGORY, resize_realloc_no_alloc_dualview) { using namespace Kokkos::Test::Tools; listen_tool_events(Config::DisableAll(), Config::EnableKernels(), Config::EnableAllocs()); - Kokkos::DualView bla("bla", 8, 7, 6, - 5); + Kokkos::DualView bla("bla", 8, 7, 6, 5); auto success = validate_absence( [&]() { @@ -112,8 +116,7 @@ TEST(TEST_CATEGORY, resize_exec_space_dualview) { using namespace Kokkos::Test::Tools; listen_tool_events(Config::DisableAll(), Config::EnableFences(), Config::EnableKernels()); - Kokkos::DualView bla("bla", 8, 7, 6, - 5); + Kokkos::DualView bla("bla", 8, 7, 6, 5); auto success = validate_absence( [&]() { @@ -245,7 +248,7 @@ TEST(TEST_CATEGORY, realloc_exec_space_dynrankview) { // FIXME_THREADS The Threads backend fences every parallel_for #ifdef KOKKOS_ENABLE_THREADS - if (std::is_same::value) + if (std::is_same_v) GTEST_SKIP() << "skipping since the Threads backend isn't asynchronous"; #endif @@ -280,7 +283,7 @@ TEST(TEST_CATEGORY, resize_realloc_no_init_scatterview) { using namespace Kokkos::Test::Tools; listen_tool_events(Config::DisableAll(), Config::EnableKernels()); Kokkos::Experimental::ScatterView< - int*** * [1][2][3], typename TEST_EXECSPACE::array_layout, TEST_EXECSPACE> + int**** [1][2][3], typename TEST_EXECSPACE::array_layout, TEST_EXECSPACE> bla("bla", 4, 5, 6, 7); auto success = validate_absence( @@ -312,7 +315,7 @@ TEST(TEST_CATEGORY, resize_realloc_no_alloc_scatterview) { listen_tool_events(Config::DisableAll(), Config::EnableKernels(), Config::EnableAllocs()); Kokkos::Experimental::ScatterView< - int*** * [1][2][3], typename TEST_EXECSPACE::array_layout, TEST_EXECSPACE> + int**** [1][2][3], typename TEST_EXECSPACE::array_layout, TEST_EXECSPACE> bla("bla", 7, 6, 5, 4); auto success = validate_absence( @@ -343,7 +346,7 @@ TEST(TEST_CATEGORY, resize_exec_space_scatterview) { listen_tool_events(Config::DisableAll(), Config::EnableFences(), Config::EnableKernels()); Kokkos::Experimental::ScatterView< - int*** * [1][2][3], typename TEST_EXECSPACE::array_layout, TEST_EXECSPACE> + int**** [1][2][3], typename TEST_EXECSPACE::array_layout, TEST_EXECSPACE> bla("bla", 7, 6, 5, 4); auto success = validate_absence( @@ -384,13 +387,12 @@ TEST(TEST_CATEGORY, realloc_exec_space_scatterview) { // FIXME_THREADS The Threads backend fences every parallel_for #ifdef KOKKOS_ENABLE_THREADS - if (std::is_same::value) + if (std::is_same_v) GTEST_SKIP() << "skipping since the Threads backend isn't asynchronous"; #endif #if defined(KOKKOS_ENABLE_HPX) && \ !defined(KOKKOS_ENABLE_IMPL_HPX_ASYNC_DISPATCH) - if (std::is_same::value) + if (std::is_same_v) GTEST_SKIP() << "skipping since the HPX backend always fences with async " "dispatch disabled"; #endif diff --git a/packages/kokkos/core/CMakeLists.txt b/packages/kokkos/core/CMakeLists.txt index 0917928001a9..21f05f627242 100644 --- a/packages/kokkos/core/CMakeLists.txt +++ b/packages/kokkos/core/CMakeLists.txt @@ -1,22 +1,14 @@ -IF (NOT Kokkos_INSTALL_TESTING) - ADD_SUBDIRECTORY(src) -ENDIF() +if(NOT Kokkos_INSTALL_TESTING) + add_subdirectory(src) +endif() -FUNCTION(KOKKOS_ADD_BENCHMARK_DIRECTORY DIR_NAME) - IF(NOT Kokkos_ENABLE_BENCHMARKS) - RETURN() - ENDIF() +function(KOKKOS_ADD_BENCHMARK_DIRECTORY DIR_NAME) + if(NOT Kokkos_ENABLE_BENCHMARKS) + return() + endif() - IF(KOKKOS_HAS_TRILINOS) - message( - STATUS - "Benchmarks are not supported when building as part of Trilinos" - ) - RETURN() - ENDIF() + add_subdirectory(${DIR_NAME}) +endfunction() - ADD_SUBDIRECTORY(${DIR_NAME}) -ENDFUNCTION() - -KOKKOS_ADD_TEST_DIRECTORIES(unit_test) -KOKKOS_ADD_BENCHMARK_DIRECTORY(perf_test) +kokkos_add_test_directories(unit_test) +kokkos_add_benchmark_directory(perf_test) diff --git a/packages/kokkos/core/perf_test/CMakeLists.txt b/packages/kokkos/core/perf_test/CMakeLists.txt index e0dba03e1ecb..0cb2c804d383 100644 --- a/packages/kokkos/core/perf_test/CMakeLists.txt +++ b/packages/kokkos/core/perf_test/CMakeLists.txt @@ -1,50 +1,36 @@ # FIXME_OPENMPTARGET - the NVIDIA HPC compiler nvc++ in the OpenMPTarget backend does not pass the perf_tests. # FIXME_OPENACC - temporarily disabled due to unimplemented features -IF ((KOKKOS_ENABLE_OPENMPTARGET OR KOKKOS_ENABLE_OPENACC) AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) - RETURN() -ENDIF() -IF (KOKKOS_ENABLE_OPENACC AND KOKKOS_CXX_COMPILER_ID STREQUAL Clang) - RETURN() -ENDIF() +if((KOKKOS_ENABLE_OPENMPTARGET OR KOKKOS_ENABLE_OPENACC) AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) + return() +endif() +if(KOKKOS_ENABLE_OPENACC AND KOKKOS_CXX_COMPILER_ID STREQUAL Clang) + return() +endif() # all PerformanceTest_* executables are part of regular tests # TODO: finish converting these into benchmarks (in progress) -IF(KOKKOS_ENABLE_TESTS) - IF(KOKKOS_ENABLE_CUDA OR KOKKOS_ENABLE_HIP OR KOKKOS_ENABLE_SYCL) - KOKKOS_ADD_EXECUTABLE ( - PerformanceTest_SharedSpace - SOURCES test_sharedSpace.cpp - ) - ENDIF() - - KOKKOS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) - - IF(NOT Kokkos_ENABLE_OPENMPTARGET) - # FIXME OPENMPTARGET needs tasking - KOKKOS_ADD_EXECUTABLE_AND_TEST( - PerformanceTest_TaskDag - SOURCES test_taskdag.cpp - CATEGORIES PERFORMANCE - ) - ENDIF() -ENDIF() - -IF(NOT Kokkos_ENABLE_BENCHMARKS) - RETURN() -ENDIF() - -IF (KOKKOS_HAS_TRILINOS) - message(FATAL_ERROR "Benchmarks are not supported when building as part of Trilinos") -ENDIF() +if(KOKKOS_ENABLE_TESTS) + if(KOKKOS_ENABLE_CUDA OR KOKKOS_ENABLE_HIP OR KOKKOS_ENABLE_SYCL) + kokkos_add_executable(PerformanceTest_SharedSpace SOURCES test_sharedSpace.cpp) + endif() + + kokkos_include_directories(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) + + kokkos_add_executable_and_test(PerformanceTest_TaskDag SOURCES test_taskdag.cpp CATEGORIES PERFORMANCE) +endif() + +if(NOT Kokkos_ENABLE_BENCHMARKS) + return() +endif() # Find or download google/benchmark library find_package(benchmark QUIET 1.5.6) -IF(benchmark_FOUND) - MESSAGE(STATUS "Using google benchmark found in ${benchmark_DIR}") -ELSE() +if(benchmark_FOUND) + message(STATUS "Using google benchmark found in ${benchmark_DIR}") +else() message(STATUS "No installed google benchmark found, fetching from GitHub") include(FetchContent) - SET(BENCHMARK_ENABLE_TESTING OFF) + set(BENCHMARK_ENABLE_TESTING OFF) list(APPEND CMAKE_MESSAGE_INDENT "[benchmark] ") FetchContent_Declare( @@ -57,143 +43,93 @@ ELSE() list(POP_BACK CMAKE_MESSAGE_INDENT) # Suppress clang-tidy diagnostics on code that we do not have control over - IF(CMAKE_CXX_CLANG_TIDY) - SET_TARGET_PROPERTIES(benchmark PROPERTIES CXX_CLANG_TIDY "") - ENDIF() + if(CMAKE_CXX_CLANG_TIDY) + set_target_properties(benchmark PROPERTIES CXX_CLANG_TIDY "") + endif() target_compile_options(benchmark PRIVATE -w) target_compile_options(benchmark_main PRIVATE -w) -ENDIF() +endif() +function(KOKKOS_ADD_BENCHMARK NAME) + cmake_parse_arguments(BENCHMARK "" "" "SOURCES" ${ARGN}) + if(DEFINED BENCHMARK_UNPARSED_ARGUMENTS) + message(WARNING "Unexpected arguments when adding a benchmark: " ${BENCHMARK_UNPARSED_ARGUMENTS}) + endif() -FUNCTION(KOKKOS_ADD_BENCHMARK NAME) - CMAKE_PARSE_ARGUMENTS( - BENCHMARK - "" - "" - "SOURCES" - ${ARGN} - ) - IF(DEFINED BENCHMARK_UNPARSED_ARGUMENTS) - MESSAGE( - WARNING - "Unexpected arguments when adding a benchmark: " - ${BENCHMARK_UNPARSED_ARGUMENTS} - ) - ENDIF() - - SET(BENCHMARK_NAME ${PACKAGE_NAME}_${NAME}) - LIST(APPEND BENCHMARK_SOURCES - BenchmarkMain.cpp - Benchmark_Context.cpp - ) + set(BENCHMARK_NAME Kokkos_${NAME}) + list(APPEND BENCHMARK_SOURCES BenchmarkMain.cpp Benchmark_Context.cpp) - ADD_EXECUTABLE( - ${BENCHMARK_NAME} - ${BENCHMARK_SOURCES} - ) - TARGET_LINK_LIBRARIES( - ${BENCHMARK_NAME} - PRIVATE benchmark::benchmark Kokkos::kokkos impl_git_version - ) - TARGET_INCLUDE_DIRECTORIES( - ${BENCHMARK_NAME} - SYSTEM PRIVATE ${benchmark_SOURCE_DIR}/include - ) + add_executable(${BENCHMARK_NAME} ${BENCHMARK_SOURCES}) + target_link_libraries(${BENCHMARK_NAME} PRIVATE benchmark::benchmark Kokkos::kokkos impl_git_version) + target_include_directories(${BENCHMARK_NAME} SYSTEM PRIVATE ${benchmark_SOURCE_DIR}/include) - FOREACH(SOURCE_FILE ${BENCHMARK_SOURCES}) - SET_SOURCE_FILES_PROPERTIES( - ${SOURCE_FILE} - PROPERTIES LANGUAGE ${KOKKOS_COMPILE_LANGUAGE} - ) - ENDFOREACH() - - STRING(TIMESTAMP BENCHMARK_TIME "%Y-%m-%d_T%H-%M-%S" UTC) - SET( - BENCHMARK_ARGS - --benchmark_counters_tabular=true - --benchmark_out=${BENCHMARK_NAME}_${BENCHMARK_TIME}.json - ) + foreach(SOURCE_FILE ${BENCHMARK_SOURCES}) + set_source_files_properties(${SOURCE_FILE} PROPERTIES LANGUAGE ${KOKKOS_COMPILE_LANGUAGE}) + endforeach() - ADD_TEST( - NAME ${BENCHMARK_NAME} - COMMAND ${BENCHMARK_NAME} ${BENCHMARK_ARGS} - ) -ENDFUNCTION() - -SET( - BENCHMARK_SOURCES - PerfTestGramSchmidt.cpp - PerfTest_CustomReduction.cpp - PerfTest_ExecSpacePartitioning.cpp - PerfTestHexGrad.cpp - PerfTest_MallocFree.cpp - PerfTest_ViewAllocate.cpp - PerfTest_ViewCopy_a123.cpp - PerfTest_ViewCopy_b123.cpp - PerfTest_ViewCopy_c123.cpp - PerfTest_ViewCopy_d123.cpp - PerfTest_ViewCopy_a45.cpp - PerfTest_ViewCopy_b45.cpp - PerfTest_ViewCopy_c45.cpp - PerfTest_ViewCopy_d45.cpp - PerfTest_ViewCopy_a6.cpp - PerfTest_ViewCopy_b6.cpp - PerfTest_ViewCopy_c6.cpp - PerfTest_ViewCopy_d6.cpp - PerfTest_ViewCopy_a7.cpp - PerfTest_ViewCopy_b7.cpp - PerfTest_ViewCopy_c7.cpp - PerfTest_ViewCopy_d7.cpp - PerfTest_ViewCopy_a8.cpp - PerfTest_ViewCopy_b8.cpp - PerfTest_ViewCopy_c8.cpp - PerfTest_ViewCopy_d8.cpp - PerfTest_ViewCopy_Raw.cpp - PerfTest_ViewFill_123.cpp - PerfTest_ViewFill_45.cpp - PerfTest_ViewFill_6.cpp - PerfTest_ViewFill_7.cpp - PerfTest_ViewFill_8.cpp - PerfTest_ViewFill_Raw.cpp - PerfTest_ViewResize_123.cpp - PerfTest_ViewResize_45.cpp - PerfTest_ViewResize_6.cpp - PerfTest_ViewResize_7.cpp - PerfTest_ViewResize_8.cpp - PerfTest_ViewResize_Raw.cpp -) + string(TIMESTAMP BENCHMARK_TIME "%Y-%m-%d_T%H-%M-%S" UTC) + set(BENCHMARK_ARGS --benchmark_counters_tabular=true --benchmark_out=${BENCHMARK_NAME}_${BENCHMARK_TIME}.json) + + add_test(NAME ${BENCHMARK_NAME} COMMAND ${BENCHMARK_NAME} ${BENCHMARK_ARGS}) +endfunction() -IF(Kokkos_ENABLE_OPENMPTARGET) -# FIXME OPENMPTARGET requires TeamPolicy Reductions and Custom Reduction - LIST(REMOVE_ITEM BENCHMARK_SOURCES +set(BENCHMARK_SOURCES PerfTestGramSchmidt.cpp PerfTest_CustomReduction.cpp PerfTest_ExecSpacePartitioning.cpp - ) -ENDIF() - -KOKKOS_ADD_BENCHMARK( - PerformanceTest_Benchmark - SOURCES ${BENCHMARK_SOURCES} + PerfTestHexGrad.cpp + PerfTest_MallocFree.cpp + PerfTest_ViewAllocate.cpp + PerfTest_ViewCopy_a123.cpp + PerfTest_ViewCopy_b123.cpp + PerfTest_ViewCopy_c123.cpp + PerfTest_ViewCopy_d123.cpp + PerfTest_ViewCopy_a45.cpp + PerfTest_ViewCopy_b45.cpp + PerfTest_ViewCopy_c45.cpp + PerfTest_ViewCopy_d45.cpp + PerfTest_ViewCopy_a6.cpp + PerfTest_ViewCopy_b6.cpp + PerfTest_ViewCopy_c6.cpp + PerfTest_ViewCopy_d6.cpp + PerfTest_ViewCopy_a7.cpp + PerfTest_ViewCopy_b7.cpp + PerfTest_ViewCopy_c7.cpp + PerfTest_ViewCopy_d7.cpp + PerfTest_ViewCopy_a8.cpp + PerfTest_ViewCopy_b8.cpp + PerfTest_ViewCopy_c8.cpp + PerfTest_ViewCopy_d8.cpp + PerfTest_ViewCopy_Raw.cpp + PerfTest_ViewFill_123.cpp + PerfTest_ViewFill_45.cpp + PerfTest_ViewFill_6.cpp + PerfTest_ViewFill_7.cpp + PerfTest_ViewFill_8.cpp + PerfTest_ViewFill_Raw.cpp + PerfTest_ViewResize_123.cpp + PerfTest_ViewResize_45.cpp + PerfTest_ViewResize_6.cpp + PerfTest_ViewResize_7.cpp + PerfTest_ViewResize_8.cpp + PerfTest_ViewResize_Raw.cpp ) -IF(NOT KOKKOS_ENABLE_CUDA OR KOKKOS_ENABLE_CUDA_LAMBDA) - KOKKOS_ADD_BENCHMARK( - Benchmark_Atomic_MinMax - SOURCES test_atomic_minmax_simple.cpp +if(Kokkos_ENABLE_OPENMPTARGET) + # FIXME OPENMPTARGET requires TeamPolicy Reductions and Custom Reduction + list(REMOVE_ITEM BENCHMARK_SOURCES PerfTestGramSchmidt.cpp PerfTest_CustomReduction.cpp + PerfTest_ExecSpacePartitioning.cpp ) -ENDIF() +endif() + +kokkos_add_benchmark(PerformanceTest_Benchmark SOURCES ${BENCHMARK_SOURCES}) + +kokkos_add_benchmark(Benchmark_Atomic_MinMax SOURCES test_atomic_minmax_simple.cpp) # FIXME_NVHPC -IF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) - KOKKOS_ADD_BENCHMARK( - PerformanceTest_Mempool - SOURCES test_mempool.cpp - ) -ENDIF() +if(NOT KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) + kokkos_add_benchmark(PerformanceTest_Mempool SOURCES test_mempool.cpp) +endif() -KOKKOS_ADD_BENCHMARK( - PerformanceTest_Atomic - SOURCES test_atomic.cpp -) +kokkos_add_benchmark(PerformanceTest_Atomic SOURCES test_atomic.cpp) diff --git a/packages/kokkos/core/perf_test/PerfTestHexGrad.cpp b/packages/kokkos/core/perf_test/PerfTestHexGrad.cpp index 98cb246c71e1..1ebe750f2164 100644 --- a/packages/kokkos/core/perf_test/PerfTestHexGrad.cpp +++ b/packages/kokkos/core/perf_test/PerfTestHexGrad.cpp @@ -34,10 +34,10 @@ struct HexGrad { enum { NSpace = 3, NNode = 8 }; using elem_coord_type = - Kokkos::View; + Kokkos::View; using elem_grad_type = - Kokkos::View; + Kokkos::View; elem_coord_type coords; elem_grad_type grad_op; diff --git a/packages/kokkos/core/perf_test/PerfTest_CustomReduction.cpp b/packages/kokkos/core/perf_test/PerfTest_CustomReduction.cpp index 2110f38a916f..03340a5d6de4 100644 --- a/packages/kokkos/core/perf_test/PerfTest_CustomReduction.cpp +++ b/packages/kokkos/core/perf_test/PerfTest_CustomReduction.cpp @@ -21,7 +21,6 @@ #include #include -#ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA namespace Test { template std::pair custom_reduction_test(int N, int R) { @@ -130,4 +129,3 @@ BENCHMARK(CustomReduction) ->UseManualTime(); } // namespace Test -#endif diff --git a/packages/kokkos/core/perf_test/PerfTest_ExecSpacePartitioning.cpp b/packages/kokkos/core/perf_test/PerfTest_ExecSpacePartitioning.cpp index d2a3d0b823a2..aa23ddbb6072 100644 --- a/packages/kokkos/core/perf_test/PerfTest_ExecSpacePartitioning.cpp +++ b/packages/kokkos/core/perf_test/PerfTest_ExecSpacePartitioning.cpp @@ -56,8 +56,7 @@ bool is_overlapping(const Kokkos::HIP&) { #if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) template <> -bool is_overlapping( - const Kokkos::Experimental::SYCL&) { +bool is_overlapping(const Kokkos::SYCL&) { return true; } #endif diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewCopy_Raw.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_Raw.cpp index 67a8d7e55545..e4db40e128c3 100644 --- a/packages/kokkos/core/perf_test/PerfTest_ViewCopy_Raw.cpp +++ b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_Raw.cpp @@ -18,7 +18,6 @@ namespace Test { -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) BENCHMARK(ViewDeepCopy_Raw) ->ArgName("N") ->Arg(10) @@ -38,6 +37,5 @@ BENCHMARK(ViewDeepCopy_Raw) ->ArgName("N") ->Arg(10) ->UseManualTime(); -#endif } // namespace Test diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewFill_Raw.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewFill_Raw.cpp index c11074d9154f..57bba83a9c1e 100644 --- a/packages/kokkos/core/perf_test/PerfTest_ViewFill_Raw.cpp +++ b/packages/kokkos/core/perf_test/PerfTest_ViewFill_Raw.cpp @@ -18,7 +18,6 @@ namespace Test { -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) BENCHMARK(ViewFill_Raw) ->ArgName("N") ->Arg(N) @@ -28,6 +27,5 @@ BENCHMARK(ViewFill_Raw) ->ArgName("N") ->Arg(N) ->UseManualTime(); -#endif } // namespace Test diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewResize_Raw.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewResize_Raw.cpp index 2d1bcbb3cab5..ab469cb647ca 100644 --- a/packages/kokkos/core/perf_test/PerfTest_ViewResize_Raw.cpp +++ b/packages/kokkos/core/perf_test/PerfTest_ViewResize_Raw.cpp @@ -18,7 +18,6 @@ namespace Test { -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) BENCHMARK(ViewResize_NoInit_Raw) ->ArgName("N") ->Arg(N) @@ -30,6 +29,5 @@ BENCHMARK(ViewResize_NoInit_Raw) ->Arg(N) ->UseManualTime() ->Iterations(R); -#endif } // namespace Test diff --git a/packages/kokkos/core/perf_test/test_mempool.cpp b/packages/kokkos/core/perf_test/test_mempool.cpp index 9905740afb4d..bdfe59b0b3bc 100644 --- a/packages/kokkos/core/perf_test/test_mempool.cpp +++ b/packages/kokkos/core/perf_test/test_mempool.cpp @@ -198,7 +198,7 @@ static void Mempool_Fill(benchmark::State& state) { int fill_level = get_parameter("--fill_level=", state.range(4)); int repeat_inner = get_parameter("--repeat_inner=", state.range(5)); int number_alloc = get_number_alloc(chunk_span, min_superblock_size, - total_alloc_size, fill_level); + total_alloc_size, fill_level); for (auto _ : state) { TestFunctor functor(total_alloc_size, min_superblock_size, number_alloc, @@ -225,7 +225,7 @@ static void Mempool_Alloc_Dealloc(benchmark::State& state) { int fill_level = get_parameter("--fill_level=", state.range(4)); int repeat_inner = get_parameter("--repeat_inner=", state.range(5)); int number_alloc = get_number_alloc(chunk_span, min_superblock_size, - total_alloc_size, fill_level); + total_alloc_size, fill_level); for (auto _ : state) { TestFunctor functor(total_alloc_size, min_superblock_size, number_alloc, diff --git a/packages/kokkos/core/perf_test/test_sharedSpace.cpp b/packages/kokkos/core/perf_test/test_sharedSpace.cpp index 4f140c9409ad..3c06770e2861 100644 --- a/packages/kokkos/core/perf_test/test_sharedSpace.cpp +++ b/packages/kokkos/core/perf_test/test_sharedSpace.cpp @@ -103,7 +103,7 @@ size_t getDeviceMemorySize() { #elif defined KOKKOS_ENABLE_HIP return Kokkos::HIP{}.hip_device_prop().totalGlobalMem; #elif defined KOKKOS_ENABLE_SYCL - auto device = Kokkos::Experimental::SYCL{}.sycl_queue().get_device(); + auto device = Kokkos::SYCL{}.sycl_queue().get_device(); return device.get_info(); #else #error \ diff --git a/packages/kokkos/core/perf_test/test_taskdag.cpp b/packages/kokkos/core/perf_test/test_taskdag.cpp index fccaab64ddf1..347d9748b5a9 100644 --- a/packages/kokkos/core/perf_test/test_taskdag.cpp +++ b/packages/kokkos/core/perf_test/test_taskdag.cpp @@ -32,6 +32,11 @@ int main() { return 0; } #include +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +// We allow using deprecated classes in this file +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() +#endif + using ExecSpace = Kokkos::DefaultExecutionSpace; inline long eval_fib(long n) { @@ -223,4 +228,8 @@ int main(int argc, char* argv[]) { return 0; } +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() +#endif + #endif diff --git a/packages/kokkos/core/src/CMakeLists.txt b/packages/kokkos/core/src/CMakeLists.txt index b84677e61b6f..72663739a142 100644 --- a/packages/kokkos/core/src/CMakeLists.txt +++ b/packages/kokkos/core/src/CMakeLists.txt @@ -1,118 +1,125 @@ -KOKKOS_INCLUDE_DIRECTORIES( - ${CMAKE_CURRENT_BINARY_DIR} - ${CMAKE_CURRENT_SOURCE_DIR} - ${KOKKOS_TOP_BUILD_DIR} -) -IF (NOT desul_FOUND) - IF(KOKKOS_ENABLE_CUDA) - SET(DESUL_ATOMICS_ENABLE_CUDA ON) - ENDIF() - IF(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE) - SET(DESUL_ATOMICS_ENABLE_CUDA_SEPARABLE_COMPILATION ON) - ENDIF() - IF(KOKKOS_ENABLE_HIP) - SET(DESUL_ATOMICS_ENABLE_HIP ON) - ENDIF() - IF(KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE) - SET(DESUL_ATOMICS_ENABLE_HIP_SEPARABLE_COMPILATION ON) - ENDIF() - IF(KOKKOS_ENABLE_SYCL) - SET(DESUL_ATOMICS_ENABLE_SYCL ON) - IF(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED AND NOT KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_DEVICE_GLOBAL) - SET(DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION ON) - ENDIF() - ENDIF() - IF(KOKKOS_ENABLE_OPENMPTARGET) - SET(DESUL_ATOMICS_ENABLE_OPENMP ON) # not a typo Kokkos OpenMPTarget -> Desul OpenMP - ENDIF() - IF(KOKKOS_ENABLE_OPENACC) - SET(DESUL_ATOMICS_ENABLE_OPENACC ON) - ENDIF() - CONFIGURE_FILE( - ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/Config.hpp.cmake.in - ${CMAKE_CURRENT_BINARY_DIR}/desul/atomics/Config.hpp - ) - KOKKOS_INCLUDE_DIRECTORIES( - ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/include +kokkos_include_directories(${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR} ${KOKKOS_TOP_BUILD_DIR}) +if(NOT desul_FOUND) + if(KOKKOS_ENABLE_CUDA) + set(DESUL_ATOMICS_ENABLE_CUDA ON) + endif() + if(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE) + set(DESUL_ATOMICS_ENABLE_CUDA_SEPARABLE_COMPILATION ON) + endif() + if(KOKKOS_ENABLE_HIP) + set(DESUL_ATOMICS_ENABLE_HIP ON) + endif() + if(KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE) + set(DESUL_ATOMICS_ENABLE_HIP_SEPARABLE_COMPILATION ON) + endif() + if(KOKKOS_ENABLE_SYCL) + set(DESUL_ATOMICS_ENABLE_SYCL ON) + if(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED AND NOT KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_DEVICE_GLOBAL) + set(DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION ON) + endif() + endif() + if(KOKKOS_ENABLE_OPENMPTARGET) + set(DESUL_ATOMICS_ENABLE_OPENMP ON) # not a typo Kokkos OpenMPTarget -> Desul OpenMP + endif() + if(KOKKOS_ENABLE_OPENACC) + # FIXME_OPENACC FIXME_CLACC - Below condition will be removed if Clacc can compile atomics. + if(KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) + set(DESUL_ATOMICS_ENABLE_OPENACC ON) + endif() + endif() + configure_file( + ${KOKKOS_SOURCE_DIR}/tpls/desul/Config.hpp.cmake.in ${CMAKE_CURRENT_BINARY_DIR}/desul/atomics/Config.hpp ) -ENDIF() + kokkos_include_directories(${KOKKOS_SOURCE_DIR}/tpls/desul/include) +endif() -INSTALL (DIRECTORY - "${CMAKE_CURRENT_SOURCE_DIR}/" +install( + DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/" DESTINATION ${KOKKOS_HEADER_DIR} FILES_MATCHING PATTERN "*.hpp" PATTERN "*.h" ) -SET(KOKKOS_CORE_SRCS) -APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/impl/*.cpp) -SET(KOKKOS_CORE_HEADERS) -APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/*.hpp) -APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/impl/*.hpp) - -IF (KOKKOS_ENABLE_CUDA) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/Cuda/*.cpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/Cuda/*.hpp) -ENDIF() - -IF (KOKKOS_ENABLE_OPENMP) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/OpenMP/*.cpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/OpenMP/*.hpp) -ENDIF() - -IF (KOKKOS_ENABLE_OPENMPTARGET) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/OpenMPTarget/*.cpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/OpenMPTarget/*.hpp) -ENDIF() - -IF (KOKKOS_ENABLE_OPENACC) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/OpenACC/*.cpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/OpenACC/*.hpp) -ENDIF() - -IF (KOKKOS_ENABLE_THREADS) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/Threads/*.cpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/Threads/*.hpp) -ENDIF() - -IF (KOKKOS_ENABLE_HIP) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/HIP/*.cpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/HIP/*.hpp) -ENDIF() - -IF (KOKKOS_ENABLE_HPX) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/HPX/*.cpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/HPX/*.hpp) -ENDIF() - -IF (KOKKOS_ENABLE_SERIAL) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/Serial/*.cpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/Serial/*.hpp) -ENDIF() - -IF (KOKKOS_ENABLE_SYCL) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/SYCL/*.cpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/SYCL/*.hpp) -ENDIF() - -IF (NOT desul_FOUND) - IF (KOKKOS_ENABLE_CUDA) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/src/Lock_Array_CUDA.cpp) - ELSEIF (KOKKOS_ENABLE_HIP) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/src/Lock_Array_HIP.cpp) - ELSEIF (KOKKOS_ENABLE_SYCL) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/src/Lock_Array_SYCL.cpp) - ENDIF() - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/include/desul/*.hpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/include/desul/*/*.hpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/include/desul/*/*/*.hpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/include/*/*/*.inc*) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_BINARY_DIR}/desul/*.hpp) - - INSTALL (DIRECTORY - "${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/include/desul" - "${CMAKE_CURRENT_BINARY_DIR}/desul" +set(KOKKOS_CORE_SRCS) +append_glob(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/impl/*.cpp) +set(KOKKOS_CORE_HEADERS) +append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/*.hpp) +append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/impl/*.hpp) + +if(KOKKOS_ENABLE_CUDA) + append_glob(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/Cuda/*.cpp) + if(NOT Kokkos_ENABLE_DEPRECATED_CODE_4) + list(REMOVE_ITEM KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/Cuda/Kokkos_Cuda_Task.cpp) + endif() + append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/Cuda/*.hpp) +endif() + +if(KOKKOS_ENABLE_OPENMP) + append_glob(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/OpenMP/*.cpp) + if(NOT Kokkos_ENABLE_DEPRECATED_CODE_4) + list(REMOVE_ITEM KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/OpenMP/Kokkos_OpenMP_Task.cpp) + endif() + append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/OpenMP/*.hpp) +endif() + +if(KOKKOS_ENABLE_OPENMPTARGET) + append_glob(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/OpenMPTarget/*.cpp) + append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/OpenMPTarget/*.hpp) +endif() + +if(KOKKOS_ENABLE_OPENACC) + append_glob(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/OpenACC/*.cpp) + append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/OpenACC/*.hpp) +endif() + +if(KOKKOS_ENABLE_THREADS) + append_glob(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/Threads/*.cpp) + append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/Threads/*.hpp) +endif() + +if(KOKKOS_ENABLE_HIP) + append_glob(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/HIP/*.cpp) + append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/HIP/*.hpp) +endif() + +if(KOKKOS_ENABLE_HPX) + append_glob(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/HPX/*.cpp) + if(NOT Kokkos_ENABLE_DEPRECATED_CODE_4) + list(REMOVE_ITEM KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/HPX/Kokkos_HPX_Task.cpp) + endif() + append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/HPX/*.hpp) +endif() + +if(KOKKOS_ENABLE_SERIAL) + append_glob(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/Serial/*.cpp) + if(NOT Kokkos_ENABLE_DEPRECATED_CODE_4) + list(REMOVE_ITEM KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/Serial/Kokkos_Serial_Task.cpp) + endif() + append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/Serial/*.hpp) +endif() + +if(KOKKOS_ENABLE_SYCL) + append_glob(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/SYCL/*.cpp) + append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/SYCL/*.hpp) +endif() + +if(NOT desul_FOUND) + if(KOKKOS_ENABLE_CUDA) + append_glob(KOKKOS_CORE_SRCS ${KOKKOS_SOURCE_DIR}/tpls/desul/src/Lock_Array_CUDA.cpp) + elseif(KOKKOS_ENABLE_HIP) + append_glob(KOKKOS_CORE_SRCS ${KOKKOS_SOURCE_DIR}/tpls/desul/src/Lock_Array_HIP.cpp) + elseif(KOKKOS_ENABLE_SYCL) + append_glob(KOKKOS_CORE_SRCS ${KOKKOS_SOURCE_DIR}/tpls/desul/src/Lock_Array_SYCL.cpp) + endif() + append_glob(KOKKOS_CORE_HEADERS ${KOKKOS_SOURCE_DIR}/tpls/desul/include/desul/*.hpp) + append_glob(KOKKOS_CORE_HEADERS ${KOKKOS_SOURCE_DIR}/tpls/desul/include/desul/*/*.hpp) + append_glob(KOKKOS_CORE_HEADERS ${KOKKOS_SOURCE_DIR}/tpls/desul/include/desul/*/*/*.hpp) + append_glob(KOKKOS_CORE_HEADERS ${KOKKOS_SOURCE_DIR}/tpls/desul/include/*/*/*.inc*) + append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_BINARY_DIR}/desul/*.hpp) + + install( + DIRECTORY "${KOKKOS_SOURCE_DIR}/tpls/desul/include/desul" "${CMAKE_CURRENT_BINARY_DIR}/desul" DESTINATION ${KOKKOS_HEADER_DIR} FILES_MATCHING PATTERN "*.inc" @@ -120,33 +127,26 @@ IF (NOT desul_FOUND) PATTERN "*.hpp" ) - MESSAGE(STATUS "Using internal desul_atomics copy") -ELSE() - MESSAGE(STATUS "Using external desul_atomics install found at:") - MESSAGE(STATUS " " ${desul_DIR}) -ENDIF() - + message(STATUS "Using internal desul_atomics copy") +else() + message(STATUS "Using external desul_atomics install found at:") + message(STATUS " " ${desul_DIR}) +endif() -KOKKOS_ADD_LIBRARY( - kokkoscore - SOURCES ${KOKKOS_CORE_SRCS} - HEADERS ${KOKKOS_CORE_HEADERS} +kokkos_add_library( + kokkoscore SOURCES ${KOKKOS_CORE_SRCS} HEADERS ${KOKKOS_CORE_HEADERS} ADD_BUILD_OPTIONS # core should be given all the necessary compiler/linker flags ) -KOKKOS_LIB_INCLUDE_DIRECTORIES(kokkoscore - ${KOKKOS_TOP_BUILD_DIR} - ${CMAKE_CURRENT_BINARY_DIR} - ${CMAKE_CURRENT_SOURCE_DIR} +kokkos_lib_include_directories( + kokkoscore ${KOKKOS_TOP_BUILD_DIR} ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR} ) -IF (NOT desul_FOUND) - KOKKOS_LIB_INCLUDE_DIRECTORIES(kokkoscore - ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/include - ) -ENDIF() +if(NOT desul_FOUND) + kokkos_lib_include_directories(kokkoscore ${KOKKOS_SOURCE_DIR}/tpls/desul/include) +endif() -IF (Kokkos_ENABLE_IMPL_MDSPAN) - MESSAGE(STATUS "Experimental mdspan support is enabled") +if(Kokkos_ENABLE_IMPL_MDSPAN) + message(STATUS "Experimental mdspan support is enabled") # Some compilers now include mdspan... we just flag on their version # for now until we can get some compiler detection support @@ -154,62 +154,56 @@ IF (Kokkos_ENABLE_IMPL_MDSPAN) check_include_file_cxx(experimental/mdspan KOKKOS_COMPILER_SUPPORTS_EXPERIMENTAL_MDSPAN) check_include_file_cxx(mdspan KOKKOS_COMPILER_SUPPORTS_MDSPAN) - if (Kokkos_ENABLE_MDSPAN_EXTERNAL) - MESSAGE(STATUS "Using external mdspan") + if(Kokkos_ENABLE_MDSPAN_EXTERNAL) + message(STATUS "Using external mdspan") target_link_libraries(kokkoscore PUBLIC std::mdspan) elseif(KOKKOS_COMPILER_SUPPORTS_MDSPAN AND NOT Kokkos_ENABLE_IMPL_SKIP_COMPILER_MDSPAN) message(STATUS "Using compiler-supplied mdspan") elseif(KOKKOS_COMPILER_SUPPORTS_EXPERIMENTAL_MDSPAN AND NOT Kokkos_ENABLE_IMPL_SKIP_COMPILER_MDSPAN) message(STATUS "Using compiler-supplied experimental/mdspan") else() - KOKKOS_LIB_INCLUDE_DIRECTORIES( - kokkoscore - ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/mdspan/include - ) + kokkos_lib_include_directories(kokkoscore ${KOKKOS_SOURCE_DIR}/tpls/mdspan/include) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/mdspan/include/experimental/__p0009_bits/*.hpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/mdspan/include/experimental/mdspan) + append_glob(KOKKOS_CORE_HEADERS ${KOKKOS_SOURCE_DIR}/tpls/mdspan/include/experimental/__p0009_bits/*.hpp) + append_glob(KOKKOS_CORE_HEADERS ${KOKKOS_SOURCE_DIR}/tpls/mdspan/include/experimental/mdspan) - INSTALL (DIRECTORY - "${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/mdspan/include/" + install( + DIRECTORY "${KOKKOS_SOURCE_DIR}/tpls/mdspan/include/" DESTINATION ${KOKKOS_HEADER_DIR} FILES_MATCHING PATTERN "mdspan" PATTERN "*.hpp" ) - MESSAGE(STATUS "Using internal mdspan directory ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/mdspan/include") + message(STATUS "Using internal mdspan directory ${KOKKOS_SOURCE_DIR}/tpls/mdspan/include") endif() -ENDIF() +endif() -KOKKOS_LINK_TPL(kokkoscore PUBLIC HWLOC) -KOKKOS_LINK_TPL(kokkoscore PUBLIC CUDA) -KOKKOS_LINK_TPL(kokkoscore PUBLIC HPX) -KOKKOS_LINK_TPL(kokkoscore PUBLIC LIBDL) +kokkos_link_tpl(kokkoscore PUBLIC HWLOC) +kokkos_link_tpl(kokkoscore PUBLIC CUDA) +kokkos_link_tpl(kokkoscore PUBLIC HPX) +kokkos_link_tpl(kokkoscore PUBLIC LIBDL) # On *nix-like systems (Linux, macOS) we need pthread for C++ std::thread -IF (NOT WIN32) - KOKKOS_LINK_TPL(kokkoscore PUBLIC THREADS) -ENDIF() -IF (NOT KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) - KOKKOS_LINK_TPL(kokkoscore PUBLIC ROCM) -ENDIF() +if(NOT WIN32) + kokkos_link_tpl(kokkoscore PUBLIC THREADS) +endif() +if(NOT KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) + kokkos_link_tpl(kokkoscore PUBLIC ROCM) +endif() # FIXME: We need a proper solution to figure out whether to enable # libatomic # Most compilers only require libatomic for 128-bit CAS # I (CT) had removed 128bit CAS from desul to not need libatomic. -IF (KOKKOS_ENABLE_OPENMPTARGET) +if(KOKKOS_ENABLE_OPENMPTARGET) target_link_libraries(kokkoscore PUBLIC atomic) -ENDIF() +endif() -IF (desul_FOUND) +if(desul_FOUND) target_link_libraries(kokkoscore PUBLIC desul_atomics) -ENDIF() +endif() -# FIXME_TRILINOS Trilinos doesn't allow for Kokkos to use find_dependency so we -# just append the flags in cmake/kokkos_tpls.cmake instead of linking with the -# OpenMP target. -IF(Kokkos_ENABLE_OPENMP AND NOT KOKKOS_HAS_TRILINOS) +if(Kokkos_ENABLE_OPENMP) target_link_libraries(kokkoscore PUBLIC OpenMP::OpenMP_CXX) -ENDIF() +endif() -KOKKOS_LINK_TPL(kokkoscore PUBLIC LIBQUADMATH) +kokkos_link_tpl(kokkoscore PUBLIC LIBQUADMATH) diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda.hpp index fd86976d3ba6..07c35e6611f1 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda.hpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda.hpp @@ -35,7 +35,6 @@ static_assert(false, #include // CUDA_SAFE_CALL #include -#include #include #include #include diff --git a/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp b/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp index 6ae24022c8fd..8bcd6525c962 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp @@ -201,7 +201,14 @@ void *impl_allocate_common(const int device_id, } } #elif (defined(KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC) && CUDART_VERSION >= 11020) - if (arg_alloc_size >= memory_threshold_g) { + // FIXME_KEPLER Everything after Kepler should support cudaMallocAsync + int device_supports_cuda_malloc_async; + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaDeviceGetAttribute(&device_supports_cuda_malloc_async, + cudaDevAttrMemoryPoolsSupported, device_id)); + + if (arg_alloc_size >= memory_threshold_g && + device_supports_cuda_malloc_async == 1) { error_code = cudaMallocAsync(&ptr, arg_alloc_size, stream); if (error_code == cudaSuccess) { diff --git a/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.hpp b/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.hpp index e1d062d72d5a..1ccf38a4a158 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.hpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.hpp @@ -73,9 +73,9 @@ class CudaSpace { CudaSpace(int device_id, cudaStream_t stream); public: - CudaSpace(CudaSpace&& rhs) = default; - CudaSpace(const CudaSpace& rhs) = default; - CudaSpace& operator=(CudaSpace&& rhs) = default; + CudaSpace(CudaSpace&& rhs) = default; + CudaSpace(const CudaSpace& rhs) = default; + CudaSpace& operator=(CudaSpace&& rhs) = default; CudaSpace& operator=(const CudaSpace& rhs) = default; ~CudaSpace() = default; @@ -174,9 +174,9 @@ class CudaUVMSpace { CudaUVMSpace(int device_id, cudaStream_t stream); public: - CudaUVMSpace(CudaUVMSpace&& rhs) = default; - CudaUVMSpace(const CudaUVMSpace& rhs) = default; - CudaUVMSpace& operator=(CudaUVMSpace&& rhs) = default; + CudaUVMSpace(CudaUVMSpace&& rhs) = default; + CudaUVMSpace(const CudaUVMSpace& rhs) = default; + CudaUVMSpace& operator=(CudaUVMSpace&& rhs) = default; CudaUVMSpace& operator=(const CudaUVMSpace& rhs) = default; ~CudaUVMSpace() = default; @@ -266,9 +266,9 @@ class CudaHostPinnedSpace { CudaHostPinnedSpace(int device_id, cudaStream_t stream); public: - CudaHostPinnedSpace(CudaHostPinnedSpace&& rhs) = default; - CudaHostPinnedSpace(const CudaHostPinnedSpace& rhs) = default; - CudaHostPinnedSpace& operator=(CudaHostPinnedSpace&& rhs) = default; + CudaHostPinnedSpace(CudaHostPinnedSpace&& rhs) = default; + CudaHostPinnedSpace(const CudaHostPinnedSpace& rhs) = default; + CudaHostPinnedSpace& operator=(CudaHostPinnedSpace&& rhs) = default; CudaHostPinnedSpace& operator=(const CudaHostPinnedSpace& rhs) = default; ~CudaHostPinnedSpace() = default; diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp index 5a821ab64a3c..058b1f538d56 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp @@ -51,7 +51,8 @@ class GraphNodeKernelImpl m_graph_node_ptr = nullptr; // Basically, we have to make this mutable for the same reasons that the // global kernel buffers in the Cuda instance are mutable... - mutable Kokkos::OwningRawPtr m_driver_storage = nullptr; + mutable std::shared_ptr m_driver_storage = nullptr; + std::string label; public: using Policy = PolicyType; @@ -61,25 +62,20 @@ class GraphNodeKernelImpl - GraphNodeKernelImpl(std::string, Kokkos::Cuda const&, Functor arg_functor, + GraphNodeKernelImpl(std::string label_, Cuda const&, Functor arg_functor, PolicyDeduced&& arg_policy, ArgsDeduced&&... args) // This is super ugly, but it works for now and is the most minimal change // to the codebase for now... - : base_t(std::move(arg_functor), (PolicyDeduced &&) arg_policy, - (ArgsDeduced &&) args...) {} + : base_t(std::move(arg_functor), (PolicyDeduced&&)arg_policy, + (ArgsDeduced&&)args...), + label(std::move(label_)) {} // FIXME @graph Forward through the instance once that works in the backends template GraphNodeKernelImpl(Kokkos::Cuda const& ex, Functor arg_functor, PolicyDeduced&& arg_policy) - : GraphNodeKernelImpl("", ex, std::move(arg_functor), - (PolicyDeduced &&) arg_policy) {} - - ~GraphNodeKernelImpl() { - if (m_driver_storage) { - Kokkos::CudaSpace().deallocate(m_driver_storage, sizeof(base_t)); - } - } + : GraphNodeKernelImpl("[unlabeled]", ex, std::move(arg_functor), + (PolicyDeduced&&)arg_policy) {} void set_cuda_graph_ptr(cudaGraph_t* arg_graph_ptr) { m_graph_ptr = arg_graph_ptr; @@ -90,13 +86,21 @@ class GraphNodeKernelImpl allocate_driver_memory_buffer() const { + Kokkos::ObservingRawPtr allocate_driver_memory_buffer( + const CudaSpace& mem) const { KOKKOS_EXPECTS(m_driver_storage == nullptr) - m_driver_storage = static_cast(Kokkos::CudaSpace().allocate( - "GraphNodeKernel global memory functor storage", sizeof(base_t))); + std::string alloc_label = + label + " - GraphNodeKernel global memory functor storage"; + m_driver_storage = std::shared_ptr( + static_cast(mem.allocate(alloc_label.c_str(), sizeof(base_t))), + [alloc_label, mem](base_t* ptr) { + mem.deallocate(alloc_label.c_str(), ptr, sizeof(base_t)); + }); KOKKOS_ENSURES(m_driver_storage != nullptr) - return m_driver_storage; + return m_driver_storage.get(); } + + auto get_driver_storage() const { return m_driver_storage; } }; struct CudaGraphNodeAggregateKernel { @@ -128,7 +132,8 @@ struct get_graph_node_kernel_type // {{{1 template -auto* allocate_driver_storage_for_kernel(KernelType const& kernel) { +auto* allocate_driver_storage_for_kernel(const CudaSpace& mem, + KernelType const& kernel) { using graph_node_kernel_t = typename get_graph_node_kernel_type::type; auto const& kernel_as_graph_kernel = @@ -136,7 +141,7 @@ auto* allocate_driver_storage_for_kernel(KernelType const& kernel) { // TODO @graphs we need to somehow indicate the need for a fence in the // destructor of the GraphImpl object (so that we don't have to // just always do it) - return kernel_as_graph_kernel.allocate_driver_memory_buffer(); + return kernel_as_graph_kernel.allocate_driver_memory_buffer(mem); } template diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp index 625d8c317a1c..8e800e756d2b 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp @@ -51,7 +51,14 @@ struct GraphImpl { using node_details_t = GraphNodeBackendSpecificDetails; - void _instantiate_graph() { + // Store drivers for the kernel nodes that launch in global memory. + // This is required as lifetime of drivers must be bounded to this instance's + // lifetime. + std::vector> m_driver_storage; + + public: + void instantiate() { + KOKKOS_EXPECTS(!m_graph_exec); constexpr size_t error_log_size = 256; cudaGraphNode_t error_node = nullptr; char error_log[error_log_size]; @@ -60,10 +67,10 @@ struct GraphImpl { ->cuda_graph_instantiate_wrapper(&m_graph_exec, m_graph, &error_node, error_log, error_log_size))); + KOKKOS_ENSURES(m_graph_exec); // TODO @graphs print out errors } - public: using root_node_impl_t = GraphNodeImpl; @@ -74,11 +81,11 @@ struct GraphImpl { // Not movable or copyable; it spends its whole life as a shared_ptr in the // Graph object - GraphImpl() = delete; - GraphImpl(GraphImpl const&) = delete; - GraphImpl(GraphImpl&&) = delete; + GraphImpl() = delete; + GraphImpl(GraphImpl const&) = delete; + GraphImpl(GraphImpl&&) = delete; GraphImpl& operator=(GraphImpl const&) = delete; - GraphImpl& operator=(GraphImpl&&) = delete; + GraphImpl& operator=(GraphImpl&&) = delete; ~GraphImpl() { // TODO @graphs we need to somehow indicate the need for a fence in the // destructor of the GraphImpl object (so that we don't have to @@ -129,6 +136,8 @@ struct GraphImpl { kernel.set_cuda_graph_node_ptr(&cuda_node); kernel.execute(); KOKKOS_ENSURES(bool(cuda_node)); + if (std::shared_ptr tmp = kernel.get_driver_storage()) + m_driver_storage.push_back(std::move(tmp)); } template @@ -158,13 +167,13 @@ struct GraphImpl { &cuda_node, 1))); } - void submit() { + void submit(const execution_space& exec) { if (!bool(m_graph_exec)) { - _instantiate_graph(); + instantiate(); } KOKKOS_IMPL_CUDA_SAFE_CALL( - (m_execution_space.impl_internal_space_instance() - ->cuda_graph_launch_wrapper(m_graph_exec))); + (exec.impl_internal_space_instance()->cuda_graph_launch_wrapper( + m_graph_exec))); } execution_space const& get_execution_space() const noexcept { @@ -197,6 +206,9 @@ struct GraphImpl { m_execution_space, _graph_node_kernel_ctor_tag{}, aggregate_kernel_impl_t{}); } + + cudaGraph_t cuda_graph() { return m_graph; } + cudaGraphExec_t cuda_graph_exec() { return m_graph_exec; } }; } // end namespace Impl diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp index 158c8acdda6b..ec5768a7f0f6 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp @@ -26,10 +26,10 @@ #include -//#include -//#include -//#include -//#include +// #include +// #include +// #include +// #include #include #include #include @@ -687,16 +687,6 @@ void Cuda::print_configuration(std::ostream &os, bool /*verbose*/) const { os << " KOKKOS_ENABLE_CUDA: yes\n"; os << "Cuda Options:\n"; - os << " KOKKOS_ENABLE_CUDA_LAMBDA: "; -#ifdef KOKKOS_ENABLE_CUDA_LAMBDA - os << "yes\n"; -#else - os << "no\n"; -#endif -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - os << " KOKKOS_ENABLE_CUDA_LDG_INTRINSIC: "; - os << "yes\n"; -#endif os << " KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE: "; #ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE os << "yes\n"; @@ -708,12 +698,6 @@ void Cuda::print_configuration(std::ostream &os, bool /*verbose*/) const { os << "yes\n"; #else os << "no\n"; -#endif - os << " KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA: "; -#ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA - os << "yes\n"; -#else - os << "no\n"; #endif os << " KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC: "; #ifdef KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp index b0dadb45f72b..2d00e735cb9d 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp @@ -209,8 +209,8 @@ inline void configure_shmem_preference(const int cuda_device, // Use multiples of 8kB const size_t max_shmem_per_sm = device_props.sharedMemPerMultiprocessor; size_t carveout = shmem_per_block == 0 - ? 0 - : 100 * + ? 0 + : 100 * (((num_blocks_desired * shmem_per_block + min_shmem_size_per_sm - 1) / min_shmem_size_per_sm) * @@ -491,7 +491,10 @@ struct CudaParallelLaunchKernelInvoker< cuda_instance->m_deviceProp, block_size, shmem, desired_occupancy); } - auto* driver_ptr = Impl::allocate_driver_storage_for_kernel(driver); + auto* driver_ptr = Impl::allocate_driver_storage_for_kernel( + CudaSpace::impl_create(cuda_instance->m_cudaDev, + cuda_instance->m_stream), + driver); // Unlike in the non-graph case, we can get away with doing an async copy // here because the `DriverType` instance is held in the GraphNodeImpl @@ -714,7 +717,7 @@ struct CudaParallelLaunch; template CudaParallelLaunch(Args&&... args) { - base_t::launch_kernel((Args &&) args...); + base_t::launch_kernel((Args&&)args...); } }; @@ -728,7 +731,7 @@ struct CudaParallelLaunch; template CudaParallelLaunch(Args&&... args) { - base_t::create_parallel_launch_graph_node((Args &&) args...); + base_t::create_parallel_launch_graph_node((Args&&)args...); } }; diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp index 630389840048..c50ff430345c 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp @@ -95,11 +95,39 @@ class ParallelFor, Kokkos::Cuda> { inline void execute() const { if (m_rp.m_num_tiles == 0) return; - const auto maxblocks = m_rp.space().cuda_device_prop().maxGridSize; + const auto maxblocks = m_rp.space().cuda_device_prop().maxGridSize; + const auto maxthreads = m_rp.space().cuda_device_prop().maxThreadsDim; + [[maybe_unused]] const auto maxThreadsPerBlock = + m_rp.space().cuda_device_prop().maxThreadsPerBlock; + // make sure the Z dimension (it is less than x,y limits) isn't exceeded + const auto clampZ = [&](const int input) { + return (input > maxthreads[2] ? maxthreads[2] : input); + }; + // make sure the block dimensions don't exceed the max number of threads + // allowed + const auto check_block_sizes = [&]([[maybe_unused]] const dim3& block) { + KOKKOS_ASSERT(block.x > 0 && + block.x <= static_cast(maxthreads[0])); + KOKKOS_ASSERT(block.y > 0 && + block.y <= static_cast(maxthreads[1])); + KOKKOS_ASSERT(block.z > 0 && + block.z <= static_cast(maxthreads[2])); + KOKKOS_ASSERT(block.x * block.y * block.z <= + static_cast(maxThreadsPerBlock)); + }; + // make sure the grid dimensions don't exceed the max number of blocks + // allowed + const auto check_grid_sizes = [&]([[maybe_unused]] const dim3& grid) { + KOKKOS_ASSERT(grid.x > 0 && + grid.x <= static_cast(maxblocks[0])); + KOKKOS_ASSERT(grid.y > 0 && + grid.y <= static_cast(maxblocks[1])); + KOKKOS_ASSERT(grid.z > 0 && + grid.z <= static_cast(maxblocks[2])); + }; if (RP::rank == 2) { const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], 1); - KOKKOS_ASSERT(block.x > 0); - KOKKOS_ASSERT(block.y > 0); + check_block_sizes(block); const dim3 grid( std::min( (m_rp.m_upper[0] - m_rp.m_lower[0] + block.x - 1) / block.x, @@ -108,13 +136,12 @@ class ParallelFor, Kokkos::Cuda> { (m_rp.m_upper[1] - m_rp.m_lower[1] + block.y - 1) / block.y, maxblocks[1]), 1); + check_grid_sizes(grid); CudaParallelLaunch( *this, grid, block, 0, m_rp.space().impl_internal_space_instance()); } else if (RP::rank == 3) { - const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], m_rp.m_tile[2]); - KOKKOS_ASSERT(block.x > 0); - KOKKOS_ASSERT(block.y > 0); - KOKKOS_ASSERT(block.z > 0); + const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], clampZ(m_rp.m_tile[2])); + check_block_sizes(block); const dim3 grid( std::min( (m_rp.m_upper[0] - m_rp.m_lower[0] + block.x - 1) / block.x, @@ -125,15 +152,16 @@ class ParallelFor, Kokkos::Cuda> { std::min( (m_rp.m_upper[2] - m_rp.m_lower[2] + block.z - 1) / block.z, maxblocks[2])); + // ensure we don't exceed the capability of the device + check_grid_sizes(grid); CudaParallelLaunch( *this, grid, block, 0, m_rp.space().impl_internal_space_instance()); } else if (RP::rank == 4) { // id0,id1 encoded within threadIdx.x; id2 to threadIdx.y; id3 to // threadIdx.z const dim3 block(m_rp.m_tile[0] * m_rp.m_tile[1], m_rp.m_tile[2], - m_rp.m_tile[3]); - KOKKOS_ASSERT(block.y > 0); - KOKKOS_ASSERT(block.z > 0); + clampZ(m_rp.m_tile[3])); + check_block_sizes(block); const dim3 grid( std::min(m_rp.m_tile_end[0] * m_rp.m_tile_end[1], maxblocks[0]), @@ -143,14 +171,15 @@ class ParallelFor, Kokkos::Cuda> { std::min( (m_rp.m_upper[3] - m_rp.m_lower[3] + block.z - 1) / block.z, maxblocks[2])); + check_grid_sizes(grid); CudaParallelLaunch( *this, grid, block, 0, m_rp.space().impl_internal_space_instance()); } else if (RP::rank == 5) { // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4 to // threadIdx.z const dim3 block(m_rp.m_tile[0] * m_rp.m_tile[1], - m_rp.m_tile[2] * m_rp.m_tile[3], m_rp.m_tile[4]); - KOKKOS_ASSERT(block.z > 0); + m_rp.m_tile[2] * m_rp.m_tile[3], clampZ(m_rp.m_tile[4])); + check_block_sizes(block); const dim3 grid( std::min(m_rp.m_tile_end[0] * m_rp.m_tile_end[1], maxblocks[0]), @@ -159,6 +188,7 @@ class ParallelFor, Kokkos::Cuda> { std::min( (m_rp.m_upper[4] - m_rp.m_lower[4] + block.z - 1) / block.z, maxblocks[2])); + check_grid_sizes(grid); CudaParallelLaunch( *this, grid, block, 0, m_rp.space().impl_internal_space_instance()); } else if (RP::rank == 6) { @@ -166,7 +196,8 @@ class ParallelFor, Kokkos::Cuda> { // threadIdx.z const dim3 block(m_rp.m_tile[0] * m_rp.m_tile[1], m_rp.m_tile[2] * m_rp.m_tile[3], - m_rp.m_tile[4] * m_rp.m_tile[5]); + clampZ(m_rp.m_tile[4] * m_rp.m_tile[5])); + check_block_sizes(block); const dim3 grid( std::min(m_rp.m_tile_end[0] * m_rp.m_tile_end[1], maxblocks[0]), @@ -174,6 +205,7 @@ class ParallelFor, Kokkos::Cuda> { maxblocks[1]), std::min(m_rp.m_tile_end[4] * m_rp.m_tile_end[5], maxblocks[2])); + check_grid_sizes(grid); CudaParallelLaunch( *this, grid, block, 0, m_rp.space().impl_internal_space_instance()); } else { diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp index 334834938a17..8251fcb248d3 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp @@ -48,7 +48,7 @@ class ParallelFor, Kokkos::Cuda> { const FunctorType m_functor; const Policy m_policy; - ParallelFor() = delete; + ParallelFor() = delete; ParallelFor& operator=(const ParallelFor&) = delete; template diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp index 71e775182106..a2955e3ab61d 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp @@ -539,9 +539,14 @@ class ParallelFor, m_vector_size(arg_policy.impl_vector_length()) { auto internal_space_instance = m_policy.space().impl_internal_space_instance(); - m_team_size = m_team_size >= 0 ? m_team_size - : arg_policy.team_size_recommended( - arg_functor, ParallelForTag()); + if (m_team_size < 0) { + m_team_size = + arg_policy.team_size_recommended(arg_functor, ParallelForTag()); + if (m_team_size <= 0) + Kokkos::Impl::throw_runtime_exception( + "Kokkos::Impl::ParallelFor could not find a " + "valid execution configuration."); + } m_shmem_begin = (sizeof(double) * (m_team_size + 2)); m_shmem_size = @@ -631,7 +636,7 @@ class ParallelReduce word_count(m_functor_reducer.get_reducer().value_size() / sizeof(word_size_type)); - reference_type value = m_functor_reducer.get_reducer().init( - kokkos_impl_cuda_shared_memory() + - threadIdx.y * word_count.value); + reference_type value = + m_functor_reducer.get_reducer().init(reinterpret_cast( + kokkos_impl_cuda_shared_memory() + + threadIdx.y * word_count.value)); // Iterate this block through the league const int int_league_size = (int)m_league_size; @@ -895,11 +901,16 @@ class ParallelReduce= 0 ? m_team_size - : arg_policy.team_size_recommended( - arg_functor_reducer.get_functor(), - arg_functor_reducer.get_reducer(), - ParallelReduceTag()); + + if (m_team_size < 0) { + m_team_size = arg_policy.team_size_recommended( + arg_functor_reducer.get_functor(), arg_functor_reducer.get_reducer(), + ParallelReduceTag()); + if (m_team_size <= 0) + Kokkos::Impl::throw_runtime_exception( + "Kokkos::Impl::ParallelReduce could not find a " + "valid execution configuration."); + } m_team_begin = UseShflReduction diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp index 86d6d91bbee1..5090e84c38cc 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp @@ -31,6 +31,9 @@ //---------------------------------------------------------------------------- +// We allow using deprecated classes in this file +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() + #if defined(__CUDA_ARCH__) #define KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN(MSG) \ { \ @@ -584,9 +587,9 @@ class TaskExec { private: enum : int { WarpSize = Kokkos::Impl::CudaTraits::WarpSize }; - TaskExec(TaskExec&&) = delete; - TaskExec(TaskExec const&) = delete; - TaskExec& operator=(TaskExec&&) = delete; + TaskExec(TaskExec&&) = delete; + TaskExec(TaskExec const&) = delete; + TaskExec& operator=(TaskExec&&) = delete; TaskExec& operator=(TaskExec const&) = delete; friend class Kokkos::Impl::TaskQueue< @@ -1224,5 +1227,7 @@ KOKKOS_INLINE_FUNCTION void single( #undef KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() + #endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */ #endif /* #ifndef KOKKOS_IMPL_CUDA_TASK_HPP */ diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp index c2b5f1fa7894..aec692c2c366 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp @@ -184,24 +184,37 @@ class CudaTeamMember { * ( 1 == blockDim.z ) */ template - KOKKOS_INLINE_FUNCTION std::enable_if_t::value> + KOKKOS_INLINE_FUNCTION std::enable_if_t> team_reduce(ReducerType const& reducer) const noexcept { team_reduce(reducer, reducer.reference()); } template - KOKKOS_INLINE_FUNCTION std::enable_if_t::value> + KOKKOS_INLINE_FUNCTION std::enable_if_t> team_reduce(ReducerType const& reducer, typename ReducerType::value_type& value) const noexcept { (void)reducer; (void)value; + + KOKKOS_IF_ON_DEVICE(( + typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, TeamPolicy, + ReducerType, typename ReducerType::value_type>::Reducer + wrapped_reducer(reducer); + + impl_team_reduce(wrapped_reducer, value); reducer.reference() = value;)) + } + + template + KOKKOS_INLINE_FUNCTION std::enable_if_t> + impl_team_reduce( + WrappedReducerType const& wrapped_reducer, + typename WrappedReducerType::value_type& value) const noexcept { + (void)wrapped_reducer; + (void)value; + KOKKOS_IF_ON_DEVICE( - (typename Impl::FunctorAnalysis< - Impl::FunctorPatternInterface::REDUCE, TeamPolicy, - ReducerType, typename ReducerType::value_type>::Reducer - wrapped_reducer(reducer); - cuda_intra_block_reduction(value, wrapped_reducer, blockDim.y); - reducer.reference() = value;)) + (cuda_intra_block_reduction(value, wrapped_reducer, blockDim.y);)) } //-------------------------------------------------------------------------- @@ -260,23 +273,42 @@ class CudaTeamMember { //---------------------------------------- template - KOKKOS_INLINE_FUNCTION static std::enable_if_t::value> + KOKKOS_INLINE_FUNCTION static std::enable_if_t> vector_reduce(ReducerType const& reducer) { vector_reduce(reducer, reducer.reference()); } template - KOKKOS_INLINE_FUNCTION static std::enable_if_t::value> + KOKKOS_INLINE_FUNCTION static std::enable_if_t> vector_reduce(ReducerType const& reducer, typename ReducerType::value_type& value) { (void)reducer; (void)value; + + KOKKOS_IF_ON_DEVICE( + (typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, TeamPolicy, + ReducerType, typename ReducerType::value_type>::Reducer + wrapped_reducer(reducer); + + impl_vector_reduce(wrapped_reducer, value); + reducer.reference() = value;)) + } + + template + KOKKOS_INLINE_FUNCTION static std::enable_if_t< + is_reducer_v> + impl_vector_reduce(WrappedReducerType const& wrapped_reducer, + typename WrappedReducerType::value_type& value) { + (void)wrapped_reducer; + (void)value; + KOKKOS_IF_ON_DEVICE( (if (blockDim.x == 1) return; // Intra vector lane shuffle reduction: - typename ReducerType::value_type tmp(value); - typename ReducerType::value_type tmp2 = tmp; + typename WrappedReducerType::value_type tmp(value); + typename WrappedReducerType::value_type tmp2 = tmp; unsigned mask = blockDim.x == 32 @@ -287,7 +319,7 @@ class CudaTeamMember { for (int i = blockDim.x; (i >>= 1);) { Impl::in_place_shfl_down(tmp2, tmp, i, blockDim.x, mask); if ((int)threadIdx.x < i) { - reducer.join(tmp, tmp2); + wrapped_reducer.join(&tmp, &tmp2); } } @@ -297,7 +329,7 @@ class CudaTeamMember { // and thus different threads could have different results. Impl::in_place_shfl(tmp2, tmp, 0, blockDim.x, mask); - value = tmp2; reducer.reference() = tmp2;)) + value = tmp2;)) } //---------------------------------------- @@ -487,14 +519,21 @@ parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< iType, Impl::CudaTeamMember>& loop_boundaries, const Closure& closure, const ReducerType& reducer) { KOKKOS_IF_ON_DEVICE( - (typename ReducerType::value_type value; + (using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, + ReducerType, value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; - reducer.init(value); + wrapped_reducer_type wrapped_reducer(reducer); value_type value; + wrapped_reducer.init(&value); for (iType i = loop_boundaries.start + threadIdx.y; i < loop_boundaries.end; i += blockDim.y) { closure(i, value); } - loop_boundaries.member.team_reduce(reducer, value);)) + loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); + wrapped_reducer.final(&value); reducer.reference() = value;)) // Avoid bogus warning about reducer value being uninitialized with combined // reducers KOKKOS_IF_ON_HOST(((void)loop_boundaries; (void)closure; @@ -518,16 +557,25 @@ parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< (void)loop_boundaries; (void)closure; (void)result; + KOKKOS_IF_ON_DEVICE( - (ValueType val; Kokkos::Sum reducer(val); + (using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Closure, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; - reducer.init(reducer.reference()); + wrapped_reducer_type wrapped_reducer(closure); value_type value{}; + wrapped_reducer.init(&value); for (iType i = loop_boundaries.start + threadIdx.y; - i < loop_boundaries.end; i += blockDim.y) { closure(i, val); } + i < loop_boundaries.end; i += blockDim.y) { closure(i, value); } - loop_boundaries.member.team_reduce(reducer, val); - result = reducer.reference();)) + loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); + wrapped_reducer.final(&value); result = value; + + )) } template @@ -548,16 +596,27 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value> parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< iType, Impl::CudaTeamMember>& loop_boundaries, const Closure& closure, const ReducerType& reducer) { - KOKKOS_IF_ON_DEVICE((typename ReducerType::value_type value; - reducer.init(value); + KOKKOS_IF_ON_DEVICE( + (using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, + ReducerType, value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; - for (iType i = loop_boundaries.start + - threadIdx.y * blockDim.x + threadIdx.x; - i < loop_boundaries.end; - i += blockDim.y * blockDim.x) { closure(i, value); } + wrapped_reducer_type wrapped_reducer(reducer); value_type value; + wrapped_reducer.init(&value); + + for (iType i = + loop_boundaries.start + threadIdx.y * blockDim.x + threadIdx.x; + i < loop_boundaries.end; + i += blockDim.y * blockDim.x) { closure(i, value); } + + loop_boundaries.member.impl_vector_reduce(wrapped_reducer, value); + loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); + + wrapped_reducer.final(&value); reducer.reference() = value;)) - loop_boundaries.member.vector_reduce(reducer, value); - loop_boundaries.member.team_reduce(reducer, value);)) // Avoid bogus warning about reducer value being uninitialized with combined // reducers KOKKOS_IF_ON_HOST(((void)loop_boundaries; (void)closure; @@ -573,18 +632,27 @@ parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< (void)loop_boundaries; (void)closure; (void)result; - KOKKOS_IF_ON_DEVICE((ValueType val; Kokkos::Sum reducer(val); - reducer.init(reducer.reference()); + KOKKOS_IF_ON_DEVICE( + (using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Closure, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; - for (iType i = loop_boundaries.start + - threadIdx.y * blockDim.x + threadIdx.x; - i < loop_boundaries.end; - i += blockDim.y * blockDim.x) { closure(i, val); } + wrapped_reducer_type wrapped_reducer(closure); value_type value; + wrapped_reducer.init(&value); - loop_boundaries.member.vector_reduce(reducer); - loop_boundaries.member.team_reduce(reducer); - result = reducer.reference();)) + for (iType i = + loop_boundaries.start + threadIdx.y * blockDim.x + threadIdx.x; + i < loop_boundaries.end; + i += blockDim.y * blockDim.x) { closure(i, value); } + + loop_boundaries.member.impl_vector_reduce(wrapped_reducer, value); + loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); + + wrapped_reducer.final(&value); result = value;)) } //---------------------------------------------------------------------------- @@ -632,13 +700,22 @@ parallel_reduce(Impl::ThreadVectorRangeBoundariesStruct< Closure const& closure, ReducerType const& reducer) { KOKKOS_IF_ON_DEVICE(( - reducer.init(reducer.reference()); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, + ReducerType, value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); value_type value; + wrapped_reducer.init(&value); for (iType i = loop_boundaries.start + threadIdx.x; - i < loop_boundaries.end; - i += blockDim.x) { closure(i, reducer.reference()); } + i < loop_boundaries.end; i += blockDim.x) { closure(i, value); } - Impl::CudaTeamMember::vector_reduce(reducer); + Impl::CudaTeamMember::impl_vector_reduce(wrapped_reducer, value); + + wrapped_reducer.final(&value); reducer.reference() = value; )) // Avoid bogus warning about reducer value being uninitialized with combined @@ -667,15 +744,26 @@ parallel_reduce(Impl::ThreadVectorRangeBoundariesStruct< (void)loop_boundaries; (void)closure; (void)result; - KOKKOS_IF_ON_DEVICE( - (result = ValueType(); - for (iType i = loop_boundaries.start + threadIdx.x; - i < loop_boundaries.end; i += blockDim.x) { closure(i, result); } + KOKKOS_IF_ON_DEVICE(( + + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Closure, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; - Impl::CudaTeamMember::vector_reduce(Kokkos::Sum(result)); + wrapped_reducer_type wrapped_reducer(closure); value_type value; + wrapped_reducer.init(&value); - )) + for (iType i = loop_boundaries.start + threadIdx.x; + i < loop_boundaries.end; i += blockDim.x) { closure(i, value); } + + Impl::CudaTeamMember::impl_vector_reduce(wrapped_reducer, value); + wrapped_reducer.final(&value); result = value; + + )) } //---------------------------------------------------------------------------- diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp index a3f4f2f4cccf..9e0c5819f712 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp @@ -125,8 +125,8 @@ struct in_place_shfl_op { struct in_place_shfl_fn : in_place_shfl_op { template __device__ KOKKOS_IMPL_FORCEINLINE T do_shfl_op(unsigned mask, T& val, - int lane, int width) const - noexcept { + int lane, + int width) const noexcept { (void)mask; (void)val; (void)lane; @@ -136,28 +136,28 @@ struct in_place_shfl_fn : in_place_shfl_op { }; template __device__ KOKKOS_IMPL_FORCEINLINE void in_place_shfl(Args&&... args) noexcept { - in_place_shfl_fn{}((Args &&) args...); + in_place_shfl_fn{}((Args&&)args...); } struct in_place_shfl_up_fn : in_place_shfl_op { template __device__ KOKKOS_IMPL_FORCEINLINE T do_shfl_op(unsigned mask, T& val, - int lane, int width) const - noexcept { + int lane, + int width) const noexcept { return __shfl_up_sync(mask, val, lane, width); } }; template __device__ KOKKOS_IMPL_FORCEINLINE void in_place_shfl_up( Args&&... args) noexcept { - in_place_shfl_up_fn{}((Args &&) args...); + in_place_shfl_up_fn{}((Args&&)args...); } struct in_place_shfl_down_fn : in_place_shfl_op { template __device__ KOKKOS_IMPL_FORCEINLINE T do_shfl_op(unsigned mask, T& val, - int lane, int width) const - noexcept { + int lane, + int width) const noexcept { (void)mask; (void)val; (void)lane; @@ -168,7 +168,7 @@ struct in_place_shfl_down_fn : in_place_shfl_op { template __device__ KOKKOS_IMPL_FORCEINLINE void in_place_shfl_down( Args&&... args) noexcept { - in_place_shfl_down_fn{}((Args &&) args...); + in_place_shfl_down_fn{}((Args&&)args...); } } // namespace Impl diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_ZeroMemset.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_ZeroMemset.hpp index 517c592af724..0ac2d4052d2a 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_ZeroMemset.hpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_ZeroMemset.hpp @@ -23,15 +23,12 @@ namespace Kokkos { namespace Impl { -template -struct ZeroMemset> { - ZeroMemset(const Kokkos::Cuda& exec_space_instance, - const View& dst) { +template <> +struct ZeroMemset { + ZeroMemset(const Kokkos::Cuda& exec_space_instance, void* dst, size_t cnt) { KOKKOS_IMPL_CUDA_SAFE_CALL( (exec_space_instance.impl_internal_space_instance() - ->cuda_memset_async_wrapper( - dst.data(), 0, - dst.size() * sizeof(typename View::value_type)))); + ->cuda_memset_async_wrapper(dst, 0, cnt))); } }; diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP.cpp b/packages/kokkos/core/src/HIP/Kokkos_HIP.cpp index aced2083ffb5..8de3a8758fa1 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP.cpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP.cpp @@ -27,6 +27,8 @@ #include +#include + namespace Kokkos { #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 @@ -49,34 +51,44 @@ void HIP::impl_initialize(InitializationSettings const& settings) { Impl::HIPInternal::m_hipDev = hip_device_id; KOKKOS_IMPL_HIP_SAFE_CALL( hipGetDeviceProperties(&Impl::HIPInternal::m_deviceProp, hip_device_id)); - const auto& hipProp = Impl::HIPInternal::m_deviceProp; KOKKOS_IMPL_HIP_SAFE_CALL(hipSetDevice(hip_device_id)); - // number of multiprocessors - Impl::HIPInternal::m_multiProcCount = hipProp.multiProcessorCount; + // Check that we are running on the expected architecture. We print a warning + // instead of erroring out because AMD does not guarantee that gcnArchName + // will always contain the gfx flag. + if (Kokkos::show_warnings()) { + if (std::string_view arch_name = + Impl::HIPInternal::m_deviceProp.gcnArchName; + arch_name.find(KOKKOS_ARCH_AMD_GPU) != 0) { + std::cerr + << "Kokkos::HIP::initialize WARNING: running kernels compiled for " + << KOKKOS_ARCH_AMD_GPU << " on " << arch_name << " device.\n"; + } + } - //---------------------------------- - // Maximum number of warps, - // at most one warp per thread in a warp for reduction. - Impl::HIPInternal::m_maxWarpCount = - hipProp.maxThreadsPerBlock / Impl::HIPTraits::WarpSize; - if (Impl::HIPTraits::WarpSize < Impl::HIPInternal::m_maxWarpCount) { - Impl::HIPInternal::m_maxWarpCount = Impl::HIPTraits::WarpSize; + // Print a warning if the user did not select the right GFX942 architecture +#ifdef KOKKOS_ARCH_AMD_GFX942 + if ((Kokkos::show_warnings()) && + (Impl::HIPInternal::m_deviceProp.integrated == 1)) { + std::cerr << "Kokkos::HIP::initialize WARNING: running kernels for MI300X " + "(discrete GPU) on a MI300A (APU).\n"; + } +#endif +#ifdef KOKKOS_ARCH_AMD_GFX942_APU + if ((Kokkos::show_warnings()) && + (Impl::HIPInternal::m_deviceProp.integrated == 0)) { + std::cerr << "Kokkos::HIP::initialize WARNING: running kernels for MI300A " + "(APU) on a MI300X (discrete GPU).\n"; } +#endif - //---------------------------------- - // Maximum number of blocks - Impl::HIPInternal::m_maxBlock[0] = hipProp.maxGridSize[0]; - Impl::HIPInternal::m_maxBlock[1] = hipProp.maxGridSize[1]; - Impl::HIPInternal::m_maxBlock[2] = hipProp.maxGridSize[2]; - - // theoretically, we can get 40 WF's / CU, but only can sustain 32 see - // https://github.com/ROCm-Developer-Tools/HIP/blob/a0b5dfd625d99af7e288629747b40dd057183173/vdi/hip_platform.cpp#L742 - Impl::HIPInternal::m_maxWavesPerCU = 32; - Impl::HIPInternal::m_shmemPerSM = hipProp.maxSharedMemoryPerMultiProcessor; - Impl::HIPInternal::m_maxShmemPerBlock = hipProp.sharedMemPerBlock; + // theoretically on GFX 9XX GPUs, we can get 40 WF's / CU, but only can + // sustain 32 see + // https://github.com/ROCm/clr/blob/4d0b815d06751735e6a50fa46e913fdf85f751f0/hipamd/src/hip_platform.cpp#L362-L366 + const int maxWavesPerCU = + Impl::HIPInternal::m_deviceProp.major <= 9 ? 32 : 64; Impl::HIPInternal::m_maxThreadsPerSM = - Impl::HIPInternal::m_maxWavesPerCU * Impl::HIPTraits::WarpSize; + maxWavesPerCU * Impl::HIPTraits::WarpSize; // Init the array for used for arbitrarily sized atomics desul::Impl::init_lock_arrays(); // FIXME @@ -146,10 +158,6 @@ void HIP::print_configuration(std::ostream& os, bool /*verbose*/) const { #else os << "no\n"; #endif -#ifdef KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY - os << " KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY: "; - os << "yes\n"; -#endif os << "\nRuntime Configuration:\n"; diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp index 1f084c41e50e..90e5cf73559f 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp @@ -113,8 +113,9 @@ unsigned hip_internal_get_block_size(const HIPInternal *hip_instance, const unsigned min_waves_per_eu = LaunchBounds::minBperSM ? LaunchBounds::minBperSM : 1; const unsigned min_threads_per_sm = min_waves_per_eu * HIPTraits::WarpSize; - const unsigned shmem_per_sm = hip_instance->m_shmemPerSM; - unsigned block_size = tperb_reg; + const unsigned shmem_per_sm = + hip_instance->m_deviceProp.maxSharedMemoryPerMultiProcessor; + unsigned block_size = tperb_reg; do { unsigned total_shmem = f(block_size); // find how many threads we can fit with this blocksize based on LDS usage diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp index 5f0df72df179..584cc63d958c 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp @@ -44,22 +44,17 @@ class GraphNodeKernelImpl // TODO use the name and executionspace template - GraphNodeKernelImpl(std::string, Kokkos::HIP const&, Functor arg_functor, + GraphNodeKernelImpl(std::string label_, HIP const&, Functor arg_functor, PolicyDeduced&& arg_policy, ArgsDeduced&&... args) - : base_t(std::move(arg_functor), (PolicyDeduced &&) arg_policy, - (ArgsDeduced &&) args...) {} + : base_t(std::move(arg_functor), (PolicyDeduced&&)arg_policy, + (ArgsDeduced&&)args...), + label(std::move(label_)) {} template GraphNodeKernelImpl(Kokkos::HIP const& exec_space, Functor arg_functor, PolicyDeduced&& arg_policy) - : GraphNodeKernelImpl("", exec_space, std::move(arg_functor), - (PolicyDeduced &&) arg_policy) {} - - ~GraphNodeKernelImpl() { - if (m_driver_storage) { - Kokkos::HIPSpace().deallocate(m_driver_storage, sizeof(base_t)); - } - } + : GraphNodeKernelImpl("[unlabeled]", exec_space, std::move(arg_functor), + (PolicyDeduced&&)arg_policy) {} void set_hip_graph_ptr(hipGraph_t* arg_graph_ptr) { m_graph_ptr = arg_graph_ptr; @@ -73,18 +68,29 @@ class GraphNodeKernelImpl hipGraph_t const* get_hip_graph_ptr() const { return m_graph_ptr; } - Kokkos::ObservingRawPtr allocate_driver_memory_buffer() const { + Kokkos::ObservingRawPtr allocate_driver_memory_buffer( + const HIP& exec) const { KOKKOS_EXPECTS(m_driver_storage == nullptr); - m_driver_storage = static_cast(Kokkos::HIPSpace().allocate( - "GraphNodeKernel global memory functor storage", sizeof(base_t))); + std::string alloc_label = + label + " - GraphNodeKernel global memory functor storage"; + m_driver_storage = std::shared_ptr( + static_cast( + HIPSpace().allocate(exec, alloc_label.c_str(), sizeof(base_t))), + // FIXME_HIP Custom deletor should use same 'exec' as for allocation. + [alloc_label](base_t* ptr) { + HIPSpace().deallocate(alloc_label.c_str(), ptr, sizeof(base_t)); + }); KOKKOS_ENSURES(m_driver_storage != nullptr); - return m_driver_storage; + return m_driver_storage.get(); } + auto get_driver_storage() const { return m_driver_storage; } + private: Kokkos::ObservingRawPtr m_graph_ptr = nullptr; Kokkos::ObservingRawPtr m_graph_node_ptr = nullptr; - Kokkos::OwningRawPtr m_driver_storage = nullptr; + mutable std::shared_ptr m_driver_storage = nullptr; + std::string label; }; struct HIPGraphNodeAggregateKernel { @@ -114,13 +120,14 @@ struct get_graph_node_kernel_type Kokkos::ParallelReduceTag>> {}; template -auto* allocate_driver_storage_for_kernel(KernelType const& kernel) { +auto* allocate_driver_storage_for_kernel(const HIP& exec, + KernelType const& kernel) { using graph_node_kernel_t = typename get_graph_node_kernel_type::type; auto const& kernel_as_graph_kernel = static_cast(kernel); - return kernel_as_graph_kernel.allocate_driver_memory_buffer(); + return kernel_as_graph_kernel.allocate_driver_memory_buffer(exec); } template diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Graph_Impl.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Graph_Impl.hpp index a0989fe67111..4f97214ca683 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Graph_Impl.hpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Graph_Impl.hpp @@ -42,11 +42,11 @@ class GraphImpl { // Not movable or copyable; it spends its whole life as a shared_ptr in the // Graph object. - GraphImpl() = delete; - GraphImpl(GraphImpl const&) = delete; - GraphImpl(GraphImpl&&) = delete; + GraphImpl() = delete; + GraphImpl(GraphImpl const&) = delete; + GraphImpl(GraphImpl&&) = delete; GraphImpl& operator=(GraphImpl const&) = delete; - GraphImpl& operator=(GraphImpl&&) = delete; + GraphImpl& operator=(GraphImpl&&) = delete; ~GraphImpl(); @@ -60,7 +60,7 @@ class GraphImpl { template void add_predecessor(NodeImplPtr arg_node_ptr, PredecessorRef arg_pred_ref); - void submit(); + void submit(const Kokkos::HIP& exec); Kokkos::HIP const& get_execution_space() const noexcept; @@ -69,18 +69,28 @@ class GraphImpl { template auto create_aggregate_ptr(PredecessorRefs&&...); - private: - void instantiate_graph() { + void instantiate() { + KOKKOS_EXPECTS(!m_graph_exec); constexpr size_t error_log_size = 256; hipGraphNode_t error_node = nullptr; char error_log[error_log_size]; KOKKOS_IMPL_HIP_SAFE_CALL(hipGraphInstantiate( &m_graph_exec, m_graph, &error_node, error_log, error_log_size)); + KOKKOS_ENSURES(m_graph_exec); } + hipGraph_t hip_graph() { return m_graph; } + hipGraphExec_t hip_graph_exec() { return m_graph_exec; } + + private: Kokkos::HIP m_execution_space; hipGraph_t m_graph = nullptr; hipGraphExec_t m_graph_exec = nullptr; + + // Store drivers for the kernel nodes that launch in global memory. + // This is required as lifetime of drivers must be bounded to this instance's + // lifetime. + std::vector> m_driver_storage; }; inline GraphImpl::~GraphImpl() { @@ -123,6 +133,8 @@ inline void GraphImpl::add_node( kernel.set_hip_graph_node_ptr(&node); kernel.execute(); KOKKOS_ENSURES(node); + if (std::shared_ptr tmp = kernel.get_driver_storage()) + m_driver_storage.push_back(std::move(tmp)); } // Requires PredecessorRef is a specialization of GraphNodeRef that has @@ -145,16 +157,15 @@ inline void GraphImpl::add_predecessor( hipGraphAddDependencies(m_graph, &pred_node, &node, 1)); } -inline void GraphImpl::submit() { +inline void GraphImpl::submit(const Kokkos::HIP& exec) { if (!m_graph_exec) { - instantiate_graph(); + instantiate(); } - KOKKOS_IMPL_HIP_SAFE_CALL( - hipGraphLaunch(m_graph_exec, m_execution_space.hip_stream())); + KOKKOS_IMPL_HIP_SAFE_CALL(hipGraphLaunch(m_graph_exec, exec.hip_stream())); } -inline Kokkos::HIP const& GraphImpl::get_execution_space() const - noexcept { +inline Kokkos::HIP const& GraphImpl::get_execution_space() + const noexcept { return m_execution_space; } diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp index e0b25c69399a..54e8c315e3f6 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp @@ -77,7 +77,8 @@ std::size_t scratch_count(const std::size_t size) { //---------------------------------------------------------------------------- int HIPInternal::concurrency() { - static int const concurrency = m_maxThreadsPerSM * m_multiProcCount; + static int const concurrency = + m_maxThreadsPerSM * m_deviceProp.multiProcessorCount; return concurrency; } @@ -97,6 +98,13 @@ void HIPInternal::print_configuration(std::ostream &s) const { << "undefined\n"; #endif + s << "macro KOKKOS_ENABLE_IMPL_HIP_MALLOC_ASYNC: "; +#ifdef KOKKOS_ENABLE_IMPL_HIP_MALLOC_ASYNC + s << "yes\n"; +#else + s << "no\n"; +#endif + for (int i : get_visible_devices()) { hipDeviceProp_t hipProp; KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDeviceProperties(&hipProp, i)); @@ -177,8 +185,16 @@ void HIPInternal::initialize(hipStream_t stream) { // and scratch space for partial reduction values. // Allocate some initial space. This will grow as needed. { + // Maximum number of warps, + // at most one warp per thread in a warp for reduction. + unsigned int maxWarpCount = + m_deviceProp.maxThreadsPerBlock / Impl::HIPTraits::WarpSize; + if (Impl::HIPTraits::WarpSize < maxWarpCount) { + maxWarpCount = Impl::HIPTraits::WarpSize; + } + const unsigned reduce_block_count = - m_maxWarpCount * Impl::HIPTraits::WarpSize; + maxWarpCount * Impl::HIPTraits::WarpSize; (void)scratch_flags(reduce_block_count * 2 * sizeof(size_type)); (void)scratch_space(reduce_block_count * 16 * sizeof(size_type)); @@ -353,14 +369,8 @@ void HIPInternal::finalize() { m_num_scratch_locks = 0; } -int HIPInternal::m_hipDev = -1; -unsigned HIPInternal::m_multiProcCount = 0; -unsigned HIPInternal::m_maxWarpCount = 0; -std::array HIPInternal::m_maxBlock = {0, 0, 0}; -unsigned HIPInternal::m_maxWavesPerCU = 0; -int HIPInternal::m_shmemPerSM = 0; -int HIPInternal::m_maxShmemPerBlock = 0; -int HIPInternal::m_maxThreadsPerSM = 0; +int HIPInternal::m_hipDev = -1; +int HIPInternal::m_maxThreadsPerSM = 0; hipDeviceProp_t HIPInternal::m_deviceProp; @@ -372,15 +382,7 @@ std::mutex HIPInternal::constantMemMutex; //---------------------------------------------------------------------------- Kokkos::HIP::size_type hip_internal_multiprocessor_count() { - return HIPInternal::singleton().m_multiProcCount; -} - -Kokkos::HIP::size_type hip_internal_maximum_warp_count() { - return HIPInternal::singleton().m_maxWarpCount; -} - -std::array hip_internal_maximum_grid_count() { - return HIPInternal::singleton().m_maxBlock; + return HIPInternal::singleton().m_deviceProp.multiProcessorCount; } Kokkos::HIP::size_type *hip_internal_scratch_space(const HIP &instance, diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp index 19349e90bb16..d8043dc23d7a 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp @@ -31,11 +31,12 @@ namespace Impl { struct HIPTraits { #if defined(KOKKOS_ARCH_AMD_GFX906) || defined(KOKKOS_ARCH_AMD_GFX908) || \ defined(KOKKOS_ARCH_AMD_GFX90A) || defined(KOKKOS_ARCH_AMD_GFX940) || \ - defined(KOKKOS_ARCH_AMD_GFX942) + defined(KOKKOS_ARCH_AMD_GFX942) || defined(KOKKOS_ARCH_AMD_GFX942_APU) static constexpr int WarpSize = 64; static constexpr int WarpIndexMask = 0x003f; /* hexadecimal for 63 */ static constexpr int WarpIndexShift = 6; /* WarpSize == 1 << WarpShift*/ -#elif defined(KOKKOS_ARCH_AMD_GFX1030) || defined(KOKKOS_ARCH_AMD_GFX1100) +#elif defined(KOKKOS_ARCH_AMD_GFX1030) || defined(KOKKOS_ARCH_AMD_GFX1100) || \ + defined(KOKKOS_ARCH_AMD_GFX1103) static constexpr int WarpSize = 32; static constexpr int WarpIndexMask = 0x001f; /* hexadecimal for 31 */ static constexpr int WarpIndexShift = 5; /* WarpSize == 1 << WarpShift*/ @@ -51,8 +52,6 @@ struct HIPTraits { //---------------------------------------------------------------------------- -HIP::size_type hip_internal_maximum_warp_count(); -std::array hip_internal_maximum_grid_count(); HIP::size_type hip_internal_multiprocessor_count(); HIP::size_type *hip_internal_scratch_space(const HIP &instance, @@ -71,12 +70,6 @@ class HIPInternal { using size_type = ::Kokkos::HIP::size_type; static int m_hipDev; - static unsigned m_multiProcCount; - static unsigned m_maxWarpCount; - static std::array m_maxBlock; - static unsigned m_maxWavesPerCU; - static int m_shmemPerSM; - static int m_maxShmemPerBlock; static int m_maxThreadsPerSM; static hipDeviceProp_t m_deviceProp; diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp index 7cd0afcf47fc..e243eb07e784 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp @@ -25,11 +25,7 @@ #include #include -#if !((HIP_VERSION_MAJOR == 5) && (HIP_VERSION_MINOR == 2)) -#define KOKKOS_IMPL_HIP_GRAPH_ENABLED -#endif - -#ifdef KOKKOS_IMPL_HIP_GRAPH_ENABLED +#ifdef KOKKOS_IMPL_HIP_NATIVE_GRAPH #include #include #endif @@ -173,15 +169,15 @@ struct DeduceHIPLaunchMechanism { static constexpr HIPLaunchMechanism launch_mechanism = ((property & force_global_launch) == force_global_launch) ? HIPLaunchMechanism::GlobalMemory - : ((property & light_weight) == light_weight) - ? (sizeof(DriverType) < HIPTraits::KernelArgumentLimit - ? HIPLaunchMechanism::LocalMemory - : HIPLaunchMechanism::GlobalMemory) - : (((property & heavy_weight) == heavy_weight) - ? (sizeof(DriverType) < HIPTraits::ConstantMemoryUsage - ? HIPLaunchMechanism::ConstantMemory - : HIPLaunchMechanism::GlobalMemory) - : (default_launch_mechanism)); + : ((property & light_weight) == light_weight) + ? (sizeof(DriverType) < HIPTraits::KernelArgumentLimit + ? HIPLaunchMechanism::LocalMemory + : HIPLaunchMechanism::GlobalMemory) + : (((property & heavy_weight) == heavy_weight) + ? (sizeof(DriverType) < HIPTraits::ConstantMemoryUsage + ? HIPLaunchMechanism::ConstantMemory + : HIPLaunchMechanism::GlobalMemory) + : (default_launch_mechanism)); }; template m_stream, ManageStream::no), driver); // Unlike in the non-graph case, we can get away with doing an async copy // here because the `DriverType` instance is held in the GraphNodeImpl // which is guaranteed to be alive until the graph instance itself is // destroyed, where there should be a fence ensuring that the allocation // associated with this kernel on the device side isn't deleted. - hipMemcpyAsync(driver_ptr, &driver, sizeof(DriverType), hipMemcpyDefault, - hip_instance->m_stream); + KOKKOS_IMPL_HIP_SAFE_CALL( + hipMemcpyAsync(driver_ptr, &driver, sizeof(DriverType), + hipMemcpyDefault, hip_instance->m_stream)); void const *args[] = {&driver_ptr}; @@ -551,11 +549,11 @@ struct HIPParallelLaunch< LaunchMechanism>; HIPParallelLaunch(const DriverType &driver, const dim3 &grid, - const dim3 &block, const int shmem, + const dim3 &block, const unsigned int shmem, const HIPInternal *hip_instance, const bool /*prefer_shmem*/) { if ((grid.x != 0) && ((block.x * block.y * block.z) != 0)) { - if (hip_instance->m_maxShmemPerBlock < shmem) { + if (hip_instance->m_deviceProp.sharedMemPerBlock < shmem) { Kokkos::Impl::throw_runtime_exception( "HIPParallelLaunch FAILED: shared memory request is too large"); } @@ -585,7 +583,7 @@ void hip_parallel_launch(const DriverType &driver, const dim3 &grid, const dim3 &block, const int shmem, const HIPInternal *hip_instance, const bool prefer_shmem) { -#ifdef KOKKOS_IMPL_HIP_GRAPH_ENABLED +#ifdef KOKKOS_IMPL_HIP_NATIVE_GRAPH if constexpr (DoGraph) { // Graph launch using base_t = HIPParallelLaunchKernelInvoker, HIP> { const Policy m_policy; public: - ParallelFor() = delete; - ParallelFor(ParallelFor const&) = default; + ParallelFor() = delete; + ParallelFor(ParallelFor const&) = default; ParallelFor& operator=(ParallelFor const&) = delete; inline __device__ void operator()() const { @@ -57,7 +57,7 @@ class ParallelFor, HIP> { inline void execute() const { using ClosureType = ParallelFor; if (m_policy.m_num_tiles == 0) return; - auto const maxblocks = hip_internal_maximum_grid_count(); + auto const maxblocks = m_policy.space().hip_device_prop().maxGridSize; if (Policy::rank == 2) { dim3 const block(m_policy.m_tile[0], m_policy.m_tile[1], 1); dim3 const grid( diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_Range.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_Range.hpp index 9355c1c75fbe..3985dc60f06b 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_Range.hpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_Range.hpp @@ -53,8 +53,8 @@ class ParallelFor, Kokkos::HIP> { public: using functor_type = FunctorType; - ParallelFor() = delete; - ParallelFor(ParallelFor const&) = default; + ParallelFor() = delete; + ParallelFor(ParallelFor const&) = default; ParallelFor& operator=(ParallelFor const&) = delete; inline __device__ void operator()() const { diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_Team.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_Team.hpp index bf0c2193383f..83e890bce99a 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_Team.hpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_Team.hpp @@ -71,8 +71,8 @@ class ParallelFor, HIP> { } public: - ParallelFor() = delete; - ParallelFor(ParallelFor const&) = default; + ParallelFor() = delete; + ParallelFor(ParallelFor const&) = default; ParallelFor& operator=(ParallelFor const&) = delete; __device__ inline void operator()() const { @@ -120,9 +120,14 @@ class ParallelFor, HIP> { m_vector_size(arg_policy.impl_vector_length()) { auto internal_space_instance = m_policy.space().impl_internal_space_instance(); - m_team_size = m_team_size >= 0 ? m_team_size - : arg_policy.team_size_recommended( - arg_functor, ParallelForTag()); + if (m_team_size < 0) { + m_team_size = + arg_policy.team_size_recommended(arg_functor, ParallelForTag()); + if (m_team_size <= 0) + Kokkos::Impl::throw_runtime_exception( + "Kokkos::Impl::ParallelFor could not find a " + "valid execution configuration."); + } m_shmem_begin = (sizeof(double) * (m_team_size + 2)); m_shmem_size = @@ -149,8 +154,9 @@ class ParallelFor, HIP> { static_cast(m_league_size)))); } - int const shmem_size_total = m_shmem_begin + m_shmem_size; - if (internal_space_instance->m_maxShmemPerBlock < shmem_size_total) { + unsigned int const shmem_size_total = m_shmem_begin + m_shmem_size; + if (internal_space_instance->m_deviceProp.sharedMemPerBlock < + shmem_size_total) { Kokkos::Impl::throw_runtime_exception(std::string( "Kokkos::Impl::ParallelFor< HIP > insufficient shared memory")); } diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_ParallelReduce_Team.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_ParallelReduce_Team.hpp index 0c24e5cc62ad..fb4ff937cdff 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_ParallelReduce_Team.hpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_ParallelReduce_Team.hpp @@ -46,6 +46,22 @@ class ParallelReduce 4 bytes in size, indexing into shared/global memory relies + // on the block and grid dimensions to ensure that we index at the correct + // offset rather than at every 4 byte word; such that, when the join is + // performed, we have the correct data that was copied over in chunks of 4 + // bytes. + using word_size_type = std::conditional_t< + sizeof(value_type) < sizeof(Kokkos::HIP::size_type), + std::conditional_t, + Kokkos::HIP::size_type>; using reducer_type = ReducerType; using size_type = HIP::size_type; @@ -72,7 +88,7 @@ class ParallelReduce const - word_count(reducer.value_size() / sizeof(size_type)); + integral_nonzero_constant const + word_count(reducer.value_size() / sizeof(word_size_type)); - reference_type value = - reducer.init(kokkos_impl_hip_shared_memory() + - threadIdx.y * word_count.value); + reference_type value = reducer.init(reinterpret_cast( + kokkos_impl_hip_shared_memory() + + threadIdx.y * word_count.value)); // Iterate this block through the league iterate_through_league(threadid, value); // Reduce with final value at blockDim.y - 1 location. bool do_final_reduce = (m_league_size == 0); if (!do_final_reduce) - do_final_reduce = - hip_single_inter_block_reduce_scan( - reducer, blockIdx.x, gridDim.x, - kokkos_impl_hip_shared_memory(), m_scratch_space, - m_scratch_flags); + do_final_reduce = hip_single_inter_block_reduce_scan( + reducer, blockIdx.x, gridDim.x, + kokkos_impl_hip_shared_memory(), m_scratch_space, + m_scratch_flags); if (do_final_reduce) { // This is the final block with the final result at the final threads' // location - size_type* const shared = kokkos_impl_hip_shared_memory() + - (blockDim.y - 1) * word_count.value; - size_type* const global = m_result_ptr_device_accessible - ? reinterpret_cast(m_result_ptr) - : m_scratch_space; + word_size_type* const shared = + kokkos_impl_hip_shared_memory() + + (blockDim.y - 1) * word_count.value; + size_type* const global = + m_result_ptr_device_accessible + ? reinterpret_cast(m_result_ptr) + : m_scratch_space; if (threadIdx.y == 0) { reducer.final(reinterpret_cast(shared)); @@ -227,7 +244,8 @@ class ParallelReduce(m_scratch_space), result, m_scratch_flags, blockDim.y)) { unsigned int const id = threadIdx.y * blockDim.x + threadIdx.x; if (id == 0) { @@ -249,8 +267,9 @@ class ParallelReduce(hip_internal_scratch_space( + m_policy.space(), reducer.value_size() * block_count)); m_scratch_flags = hip_internal_scratch_flags(m_policy.space(), sizeof(size_type)); @@ -306,11 +325,15 @@ class ParallelReduce= 0 ? m_team_size - : arg_policy.team_size_recommended( - arg_functor_reducer.get_functor(), - arg_functor_reducer.get_reducer(), - ParallelReduceTag()); + if (m_team_size < 0) { + m_team_size = arg_policy.team_size_recommended( + arg_functor_reducer.get_functor(), arg_functor_reducer.get_reducer(), + ParallelReduceTag()); + if (m_team_size <= 0) + Kokkos::Impl::throw_runtime_exception( + "Kokkos::Impl::ParallelReduce could not find a " + "valid execution configuration."); + } m_team_begin = UseShflReduction @@ -356,7 +379,8 @@ class ParallelReduce bad team size")); } - if (internal_space_instance->m_maxShmemPerBlock < shmem_size_total) { + if (internal_space_instance->m_deviceProp.sharedMemPerBlock < + shmem_size_total) { Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Impl::ParallelReduce< HIP > requested too much " "L0 scratch memory")); diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.cpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.cpp index 83f829fddae3..0b679218092d 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.cpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.cpp @@ -23,7 +23,7 @@ #include #include -#ifndef KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY +#ifndef KOKKOS_IMPL_HIP_UNIFIED_MEMORY KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( Kokkos::HIPSpace); #else diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.hpp index 1ca7bd5cd0e6..a464609108cd 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.hpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.hpp @@ -20,7 +20,7 @@ #include #include -#if defined(KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY) +#if defined(KOKKOS_IMPL_HIP_UNIFIED_MEMORY) KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::HIPSpace); #else KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_SPECIALIZATION( diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Shuffle_Reduce.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Shuffle_Reduce.hpp index 4035bb012132..feee44ccaf17 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Shuffle_Reduce.hpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Shuffle_Reduce.hpp @@ -100,7 +100,7 @@ template __device__ inline bool hip_inter_block_shuffle_reduction( typename FunctorType::reference_type value, typename FunctorType::reference_type neutral, FunctorType const& reducer, - HIP::size_type* const m_scratch_space, + typename FunctorType::pointer_type const m_scratch_space, typename FunctorType::pointer_type const /*result*/, HIP::size_type* const m_scratch_flags, int const max_active_thread = blockDim.y) { @@ -115,9 +115,8 @@ __device__ inline bool hip_inter_block_shuffle_reduction( // One thread in the block writes block result to global scratch_memory if (id == 0) { - pointer_type global = - reinterpret_cast(m_scratch_space) + blockIdx.x; - *global = value; + pointer_type global = m_scratch_space + blockIdx.x; + *global = value; __threadfence(); } @@ -140,8 +139,7 @@ __device__ inline bool hip_inter_block_shuffle_reduction( last_block = true; value = neutral; - pointer_type const global = - reinterpret_cast(m_scratch_space); + pointer_type const global = m_scratch_space; // Reduce all global values with splitting work over threads in one warp const int step_size = blockDim.x * blockDim.y < warp_size diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp index 67635fc1c4ca..47f07b31abfe 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp @@ -51,28 +51,54 @@ static std::atomic is_first_hip_managed_allocation(true); namespace Kokkos { -HIPSpace::HIPSpace() : m_device(HIP().hip_device()) {} +HIPSpace::HIPSpace() + : m_device(HIP().hip_device()), m_stream(HIP().hip_stream()) {} HIPHostPinnedSpace::HIPHostPinnedSpace() {} HIPManagedSpace::HIPManagedSpace() : m_device(HIP().hip_device()) {} +#ifndef KOKKOS_IMPL_HIP_UNIFIED_MEMORY +void* HIPSpace::allocate(const HIP& exec_space, + const size_t arg_alloc_size) const { + return allocate(exec_space, "[unlabeled]", arg_alloc_size); +} + +void* HIPSpace::allocate(const HIP& exec_space, const char* arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size) const { + return impl_allocate(exec_space.hip_stream(), arg_label, arg_alloc_size, + arg_logical_size, true); +} +#endif + void* HIPSpace::allocate(const size_t arg_alloc_size) const { return allocate("[unlabeled]", arg_alloc_size); } -void* HIPSpace::allocate( - const char* arg_label, const size_t arg_alloc_size, - const size_t arg_logical_size) const { - return impl_allocate(arg_label, arg_alloc_size, arg_logical_size); +void* HIPSpace::allocate(const char* arg_label, const size_t arg_alloc_size, + const size_t arg_logical_size) const { + return impl_allocate(m_stream, arg_label, arg_alloc_size, arg_logical_size, + false); } + void* HIPSpace::impl_allocate( - const char* arg_label, const size_t arg_alloc_size, - const size_t arg_logical_size, - const Kokkos::Tools::SpaceHandle arg_handle) const { + [[maybe_unused]] const hipStream_t stream, const char* arg_label, + const size_t arg_alloc_size, const size_t arg_logical_size, + [[maybe_unused]] const bool stream_sync_only) const { void* ptr = nullptr; +#ifdef KOKKOS_ENABLE_IMPL_HIP_MALLOC_ASYNC + auto const error_code = hipMallocAsync(&ptr, arg_alloc_size, stream); + if (stream_sync_only) { + KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamSynchronize(stream)); + } else { + KOKKOS_IMPL_HIP_SAFE_CALL(hipDeviceSynchronize()); + } +#else auto const error_code = hipMalloc(&ptr, arg_alloc_size); +#endif + if (error_code != hipSuccess) { // This is the only way to clear the last error, which we should do here // since we're turning it into an exception here @@ -80,6 +106,8 @@ void* HIPSpace::impl_allocate( Kokkos::Impl::throw_bad_alloc(name(), arg_alloc_size, arg_label); } if (Kokkos::Profiling::profileLibraryLoaded()) { + const Kokkos::Tools::SpaceHandle arg_handle = + Kokkos::Tools::make_space_handle(name()); const size_t reported_size = (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size; Kokkos::Profiling::allocateData(arg_handle, arg_label, ptr, reported_size); @@ -219,7 +247,12 @@ void HIPSpace::impl_deallocate( Kokkos::Profiling::deallocateData(arg_handle, arg_label, arg_alloc_ptr, reported_size); } +#ifdef KOKKOS_ENABLE_IMPL_HIP_MALLOC_ASYNC + KOKKOS_IMPL_HIP_SAFE_CALL(hipFreeAsync(arg_alloc_ptr, m_stream)); + KOKKOS_IMPL_HIP_SAFE_CALL(hipDeviceSynchronize()); +#else KOKKOS_IMPL_HIP_SAFE_CALL(hipFree(arg_alloc_ptr)); +#endif } void HIPHostPinnedSpace::deallocate(void* const arg_alloc_ptr, diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Space.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Space.hpp index e1b4768b8771..2380772cacf8 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Space.hpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Space.hpp @@ -58,14 +58,14 @@ class HIPSpace { /*--------------------------------*/ HIPSpace(); - HIPSpace(HIPSpace&& rhs) = default; - HIPSpace(const HIPSpace& rhs) = default; - HIPSpace& operator=(HIPSpace&& rhs) = default; + HIPSpace(HIPSpace&& rhs) = default; + HIPSpace(const HIPSpace& rhs) = default; + HIPSpace& operator=(HIPSpace&& rhs) = default; HIPSpace& operator=(const HIPSpace& rhs) = default; ~HIPSpace() = default; /**\brief Allocate untracked memory in the hip space */ -#ifdef KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY +#ifdef KOKKOS_IMPL_HIP_UNIFIED_MEMORY template void* allocate(const ExecutionSpace&, const size_t arg_alloc_size) const { return allocate(arg_alloc_size); @@ -77,15 +77,10 @@ class HIPSpace { return allocate(arg_label, arg_alloc_size, arg_logical_size); } #else - // FIXME_HIP Use execution space instance - void* allocate(const HIP&, const size_t arg_alloc_size) const { - return allocate(arg_alloc_size); - } - // FIXME_HIP Use execution space instance - void* allocate(const HIP&, const char* arg_label, const size_t arg_alloc_size, - const size_t arg_logical_size = 0) const { - return allocate(arg_label, arg_alloc_size, arg_logical_size); - } + void* allocate(const HIP& exec_space, const size_t arg_alloc_size) const; + void* allocate(const HIP& exec_space, const char* arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const; #endif void* allocate(const size_t arg_alloc_size) const; void* allocate(const char* arg_label, const size_t arg_alloc_size, @@ -98,10 +93,10 @@ class HIPSpace { const size_t arg_logical_size = 0) const; private: - void* impl_allocate(const char* arg_label, const size_t arg_alloc_size, - const size_t arg_logical_size = 0, - const Kokkos::Tools::SpaceHandle = - Kokkos::Tools::make_space_handle(name())) const; + void* impl_allocate(const hipStream_t stream, const char* arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size, + bool stream_sync_only) const; void impl_deallocate(const char* arg_label, void* const arg_alloc_ptr, const size_t arg_alloc_size, const size_t arg_logical_size = 0, @@ -114,6 +109,7 @@ class HIPSpace { private: int m_device; ///< Which HIP device + hipStream_t m_stream; }; template <> @@ -140,9 +136,9 @@ class HIPHostPinnedSpace { /*--------------------------------*/ HIPHostPinnedSpace(); - HIPHostPinnedSpace(HIPHostPinnedSpace&& rhs) = default; - HIPHostPinnedSpace(const HIPHostPinnedSpace& rhs) = default; - HIPHostPinnedSpace& operator=(HIPHostPinnedSpace&& rhs) = default; + HIPHostPinnedSpace(HIPHostPinnedSpace&& rhs) = default; + HIPHostPinnedSpace(const HIPHostPinnedSpace& rhs) = default; + HIPHostPinnedSpace& operator=(HIPHostPinnedSpace&& rhs) = default; HIPHostPinnedSpace& operator=(const HIPHostPinnedSpace& rhs) = default; ~HIPHostPinnedSpace() = default; @@ -213,9 +209,9 @@ class HIPManagedSpace { /*--------------------------------*/ HIPManagedSpace(); - HIPManagedSpace(HIPManagedSpace&& rhs) = default; - HIPManagedSpace(const HIPManagedSpace& rhs) = default; - HIPManagedSpace& operator=(HIPManagedSpace&& rhs) = default; + HIPManagedSpace(HIPManagedSpace&& rhs) = default; + HIPManagedSpace(const HIPManagedSpace& rhs) = default; + HIPManagedSpace& operator=(HIPManagedSpace&& rhs) = default; HIPManagedSpace& operator=(const HIPManagedSpace& rhs) = default; ~HIPManagedSpace() = default; @@ -280,7 +276,7 @@ static_assert(Kokkos::Impl::MemorySpaceAccess::assignable); template <> struct MemorySpaceAccess { enum : bool { assignable = false }; -#if !defined(KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY) +#if !defined(KOKKOS_IMPL_HIP_UNIFIED_MEMORY) enum : bool{accessible = false}; #else enum : bool { accessible = true }; diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp index fb466d8a721f..1724b4361db8 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp @@ -183,7 +183,7 @@ class HIPTeamMember { typename Kokkos::Impl::FunctorAnalysis< FunctorPatternInterface::REDUCE, TeamPolicy, ReducerType, typename ReducerType::value_type>::Reducer wrapped_reducer(reducer); - hip_intra_block_shuffle_reduction(value, wrapped_reducer, blockDim.y); + impl_team_reduce(wrapped_reducer, value); reducer.reference() = value; #else (void)reducer; @@ -191,6 +191,19 @@ class HIPTeamMember { #endif } + template + KOKKOS_INLINE_FUNCTION std::enable_if_t::value> + impl_team_reduce( + WrappedReducerType const& wrapped_reducer, + typename WrappedReducerType::value_type& value) const noexcept { +#ifdef __HIP_DEVICE_COMPILE__ + hip_intra_block_shuffle_reduction(value, wrapped_reducer, blockDim.y); +#else + (void)wrapped_reducer; + (void)value; +#endif + } + //-------------------------------------------------------------------------- /** \brief Intra-team exclusive prefix sum with team_rank() ordering * with intra-team non-deterministic ordering accumulation. @@ -261,17 +274,37 @@ class HIPTeamMember { KOKKOS_INLINE_FUNCTION static std::enable_if_t::value> vector_reduce(ReducerType const& reducer, typename ReducerType::value_type& value) { +#ifdef __HIP_DEVICE_COMPILE__ + using value_type = typename ReducerType::value_type; + using wrapped_reducer_type = + typename Impl::FunctorAnalysis, ReducerType, + value_type>::Reducer; + + impl_vector_reduce(wrapped_reducer_type(reducer), value); + reducer.reference() = value; +#else + (void)reducer; + (void)value; +#endif + } + + template + KOKKOS_INLINE_FUNCTION static std::enable_if_t< + is_reducer::value> + impl_vector_reduce(WrappedReducerType const& wrapped_reducer, + typename WrappedReducerType::value_type& value) { #ifdef __HIP_DEVICE_COMPILE__ if (blockDim.x == 1) return; // Intra vector lane shuffle reduction: - typename ReducerType::value_type tmp(value); - typename ReducerType::value_type tmp2 = tmp; + typename WrappedReducerType::value_type tmp(value); + typename WrappedReducerType::value_type tmp2 = tmp; for (int i = blockDim.x; (i >>= 1);) { in_place_shfl_down(tmp2, tmp, i, blockDim.x); if (static_cast(threadIdx.x) < i) { - reducer.join(tmp, tmp2); + wrapped_reducer.join(&tmp, &tmp2); } } @@ -281,10 +314,9 @@ class HIPTeamMember { // and thus different threads could have different results. in_place_shfl(tmp2, tmp, 0, blockDim.x); - value = tmp2; - reducer.reference() = tmp2; + value = tmp2; #else - (void)reducer; + (void)wrapped_reducer; (void)value; #endif } @@ -479,15 +511,26 @@ parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< iType, Impl::HIPTeamMember>& loop_boundaries, const Closure& closure, const ReducerType& reducer) { #ifdef __HIP_DEVICE_COMPILE__ - typename ReducerType::value_type value; - reducer.init(value); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, ReducerType, + value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); + value_type value; + wrapped_reducer.init(&value); for (iType i = loop_boundaries.start + threadIdx.y; i < loop_boundaries.end; i += blockDim.y) { closure(i, value); } - loop_boundaries.member.team_reduce(reducer, value); + loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); + + wrapped_reducer.final(&value); + reducer.reference() = value; #else (void)loop_boundaries; (void)closure; @@ -508,24 +551,24 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value> parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< iType, Impl::HIPTeamMember>& loop_boundaries, const Closure& closure, ValueType& result) { -#ifdef __HIP_DEVICE_COMPILE__ - ValueType val; - Kokkos::Sum reducer(val); + KOKKOS_IF_ON_DEVICE( + (using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Closure, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; - reducer.init(reducer.reference()); + wrapped_reducer_type wrapped_reducer(closure); value_type value; + wrapped_reducer.init(&value); - for (iType i = loop_boundaries.start + threadIdx.y; i < loop_boundaries.end; - i += blockDim.y) { - closure(i, val); - } + for (iType i = loop_boundaries.start + threadIdx.y; + i < loop_boundaries.end; i += blockDim.y) { closure(i, value); } - loop_boundaries.member.team_reduce(reducer, val); - result = reducer.reference(); -#else - (void)loop_boundaries; - (void)closure; - (void)result; -#endif + loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); + + wrapped_reducer.final(&value); result = value;)) + KOKKOS_IF_ON_HOST(((void)loop_boundaries; (void)closure; (void)result;)) } /** \brief Inter-thread parallel exclusive prefix sum. @@ -620,16 +663,26 @@ parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< iType, Impl::HIPTeamMember>& loop_boundaries, const Closure& closure, const ReducerType& reducer) { #ifdef __HIP_DEVICE_COMPILE__ - typename ReducerType::value_type value; - reducer.init(value); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, ReducerType, + value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); + value_type value; + wrapped_reducer.init(&value); for (iType i = loop_boundaries.start + threadIdx.y * blockDim.x + threadIdx.x; i < loop_boundaries.end; i += blockDim.y * blockDim.x) { closure(i, value); } - loop_boundaries.member.vector_reduce(reducer, value); - loop_boundaries.member.team_reduce(reducer, value); + loop_boundaries.member.impl_vector_reduce(wrapped_reducer, value); + loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); + wrapped_reducer.final(&value); + reducer.reference() = value; #else (void)loop_boundaries; (void)closure; @@ -642,25 +695,27 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value> parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< iType, Impl::HIPTeamMember>& loop_boundaries, const Closure& closure, ValueType& result) { -#ifdef __HIP_DEVICE_COMPILE__ - ValueType val; - Kokkos::Sum reducer(val); - - reducer.init(reducer.reference()); - - for (iType i = loop_boundaries.start + threadIdx.y * blockDim.x + threadIdx.x; - i < loop_boundaries.end; i += blockDim.y * blockDim.x) { - closure(i, val); - } - - loop_boundaries.member.vector_reduce(reducer); - loop_boundaries.member.team_reduce(reducer); - result = reducer.reference(); -#else - (void)loop_boundaries; - (void)closure; - (void)result; -#endif + KOKKOS_IF_ON_DEVICE( + (using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Closure, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(closure); value_type value; + wrapped_reducer.init(&value); + + for (iType i = + loop_boundaries.start + threadIdx.y * blockDim.x + threadIdx.x; + i < loop_boundaries.end; + i += blockDim.y * blockDim.x) { closure(i, value); } + + loop_boundaries.member.impl_vector_reduce(wrapped_reducer, value); + loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); + wrapped_reducer.final(&value); result = value;)) + + KOKKOS_IF_ON_HOST(((void)loop_boundaries; (void)closure; (void)result;)) } //---------------------------------------------------------------------------- @@ -706,14 +761,26 @@ parallel_reduce(Impl::ThreadVectorRangeBoundariesStruct< iType, Impl::HIPTeamMember> const& loop_boundaries, Closure const& closure, ReducerType const& reducer) { #ifdef __HIP_DEVICE_COMPILE__ - reducer.init(reducer.reference()); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, ReducerType, + value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); + value_type value; + wrapped_reducer.init(&value); for (iType i = loop_boundaries.start + threadIdx.x; i < loop_boundaries.end; i += blockDim.x) { - closure(i, reducer.reference()); + closure(i, value); } - Impl::HIPTeamMember::vector_reduce(reducer); + Impl::HIPTeamMember::impl_vector_reduce(wrapped_reducer, value); + + wrapped_reducer.final(&value); + reducer.reference() = value; #else (void)loop_boundaries; (void)closure; @@ -737,20 +804,24 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value> parallel_reduce(Impl::ThreadVectorRangeBoundariesStruct< iType, Impl::HIPTeamMember> const& loop_boundaries, Closure const& closure, ValueType& result) { -#ifdef __HIP_DEVICE_COMPILE__ - result = ValueType(); + KOKKOS_IF_ON_DEVICE( + (using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Closure, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; - for (iType i = loop_boundaries.start + threadIdx.x; i < loop_boundaries.end; - i += blockDim.x) { - closure(i, result); - } + wrapped_reducer_type wrapped_reducer(closure); value_type value; + wrapped_reducer.init(&value); - Impl::HIPTeamMember::vector_reduce(Kokkos::Sum(result)); -#else - (void)loop_boundaries; - (void)closure; - (void)result; -#endif + for (iType i = loop_boundaries.start + threadIdx.x; + i < loop_boundaries.end; i += blockDim.x) { closure(i, value); } + + Impl::HIPTeamMember::impl_vector_reduce(wrapped_reducer, value); + wrapped_reducer.final(&value); result = value;)) + + KOKKOS_IF_ON_HOST(((void)loop_boundaries; (void)closure; (void)result;)) } //---------------------------------------------------------------------------- diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_TeamPolicyInternal.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_TeamPolicyInternal.hpp index 67e1181125c2..f21c65f16dd8 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_TeamPolicyInternal.hpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_TeamPolicyInternal.hpp @@ -222,7 +222,8 @@ class TeamPolicyInternal m_tune_team_size(bool(team_size_request <= 0)), m_tune_vector_length(bool(vector_length_request <= 0)) { // Make sure league size is permissible - if (league_size_ >= static_cast(hip_internal_maximum_grid_count()[0])) + const int max_grid_size_x = m_space.hip_device_prop().maxGridSize[0]; + if (league_size_ >= max_grid_size_x) Impl::throw_runtime_exception( "Requested too large league_size for TeamPolicy on HIP execution " "space."); diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Vectorization.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Vectorization.hpp index 30774c898b67..f5b1d321e8cf 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Vectorization.hpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Vectorization.hpp @@ -40,8 +40,8 @@ struct in_place_shfl_op { template // requires _assignable_from_bits __device__ inline std::enable_if_t operator()( - Scalar& out, Scalar const& in, int lane_or_delta, int width) const - noexcept { + Scalar& out, Scalar const& in, int lane_or_delta, + int width) const noexcept { using shfl_type = int; union conv_type { Scalar orig; @@ -65,16 +65,16 @@ struct in_place_shfl_op { template // requires _assignable_from_bits __device__ inline std::enable_if_t operator()( - Scalar& out, Scalar const& in, int lane_or_delta, int width) const - noexcept { + Scalar& out, Scalar const& in, int lane_or_delta, + int width) const noexcept { reinterpret_cast(out) = self().do_shfl_op( reinterpret_cast(in), lane_or_delta, width); } template __device__ inline std::enable_if_t - operator()(Scalar& out, Scalar const& in, int lane_or_delta, int width) const - noexcept { + operator()(Scalar& out, Scalar const& in, int lane_or_delta, + int width) const noexcept { reinterpret_cast(out) = self().do_shfl_op( *reinterpret_cast(&in), lane_or_delta, width); } @@ -82,8 +82,8 @@ struct in_place_shfl_op { // sizeof(Scalar) > sizeof(double) case template __device__ inline std::enable_if_t<(sizeof(Scalar) > sizeof(double))> - operator()(Scalar& out, const Scalar& val, int lane_or_delta, int width) const - noexcept { + operator()(Scalar& out, const Scalar& val, int lane_or_delta, + int width) const noexcept { using shuffle_as_t = int; constexpr int N = sizeof(Scalar) / sizeof(shuffle_as_t); @@ -108,7 +108,7 @@ struct in_place_shfl_fn : in_place_shfl_op { template __device__ KOKKOS_IMPL_FORCEINLINE void in_place_shfl(Args&&... args) noexcept { - in_place_shfl_fn{}((Args &&) args...); + in_place_shfl_fn{}((Args&&)args...); } struct in_place_shfl_up_fn : in_place_shfl_op { @@ -123,7 +123,7 @@ struct in_place_shfl_up_fn : in_place_shfl_op { template __device__ KOKKOS_IMPL_FORCEINLINE void in_place_shfl_up( Args&&... args) noexcept { - in_place_shfl_up_fn{}((Args &&) args...); + in_place_shfl_up_fn{}((Args&&)args...); } struct in_place_shfl_down_fn : in_place_shfl_op { @@ -138,7 +138,7 @@ struct in_place_shfl_down_fn : in_place_shfl_op { template __device__ KOKKOS_IMPL_FORCEINLINE void in_place_shfl_down( Args&&... args) noexcept { - in_place_shfl_down_fn{}((Args &&) args...); + in_place_shfl_down_fn{}((Args&&)args...); } } // namespace Impl diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_ZeroMemset.cpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_ZeroMemset.cpp new file mode 100644 index 000000000000..34d5ecf1a657 --- /dev/null +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_ZeroMemset.cpp @@ -0,0 +1,36 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE +#define KOKKOS_IMPL_PUBLIC_INCLUDE +#endif + +#include +#include + +namespace Kokkos { +namespace Impl { + +// alternative to hipMemsetAsync, which sets the first `cnt` bytes of `dst` to 0 +void zero_with_hip_kernel(const HIP& exec_space, void* dst, size_t cnt) { + Kokkos::parallel_for( + "Kokkos::ZeroMemset via parallel_for", + Kokkos::RangePolicy(exec_space, 0, cnt), + KOKKOS_LAMBDA(size_t i) { static_cast(dst)[i] = 0; }); +} + +} // namespace Impl +} // namespace Kokkos diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_ZeroMemset.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_ZeroMemset.hpp index 4bca29868f78..18708cf8c566 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_ZeroMemset.hpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_ZeroMemset.hpp @@ -23,12 +23,21 @@ namespace Kokkos { namespace Impl { -template -struct ZeroMemset> { - ZeroMemset(const HIP& exec_space, const View& dst) { - KOKKOS_IMPL_HIP_SAFE_CALL(hipMemsetAsync( - dst.data(), 0, dst.size() * sizeof(typename View::value_type), - exec_space.hip_stream())); +// hipMemsetAsync sets the first `cnt` bytes of `dst` to the provided value +void zero_with_hip_kernel(const HIP& exec_space, void* dst, size_t cnt); + +template <> +struct ZeroMemset { + ZeroMemset(const HIP& exec_space, void* dst, size_t cnt) { + // in ROCm <= 6.2.0, hipMemsetAsync on a host-allocated pointer + // returns an invalid value error, but accessing the data via a + // GPU kernel works. +#if defined(KOKKOS_IMPL_HIP_UNIFIED_MEMORY) + zero_with_hip_kernel(exec_space, dst, cnt); +#else + KOKKOS_IMPL_HIP_SAFE_CALL( + hipMemsetAsync(dst, 0, cnt, exec_space.hip_stream())); +#endif } }; diff --git a/packages/kokkos/core/src/HPX/Kokkos_HPX.hpp b/packages/kokkos/core/src/HPX/Kokkos_HPX.hpp index 245dc128ca86..7d4993379087 100644 --- a/packages/kokkos/core/src/HPX/Kokkos_HPX.hpp +++ b/packages/kokkos/core/src/HPX/Kokkos_HPX.hpp @@ -32,12 +32,10 @@ static_assert(false, #include #include #include -#include #include #include #include #include -#include #include #include @@ -75,12 +73,12 @@ class hpx_thread_buffer { } public: - hpx_thread_buffer() = default; - ~hpx_thread_buffer() = default; - hpx_thread_buffer(const hpx_thread_buffer &) = delete; - hpx_thread_buffer(hpx_thread_buffer &&) = delete; + hpx_thread_buffer() = default; + ~hpx_thread_buffer() = default; + hpx_thread_buffer(const hpx_thread_buffer &) = delete; + hpx_thread_buffer(hpx_thread_buffer &&) = delete; hpx_thread_buffer &operator=(const hpx_thread_buffer &) = delete; - hpx_thread_buffer &operator=(hpx_thread_buffer) = delete; + hpx_thread_buffer &operator=(hpx_thread_buffer) = delete; void resize(const std::size_t num_threads, const std::size_t size_per_thread, const std::size_t extra_space = 0) noexcept; @@ -140,10 +138,10 @@ class HPX { hpx::execution::experimental::unique_any_sender<> &&sender) : m_instance_id(instance_id), m_sender{std::move(sender)} {} - instance_data(const instance_data &) = delete; - instance_data(instance_data &&) = delete; + instance_data(const instance_data &) = delete; + instance_data(instance_data &&) = delete; instance_data &operator=(const instance_data &) = delete; - instance_data &operator=(instance_data) = delete; + instance_data &operator=(instance_data) = delete; uint32_t m_instance_id{HPX::impl_default_instance_id()}; hpx::execution::experimental::unique_any_sender<> m_sender{ @@ -196,7 +194,7 @@ class HPX { HPX(HPX &&other) = default; HPX(const HPX &other) = default; - HPX &operator=(HPX &&) = default; + HPX &operator=(HPX &&) = default; HPX &operator=(const HPX &) = default; void print_configuration(std::ostream &os, bool /*verbose*/ = false) const; @@ -214,9 +212,9 @@ class HPX { struct impl_in_parallel_scope { impl_in_parallel_scope() noexcept; ~impl_in_parallel_scope() noexcept; - impl_in_parallel_scope(impl_in_parallel_scope &&) = delete; - impl_in_parallel_scope(impl_in_parallel_scope const &) = delete; - impl_in_parallel_scope &operator=(impl_in_parallel_scope &&) = delete; + impl_in_parallel_scope(impl_in_parallel_scope &&) = delete; + impl_in_parallel_scope(impl_in_parallel_scope const &) = delete; + impl_in_parallel_scope &operator=(impl_in_parallel_scope &&) = delete; impl_in_parallel_scope &operator=(impl_in_parallel_scope const &) = delete; }; @@ -249,13 +247,15 @@ class HPX { impl_instance_fence(name); } - static bool is_asynchronous(HPX const & = HPX()) noexcept { +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_DEPRECATED static bool is_asynchronous(HPX const & = HPX()) noexcept { #if defined(KOKKOS_ENABLE_IMPL_HPX_ASYNC_DISPATCH) return true; #else return false; #endif } +#endif #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 static int concurrency(); @@ -281,8 +281,8 @@ class HPX { return impl_get_instance_data().m_buffer; } - hpx::execution::experimental::unique_any_sender<> &impl_get_sender() const - noexcept { + hpx::execution::experimental::unique_any_sender<> &impl_get_sender() + const noexcept { return impl_get_instance_data().m_sender; } @@ -447,6 +447,20 @@ class HPX { } }; +template +std::vector partition_space(HPX const &, Args... args) { + std::vector instances(sizeof...(args)); + for (auto &in : instances) in = HPX(HPX::instance_mode::independent); + return instances; +} + +template +std::vector partition_space(HPX const &, std::vector const &weights) { + std::vector instances(weights.size()); + for (auto &in : instances) in = HPX(HPX::instance_mode::independent); + return instances; +} + extern template void HPX::impl_bulk_plain_erased( bool, bool, std::function &&, int const, hpx::threads::thread_stacksize stacksize) const; @@ -1772,11 +1786,24 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( const Impl::TeamThreadRangeBoundariesStruct &loop_boundaries, const Lambda &lambda, ValueType &result) { - result = ValueType(); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Lambda, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(lambda); + value_type value; + wrapped_reducer.init(&value); + for (iType i = loop_boundaries.start; i < loop_boundaries.end; i += loop_boundaries.increment) { - lambda(i, result); + lambda(i, value); } + + wrapped_reducer.final(&value); + result = value; } /** \brief Intra-thread vector parallel_for. Executes lambda(iType i) for each @@ -1810,14 +1837,26 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( const Impl::ThreadVectorRangeBoundariesStruct &loop_boundaries, const Lambda &lambda, ValueType &result) { - result = ValueType(); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Lambda, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(lambda); + value_type value; + wrapped_reducer.init(&value); + #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep #endif for (iType i = loop_boundaries.start; i < loop_boundaries.end; i += loop_boundaries.increment) { - lambda(i, result); + lambda(i, value); } + wrapped_reducer.final(&value); + result = value; } template &loop_boundaries, const Lambda &lambda, const ReducerType &reducer) { - reducer.init(reducer.reference()); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, ReducerType, + value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); + value_type value; + wrapped_reducer.init(&value); + for (iType i = loop_boundaries.start; i < loop_boundaries.end; i += loop_boundaries.increment) { - lambda(i, reducer.reference()); + lambda(i, value); } + + wrapped_reducer.final(&value); + reducer.reference() = value; } template &loop_boundaries, const Lambda &lambda, const ReducerType &reducer) { - reducer.init(reducer.reference()); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, ReducerType, + value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); + value_type value; + wrapped_reducer.init(&value); + #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep #endif for (iType i = loop_boundaries.start; i < loop_boundaries.end; i += loop_boundaries.increment) { - lambda(i, reducer.reference()); + lambda(i, value); } + + wrapped_reducer.final(&value); + reducer.reference() = value; } template @@ -1995,7 +2060,9 @@ KOKKOS_INLINE_FUNCTION void single( } // namespace Kokkos +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 #include +#endif #endif /* #if defined( KOKKOS_ENABLE_HPX ) */ #endif /* #ifndef KOKKOS_HPX_HPP */ diff --git a/packages/kokkos/core/src/HPX/Kokkos_HPX_Task.hpp b/packages/kokkos/core/src/HPX/Kokkos_HPX_Task.hpp index 28c75b2515ae..d775b7fac3b7 100644 --- a/packages/kokkos/core/src/HPX/Kokkos_HPX_Task.hpp +++ b/packages/kokkos/core/src/HPX/Kokkos_HPX_Task.hpp @@ -25,6 +25,8 @@ #include +#include + #include #include @@ -33,6 +35,11 @@ //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +// We allow using deprecated classes in this file +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() +#endif + namespace Kokkos { namespace Impl { @@ -256,6 +263,10 @@ extern template class TaskQueue< } // namespace Impl } // namespace Kokkos +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() +#endif + //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- diff --git a/packages/kokkos/core/src/KokkosExp_MDRangePolicy.hpp b/packages/kokkos/core/src/KokkosExp_MDRangePolicy.hpp index 297b1fadee94..92dc506c5e9d 100644 --- a/packages/kokkos/core/src/KokkosExp_MDRangePolicy.hpp +++ b/packages/kokkos/core/src/KokkosExp_MDRangePolicy.hpp @@ -30,6 +30,7 @@ static_assert(false, #include #include #include +#include namespace Kokkos { @@ -60,13 +61,13 @@ namespace Impl { // NOTE the comparison below is encapsulated to silent warnings about pointless // comparison of unsigned integer with zero template -constexpr std::enable_if_t::value, bool> +constexpr std::enable_if_t, bool> is_less_than_value_initialized_variable(T) { return false; } template -constexpr std::enable_if_t::value, bool> +constexpr std::enable_if_t, bool> is_less_than_value_initialized_variable(T arg) { return arg < T{}; } @@ -75,7 +76,7 @@ is_less_than_value_initialized_variable(T arg) { template constexpr To checked_narrow_cast(From arg, std::size_t idx) { constexpr const bool is_different_signedness = - (std::is_signed::value != std::is_signed::value); + (std::is_signed_v != std::is_signed_v); auto const ret = static_cast(arg); if (static_cast(ret) != arg || (is_different_signedness && @@ -183,7 +184,7 @@ struct MDRangePolicy template friend struct MDRangePolicy; - static_assert(!std::is_void::value, + static_assert(!std::is_void_v, "Kokkos Error: MD iteration pattern not defined"); using iteration_pattern = typename traits::iteration_pattern; @@ -238,9 +239,9 @@ struct MDRangePolicy template ::value && - std::is_integral::value && - std::is_integral::value>> + typename = std::enable_if_t && + std::is_integral_v && + std::is_integral_v>> MDRangePolicy(const LT (&lower)[LN], const UT (&upper)[UN], const TT (&tile)[TN] = {}) : MDRangePolicy( @@ -257,9 +258,9 @@ struct MDRangePolicy template ::value && - std::is_integral::value && - std::is_integral::value>> + typename = std::enable_if_t && + std::is_integral_v && + std::is_integral_v>> MDRangePolicy(const typename traits::execution_space& work_space, const LT (&lower)[LN], const UT (&upper)[UN], const TT (&tile)[TN] = {}) @@ -291,14 +292,14 @@ struct MDRangePolicy } template ::value>> + typename = std::enable_if_t>> MDRangePolicy(Kokkos::Array const& lower, Kokkos::Array const& upper, Kokkos::Array const& tile = Kokkos::Array{}) : MDRangePolicy(typename traits::execution_space(), lower, upper, tile) {} template ::value>> + typename = std::enable_if_t>> MDRangePolicy(const typename traits::execution_space& work_space, Kokkos::Array const& lower, Kokkos::Array const& upper, @@ -330,7 +331,44 @@ struct MDRangePolicy } bool impl_tune_tile_size() const { return m_tune_tile_size; } + tile_type tile_size_recommended() const { + tile_type rec_tile_sizes = {}; + + for (std::size_t i = 0; i < rec_tile_sizes.size(); ++i) { + rec_tile_sizes[i] = tile_size_recommended(i); + } + return rec_tile_sizes; + } + + int max_total_tile_size() const { + return Impl::get_tile_size_properties(m_space).max_total_tile_size; + } + private: + int tile_size_recommended(const int tile_rank) const { + auto properties = Impl::get_tile_size_properties(m_space); + int last_rank = (inner_direction == Iterate::Right) ? rank - 1 : 0; + int rank_acc = + (inner_direction == Iterate::Right) ? tile_rank + 1 : tile_rank - 1; + int rec_tile_size = (std::pow(properties.default_tile_size, rank_acc) < + properties.max_total_tile_size) + ? properties.default_tile_size + : 1; + + if (tile_rank == last_rank) { + rec_tile_size = tile_size_last_rank( + properties, m_upper[last_rank] - m_lower[last_rank]); + } + return rec_tile_size; + } + + int tile_size_last_rank(const Impl::TileSizeProperties properties, + const index_type length) const { + return properties.default_largest_tile_size == 0 + ? std::max(length, 1) + : properties.default_largest_tile_size; + } + void init_helper(Impl::TileSizeProperties properties) { m_prod_tile_dims = 1; int increment = 1; @@ -341,6 +379,7 @@ struct MDRangePolicy rank_start = rank - 1; rank_end = -1; } + for (int i = rank_start; i != rank_end; i += increment) { const index_type length = m_upper[i] - m_lower[i]; @@ -368,9 +407,7 @@ struct MDRangePolicy m_tile[i] = 1; } } else { - m_tile[i] = properties.default_largest_tile_size == 0 - ? std::max(length, 1) - : properties.default_largest_tile_size; + m_tile[i] = tile_size_last_rank(properties, length); } } m_tile_end[i] = @@ -389,58 +426,55 @@ struct MDRangePolicy }; template -MDRangePolicy(const LT (&)[N], const UT (&)[N])->MDRangePolicy>; +MDRangePolicy(const LT (&)[N], const UT (&)[N]) -> MDRangePolicy>; template MDRangePolicy(const LT (&)[N], const UT (&)[N], const TT (&)[TN]) - ->MDRangePolicy>; + -> MDRangePolicy>; template MDRangePolicy(DefaultExecutionSpace const&, const LT (&)[N], const UT (&)[N]) - ->MDRangePolicy>; + -> MDRangePolicy>; template MDRangePolicy(DefaultExecutionSpace const&, const LT (&)[N], const UT (&)[N], - const TT (&)[TN]) - ->MDRangePolicy>; + const TT (&)[TN]) -> MDRangePolicy>; template >> MDRangePolicy(ES const&, const LT (&)[N], const UT (&)[N]) - ->MDRangePolicy>; + -> MDRangePolicy>; template >> MDRangePolicy(ES const&, const LT (&)[N], const UT (&)[N], const TT (&)[TN]) - ->MDRangePolicy>; + -> MDRangePolicy>; template -MDRangePolicy(Array const&, Array const&)->MDRangePolicy>; +MDRangePolicy(Array const&, Array const&) -> MDRangePolicy>; template MDRangePolicy(Array const&, Array const&, Array const&) - ->MDRangePolicy>; + -> MDRangePolicy>; template MDRangePolicy(DefaultExecutionSpace const&, Array const&, - Array const&) - ->MDRangePolicy>; + Array const&) -> MDRangePolicy>; template MDRangePolicy(DefaultExecutionSpace const&, Array const&, Array const&, Array const&) - ->MDRangePolicy>; + -> MDRangePolicy>; template >> MDRangePolicy(ES const&, Array const&, Array const&) - ->MDRangePolicy>; + -> MDRangePolicy>; template >> MDRangePolicy(ES const&, Array const&, Array const&, - Array const&) - ->MDRangePolicy>; + Array const&) -> MDRangePolicy>; } // namespace Kokkos diff --git a/packages/kokkos/core/src/Kokkos_AnonymousSpace.hpp b/packages/kokkos/core/src/Kokkos_AnonymousSpace.hpp index 9f5deed5d66f..62f527aa025c 100644 --- a/packages/kokkos/core/src/Kokkos_AnonymousSpace.hpp +++ b/packages/kokkos/core/src/Kokkos_AnonymousSpace.hpp @@ -41,10 +41,10 @@ class AnonymousSpace { using device_type = Kokkos::Device; /**\brief Default memory space instance */ - AnonymousSpace() = default; - AnonymousSpace(AnonymousSpace &&rhs) = default; - AnonymousSpace(const AnonymousSpace &rhs) = default; - AnonymousSpace &operator=(AnonymousSpace &&) = default; + AnonymousSpace() = default; + AnonymousSpace(AnonymousSpace &&rhs) = default; + AnonymousSpace(const AnonymousSpace &rhs) = default; + AnonymousSpace &operator=(AnonymousSpace &&) = default; AnonymousSpace &operator=(const AnonymousSpace &) = default; ~AnonymousSpace() = default; diff --git a/packages/kokkos/core/src/Kokkos_Array.hpp b/packages/kokkos/core/src/Kokkos_Array.hpp index 4d905fbc5538..493536b53bed 100644 --- a/packages/kokkos/core/src/Kokkos_Array.hpp +++ b/packages/kokkos/core/src/Kokkos_Array.hpp @@ -35,7 +35,7 @@ namespace Kokkos { #ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK namespace Impl { -template ::value> +template > struct ArrayBoundsCheck; template @@ -195,8 +195,10 @@ struct Array { return *reinterpret_cast(-1); } - KOKKOS_INLINE_FUNCTION pointer data() { return nullptr; } - KOKKOS_INLINE_FUNCTION const_pointer data() const { return nullptr; } + KOKKOS_INLINE_FUNCTION constexpr pointer data() { return nullptr; } + KOKKOS_INLINE_FUNCTION constexpr const_pointer data() const { + return nullptr; + } friend KOKKOS_FUNCTION constexpr bool operator==(Array const&, Array const&) noexcept { @@ -365,7 +367,7 @@ struct KOKKOS_DEPRECATED #endif template -Array(T, Us...)->Array; +Array(T, Us...) -> Array; namespace Impl { @@ -377,7 +379,7 @@ KOKKOS_FUNCTION constexpr Array, N> to_array_impl( template KOKKOS_FUNCTION constexpr Array, N> to_array_impl( - T(&&a)[N], std::index_sequence) { + T (&&a)[N], std::index_sequence) { return {{std::move(a[I])...}}; } @@ -389,7 +391,7 @@ KOKKOS_FUNCTION constexpr auto to_array(T (&a)[N]) { } template -KOKKOS_FUNCTION constexpr auto to_array(T(&&a)[N]) { +KOKKOS_FUNCTION constexpr auto to_array(T (&&a)[N]) { return Impl::to_array_impl(std::move(a), std::make_index_sequence{}); } @@ -435,6 +437,32 @@ KOKKOS_FUNCTION constexpr T const&& get(Array const&& a) noexcept { } // namespace Kokkos // +// +namespace Kokkos { + +template +KOKKOS_FUNCTION constexpr T const* begin(Array const& a) noexcept { + return a.data(); +} + +template +KOKKOS_FUNCTION constexpr T* begin(Array& a) noexcept { + return a.data(); +} + +template +KOKKOS_FUNCTION constexpr T const* end(Array const& a) noexcept { + return a.data() + a.size(); +} + +template +KOKKOS_FUNCTION constexpr T* end(Array& a) noexcept { + return a.data() + a.size(); +} + +} // namespace Kokkos +// + #ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_ARRAY #undef KOKKOS_IMPL_PUBLIC_INCLUDE #undef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_ARRAY diff --git a/packages/kokkos/core/src/Kokkos_Atomic.hpp b/packages/kokkos/core/src/Kokkos_Atomic.hpp index 6fc903f27434..ba6113609229 100644 --- a/packages/kokkos/core/src/Kokkos_Atomic.hpp +++ b/packages/kokkos/core/src/Kokkos_Atomic.hpp @@ -47,7 +47,6 @@ #include #include -#include #ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_ATOMIC #undef KOKKOS_IMPL_PUBLIC_INCLUDE diff --git a/packages/kokkos/core/src/Kokkos_Atomics_Desul_Volatile_Wrapper.hpp b/packages/kokkos/core/src/Kokkos_Atomics_Desul_Volatile_Wrapper.hpp deleted file mode 100644 index bf57dcae650e..000000000000 --- a/packages/kokkos/core/src/Kokkos_Atomics_Desul_Volatile_Wrapper.hpp +++ /dev/null @@ -1,196 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE -#include -static_assert(false, - "Including non-public Kokkos header files is not allowed."); -#endif -#ifndef KOKKOS_DESUL_ATOMICS_VOLATILE_WRAPPER_HPP_ -#define KOKKOS_DESUL_ATOMICS_VOLATILE_WRAPPER_HPP_ -#include -#include - -#ifdef KOKKOS_ENABLE_ATOMICS_BYPASS -#define KOKKOS_DESUL_MEM_SCOPE desul::MemoryScopeCaller() -#else -#define KOKKOS_DESUL_MEM_SCOPE desul::MemoryScopeDevice() -#endif - -// clang-format off -namespace Kokkos { - -template KOKKOS_INLINE_FUNCTION -T atomic_load(volatile T* const dest) { return desul::atomic_load(const_cast(dest), desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_store(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_store(const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -// atomic_fetch_op -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_add (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_add (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_sub (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_sub (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_max (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_max (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_min (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_min (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_mul (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_mul (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_div (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_div (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_mod (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_mod (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_and (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_and (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_or (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_or (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_xor (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_xor (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_nand(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_nand(const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_lshift(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_lshift(const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_rshift(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_rshift(const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_inc(volatile T* const dest) { return desul::atomic_fetch_inc(const_cast(dest),desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_dec(volatile T* const dest) { return desul::atomic_fetch_dec(const_cast(dest),desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - - -// atomic_op_fetch -template KOKKOS_INLINE_FUNCTION -T atomic_add_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_add_fetch (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_sub_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_sub_fetch (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_max_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_max_fetch (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_min_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_min_fetch (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_mul_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_mul_fetch (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_div_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_div_fetch (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_mod_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_mod_fetch (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_and_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_and_fetch (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_or_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_or_fetch (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_xor_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_xor_fetch (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_nand_fetch(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_nand_fetch(const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_lshift_fetch(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_lshift_fetch(const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_rshift_fetch(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_rshift_fetch(const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_inc_fetch(volatile T* const dest) { return desul::atomic_inc_fetch(const_cast(dest),desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_dec_fetch(volatile T* const dest) { return desul::atomic_dec_fetch(const_cast(dest),desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - - -// atomic_op -template KOKKOS_INLINE_FUNCTION -void atomic_add(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_add (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_sub(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_sub (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_mul(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_mul (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_div(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_div (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_min(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_min (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_max(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_max (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -// FIXME: Desul doesn't have atomic_and yet so call fetch_and -template KOKKOS_INLINE_FUNCTION -void atomic_and(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { (void) desul::atomic_fetch_and (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -// FIXME: Desul doesn't have atomic_or yet so call fetch_or -template KOKKOS_INLINE_FUNCTION -void atomic_or (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { (void) desul::atomic_fetch_or (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_inc(volatile T* const dest) { return desul::atomic_inc(const_cast(dest),desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_dec(volatile T* const dest) { return desul::atomic_dec(const_cast(dest),desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_increment(volatile T* const dest) { return desul::atomic_inc(const_cast(dest),desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_decrement(volatile T* const dest) { return desul::atomic_dec(const_cast(dest),desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -// Exchange - -template KOKKOS_INLINE_FUNCTION -T atomic_exchange(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_exchange(const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -bool atomic_compare_exchange_strong(volatile T* const dest, T& expected, const T desired) { - return desul::atomic_compare_exchange_strong(const_cast(dest),expected, desired, - desul::MemoryOrderRelaxed(), desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); -} - -template KOKKOS_INLINE_FUNCTION -T atomic_compare_exchange(volatile T* const dest, const T compare, const T desired) { - return desul::atomic_compare_exchange(const_cast(dest),compare, desired, - desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); -} - -} -#undef KOKKOS_DESUL_MEM_SCOPE - -// clang-format on -#endif diff --git a/packages/kokkos/core/src/Kokkos_Atomics_Desul_Wrapper.hpp b/packages/kokkos/core/src/Kokkos_Atomics_Desul_Wrapper.hpp index 26db69ac1f11..40f51c5a3340 100644 --- a/packages/kokkos/core/src/Kokkos_Atomics_Desul_Wrapper.hpp +++ b/packages/kokkos/core/src/Kokkos_Atomics_Desul_Wrapper.hpp @@ -24,14 +24,16 @@ static_assert(false, #include #include +#include // identity_type #include -// clang-format off namespace Kokkos { -// FIXME: These functions don't have any use/test in unit tests ... -// ========================================================== -inline const char* atomic_query_version() { return "KOKKOS_DESUL_ATOMICS"; } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +KOKKOS_DEPRECATED inline const char* atomic_query_version() { + return "KOKKOS_DESUL_ATOMICS"; +} +#endif #if defined(KOKKOS_COMPILER_GNU) && !defined(__PGIC__) && \ !defined(__CUDA_ARCH__) @@ -53,197 +55,120 @@ inline const char* atomic_query_version() { return "KOKKOS_DESUL_ATOMICS"; } #define KOKKOS_DESUL_MEM_SCOPE desul::MemoryScopeDevice() #endif -template KOKKOS_INLINE_FUNCTION -T atomic_load(T* const dest) { return desul::atomic_load(dest, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_store(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_store(dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_assign(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { atomic_store(dest,val); } +namespace Impl { +template +using not_deduced_atomic_t = + std::add_const_t>>; + +template +using enable_if_atomic_t = + std::enable_if_t && !std::is_const_v, + std::remove_volatile_t>; +} // namespace Impl -KOKKOS_INLINE_FUNCTION -void memory_fence() { - desul::atomic_thread_fence(desul::MemoryOrderSeqCst(), KOKKOS_DESUL_MEM_SCOPE); -} +// clang-format off -KOKKOS_INLINE_FUNCTION -void load_fence() { return desul::atomic_thread_fence(desul::MemoryOrderAcquire(), KOKKOS_DESUL_MEM_SCOPE); } +// fences +KOKKOS_INLINE_FUNCTION void memory_fence() { desul::atomic_thread_fence(desul::MemoryOrderSeqCst(), KOKKOS_DESUL_MEM_SCOPE); } +KOKKOS_INLINE_FUNCTION void load_fence() { desul::atomic_thread_fence(desul::MemoryOrderAcquire(), KOKKOS_DESUL_MEM_SCOPE); } +KOKKOS_INLINE_FUNCTION void store_fence() { desul::atomic_thread_fence(desul::MemoryOrderRelease(), KOKKOS_DESUL_MEM_SCOPE); } -KOKKOS_INLINE_FUNCTION -void store_fence() { return desul::atomic_thread_fence(desul::MemoryOrderRelease(), KOKKOS_DESUL_MEM_SCOPE); } +// load/store +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_load (T const* ptr) { return desul::atomic_load (const_cast*>(ptr), desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_store(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_store(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +template KOKKOS_DEPRECATED_WITH_COMMENT("Use atomic_store() instead!") KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_assign(T* ptr, Impl::not_deduced_atomic_t val) { atomic_store(ptr, val); } +#endif // atomic_fetch_op -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_add (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_add (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_sub (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_sub (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_max (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_max (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_min (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_min (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_mul (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_mul (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_div (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_div (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_mod (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_mod (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_and (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_and (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_or (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_or (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_xor (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_xor (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_nand(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_nand(dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_lshift(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_lshift(dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_rshift(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_rshift(dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_inc(T* const dest) { return desul::atomic_fetch_inc(dest, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_dec(T* const dest) { return desul::atomic_fetch_dec(dest, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_add(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_fetch_add(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_sub(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_fetch_sub(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_max(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_fetch_max(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_min(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_fetch_min(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_mul(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_fetch_mul(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_div(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_fetch_div(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_mod(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_fetch_mod(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_and(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_fetch_and(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_or (T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_fetch_or (const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_xor(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_fetch_xor(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_nand(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_fetch_nand(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_lshift(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_fetch_lshift(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_rshift(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_fetch_rshift(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_inc(T* ptr) { return desul::atomic_fetch_inc(const_cast*>(ptr), desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_dec(T* ptr) { return desul::atomic_fetch_dec(const_cast*>(ptr), desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } // atomic_op_fetch -template KOKKOS_INLINE_FUNCTION -T atomic_add_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_add_fetch (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_sub_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_sub_fetch (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_max_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_max_fetch (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_min_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_min_fetch (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_mul_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_mul_fetch (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_div_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_div_fetch (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_mod_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_mod_fetch (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_and_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_and_fetch (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_or_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_or_fetch (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_xor_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_xor_fetch (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_nand_fetch(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_nand_fetch(dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_lshift_fetch(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_lshift_fetch(dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_rshift_fetch(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_rshift_fetch(dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_inc_fetch(T* const dest) { return desul::atomic_inc_fetch(dest, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_dec_fetch(T* const dest) { return desul::atomic_dec_fetch(dest, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_add_fetch(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_add_fetch(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_sub_fetch(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_sub_fetch(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_max_fetch(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_max_fetch(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_min_fetch(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_min_fetch(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_mul_fetch(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_mul_fetch(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_div_fetch(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_div_fetch(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_mod_fetch(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_mod_fetch(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_and_fetch(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_and_fetch(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_or_fetch (T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_or_fetch (const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_xor_fetch(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_xor_fetch(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_nand_fetch(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_nand_fetch(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_lshift_fetch(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_lshift_fetch(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_rshift_fetch(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_rshift_fetch(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_inc_fetch(T* ptr) { return desul::atomic_inc_fetch(const_cast*>(ptr), desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_dec_fetch(T* ptr) { return desul::atomic_dec_fetch(const_cast*>(ptr), desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } // atomic_op -template KOKKOS_INLINE_FUNCTION -void atomic_add(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_add (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_sub(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_sub (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_mul(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_mul (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_div(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_div (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_min(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_min (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_max(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_max (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -// FIXME: Desul doesn't have atomic_and yet so call fetch_and -template KOKKOS_INLINE_FUNCTION -void atomic_and(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { (void) desul::atomic_fetch_and (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -// FIXME: Desul doesn't have atomic_or yet so call fetch_or -template KOKKOS_INLINE_FUNCTION -void atomic_or(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { (void) desul::atomic_fetch_or (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_inc(T* const dest) { return desul::atomic_inc(dest, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_dec(T* const dest) { return desul::atomic_dec(dest, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_increment(T* const dest) { return desul::atomic_inc(dest, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_add(T* ptr, Impl::not_deduced_atomic_t val) { desul::atomic_add(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_sub(T* ptr, Impl::not_deduced_atomic_t val) { desul::atomic_sub(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_max(T* ptr, Impl::not_deduced_atomic_t val) { desul::atomic_max(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_min(T* ptr, Impl::not_deduced_atomic_t val) { desul::atomic_min(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_mul(T* ptr, Impl::not_deduced_atomic_t val) { desul::atomic_mul(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_div(T* ptr, Impl::not_deduced_atomic_t val) { desul::atomic_div(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_mod(T* ptr, Impl::not_deduced_atomic_t val) { (void)desul::atomic_fetch_mod(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_and(T* ptr, Impl::not_deduced_atomic_t val) { (void)desul::atomic_fetch_and(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_or (T* ptr, Impl::not_deduced_atomic_t val) { (void)desul::atomic_fetch_or (const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_xor(T* ptr, Impl::not_deduced_atomic_t val) { (void)desul::atomic_fetch_xor(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_nand(T* ptr, Impl::not_deduced_atomic_t val) { (void)desul::atomic_nand_fetch(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_lshift(T* ptr, Impl::not_deduced_atomic_t val) { (void)desul::atomic_fetch_lshift(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_rshift(T* ptr, Impl::not_deduced_atomic_t val) { (void)desul::atomic_fetch_rshift(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_inc(T* ptr) { desul::atomic_inc(const_cast*>(ptr), desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_dec(T* ptr) { desul::atomic_dec(const_cast*>(ptr), desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +template KOKKOS_DEPRECATED_WITH_COMMENT("Use atomic_inc() instead!") KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_increment(T* ptr) { atomic_inc(ptr); } +template KOKKOS_DEPRECATED_WITH_COMMENT("Use atomic_dec() instead!") KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_decrement(T* ptr) { atomic_dec(ptr); } +#endif -template KOKKOS_INLINE_FUNCTION -void atomic_decrement(T* const dest) { return desul::atomic_dec(dest, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +// exchange +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_exchange (T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_exchange (const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_compare_exchange(T* ptr, Impl::not_deduced_atomic_t expected, Impl::not_deduced_atomic_t desired) { return desul::atomic_compare_exchange(const_cast*>(ptr), expected, desired, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +template KOKKOS_DEPRECATED_WITH_COMMENT("Use atomic_compare_exchange() instead!") KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_compare_exchange_strong(T* ptr, Impl::not_deduced_atomic_t expected, Impl::not_deduced_atomic_t desired) { return expected == atomic_compare_exchange(ptr, expected, desired); } +#endif -// Exchange +// clang-format on +} // namespace Kokkos -template KOKKOS_INLINE_FUNCTION -T atomic_exchange(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_exchange(dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +namespace Kokkos::Impl { -template KOKKOS_INLINE_FUNCTION -bool atomic_compare_exchange_strong(T* const dest, desul::Impl::dont_deduce_this_parameter_t expected, desul::Impl::dont_deduce_this_parameter_t desired) { - T expected_ref = expected; - return desul::atomic_compare_exchange_strong(dest, expected_ref, desired, - desul::MemoryOrderRelaxed(), desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); +template +KOKKOS_FUNCTION bool atomic_compare_exchange_strong(T* const dest, T& expected, + const T desired, + MemOrderSuccess succ, + MemOrderFailure fail) { + return desul::atomic_compare_exchange_strong(dest, expected, desired, succ, + fail, KOKKOS_DESUL_MEM_SCOPE); } -template KOKKOS_INLINE_FUNCTION -T atomic_compare_exchange(T* const dest, desul::Impl::dont_deduce_this_parameter_t compare, desul::Impl::dont_deduce_this_parameter_t desired) { - return desul::atomic_compare_exchange(dest, compare, desired, - desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); +template +KOKKOS_FUNCTION T atomic_load(const T* const src, MemoryOrder order) { + return desul::atomic_load(src, order, KOKKOS_DESUL_MEM_SCOPE); } -namespace Impl { - template KOKKOS_INLINE_FUNCTION - bool atomic_compare_exchange_strong(T* const dest, T& expected, const T desired, MemOrderSuccess succ, MemOrderFailure fail) { - return desul::atomic_compare_exchange_strong(dest, expected, desired, succ, fail, KOKKOS_DESUL_MEM_SCOPE); - } - template - KOKKOS_INLINE_FUNCTION - T atomic_load(const T* const src, MemoryOrder order) { - return desul::atomic_load(src, order, KOKKOS_DESUL_MEM_SCOPE); - } - template - KOKKOS_INLINE_FUNCTION - void atomic_store(T* const src, const T val, MemoryOrder order) { - return desul::atomic_store(src, val, order, KOKKOS_DESUL_MEM_SCOPE); - } -} // namespace Impl +template +KOKKOS_FUNCTION void atomic_store(T* const src, const T val, + MemoryOrder order) { + return desul::atomic_store(src, val, order, KOKKOS_DESUL_MEM_SCOPE); +} -} // namespace Kokkos +} // namespace Kokkos::Impl #undef KOKKOS_DESUL_MEM_SCOPE -// clang-format on #endif diff --git a/packages/kokkos/core/src/Kokkos_Complex.hpp b/packages/kokkos/core/src/Kokkos_Complex.hpp index 7dd2a9ddbb71..8233c30b243c 100644 --- a/packages/kokkos/core/src/Kokkos_Complex.hpp +++ b/packages/kokkos/core/src/Kokkos_Complex.hpp @@ -70,9 +70,8 @@ class complex& operator=(const complex&) noexcept = default; /// \brief Conversion constructor from compatible RType - template < - class RType, - std::enable_if_t::value, int> = 0> + template , int> = 0> KOKKOS_INLINE_FUNCTION complex(const complex& other) noexcept // Intentionally do the conversions implicitly here so that users don't // get any warnings about narrowing, etc., that they would expect to get @@ -265,9 +264,8 @@ class #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 //! Copy constructor from volatile. - template < - class RType, - std::enable_if_t::value, int> = 0> + template , int> = 0> KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION complex(const volatile complex& src) noexcept // Intentionally do the conversions implicitly here so that users don't @@ -296,7 +294,7 @@ class // vl = r; // vl = cr; template ::value, int> = 0> + std::enable_if_t, int> = 0> KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION void operator=( const Complex& src) volatile noexcept { re_ = src.re_; @@ -319,7 +317,7 @@ class // vl = vr; // vl = cvr; template ::value, int> = 0> + std::enable_if_t, int> = 0> KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION volatile complex& operator=( const volatile Complex& src) volatile noexcept { re_ = src.re_; @@ -341,7 +339,7 @@ class // l = cvr; // template ::value, int> = 0> + std::enable_if_t, int> = 0> KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION complex& operator=( const volatile Complex& src) noexcept { re_ = src.re_; @@ -539,7 +537,7 @@ inline bool operator==(complex const& x, template < class RealType1, class RealType2, // Constraints to avoid participation in oparator==() for every possible RHS - std::enable_if_t::value, int> = 0> + std::enable_if_t, int> = 0> KOKKOS_INLINE_FUNCTION bool operator==(complex const& x, RealType2 const& y) noexcept { using common_type = std::common_type_t; @@ -551,7 +549,7 @@ KOKKOS_INLINE_FUNCTION bool operator==(complex const& x, template < class RealType1, class RealType2, // Constraints to avoid participation in oparator==() for every possible RHS - std::enable_if_t::value, int> = 0> + std::enable_if_t, int> = 0> KOKKOS_INLINE_FUNCTION bool operator==(RealType1 const& x, complex const& y) noexcept { using common_type = std::common_type_t; @@ -590,7 +588,7 @@ inline bool operator!=(complex const& x, template < class RealType1, class RealType2, // Constraints to avoid participation in oparator==() for every possible RHS - std::enable_if_t::value, int> = 0> + std::enable_if_t, int> = 0> KOKKOS_INLINE_FUNCTION bool operator!=(complex const& x, RealType2 const& y) noexcept { using common_type = std::common_type_t; @@ -602,7 +600,7 @@ KOKKOS_INLINE_FUNCTION bool operator!=(complex const& x, template < class RealType1, class RealType2, // Constraints to avoid participation in oparator==() for every possible RHS - std::enable_if_t::value, int> = 0> + std::enable_if_t, int> = 0> KOKKOS_INLINE_FUNCTION bool operator!=(RealType1 const& x, complex const& y) noexcept { using common_type = std::common_type_t; @@ -778,16 +776,14 @@ KOKKOS_INLINE_FUNCTION complex pow(const complex& x, return x == T() ? T() : exp(y * log(x)); } -template ::value>> +template >> KOKKOS_INLINE_FUNCTION complex> pow( const T& x, const complex& y) { using type = Impl::promote_2_t; return pow(type(x), complex(y)); } -template ::value>> +template >> KOKKOS_INLINE_FUNCTION complex> pow(const complex& x, const U& y) { using type = Impl::promote_2_t; diff --git a/packages/kokkos/core/src/Kokkos_Concepts.hpp b/packages/kokkos/core/src/Kokkos_Concepts.hpp index df78a644a034..0bfb9eb5fa40 100644 --- a/packages/kokkos/core/src/Kokkos_Concepts.hpp +++ b/packages/kokkos/core/src/Kokkos_Concepts.hpp @@ -41,8 +41,7 @@ struct Dynamic {}; // Schedule Wrapper Type template struct Schedule { - static_assert(std::is_same::value || - std::is_same::value, + static_assert(std::is_same_v || std::is_same_v, "Kokkos: Invalid Schedule<> type."); using schedule_type = Schedule; using type = T; @@ -51,7 +50,7 @@ struct Schedule { // Specify Iteration Index Type template struct IndexType { - static_assert(std::is_integral::value, "Kokkos: Invalid IndexType<>."); + static_assert(std::is_integral_v, "Kokkos: Invalid IndexType<>."); using index_type = IndexType; using type = T; }; @@ -139,8 +138,8 @@ namespace Kokkos { \ public: \ static constexpr bool value = \ - std::is_base_of, T>::value || \ - std::is_base_of, T>::value; \ + std::is_base_of_v, T> || \ + std::is_base_of_v, T>; \ constexpr operator bool() const noexcept { return value; } \ }; \ template \ @@ -292,44 +291,6 @@ struct is_space { using execution_space = typename is_exe::space; using memory_space = typename is_mem::space; - - // For backward compatibility, deprecated in favor of - // Kokkos::Impl::HostMirror::host_mirror_space - - private: - // The actual definitions for host_memory_space and host_execution_spaces are - // in do_not_use_host_memory_space and do_not_use_host_execution_space to be - // able to use them within this class without deprecation warnings. - using do_not_use_host_memory_space = std::conditional_t< - std::is_same::value -#if defined(KOKKOS_ENABLE_CUDA) - || std::is_same::value || - std::is_same::value -#elif defined(KOKKOS_ENABLE_HIP) - || std::is_same::value || - std::is_same::value -#elif defined(KOKKOS_ENABLE_SYCL) - || std::is_same::value || - std::is_same::value -#endif - , - memory_space, Kokkos::HostSpace>; - - using do_not_use_host_execution_space = std::conditional_t< -#if defined(KOKKOS_ENABLE_CUDA) - std::is_same::value || -#elif defined(KOKKOS_ENABLE_HIP) - std::is_same::value || -#elif defined(KOKKOS_ENABLE_SYCL) - std::is_same::value || -#elif defined(KOKKOS_ENABLE_OPENMPTARGET) - std::is_same::value || -#endif - false, - Kokkos::DefaultHostExecutionSpace, execution_space>; }; } // namespace Kokkos @@ -357,7 +318,7 @@ struct MemorySpaceAccess { * 2. All execution spaces that can access DstMemorySpace can also access * SrcMemorySpace. */ - enum { assignable = std::is_same::value }; + enum { assignable = std::is_same_v }; /**\brief For all DstExecSpace::memory_space == DstMemorySpace * DstExecSpace can access SrcMemorySpace. @@ -442,7 +403,7 @@ struct SpaceAccessibility { // If same memory space or not accessible use the AccessSpace // else construct a device with execution space and memory space. using space = std::conditional_t< - std::is_same::value || + std::is_same_v || !exe_access::accessible, AccessSpace, Kokkos::Device>; diff --git a/packages/kokkos/core/src/Kokkos_CopyViews.hpp b/packages/kokkos/core/src/Kokkos_CopyViews.hpp index e856b1924719..7da59aa4e419 100644 --- a/packages/kokkos/core/src/Kokkos_CopyViews.hpp +++ b/packages/kokkos/core/src/Kokkos_CopyViews.hpp @@ -561,21 +561,20 @@ void view_copy(const ExecutionSpace& space, const DstType& dst, int64_t strides[DstType::rank + 1]; dst.stride(strides); Kokkos::Iterate iterate; - if (std::is_same::value) { + if (std::is_same_v) { iterate = Kokkos::Iterate::Right; - } else if (std::is_same::value) { + } else if (std::is_same_v) { iterate = Kokkos::Iterate::Left; - } else if (std::is_same::value) { + } else if (std::is_same_v) { if (strides[0] > strides[DstType::rank - 1]) iterate = Kokkos::Iterate::Right; else iterate = Kokkos::Iterate::Left; } else { - if (std::is_same::value) + if (std::is_same_v) iterate = Kokkos::Iterate::Right; else iterate = Kokkos::Iterate::Left; @@ -649,21 +648,20 @@ void view_copy(const DstType& dst, const SrcType& src) { int64_t strides[DstType::rank + 1]; dst.stride(strides); Kokkos::Iterate iterate; - if (std::is_same::value) { + if (std::is_same_v) { iterate = Kokkos::Iterate::Right; - } else if (std::is_same::value) { + } else if (std::is_same_v) { iterate = Kokkos::Iterate::Left; - } else if (std::is_same::value) { + } else if (std::is_same_v) { if (strides[0] > strides[DstType::rank - 1]) iterate = Kokkos::Iterate::Right; else iterate = Kokkos::Iterate::Left; } else { - if (std::is_same::value) + if (std::is_same_v) iterate = Kokkos::Iterate::Right; else iterate = Kokkos::Iterate::Left; @@ -1350,22 +1348,20 @@ inline void contiguous_fill( } // Default implementation for execution spaces that don't provide a definition -template +template struct ZeroMemset { - ZeroMemset(const ExecutionSpace& exec_space, const ViewType& dst) { - using ValueType = typename ViewType::value_type; - alignas(alignof(ValueType)) unsigned char - zero_initialized_storage[sizeof(ValueType)] = {}; - contiguous_fill(exec_space, dst, - *reinterpret_cast(zero_initialized_storage)); + ZeroMemset(const ExecutionSpace& exec_space, void* dst, size_t cnt) { + contiguous_fill( + exec_space, + Kokkos::View( + static_cast(dst), cnt), + std::byte{}); } }; template inline std::enable_if_t< - std::is_trivial::value_type>::value && - std::is_trivially_copy_assignable< - typename ViewTraits::value_type>::value> + std::is_trivial_v::value_type>> contiguous_fill_or_memset( const ExecutionSpace& exec_space, const View& dst, typename ViewTraits::const_value_type& value) { @@ -1375,20 +1371,20 @@ contiguous_fill_or_memset( && !std::is_same_v #endif ) - // FIXME intel/19 icpc fails to deduce template parameters here, + // FIXME intel/19 icpc fails to deduce template parameter here, // resulting in compilation errors; explicitly passing the template - // parameters to ZeroMemset helps workaround the issue - // See https://github.com/kokkos/kokkos/issues/6775 - ZeroMemset>(exec_space, dst); + // parameter to ZeroMemset helps workaround the issue. + // See https://github.com/kokkos/kokkos/issues/7273. + ZeroMemset( + exec_space, dst.data(), + dst.size() * sizeof(typename ViewTraits::value_type)); else contiguous_fill(exec_space, dst, value); } template inline std::enable_if_t< - !(std::is_trivial::value_type>::value && - std::is_trivially_copy_assignable< - typename ViewTraits::value_type>::value)> + !std::is_trivial_v::value_type>> contiguous_fill_or_memset( const ExecutionSpace& exec_space, const View& dst, typename ViewTraits::const_value_type& value) { @@ -1397,9 +1393,7 @@ contiguous_fill_or_memset( template inline std::enable_if_t< - std::is_trivial::value_type>::value && - std::is_trivially_copy_assignable< - typename ViewTraits::value_type>::value> + std::is_trivial_v::value_type>> contiguous_fill_or_memset( const View& dst, typename ViewTraits::const_value_type& value) { @@ -1411,11 +1405,12 @@ contiguous_fill_or_memset( // leading to the significant performance issues #ifndef KOKKOS_ARCH_A64FX if (Impl::is_zero_byte(value)) - // FIXME intel/19 icpc fails to deduce template parameters here, + // FIXME intel/19 icpc fails to deduce template parameter here, // resulting in compilation errors; explicitly passing the template - // parameters to ZeroMemset helps workaround the issue - // See https://github.com/kokkos/kokkos/issues/6775 - ZeroMemset(exec, dst); + // parameter to ZeroMemset helps workaround the issue. + // See https://github.com/kokkos/kokkos/issues/7273. + ZeroMemset( + exec, dst.data(), dst.size() * sizeof(typename ViewType::value_type)); else #endif contiguous_fill(exec, dst, value); @@ -1423,9 +1418,7 @@ contiguous_fill_or_memset( template inline std::enable_if_t< - !(std::is_trivial::value_type>::value && - std::is_trivially_copy_assignable< - typename ViewTraits::value_type>::value)> + !std::is_trivial_v::value_type>> contiguous_fill_or_memset( const View& dst, typename ViewTraits::const_value_type& value) { @@ -1441,8 +1434,8 @@ template inline void deep_copy( const View& dst, typename ViewTraits::const_value_type& value, - std::enable_if_t::specialize, - void>::value>* = nullptr) { + std::enable_if_t::specialize, + void>>* = nullptr) { using ViewType = View; using exec_space_type = typename ViewType::execution_space; @@ -1464,8 +1457,8 @@ inline void deep_copy( } Kokkos::fence("Kokkos::deep_copy: scalar copy, pre copy fence"); - static_assert(std::is_same::value, + static_assert(std::is_same_v, "deep_copy requires non-const type"); // If contiguous we can simply do a 1D flat loop or use memset @@ -1482,21 +1475,20 @@ inline void deep_copy( int64_t strides[ViewType::rank + 1]; dst.stride(strides); Kokkos::Iterate iterate; - if (std::is_same::value) { + if (std::is_same_v) { iterate = Kokkos::Iterate::Right; - } else if (std::is_same::value) { + } else if (std::is_same_v) { iterate = Kokkos::Iterate::Left; - } else if (std::is_same::value) { + } else if (std::is_same_v) { if (strides[0] > strides[ViewType::rank > 0 ? ViewType::rank - 1 : 0]) iterate = Kokkos::Iterate::Right; else iterate = Kokkos::Iterate::Left; } else { - if (std::is_same::value) + if (std::is_same_v) iterate = Kokkos::Iterate::Right; else iterate = Kokkos::Iterate::Left; @@ -1539,8 +1531,8 @@ template inline void deep_copy( typename ViewTraits::non_const_value_type& dst, const View& src, - std::enable_if_t::specialize, - void>::value>* = nullptr) { + std::enable_if_t::specialize, + void>>* = nullptr) { using src_traits = ViewTraits; using src_memory_space = typename src_traits::memory_space; @@ -1576,8 +1568,8 @@ template inline void deep_copy( const View& dst, const View& src, std::enable_if_t< - (std::is_void::specialize>::value && - std::is_void::specialize>::value && + (std::is_void_v::specialize> && + std::is_void_v::specialize> && (unsigned(ViewTraits::rank) == unsigned(0) && unsigned(ViewTraits::rank) == unsigned(0)))>* = nullptr) { using dst_type = View; @@ -1587,8 +1579,8 @@ inline void deep_copy( using dst_memory_space = typename dst_type::memory_space; using src_memory_space = typename src_type::memory_space; - static_assert(std::is_same::value, + static_assert(std::is_same_v, "deep_copy requires matching non-const destination type"); if (Kokkos::Tools::Experimental::get_callbacks().begin_deep_copy != nullptr) { @@ -1628,8 +1620,8 @@ template inline void deep_copy( const View& dst, const View& src, std::enable_if_t< - (std::is_void::specialize>::value && - std::is_void::specialize>::value && + (std::is_void_v::specialize> && + std::is_void_v::specialize> && (unsigned(ViewTraits::rank) != 0 || unsigned(ViewTraits::rank) != 0))>* = nullptr) { using dst_type = View; @@ -1641,8 +1633,8 @@ inline void deep_copy( using dst_value_type = typename dst_type::value_type; using src_value_type = typename src_type::value_type; - static_assert(std::is_same::value, + static_assert(std::is_same_v, "deep_copy requires non-const destination type"); static_assert((unsigned(dst_type::rank) == unsigned(src_type::rank)), @@ -1772,10 +1764,10 @@ inline void deep_copy( // If same type, equal layout, equal dimensions, equal span, and contiguous // memory then can byte-wise copy - if (std::is_same::value && - (std::is_same::value || + if (std::is_same_v && + (std::is_same_v || (dst_type::rank == 1 && src_type::rank == 1)) && dst.span_is_contiguous() && src.span_is_contiguous() && ((dst_type::rank < 1) || (dst.stride_0() == src.stride_0())) && @@ -2191,8 +2183,8 @@ template void KOKKOS_INLINE_FUNCTION local_deep_copy_contiguous( const TeamType& team, const View& dst, typename ViewTraits::const_value_type& value, - std::enable_if_t::specialize, - void>::value>* = nullptr) { + std::enable_if_t::specialize, + void>>* = nullptr) { Kokkos::parallel_for(Kokkos::TeamVectorRange(team, dst.span()), [&](const int& i) { dst.data()[i] = value; }); } @@ -2201,8 +2193,8 @@ template void KOKKOS_INLINE_FUNCTION local_deep_copy_contiguous( const View& dst, typename ViewTraits::const_value_type& value, - std::enable_if_t::specialize, - void>::value>* = nullptr) { + std::enable_if_t::specialize, + void>>* = nullptr) { for (size_t i = 0; i < dst.span(); ++i) { dst.data()[i] = value; } @@ -2568,13 +2560,13 @@ inline void deep_copy( typename ViewTraits::const_value_type& value, std::enable_if_t< Kokkos::is_execution_space::value && - std::is_void::specialize>::value && + std::is_void_v::specialize> && Kokkos::SpaceAccessibility:: memory_space>::accessible>* = nullptr) { using dst_traits = ViewTraits; - static_assert(std::is_same::value, + static_assert(std::is_same_v, "deep_copy requires non-const type"); using dst_memory_space = typename dst_traits::memory_space; if (Kokkos::Tools::Experimental::get_callbacks().begin_deep_copy != nullptr) { @@ -2594,21 +2586,20 @@ inline void deep_copy( int64_t strides[ViewType::rank + 1]; dst.stride(strides); Kokkos::Iterate iterate; - if (std::is_same::value) { + if (std::is_same_v) { iterate = Kokkos::Iterate::Right; - } else if (std::is_same::value) { + } else if (std::is_same_v) { iterate = Kokkos::Iterate::Left; - } else if (std::is_same::value) { + } else if (std::is_same_v) { if (strides[0] > strides[ViewType::rank > 0 ? ViewType::rank - 1 : 0]) iterate = Kokkos::Iterate::Right; else iterate = Kokkos::Iterate::Left; } else { - if (std::is_same::value) + if (std::is_same_v) iterate = Kokkos::Iterate::Right; else iterate = Kokkos::Iterate::Left; @@ -2649,13 +2640,13 @@ inline void deep_copy( typename ViewTraits::const_value_type& value, std::enable_if_t< Kokkos::is_execution_space::value && - std::is_void::specialize>::value && + std::is_void_v::specialize> && !Kokkos::SpaceAccessibility:: memory_space>::accessible>* = nullptr) { using dst_traits = ViewTraits; - static_assert(std::is_same::value, + static_assert(std::is_same_v, "deep_copy requires non-const type"); using dst_memory_space = typename dst_traits::memory_space; if (Kokkos::Tools::Experimental::get_callbacks().begin_deep_copy != nullptr) { @@ -2696,8 +2687,8 @@ inline void deep_copy( typename ViewTraits::non_const_value_type& dst, const View& src, std::enable_if_t::value && - std::is_same::specialize, - void>::value>* = nullptr) { + std::is_same_v::specialize, + void>>* = nullptr) { using src_traits = ViewTraits; using src_memory_space = typename src_traits::memory_space; static_assert(src_traits::rank == 0, @@ -2734,8 +2725,8 @@ inline void deep_copy( const View& src, std::enable_if_t< (Kokkos::is_execution_space::value && - std::is_void::specialize>::value && - std::is_void::specialize>::value && + std::is_void_v::specialize> && + std::is_void_v::specialize> && (unsigned(ViewTraits::rank) == unsigned(0) && unsigned(ViewTraits::rank) == unsigned(0)))>* = nullptr) { using src_traits = ViewTraits; @@ -2743,8 +2734,8 @@ inline void deep_copy( using src_memory_space = typename src_traits::memory_space; using dst_memory_space = typename dst_traits::memory_space; - static_assert(std::is_same::value, + static_assert(std::is_same_v, "deep_copy requires matching non-const destination type"); if (Kokkos::Tools::Experimental::get_callbacks().begin_deep_copy != nullptr) { @@ -2784,15 +2775,15 @@ inline void deep_copy( const View& src, std::enable_if_t< (Kokkos::is_execution_space::value && - std::is_void::specialize>::value && - std::is_void::specialize>::value && + std::is_void_v::specialize> && + std::is_void_v::specialize> && (unsigned(ViewTraits::rank) != 0 || unsigned(ViewTraits::rank) != 0))>* = nullptr) { using dst_type = View; using src_type = View; - static_assert(std::is_same::value, + static_assert(std::is_same_v, "deep_copy requires non-const destination type"); static_assert((unsigned(dst_type::rank) == unsigned(src_type::rank)), @@ -2922,10 +2913,10 @@ inline void deep_copy( // If same type, equal layout, equal dimensions, equal span, and contiguous // memory then can byte-wise copy - if (std::is_same::value && - (std::is_same::value || + if (std::is_same_v && + (std::is_same_v || (dst_type::rank == 1 && src_type::rank == 1)) && dst.span_is_contiguous() && src.span_is_contiguous() && ((dst_type::rank < 1) || (dst.stride_0() == src.stride_0())) && @@ -2994,11 +2985,11 @@ bool size_mismatch(const ViewType& view, unsigned int max_extent, /** \brief Resize a view with copying old data to new data at the corresponding * indices. */ template -inline typename std::enable_if< - std::is_same::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same::array_layout, - Kokkos::LayoutRight>::value>::type +inline std::enable_if_t< + std::is_same_v::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v::array_layout, + Kokkos::LayoutRight>> impl_resize(const Impl::ViewCtorProp& arg_prop, Kokkos::View& v, const size_t n0, const size_t n1, const size_t n2, const size_t n3, const size_t n4, const size_t n5, @@ -3048,10 +3039,10 @@ impl_resize(const Impl::ViewCtorProp& arg_prop, template inline std::enable_if_t< - std::is_same::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same::array_layout, - Kokkos::LayoutRight>::value> + std::is_same_v::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v::array_layout, + Kokkos::LayoutRight>> resize(const Impl::ViewCtorProp& arg_prop, Kokkos::View& v, const size_t n0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, @@ -3066,10 +3057,10 @@ resize(const Impl::ViewCtorProp& arg_prop, template inline std::enable_if_t< - std::is_same::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same::array_layout, - Kokkos::LayoutRight>::value> + std::is_same_v::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v::array_layout, + Kokkos::LayoutRight>> resize(Kokkos::View& v, const size_t n0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, @@ -3085,10 +3076,10 @@ template inline std::enable_if_t< (Impl::is_view_ctor_property::value || Kokkos::is_execution_space::value) && - (std::is_same::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same::array_layout, - Kokkos::LayoutRight>::value)> + (std::is_same_v::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v::array_layout, + Kokkos::LayoutRight>)> resize(const I& arg_prop, Kokkos::View& v, const size_t n0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, @@ -3103,12 +3094,12 @@ resize(const I& arg_prop, Kokkos::View& v, template inline std::enable_if_t< - std::is_same::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same::array_layout, - Kokkos::LayoutRight>::value || - std::is_same::array_layout, - Kokkos::LayoutStride>::value> + std::is_same_v::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v::array_layout, + Kokkos::LayoutRight> || + std::is_same_v::array_layout, + Kokkos::LayoutStride>> impl_resize(const Impl::ViewCtorProp& arg_prop, Kokkos::View& v, const typename Kokkos::View::array_layout& layout) { @@ -3149,12 +3140,12 @@ impl_resize(const Impl::ViewCtorProp& arg_prop, // the same as the existing one. template inline std::enable_if_t< - !(std::is_same::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same::array_layout, - Kokkos::LayoutRight>::value || - std::is_same::array_layout, - Kokkos::LayoutStride>::value)> + !(std::is_same_v::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v::array_layout, + Kokkos::LayoutRight> || + std::is_same_v::array_layout, + Kokkos::LayoutStride>)> impl_resize(const Impl::ViewCtorProp& arg_prop, Kokkos::View& v, const typename Kokkos::View::array_layout& layout) { @@ -3218,10 +3209,10 @@ inline void resize(Kokkos::View& v, /** \brief Resize a view with discarding old data. */ template inline std::enable_if_t< - std::is_same::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same::array_layout, - Kokkos::LayoutRight>::value> + std::is_same_v::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v::array_layout, + Kokkos::LayoutRight>> impl_realloc(Kokkos::View& v, const size_t n0, const size_t n1, const size_t n2, const size_t n3, const size_t n4, const size_t n5, const size_t n6, const size_t n7, @@ -3264,10 +3255,10 @@ impl_realloc(Kokkos::View& v, const size_t n0, const size_t n1, template inline std::enable_if_t< - std::is_same::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same::array_layout, - Kokkos::LayoutRight>::value> + std::is_same_v::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v::array_layout, + Kokkos::LayoutRight>> realloc(const Impl::ViewCtorProp& arg_prop, Kokkos::View& v, const size_t n0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, @@ -3283,10 +3274,10 @@ realloc(const Impl::ViewCtorProp& arg_prop, template inline std::enable_if_t< - std::is_same::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same::array_layout, - Kokkos::LayoutRight>::value> + std::is_same_v::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v::array_layout, + Kokkos::LayoutRight>> realloc(Kokkos::View& v, const size_t n0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, @@ -3302,10 +3293,10 @@ realloc(Kokkos::View& v, template inline std::enable_if_t< Impl::is_view_ctor_property::value && - (std::is_same::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same::array_layout, - Kokkos::LayoutRight>::value)> + (std::is_same_v::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v::array_layout, + Kokkos::LayoutRight>)> realloc(const I& arg_prop, Kokkos::View& v, const size_t n0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, @@ -3320,12 +3311,12 @@ realloc(const I& arg_prop, Kokkos::View& v, template inline std::enable_if_t< - std::is_same::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same::array_layout, - Kokkos::LayoutRight>::value || - std::is_same::array_layout, - Kokkos::LayoutStride>::value> + std::is_same_v::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v::array_layout, + Kokkos::LayoutRight> || + std::is_same_v::array_layout, + Kokkos::LayoutStride>> impl_realloc(Kokkos::View& v, const typename Kokkos::View::array_layout& layout, const Impl::ViewCtorProp& arg_prop) { @@ -3365,12 +3356,12 @@ impl_realloc(Kokkos::View& v, // the same as the existing one. template inline std::enable_if_t< - !(std::is_same::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same::array_layout, - Kokkos::LayoutRight>::value || - std::is_same::array_layout, - Kokkos::LayoutStride>::value)> + !(std::is_same_v::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v::array_layout, + Kokkos::LayoutRight> || + std::is_same_v::array_layout, + Kokkos::LayoutStride>)> impl_realloc(Kokkos::View& v, const typename Kokkos::View::array_layout& layout, const Impl::ViewCtorProp& arg_prop) { @@ -3435,7 +3426,7 @@ struct MirrorViewType { // Check whether it is the same memory space enum { is_same_memspace = - std::is_same::value + std::is_same_v }; // The array_layout using array_layout = typename src_view_type::array_layout; @@ -3450,26 +3441,6 @@ struct MirrorViewType { std::conditional_t; }; -template -struct MirrorType { - // The incoming view_type - using src_view_type = typename Kokkos::View; - // The memory space for the mirror view - using memory_space = typename Space::memory_space; - // Check whether it is the same memory space - enum { - is_same_memspace = - std::is_same::value - }; - // The array_layout - using array_layout = typename src_view_type::array_layout; - // The data type (we probably want it non-const since otherwise we can't even - // deep_copy to it. - using data_type = typename src_view_type::non_const_data_type; - // The destination view type if it is not the same memory space - using view_type = Kokkos::View; -}; - // collection of static asserts for create_mirror and create_mirror_view template void check_view_ctor_args_create_mirror() { @@ -3503,7 +3474,7 @@ inline auto create_mirror(const Kokkos::View& src, if constexpr (Impl::ViewCtorProp::has_memory_space) { using memory_space = typename decltype(prop_copy)::memory_space; using dst_type = - typename Impl::MirrorType::view_type; + typename Impl::MirrorViewType::dest_view_type; return dst_type(prop_copy, src.layout()); } else { using dst_type = typename View::HostMirror; @@ -3636,12 +3607,12 @@ inline auto create_mirror_view( const Kokkos::View& src, [[maybe_unused]] const Impl::ViewCtorProp& arg_prop) { if constexpr (!Impl::ViewCtorProp::has_memory_space) { - if constexpr (std::is_same::memory_space, - typename Kokkos::View< - T, P...>::HostMirror::memory_space>::value && - std::is_same::data_type, - typename Kokkos::View< - T, P...>::HostMirror::data_type>::value) { + if constexpr (std::is_same_v::memory_space, + typename Kokkos::View< + T, P...>::HostMirror::memory_space> && + std::is_same_v< + typename Kokkos::View::data_type, + typename Kokkos::View::HostMirror::data_type>) { check_view_ctor_args_create_mirror(); return typename Kokkos::View::HostMirror(src); } else { @@ -3785,8 +3756,7 @@ create_mirror_view_and_copy( const Space&, const Kokkos::View& src, std::string const& name = "", std::enable_if_t< - std::is_void::specialize>::value>* = - nullptr) { + std::is_void_v::specialize>>* = nullptr) { return create_mirror_view_and_copy( Kokkos::view_alloc(typename Space::memory_space{}, name), src); } diff --git a/packages/kokkos/core/src/Kokkos_Core.hpp b/packages/kokkos/core/src/Kokkos_Core.hpp index 1f146563be20..9588d289a9ca 100644 --- a/packages/kokkos/core/src/Kokkos_Core.hpp +++ b/packages/kokkos/core/src/Kokkos_Core.hpp @@ -63,7 +63,9 @@ #include #include #include +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 #include +#endif #include #include #include @@ -248,9 +250,9 @@ class KOKKOS_ATTRIBUTE_NODISCARD ScopeGuard { } ScopeGuard& operator=(const ScopeGuard&) = delete; - ScopeGuard& operator=(ScopeGuard&&) = delete; - ScopeGuard(const ScopeGuard&) = delete; - ScopeGuard(ScopeGuard&&) = delete; + ScopeGuard& operator=(ScopeGuard&&) = delete; + ScopeGuard(const ScopeGuard&) = delete; + ScopeGuard(ScopeGuard&&) = delete; }; } // namespace Kokkos @@ -281,7 +283,7 @@ std::vector partition_space(ExecSpace const& space, "Kokkos Error: partition_space expects an Execution Space as " "first argument"); static_assert( - std::is_arithmetic::value, + std::is_arithmetic_v, "Kokkos Error: partitioning arguments must be integers or floats"); std::vector instances(weights.size()); diff --git a/packages/kokkos/core/src/Kokkos_Core_fwd.hpp b/packages/kokkos/core/src/Kokkos_Core_fwd.hpp index 7edb35f00eb4..5dbe5714293f 100644 --- a/packages/kokkos/core/src/Kokkos_Core_fwd.hpp +++ b/packages/kokkos/core/src/Kokkos_Core_fwd.hpp @@ -106,8 +106,7 @@ using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION = #elif defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_HIP) using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION = HIP; #elif defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SYCL) -using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION = - Experimental::SYCL; +using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION = SYCL; #elif defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENACC) using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION = Experimental::OpenACC; @@ -122,7 +121,7 @@ using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION = using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION = Serial; #else #error \ - "At least one of the following execution spaces must be defined in order to use Kokkos: Kokkos::Cuda, Kokkos::HIP, Kokkos::Experimental::SYCL, Kokkos::Experimental::OpenMPTarget, Kokkos::Experimental::OpenACC, Kokkos::OpenMP, Kokkos::Threads, Kokkos::Experimental::HPX, or Kokkos::Serial." + "At least one of the following execution spaces must be defined in order to use Kokkos: Kokkos::Cuda, Kokkos::HIP, Kokkos::SYCL, Kokkos::Experimental::OpenMPTarget, Kokkos::Experimental::OpenACC, Kokkos::OpenMP, Kokkos::Threads, Kokkos::Experimental::HPX, or Kokkos::Serial." #endif #if defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP) @@ -162,7 +161,7 @@ using SharedSpace = CudaUVMSpace; using SharedSpace = HIPManagedSpace; #define KOKKOS_HAS_SHARED_SPACE #elif defined(KOKKOS_ENABLE_SYCL) -using SharedSpace = Experimental::SYCLSharedUSMSpace; +using SharedSpace = SYCLSharedUSMSpace; #define KOKKOS_HAS_SHARED_SPACE // if only host compile point to HostSpace #elif !defined(KOKKOS_ENABLE_OPENACC) && !defined(KOKKOS_ENABLE_OPENMPTARGET) @@ -184,7 +183,7 @@ using SharedHostPinnedSpace = CudaHostPinnedSpace; using SharedHostPinnedSpace = HIPHostPinnedSpace; #define KOKKOS_HAS_SHARED_HOST_PINNED_SPACE #elif defined(KOKKOS_ENABLE_SYCL) - using SharedHostPinnedSpace = Experimental::SYCLHostUSMSpace; + using SharedHostPinnedSpace = SYCLHostUSMSpace; #define KOKKOS_HAS_SHARED_HOST_PINNED_SPACE #elif !defined(KOKKOS_ENABLE_OPENACC) && !defined(KOKKOS_ENABLE_OPENMPTARGET) using SharedHostPinnedSpace = HostSpace; diff --git a/packages/kokkos/core/src/Kokkos_Crs.hpp b/packages/kokkos/core/src/Kokkos_Crs.hpp index 92931b584952..69223b641289 100644 --- a/packages/kokkos/core/src/Kokkos_Crs.hpp +++ b/packages/kokkos/core/src/Kokkos_Crs.hpp @@ -84,12 +84,12 @@ class Crs { /* * Default Constructors, operators and destructor */ - KOKKOS_DEFAULTED_FUNCTION Crs() = default; - KOKKOS_DEFAULTED_FUNCTION Crs(Crs const&) = default; - KOKKOS_DEFAULTED_FUNCTION Crs(Crs&&) = default; + KOKKOS_DEFAULTED_FUNCTION Crs() = default; + KOKKOS_DEFAULTED_FUNCTION Crs(Crs const&) = default; + KOKKOS_DEFAULTED_FUNCTION Crs(Crs&&) = default; KOKKOS_DEFAULTED_FUNCTION Crs& operator=(Crs const&) = default; - KOKKOS_DEFAULTED_FUNCTION Crs& operator=(Crs&&) = default; - KOKKOS_DEFAULTED_FUNCTION ~Crs() = default; + KOKKOS_DEFAULTED_FUNCTION Crs& operator=(Crs&&) = default; + KOKKOS_DEFAULTED_FUNCTION ~Crs() = default; /** \brief Assign to a view of the rhs array. * If the old view is the last view @@ -148,7 +148,7 @@ class GetCrsTransposeCounts { public: KOKKOS_INLINE_FUNCTION - void operator()(index_type i) const { atomic_increment(&out[in.entries(i)]); } + void operator()(index_type i) const { atomic_inc(&out[in.entries(i)]); } GetCrsTransposeCounts(InCrs const& arg_in, OutCounts const& arg_out) : in(arg_in), out(arg_out) { using policy_type = RangePolicy; @@ -345,7 +345,7 @@ struct CountAndFill : public CountAndFillBase { closure.execute(); } auto nentries = Kokkos::get_crs_row_map_from_counts(this->m_crs.row_map, - this->m_counts); + this->m_counts); this->m_counts = counts_type(); this->m_crs.entries = entries_type("entries", nentries); { diff --git a/packages/kokkos/core/src/Kokkos_DetectionIdiom.hpp b/packages/kokkos/core/src/Kokkos_DetectionIdiom.hpp index ae28805a42ee..8af10b2a409b 100644 --- a/packages/kokkos/core/src/Kokkos_DetectionIdiom.hpp +++ b/packages/kokkos/core/src/Kokkos_DetectionIdiom.hpp @@ -54,8 +54,8 @@ struct detector>, Op, Args...> { } // namespace Impl struct nonesuch : private Impl::nonesuch_base { - ~nonesuch() = delete; - nonesuch(nonesuch const&) = delete; + ~nonesuch() = delete; + nonesuch(nonesuch const&) = delete; void operator=(nonesuch const&) = delete; }; @@ -81,7 +81,7 @@ inline constexpr bool is_detected_v = is_detected::value; template class Op, class... Args> inline constexpr bool is_detected_exact_v = - is_detected_exact::value; + is_detected_exact::value; // NOLINT template class Op, class... Args> inline constexpr bool is_detected_convertible_v = diff --git a/packages/kokkos/core/src/Kokkos_ExecPolicy.hpp b/packages/kokkos/core/src/Kokkos_ExecPolicy.hpp index b8d7f77deb30..dd7ce5ce21f6 100644 --- a/packages/kokkos/core/src/Kokkos_ExecPolicy.hpp +++ b/packages/kokkos/core/src/Kokkos_ExecPolicy.hpp @@ -27,7 +27,10 @@ static_assert(false, #include #include #include +#include +#ifndef KOKKOS_ENABLE_IMPL_TYPEINFO #include +#endif #include //---------------------------------------------------------------------------- @@ -197,8 +200,7 @@ class RangePolicy : public Impl::PolicyTraits { /** \brief finalize chunk_size if it was set to AUTO*/ inline void set_auto_chunk_size() { #ifdef KOKKOS_ENABLE_SYCL - if (std::is_same_v) { + if (std::is_same_v) { // chunk_size <=1 lets the compiler choose the workgroup size when // launching kernels m_granularity = 1; @@ -248,46 +250,49 @@ class RangePolicy : public Impl::PolicyTraits { // To be replaced with std::in_range (c++20) template - static void check_conversion_safety(const IndexType bound) { + static void check_conversion_safety([[maybe_unused]] const IndexType bound) { + // Checking that the round-trip conversion preserves input index value + if constexpr (std::is_convertible_v) { #if !defined(KOKKOS_ENABLE_DEPRECATED_CODE_4) || \ defined(KOKKOS_ENABLE_DEPRECATION_WARNINGS) - std::string msg = - "Kokkos::RangePolicy bound type error: an unsafe implicit conversion " - "is performed on a bound (" + - std::to_string(bound) + - "), which may " - "not preserve its original value.\n"; - bool warn = false; - - if constexpr (std::is_signed_v != - std::is_signed_v) { - // check signed to unsigned - if constexpr (std::is_signed_v) - warn |= (bound < static_cast( - std::numeric_limits::min())); - - // check unsigned to signed - if constexpr (std::is_signed_v) - warn |= (bound > static_cast( - std::numeric_limits::max())); - } + std::string msg = + "Kokkos::RangePolicy bound type error: an unsafe implicit conversion " + "is performed on a bound (" + + std::to_string(bound) + + "), which may " + "not preserve its original value.\n"; + bool warn = false; + + if constexpr (std::is_arithmetic_v && + (std::is_signed_v != + std::is_signed_v)) { + // check signed to unsigned + if constexpr (std::is_signed_v) + warn |= (bound < static_cast( + std::numeric_limits::min())); + + // check unsigned to signed + if constexpr (std::is_signed_v) + warn |= (bound > static_cast( + std::numeric_limits::max())); + } - // check narrowing - warn |= (static_cast(static_cast(bound)) != bound); + // check narrowing + warn |= + (static_cast(static_cast(bound)) != bound); - if (warn) { + if (warn) { #ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4 - Kokkos::abort(msg.c_str()); + Kokkos::abort(msg.c_str()); #endif #ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS - Kokkos::Impl::log_warning(msg); + Kokkos::Impl::log_warning(msg); #endif - } -#else - (void)bound; + } #endif + } } public: @@ -333,20 +338,20 @@ class RangePolicy : public Impl::PolicyTraits { }; }; -RangePolicy()->RangePolicy<>; +RangePolicy() -> RangePolicy<>; -RangePolicy(int64_t, int64_t)->RangePolicy<>; -RangePolicy(int64_t, int64_t, ChunkSize const&)->RangePolicy<>; +RangePolicy(int64_t, int64_t) -> RangePolicy<>; +RangePolicy(int64_t, int64_t, ChunkSize const&) -> RangePolicy<>; -RangePolicy(DefaultExecutionSpace const&, int64_t, int64_t)->RangePolicy<>; +RangePolicy(DefaultExecutionSpace const&, int64_t, int64_t) -> RangePolicy<>; RangePolicy(DefaultExecutionSpace const&, int64_t, int64_t, ChunkSize const&) - ->RangePolicy<>; + -> RangePolicy<>; template >> -RangePolicy(ES const&, int64_t, int64_t)->RangePolicy; +RangePolicy(ES const&, int64_t, int64_t) -> RangePolicy; template >> -RangePolicy(ES const&, int64_t, int64_t, ChunkSize const&)->RangePolicy; +RangePolicy(ES const&, int64_t, int64_t, ChunkSize const&) -> RangePolicy; } // namespace Kokkos @@ -515,24 +520,24 @@ struct PerThreadValue { template struct ExtractVectorLength { static inline iType value( - std::enable_if_t::value, iType> val, Args...) { + std::enable_if_t, iType> val, Args...) { return val; } - static inline std::enable_if_t::value, int> value( - std::enable_if_t::value, iType>, Args...) { + static inline std::enable_if_t, int> value( + std::enable_if_t, iType>, Args...) { return 1; } }; template -inline std::enable_if_t::value, iType> -extract_vector_length(iType val, Args...) { +inline std::enable_if_t, iType> extract_vector_length( + iType val, Args...) { return val; } template -inline std::enable_if_t::value, int> -extract_vector_length(iType, Args...) { +inline std::enable_if_t, int> extract_vector_length( + iType, Args...) { return 1; } @@ -577,7 +582,7 @@ struct ScratchRequest { } }; -// Throws a runtime exception if level is not `0` or `1` +// Causes abnormal program termination if level is not `0` or `1` void team_policy_check_valid_storage_level_argument(int level); /** \brief Execution policy for parallel work over a league of teams of @@ -721,55 +726,54 @@ class TeamPolicy // Execution space not provided deduces to TeamPolicy<> -TeamPolicy()->TeamPolicy<>; +TeamPolicy() -> TeamPolicy<>; -TeamPolicy(int, int)->TeamPolicy<>; -TeamPolicy(int, int, int)->TeamPolicy<>; -TeamPolicy(int, Kokkos::AUTO_t const&)->TeamPolicy<>; -TeamPolicy(int, Kokkos::AUTO_t const&, int)->TeamPolicy<>; -TeamPolicy(int, Kokkos::AUTO_t const&, Kokkos::AUTO_t const&)->TeamPolicy<>; -TeamPolicy(int, int, Kokkos::AUTO_t const&)->TeamPolicy<>; +TeamPolicy(int, int) -> TeamPolicy<>; +TeamPolicy(int, int, int) -> TeamPolicy<>; +TeamPolicy(int, Kokkos::AUTO_t const&) -> TeamPolicy<>; +TeamPolicy(int, Kokkos::AUTO_t const&, int) -> TeamPolicy<>; +TeamPolicy(int, Kokkos::AUTO_t const&, Kokkos::AUTO_t const&) -> TeamPolicy<>; +TeamPolicy(int, int, Kokkos::AUTO_t const&) -> TeamPolicy<>; // DefaultExecutionSpace deduces to TeamPolicy<> -TeamPolicy(DefaultExecutionSpace const&, int, int)->TeamPolicy<>; -TeamPolicy(DefaultExecutionSpace const&, int, int, int)->TeamPolicy<>; +TeamPolicy(DefaultExecutionSpace const&, int, int) -> TeamPolicy<>; +TeamPolicy(DefaultExecutionSpace const&, int, int, int) -> TeamPolicy<>; TeamPolicy(DefaultExecutionSpace const&, int, Kokkos::AUTO_t const&) - ->TeamPolicy<>; + -> TeamPolicy<>; TeamPolicy(DefaultExecutionSpace const&, int, Kokkos::AUTO_t const&, int) - ->TeamPolicy<>; + -> TeamPolicy<>; TeamPolicy(DefaultExecutionSpace const&, int, Kokkos::AUTO_t const&, - Kokkos::AUTO_t const&) - ->TeamPolicy<>; + Kokkos::AUTO_t const&) -> TeamPolicy<>; TeamPolicy(DefaultExecutionSpace const&, int, int, Kokkos::AUTO_t const&) - ->TeamPolicy<>; + -> TeamPolicy<>; // ES != DefaultExecutionSpace deduces to TeamPolicy template >> -TeamPolicy(ES const&, int, int)->TeamPolicy; +TeamPolicy(ES const&, int, int) -> TeamPolicy; template >> -TeamPolicy(ES const&, int, int, int)->TeamPolicy; +TeamPolicy(ES const&, int, int, int) -> TeamPolicy; template >> -TeamPolicy(ES const&, int, Kokkos::AUTO_t const&)->TeamPolicy; +TeamPolicy(ES const&, int, Kokkos::AUTO_t const&) -> TeamPolicy; template >> -TeamPolicy(ES const&, int, Kokkos::AUTO_t const&, int)->TeamPolicy; +TeamPolicy(ES const&, int, Kokkos::AUTO_t const&, int) -> TeamPolicy; template >> TeamPolicy(ES const&, int, Kokkos::AUTO_t const&, Kokkos::AUTO_t const&) - ->TeamPolicy; + -> TeamPolicy; template >> -TeamPolicy(ES const&, int, int, Kokkos::AUTO_t const&)->TeamPolicy; +TeamPolicy(ES const&, int, int, Kokkos::AUTO_t const&) -> TeamPolicy; namespace Impl { @@ -1041,7 +1045,7 @@ struct TeamThreadMDRange, TeamHandle> { template KOKKOS_DEDUCTION_GUIDE TeamThreadMDRange(TeamHandle const&, Args&&...) - ->TeamThreadMDRange, TeamHandle>; + -> TeamThreadMDRange, TeamHandle>; template struct ThreadVectorMDRange; @@ -1078,7 +1082,7 @@ struct ThreadVectorMDRange, TeamHandle> { template KOKKOS_DEDUCTION_GUIDE ThreadVectorMDRange(TeamHandle const&, Args&&...) - ->ThreadVectorMDRange, TeamHandle>; + -> ThreadVectorMDRange, TeamHandle>; template struct TeamVectorMDRange; @@ -1115,7 +1119,7 @@ struct TeamVectorMDRange, TeamHandle> { template KOKKOS_DEDUCTION_GUIDE TeamVectorMDRange(TeamHandle const&, Args&&...) - ->TeamVectorMDRange, TeamHandle>; + -> TeamVectorMDRange, TeamHandle>; template @@ -1162,7 +1166,7 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( Kokkos::HIP> #elif defined(KOKKOS_ENABLE_SYCL) || std::is_same_v + Kokkos::SYCL> #endif ) policy.team.vector_reduce( @@ -1198,7 +1202,7 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( Kokkos::HIP> #elif defined(KOKKOS_ENABLE_SYCL) || std::is_same_v + Kokkos::SYCL> #endif ) policy.team.vector_reduce( @@ -1217,15 +1221,21 @@ KOKKOS_INLINE_FUNCTION void parallel_for( namespace Impl { template ::value> + bool HasTag = !std::is_void_v> struct ParallelConstructName; template struct ParallelConstructName { ParallelConstructName(std::string const& label) : label_ref(label) { if (label.empty()) { +#ifdef KOKKOS_ENABLE_IMPL_TYPEINFO + default_name = + std::string(TypeInfo>::name()) + + "/" + std::string(TypeInfo::name()); +#else default_name = std::string(typeid(FunctorType).name()) + "/" + typeid(TagType).name(); +#endif } } std::string const& get() { @@ -1239,7 +1249,11 @@ template struct ParallelConstructName { ParallelConstructName(std::string const& label) : label_ref(label) { if (label.empty()) { - default_name = std::string(typeid(FunctorType).name()); +#ifdef KOKKOS_ENABLE_IMPL_TYPEINFO + default_name = TypeInfo>::name(); +#else + default_name = typeid(FunctorType).name(); +#endif } } std::string const& get() { diff --git a/packages/kokkos/core/src/Kokkos_Extents.hpp b/packages/kokkos/core/src/Kokkos_Extents.hpp index 9bc2eda60469..7d1f8c755d78 100644 --- a/packages/kokkos/core/src/Kokkos_Extents.hpp +++ b/packages/kokkos/core/src/Kokkos_Extents.hpp @@ -134,7 +134,7 @@ struct ApplyExtent { template struct ApplyExtent { - using type = ValueType * [Ext]; + using type = ValueType* [Ext]; }; template diff --git a/packages/kokkos/core/src/Kokkos_Future.hpp b/packages/kokkos/core/src/Kokkos_Future.hpp index 0b3a153de8c4..c26d08be1cff 100644 --- a/packages/kokkos/core/src/Kokkos_Future.hpp +++ b/packages/kokkos/core/src/Kokkos_Future.hpp @@ -14,11 +14,17 @@ // //@HEADER -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE #include + +#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE static_assert(false, "Including non-public Kokkos header files is not allowed."); #endif + +#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4 +#error "The tasking framework is deprecated" +#endif + #ifndef KOKKOS_FUTURE_HPP #define KOKKOS_FUTURE_HPP @@ -41,13 +47,19 @@ static_assert(false, //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +// We allow using deprecated classes in this file +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() +#endif + namespace Kokkos { // For now, hack this in as a partial specialization // TODO @tasking @cleanup Make this the "normal" class template and make the old // code the specialization template -class BasicFuture> { +class KOKKOS_DEPRECATED + BasicFuture> { public: using value_type = ValueType; using execution_space = ExecutionSpace; @@ -244,7 +256,7 @@ class BasicFuture> { //////////////////////////////////////////////////////////////////////////////// template -class BasicFuture { +class KOKKOS_DEPRECATED BasicFuture { private: template friend class BasicTaskScheduler; @@ -413,13 +425,13 @@ class BasicFuture { // Is a Future with the given execution space template -struct is_future : public std::false_type {}; +struct KOKKOS_DEPRECATED is_future : public std::false_type {}; template -struct is_future, ExecSpace> +struct KOKKOS_DEPRECATED is_future, ExecSpace> : std::bool_constant< - std::is_same::value || - std::is_void::value> {}; + std::is_same_v || + std::is_void_v> {}; //////////////////////////////////////////////////////////////////////////////// // END OLD CODE @@ -432,8 +444,8 @@ class ResolveFutureArgOrder { private: enum { Arg1_is_space = Kokkos::is_space::value }; enum { Arg2_is_space = Kokkos::is_space::value }; - enum { Arg1_is_value = !Arg1_is_space && !std::is_void::value }; - enum { Arg2_is_value = !Arg2_is_space && !std::is_void::value }; + enum { Arg1_is_value = !Arg1_is_space && !std::is_void_v }; + enum { Arg2_is_value = !Arg2_is_space && !std::is_void_v }; static_assert(!(Arg1_is_space && Arg2_is_space), "Future cannot be given two spaces"); @@ -463,10 +475,15 @@ class ResolveFutureArgOrder { * */ template -using Future = typename Impl::ResolveFutureArgOrder::type; +using Future KOKKOS_DEPRECATED = + typename Impl::ResolveFutureArgOrder::type; } // namespace Kokkos +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() +#endif + //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- diff --git a/packages/kokkos/core/src/Kokkos_Graph.hpp b/packages/kokkos/core/src/Kokkos_Graph.hpp index 9cc6650e26ed..05d774ac61a2 100644 --- a/packages/kokkos/core/src/Kokkos_Graph.hpp +++ b/packages/kokkos/core/src/Kokkos_Graph.hpp @@ -86,10 +86,21 @@ struct [[nodiscard]] Graph { return m_impl_ptr->get_execution_space(); } - void submit() const { + void instantiate() { KOKKOS_EXPECTS(bool(m_impl_ptr)) - (*m_impl_ptr).submit(); + (*m_impl_ptr).instantiate(); } + + void submit(const execution_space& exec) const { + KOKKOS_EXPECTS(bool(m_impl_ptr)) + (*m_impl_ptr).submit(exec); + } + + void submit() const { submit(get_execution_space()); } + + decltype(auto) native_graph(); + + decltype(auto) native_graph_exec(); }; // end Graph }}}1 @@ -135,22 +146,68 @@ Graph create_graph(ExecutionSpace ex, Closure&& arg_closure) { // function template injection works. auto rv = Kokkos::Impl::GraphAccess::construct_graph(std::move(ex)); // Invoke the user's graph construction closure - ((Closure &&) arg_closure)(Kokkos::Impl::GraphAccess::create_root_ref(rv)); + ((Closure&&)arg_closure)(Kokkos::Impl::GraphAccess::create_root_ref(rv)); // and given them back the graph // KOKKOS_ENSURES(rv.m_impl_ptr.use_count() == 1) return rv; } +template +std::enable_if_t, + Graph> +create_graph(ExecutionSpace exec = ExecutionSpace{}) { + return Kokkos::Impl::GraphAccess::construct_graph(std::move(exec)); +} + template < class ExecutionSpace = DefaultExecutionSpace, class Closure = Kokkos::Impl::DoNotExplicitlySpecifyThisTemplateParameter> -Graph create_graph(Closure&& arg_closure) { - return create_graph(ExecutionSpace{}, (Closure &&) arg_closure); +std::enable_if_t< + !Kokkos::is_execution_space_v>, + Graph> +create_graph(Closure&& arg_closure) { + return create_graph(ExecutionSpace{}, (Closure&&)arg_closure); } // end create_graph }}}1 //============================================================================== +template +decltype(auto) Graph::native_graph() { + KOKKOS_EXPECTS(bool(m_impl_ptr)); +#if defined(KOKKOS_ENABLE_CUDA) + if constexpr (std::is_same_v) { + return m_impl_ptr->cuda_graph(); + } +#elif defined(KOKKOS_ENABLE_HIP) && defined(KOKKOS_IMPL_HIP_NATIVE_GRAPH) + if constexpr (std::is_same_v) { + return m_impl_ptr->hip_graph(); + } +#elif defined(KOKKOS_ENABLE_SYCL) && defined(SYCL_EXT_ONEAPI_GRAPH) + if constexpr (std::is_same_v) { + return m_impl_ptr->sycl_graph(); + } +#endif +} + +template +decltype(auto) Graph::native_graph_exec() { + KOKKOS_EXPECTS(bool(m_impl_ptr)); +#if defined(KOKKOS_ENABLE_CUDA) + if constexpr (std::is_same_v) { + return m_impl_ptr->cuda_graph_exec(); + } +#elif defined(KOKKOS_ENABLE_HIP) && defined(KOKKOS_IMPL_HIP_NATIVE_GRAPH) + if constexpr (std::is_same_v) { + return m_impl_ptr->hip_graph_exec(); + } +#elif defined(KOKKOS_ENABLE_SYCL) && defined(SYCL_EXT_ONEAPI_GRAPH) + if constexpr (std::is_same_v) { + return m_impl_ptr->sycl_graph_exec(); + } +#endif +} + } // end namespace Experimental } // namespace Kokkos @@ -163,7 +220,7 @@ Graph create_graph(Closure&& arg_closure) { #include #if defined(KOKKOS_ENABLE_HIP) // The implementation of hipGraph in ROCm 5.2 is bugged, so we cannot use it. -#if !((HIP_VERSION_MAJOR == 5) && (HIP_VERSION_MINOR == 2)) +#if defined(KOKKOS_IMPL_HIP_NATIVE_GRAPH) #include #endif #endif diff --git a/packages/kokkos/core/src/Kokkos_GraphNode.hpp b/packages/kokkos/core/src/Kokkos_GraphNode.hpp index 2a4e2cf6414a..a0a60c07d094 100644 --- a/packages/kokkos/core/src/Kokkos_GraphNode.hpp +++ b/packages/kokkos/core/src/Kokkos_GraphNode.hpp @@ -48,7 +48,7 @@ class GraphNodeRef { // intended to be SFINAE-safe, so do validation before you instantiate. static_assert( - std::is_same::value || + std::is_same_v || Kokkos::Impl::is_specialization_of::value, "Invalid predecessor template parameter given to GraphNodeRef"); @@ -56,7 +56,7 @@ class GraphNodeRef { Kokkos::is_execution_space::value, "Invalid execution space template parameter given to GraphNodeRef"); - static_assert(std::is_same::value || + static_assert(std::is_same_v || Kokkos::Impl::is_graph_kernel::value, "Invalid kernel template parameter given to GraphNodeRef"); @@ -151,7 +151,7 @@ class GraphNodeRef { typename return_t::node_impl_t>( m_node_impl->execution_space_instance(), Kokkos::Impl::_graph_node_kernel_ctor_tag{}, - (NextKernelDeduced &&) arg_kernel, + (NextKernelDeduced&&)arg_kernel, // *this is the predecessor Kokkos::Impl::_graph_node_predecessor_ctor_tag{}, *this)); @@ -184,10 +184,10 @@ class GraphNodeRef { // {{{3 // Copyable and movable (basically just shared_ptr semantics - GraphNodeRef() noexcept = default; - GraphNodeRef(GraphNodeRef const&) = default; - GraphNodeRef(GraphNodeRef&&) noexcept = default; - GraphNodeRef& operator=(GraphNodeRef const&) = default; + GraphNodeRef() noexcept = default; + GraphNodeRef(GraphNodeRef const&) = default; + GraphNodeRef(GraphNodeRef&&) noexcept = default; + GraphNodeRef& operator=(GraphNodeRef const&) = default; GraphNodeRef& operator=(GraphNodeRef&&) noexcept = default; ~GraphNodeRef() = default; @@ -197,19 +197,19 @@ class GraphNodeRef { //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // {{{3 - template < - class OtherKernel, class OtherPredecessor, - std::enable_if_t< - // Not a copy/move constructor - !std::is_same>::value && - // must be an allowed type erasure of the kernel - Kokkos::Impl::is_compatible_type_erasure::value && - // must be an allowed type erasure of the predecessor - Kokkos::Impl::is_compatible_type_erasure< - OtherPredecessor, graph_predecessor>::value, - int> = 0> + template > && + // must be an allowed type erasure of the kernel + Kokkos::Impl::is_compatible_type_erasure< + OtherKernel, graph_kernel>::value && + // must be an allowed type erasure of the predecessor + Kokkos::Impl::is_compatible_type_erasure< + OtherPredecessor, graph_predecessor>::value, + int> = 0> /* implicit */ GraphNodeRef( GraphNodeRef const& other) @@ -257,7 +257,7 @@ class GraphNodeRef { //|| policy_t::execution_space_is_defaulted, "Execution Space mismatch between execution policy and graph"); - auto policy = Experimental::require((Policy &&) arg_policy, + auto policy = Experimental::require((Policy&&)arg_policy, Kokkos::Impl::KernelInGraphProperty{}); using next_policy_t = decltype(policy); @@ -266,8 +266,8 @@ class GraphNodeRef { std::decay_t, Kokkos::ParallelForTag>; return this->_then_kernel(next_kernel_t{std::move(arg_name), policy.space(), - (Functor &&) functor, - (Policy &&) policy}); + (Functor&&)functor, + (Policy&&)policy}); } template < @@ -280,8 +280,7 @@ class GraphNodeRef { int> = 0> auto then_parallel_for(Policy&& policy, Functor&& functor) const { // needs to static assert constraint: DataParallelFunctor - return this->then_parallel_for("", (Policy &&) policy, - (Functor &&) functor); + return this->then_parallel_for("", (Policy&&)policy, (Functor&&)functor); } template @@ -290,13 +289,13 @@ class GraphNodeRef { // needs to static assert constraint: DataParallelFunctor return this->then_parallel_for(std::move(name), Kokkos::RangePolicy(0, n), - (Functor &&) functor); + (Functor&&)functor); } template auto then_parallel_for(std::size_t n, Functor&& functor) const { // needs to static assert constraint: DataParallelFunctor - return this->then_parallel_for("", n, (Functor &&) functor); + return this->then_parallel_for("", n, (Functor&&)functor); } // end then_parallel_for }}}2 @@ -359,6 +358,23 @@ class GraphNodeRef { Kokkos::is_reducer::value, "Output argument to parallel reduce in a graph must be a " "View or a Reducer"); + + if constexpr (Kokkos::is_reducer_v) { + static_assert( + Kokkos::SpaceAccessibility< + ExecutionSpace, typename return_type_remove_cvref:: + result_view_type::memory_space>::accessible, + "The reduction target must be accessible by the graph execution " + "space."); + } else { + static_assert( + Kokkos::SpaceAccessibility< + ExecutionSpace, + typename return_type_remove_cvref::memory_space>::accessible, + "The reduction target must be accessible by the graph execution " + "space."); + } + using return_type = // Yes, you do really have to do this... std::conditional_t::value, @@ -373,7 +389,7 @@ class GraphNodeRef { // End of Kokkos reducer disaster //---------------------------------------- - auto policy = Experimental::require((Policy &&) arg_policy, + auto policy = Experimental::require((Policy&&)arg_policy, Kokkos::Impl::KernelInGraphProperty{}); using passed_reducer_type = typename return_value_adapter::reducer_type; @@ -399,7 +415,7 @@ class GraphNodeRef { return this->_then_kernel(next_kernel_t{ std::move(arg_name), graph_impl_ptr->get_execution_space(), - functor_reducer, (Policy &&) policy, + functor_reducer, (Policy&&)policy, return_value_adapter::return_value(return_value, functor)}); } @@ -413,9 +429,9 @@ class GraphNodeRef { int> = 0> auto then_parallel_reduce(Policy&& arg_policy, Functor&& functor, ReturnType&& return_value) const { - return this->then_parallel_reduce("", (Policy &&) arg_policy, - (Functor &&) functor, - (ReturnType &&) return_value); + return this->then_parallel_reduce("", (Policy&&)arg_policy, + (Functor&&)functor, + (ReturnType&&)return_value); } template @@ -425,15 +441,15 @@ class GraphNodeRef { ReturnType&& return_value) const { return this->then_parallel_reduce( std::move(label), Kokkos::RangePolicy{0, idx_end}, - (Functor &&) functor, (ReturnType &&) return_value); + (Functor&&)functor, (ReturnType&&)return_value); } template auto then_parallel_reduce(typename execution_space::size_type idx_end, Functor&& functor, ReturnType&& return_value) const { - return this->then_parallel_reduce("", idx_end, (Functor &&) functor, - (ReturnType &&) return_value); + return this->then_parallel_reduce("", idx_end, (Functor&&)functor, + (ReturnType&&)return_value); } // end then_parallel_reduce }}}2 diff --git a/packages/kokkos/core/src/Kokkos_HostSpace.hpp b/packages/kokkos/core/src/Kokkos_HostSpace.hpp index 8b5f29f95b21..706586826f48 100644 --- a/packages/kokkos/core/src/Kokkos_HostSpace.hpp +++ b/packages/kokkos/core/src/Kokkos_HostSpace.hpp @@ -63,10 +63,10 @@ class HostSpace { //! This memory space preferred device_type using device_type = Kokkos::Device; - HostSpace() = default; - HostSpace(HostSpace&& rhs) = default; - HostSpace(const HostSpace& rhs) = default; - HostSpace& operator=(HostSpace&&) = default; + HostSpace() = default; + HostSpace(HostSpace&& rhs) = default; + HostSpace(const HostSpace& rhs) = default; + HostSpace& operator=(HostSpace&&) = default; HostSpace& operator=(const HostSpace&) = default; ~HostSpace() = default; @@ -183,18 +183,6 @@ namespace Kokkos { namespace Impl { -template <> -struct DeepCopy { - DeepCopy(void* dst, const void* src, size_t n) { - hostspace_parallel_deepcopy(dst, src, n); - } - - DeepCopy(const DefaultHostExecutionSpace& exec, void* dst, const void* src, - size_t n) { - hostspace_parallel_deepcopy_async(exec, dst, src, n); - } -}; - template struct DeepCopy { DeepCopy(void* dst, const void* src, size_t n) { @@ -202,10 +190,15 @@ struct DeepCopy { } DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { - exec.fence( - "Kokkos::Impl::DeepCopy::DeepCopy: fence before copy"); - hostspace_parallel_deepcopy_async(dst, src, n); + if constexpr (!Kokkos::SpaceAccessibility::accessible) { + exec.fence( + "Kokkos::Impl::DeepCopy::DeepCopy: fence before copy"); + hostspace_parallel_deepcopy_async(dst, src, n); + } else { + hostspace_parallel_deepcopy_async(exec, dst, src, n); + } } }; diff --git a/packages/kokkos/core/src/Kokkos_Layout.hpp b/packages/kokkos/core/src/Kokkos_Layout.hpp index 37b80e54a85f..a760e7054a16 100644 --- a/packages/kokkos/core/src/Kokkos_Layout.hpp +++ b/packages/kokkos/core/src/Kokkos_Layout.hpp @@ -52,13 +52,17 @@ struct LayoutLeft { using array_layout = LayoutLeft; size_t dimension[ARRAY_LAYOUT_MAX_RANK]; + // we don't have a constructor to set the stride directly + // but we will deprecate the class anyway (or at least using an instance of + // this class) when switching the internal implementation to use mdspan + size_t stride; enum : bool { is_extent_constructible = true }; - LayoutLeft(LayoutLeft const&) = default; - LayoutLeft(LayoutLeft&&) = default; + LayoutLeft(LayoutLeft const&) = default; + LayoutLeft(LayoutLeft&&) = default; LayoutLeft& operator=(LayoutLeft const&) = default; - LayoutLeft& operator=(LayoutLeft&&) = default; + LayoutLeft& operator=(LayoutLeft&&) = default; KOKKOS_INLINE_FUNCTION explicit constexpr LayoutLeft(size_t N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, @@ -69,7 +73,8 @@ struct LayoutLeft { size_t N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, size_t N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, size_t N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) - : dimension{N0, N1, N2, N3, N4, N5, N6, N7} {} + : dimension{N0, N1, N2, N3, N4, N5, N6, N7}, + stride(KOKKOS_IMPL_CTOR_DEFAULT_ARG) {} friend bool operator==(const LayoutLeft& left, const LayoutLeft& right) { for (unsigned int rank = 0; rank < ARRAY_LAYOUT_MAX_RANK; ++rank) @@ -101,13 +106,17 @@ struct LayoutRight { using array_layout = LayoutRight; size_t dimension[ARRAY_LAYOUT_MAX_RANK]; + // we don't have a constructor to set the stride directly + // but we will deprecate the class anyway (or at least using an instance of + // this class) when switching the internal implementation to use mdspan + size_t stride; enum : bool { is_extent_constructible = true }; - LayoutRight(LayoutRight const&) = default; - LayoutRight(LayoutRight&&) = default; + LayoutRight(LayoutRight const&) = default; + LayoutRight(LayoutRight&&) = default; LayoutRight& operator=(LayoutRight const&) = default; - LayoutRight& operator=(LayoutRight&&) = default; + LayoutRight& operator=(LayoutRight&&) = default; KOKKOS_INLINE_FUNCTION explicit constexpr LayoutRight(size_t N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, @@ -118,7 +127,8 @@ struct LayoutRight { size_t N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, size_t N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, size_t N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) - : dimension{N0, N1, N2, N3, N4, N5, N6, N7} {} + : dimension{N0, N1, N2, N3, N4, N5, N6, N7}, + stride{KOKKOS_IMPL_CTOR_DEFAULT_ARG} {} friend bool operator==(const LayoutRight& left, const LayoutRight& right) { for (unsigned int rank = 0; rank < ARRAY_LAYOUT_MAX_RANK; ++rank) @@ -144,10 +154,10 @@ struct LayoutStride { enum : bool { is_extent_constructible = false }; - LayoutStride(LayoutStride const&) = default; - LayoutStride(LayoutStride&&) = default; + LayoutStride(LayoutStride const&) = default; + LayoutStride(LayoutStride&&) = default; LayoutStride& operator=(LayoutStride const&) = default; - LayoutStride& operator=(LayoutStride&&) = default; + LayoutStride& operator=(LayoutStride&&) = default; /** \brief Compute strides from ordered dimensions. * @@ -191,8 +201,8 @@ struct LayoutStride { size_t N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, size_t S5 = 0, size_t N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, size_t S6 = 0, size_t N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, size_t S7 = 0) - : dimension{N0, N1, N2, N3, N4, N5, N6, N7}, stride{S0, S1, S2, S3, - S4, S5, S6, S7} {} + : dimension{N0, N1, N2, N3, N4, N5, N6, N7}, + stride{S0, S1, S2, S3, S4, S5, S6, S7} {} friend bool operator==(const LayoutStride& left, const LayoutStride& right) { for (unsigned int rank = 0; rank < ARRAY_LAYOUT_MAX_RANK; ++rank) diff --git a/packages/kokkos/core/src/Kokkos_Macros.hpp b/packages/kokkos/core/src/Kokkos_Macros.hpp index 0a0acd303f52..97b78a3c6485 100644 --- a/packages/kokkos/core/src/Kokkos_Macros.hpp +++ b/packages/kokkos/core/src/Kokkos_Macros.hpp @@ -27,7 +27,7 @@ * KOKKOS_ENABLE_OPENMPTARGET Kokkos::Experimental::OpenMPTarget * execution space * KOKKOS_ENABLE_HIP Kokkos::HIP execution space - * KOKKOS_ENABLE_SYCL Kokkos::Experimental::SYCL execution space + * KOKKOS_ENABLE_SYCL Kokkos::SYCL execution space * KOKKOS_ENABLE_HWLOC HWLOC library is available. * KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK Insert array bounds checks, is expensive! * KOKKOS_ENABLE_CUDA_UVM Use CUDA UVM for Cuda memory space. @@ -132,7 +132,7 @@ #define KOKKOS_CLASS_LAMBDA [ =, *this ] #endif -//#if !defined( __CUDA_ARCH__ ) // Not compiling Cuda code to 'ptx'. +// #if !defined( __CUDA_ARCH__ ) // Not compiling Cuda code to 'ptx'. // Intel compiler for host code. @@ -252,10 +252,10 @@ // CLANG compiler macros #if defined(KOKKOS_COMPILER_CLANG) -//#define KOKKOS_ENABLE_PRAGMA_UNROLL 1 -//#define KOKKOS_ENABLE_PRAGMA_IVDEP 1 -//#define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1 -//#define KOKKOS_ENABLE_PRAGMA_VECTOR 1 +// #define KOKKOS_ENABLE_PRAGMA_UNROLL 1 +// #define KOKKOS_ENABLE_PRAGMA_IVDEP 1 +// #define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1 +// #define KOKKOS_ENABLE_PRAGMA_VECTOR 1 #if !defined(KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION) #define KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION \ @@ -273,10 +273,10 @@ // GNU Compiler macros #if defined(KOKKOS_COMPILER_GNU) -//#define KOKKOS_ENABLE_PRAGMA_UNROLL 1 -//#define KOKKOS_ENABLE_PRAGMA_IVDEP 1 -//#define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1 -//#define KOKKOS_ENABLE_PRAGMA_VECTOR 1 +// #define KOKKOS_ENABLE_PRAGMA_UNROLL 1 +// #define KOKKOS_ENABLE_PRAGMA_IVDEP 1 +// #define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1 +// #define KOKKOS_ENABLE_PRAGMA_VECTOR 1 #if !defined(KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION) #define KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION \ @@ -298,7 +298,7 @@ #if defined(KOKKOS_COMPILER_NVHPC) #define KOKKOS_ENABLE_PRAGMA_UNROLL 1 #define KOKKOS_ENABLE_PRAGMA_IVDEP 1 -//#define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1 +// #define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1 #define KOKKOS_ENABLE_PRAGMA_VECTOR 1 #endif @@ -357,6 +357,21 @@ #define KOKKOS_IMPL_DEVICE_FUNCTION #endif +// FIXME_OPENACC FIXME_OPENMPTARGET +// Move to setup files once there is more content +// clang-format off +#if defined(KOKKOS_ENABLE_OPENACC) +#define KOKKOS_IMPL_RELOCATABLE_FUNCTION @"KOKKOS_RELOCATABLE_FUNCTION is not supported for the OpenACC backend" +#endif +#if defined(KOKKOS_ENABLE_OPENMPTARGET) +#define KOKKOS_IMPL_RELOCATABLE_FUNCTION @"KOKKOS_RELOCATABLE_FUNCTION is not supported for the OpenMPTarget backend" +#endif +// clang-format on + +#if !defined(KOKKOS_IMPL_RELOCATABLE_FUNCTION) +#define KOKKOS_IMPL_RELOCATABLE_FUNCTION +#endif + //---------------------------------------------------------------------------- // Define final version of functions. This is so that clang tidy can find these // macros more easily @@ -369,10 +384,14 @@ #define KOKKOS_FORCEINLINE_FUNCTION \ KOKKOS_IMPL_FORCEINLINE_FUNCTION \ __attribute__((annotate("KOKKOS_FORCEINLINE_FUNCTION"))) +#define KOKKOS_RELOCATABLE_FUNCTION \ + KOKKOS_IMPL_RELOCATABLE_FUNCTION \ + __attribute__((annotate("KOKKOS_RELOCATABLE_FUNCTION"))) #else #define KOKKOS_FUNCTION KOKKOS_IMPL_FUNCTION #define KOKKOS_INLINE_FUNCTION KOKKOS_IMPL_INLINE_FUNCTION #define KOKKOS_FORCEINLINE_FUNCTION KOKKOS_IMPL_FORCEINLINE_FUNCTION +#define KOKKOS_RELOCATABLE_FUNCTION KOKKOS_IMPL_RELOCATABLE_FUNCTION #endif //---------------------------------------------------------------------------- @@ -537,14 +556,17 @@ static constexpr bool kokkos_omp_on_host() { return false; } // If compiling with CUDA, we must use relocatable device code to enable the // task policy. +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 #if defined(KOKKOS_ENABLE_CUDA) #if defined(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE) #define KOKKOS_ENABLE_TASKDAG #endif // FIXME_SYCL Tasks not implemented -#elif !defined(KOKKOS_ENABLE_HIP) && !defined(KOKKOS_ENABLE_SYCL) +#elif !defined(KOKKOS_ENABLE_HIP) && !defined(KOKKOS_ENABLE_SYCL) && \ + !defined(KOKKOS_ENABLE_OPENMPTARGET) #define KOKKOS_ENABLE_TASKDAG #endif +#endif #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_ENABLE_DEPRECATED_CODE_4) #define KOKKOS_ENABLE_CUDA_LDG_INTRINSIC @@ -582,9 +604,11 @@ static constexpr bool kokkos_omp_on_host() { return false; } // clang-format off #if defined(__NVCOMPILER) #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() \ - _Pragma("diag_suppress 1216") + _Pragma("diag_suppress 1216") \ + _Pragma("diag_suppress deprecated_entity_with_custom_message") #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() \ - _Pragma("diag_default 1216") + _Pragma("diag_default 1216") \ + _Pragma("diag_suppress deprecated_entity_with_custom_message") #elif defined(__EDG__) #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() \ _Pragma("warning push") \ @@ -607,6 +631,18 @@ static constexpr bool kokkos_omp_on_host() { return false; } #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() #endif + +#if defined(__NVCOMPILER) +#define KOKKOS_IMPL_DISABLE_UNREACHABLE_WARNINGS_PUSH() \ + _Pragma("diag_suppress code_is_unreachable") \ + _Pragma("diag_suppress initialization_not_reachable") +#define KOKKOS_IMPL_DISABLE_UNREACHABLE_WARNINGS_POP() \ + _Pragma("diag_default code_is_unreachable") \ + _Pragma("diag_default initialization_not_reachable") +#else +#define KOKKOS_IMPL_DISABLE_UNREACHABLE_WARNINGS_PUSH() +#define KOKKOS_IMPL_DISABLE_UNREACHABLE_WARNINGS_POP() +#endif // clang-format on #define KOKKOS_ATTRIBUTE_NODISCARD [[nodiscard]] diff --git a/packages/kokkos/core/src/Kokkos_MemoryPool.hpp b/packages/kokkos/core/src/Kokkos_MemoryPool.hpp index ce8c9e152fa3..f7e9e2a78c45 100644 --- a/packages/kokkos/core/src/Kokkos_MemoryPool.hpp +++ b/packages/kokkos/core/src/Kokkos_MemoryPool.hpp @@ -196,9 +196,10 @@ class MemoryPool { stats.consumed_superblocks++; stats.consumed_blocks += block_used; - stats.consumed_bytes += block_used * block_size; + stats.consumed_bytes += static_cast(block_used) * block_size; stats.reserved_blocks += block_count - block_used; - stats.reserved_bytes += (block_count - block_used) * block_size; + stats.reserved_bytes += + static_cast(block_count - block_used) * block_size; } } @@ -234,9 +235,9 @@ class MemoryPool { //-------------------------------------------------------------------------- - KOKKOS_DEFAULTED_FUNCTION MemoryPool(MemoryPool &&) = default; - KOKKOS_DEFAULTED_FUNCTION MemoryPool(const MemoryPool &) = default; - KOKKOS_DEFAULTED_FUNCTION MemoryPool &operator=(MemoryPool &&) = default; + KOKKOS_DEFAULTED_FUNCTION MemoryPool(MemoryPool &&) = default; + KOKKOS_DEFAULTED_FUNCTION MemoryPool(const MemoryPool &) = default; + KOKKOS_DEFAULTED_FUNCTION MemoryPool &operator=(MemoryPool &&) = default; KOKKOS_DEFAULTED_FUNCTION MemoryPool &operator=(const MemoryPool &) = default; KOKKOS_INLINE_FUNCTION MemoryPool() diff --git a/packages/kokkos/core/src/Kokkos_NumericTraits.hpp b/packages/kokkos/core/src/Kokkos_NumericTraits.hpp index 118bf52c05fa..1304d3ba9260 100644 --- a/packages/kokkos/core/src/Kokkos_NumericTraits.hpp +++ b/packages/kokkos/core/src/Kokkos_NumericTraits.hpp @@ -114,7 +114,7 @@ template <> struct signaling_NaN_helper { static constexpr long dou #endif template struct digits_helper {}; template <> struct digits_helper { static constexpr int value = 1; }; -template <> struct digits_helper { static constexpr int value = CHAR_BIT - std::is_signed::value; }; +template <> struct digits_helper { static constexpr int value = CHAR_BIT - std::is_signed_v; }; template <> struct digits_helper { static constexpr int value = CHAR_BIT - 1; }; template <> struct digits_helper { static constexpr int value = CHAR_BIT; }; template <> struct digits_helper { static constexpr int value = CHAR_BIT*sizeof(short)-1; }; diff --git a/packages/kokkos/core/src/Kokkos_Pair.hpp b/packages/kokkos/core/src/Kokkos_Pair.hpp index e569fefc14df..c44d1f231098 100644 --- a/packages/kokkos/core/src/Kokkos_Pair.hpp +++ b/packages/kokkos/core/src/Kokkos_Pair.hpp @@ -449,7 +449,8 @@ struct KOKKOS_DEPRECATED pair { // Specialization of relational operators for Kokkos::pair. // -#if defined(KOKKOS_COMPILER_GNU) && (KOKKOS_COMPILER_GNU < 1110) +#if defined(KOKKOS_ENABLE_DEPRECATION_WARNINGS) && \ + defined(KOKKOS_COMPILER_GNU) && (KOKKOS_COMPILER_GNU < 1110) KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() #endif template @@ -487,7 +488,8 @@ KOKKOS_DEPRECATED KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator>=( const pair& lhs, const pair& rhs) { return !(lhs < rhs); } -#if defined(KOKKOS_COMPILER_GNU) && (KOKKOS_COMPILER_GNU < 1110) +#if defined(KOKKOS_ENABLE_DEPRECATION_WARNINGS) && \ + defined(KOKKOS_COMPILER_GNU) && (KOKKOS_COMPILER_GNU < 1110) KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() #endif #endif diff --git a/packages/kokkos/core/src/Kokkos_Parallel.hpp b/packages/kokkos/core/src/Kokkos_Parallel.hpp index 122239df7908..24349e95aea1 100644 --- a/packages/kokkos/core/src/Kokkos_Parallel.hpp +++ b/packages/kokkos/core/src/Kokkos_Parallel.hpp @@ -72,19 +72,19 @@ struct FunctorPolicyExecutionSpace { static_assert( !is_detected::value || !is_detected::value || - std::is_same::value, + std::is_same_v, "A policy with an execution space and a functor with an execution space " "are given but the execution space types do not match!"); static_assert(!is_detected::value || !is_detected::value || - std::is_same::value, + std::is_same_v, "A policy with an execution space and a functor with a device " "type are given but the execution space types do not match!"); static_assert(!is_detected::value || !is_detected::value || - std::is_same::value, + std::is_same_v, "A functor with both an execution space and device type is " "given but their execution space types do not match!"); @@ -134,8 +134,10 @@ inline void parallel_for(const std::string& str, const ExecPolicy& policy, const FunctorType& functor) { uint64_t kpID = 0; - ExecPolicy inner_policy = policy; - Kokkos::Tools::Impl::begin_parallel_for(inner_policy, functor, str, kpID); + /** Request a tuned policy from the tools subsystem */ + const auto& response = + Kokkos::Tools::Impl::begin_parallel_for(policy, functor, str, kpID); + const auto& inner_policy = response.policy; auto closure = Kokkos::Impl::construct_with_shared_allocation_tracking_disabled< @@ -348,9 +350,11 @@ template ::value>> inline void parallel_scan(const std::string& str, const ExecutionPolicy& policy, const FunctorType& functor) { - uint64_t kpID = 0; - ExecutionPolicy inner_policy = policy; - Kokkos::Tools::Impl::begin_parallel_scan(inner_policy, functor, str, kpID); + uint64_t kpID = 0; + /** Request a tuned policy from the tools subsystem */ + const auto& response = + Kokkos::Tools::Impl::begin_parallel_scan(policy, functor, str, kpID); + const auto& inner_policy = response.policy; auto closure = Kokkos::Impl::construct_with_shared_allocation_tracking_disabled< diff --git a/packages/kokkos/core/src/Kokkos_Parallel_Reduce.hpp b/packages/kokkos/core/src/Kokkos_Parallel_Reduce.hpp index 53913266f130..3b89d184f2a4 100644 --- a/packages/kokkos/core/src/Kokkos_Parallel_Reduce.hpp +++ b/packages/kokkos/core/src/Kokkos_Parallel_Reduce.hpp @@ -73,7 +73,7 @@ struct Sum { template KOKKOS_DEDUCTION_GUIDE Sum(View const&) - ->Sum::memory_space>; + -> Sum::memory_space>; template struct Prod { @@ -118,7 +118,7 @@ struct Prod { template KOKKOS_DEDUCTION_GUIDE Prod(View const&) - ->Prod::memory_space>; + -> Prod::memory_space>; template struct Min { @@ -165,7 +165,7 @@ struct Min { template KOKKOS_DEDUCTION_GUIDE Min(View const&) - ->Min::memory_space>; + -> Min::memory_space>; template struct Max { @@ -213,7 +213,7 @@ struct Max { template KOKKOS_DEDUCTION_GUIDE Max(View const&) - ->Max::memory_space>; + -> Max::memory_space>; template struct LAnd { @@ -259,7 +259,7 @@ struct LAnd { template KOKKOS_DEDUCTION_GUIDE LAnd(View const&) - ->LAnd::memory_space>; + -> LAnd::memory_space>; template struct LOr { @@ -306,7 +306,7 @@ struct LOr { template KOKKOS_DEDUCTION_GUIDE LOr(View const&) - ->LOr::memory_space>; + -> LOr::memory_space>; template struct BAnd { @@ -353,7 +353,7 @@ struct BAnd { template KOKKOS_DEDUCTION_GUIDE BAnd(View const&) - ->BAnd::memory_space>; + -> BAnd::memory_space>; template struct BOr { @@ -400,7 +400,7 @@ struct BOr { template KOKKOS_DEDUCTION_GUIDE BOr(View const&) - ->BOr::memory_space>; + -> BOr::memory_space>; template struct ValLocScalar { @@ -438,7 +438,12 @@ struct MinLoc { // Required KOKKOS_INLINE_FUNCTION void join(value_type& dest, const value_type& src) const { - if (src.val < dest.val) dest = src; + if (src.val < dest.val) + dest = src; + else if (src.val == dest.val && + dest.loc == reduction_identity::min()) { + dest.loc = src.loc; + } } KOKKOS_INLINE_FUNCTION @@ -458,11 +463,10 @@ struct MinLoc { }; template -KOKKOS_DEDUCTION_GUIDE MinLoc( - View, Properties...> const&) - ->MinLoc, - Properties...>::memory_space>; +KOKKOS_DEDUCTION_GUIDE +MinLoc(View, Properties...> const&) -> MinLoc< + Scalar, Index, + typename View, Properties...>::memory_space>; template struct MaxLoc { @@ -494,7 +498,12 @@ struct MaxLoc { // Required KOKKOS_INLINE_FUNCTION void join(value_type& dest, const value_type& src) const { - if (src.val > dest.val) dest = src; + if (src.val > dest.val) + dest = src; + else if (src.val == dest.val && + dest.loc == reduction_identity::min()) { + dest.loc = src.loc; + } } KOKKOS_INLINE_FUNCTION @@ -514,11 +523,10 @@ struct MaxLoc { }; template -KOKKOS_DEDUCTION_GUIDE MaxLoc( - View, Properties...> const&) - ->MaxLoc, - Properties...>::memory_space>; +KOKKOS_DEDUCTION_GUIDE +MaxLoc(View, Properties...> const&) -> MaxLoc< + Scalar, Index, + typename View, Properties...>::memory_space>; template struct MinMaxScalar { @@ -580,8 +588,8 @@ struct MinMax { template KOKKOS_DEDUCTION_GUIDE MinMax(View, Properties...> const&) - ->MinMax, Properties...>::memory_space>; + -> MinMax, Properties...>::memory_space>; template struct MinMaxLocScalar { @@ -622,10 +630,16 @@ struct MinMaxLoc { if (src.min_val < dest.min_val) { dest.min_val = src.min_val; dest.min_loc = src.min_loc; + } else if (dest.min_val == src.min_val && + dest.min_loc == reduction_identity::min()) { + dest.min_loc = src.min_loc; } if (src.max_val > dest.max_val) { dest.max_val = src.max_val; dest.max_loc = src.max_loc; + } else if (dest.max_val == src.max_val && + dest.max_loc == reduction_identity::min()) { + dest.max_loc = src.max_loc; } } @@ -650,9 +664,9 @@ struct MinMaxLoc { template KOKKOS_DEDUCTION_GUIDE MinMaxLoc( View, Properties...> const&) - ->MinMaxLoc, - Properties...>::memory_space>; + -> MinMaxLoc, + Properties...>::memory_space>; // -------------------------------------------------- // reducers added to support std algorithms @@ -718,9 +732,9 @@ struct MaxFirstLoc { template KOKKOS_DEDUCTION_GUIDE MaxFirstLoc( View, Properties...> const&) - ->MaxFirstLoc, - Properties...>::memory_space>; + -> MaxFirstLoc, + Properties...>::memory_space>; // // MaxFirstLocCustomComparator @@ -788,9 +802,9 @@ template KOKKOS_DEDUCTION_GUIDE MaxFirstLocCustomComparator( View, Properties...> const&, ComparatorType) - ->MaxFirstLocCustomComparator, - Properties...>::memory_space>; + -> MaxFirstLocCustomComparator, + Properties...>::memory_space>; // // MinFirstLoc @@ -852,9 +866,9 @@ struct MinFirstLoc { template KOKKOS_DEDUCTION_GUIDE MinFirstLoc( View, Properties...> const&) - ->MinFirstLoc, - Properties...>::memory_space>; + -> MinFirstLoc, + Properties...>::memory_space>; // // MinFirstLocCustomComparator @@ -922,9 +936,9 @@ template KOKKOS_DEDUCTION_GUIDE MinFirstLocCustomComparator( View, Properties...> const&, ComparatorType) - ->MinFirstLocCustomComparator, - Properties...>::memory_space>; + -> MinFirstLocCustomComparator, + Properties...>::memory_space>; // // MinMaxFirstLastLoc @@ -997,9 +1011,9 @@ struct MinMaxFirstLastLoc { template KOKKOS_DEDUCTION_GUIDE MinMaxFirstLastLoc( View, Properties...> const&) - ->MinMaxFirstLastLoc, - Properties...>::memory_space>; + -> MinMaxFirstLastLoc, + Properties...>::memory_space>; // // MinMaxFirstLastLocCustomComparator @@ -1077,7 +1091,7 @@ template KOKKOS_DEDUCTION_GUIDE MinMaxFirstLastLocCustomComparator( View, Properties...> const&, ComparatorType) - ->MinMaxFirstLastLocCustomComparator< + -> MinMaxFirstLastLocCustomComparator< Scalar, Index, ComparatorType, typename View, Properties...>::memory_space>; @@ -1139,10 +1153,9 @@ struct FirstLoc { }; template -KOKKOS_DEDUCTION_GUIDE FirstLoc( - View, Properties...> const&) - ->FirstLoc, - Properties...>::memory_space>; +KOKKOS_DEDUCTION_GUIDE +FirstLoc(View, Properties...> const&) -> FirstLoc< + Index, typename View, Properties...>::memory_space>; // // LastLoc @@ -1202,8 +1215,8 @@ struct LastLoc { template KOKKOS_DEDUCTION_GUIDE LastLoc(View, Properties...> const&) - ->LastLoc, Properties...>::memory_space>; + -> LastLoc, + Properties...>::memory_space>; template struct StdIsPartScalar { @@ -1270,8 +1283,8 @@ struct StdIsPartitioned { template KOKKOS_DEDUCTION_GUIDE StdIsPartitioned( View, Properties...> const&) - ->StdIsPartitioned, - Properties...>::memory_space>; + -> StdIsPartitioned, + Properties...>::memory_space>; template struct StdPartPointScalar { @@ -1333,8 +1346,8 @@ struct StdPartitionPoint { template KOKKOS_DEDUCTION_GUIDE StdPartitionPoint( View, Properties...> const&) - ->StdPartitionPoint, - Properties...>::memory_space>; + -> StdPartitionPoint, + Properties...>::memory_space>; } // namespace Kokkos namespace Kokkos { @@ -1404,9 +1417,9 @@ struct ParallelReduceReturnValue< template struct ParallelReduceReturnValue< std::enable_if_t::value && - (!std::is_array::value && - !std::is_pointer::value) && - !Kokkos::is_reducer::value>, + (!std::is_array_v && + !std::is_pointer_v< + ReturnType>)&&!Kokkos::is_reducer::value>, ReturnType, FunctorType> { using return_type = Kokkos::View; @@ -1422,8 +1435,8 @@ struct ParallelReduceReturnValue< template struct ParallelReduceReturnValue< - std::enable_if_t<(std::is_array::value || - std::is_pointer::value)>, + std::enable_if_t<(std::is_array_v || + std::is_pointer_v)>, ReturnType, FunctorType> { using return_type = Kokkos::View, Kokkos::HostSpace, Kokkos::MemoryUnmanaged>; @@ -1434,7 +1447,7 @@ struct ParallelReduceReturnValue< static return_type return_value(ReturnType& return_val, const FunctorType& functor) { - if (std::is_array::value) + if (std::is_array_v) return return_type(return_val); else return return_type(return_val, functor.value_count); @@ -1467,8 +1480,7 @@ struct ParallelReducePolicyType< template struct ParallelReducePolicyType< - std::enable_if_t::value>, PolicyType, - FunctorType> { + std::enable_if_t>, PolicyType, FunctorType> { using execution_space = typename Impl::FunctorPolicyExecutionSpace::execution_space; @@ -1501,27 +1513,28 @@ struct ParallelReduceAdaptor { using PassedReducerType = typename return_value_adapter::reducer_type; uint64_t kpID = 0; - PolicyType inner_policy = policy; - Kokkos::Tools::Impl::begin_parallel_reduce( - inner_policy, functor, label, kpID); - using ReducerSelector = - Kokkos::Impl::if_c::value, + Kokkos::Impl::if_c, FunctorType, PassedReducerType>; using Analysis = FunctorAnalysis; - using CombinedFunctorReducerType = CombinedFunctorReducer; + + CombinedFunctorReducerType functor_reducer( + functor, typename Analysis::Reducer( + ReducerSelector::select(functor, return_value))); + const auto& response = Kokkos::Tools::Impl::begin_parallel_reduce< + typename return_value_adapter::reducer_type>(policy, functor_reducer, + label, kpID); + const auto& inner_policy = response.policy; + auto closure = construct_with_shared_allocation_tracking_disabled< Impl::ParallelReduce::execution_space>>( - CombinedFunctorReducerType( - functor, typename Analysis::Reducer( - ReducerSelector::select(functor, return_value))), - inner_policy, + functor_reducer, inner_policy, return_value_adapter::return_value(return_value, functor)); closure.execute(); @@ -1536,7 +1549,7 @@ struct ParallelReduceAdaptor { template static inline std::enable_if_t::value)> + std::is_pointer_v)> execute(const std::string& label, const PolicyType& policy, const FunctorType& functor, ReturnType& return_value) { execute_impl(label, policy, functor, return_value); @@ -1568,7 +1581,7 @@ struct ReducerHasTestReferenceFunction { static std::false_type test_func(...); enum { - value = std::is_same(nullptr))>::value + value = std::is_same_v(nullptr))> }; }; @@ -1611,7 +1624,7 @@ struct ParallelReduceFence { template static void fence(const ExecutionSpace& ex, const std::string& name, ArgsDeduced&&... args) { - if (Impl::parallel_reduce_needs_fence(ex, (ArgsDeduced &&) args...)) { + if (Impl::parallel_reduce_needs_fence(ex, (ArgsDeduced&&)args...)) { ex.fence(name); } } @@ -1663,11 +1676,11 @@ template inline std::enable_if_t::value && !(Kokkos::is_view::value || Kokkos::is_reducer::value || - std::is_pointer::value)> + std::is_pointer_v)> parallel_reduce(const std::string& label, const PolicyType& policy, const FunctorType& functor, ReturnType& return_value) { static_assert( - !std::is_const::value, + !std::is_const_v, "A const reduction result type is only allowed for a View, pointer or " "reducer return type!"); @@ -1684,11 +1697,11 @@ template inline std::enable_if_t::value && !(Kokkos::is_view::value || Kokkos::is_reducer::value || - std::is_pointer::value)> + std::is_pointer_v)> parallel_reduce(const PolicyType& policy, const FunctorType& functor, ReturnType& return_value) { static_assert( - !std::is_const::value, + !std::is_const_v, "A const reduction result type is only allowed for a View, pointer or " "reducer return type!"); @@ -1704,11 +1717,11 @@ parallel_reduce(const PolicyType& policy, const FunctorType& functor, template inline std::enable_if_t::value || Kokkos::is_reducer::value || - std::is_pointer::value)> + std::is_pointer_v)> parallel_reduce(const size_t& policy, const FunctorType& functor, ReturnType& return_value) { static_assert( - !std::is_const::value, + !std::is_const_v, "A const reduction result type is only allowed for a View, pointer or " "reducer return type!"); @@ -1728,11 +1741,11 @@ parallel_reduce(const size_t& policy, const FunctorType& functor, template inline std::enable_if_t::value || Kokkos::is_reducer::value || - std::is_pointer::value)> + std::is_pointer_v)> parallel_reduce(const std::string& label, const size_t& policy, const FunctorType& functor, ReturnType& return_value) { static_assert( - !std::is_const::value, + !std::is_const_v, "A const reduction result type is only allowed for a View, pointer or " "reducer return type!"); @@ -1754,7 +1767,7 @@ template inline std::enable_if_t::value && (Kokkos::is_view::value || Kokkos::is_reducer::value || - std::is_pointer::value)> + std::is_pointer_v)> parallel_reduce(const std::string& label, const PolicyType& policy, const FunctorType& functor, const ReturnType& return_value) { ReturnType return_value_impl = return_value; @@ -1771,7 +1784,7 @@ template inline std::enable_if_t::value && (Kokkos::is_view::value || Kokkos::is_reducer::value || - std::is_pointer::value)> + std::is_pointer_v)> parallel_reduce(const PolicyType& policy, const FunctorType& functor, const ReturnType& return_value) { ReturnType return_value_impl = return_value; @@ -1787,7 +1800,7 @@ parallel_reduce(const PolicyType& policy, const FunctorType& functor, template inline std::enable_if_t::value || Kokkos::is_reducer::value || - std::is_pointer::value> + std::is_pointer_v> parallel_reduce(const size_t& policy, const FunctorType& functor, const ReturnType& return_value) { using policy_type = @@ -1806,7 +1819,7 @@ parallel_reduce(const size_t& policy, const FunctorType& functor, template inline std::enable_if_t::value || Kokkos::is_reducer::value || - std::is_pointer::value> + std::is_pointer_v> parallel_reduce(const std::string& label, const size_t& policy, const FunctorType& functor, const ReturnType& return_value) { using policy_type = diff --git a/packages/kokkos/core/src/Kokkos_Profiling_ProfileSection.hpp b/packages/kokkos/core/src/Kokkos_Profiling_ProfileSection.hpp index e7a9ba0c7ed7..1759c2b4a1c5 100644 --- a/packages/kokkos/core/src/Kokkos_Profiling_ProfileSection.hpp +++ b/packages/kokkos/core/src/Kokkos_Profiling_ProfileSection.hpp @@ -32,7 +32,7 @@ class [[nodiscard]] ProfilingSection { uint32_t sectionID; public: - ProfilingSection(ProfilingSection const&) = delete; + ProfilingSection(ProfilingSection const&) = delete; ProfilingSection& operator=(ProfilingSection const&) = delete; #if defined(__has_cpp_attribute) && __has_cpp_attribute(nodiscard) >= 201907 diff --git a/packages/kokkos/core/src/Kokkos_Profiling_ScopedRegion.hpp b/packages/kokkos/core/src/Kokkos_Profiling_ScopedRegion.hpp index f45dfa324e9f..a4168b9401fa 100644 --- a/packages/kokkos/core/src/Kokkos_Profiling_ScopedRegion.hpp +++ b/packages/kokkos/core/src/Kokkos_Profiling_ScopedRegion.hpp @@ -30,7 +30,7 @@ namespace Kokkos::Profiling { class [[nodiscard]] ScopedRegion { public: - ScopedRegion(ScopedRegion const &) = delete; + ScopedRegion(ScopedRegion const &) = delete; ScopedRegion &operator=(ScopedRegion const &) = delete; #if defined(__has_cpp_attribute) && __has_cpp_attribute(nodiscard) >= 201907 diff --git a/packages/kokkos/core/src/Kokkos_ScratchSpace.hpp b/packages/kokkos/core/src/Kokkos_ScratchSpace.hpp index a925e32a339e..f00e25fdb629 100644 --- a/packages/kokkos/core/src/Kokkos_ScratchSpace.hpp +++ b/packages/kokkos/core/src/Kokkos_ScratchSpace.hpp @@ -110,7 +110,7 @@ class ScratchMemorySpace { // Note: for team scratch m_offset is 0, since every // thread will get back the same shared pointer void* tmp = m_iter + m_offset * size; - uintptr_t increment = size * m_multiplier; + uintptr_t increment = static_cast(size) * m_multiplier; // Cast to uintptr_t to avoid problems with pointer arithmetic using SYCL const auto end_iter = diff --git a/packages/kokkos/core/src/Kokkos_TaskScheduler.hpp b/packages/kokkos/core/src/Kokkos_TaskScheduler.hpp index 869a5f8ec26a..3edecb4502a8 100644 --- a/packages/kokkos/core/src/Kokkos_TaskScheduler.hpp +++ b/packages/kokkos/core/src/Kokkos_TaskScheduler.hpp @@ -14,11 +14,17 @@ // //@HEADER -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE #include + +#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE static_assert(false, "Including non-public Kokkos header files is not allowed."); #endif + +#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4 +#error "The tasking framework is deprecated" +#endif + #ifndef KOKKOS_TASKSCHEDULER_HPP #define KOKKOS_TASKSCHEDULER_HPP @@ -44,6 +50,11 @@ static_assert(false, //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +// We allow using deprecated classes in this file +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() +#endif + namespace Kokkos { namespace Impl { @@ -54,7 +65,7 @@ class TaskExec; } // end namespace Impl template -class BasicTaskScheduler : public Impl::TaskSchedulerBase { +class KOKKOS_DEPRECATED BasicTaskScheduler : public Impl::TaskSchedulerBase { public: using scheduler_type = BasicTaskScheduler; using execution_space = ExecSpace; @@ -494,8 +505,8 @@ namespace Kokkos { // Construct a TaskTeam execution policy template -Impl::TaskPolicyWithPredecessor> +KOKKOS_DEPRECATED Impl::TaskPolicyWithPredecessor< + Impl::TaskType::TaskTeam, Kokkos::BasicFuture> KOKKOS_INLINE_FUNCTION TaskTeam(Kokkos::BasicFuture arg_future, TaskPriority arg_priority = TaskPriority::Regular) { @@ -503,7 +514,8 @@ Impl::TaskPolicyWithPredecessor -Impl::TaskPolicyWithScheduler +KOKKOS_DEPRECATED Impl::TaskPolicyWithScheduler KOKKOS_INLINE_FUNCTION TaskTeam( Scheduler arg_scheduler, std::enable_if_t::value, TaskPriority> @@ -512,18 +524,18 @@ Impl::TaskPolicyWithScheduler } template -Impl::TaskPolicyWithScheduler +KOKKOS_DEPRECATED Impl::TaskPolicyWithScheduler< + Kokkos::Impl::TaskType::TaskTeam, Scheduler, PredecessorFuture> KOKKOS_INLINE_FUNCTION TaskTeam(Scheduler arg_scheduler, PredecessorFuture arg_future, std::enable_if_t::value && Kokkos::is_future::value, TaskPriority> arg_priority = TaskPriority::Regular) { - static_assert(std::is_same::value, - "Can't create a task policy from a scheduler and a future from " - "a different scheduler"); + static_assert( + std::is_same_v, + "Can't create a task policy from a scheduler and a future from " + "a different scheduler"); return {std::move(arg_scheduler), std::move(arg_future), arg_priority}; } @@ -531,8 +543,8 @@ Impl::TaskPolicyWithScheduler -Impl::TaskPolicyWithPredecessor> +KOKKOS_DEPRECATED Impl::TaskPolicyWithPredecessor< + Impl::TaskType::TaskSingle, Kokkos::BasicFuture> KOKKOS_INLINE_FUNCTION TaskSingle(Kokkos::BasicFuture arg_future, TaskPriority arg_priority = TaskPriority::Regular) { @@ -540,7 +552,8 @@ Impl::TaskPolicyWithPredecessor -Impl::TaskPolicyWithScheduler +KOKKOS_DEPRECATED Impl::TaskPolicyWithScheduler KOKKOS_INLINE_FUNCTION TaskSingle( Scheduler arg_scheduler, std::enable_if_t::value, TaskPriority> @@ -549,18 +562,18 @@ Impl::TaskPolicyWithScheduler } template -Impl::TaskPolicyWithScheduler +KOKKOS_DEPRECATED Impl::TaskPolicyWithScheduler< + Kokkos::Impl::TaskType::TaskSingle, Scheduler, PredecessorFuture> KOKKOS_INLINE_FUNCTION TaskSingle(Scheduler arg_scheduler, PredecessorFuture arg_future, std::enable_if_t::value && Kokkos::is_future::value, TaskPriority> arg_priority = TaskPriority::Regular) { - static_assert(std::is_same::value, - "Can't create a task policy from a scheduler and a future from " - "a different scheduler"); + static_assert( + std::is_same_v, + "Can't create a task policy from a scheduler and a future from " + "a different scheduler"); return {std::move(arg_scheduler), std::move(arg_future), arg_priority}; } @@ -575,7 +588,8 @@ Impl::TaskPolicyWithScheduler -typename Scheduler::template future_type_for_functor> +KOKKOS_DEPRECATED typename Scheduler::template future_type_for_functor< + std::decay_t> host_spawn(Impl::TaskPolicyWithScheduler arg_policy, FunctorType&& arg_functor) { @@ -606,7 +620,8 @@ host_spawn(Impl::TaskPolicyWithScheduler */ template -typename Scheduler::template future_type_for_functor> +KOKKOS_DEPRECATED typename Scheduler::template future_type_for_functor< + std::decay_t> KOKKOS_INLINE_FUNCTION task_spawn(Impl::TaskPolicyWithScheduler arg_policy, @@ -633,7 +648,7 @@ typename Scheduler::template future_type_for_functor> * 2) High, Normal, or Low priority */ template -void KOKKOS_INLINE_FUNCTION +KOKKOS_DEPRECATED void KOKKOS_INLINE_FUNCTION respawn(FunctorType* arg_self, T const& arg, TaskPriority const& arg_priority = TaskPriority::Regular) { static_assert(Kokkos::is_future::value || Kokkos::is_scheduler::value, @@ -656,7 +671,8 @@ respawn(FunctorType* arg_self, T const& arg, // Wait for all runnable tasks to complete template -inline void wait(BasicTaskScheduler const& scheduler) { +KOKKOS_DEPRECATED inline void wait( + BasicTaskScheduler const& scheduler) { using scheduler_type = BasicTaskScheduler; scheduler_type::specialization::execute(scheduler); // scheduler.m_queue->execute(); @@ -664,6 +680,10 @@ inline void wait(BasicTaskScheduler const& scheduler) { } // namespace Kokkos +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() +#endif + //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- diff --git a/packages/kokkos/core/src/Kokkos_TaskScheduler_fwd.hpp b/packages/kokkos/core/src/Kokkos_TaskScheduler_fwd.hpp index 203fb16eaf0b..83e1c06db9b9 100644 --- a/packages/kokkos/core/src/Kokkos_TaskScheduler_fwd.hpp +++ b/packages/kokkos/core/src/Kokkos_TaskScheduler_fwd.hpp @@ -31,31 +31,40 @@ static_assert(false, #include //---------------------------------------------------------------------------- +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +// We allow using deprecated classes in this file +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() +#endif + namespace Kokkos { // Forward declarations used in Impl::TaskQueue template -class BasicFuture; +class KOKKOS_DEPRECATED BasicFuture; template -class SimpleTaskScheduler; +class KOKKOS_DEPRECATED SimpleTaskScheduler; template -class BasicTaskScheduler; +class KOKKOS_DEPRECATED BasicTaskScheduler; template -struct is_scheduler : public std::false_type {}; +struct KOKKOS_DEPRECATED is_scheduler : public std::false_type {}; template -struct is_scheduler> : public std::true_type { -}; +struct KOKKOS_DEPRECATED is_scheduler> + : public std::true_type {}; template -struct is_scheduler> : public std::true_type { -}; +struct KOKKOS_DEPRECATED is_scheduler> + : public std::true_type {}; -enum class TaskPriority : int { High = 0, Regular = 1, Low = 2 }; +enum class KOKKOS_DEPRECATED TaskPriority : int { + High = 0, + Regular = 1, + Low = 2 +}; } // namespace Kokkos @@ -141,28 +150,28 @@ using default_tasking_memory_space_for_execution_space_t = namespace Kokkos { template -using DeprecatedTaskScheduler = BasicTaskScheduler< +using DeprecatedTaskScheduler KOKKOS_DEPRECATED = BasicTaskScheduler< Space, Impl::TaskQueue< Space, Impl::default_tasking_memory_space_for_execution_space_t>>; template -using DeprecatedTaskSchedulerMultiple = BasicTaskScheduler< +using DeprecatedTaskSchedulerMultiple KOKKOS_DEPRECATED = BasicTaskScheduler< Space, Impl::TaskQueueMultiple< Space, Impl::default_tasking_memory_space_for_execution_space_t>>; template -using TaskScheduler = SimpleTaskScheduler< +using TaskScheduler KOKKOS_DEPRECATED = SimpleTaskScheduler< Space, Impl::SingleTaskQueue< Space, Impl::default_tasking_memory_space_for_execution_space_t, Impl::TaskQueueTraitsLockBased>>; template -using TaskSchedulerMultiple = SimpleTaskScheduler< +using TaskSchedulerMultiple KOKKOS_DEPRECATED = SimpleTaskScheduler< Space, Impl::MultipleTaskQueue< Space, Impl::default_tasking_memory_space_for_execution_space_t, @@ -172,7 +181,7 @@ using TaskSchedulerMultiple = SimpleTaskScheduler< Impl::default_tasking_memory_space_for_execution_space_t>>>>; template -using ChaseLevTaskScheduler = SimpleTaskScheduler< +using ChaseLevTaskScheduler KOKKOS_DEPRECATED = SimpleTaskScheduler< Space, Impl::MultipleTaskQueue< Space, Impl::default_tasking_memory_space_for_execution_space_t, @@ -182,7 +191,7 @@ using ChaseLevTaskScheduler = SimpleTaskScheduler< Impl::default_tasking_memory_space_for_execution_space_t>>>>; template -void wait(BasicTaskScheduler const&); +KOKKOS_DEPRECATED void wait(BasicTaskScheduler const&); namespace Impl { @@ -204,6 +213,10 @@ struct TaskPolicyData; } // namespace Kokkos +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() +#endif + //---------------------------------------------------------------------------- #endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */ diff --git a/packages/kokkos/core/src/Kokkos_Timer.hpp b/packages/kokkos/core/src/Kokkos_Timer.hpp index a210b6ff1832..ab31484d76ac 100644 --- a/packages/kokkos/core/src/Kokkos_Timer.hpp +++ b/packages/kokkos/core/src/Kokkos_Timer.hpp @@ -48,7 +48,7 @@ class Timer { inline Timer() { reset(); } - Timer(const Timer&) = delete; + Timer(const Timer&) = delete; Timer& operator=(const Timer&) = delete; inline double seconds() const { diff --git a/packages/kokkos/core/src/Kokkos_Tuners.hpp b/packages/kokkos/core/src/Kokkos_Tuners.hpp index f5ffc66af5b5..fcb061b378f2 100644 --- a/packages/kokkos/core/src/Kokkos_Tuners.hpp +++ b/packages/kokkos/core/src/Kokkos_Tuners.hpp @@ -52,6 +52,8 @@ VariableValue make_variable_value(size_t, int64_t); VariableValue make_variable_value(size_t, double); SetOrRange make_candidate_range(double lower, double upper, double step, bool openLower, bool openUpper); +SetOrRange make_candidate_range(int64_t lower, int64_t upper, int64_t step, + bool openLower, bool openUpper); size_t get_new_context_id(); void begin_context(size_t context_id); void end_context(size_t context_id); @@ -412,18 +414,19 @@ class TeamSizeTuner : public ExtendableTunerMixin { TunerType tuner; public: - TeamSizeTuner() = default; + TeamSizeTuner() = default; TeamSizeTuner& operator=(const TeamSizeTuner& other) = default; TeamSizeTuner(const TeamSizeTuner& other) = default; - TeamSizeTuner& operator=(TeamSizeTuner&& other) = default; - TeamSizeTuner(TeamSizeTuner&& other) = default; + TeamSizeTuner& operator=(TeamSizeTuner&& other) = default; + TeamSizeTuner(TeamSizeTuner&& other) = default; template TeamSizeTuner(const std::string& name, - Kokkos::TeamPolicy& policy, + const Kokkos::TeamPolicy& policy_in, const Functor& functor, const TagType& tag, ViableConfigurationCalculator calc) { - using PolicyType = Kokkos::TeamPolicy; + using PolicyType = Kokkos::TeamPolicy; + PolicyType policy(policy_in); auto initial_vector_length = policy.impl_vector_length(); if (initial_vector_length < 1) { policy.impl_set_vector_length(1); @@ -505,7 +508,8 @@ class TeamSizeTuner : public ExtendableTunerMixin { } template - void tune(Kokkos::TeamPolicy& policy) { + auto tune(const Kokkos::TeamPolicy& policy_in) { + Kokkos::TeamPolicy policy(policy_in); if (Kokkos::Tools::Experimental::have_tuning_tool()) { auto configuration = tuner.begin(); auto team_size = std::get<1>(configuration); @@ -515,6 +519,111 @@ class TeamSizeTuner : public ExtendableTunerMixin { policy.impl_set_vector_length(vector_length); } } + return policy; + } + void end() { + if (Kokkos::Tools::Experimental::have_tuning_tool()) { + tuner.end(); + } + } + + TunerType get_tuner() const { return tuner; } +}; +namespace Impl { +template +struct tuning_type_for; + +template <> +struct tuning_type_for { + static constexpr Kokkos::Tools::Experimental::ValueType value = + Kokkos::Tools::Experimental::ValueType::kokkos_value_double; + static double get( + const Kokkos::Tools::Experimental::VariableValue& value_struct) { + return value_struct.value.double_value; + } +}; +template <> +struct tuning_type_for { + static constexpr Kokkos::Tools::Experimental::ValueType value = + Kokkos::Tools::Experimental::ValueType::kokkos_value_int64; + static int64_t get( + const Kokkos::Tools::Experimental::VariableValue& value_struct) { + return value_struct.value.int_value; + } +}; +} // namespace Impl +template +class SingleDimensionalRangeTuner { + size_t id; + size_t context; + using tuning_util = Impl::tuning_type_for; + + Bound default_value; + + public: + SingleDimensionalRangeTuner() = default; + SingleDimensionalRangeTuner( + const std::string& name, + Kokkos::Tools::Experimental::StatisticalCategory category, + Bound default_val, Bound lower, Bound upper, Bound step = (Bound)0) { + default_value = default_val; + Kokkos::Tools::Experimental::VariableInfo info; + info.category = category; + info.candidates = make_candidate_range( + static_cast(lower), static_cast(upper), + static_cast(step), false, false); + info.valueQuantity = + Kokkos::Tools::Experimental::CandidateValueType::kokkos_value_range; + info.type = tuning_util::value; + id = Kokkos::Tools::Experimental::declare_output_type(name, info); + } + + Bound begin() { + context = Kokkos::Tools::Experimental::get_new_context_id(); + Kokkos::Tools::Experimental::begin_context(context); + auto tuned_value = + Kokkos::Tools::Experimental::make_variable_value(id, default_value); + Kokkos::Tools::Experimental::request_output_values(context, 1, + &tuned_value); + return tuning_util::get(tuned_value); + } + + void end() { Kokkos::Tools::Experimental::end_context(context); } + + template + void with_tuned_value(Functor& func) { + func(begin()); + end(); + } +}; + +class RangePolicyOccupancyTuner { + private: + using TunerType = SingleDimensionalRangeTuner; + TunerType tuner; + + public: + RangePolicyOccupancyTuner() = default; + template + RangePolicyOccupancyTuner(const std::string& name, + const Kokkos::RangePolicy&, + const Functor&, const TagType&, + ViableConfigurationCalculator) + : tuner(TunerType(name, + Kokkos::Tools::Experimental::StatisticalCategory:: + kokkos_value_ratio, + 100, 5, 100, 5)) {} + + template + auto tune(const Kokkos::RangePolicy& policy_in) { + Kokkos::RangePolicy policy(policy_in); + if (Kokkos::Tools::Experimental::have_tuning_tool()) { + auto occupancy = tuner.begin(); + policy.impl_set_desired_occupancy( + Kokkos::Experimental::DesiredOccupancy{static_cast(occupancy)}); + } + return policy; } void end() { if (Kokkos::Tools::Experimental::have_tuning_tool()) { @@ -578,11 +687,13 @@ struct MDRangeTuner : public ExtendableTunerMixin> { policy.impl_change_tile_size({std::get(tuple)...}); } template - void tune(Kokkos::MDRangePolicy& policy) { + auto tune(const Kokkos::MDRangePolicy& policy_in) { + Kokkos::MDRangePolicy policy(policy_in); if (Kokkos::Tools::Experimental::have_tuning_tool()) { auto configuration = tuner.begin(); set_policy_tile(policy, configuration, std::make_index_sequence{}); } + return policy; } void end() { if (Kokkos::Tools::Experimental::have_tuning_tool()) { diff --git a/packages/kokkos/core/src/Kokkos_TypeInfo.hpp b/packages/kokkos/core/src/Kokkos_TypeInfo.hpp new file mode 100644 index 000000000000..e5710da2e3d5 --- /dev/null +++ b/packages/kokkos/core/src/Kokkos_TypeInfo.hpp @@ -0,0 +1,103 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_TYPE_INFO_HPP +#define KOKKOS_TYPE_INFO_HPP + +#include +#include +#include + +#include + +// Intel C++ Compiler Classic version 2021.2.0 works but 2021.1.2 doesn't +// Both have __INTEL_COMPILER defined to 2021 so using +// __INTEL_COMPILER_BUILD_DATE to discriminate. +// Experimenting on the compiler explorer gave +// icc version | __INTEL_COMPILER | __INTEL_COMPILER_BUILD_DATE +// 2021.1.2 | 2021 | 20201208 +// 2021.2.0 | 2021 | 20210228 +// NVCC versions less than 11.3.0 segfault when that header is included +// NVCC+MSVC doesn't work at all - it simply reports "T" inside type_name +#if (!defined(KOKKOS_COMPILER_INTEL) || \ + (__INTEL_COMPILER_BUILD_DATE >= 20210228)) && \ + (!defined(KOKKOS_COMPILER_NVCC) || (KOKKOS_COMPILER_NVCC >= 1130)) && \ + (!(defined(KOKKOS_COMPILER_NVCC) && defined(KOKKOS_COMPILER_MSVC))) + +#define KOKKOS_ENABLE_IMPL_TYPEINFO + +namespace Kokkos::Impl { + +template +constexpr std::array to_array(std::string_view src) { + std::array dst{}; + for (size_t i = 0; i < N; ++i) { + dst[i] = src[i]; + } + return dst; +} + +template +constexpr auto type_name() { +#if defined(__clang__) + constexpr std::string_view func = __PRETTY_FUNCTION__; + constexpr std::string_view prefix{"[T = "}; + constexpr std::string_view suffix{"]"}; +#elif defined(__GNUC__) + constexpr std::string_view func = __PRETTY_FUNCTION__; + constexpr std::string_view prefix{"[with T = "}; + constexpr std::string_view suffix{"]"}; +#elif defined(_MSC_VER) + constexpr std::string_view func = __FUNCSIG__; + constexpr std::string_view prefix{"type_name<"}; + constexpr std::string_view suffix{">(void)"}; +#else +#error bug +#endif + constexpr auto beg = func.find(prefix) + prefix.size(); + constexpr auto end = func.rfind(suffix); + static_assert(beg != std::string_view::npos); + static_assert(end != std::string_view::npos); + return to_array(func.substr(beg, end)); +} + +template +class TypeInfo { + static constexpr auto value_ = type_name(); + + public: + static constexpr std::string_view name() noexcept { + return {value_.data(), value_.size()}; + } +}; + +} // namespace Kokkos::Impl + +#else // out of luck, using Intel C++ Compiler Classic + +namespace Kokkos::Impl { + +template +class TypeInfo { + public: + static constexpr std::string_view name() noexcept { return "not supported"; } +}; + +} // namespace Kokkos::Impl + +#endif + +#endif diff --git a/packages/kokkos/core/src/Kokkos_View.hpp b/packages/kokkos/core/src/Kokkos_View.hpp index 04d1fcf15184..d5b352876c30 100644 --- a/packages/kokkos/core/src/Kokkos_View.hpp +++ b/packages/kokkos/core/src/Kokkos_View.hpp @@ -22,2016 +22,10 @@ static_assert(false, #ifndef KOKKOS_VIEW_HPP #define KOKKOS_VIEW_HPP -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#include -#include - -#ifdef KOKKOS_ENABLE_IMPL_MDSPAN -#include -#include -#include -#endif -#include - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -template -struct ViewArrayAnalysis; - -template ::non_const_value_type> -struct ViewDataAnalysis; - -template -class ViewMapping { - public: - enum : bool { is_assignable_data_type = false }; - enum : bool { is_assignable = false }; -}; - -template -constexpr KOKKOS_INLINE_FUNCTION std::size_t count_valid_integers( - const IntType i0, const IntType i1, const IntType i2, const IntType i3, - const IntType i4, const IntType i5, const IntType i6, const IntType i7) { - static_assert(std::is_integral::value, - "count_valid_integers() must have integer arguments."); - - return (i0 != KOKKOS_INVALID_INDEX) + (i1 != KOKKOS_INVALID_INDEX) + - (i2 != KOKKOS_INVALID_INDEX) + (i3 != KOKKOS_INVALID_INDEX) + - (i4 != KOKKOS_INVALID_INDEX) + (i5 != KOKKOS_INVALID_INDEX) + - (i6 != KOKKOS_INVALID_INDEX) + (i7 != KOKKOS_INVALID_INDEX); -} - -// FIXME Ideally, we would not instantiate this function for every possible View -// type. We should be able to only pass "extent" when we use mdspan. -template -KOKKOS_INLINE_FUNCTION void runtime_check_rank( - const View&, const bool is_void_spec, const size_t i0, const size_t i1, - const size_t i2, const size_t i3, const size_t i4, const size_t i5, - const size_t i6, const size_t i7, const char* label) { - (void)(label); - - if (is_void_spec) { - const size_t num_passed_args = - count_valid_integers(i0, i1, i2, i3, i4, i5, i6, i7); - // We either allow to pass as many extents as the dynamic rank is, or - // as many extents as the total rank is. In the latter case, the given - // extents for the static dimensions must match the - // compile-time extents. - constexpr int rank = View::rank(); - constexpr int dyn_rank = View::rank_dynamic(); - const bool n_args_is_dyn_rank = num_passed_args == dyn_rank; - const bool n_args_is_rank = num_passed_args == rank; - - if constexpr (rank != dyn_rank) { - if (n_args_is_rank) { - size_t new_extents[8] = {i0, i1, i2, i3, i4, i5, i6, i7}; - for (int i = dyn_rank; i < rank; ++i) - if (new_extents[i] != View::static_extent(i)) { - KOKKOS_IF_ON_HOST( - const std::string message = - "The specified run-time extent for Kokkos::View '" + - std::string(label) + - "' does not match the compile-time extent in dimension " + - std::to_string(i) + ". The given extent is " + - std::to_string(new_extents[i]) + " but should be " + - std::to_string(View::static_extent(i)) + ".\n"; - Kokkos::abort(message.c_str());) - KOKKOS_IF_ON_DEVICE( - Kokkos::abort( - "The specified run-time extents for a Kokkos::View " - "do not match the compile-time extents.");) - } - } - } - - if (!n_args_is_dyn_rank && !n_args_is_rank) { - KOKKOS_IF_ON_HOST( - const std::string message = - "Constructor for Kokkos::View '" + std::string(label) + - "' has mismatched number of arguments. The number " - "of arguments = " + - std::to_string(num_passed_args) + - " neither matches the dynamic rank = " + - std::to_string(dyn_rank) + - " nor the total rank = " + std::to_string(rank) + "\n"; - Kokkos::abort(message.c_str());) - KOKKOS_IF_ON_DEVICE(Kokkos::abort("Constructor for Kokkos View has " - "mismatched number of arguments.");) - } - } -} - -} /* namespace Impl */ -} /* namespace Kokkos */ - -// Class to provide a uniform type -namespace Kokkos { -namespace Impl { -template -struct ViewUniformType; -} -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { - -/** \class ViewTraits - * \brief Traits class for accessing attributes of a View. - * - * This is an implementation detail of View. It is only of interest - * to developers implementing a new specialization of View. - * - * Template argument options: - * - View< DataType > - * - View< DataType , Space > - * - View< DataType , Space , MemoryTraits > - * - View< DataType , ArrayLayout > - * - View< DataType , ArrayLayout , Space > - * - View< DataType , ArrayLayout , MemoryTraits > - * - View< DataType , ArrayLayout , Space , MemoryTraits > - * - View< DataType , MemoryTraits > - */ - -template -struct ViewTraits; - -template <> -struct ViewTraits { - using execution_space = void; - using memory_space = void; - using HostMirrorSpace = void; - using array_layout = void; - using memory_traits = void; - using specialize = void; - using hooks_policy = void; -}; - -template -struct ViewTraits { - // Ignore an extraneous 'void' - using execution_space = typename ViewTraits::execution_space; - using memory_space = typename ViewTraits::memory_space; - using HostMirrorSpace = typename ViewTraits::HostMirrorSpace; - using array_layout = typename ViewTraits::array_layout; - using memory_traits = typename ViewTraits::memory_traits; - using specialize = typename ViewTraits::specialize; - using hooks_policy = typename ViewTraits::hooks_policy; -}; - -template -struct ViewTraits< - std::enable_if_t::value>, - HooksPolicy, Prop...> { - using execution_space = typename ViewTraits::execution_space; - using memory_space = typename ViewTraits::memory_space; - using HostMirrorSpace = typename ViewTraits::HostMirrorSpace; - using array_layout = typename ViewTraits::array_layout; - using memory_traits = typename ViewTraits::memory_traits; - using specialize = typename ViewTraits::specialize; - using hooks_policy = HooksPolicy; -}; - -template -struct ViewTraits::value>, - ArrayLayout, Prop...> { - // Specify layout, keep subsequent space and memory traits arguments - - using execution_space = typename ViewTraits::execution_space; - using memory_space = typename ViewTraits::memory_space; - using HostMirrorSpace = typename ViewTraits::HostMirrorSpace; - using array_layout = ArrayLayout; - using memory_traits = typename ViewTraits::memory_traits; - using specialize = typename ViewTraits::specialize; - using hooks_policy = typename ViewTraits::hooks_policy; -}; - -template -struct ViewTraits::value>, Space, - Prop...> { - // Specify Space, memory traits should be the only subsequent argument. - - static_assert( - std::is_same::execution_space, - void>::value && - std::is_same::memory_space, - void>::value && - std::is_same::HostMirrorSpace, - void>::value && - std::is_same::array_layout, - void>::value, - "Only one View Execution or Memory Space template argument"); - - using execution_space = typename Space::execution_space; - using memory_space = typename Space::memory_space; - using HostMirrorSpace = - typename Kokkos::Impl::HostMirror::Space::memory_space; - using array_layout = typename execution_space::array_layout; - using memory_traits = typename ViewTraits::memory_traits; - using specialize = typename ViewTraits::specialize; - using hooks_policy = typename ViewTraits::hooks_policy; -}; - -template -struct ViewTraits< - std::enable_if_t::value>, - MemoryTraits, Prop...> { - // Specify memory trait, should not be any subsequent arguments - - static_assert( - std::is_same::execution_space, - void>::value && - std::is_same::memory_space, - void>::value && - std::is_same::array_layout, - void>::value && - std::is_same::memory_traits, - void>::value && - std::is_same::hooks_policy, - void>::value, - "MemoryTrait is the final optional template argument for a View"); - - using execution_space = void; - using memory_space = void; - using HostMirrorSpace = void; - using array_layout = void; - using memory_traits = MemoryTraits; - using specialize = void; - using hooks_policy = void; -}; - -template -struct ViewTraits { - private: - // Unpack the properties arguments - using prop = ViewTraits; - - using ExecutionSpace = - std::conditional_t::value, - typename prop::execution_space, - Kokkos::DefaultExecutionSpace>; - - using MemorySpace = - std::conditional_t::value, - typename prop::memory_space, - typename ExecutionSpace::memory_space>; - - using ArrayLayout = - std::conditional_t::value, - typename prop::array_layout, - typename ExecutionSpace::array_layout>; - - using HostMirrorSpace = std::conditional_t< - !std::is_void::value, - typename prop::HostMirrorSpace, - typename Kokkos::Impl::HostMirror::Space>; - - using MemoryTraits = - std::conditional_t::value, - typename prop::memory_traits, - typename Kokkos::MemoryManaged>; - - using HooksPolicy = - std::conditional_t::value, - typename prop::hooks_policy, - Kokkos::Experimental::DefaultViewHooks>; - - // Analyze data type's properties, - // May be specialized based upon the layout and value type - using data_analysis = Kokkos::Impl::ViewDataAnalysis; - - public: - //------------------------------------ - // Data type traits: - - using data_type = typename data_analysis::type; - using const_data_type = typename data_analysis::const_type; - using non_const_data_type = typename data_analysis::non_const_type; - - //------------------------------------ - // Compatible array of trivial type traits: - - using scalar_array_type = typename data_analysis::scalar_array_type; - using const_scalar_array_type = - typename data_analysis::const_scalar_array_type; - using non_const_scalar_array_type = - typename data_analysis::non_const_scalar_array_type; - - //------------------------------------ - // Value type traits: - - using value_type = typename data_analysis::value_type; - using const_value_type = typename data_analysis::const_value_type; - using non_const_value_type = typename data_analysis::non_const_value_type; - - //------------------------------------ - // Mapping traits: - - using array_layout = ArrayLayout; - using dimension = typename data_analysis::dimension; - - using specialize = std::conditional_t< - std::is_void::value, - typename prop::specialize, - typename data_analysis::specialize>; /* mapping specialization tag */ - - static constexpr unsigned rank = dimension::rank; - static constexpr unsigned rank_dynamic = dimension::rank_dynamic; - - //------------------------------------ - // Execution space, memory space, memory access traits, and host mirror space. - - using execution_space = ExecutionSpace; - using memory_space = MemorySpace; - using device_type = Kokkos::Device; - using memory_traits = MemoryTraits; - using host_mirror_space = HostMirrorSpace; - using hooks_policy = HooksPolicy; - - using size_type = typename MemorySpace::size_type; - - enum { is_hostspace = std::is_same::value }; - enum { is_managed = MemoryTraits::is_unmanaged == 0 }; - enum { is_random_access = MemoryTraits::is_random_access == 1 }; - - //------------------------------------ -}; - -#ifdef KOKKOS_ENABLE_IMPL_MDSPAN -namespace Impl { -struct UnsupportedKokkosArrayLayout; - -template -struct MDSpanViewTraits { - using mdspan_type = UnsupportedKokkosArrayLayout; -}; - -// "Natural" mdspan for a view if the View's ArrayLayout is supported. -template -struct MDSpanViewTraits::type>> { - using index_type = std::size_t; - using extents_type = - typename Impl::ExtentsFromDataType::type; - using mdspan_layout_type = - typename Impl::LayoutFromArrayLayout::type; - using accessor_type = Impl::SpaceAwareAccessor< - typename Traits::memory_space, - Kokkos::default_accessor>; - using mdspan_type = mdspan; -}; -} // namespace Impl -#endif // KOKKOS_ENABLE_IMPL_MDSPAN - -/** \class View - * \brief View to an array of data. - * - * A View represents an array of one or more dimensions. - * For details, please refer to Kokkos' tutorial materials. - * - * \section Kokkos_View_TemplateParameters Template parameters - * - * This class has both required and optional template parameters. The - * \c DataType parameter must always be provided, and must always be - * first. The parameters \c Arg1Type, \c Arg2Type, and \c Arg3Type are - * placeholders for different template parameters. The default value - * of the fifth template parameter \c Specialize suffices for most use - * cases. When explaining the template parameters, we won't refer to - * \c Arg1Type, \c Arg2Type, and \c Arg3Type; instead, we will refer - * to the valid categories of template parameters, in whatever order - * they may occur. - * - * Valid ways in which template arguments may be specified: - * - View< DataType > - * - View< DataType , Layout > - * - View< DataType , Layout , Space > - * - View< DataType , Layout , Space , MemoryTraits > - * - View< DataType , Space > - * - View< DataType , Space , MemoryTraits > - * - View< DataType , MemoryTraits > - * - * \tparam DataType (required) This indicates both the type of each - * entry of the array, and the combination of compile-time and - * run-time array dimension(s). For example, double* - * indicates a one-dimensional array of \c double with run-time - * dimension, and int*[3] a two-dimensional array of \c int - * with run-time first dimension and compile-time second dimension - * (of 3). In general, the run-time dimensions (if any) must go - * first, followed by zero or more compile-time dimensions. For - * more examples, please refer to the tutorial materials. - * - * \tparam Space (required) The memory space. - * - * \tparam Layout (optional) The array's layout in memory. For - * example, LayoutLeft indicates a column-major (Fortran style) - * layout, and LayoutRight a row-major (C style) layout. If not - * specified, this defaults to the preferred layout for the - * Space. - * - * \tparam MemoryTraits (optional) Assertion of the user's intended - * access behavior. For example, RandomAccess indicates read-only - * access with limited spatial locality, and Unmanaged lets users - * wrap externally allocated memory in a View without automatic - * deallocation. - * - * \section Kokkos_View_MT MemoryTraits discussion - * - * \subsection Kokkos_View_MT_Interp MemoryTraits interpretation depends on - * Space - * - * Some \c MemoryTraits options may have different interpretations for - * different \c Space types. For example, with the Cuda device, - * \c RandomAccess tells Kokkos to fetch the data through the texture - * cache, whereas the non-GPU devices have no such hardware construct. - * - * \subsection Kokkos_View_MT_PrefUse Preferred use of MemoryTraits - * - * Users should defer applying the optional \c MemoryTraits parameter - * until the point at which they actually plan to rely on it in a - * computational kernel. This minimizes the number of template - * parameters exposed in their code, which reduces the cost of - * compilation. Users may always assign a View without specified - * \c MemoryTraits to a compatible View with that specification. - * For example: - * \code - * // Pass in the simplest types of View possible. - * void - * doSomething (View out, - * View in) - * { - * // Assign the "generic" View in to a RandomAccess View in_rr. - * // Note that RandomAccess View objects must have const data. - * View in_rr = in; - * // ... do something with in_rr and out ... - * } - * \endcode - */ - -} // namespace Kokkos - -namespace Kokkos { - -template -struct is_always_assignable_impl; - -template -struct is_always_assignable_impl, - Kokkos::View> { - using mapping_type = Kokkos::Impl::ViewMapping< - typename Kokkos::View::traits, - typename Kokkos::View::traits, - typename Kokkos::View::traits::specialize>; - - constexpr static bool value = - mapping_type::is_assignable && - static_cast(Kokkos::View::rank_dynamic) >= - static_cast(Kokkos::View::rank_dynamic); -}; - -template -using is_always_assignable = is_always_assignable_impl< - std::remove_reference_t, - std::remove_const_t>>; - -template -inline constexpr bool is_always_assignable_v = - is_always_assignable::value; - -template -constexpr bool is_assignable(const Kokkos::View& dst, - const Kokkos::View& src) { - using DstTraits = typename Kokkos::View::traits; - using SrcTraits = typename Kokkos::View::traits; - using mapping_type = - Kokkos::Impl::ViewMapping; - - return is_always_assignable_v, - Kokkos::View> || - (mapping_type::is_assignable && - ((DstTraits::dimension::rank_dynamic >= 1) || - (dst.static_extent(0) == src.extent(0))) && - ((DstTraits::dimension::rank_dynamic >= 2) || - (dst.static_extent(1) == src.extent(1))) && - ((DstTraits::dimension::rank_dynamic >= 3) || - (dst.static_extent(2) == src.extent(2))) && - ((DstTraits::dimension::rank_dynamic >= 4) || - (dst.static_extent(3) == src.extent(3))) && - ((DstTraits::dimension::rank_dynamic >= 5) || - (dst.static_extent(4) == src.extent(4))) && - ((DstTraits::dimension::rank_dynamic >= 6) || - (dst.static_extent(5) == src.extent(5))) && - ((DstTraits::dimension::rank_dynamic >= 7) || - (dst.static_extent(6) == src.extent(6))) && - ((DstTraits::dimension::rank_dynamic >= 8) || - (dst.static_extent(7) == src.extent(7)))); -} - -} /* namespace Kokkos */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#include - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { - -// FIXME_OPENMPTARGET - The `declare target` is needed for the Intel GPUs with -// the OpenMPTarget backend -#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_COMPILER_INTEL_LLVM) -#pragma omp declare target +#if defined(KOKKOS_ENABLE_IMPL_MDSPAN) && !defined(KOKKOS_COMPILER_INTEL) +#include #endif -inline constexpr Kokkos::ALL_t ALL{}; - -#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_COMPILER_INTEL_LLVM) -#pragma omp end declare target -#endif - -inline constexpr Kokkos::Impl::SequentialHostInit_t SequentialHostInit{}; - -inline constexpr Kokkos::Impl::WithoutInitializing_t WithoutInitializing{}; - -inline constexpr Kokkos::Impl::AllowPadding_t AllowPadding{}; - -/** \brief Create View allocation parameter bundle from argument list. - * - * Valid argument list members are: - * 1) label as a "string" or std::string - * 2) memory space instance of the View::memory_space type - * 3) execution space instance compatible with the View::memory_space - * 4) Kokkos::WithoutInitializing to bypass initialization - * 4) Kokkos::AllowPadding to allow allocation to pad dimensions for memory - * alignment - */ -template -inline Impl::ViewCtorProp::type...> -view_alloc(Args const&... args) { - using return_type = - Impl::ViewCtorProp::type...>; - - static_assert(!return_type::has_pointer, - "Cannot give pointer-to-memory for view allocation"); - - return return_type(args...); -} - -template -KOKKOS_INLINE_FUNCTION - Impl::ViewCtorProp::type...> - view_wrap(Args const&... args) { - using return_type = - Impl::ViewCtorProp::type...>; - - static_assert(!return_type::has_memory_space && - !return_type::has_execution_space && - !return_type::has_label && return_type::has_pointer, - "Must only give pointer-to-memory for view wrapping"); - - return return_type(args...); -} - -} /* namespace Kokkos */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { - -template -class View; - -template -struct is_view : public std::false_type {}; - -template -struct is_view> : public std::true_type {}; - -template -struct is_view> : public std::true_type {}; - -template -inline constexpr bool is_view_v = is_view::value; - -template -class View : public ViewTraits { - private: - template - friend class View; - template - friend class Kokkos::Impl::ViewMapping; - - using view_tracker_type = Kokkos::Impl::ViewTracker; - - public: - using traits = ViewTraits; - - private: - using map_type = - Kokkos::Impl::ViewMapping; - template - friend struct Kokkos::Impl::ViewTracker; - using hooks_policy = typename traits::hooks_policy; - - view_tracker_type m_track; - map_type m_map; - - public: - //---------------------------------------- - /** \brief Compatible view of array of scalar types */ - using array_type = - View; - - /** \brief Compatible view of const data type */ - using const_type = - View; - - /** \brief Compatible view of non-const data type */ - using non_const_type = - View; - - /** \brief Compatible HostMirror view */ - using HostMirror = - View, - typename traits::hooks_policy>; - - /** \brief Compatible HostMirror view */ - using host_mirror_type = - View; - - /** \brief Unified types */ - using uniform_type = typename Impl::ViewUniformType::type; - using uniform_const_type = - typename Impl::ViewUniformType::const_type; - using uniform_runtime_type = - typename Impl::ViewUniformType::runtime_type; - using uniform_runtime_const_type = - typename Impl::ViewUniformType::runtime_const_type; - using uniform_nomemspace_type = - typename Impl::ViewUniformType::nomemspace_type; - using uniform_const_nomemspace_type = - typename Impl::ViewUniformType::const_nomemspace_type; - using uniform_runtime_nomemspace_type = - typename Impl::ViewUniformType::runtime_nomemspace_type; - using uniform_runtime_const_nomemspace_type = - typename Impl::ViewUniformType::runtime_const_nomemspace_type; - - //---------------------------------------- - // Domain rank and extents - - static constexpr Impl::integral_constant - rank = {}; - static constexpr Impl::integral_constant - rank_dynamic = {}; -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - enum {Rank KOKKOS_DEPRECATED_WITH_COMMENT("Use rank instead.") = - map_type::Rank}; -#endif - - template - KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t< - std::is_integral::value, size_t> - extent(const iType& r) const noexcept { - return m_map.extent(r); - } - - static KOKKOS_INLINE_FUNCTION constexpr size_t static_extent( - const unsigned r) noexcept { - return map_type::static_extent(r); - } - - template - KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t< - std::is_integral::value, int> - extent_int(const iType& r) const noexcept { - return static_cast(m_map.extent(r)); - } - - KOKKOS_INLINE_FUNCTION constexpr typename traits::array_layout layout() - const { - return m_map.layout(); - } - - //---------------------------------------- - /* Deprecate all 'dimension' functions in favor of - * ISO/C++ vocabulary 'extent'. - */ - - KOKKOS_INLINE_FUNCTION constexpr size_t size() const { - return m_map.dimension_0() * m_map.dimension_1() * m_map.dimension_2() * - m_map.dimension_3() * m_map.dimension_4() * m_map.dimension_5() * - m_map.dimension_6() * m_map.dimension_7(); - } - - KOKKOS_INLINE_FUNCTION constexpr size_t stride_0() const { - return m_map.stride_0(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_1() const { - return m_map.stride_1(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_2() const { - return m_map.stride_2(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_3() const { - return m_map.stride_3(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_4() const { - return m_map.stride_4(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_5() const { - return m_map.stride_5(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_6() const { - return m_map.stride_6(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_7() const { - return m_map.stride_7(); - } - - template - KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t< - std::is_integral::value, size_t> - stride(iType r) const { - return ( - r == 0 - ? m_map.stride_0() - : (r == 1 - ? m_map.stride_1() - : (r == 2 - ? m_map.stride_2() - : (r == 3 - ? m_map.stride_3() - : (r == 4 - ? m_map.stride_4() - : (r == 5 - ? m_map.stride_5() - : (r == 6 - ? m_map.stride_6() - : m_map.stride_7()))))))); - } - - template - KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { - m_map.stride(s); - } - - //---------------------------------------- - // Range span is the span which contains all members. - - using reference_type = typename map_type::reference_type; - using pointer_type = typename map_type::pointer_type; - - enum { - reference_type_is_lvalue_reference = - std::is_lvalue_reference::value - }; - - KOKKOS_INLINE_FUNCTION constexpr size_t span() const { return m_map.span(); } - KOKKOS_INLINE_FUNCTION bool span_is_contiguous() const { - return m_map.span_is_contiguous(); - } - KOKKOS_INLINE_FUNCTION constexpr bool is_allocated() const { - return m_map.data() != nullptr; - } - KOKKOS_INLINE_FUNCTION constexpr pointer_type data() const { - return m_map.data(); - } - - //---------------------------------------- - // Allow specializations to query their specialized map - - KOKKOS_INLINE_FUNCTION - const Kokkos::Impl::ViewMapping& - impl_map() const { - return m_map; - } - KOKKOS_INLINE_FUNCTION - const Kokkos::Impl::SharedAllocationTracker& impl_track() const { - return m_track.m_tracker; - } - //---------------------------------------- - - private: - static constexpr bool is_layout_left = - std::is_same::value; - - static constexpr bool is_layout_right = - std::is_same::value; - - static constexpr bool is_layout_stride = - std::is_same::value; - - static constexpr bool is_default_map = - std::is_void::value && - (is_layout_left || is_layout_right || is_layout_stride); - -#if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK) - -#define KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(...) \ - Kokkos::Impl::runtime_check_memory_access_violation< \ - typename traits::memory_space>( \ - "Kokkos::View ERROR: attempt to access inaccessible memory space", \ - __VA_ARGS__); \ - Kokkos::Impl::view_verify_operator_bounds( \ - __VA_ARGS__); - -#else - -#define KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(...) \ - Kokkos::Impl::runtime_check_memory_access_violation< \ - typename traits::memory_space>( \ - "Kokkos::View ERROR: attempt to access inaccessible memory space", \ - __VA_ARGS__); - -#endif - - template - static KOKKOS_FUNCTION void check_access_member_function_valid_args(Is...) { - static_assert(rank <= sizeof...(Is)); - static_assert(sizeof...(Is) <= 8); - static_assert(Kokkos::Impl::are_integral::value); - } - - template - static KOKKOS_FUNCTION void check_operator_parens_valid_args(Is...) { - static_assert(rank == sizeof...(Is)); - static_assert(Kokkos::Impl::are_integral::value); - } - - public: - //------------------------------ - // Rank 1 default map operator() - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true::value && // - (1 == rank) && is_default_map && !is_layout_stride), - reference_type> - operator()(I0 i0) const { - check_operator_parens_valid_args(i0); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0) - return m_map.m_impl_handle[i0]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true::value && // - (1 == rank) && is_default_map && is_layout_stride), - reference_type> - operator()(I0 i0) const { - check_operator_parens_valid_args(i0); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0) - return m_map.m_impl_handle[m_map.m_impl_offset.m_stride.S0 * i0]; - } - - //------------------------------ - // Rank 1 operator[] - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - ((1 == rank) && Kokkos::Impl::are_integral::value && !is_default_map), - reference_type> - operator[](I0 i0) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0) - return m_map.reference(i0); - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<((1 == rank) && Kokkos::Impl::are_integral::value && - is_default_map && !is_layout_stride), - reference_type> - operator[](I0 i0) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0) - return m_map.m_impl_handle[i0]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<((1 == rank) && Kokkos::Impl::are_integral::value && - is_default_map && is_layout_stride), - reference_type> - operator[](I0 i0) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0) - return m_map.m_impl_handle[m_map.m_impl_offset.m_stride.S0 * i0]; - } - - //------------------------------ - // Rank 2 default map operator() - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && // - (2 == rank) && is_default_map && - (is_layout_left || is_layout_right || is_layout_stride)), - reference_type> - operator()(I0 i0, I1 i1) const { - check_operator_parens_valid_args(i0, i1); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1) - if constexpr (is_layout_left) { - if constexpr (rank_dynamic == 0) - return m_map.m_impl_handle[i0 + m_map.m_impl_offset.m_dim.N0 * i1]; - else - return m_map.m_impl_handle[i0 + m_map.m_impl_offset.m_stride * i1]; - } else if constexpr (is_layout_right) { - if constexpr (rank_dynamic == 0) - return m_map.m_impl_handle[i1 + m_map.m_impl_offset.m_dim.N1 * i0]; - else - return m_map.m_impl_handle[i1 + m_map.m_impl_offset.m_stride * i0]; - } else { - static_assert(is_layout_stride); - return m_map.m_impl_handle[i0 * m_map.m_impl_offset.m_stride.S0 + - i1 * m_map.m_impl_offset.m_stride.S1]; - } -#if defined KOKKOS_COMPILER_INTEL - __builtin_unreachable(); -#endif - } - - // Rank 0 -> 8 operator() except for rank-1 and rank-2 with default map which - // have "inlined" versions above - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && // - (2 != rank) && (1 != rank) && (0 != rank) && is_default_map), - reference_type> - operator()(Is... indices) const { - check_operator_parens_valid_args(indices...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, indices...) - return m_map.m_impl_handle[m_map.m_impl_offset(indices...)]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true::value && // - ((0 == rank) || !is_default_map)), - reference_type> - operator()(Is... indices) const { - check_operator_parens_valid_args(indices...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, indices...) - return m_map.reference(indices...); - } - - //------------------------------ - // Rank 0 - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && (0 == rank)), reference_type> - access(Is... extra) const { - check_access_member_function_valid_args(extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, extra...) - return m_map.reference(); - } - - //------------------------------ - // Rank 1 - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true::value && - (1 == rank) && !is_default_map), - reference_type> - access(I0 i0, Is... extra) const { - check_access_member_function_valid_args(i0, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, extra...) - return m_map.reference(i0); - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true::value && - (1 == rank) && is_default_map && !is_layout_stride), - reference_type> - access(I0 i0, Is... extra) const { - check_access_member_function_valid_args(i0, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, extra...) - return m_map.m_impl_handle[i0]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true::value && - (1 == rank) && is_default_map && is_layout_stride), - reference_type> - access(I0 i0, Is... extra) const { - check_access_member_function_valid_args(i0, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, extra...) - return m_map.m_impl_handle[m_map.m_impl_offset.m_stride.S0 * i0]; - } - - //------------------------------ - // Rank 2 - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true::value && - (2 == rank) && !is_default_map), - reference_type> - access(I0 i0, I1 i1, Is... extra) const { - check_access_member_function_valid_args(i0, i1, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, extra...) - return m_map.reference(i0, i1); - } - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && (2 == rank) && - is_default_map && - (is_layout_left || is_layout_right || is_layout_stride)), - reference_type> - access(I0 i0, I1 i1, Is... extra) const { - check_access_member_function_valid_args(i0, i1, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, extra...) - if constexpr (is_layout_left) { - if constexpr (rank_dynamic == 0) - return m_map.m_impl_handle[i0 + m_map.m_impl_offset.m_dim.N0 * i1]; - else - return m_map.m_impl_handle[i0 + m_map.m_impl_offset.m_stride * i1]; - } else if constexpr (is_layout_right) { - if constexpr (rank_dynamic == 0) - return m_map.m_impl_handle[i1 + m_map.m_impl_offset.m_dim.N1 * i0]; - else - return m_map.m_impl_handle[i1 + m_map.m_impl_offset.m_stride * i0]; - } else { - static_assert(is_layout_stride); - return m_map.m_impl_handle[i0 * m_map.m_impl_offset.m_stride.S0 + - i1 * m_map.m_impl_offset.m_stride.S1]; - } -#if defined KOKKOS_COMPILER_INTEL - __builtin_unreachable(); -#endif - } - - //------------------------------ - // Rank 3 - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true::value && - (3 == rank) && is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, Is... extra) const { - check_access_member_function_valid_args(i0, i1, i2, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, extra...) - return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2)]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true::value && - (3 == rank) && !is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, Is... extra) const { - check_access_member_function_valid_args(i0, i1, i2, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, extra...) - return m_map.reference(i0, i1, i2); - } - - //------------------------------ - // Rank 4 - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && (4 == rank) && - is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, I3 i3, Is... extra) const { - check_access_member_function_valid_args(i0, i1, i2, i3, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, extra...) - return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3)]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && (4 == rank) && - !is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, I3 i3, Is... extra) const { - check_access_member_function_valid_args(i0, i1, i2, i3, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, extra...) - return m_map.reference(i0, i1, i2, i3); - } - - //------------------------------ - // Rank 5 - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && - (5 == rank) && is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, Is... extra) const { - check_access_member_function_valid_args(i0, i1, i2, i3, i4, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, - extra...) - return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3, i4)]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && - (5 == rank) && !is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, Is... extra) const { - check_access_member_function_valid_args(i0, i1, i2, i3, i4, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, - extra...) - return m_map.reference(i0, i1, i2, i3, i4); - } - - //------------------------------ - // Rank 6 - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && - (6 == rank) && is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, Is... extra) const { - check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, - extra...) - return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3, i4, i5)]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && - (6 == rank) && !is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, Is... extra) const { - check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, - extra...) - return m_map.reference(i0, i1, i2, i3, i4, i5); - } - - //------------------------------ - // Rank 7 - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && - (7 == rank) && is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6, Is... extra) const { - check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, i6, - extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, i6, - extra...) - return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3, i4, i5, i6)]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && - (7 == rank) && !is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6, Is... extra) const { - check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, i6, - extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, i6, - extra...) - return m_map.reference(i0, i1, i2, i3, i4, i5, i6); - } - - //------------------------------ - // Rank 8 - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true::value && - (8 == rank) && is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6, I7 i7, - Is... extra) const { - check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, i6, i7, - extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, i6, - i7, extra...) - return m_map - .m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3, i4, i5, i6, i7)]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true::value && - (8 == rank) && !is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6, I7 i7, - Is... extra) const { - check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, i6, i7, - extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, i6, - i7, extra...) - return m_map.reference(i0, i1, i2, i3, i4, i5, i6, i7); - } - -#undef KOKKOS_IMPL_VIEW_OPERATOR_VERIFY - - //---------------------------------------- - // Standard destructor, constructors, and assignment operators - - KOKKOS_DEFAULTED_FUNCTION - ~View() = default; - - KOKKOS_DEFAULTED_FUNCTION - View() = default; - - KOKKOS_FUNCTION - View(const View& other) : m_track(other.m_track), m_map(other.m_map) { - KOKKOS_IF_ON_HOST((hooks_policy::copy_construct(*this, other);)) - } - - KOKKOS_FUNCTION - View(View&& other) - : m_track{std::move(other.m_track)}, m_map{std::move(other.m_map)} { - KOKKOS_IF_ON_HOST((hooks_policy::move_construct(*this, other);)) - } - - KOKKOS_FUNCTION - View& operator=(const View& other) { - m_map = other.m_map; - m_track = other.m_track; - - KOKKOS_IF_ON_HOST((hooks_policy::copy_assign(*this, other);)) - - return *this; - } - - KOKKOS_FUNCTION - View& operator=(View&& other) { - m_map = std::move(other.m_map); - m_track = std::move(other.m_track); - - KOKKOS_IF_ON_HOST((hooks_policy::move_assign(*this, other);)) - - return *this; - } - - //---------------------------------------- - // Compatible view copy constructor and assignment - // may assign unmanaged from managed. - - template - KOKKOS_INLINE_FUNCTION View( - const View& rhs, - std::enable_if_t::traits, - typename traits::specialize>::is_assignable_data_type>* = nullptr) - : m_track(rhs), m_map() { - using SrcTraits = typename View::traits; - using Mapping = Kokkos::Impl::ViewMapping; - static_assert(Mapping::is_assignable, - "Incompatible View copy construction"); - Mapping::assign(m_map, rhs.m_map, rhs.m_track.m_tracker); - } - - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - Kokkos::Impl::ViewMapping< - traits, typename View::traits, - typename traits::specialize>::is_assignable_data_type, - View>& - operator=(const View& rhs) { - using SrcTraits = typename View::traits; - using Mapping = Kokkos::Impl::ViewMapping; - static_assert(Mapping::is_assignable, "Incompatible View copy assignment"); - Mapping::assign(m_map, rhs.m_map, rhs.m_track.m_tracker); - m_track.assign(rhs); - return *this; - } - - //---------------------------------------- - // Compatible subview constructor - // may assign unmanaged from managed. - - template - KOKKOS_INLINE_FUNCTION View(const View& src_view, const Arg0 arg0, - Args... args) - : m_track(src_view), m_map() { - using SrcType = View; - - using Mapping = Kokkos::Impl::ViewMapping; - - using DstType = typename Mapping::type; - - static_assert( - Kokkos::Impl::ViewMapping::is_assignable, - "Subview construction requires compatible view and subview arguments"); - - Mapping::assign(m_map, src_view.m_map, arg0, args...); - } - - //---------------------------------------- - // Allocation tracking properties - - KOKKOS_INLINE_FUNCTION - int use_count() const { return m_track.m_tracker.use_count(); } - - inline const std::string label() const { - return m_track.m_tracker - .template get_label(); - } - - public: - //---------------------------------------- - // Allocation according to allocation properties and array layout - - template - explicit inline View( - const Impl::ViewCtorProp& arg_prop, - std::enable_if_t::has_pointer, - typename traits::array_layout> const& arg_layout) - : m_track(), m_map() { - // Copy the input allocation properties with possibly defaulted properties - // We need to split it in two to avoid MSVC compiler errors - auto prop_copy_tmp = - Impl::with_properties_if_unset(arg_prop, std::string{}); - auto prop_copy = Impl::with_properties_if_unset( - prop_copy_tmp, typename traits::device_type::memory_space{}, - typename traits::device_type::execution_space{}); - using alloc_prop = decltype(prop_copy); - - static_assert(traits::is_managed, - "View allocation constructor requires managed memory"); - - if (alloc_prop::initialize && - !alloc_prop::execution_space::impl_is_initialized()) { - // If initializing view data then - // the execution space must be initialized. - Kokkos::Impl::throw_runtime_exception( - "Constructing View and initializing data with uninitialized " - "execution space"); - } - -#ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK - if constexpr (std::is_same_v || - std::is_same_v || - std::is_same_v) { - size_t i0 = arg_layout.dimension[0]; - size_t i1 = arg_layout.dimension[1]; - size_t i2 = arg_layout.dimension[2]; - size_t i3 = arg_layout.dimension[3]; - size_t i4 = arg_layout.dimension[4]; - size_t i5 = arg_layout.dimension[5]; - size_t i6 = arg_layout.dimension[6]; - size_t i7 = arg_layout.dimension[7]; - - const std::string& alloc_name = - Impl::get_property(prop_copy); - Impl::runtime_check_rank( - *this, std::is_same::value, i0, i1, - i2, i3, i4, i5, i6, i7, alloc_name.c_str()); - } -#endif - - Kokkos::Impl::SharedAllocationRecord<>* record = m_map.allocate_shared( - prop_copy, arg_layout, Impl::ViewCtorProp::has_execution_space); - - // Setup and initialization complete, start tracking - m_track.m_tracker.assign_allocated_record_to_uninitialized(record); - } - - KOKKOS_INLINE_FUNCTION - void assign_data(pointer_type arg_data) { - m_track.m_tracker.clear(); - m_map.assign_data(arg_data); - } - - // Wrap memory according to properties and array layout - template - explicit KOKKOS_INLINE_FUNCTION View( - const Impl::ViewCtorProp& arg_prop, - std::enable_if_t::has_pointer, - typename traits::array_layout> const& arg_layout) - : m_track() // No memory tracking - , - m_map(arg_prop, arg_layout) { - static_assert( - std::is_same::pointer_type>::value, - "Constructing View to wrap user memory must supply matching pointer " - "type"); - -#ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK - if constexpr (std::is_same_v || - std::is_same_v || - std::is_same_v) { - size_t i0 = arg_layout.dimension[0]; - size_t i1 = arg_layout.dimension[1]; - size_t i2 = arg_layout.dimension[2]; - size_t i3 = arg_layout.dimension[3]; - size_t i4 = arg_layout.dimension[4]; - size_t i5 = arg_layout.dimension[5]; - size_t i6 = arg_layout.dimension[6]; - size_t i7 = arg_layout.dimension[7]; - - Impl::runtime_check_rank( - *this, std::is_same::value, i0, i1, - i2, i3, i4, i5, i6, i7, "UNMANAGED"); - } -#endif - } - - // Simple dimension-only layout - template - explicit inline View( - const Impl::ViewCtorProp& arg_prop, - std::enable_if_t::has_pointer, size_t> const - arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) - : View(arg_prop, - typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, - arg_N4, arg_N5, arg_N6, arg_N7)) { - static_assert(traits::array_layout::is_extent_constructible, - "Layout is not constructible from extent arguments. Use " - "overload taking a layout object instead."); - } - - template - explicit KOKKOS_INLINE_FUNCTION View( - const Impl::ViewCtorProp& arg_prop, - std::enable_if_t::has_pointer, size_t> const - arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) - : View(arg_prop, - typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, - arg_N4, arg_N5, arg_N6, arg_N7)) { - static_assert(traits::array_layout::is_extent_constructible, - "Layout is not constructible from extent arguments. Use " - "overload taking a layout object instead."); - } - - // Allocate with label and layout - template - explicit inline View( - const Label& arg_label, - std::enable_if_t::value, - typename traits::array_layout> const& arg_layout) - : View(Impl::ViewCtorProp(arg_label), arg_layout) {} - - // Allocate label and layout, must disambiguate from subview constructor. - template - explicit inline View( - const Label& arg_label, - std::enable_if_t::value, const size_t> - arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) - : View(Impl::ViewCtorProp(arg_label), - typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, - arg_N4, arg_N5, arg_N6, arg_N7)) { - static_assert(traits::array_layout::is_extent_constructible, - "Layout is not constructible from extent arguments. Use " - "overload taking a layout object instead."); - } - - // Construct view from ViewTracker and map - // This should be the preferred method because future extensions may need to - // use the ViewTracker class. - template - KOKKOS_INLINE_FUNCTION View( - const view_tracker_type& track, - const Kokkos::Impl::ViewMapping& map) - : m_track(track), m_map() { - using Mapping = - Kokkos::Impl::ViewMapping; - static_assert(Mapping::is_assignable, - "Incompatible View copy construction"); - Mapping::assign(m_map, map, track.m_tracker); - } - - // Construct View from internal shared allocation tracker object and map - // This is here for backwards compatibility for classes that derive from - // Kokkos::View - template - KOKKOS_INLINE_FUNCTION View( - const typename view_tracker_type::track_type& track, - const Kokkos::Impl::ViewMapping& map) - : m_track(track), m_map() { - using Mapping = - Kokkos::Impl::ViewMapping; - static_assert(Mapping::is_assignable, - "Incompatible View copy construction"); - Mapping::assign(m_map, map, track); - } - - //---------------------------------------- - // Memory span required to wrap these dimensions. - static constexpr size_t required_allocation_size( - typename traits::array_layout const& layout) { - return map_type::memory_span(layout); - } - - static constexpr size_t required_allocation_size( - const size_t arg_N0 = 0, const size_t arg_N1 = 0, const size_t arg_N2 = 0, - const size_t arg_N3 = 0, const size_t arg_N4 = 0, const size_t arg_N5 = 0, - const size_t arg_N6 = 0, const size_t arg_N7 = 0) { - static_assert(traits::array_layout::is_extent_constructible, - "Layout is not constructible from extent arguments. Use " - "overload taking a layout object instead."); - return map_type::memory_span(typename traits::array_layout( - arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7)); - } - - explicit KOKKOS_INLINE_FUNCTION View( - pointer_type arg_ptr, const size_t arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) - : View(Impl::ViewCtorProp(arg_ptr), - typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, - arg_N4, arg_N5, arg_N6, arg_N7)) { - static_assert(traits::array_layout::is_extent_constructible, - "Layout is not constructible from extent arguments. Use " - "overload taking a layout object instead."); - } - - explicit KOKKOS_INLINE_FUNCTION View( - pointer_type arg_ptr, const typename traits::array_layout& arg_layout) - : View(Impl::ViewCtorProp(arg_ptr), arg_layout) {} - - //---------------------------------------- - // Shared scratch memory constructor - - static KOKKOS_INLINE_FUNCTION size_t - shmem_size(const size_t arg_N0 = KOKKOS_INVALID_INDEX, - const size_t arg_N1 = KOKKOS_INVALID_INDEX, - const size_t arg_N2 = KOKKOS_INVALID_INDEX, - const size_t arg_N3 = KOKKOS_INVALID_INDEX, - const size_t arg_N4 = KOKKOS_INVALID_INDEX, - const size_t arg_N5 = KOKKOS_INVALID_INDEX, - const size_t arg_N6 = KOKKOS_INVALID_INDEX, - const size_t arg_N7 = KOKKOS_INVALID_INDEX) { - static_assert(traits::array_layout::is_extent_constructible, - "Layout is not constructible from extent arguments. Use " - "overload taking a layout object instead."); - const size_t num_passed_args = Impl::count_valid_integers( - arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7); - - if (std::is_void::value && - num_passed_args != rank_dynamic) { - Kokkos::abort( - "Kokkos::View::shmem_size() rank_dynamic != number of arguments.\n"); - } - - return View::shmem_size(typename traits::array_layout( - arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7)); - } - - private: - // Want to be able to align to minimum scratch alignment or sizeof or alignof - // elements - static constexpr size_t scratch_value_alignment = - max({sizeof(typename traits::value_type), - alignof(typename traits::value_type), - static_cast( - traits::execution_space::scratch_memory_space::ALIGN)}); - - public: - static KOKKOS_INLINE_FUNCTION size_t - shmem_size(typename traits::array_layout const& arg_layout) { - return map_type::memory_span(arg_layout) + scratch_value_alignment; - } - - explicit KOKKOS_INLINE_FUNCTION View( - const typename traits::execution_space::scratch_memory_space& arg_space, - const typename traits::array_layout& arg_layout) - : View(Impl::ViewCtorProp(reinterpret_cast( - arg_space.get_shmem_aligned(map_type::memory_span(arg_layout), - scratch_value_alignment))), - arg_layout) {} - - explicit KOKKOS_INLINE_FUNCTION View( - const typename traits::execution_space::scratch_memory_space& arg_space, - const size_t arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) - : View(Impl::ViewCtorProp( - reinterpret_cast(arg_space.get_shmem_aligned( - map_type::memory_span(typename traits::array_layout( - arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, - arg_N7)), - scratch_value_alignment))), - typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, - arg_N4, arg_N5, arg_N6, arg_N7)) { - static_assert(traits::array_layout::is_extent_constructible, - "Layout is not constructible from extent arguments. Use " - "overload taking a layout object instead."); - } - - //---------------------------------------- - // MDSpan converting constructors -#ifdef KOKKOS_ENABLE_IMPL_MDSPAN - template ::mdspan_type> - KOKKOS_INLINE_FUNCTION -#ifndef KOKKOS_ENABLE_CXX17 - explicit(traits::is_managed) -#endif - View(const typename Impl::MDSpanViewTraits::mdspan_type& mds, - std::enable_if_t< - !std::is_same_v>* = - nullptr) - : View(mds.data_handle(), - Impl::array_layout_from_mapping< - typename traits::array_layout, - typename Impl::MDSpanViewTraits::mdspan_type>( - mds.mapping())) { - } - - template - KOKKOS_INLINE_FUNCTION -#ifndef KOKKOS_ENABLE_CXX17 - explicit(!std::is_convertible_v< - Kokkos::mdspan, - typename Impl::MDSpanViewTraits::mdspan_type>) -#endif - View(const Kokkos::mdspan& mds) - : View(typename Impl::MDSpanViewTraits::mdspan_type(mds)) { - } - - //---------------------------------------- - // Conversion to MDSpan - template ::mdspan_type, - typename = std::enable_if_t, - std::false_type, - std::is_assignable, - ImplNaturalMDSpanType>>::value>> - KOKKOS_INLINE_FUNCTION constexpr operator mdspan< - OtherElementType, OtherExtents, OtherLayoutPolicy, OtherAccessor>() { - using mdspan_type = typename Impl::MDSpanViewTraits::mdspan_type; - return mdspan_type{data(), - Impl::mapping_from_view_mapping(m_map)}; - } - - template >, - typename = std::enable_if_t>> - KOKKOS_INLINE_FUNCTION constexpr auto to_mdspan( - const OtherAccessorType& other_accessor = - typename Impl::MDSpanViewTraits::accessor_type()) { - using mdspan_type = typename Impl::MDSpanViewTraits::mdspan_type; - using ret_mdspan_type = - mdspan; - return ret_mdspan_type{data(), - Impl::mapping_from_view_mapping(m_map), - other_accessor}; - } -#endif // KOKKOS_ENABLE_IMPL_MDSPAN -}; - -template -KOKKOS_INLINE_FUNCTION constexpr unsigned rank(const View&) { - return View::rank(); -} - -namespace Impl { - -template -struct RankDataType { - using type = typename RankDataType::type*; -}; - -template -struct RankDataType { - using type = ValueType; -}; - -template -KOKKOS_FUNCTION std::enable_if_t< - N == View::rank() && - std::is_same::specialize, void>::value, - View> -as_view_of_rank_n(View v) { - return v; -} - -// Placeholder implementation to compile generic code for DynRankView; should -// never be called -template -KOKKOS_FUNCTION std::enable_if_t< - N != View::rank() && - std::is_same::specialize, void>::value, - View::value_type, N>::type, - Args...>> -as_view_of_rank_n(View) { - Kokkos::abort("Trying to get at a View of the wrong rank"); - return {}; -} - -template -void apply_to_view_of_static_rank(Function&& f, View a) { - f(a); -} - -} // namespace Impl -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Impl { -template -struct TypeListToViewTraits; - -template -struct TypeListToViewTraits> { - using type = ViewTraits; -}; - -// It is not safe to assume that subviews of views with the Aligned memory trait -// are also aligned. Hence, just remove that attribute for subviews. -template -struct RemoveAlignedMemoryTrait { - private: - using type_list_in = Kokkos::Impl::type_list; - using memory_traits = typename ViewTraits::memory_traits; - using type_list_in_wo_memory_traits = - typename Kokkos::Impl::type_list_remove_first::type; - using new_memory_traits = - Kokkos::MemoryTraits; - using new_type_list = typename Kokkos::Impl::concat_type_list< - type_list_in_wo_memory_traits, - Kokkos::Impl::type_list>::type; - - public: - using type = typename TypeListToViewTraits::type; -}; -} // namespace Impl - -template -KOKKOS_INLINE_FUNCTION auto subview(const View& src, Args... args) { - static_assert(View::rank == sizeof...(Args), - "subview requires one argument for each source View rank"); - - return typename Kokkos::Impl::ViewMapping< - void /* deduce subview type from source view traits */ - , - typename Impl::RemoveAlignedMemoryTrait::type, - Args...>::type(src, args...); -} - -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 -template -KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION auto subview(const View& src, - Args... args) { - static_assert(View::rank == sizeof...(Args), - "subview requires one argument for each source View rank"); - static_assert(Kokkos::is_memory_traits::value); - - return typename Kokkos::Impl::ViewMapping< - void /* deduce subview type from source view traits */ - , - typename Impl::RemoveAlignedMemoryTrait::type, - Args...>::type(src, args...); -} -#endif - -template -using Subview = decltype(subview(std::declval(), std::declval()...)); - -} /* namespace Kokkos */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { - -template -KOKKOS_INLINE_FUNCTION bool operator==(const View& lhs, - const View& rhs) { - // Same data, layout, dimensions - using lhs_traits = ViewTraits; - using rhs_traits = ViewTraits; - - return std::is_same::value && - std::is_same::value && - std::is_same::value && - View::rank() == View::rank() && - lhs.data() == rhs.data() && lhs.span() == rhs.span() && - lhs.extent(0) == rhs.extent(0) && lhs.extent(1) == rhs.extent(1) && - lhs.extent(2) == rhs.extent(2) && lhs.extent(3) == rhs.extent(3) && - lhs.extent(4) == rhs.extent(4) && lhs.extent(5) == rhs.extent(5) && - lhs.extent(6) == rhs.extent(6) && lhs.extent(7) == rhs.extent(7); -} - -template -KOKKOS_INLINE_FUNCTION bool operator!=(const View& lhs, - const View& rhs) { - return !(operator==(lhs, rhs)); -} - -} /* namespace Kokkos */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -template -struct CommonViewValueType; - -template -struct CommonViewValueType { - using value_type = std::common_type_t; -}; - -template -struct CommonViewAllocProp; - -template -struct CommonViewAllocProp { - using value_type = ValueType; - using scalar_array_type = ValueType; - - template - KOKKOS_INLINE_FUNCTION CommonViewAllocProp(const Views&...) {} -}; - -template -struct DeduceCommonViewAllocProp; - -// Base case must provide types for: -// 1. specialize 2. value_type 3. is_view 4. prop_type -template -struct DeduceCommonViewAllocProp { - using specialize = typename FirstView::traits::specialize; - - using value_type = typename FirstView::traits::value_type; - - enum : bool { is_view = is_view::value }; - - using prop_type = CommonViewAllocProp; -}; - -template -struct DeduceCommonViewAllocProp { - using NextTraits = DeduceCommonViewAllocProp; - - using first_specialize = typename FirstView::traits::specialize; - using first_value_type = typename FirstView::traits::value_type; - - enum : bool { first_is_view = is_view::value }; - - using next_specialize = typename NextTraits::specialize; - using next_value_type = typename NextTraits::value_type; - - enum : bool { next_is_view = NextTraits::is_view }; - - // common types - - // determine specialize type - // if first and next specialize differ, but are not the same specialize, error - // out - static_assert(!(!std::is_same::value && - !std::is_void::value && - !std::is_void::value), - "Kokkos DeduceCommonViewAllocProp ERROR: Only one non-void " - "specialize trait allowed"); - - // otherwise choose non-void specialize if either/both are non-void - using specialize = std::conditional_t< - std::is_same::value, first_specialize, - std::conditional_t<(std::is_void::value && - !std::is_void::value), - next_specialize, first_specialize>>; - - using value_type = typename CommonViewValueType::value_type; - - enum : bool { is_view = (first_is_view && next_is_view) }; - - using prop_type = CommonViewAllocProp; -}; - -} // end namespace Impl - -template -using DeducedCommonPropsType = - typename Impl::DeduceCommonViewAllocProp::prop_type; - -// This function is required in certain scenarios where users customize -// Kokkos View internals. One example are dynamic length embedded ensemble -// types. The function is used to propagate necessary information -// (like the ensemble size) when creating new views. -// However, most of the time it is called with a single view. -// Furthermore, the propagated information is not just for view allocations. -// From what I can tell, the type of functionality provided by -// common_view_alloc_prop is the equivalent of propagating accessors in mdspan, -// a mechanism we will eventually use to replace this clunky approach here, when -// we are finally mdspan based. -// TODO: get rid of this when we have mdspan -template -KOKKOS_INLINE_FUNCTION DeducedCommonPropsType common_view_alloc_prop( - Views const&... views) { - return DeducedCommonPropsType(views...); -} - -} // namespace Kokkos - -#include -#include - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- +#include -#endif /* #ifndef KOKKOS_VIEW_HPP */ +#endif /* KOKKOS_VIEW_HPP */ diff --git a/packages/kokkos/core/src/Kokkos_WorkGraphPolicy.hpp b/packages/kokkos/core/src/Kokkos_WorkGraphPolicy.hpp index efa56a086e36..4d2263428154 100644 --- a/packages/kokkos/core/src/Kokkos_WorkGraphPolicy.hpp +++ b/packages/kokkos/core/src/Kokkos_WorkGraphPolicy.hpp @@ -120,7 +120,7 @@ class WorkGraphPolicy : public Kokkos::Impl::PolicyTraits { (std::int32_t)BEGIN_TOKEN))) { // Attempt to claim ready work index succeeded, // update the hint and return work index - atomic_increment(begin_hint); + atomic_inc(begin_hint); return w; } // arrive here when ready_queue[i] == BEGIN_TOKEN @@ -169,7 +169,7 @@ class WorkGraphPolicy : public Kokkos::Impl::PolicyTraits { void operator()(const TagCount, int i) const noexcept { std::int32_t* const count_queue = &m_queue[m_graph.numRows()]; - atomic_increment(count_queue + m_graph.entries[i]); + atomic_inc(count_queue + m_graph.entries[i]); } KOKKOS_INLINE_FUNCTION diff --git a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC.cpp b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC.cpp index 99daf379b6ff..37fcfb7a1d99 100644 --- a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC.cpp +++ b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC.cpp @@ -23,7 +23,19 @@ #include #include +#if defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) +#include +#elif defined(KOKKOS_ARCH_AMD_GPU) +// FIXME_OPENACC - hip_runtime_api.h contains two implementations: one for AMD +// GPUs and the other for NVIDIA GPUs; below macro is needed to choose AMD GPUs. +#define __HIP_PLATFORM_AMD__ +#include +#elif defined(KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE) +#include +#endif + #include +#include Kokkos::Experimental::OpenACC::OpenACC() : m_space_instance( @@ -46,6 +58,8 @@ Kokkos::Experimental::OpenACC::OpenACC(int async_arg) void Kokkos::Experimental::OpenACC::impl_initialize( InitializationSettings const& settings) { + Impl::OpenACCInternal::m_concurrency = + 256000; // FIXME_OPENACC - random guess when cannot compute if (Impl::OpenACC_Traits::may_fallback_to_host && acc_get_num_devices(Impl::OpenACC_Traits::dev_type) == 0 && !settings.has_device_id()) { @@ -59,11 +73,46 @@ void Kokkos::Experimental::OpenACC::impl_initialize( acc_get_device_num(acc_device_host); } else { using Kokkos::Impl::get_visible_devices; + acc_set_device_type(Impl::OpenACC_Traits::dev_type); std::vector const& visible_devices = get_visible_devices(); using Kokkos::Impl::get_gpu; int const dev_num = get_gpu(settings).value_or(visible_devices[0]); acc_set_device_num(dev_num, Impl::OpenACC_Traits::dev_type); Impl::OpenACCInternal::m_acc_device_num = dev_num; +#if defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) + cudaDeviceProp deviceProp; + cudaError error = cudaGetDeviceProperties(&deviceProp, dev_num); + if (error != cudaSuccess) { + std::ostringstream msg; + msg << "Error: During OpenACC backend initialization, failed to retrieve " + << "CUDA device properties: (" << cudaGetErrorName(error) + << "): " << cudaGetErrorString(error); + Kokkos::Impl::host_abort(msg.str().c_str()); + } + Impl::OpenACCInternal::m_concurrency = + deviceProp.maxThreadsPerMultiProcessor * deviceProp.multiProcessorCount; +#elif defined(KOKKOS_ARCH_AMD_GPU) + hipDeviceProp_t deviceProp; + hipError_t error = hipGetDeviceProperties(&deviceProp, dev_num); + if (error != hipSuccess) { + std::ostringstream msg; + msg << "Error: During OpenACC backend initialization, failed to retrieve " + << "HIP device properties: (" << hipGetErrorName(error) + << "): " << hipGetErrorString(error); + Kokkos::Impl::host_abort(msg.str().c_str()); + } + Impl::OpenACCInternal::m_concurrency = + deviceProp.maxThreadsPerMultiProcessor * deviceProp.multiProcessorCount; +#elif defined(KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE) + Impl::OpenACCInternal::m_concurrency = std::thread::hardware_concurrency(); + if (Impl::OpenACCInternal::m_concurrency == 0) { + Kokkos::Impl::host_abort( + "Error: During OpenACC backend initialization, failed to retrieve " + "CPU hardware concurrency"); + } +#else + // FIXME_OPENACC: Compute Impl::OpenACCInternal::m_concurrency correctly. +#endif } Impl::OpenACCInternal::singleton().initialize(); } @@ -86,6 +135,12 @@ void Kokkos::Experimental::OpenACC::print_configuration(std::ostream& os, os << "yes\n"; #else os << "no\n"; +#endif + os << " KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE: "; +#if defined(KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE) + os << "yes\n"; +#else + os << "no\n"; #endif m_space_instance->print_configuration(os, verbose); } diff --git a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC.hpp b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC.hpp index 5155bee33dc3..aee696bd34e6 100644 --- a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC.hpp +++ b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC.hpp @@ -42,6 +42,7 @@ static_assert(false, // LLVM/Clacc compiler does not need this. #ifndef KOKKOS_COMPILER_CLANG #define KOKKOS_ENABLE_OPENACC_COLLAPSE_HIERARCHICAL_CONSTRUCTS +#define KOKKOS_ENABLE_OPENACC_COLLAPSE_MDRANGE_LOOPS #endif namespace Kokkos::Experimental::Impl { @@ -87,9 +88,9 @@ class OpenACC { static char const* name() { return "OpenACC"; } #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - static int concurrency() { return 256000; } // FIXME_OPENACC + static int concurrency(); #else - int concurrency() const { return 256000; } // FIXME_OPENACC + int concurrency() const; #endif #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 KOKKOS_DEPRECATED static bool in_parallel() { diff --git a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACCSpace.hpp b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACCSpace.hpp index 4e7170cbbdf3..75cef98a8d91 100644 --- a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACCSpace.hpp +++ b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACCSpace.hpp @@ -85,16 +85,26 @@ class OpenACCSpace { template <> struct Kokkos::Impl::MemorySpaceAccess { +#if defined(KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE) + enum : bool{assignable = true}; + enum : bool{accessible = true}; +#else enum : bool { assignable = false }; enum : bool { accessible = false }; +#endif enum : bool { deepcopy = true }; }; template <> struct Kokkos::Impl::MemorySpaceAccess { +#if defined(KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE) + enum : bool{assignable = true}; + enum : bool{accessible = true}; +#else enum : bool { assignable = false }; enum : bool { accessible = false }; +#endif enum : bool { deepcopy = true }; }; diff --git a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_FunctorAdapter.hpp b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_FunctorAdapter.hpp index 82d38586eb8f..1373f8fa7a48 100644 --- a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_FunctorAdapter.hpp +++ b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_FunctorAdapter.hpp @@ -38,7 +38,7 @@ class FunctorAdapter; \ KOKKOS_IMPL_ACC_PRAGMA(routine CLAUSE) \ template \ - KOKKOS_FUNCTION void operator()(Args &&... args) const { \ + KOKKOS_FUNCTION void operator()(Args &&...args) const { \ if constexpr (std::is_void_v) { \ m_functor(static_cast(args)...); \ } else { \ diff --git a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.cpp b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.cpp index 10a76fbd3136..1dad499c1bec 100644 --- a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.cpp +++ b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.cpp @@ -27,6 +27,7 @@ // Arbitrary value to denote that we don't know yet what device to use. int Kokkos::Experimental::Impl::OpenACCInternal::m_acc_device_num = -1; +int Kokkos::Experimental::Impl::OpenACCInternal::m_concurrency = -1; Kokkos::Experimental::Impl::OpenACCInternal& Kokkos::Experimental::Impl::OpenACCInternal::singleton() { @@ -78,8 +79,18 @@ void Kokkos::Experimental::Impl::OpenACCInternal::fence( [&]() { acc_wait(m_async_arg); }); } -uint32_t Kokkos::Experimental::Impl::OpenACCInternal::instance_id() const - noexcept { +uint32_t Kokkos::Experimental::Impl::OpenACCInternal::instance_id() + const noexcept { return Kokkos::Tools::Experimental::Impl::idForInstance( reinterpret_cast(this)); } + +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +int Kokkos::Experimental::OpenACC::concurrency() { + return Impl::OpenACCInternal::m_concurrency; +} +#else +int Kokkos::Experimental::OpenACC::concurrency() const { + return Impl::OpenACCInternal::m_concurrency; +} +#endif diff --git a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.hpp b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.hpp index c3d723687270..343d9921a95a 100644 --- a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.hpp +++ b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.hpp @@ -30,11 +30,12 @@ namespace Kokkos::Experimental::Impl { class OpenACCInternal { bool m_is_initialized = false; - OpenACCInternal(const OpenACCInternal&) = default; + OpenACCInternal(const OpenACCInternal&) = default; OpenACCInternal& operator=(const OpenACCInternal&) = default; public: static int m_acc_device_num; + static int m_concurrency; int m_async_arg = acc_async_noval; OpenACCInternal() = default; diff --git a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_MDRange.hpp b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_MDRange.hpp index 550436fe7bec..629d26928ed3 100644 --- a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_MDRange.hpp +++ b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_MDRange.hpp @@ -30,10 +30,23 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateLeft, OpenACCMDRangeBegin<2> const& begin, OpenACCMDRangeEnd<2> const& end, int async_arg) { - int begin1 = begin[1]; - int end1 = end[1]; - int begin0 = begin[0]; - int end0 = end[0]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin0 = begin[0]; + auto end0 = end[0]; +#if defined(KOKKOS_ENABLE_OPENACC_COLLAPSE_MDRANGE_LOOPS) + auto dim1 = end1 - begin1; + auto dim0 = end0 - begin0; + auto nIter = dim1 * dim0; +// clang-format off +#pragma acc parallel loop gang vector copyin(functor) async(async_arg) + // clang-format on + for (decltype(nIter) m = 0; m < nIter; ++m) { + auto i1 = m / dim0 + begin1; + auto i0 = m % dim0 + begin0; + functor(i0, i1); + } +#else // clang-format off #pragma acc parallel loop gang vector collapse(2) copyin(functor) async(async_arg) // clang-format on @@ -42,6 +55,7 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateLeft, functor(i0, i1); } } +#endif } template @@ -50,10 +64,23 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateRight, OpenACCMDRangeBegin<2> const& begin, OpenACCMDRangeEnd<2> const& end, int async_arg) { - int begin0 = begin[0]; - int end0 = end[0]; - int begin1 = begin[1]; - int end1 = end[1]; + auto begin0 = begin[0]; + auto end0 = end[0]; + auto begin1 = begin[1]; + auto end1 = end[1]; +#if defined(KOKKOS_ENABLE_OPENACC_COLLAPSE_MDRANGE_LOOPS) + auto dim1 = end1 - begin1; + auto dim0 = end0 - begin0; + auto nIter = dim1 * dim0; +// clang-format off +#pragma acc parallel loop gang vector copyin(functor) async(async_arg) + // clang-format on + for (decltype(nIter) m = 0; m < nIter; ++m) { + auto i0 = m / dim1 + begin0; + auto i1 = m % dim1 + begin1; + functor(i0, i1); + } +#else // clang-format off #pragma acc parallel loop gang vector collapse(2) copyin(functor) async(async_arg) // clang-format on @@ -62,6 +89,7 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateRight, functor(i0, i1); } } +#endif } template @@ -71,12 +99,12 @@ void OpenACCParallelForMDRangePolicy(OpenACCTile, OpenACCIterateLeft, OpenACCMDRangeEnd<2> const& end, OpenACCMDRangeTile<2> const& tile, int async_arg) { - int tile0 = tile[0]; - int tile1 = tile[1]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin0 = begin[0]; - int end0 = end[0]; + auto tile0 = tile[0]; + auto tile1 = tile[1]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin0 = begin[0]; + auto end0 = end[0]; // clang-format off #pragma acc parallel loop gang vector tile(tile0,tile1) copyin(functor) async(async_arg) // clang-format on @@ -94,12 +122,12 @@ void OpenACCParallelForMDRangePolicy(OpenACCTile, OpenACCIterateRight, OpenACCMDRangeEnd<2> const& end, OpenACCMDRangeTile<2> const& tile, int async_arg) { - int tile1 = tile[1]; - int tile0 = tile[0]; - int begin0 = begin[0]; - int end0 = end[0]; - int begin1 = begin[1]; - int end1 = end[1]; + auto tile1 = tile[1]; + auto tile0 = tile[0]; + auto begin0 = begin[0]; + auto end0 = end[0]; + auto begin1 = begin[1]; + auto end1 = end[1]; // clang-format off #pragma acc parallel loop gang vector tile(tile1,tile0) copyin(functor) async(async_arg) // clang-format on @@ -116,12 +144,29 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateLeft, OpenACCMDRangeBegin<3> const& begin, OpenACCMDRangeEnd<3> const& end, int async_arg) { - int begin2 = begin[2]; - int end2 = end[2]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin0 = begin[0]; - int end0 = end[0]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin0 = begin[0]; + auto end0 = end[0]; +#if defined(KOKKOS_ENABLE_OPENACC_COLLAPSE_MDRANGE_LOOPS) + auto dim2 = end2 - begin2; + auto dim1 = end1 - begin1; + auto dim0 = end0 - begin0; + auto nIter = dim2 * dim1 * dim0; +// clang-format off +#pragma acc parallel loop gang vector copyin(functor) async(async_arg) + // clang-format on + for (decltype(nIter) m = 0; m < nIter; ++m) { + auto tmp1 = dim1 * dim0; + auto i2 = m / tmp1 + begin2; + auto tmp2 = m % tmp1; + auto i1 = tmp2 / dim0 + begin1; + auto i0 = tmp2 % dim0 + begin0; + functor(i0, i1, i2); + } +#else // clang-format off #pragma acc parallel loop gang vector collapse(3) copyin(functor) async(async_arg) // clang-format on @@ -132,6 +177,7 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateLeft, } } } +#endif } template @@ -140,12 +186,29 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateRight, OpenACCMDRangeBegin<3> const& begin, OpenACCMDRangeEnd<3> const& end, int async_arg) { - int begin0 = begin[0]; - int end0 = end[0]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin2 = begin[2]; - int end2 = end[2]; + auto begin0 = begin[0]; + auto end0 = end[0]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin2 = begin[2]; + auto end2 = end[2]; +#if defined(KOKKOS_ENABLE_OPENACC_COLLAPSE_MDRANGE_LOOPS) + auto dim2 = end2 - begin2; + auto dim1 = end1 - begin1; + auto dim0 = end0 - begin0; + auto nIter = dim2 * dim1 * dim0; +// clang-format off +#pragma acc parallel loop gang vector copyin(functor) async(async_arg) + // clang-format on + for (decltype(nIter) m = 0; m < nIter; ++m) { + auto tmp1 = dim2 * dim1; + auto i0 = m / tmp1 + begin0; + auto tmp2 = m % tmp1; + auto i1 = tmp2 / dim2 + begin1; + auto i2 = tmp2 % dim2 + begin2; + functor(i0, i1, i2); + } +#else // clang-format off #pragma acc parallel loop gang vector collapse(3) copyin(functor) async(async_arg) // clang-format on @@ -156,6 +219,7 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateRight, } } } +#endif } template @@ -165,15 +229,15 @@ void OpenACCParallelForMDRangePolicy(OpenACCTile, OpenACCIterateLeft, OpenACCMDRangeEnd<3> const& end, OpenACCMDRangeTile<3> const& tile, int async_arg) { - int tile0 = tile[0]; - int tile1 = tile[1]; - int tile2 = tile[2]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin0 = begin[0]; - int end0 = end[0]; + auto tile0 = tile[0]; + auto tile1 = tile[1]; + auto tile2 = tile[2]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin0 = begin[0]; + auto end0 = end[0]; // clang-format off #pragma acc parallel loop gang vector tile(tile0,tile1,tile2) copyin(functor) async(async_arg) // clang-format on @@ -193,15 +257,15 @@ void OpenACCParallelForMDRangePolicy(OpenACCTile, OpenACCIterateRight, OpenACCMDRangeEnd<3> const& end, OpenACCMDRangeTile<3> const& tile, int async_arg) { - int tile2 = tile[2]; - int tile1 = tile[1]; - int tile0 = tile[0]; - int begin0 = begin[0]; - int end0 = end[0]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin2 = begin[2]; - int end2 = end[2]; + auto tile2 = tile[2]; + auto tile1 = tile[1]; + auto tile0 = tile[0]; + auto begin0 = begin[0]; + auto end0 = end[0]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin2 = begin[2]; + auto end2 = end[2]; // clang-format off #pragma acc parallel loop gang vector tile(tile2,tile1,tile0) copyin(functor) async(async_arg) // clang-format on @@ -220,14 +284,35 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateLeft, OpenACCMDRangeBegin<4> const& begin, OpenACCMDRangeEnd<4> const& end, int async_arg) { - int begin3 = begin[3]; - int end3 = end[3]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin0 = begin[0]; - int end0 = end[0]; + auto begin3 = begin[3]; + auto end3 = end[3]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin0 = begin[0]; + auto end0 = end[0]; +#if defined(KOKKOS_ENABLE_OPENACC_COLLAPSE_MDRANGE_LOOPS) + auto dim3 = end3 - begin3; + auto dim2 = end2 - begin2; + auto dim1 = end1 - begin1; + auto dim0 = end0 - begin0; + auto nIter = dim3 * dim2 * dim1 * dim0; +// clang-format off +#pragma acc parallel loop gang vector copyin(functor) async(async_arg) + // clang-format on + for (decltype(nIter) m = 0; m < nIter; ++m) { + auto tmp1 = dim2 * dim1 * dim0; + auto i3 = m / tmp1 + begin3; + auto tmp2 = m % tmp1; + tmp1 = dim1 * dim0; + auto i2 = tmp2 / tmp1 + begin2; + tmp2 = tmp2 % tmp1; + auto i1 = tmp2 / dim0 + begin1; + auto i0 = tmp2 % dim0 + begin0; + functor(i0, i1, i2, i3); + } +#else // clang-format off #pragma acc parallel loop gang vector collapse(4) copyin(functor) async(async_arg) // clang-format on @@ -240,6 +325,7 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateLeft, } } } +#endif } template @@ -248,14 +334,35 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateRight, OpenACCMDRangeBegin<4> const& begin, OpenACCMDRangeEnd<4> const& end, int async_arg) { - int begin0 = begin[0]; - int end0 = end[0]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin3 = begin[3]; - int end3 = end[3]; + auto begin0 = begin[0]; + auto end0 = end[0]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin3 = begin[3]; + auto end3 = end[3]; +#if defined(KOKKOS_ENABLE_OPENACC_COLLAPSE_MDRANGE_LOOPS) + auto dim3 = end3 - begin3; + auto dim2 = end2 - begin2; + auto dim1 = end1 - begin1; + auto dim0 = end0 - begin0; + auto nIter = dim3 * dim2 * dim1 * dim0; +// clang-format off +#pragma acc parallel loop gang vector copyin(functor) async(async_arg) + // clang-format on + for (decltype(nIter) m = 0; m < nIter; ++m) { + auto tmp1 = dim3 * dim2 * dim1; + auto i0 = m / tmp1 + begin0; + auto tmp2 = m % tmp1; + tmp1 = dim3 * dim2; + auto i1 = tmp2 / tmp1 + begin1; + tmp2 = tmp2 % tmp1; + auto i2 = tmp2 / dim3 + begin2; + auto i3 = tmp2 % dim3 + begin3; + functor(i0, i1, i2, i3); + } +#else // clang-format off #pragma acc parallel loop gang vector collapse(4) copyin(functor) async(async_arg) // clang-format on @@ -268,6 +375,7 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateRight, } } } +#endif } template @@ -277,18 +385,18 @@ void OpenACCParallelForMDRangePolicy(OpenACCTile, OpenACCIterateLeft, OpenACCMDRangeEnd<4> const& end, OpenACCMDRangeTile<4> const& tile, int async_arg) { - int tile0 = tile[0]; - int tile1 = tile[1]; - int tile2 = tile[2]; - int tile3 = tile[3]; - int begin3 = begin[3]; - int end3 = end[3]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin0 = begin[0]; - int end0 = end[0]; + auto tile0 = tile[0]; + auto tile1 = tile[1]; + auto tile2 = tile[2]; + auto tile3 = tile[3]; + auto begin3 = begin[3]; + auto end3 = end[3]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin0 = begin[0]; + auto end0 = end[0]; // clang-format off #pragma acc parallel loop gang vector tile(tile0,tile1,tile2,tile3) copyin(functor) async(async_arg) // clang-format on @@ -310,18 +418,18 @@ void OpenACCParallelForMDRangePolicy(OpenACCTile, OpenACCIterateRight, OpenACCMDRangeEnd<4> const& end, OpenACCMDRangeTile<4> const& tile, int async_arg) { - int tile3 = tile[3]; - int tile2 = tile[2]; - int tile1 = tile[1]; - int tile0 = tile[0]; - int begin0 = begin[0]; - int end0 = end[0]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin3 = begin[3]; - int end3 = end[3]; + auto tile3 = tile[3]; + auto tile2 = tile[2]; + auto tile1 = tile[1]; + auto tile0 = tile[0]; + auto begin0 = begin[0]; + auto end0 = end[0]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin3 = begin[3]; + auto end3 = end[3]; // clang-format off #pragma acc parallel loop gang vector tile(tile3,tile2,tile1,tile0) copyin(functor) async(async_arg) // clang-format on @@ -342,16 +450,41 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateLeft, OpenACCMDRangeBegin<5> const& begin, OpenACCMDRangeEnd<5> const& end, int async_arg) { - int begin4 = begin[4]; - int end4 = end[4]; - int begin3 = begin[3]; - int end3 = end[3]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin0 = begin[0]; - int end0 = end[0]; + auto begin4 = begin[4]; + auto end4 = end[4]; + auto begin3 = begin[3]; + auto end3 = end[3]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin0 = begin[0]; + auto end0 = end[0]; +#if defined(KOKKOS_ENABLE_OPENACC_COLLAPSE_MDRANGE_LOOPS) + auto dim4 = end4 - begin4; + auto dim3 = end3 - begin3; + auto dim2 = end2 - begin2; + auto dim1 = end1 - begin1; + auto dim0 = end0 - begin0; + auto nIter = dim4 * dim3 * dim2 * dim1 * dim0; +// clang-format off +#pragma acc parallel loop gang vector copyin(functor) async(async_arg) + // clang-format on + for (decltype(nIter) m = 0; m < nIter; ++m) { + auto tmp1 = dim3 * dim2 * dim1 * dim0; + auto i4 = m / tmp1 + begin4; + auto tmp2 = m % tmp1; + tmp1 = dim2 * dim1 * dim0; + auto i3 = tmp2 / tmp1 + begin3; + tmp2 = tmp2 % tmp1; + tmp1 = dim1 * dim0; + auto i2 = tmp2 / tmp1 + begin2; + tmp2 = tmp2 % tmp1; + auto i1 = tmp2 / dim0 + begin1; + auto i0 = tmp2 % dim0 + begin0; + functor(i0, i1, i2, i3, i4); + } +#else // clang-format off #pragma acc parallel loop gang vector collapse(5) copyin(functor) async(async_arg) // clang-format on @@ -366,6 +499,7 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateLeft, } } } +#endif } template @@ -374,16 +508,41 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateRight, OpenACCMDRangeBegin<5> const& begin, OpenACCMDRangeEnd<5> const& end, int async_arg) { - int begin0 = begin[0]; - int end0 = end[0]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin3 = begin[3]; - int end3 = end[3]; - int begin4 = begin[4]; - int end4 = end[4]; + auto begin0 = begin[0]; + auto end0 = end[0]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin3 = begin[3]; + auto end3 = end[3]; + auto begin4 = begin[4]; + auto end4 = end[4]; +#if defined(KOKKOS_ENABLE_OPENACC_COLLAPSE_MDRANGE_LOOPS) + auto dim4 = end4 - begin4; + auto dim3 = end3 - begin3; + auto dim2 = end2 - begin2; + auto dim1 = end1 - begin1; + auto dim0 = end0 - begin0; + auto nIter = dim4 * dim3 * dim2 * dim1 * dim0; +// clang-format off +#pragma acc parallel loop gang vector copyin(functor) async(async_arg) + // clang-format on + for (decltype(nIter) m = 0; m < nIter; ++m) { + auto tmp1 = dim4 * dim3 * dim2 * dim1; + auto i0 = m / tmp1 + begin0; + auto tmp2 = m % tmp1; + tmp1 = dim4 * dim3 * dim2; + auto i1 = tmp2 / tmp1 + begin1; + tmp2 = tmp2 % tmp1; + tmp1 = dim4 * dim3; + auto i2 = tmp2 / tmp1 + begin2; + tmp2 = tmp2 % tmp1; + auto i3 = tmp2 / dim4 + begin3; + auto i4 = tmp2 % dim4 + begin4; + functor(i0, i1, i2, i3, i4); + } +#else // clang-format off #pragma acc parallel loop gang vector collapse(5) copyin(functor) async(async_arg) // clang-format on @@ -398,6 +557,7 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateRight, } } } +#endif } template @@ -407,21 +567,21 @@ void OpenACCParallelForMDRangePolicy(OpenACCTile, OpenACCIterateLeft, OpenACCMDRangeEnd<5> const& end, OpenACCMDRangeTile<5> const& tile, int async_arg) { - int tile0 = tile[0]; - int tile1 = tile[1]; - int tile2 = tile[2]; - int tile3 = tile[3]; - int tile4 = tile[4]; - int begin4 = begin[4]; - int end4 = end[4]; - int begin3 = begin[3]; - int end3 = end[3]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin0 = begin[0]; - int end0 = end[0]; + auto tile0 = tile[0]; + auto tile1 = tile[1]; + auto tile2 = tile[2]; + auto tile3 = tile[3]; + auto tile4 = tile[4]; + auto begin4 = begin[4]; + auto end4 = end[4]; + auto begin3 = begin[3]; + auto end3 = end[3]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin0 = begin[0]; + auto end0 = end[0]; // clang-format off #pragma acc parallel loop gang vector tile(tile0,tile1,tile2,tile3,tile4) copyin(functor) async(async_arg) // clang-format on @@ -445,21 +605,21 @@ void OpenACCParallelForMDRangePolicy(OpenACCTile, OpenACCIterateRight, OpenACCMDRangeEnd<5> const& end, OpenACCMDRangeTile<5> const& tile, int async_arg) { - int tile4 = tile[4]; - int tile3 = tile[3]; - int tile2 = tile[2]; - int tile1 = tile[1]; - int tile0 = tile[0]; - int begin0 = begin[0]; - int end0 = end[0]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin3 = begin[3]; - int end3 = end[3]; - int begin4 = begin[4]; - int end4 = end[4]; + auto tile4 = tile[4]; + auto tile3 = tile[3]; + auto tile2 = tile[2]; + auto tile1 = tile[1]; + auto tile0 = tile[0]; + auto begin0 = begin[0]; + auto end0 = end[0]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin3 = begin[3]; + auto end3 = end[3]; + auto begin4 = begin[4]; + auto end4 = end[4]; // clang-format off #pragma acc parallel loop gang vector tile(tile4,tile3,tile2,tile1,tile0) copyin(functor) async(async_arg) // clang-format on @@ -482,18 +642,47 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateLeft, OpenACCMDRangeBegin<6> const& begin, OpenACCMDRangeEnd<6> const& end, int async_arg) { - int begin5 = begin[5]; - int end5 = end[5]; - int begin4 = begin[4]; - int end4 = end[4]; - int begin3 = begin[3]; - int end3 = end[3]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin0 = begin[0]; - int end0 = end[0]; + auto begin5 = begin[5]; + auto end5 = end[5]; + auto begin4 = begin[4]; + auto end4 = end[4]; + auto begin3 = begin[3]; + auto end3 = end[3]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin0 = begin[0]; + auto end0 = end[0]; +#if defined(KOKKOS_ENABLE_OPENACC_COLLAPSE_MDRANGE_LOOPS) + auto dim5 = end5 - begin5; + auto dim4 = end4 - begin4; + auto dim3 = end3 - begin3; + auto dim2 = end2 - begin2; + auto dim1 = end1 - begin1; + auto dim0 = end0 - begin0; + auto nIter = dim5 * dim4 * dim3 * dim2 * dim1 * dim0; +// clang-format off +#pragma acc parallel loop gang vector copyin(functor) async(async_arg) + // clang-format on + for (decltype(nIter) m = 0; m < nIter; ++m) { + auto tmp1 = dim4 * dim3 * dim2 * dim1 * dim0; + auto i5 = m / tmp1 + begin5; + auto tmp2 = m % tmp1; + tmp1 = dim3 * dim2 * dim1 * dim0; + auto i4 = tmp2 / tmp1 + begin4; + tmp2 = tmp2 % tmp1; + tmp1 = dim2 * dim1 * dim0; + auto i3 = tmp2 / tmp1 + begin3; + tmp2 = tmp2 % tmp1; + tmp1 = dim1 * dim0; + auto i2 = tmp2 / tmp1 + begin2; + tmp2 = tmp2 % tmp1; + auto i1 = tmp2 / dim0 + begin1; + auto i0 = tmp2 % dim0 + begin0; + functor(i0, i1, i2, i3, i4, i5); + } +#else // clang-format off #pragma acc parallel loop gang vector collapse(6) copyin(functor) async(async_arg) // clang-format on @@ -510,6 +699,7 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateLeft, } } } +#endif } template @@ -518,18 +708,47 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateRight, OpenACCMDRangeBegin<6> const& begin, OpenACCMDRangeEnd<6> const& end, int async_arg) { - int begin0 = begin[0]; - int end0 = end[0]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin3 = begin[3]; - int end3 = end[3]; - int begin4 = begin[4]; - int end4 = end[4]; - int begin5 = begin[5]; - int end5 = end[5]; + auto begin0 = begin[0]; + auto end0 = end[0]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin3 = begin[3]; + auto end3 = end[3]; + auto begin4 = begin[4]; + auto end4 = end[4]; + auto begin5 = begin[5]; + auto end5 = end[5]; +#if defined(KOKKOS_ENABLE_OPENACC_COLLAPSE_MDRANGE_LOOPS) + auto dim5 = end5 - begin5; + auto dim4 = end4 - begin4; + auto dim3 = end3 - begin3; + auto dim2 = end2 - begin2; + auto dim1 = end1 - begin1; + auto dim0 = end0 - begin0; + auto nIter = dim5 * dim4 * dim3 * dim2 * dim1 * dim0; +// clang-format off +#pragma acc parallel loop gang vector copyin(functor) async(async_arg) + // clang-format on + for (decltype(nIter) m = 0; m < nIter; ++m) { + auto tmp1 = dim5 * dim4 * dim3 * dim2 * dim1; + auto i0 = m / tmp1 + begin0; + auto tmp2 = m % tmp1; + tmp1 = dim5 * dim4 * dim3 * dim2; + auto i1 = tmp2 / tmp1 + begin1; + tmp2 = tmp2 % tmp1; + tmp1 = dim5 * dim4 * dim3; + auto i2 = tmp2 / tmp1 + begin2; + tmp2 = tmp2 % tmp1; + tmp1 = dim5 * dim4; + auto i3 = tmp2 / tmp1 + begin3; + tmp2 = tmp2 % tmp1; + auto i4 = tmp2 / dim5 + begin4; + auto i5 = tmp2 % dim5 + begin5; + functor(i0, i1, i2, i3, i4, i5); + } +#else // clang-format off #pragma acc parallel loop gang vector collapse(6) copyin(functor) async(async_arg) // clang-format on @@ -546,6 +765,7 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateRight, } } } +#endif } template @@ -555,24 +775,24 @@ void OpenACCParallelForMDRangePolicy(OpenACCTile, OpenACCIterateLeft, OpenACCMDRangeEnd<6> const& end, OpenACCMDRangeTile<6> const& tile, int async_arg) { - int tile0 = tile[0]; - int tile1 = tile[1]; - int tile2 = tile[2]; - int tile3 = tile[3]; - int tile4 = tile[4]; - int tile5 = tile[5]; - int begin5 = begin[5]; - int end5 = end[5]; - int begin4 = begin[4]; - int end4 = end[4]; - int begin3 = begin[3]; - int end3 = end[3]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin0 = begin[0]; - int end0 = end[0]; + auto tile0 = tile[0]; + auto tile1 = tile[1]; + auto tile2 = tile[2]; + auto tile3 = tile[3]; + auto tile4 = tile[4]; + auto tile5 = tile[5]; + auto begin5 = begin[5]; + auto end5 = end[5]; + auto begin4 = begin[4]; + auto end4 = end[4]; + auto begin3 = begin[3]; + auto end3 = end[3]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin0 = begin[0]; + auto end0 = end[0]; // clang-format off #pragma acc parallel loop gang vector tile(tile0,tile1,tile2,tile3,tile4,tile5) copyin(functor) async(async_arg) // clang-format on @@ -598,24 +818,24 @@ void OpenACCParallelForMDRangePolicy(OpenACCTile, OpenACCIterateRight, OpenACCMDRangeEnd<6> const& end, OpenACCMDRangeTile<6> const& tile, int async_arg) { - int tile5 = tile[5]; - int tile4 = tile[4]; - int tile3 = tile[3]; - int tile2 = tile[2]; - int tile1 = tile[1]; - int tile0 = tile[0]; - int begin0 = begin[0]; - int end0 = end[0]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin3 = begin[3]; - int end3 = end[3]; - int begin4 = begin[4]; - int end4 = end[4]; - int begin5 = begin[5]; - int end5 = end[5]; + auto tile5 = tile[5]; + auto tile4 = tile[4]; + auto tile3 = tile[3]; + auto tile2 = tile[2]; + auto tile1 = tile[1]; + auto tile0 = tile[0]; + auto begin0 = begin[0]; + auto end0 = end[0]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin3 = begin[3]; + auto end3 = end[3]; + auto begin4 = begin[4]; + auto end4 = end[4]; + auto begin5 = begin[5]; + auto end5 = end[5]; // clang-format off #pragma acc parallel loop gang vector tile(tile5,tile4,tile3,tile2,tile1,tile0) copyin(functor) async(async_arg) // clang-format on diff --git a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp index 5afb5e75d392..2b5631d6f8a3 100644 --- a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp +++ b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp @@ -113,6 +113,404 @@ class Kokkos::Impl::ParallelReduce \ + void OpenACCParallelReduce##REDUCER(OpenACCIterateLeft, ValueType& aval, \ + Functor const& afunctor, \ + OpenACCMDRangeBegin<2> const& begin, \ + OpenACCMDRangeEnd<2> const& end, \ + int async_arg) { \ + auto val = aval; \ + auto const functor(afunctor); \ + auto begin1 = begin[1]; \ + auto end1 = end[1]; \ + auto begin0 = begin[0]; \ + auto end0 = end[0]; \ + auto dim1 = end1 - begin1; \ + auto dim0 = end0 - begin0; \ + auto nIter = dim1 * dim0; \ + /* clang-format off */ \ + KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ + /* clang-format on */ \ + for (decltype(nIter) m = 0; m < nIter; ++m) { \ + auto i1 = m / dim0 + begin1; \ + auto i0 = m % dim0 + begin0; \ + functor(i0, i1, val); \ + } \ + acc_wait(async_arg); \ + aval = val; \ + } \ + \ + template \ + void OpenACCParallelReduce##REDUCER(OpenACCIterateRight, ValueType& aval, \ + Functor const& afunctor, \ + OpenACCMDRangeBegin<2> const& begin, \ + OpenACCMDRangeEnd<2> const& end, \ + int async_arg) { \ + auto val = aval; \ + auto const functor(afunctor); \ + auto begin0 = begin[0]; \ + auto end0 = end[0]; \ + auto begin1 = begin[1]; \ + auto end1 = end[1]; \ + auto dim1 = end1 - begin1; \ + auto dim0 = end0 - begin0; \ + auto nIter = dim1 * dim0; \ + /* clang-format off */ \ + KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ + /* clang-format on */ \ + for (decltype(nIter) m = 0; m < nIter; ++m) { \ + auto i0 = m / dim1 + begin0; \ + auto i1 = m % dim1 + begin1; \ + functor(i0, i1, val); \ + } \ + acc_wait(async_arg); \ + aval = val; \ + } \ + \ + template \ + void OpenACCParallelReduce##REDUCER(OpenACCIterateLeft, ValueType& aval, \ + Functor const& afunctor, \ + OpenACCMDRangeBegin<3> const& begin, \ + OpenACCMDRangeEnd<3> const& end, \ + int async_arg) { \ + auto val = aval; \ + auto const functor(afunctor); \ + auto begin2 = begin[2]; \ + auto end2 = end[2]; \ + auto begin1 = begin[1]; \ + auto end1 = end[1]; \ + auto begin0 = begin[0]; \ + auto end0 = end[0]; \ + auto dim2 = end2 - begin2; \ + auto dim1 = end1 - begin1; \ + auto dim0 = end0 - begin0; \ + auto nIter = dim2 * dim1 * dim0; \ + /* clang-format off */ \ + KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ + /* clang-format on */ \ + for (decltype(nIter) m = 0; m < nIter; ++m) { \ + auto tmp1 = dim1 * dim0; \ + auto i2 = m / tmp1 + begin2; \ + auto tmp2 = m % tmp1; \ + auto i1 = tmp2 / dim0 + begin1; \ + auto i0 = tmp2 % dim0 + begin0; \ + functor(i0, i1, i2, val); \ + } \ + acc_wait(async_arg); \ + aval = val; \ + } \ + \ + template \ + void OpenACCParallelReduce##REDUCER(OpenACCIterateRight, ValueType& aval, \ + Functor const& afunctor, \ + OpenACCMDRangeBegin<3> const& begin, \ + OpenACCMDRangeEnd<3> const& end, \ + int async_arg) { \ + auto val = aval; \ + auto const functor(afunctor); \ + auto begin0 = begin[0]; \ + auto end0 = end[0]; \ + auto begin1 = begin[1]; \ + auto end1 = end[1]; \ + auto begin2 = begin[2]; \ + auto end2 = end[2]; \ + auto dim2 = end2 - begin2; \ + auto dim1 = end1 - begin1; \ + auto dim0 = end0 - begin0; \ + auto nIter = dim2 * dim1 * dim0; \ + /* clang-format off */ \ + KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ + /* clang-format on */ \ + for (decltype(nIter) m = 0; m < nIter; ++m) { \ + auto tmp1 = dim2 * dim1; \ + auto i0 = m / tmp1 + begin0; \ + auto tmp2 = m % tmp1; \ + auto i1 = tmp2 / dim2 + begin1; \ + auto i2 = tmp2 % dim2 + begin2; \ + functor(i0, i1, i2, val); \ + } \ + acc_wait(async_arg); \ + aval = val; \ + } \ + \ + template \ + void OpenACCParallelReduce##REDUCER(OpenACCIterateLeft, ValueType& aval, \ + Functor const& afunctor, \ + OpenACCMDRangeBegin<4> const& begin, \ + OpenACCMDRangeEnd<4> const& end, \ + int async_arg) { \ + auto val = aval; \ + auto const functor(afunctor); \ + auto begin3 = begin[3]; \ + auto end3 = end[3]; \ + auto begin2 = begin[2]; \ + auto end2 = end[2]; \ + auto begin1 = begin[1]; \ + auto end1 = end[1]; \ + auto begin0 = begin[0]; \ + auto end0 = end[0]; \ + auto dim3 = end3 - begin3; \ + auto dim2 = end2 - begin2; \ + auto dim1 = end1 - begin1; \ + auto dim0 = end0 - begin0; \ + auto nIter = dim3 * dim2 * dim1 * dim0; \ + /* clang-format off */ \ + KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ + /* clang-format on */ \ + for (decltype(nIter) m = 0; m < nIter; ++m) { \ + auto tmp1 = dim2 * dim1 * dim0; \ + auto i3 = m / tmp1 + begin3; \ + auto tmp2 = m % tmp1; \ + tmp1 = dim1 * dim0; \ + auto i2 = tmp2 / tmp1 + begin2; \ + tmp2 = tmp2 % tmp1; \ + auto i1 = tmp2 / dim0 + begin1; \ + auto i0 = tmp2 % dim0 + begin0; \ + functor(i0, i1, i2, i3, val); \ + } \ + acc_wait(async_arg); \ + aval = val; \ + } \ + \ + template \ + void OpenACCParallelReduce##REDUCER(OpenACCIterateRight, ValueType& aval, \ + Functor const& afunctor, \ + OpenACCMDRangeBegin<4> const& begin, \ + OpenACCMDRangeEnd<4> const& end, \ + int async_arg) { \ + auto val = aval; \ + auto const functor(afunctor); \ + auto begin0 = begin[0]; \ + auto end0 = end[0]; \ + auto begin1 = begin[1]; \ + auto end1 = end[1]; \ + auto begin2 = begin[2]; \ + auto end2 = end[2]; \ + auto begin3 = begin[3]; \ + auto end3 = end[3]; \ + auto dim3 = end3 - begin3; \ + auto dim2 = end2 - begin2; \ + auto dim1 = end1 - begin1; \ + auto dim0 = end0 - begin0; \ + auto nIter = dim3 * dim2 * dim1 * dim0; \ + /* clang-format off */ \ + KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ + /* clang-format on */ \ + for (decltype(nIter) m = 0; m < nIter; ++m) { \ + auto tmp1 = dim3 * dim2 * dim1; \ + auto i0 = m / tmp1 + begin0; \ + auto tmp2 = m % tmp1; \ + tmp1 = dim3 * dim2; \ + auto i1 = tmp2 / tmp1 + begin1; \ + tmp2 = tmp2 % tmp1; \ + auto i2 = tmp2 / dim3 + begin2; \ + auto i3 = tmp2 % dim3 + begin3; \ + functor(i0, i1, i2, i3, val); \ + } \ + acc_wait(async_arg); \ + aval = val; \ + } \ + \ + template \ + void OpenACCParallelReduce##REDUCER(OpenACCIterateLeft, ValueType& aval, \ + Functor const& afunctor, \ + OpenACCMDRangeBegin<5> const& begin, \ + OpenACCMDRangeEnd<5> const& end, \ + int async_arg) { \ + auto val = aval; \ + auto const functor(afunctor); \ + auto begin4 = begin[4]; \ + auto end4 = end[4]; \ + auto begin3 = begin[3]; \ + auto end3 = end[3]; \ + auto begin2 = begin[2]; \ + auto end2 = end[2]; \ + auto begin1 = begin[1]; \ + auto end1 = end[1]; \ + auto begin0 = begin[0]; \ + auto end0 = end[0]; \ + auto dim4 = end4 - begin4; \ + auto dim3 = end3 - begin3; \ + auto dim2 = end2 - begin2; \ + auto dim1 = end1 - begin1; \ + auto dim0 = end0 - begin0; \ + auto nIter = dim4 * dim3 * dim2 * dim1 * dim0; \ + /* clang-format off */ \ + KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ + /* clang-format on */ \ + for (decltype(nIter) m = 0; m < nIter; ++m) { \ + auto tmp1 = dim3 * dim2 * dim1 * dim0; \ + auto i4 = m / tmp1 + begin4; \ + auto tmp2 = m % tmp1; \ + tmp1 = dim2 * dim1 * dim0; \ + auto i3 = tmp2 / tmp1 + begin3; \ + tmp2 = tmp2 % tmp1; \ + tmp1 = dim1 * dim0; \ + auto i2 = tmp2 / tmp1 + begin2; \ + tmp2 = tmp2 % tmp1; \ + auto i1 = tmp2 / dim0 + begin1; \ + auto i0 = tmp2 % dim0 + begin0; \ + functor(i0, i1, i2, i3, i4, val); \ + } \ + acc_wait(async_arg); \ + aval = val; \ + } \ + \ + template \ + void OpenACCParallelReduce##REDUCER(OpenACCIterateRight, ValueType& aval, \ + Functor const& afunctor, \ + OpenACCMDRangeBegin<5> const& begin, \ + OpenACCMDRangeEnd<5> const& end, \ + int async_arg) { \ + auto val = aval; \ + auto const functor(afunctor); \ + auto begin0 = begin[0]; \ + auto end0 = end[0]; \ + auto begin1 = begin[1]; \ + auto end1 = end[1]; \ + auto begin2 = begin[2]; \ + auto end2 = end[2]; \ + auto begin3 = begin[3]; \ + auto end3 = end[3]; \ + auto begin4 = begin[4]; \ + auto end4 = end[4]; \ + auto dim4 = end4 - begin4; \ + auto dim3 = end3 - begin3; \ + auto dim2 = end2 - begin2; \ + auto dim1 = end1 - begin1; \ + auto dim0 = end0 - begin0; \ + auto nIter = dim4 * dim3 * dim2 * dim1 * dim0; \ + /* clang-format off */ \ + KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ + /* clang-format on */ \ + for (decltype(nIter) m = 0; m < nIter; ++m) { \ + auto tmp1 = dim4 * dim3 * dim2 * dim1; \ + auto i0 = m / tmp1 + begin0; \ + auto tmp2 = m % tmp1; \ + tmp1 = dim4 * dim3 * dim2; \ + auto i1 = tmp2 / tmp1 + begin1; \ + tmp2 = tmp2 % tmp1; \ + tmp1 = dim4 * dim3; \ + auto i2 = tmp2 / tmp1 + begin2; \ + tmp2 = tmp2 % tmp1; \ + auto i3 = tmp2 / dim4 + begin3; \ + auto i4 = tmp2 % dim4 + begin4; \ + functor(i0, i1, i2, i3, i4, val); \ + } \ + acc_wait(async_arg); \ + aval = val; \ + } \ + \ + template \ + void OpenACCParallelReduce##REDUCER(OpenACCIterateLeft, ValueType& aval, \ + Functor const& afunctor, \ + OpenACCMDRangeBegin<6> const& begin, \ + OpenACCMDRangeEnd<6> const& end, \ + int async_arg) { \ + auto val = aval; \ + auto const functor(afunctor); \ + auto begin5 = begin[5]; \ + auto end5 = end[5]; \ + auto begin4 = begin[4]; \ + auto end4 = end[4]; \ + auto begin3 = begin[3]; \ + auto end3 = end[3]; \ + auto begin2 = begin[2]; \ + auto end2 = end[2]; \ + auto begin1 = begin[1]; \ + auto end1 = end[1]; \ + auto begin0 = begin[0]; \ + auto end0 = end[0]; \ + auto dim5 = end5 - begin5; \ + auto dim4 = end4 - begin4; \ + auto dim3 = end3 - begin3; \ + auto dim2 = end2 - begin2; \ + auto dim1 = end1 - begin1; \ + auto dim0 = end0 - begin0; \ + auto nIter = dim5 * dim4 * dim3 * dim2 * dim1 * dim0; \ + /* clang-format off */ \ + KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ + /* clang-format on */ \ + for (decltype(nIter) m = 0; m < nIter; ++m) { \ + auto tmp1 = dim4 * dim3 * dim2 * dim1 * dim0; \ + auto i5 = m / tmp1 + begin5; \ + auto tmp2 = m % tmp1; \ + tmp1 = dim3 * dim2 * dim1 * dim0; \ + auto i4 = tmp2 / tmp1 + begin4; \ + tmp2 = tmp2 % tmp1; \ + tmp1 = dim2 * dim1 * dim0; \ + auto i3 = tmp2 / tmp1 + begin3; \ + tmp2 = tmp2 % tmp1; \ + tmp1 = dim1 * dim0; \ + auto i2 = tmp2 / tmp1 + begin2; \ + tmp2 = tmp2 % tmp1; \ + auto i1 = tmp2 / dim0 + begin1; \ + auto i0 = tmp2 % dim0 + begin0; \ + functor(i0, i1, i2, i3, i4, i5, val); \ + } \ + acc_wait(async_arg); \ + aval = val; \ + } \ + \ + template \ + void OpenACCParallelReduce##REDUCER(OpenACCIterateRight, ValueType& aval, \ + Functor const& afunctor, \ + OpenACCMDRangeBegin<6> const& begin, \ + OpenACCMDRangeEnd<6> const& end, \ + int async_arg) { \ + auto val = aval; \ + auto const functor(afunctor); \ + auto begin0 = begin[0]; \ + auto end0 = end[0]; \ + auto begin1 = begin[1]; \ + auto end1 = end[1]; \ + auto begin2 = begin[2]; \ + auto end2 = end[2]; \ + auto begin3 = begin[3]; \ + auto end3 = end[3]; \ + auto begin4 = begin[4]; \ + auto end4 = end[4]; \ + auto begin5 = begin[5]; \ + auto end5 = end[5]; \ + auto dim5 = end5 - begin5; \ + auto dim4 = end4 - begin4; \ + auto dim3 = end3 - begin3; \ + auto dim2 = end2 - begin2; \ + auto dim1 = end1 - begin1; \ + auto dim0 = end0 - begin0; \ + auto nIter = dim5 * dim4 * dim3 * dim2 * dim1 * dim0; \ + /* clang-format off */ \ + KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ + /* clang-format on */ \ + for (decltype(nIter) m = 0; m < nIter; ++m) { \ + auto tmp1 = dim5 * dim4 * dim3 * dim2 * dim1; \ + auto i0 = m / tmp1 + begin0; \ + auto tmp2 = m % tmp1; \ + tmp1 = dim5 * dim4 * dim3 * dim2; \ + auto i1 = tmp2 / tmp1 + begin1; \ + tmp2 = tmp2 % tmp1; \ + tmp1 = dim5 * dim4 * dim3; \ + auto i2 = tmp2 / tmp1 + begin2; \ + tmp2 = tmp2 % tmp1; \ + tmp1 = dim5 * dim4; \ + auto i3 = tmp2 / tmp1 + begin3; \ + tmp2 = tmp2 % tmp1; \ + auto i4 = tmp2 / dim5 + begin4; \ + auto i5 = tmp2 % dim5 + begin5; \ + functor(i0, i1, i2, i3, i4, i5, val); \ + } \ + acc_wait(async_arg); \ + aval = val; \ + } \ + } // namespace Kokkos::Experimental::Impl + +#else + #define KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_DISPATCH_ITERATE(REDUCER, \ OPERATOR) \ namespace Kokkos::Experimental::Impl { \ @@ -124,10 +522,10 @@ class Kokkos::Impl::ParallelReduce \ diff --git a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Team.hpp b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Team.hpp index 430bdcb68088..d4cb73164d20 100644 --- a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Team.hpp +++ b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Team.hpp @@ -163,13 +163,24 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t> parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< iType, Impl::OpenACCTeamMember>& loop_boundaries, const Lambda& lambda, ValueType& result) { - ValueType tmp = ValueType(); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Lambda, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(lambda); + value_type tmp; + wrapped_reducer.init(&tmp); + iType j_start = loop_boundaries.team.team_rank() / loop_boundaries.team.vector_length(); if (j_start == 0) { #pragma acc loop seq for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) lambda(i, tmp); + wrapped_reducer.final(&tmp); result = tmp; } } @@ -180,15 +191,25 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t> parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< iType, Impl::OpenACCTeamMember>& loop_boundaries, const Lambda& lambda, const ReducerType& reducer) { - using ValueType = typename ReducerType::value_type; - ValueType tmp; - reducer.init(tmp); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, + ReducerType, value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); + value_type tmp; + wrapped_reducer.init(&tmp); + iType j_start = loop_boundaries.team.team_rank() / loop_boundaries.team.vector_length(); if (j_start == 0) { #pragma acc loop seq for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) lambda(i, tmp); + + wrapped_reducer.final(&tmp); reducer.reference() = tmp; } } @@ -200,7 +221,17 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t> parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct< iType, Impl::OpenACCTeamMember>& loop_boundaries, const Lambda& lambda, ValueType& result) { - ValueType tmp = ValueType(); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Lambda, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(lambda); + value_type tmp; + wrapped_reducer.init(&tmp); + iType j_start = loop_boundaries.team.team_rank() % loop_boundaries.team.vector_length(); if (j_start == 0) { @@ -208,6 +239,7 @@ parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct< for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { lambda(i, tmp); } + wrapped_reducer.final(&tmp); result = tmp; } } @@ -218,9 +250,17 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t> parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct< iType, Impl::OpenACCTeamMember>& loop_boundaries, const Lambda& lambda, const ReducerType& reducer) { - using ValueType = typename ReducerType::value_type; - ValueType tmp; - reducer.init(tmp); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, + ReducerType, value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); + value_type tmp; + wrapped_reducer.init(&tmp); + iType j_start = loop_boundaries.team.team_rank() % loop_boundaries.team.vector_length(); if (j_start == 0) { @@ -228,6 +268,8 @@ parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct< for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { lambda(i, tmp); } + + wrapped_reducer.final(&tmp); reducer.reference() = tmp; } } @@ -239,7 +281,17 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( const Impl::TeamVectorRangeBoundariesStruct& loop_boundaries, const Lambda& lambda, ValueType& result) { - ValueType tmp = ValueType(); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Lambda, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(lambda); + value_type tmp; + wrapped_reducer.init(&tmp); + iType j_start = loop_boundaries.team.team_rank() % loop_boundaries.team.vector_length(); if (j_start == 0) { @@ -247,6 +299,7 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { lambda(i, tmp); } + wrapped_reducer.final(&tmp); result = tmp; } } @@ -273,10 +326,23 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t> parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< iType, Impl::OpenACCTeamMember>& loop_boundaries, const Lambda& lambda, ValueType& result) { + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Lambda, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(lambda); + value_type tmp; + wrapped_reducer.init(&tmp); + ValueType tmp = ValueType(); #pragma acc loop worker reduction(+ : tmp) for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) lambda(i, tmp); + + wrapped_reducer.final(&tmp); result = tmp; } @@ -314,11 +380,22 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t> parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct< iType, Impl::OpenACCTeamMember>& loop_boundaries, const Lambda& lambda, ValueType& result) { - ValueType tmp = ValueType(); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Lambda, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(lambda); + value_type tmp; + wrapped_reducer.init(&tmp); + #pragma acc loop vector reduction(+ : tmp) for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { lambda(i, tmp); } + wrapped_reducer.final(&tmp); result = tmp; } @@ -357,11 +434,23 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( const Impl::TeamVectorRangeBoundariesStruct& loop_boundaries, const Lambda& lambda, ValueType& result) { - ValueType tmp = ValueType(); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Lambda, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(lambda); + value_type tmp; + wrapped_reducer.init(&tmp); + #pragma acc loop vector reduction(+ : tmp) for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { lambda(i, tmp); } + + wrapped_reducer.final(&tmp); result = tmp; } diff --git a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelScan_Range.hpp b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelScan_Range.hpp index c6d3267bdb0a..b1c48baa1e73 100644 --- a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelScan_Range.hpp +++ b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelScan_Range.hpp @@ -225,7 +225,7 @@ KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector_length(chunk_size) KOKKOS_IMPL_ } #pragma acc exit data delete (functor, chunk_values, offset_values, \ - final_reducer)async(async_arg) + final_reducer)async(async_arg) acc_wait(async_arg); } diff --git a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp index faa50aa7c388..95526aa7849c 100644 --- a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp +++ b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp @@ -28,8 +28,11 @@ struct OpenACC_Traits { #elif defined(KOKKOS_ARCH_AMD_GPU) static constexpr acc_device_t dev_type = acc_device_radeon; static constexpr bool may_fallback_to_host = false; +#elif defined(KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE) + static constexpr acc_device_t dev_type = acc_device_host; + static constexpr bool may_fallback_to_host = true; #else - static constexpr acc_device_t dev_type = acc_device_not_host; + static constexpr acc_device_t dev_type = acc_device_default; static constexpr bool may_fallback_to_host = true; #endif }; diff --git a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP.hpp b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP.hpp index a403909f677c..aa4be87ceb62 100644 --- a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP.hpp +++ b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP.hpp @@ -30,7 +30,6 @@ static_assert(false, #include #include #include -#include #include #include #include @@ -93,11 +92,16 @@ class OpenMP { void fence(std::string const& name = "Kokkos::OpenMP::fence: Unnamed Instance Fence") const; +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 /// \brief Does the given instance return immediately after launching /// a parallel algorithm /// /// This always returns false on OpenMP - inline static bool is_asynchronous(OpenMP const& = OpenMP()) noexcept; + KOKKOS_DEPRECATED inline static bool is_asynchronous( + OpenMP const& = OpenMP()) noexcept { + return false; + } +#endif #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 static int concurrency(OpenMP const& = OpenMP()); @@ -154,10 +158,6 @@ inline int OpenMP::impl_thread_pool_rank() noexcept { KOKKOS_IF_ON_DEVICE((return -1;)) } -inline bool OpenMP::is_asynchronous(OpenMP const& /*instance*/) noexcept { - return false; -} - inline int OpenMP::impl_thread_pool_size(int depth) const { return depth < 2 ? impl_thread_pool_size() : 1; } @@ -202,7 +202,9 @@ struct MemorySpaceAccess #include +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 #include +#endif #include /*--------------------------------------------------------------------------*/ diff --git a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp index 2877d940fafc..6edcbff0c26b 100644 --- a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp +++ b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp @@ -26,12 +26,19 @@ #include #include +#include + #include #include //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +// We allow using deprecated classes in this file +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() +#endif + namespace Kokkos { namespace Impl { @@ -360,6 +367,10 @@ extern template class TaskQueue #include #include -#include #include #include #include @@ -148,7 +147,6 @@ struct DeviceTypeTraits<::Kokkos::Experimental::OpenMPTarget> { #include #include #include -#include /*--------------------------------------------------------------------------*/ diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp index ed625cfcc82c..ec33d25b9695 100644 --- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp @@ -28,6 +28,7 @@ static_assert(false, #include #include +#include #ifdef KOKKOS_ENABLE_OPENMPTARGET @@ -91,9 +92,9 @@ class OpenMPTargetSpace { /**\brief Default memory space instance */ OpenMPTargetSpace(); - OpenMPTargetSpace(OpenMPTargetSpace&& rhs) = default; - OpenMPTargetSpace(const OpenMPTargetSpace& rhs) = default; - OpenMPTargetSpace& operator=(OpenMPTargetSpace&&) = default; + OpenMPTargetSpace(OpenMPTargetSpace&& rhs) = default; + OpenMPTargetSpace(const OpenMPTargetSpace& rhs) = default; + OpenMPTargetSpace& operator=(OpenMPTargetSpace&&) = default; OpenMPTargetSpace& operator=(const OpenMPTargetSpace&) = default; ~OpenMPTargetSpace() = default; @@ -141,79 +142,5 @@ class OpenMPTargetSpace { KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_SPECIALIZATION( Kokkos::Experimental::OpenMPTargetSpace); -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -// TODO: implement all possible deep_copies -template -struct DeepCopy { - DeepCopy(void* dst, const void* src, size_t n) { - // In the Release and RelWithDebInfo builds, the size of the memcpy should - // be greater than zero to avoid error. omp_target_memcpy returns zero on - // success. - if (n > 0) - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( - dst, const_cast(src), n, 0, 0, omp_get_default_device(), - omp_get_default_device())); - } - DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { - exec.fence( - "Kokkos::Impl::DeepCopy: fence " - "before " - "copy"); - if (n > 0) - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( - dst, const_cast(src), n, 0, 0, omp_get_default_device(), - omp_get_default_device())); - } -}; - -template -struct DeepCopy { - DeepCopy(void* dst, const void* src, size_t n) { - if (n > 0) - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( - dst, const_cast(src), n, 0, 0, omp_get_default_device(), - omp_get_initial_device())); - } - DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { - exec.fence( - "Kokkos::Impl::DeepCopy: fence before " - "copy"); - if (n > 0) - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( - dst, const_cast(src), n, 0, 0, omp_get_default_device(), - omp_get_initial_device())); - } -}; - -template -struct DeepCopy { - DeepCopy(void* dst, const void* src, size_t n) { - if (n > 0) - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( - dst, const_cast(src), n, 0, 0, omp_get_initial_device(), - omp_get_default_device())); - } - DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { - exec.fence( - "Kokkos::Impl::DeepCopy: fence before " - "copy"); - if (n > 0) - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( - dst, const_cast(src), n, 0, 0, omp_get_initial_device(), - omp_get_default_device())); - } -}; - -} // namespace Impl -} // namespace Kokkos - #endif #endif /* #define KOKKOS_OPENMPTARGETSPACE_HPP */ diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_DeepCopy.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_DeepCopy.hpp new file mode 100644 index 000000000000..aace09e266b0 --- /dev/null +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_DeepCopy.hpp @@ -0,0 +1,101 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE +#include +static_assert(false, + "Including non-public Kokkos header files is not allowed."); +#endif +#ifndef KOKKOS_OPENMPTARGET_DEEP_COPY_HPP +#define KOKKOS_OPENMPTARGET_DEEP_COPY_HPP + +#include + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +// TODO: implement all possible deep_copies +template +struct DeepCopy { + DeepCopy(void* dst, const void* src, size_t n) { + // In the Release and RelWithDebInfo builds, the size of the memcpy should + // be greater than zero to avoid error. omp_target_memcpy returns zero on + // success. + if (n > 0) + KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( + dst, const_cast(src), n, 0, 0, omp_get_default_device(), + omp_get_default_device())); + } + DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { + exec.fence( + "Kokkos::Impl::DeepCopy: fence " + "before " + "copy"); + if (n > 0) + KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( + dst, const_cast(src), n, 0, 0, omp_get_default_device(), + omp_get_default_device())); + } +}; + +template +struct DeepCopy { + DeepCopy(void* dst, const void* src, size_t n) { + if (n > 0) + KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( + dst, const_cast(src), n, 0, 0, omp_get_default_device(), + omp_get_initial_device())); + } + DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { + exec.fence( + "Kokkos::Impl::DeepCopy: fence before " + "copy"); + if (n > 0) + KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( + dst, const_cast(src), n, 0, 0, omp_get_default_device(), + omp_get_initial_device())); + } +}; + +template +struct DeepCopy { + DeepCopy(void* dst, const void* src, size_t n) { + if (n > 0) + KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( + dst, const_cast(src), n, 0, 0, omp_get_initial_device(), + omp_get_default_device())); + } + DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { + exec.fence( + "Kokkos::Impl::DeepCopy: fence before " + "copy"); + if (n > 0) + KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( + dst, const_cast(src), n, 0, 0, omp_get_initial_device(), + omp_get_default_device())); + } +}; + +} // namespace Impl +} // namespace Kokkos + +#endif // KOKKOS_OPENMPTARGET_DEEP_COPY_HPP diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp deleted file mode 100644 index 6c5eb048e34e..000000000000 --- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp +++ /dev/null @@ -1,130 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE -#define KOKKOS_IMPL_PUBLIC_INCLUDE -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef KOKKOS_ENABLE_OPENMPTARGET - -// FIXME_OPENMPTARGET currently unused -/* -namespace Kokkos { -namespace Impl { -namespace { - -KOKKOS_INLINE_FUNCTION -int kokkos_omp_in_parallel(); - -KOKKOS_INLINE_FUNCTION -int kokkos_omp_in_parallel() { return omp_in_parallel(); } - -bool s_using_hwloc = false; - -} // namespace -} // namespace Impl -} // namespace Kokkos -*/ - -namespace Kokkos { -namespace Impl { - -void OpenMPTargetExec::verify_is_process(const char* const label) { - // Fails if the current task is in a parallel region or is not on the host. - if (omp_in_parallel() && (!omp_is_initial_device())) { - std::string msg(label); - msg.append(" ERROR: in parallel or on device"); - Kokkos::Impl::throw_runtime_exception(msg); - } -} - -void OpenMPTargetExec::verify_initialized(const char* const label) { - if (0 == Kokkos::Experimental::OpenMPTarget().impl_is_initialized()) { - std::string msg(label); - msg.append(" ERROR: not initialized"); - Kokkos::Impl::throw_runtime_exception(msg); - } -} - -void* OpenMPTargetExec::m_scratch_ptr = nullptr; -int64_t OpenMPTargetExec::m_scratch_size = 0; -uint32_t* OpenMPTargetExec::m_uniquetoken_ptr = nullptr; -int OpenMPTargetExec::MAX_ACTIVE_THREADS = 0; -std::mutex OpenMPTargetExec::m_mutex_scratch_ptr; - -void OpenMPTargetExec::clear_scratch() { - Kokkos::Experimental::OpenMPTargetSpace space; - space.deallocate(m_scratch_ptr, m_scratch_size); - m_scratch_ptr = nullptr; - m_scratch_size = 0; -} - -void* OpenMPTargetExec::get_scratch_ptr() { return m_scratch_ptr; } - -void OpenMPTargetExec::resize_scratch(int64_t team_size, int64_t shmem_size_L0, - int64_t shmem_size_L1, - int64_t league_size) { - Kokkos::Experimental::OpenMPTargetSpace space; - // Level-0 scratch when using clang/17 and higher comes from their OpenMP - // extension, `ompx_dyn_cgroup_mem`. -#if defined(KOKKOS_IMPL_OPENMPTARGET_LLVM_EXTENSIONS) - shmem_size_L0 = 0; -#endif - const int64_t shmem_size = - shmem_size_L0 + shmem_size_L1; // L0 + L1 scratch memory per team. - const int64_t padding = shmem_size * 10 / 100; // Padding per team. - - // Maximum active teams possible. - // The number should not exceed the maximum in-flight teams possible or the - // league_size. - int max_active_teams = - std::min(OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size, league_size); - - // max_active_teams is the number of active teams on the given hardware. - // We set the number of teams to be twice the number of max_active_teams for - // the compiler to pick the right number in its case. - // FIXME_OPENMPTARGET: Cray compiler did not yet implement omp_set_num_teams. -#if !defined(KOKKOS_COMPILER_CRAY_LLVM) - omp_set_num_teams(max_active_teams * 2); -#endif - - // Total amount of scratch memory allocated is depenedent - // on the maximum number of in-flight teams possible. - int64_t total_size = - (shmem_size + OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE + padding) * - max_active_teams * 2; - - if (total_size > m_scratch_size) { - space.deallocate(m_scratch_ptr, m_scratch_size); - m_scratch_size = total_size; - m_scratch_ptr = space.allocate(total_size); - } -} - -} // namespace Impl -} // namespace Kokkos - -#endif // KOKKOS_ENABLE_OPENMPTARGET diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_FunctorAdapter.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_FunctorAdapter.hpp new file mode 100644 index 000000000000..13b509c0ada0 --- /dev/null +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_FunctorAdapter.hpp @@ -0,0 +1,48 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_OPENMPTARGET_FUNCTOR_ADAPTER_HPP +#define KOKKOS_OPENMPTARGET_FUNCTOR_ADAPTER_HPP + +#include +#include + +namespace Kokkos::Experimental::Impl { + +template +class FunctorAdapter { + Functor m_functor; + using WorkTag = typename Policy::work_tag; + + public: + FunctorAdapter() = default; + FunctorAdapter(Functor const &functor) : m_functor(functor) {} + + Functor get_functor() const { return m_functor; } + + template + KOKKOS_FUNCTION void operator()(Args &&...args) const { + if constexpr (std::is_void_v) { + m_functor(static_cast(args)...); + } else { + m_functor(WorkTag(), static_cast(args)...); + } + } +}; + +} // namespace Kokkos::Experimental::Impl + +#endif // KOKKOS_OPENMPTARGET_FUNCTOR_ADAPTER_HPP diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp index 44e9119ea886..53e723882f55 100644 --- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp @@ -27,11 +27,11 @@ // constructor. undef'ed at the end #define KOKKOS_IMPL_OPENMPTARGET_WORKAROUND +#include #include #include #include #include -#include #include @@ -105,18 +105,15 @@ void OpenMPTargetInternal::print_configuration(std::ostream& os, void OpenMPTargetInternal::impl_finalize() { m_is_initialized = false; - Kokkos::Impl::OpenMPTargetExec space; - if (space.m_uniquetoken_ptr != nullptr) + if (m_uniquetoken_ptr != nullptr) Kokkos::kokkos_free( - space.m_uniquetoken_ptr); + m_uniquetoken_ptr); } void OpenMPTargetInternal::impl_initialize() { m_is_initialized = true; - Kokkos::Impl::OpenMPTargetExec::MAX_ACTIVE_THREADS = concurrency(); - // FIXME_OPENMPTARGET: Only fix the number of teams for NVIDIA architectures // from Pascal and upwards. // FIXME_OPENMPTARGTE: Cray compiler did not yet implement omp_set_num_teams. @@ -136,7 +133,75 @@ OpenMPTargetInternal* OpenMPTargetInternal::impl_singleton() { return &self; } -} // Namespace Impl +void OpenMPTargetInternal::verify_is_process(const char* const label) { + // Fails if the current task is in a parallel region or is not on the host. + if (omp_in_parallel() && (!omp_is_initial_device())) { + std::string msg(label); + msg.append(" ERROR: in parallel or on device"); + Kokkos::Impl::throw_runtime_exception(msg); + } +} + +void OpenMPTargetInternal::verify_initialized(const char* const label) { + if (0 == Kokkos::Experimental::OpenMPTarget().impl_is_initialized()) { + std::string msg(label); + msg.append(" ERROR: not initialized"); + Kokkos::Impl::throw_runtime_exception(msg); + } +} + +void OpenMPTargetInternal::clear_scratch() { + Kokkos::Experimental::OpenMPTargetSpace space; + space.deallocate(m_scratch_ptr, m_scratch_size); + m_scratch_ptr = nullptr; + m_scratch_size = 0; +} + +void* OpenMPTargetInternal::get_scratch_ptr() { return m_scratch_ptr; } + +void OpenMPTargetInternal::resize_scratch(int64_t team_size, + int64_t shmem_size_L0, + int64_t shmem_size_L1, + int64_t league_size) { + Kokkos::Experimental::OpenMPTargetSpace space; + // Level-0 scratch when using clang/17 and higher comes from their OpenMP + // extension, `ompx_dyn_cgroup_mem`. +#if defined(KOKKOS_IMPL_OPENMPTARGET_LLVM_EXTENSIONS) + shmem_size_L0 = 0; +#endif + const int64_t shmem_size = + shmem_size_L0 + shmem_size_L1; // L0 + L1 scratch memory per team. + const int64_t padding = shmem_size * 10 / 100; // Padding per team. + + // Maximum active teams possible. + // The number should not exceed the maximum in-flight teams possible or the + // league_size. + int max_active_teams = + std::min(OpenMPTargetInternal::concurrency() / team_size, league_size); + + // max_active_teams is the number of active teams on the given hardware. + // We set the number of teams to be twice the number of max_active_teams for + // the compiler to pick the right number in its case. + // FIXME_OPENMPTARGET: Cray compiler did not yet implement omp_set_num_teams. +#if !defined(KOKKOS_COMPILER_CRAY_LLVM) + omp_set_num_teams(max_active_teams * 2); +#endif + + // Total amount of scratch memory allocated is depenedent + // on the maximum number of in-flight teams possible. + int64_t total_size = + (shmem_size + + ::Kokkos::Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE + padding) * + max_active_teams * 2; + + if (total_size > m_scratch_size) { + space.deallocate(m_scratch_ptr, m_scratch_size); + m_scratch_size = total_size; + m_scratch_ptr = space.allocate(total_size); + } +} + +} // namespace Impl OpenMPTarget::OpenMPTarget() : m_space_instance(Impl::OpenMPTargetInternal::impl_singleton()) {} @@ -206,9 +271,9 @@ namespace Experimental { UniqueToken:: - UniqueToken(Kokkos::Experimental::OpenMPTarget const&) { + UniqueToken(Kokkos::Experimental::OpenMPTarget const& space) { #ifdef KOKKOS_IMPL_OPENMPTARGET_WORKAROUND - uint32_t* ptr = Kokkos::Impl::OpenMPTargetExec::m_uniquetoken_ptr; + uint32_t* ptr = space.impl_internal_space_instance()->m_uniquetoken_ptr; int count = Kokkos::Experimental::OpenMPTarget().concurrency(); if (ptr == nullptr) { int size = count * sizeof(uint32_t); @@ -221,7 +286,7 @@ UniqueTokenm_uniquetoken_ptr = ptr; } #else // FIXME_OPENMPTARGET - 2 versions of non-working implementations to fill `ptr` @@ -229,8 +294,7 @@ UniqueToken - namespace Kokkos { namespace Experimental { namespace Impl { @@ -27,9 +25,9 @@ enum class openmp_fence_is_static { yes, no }; class OpenMPTargetInternal { private: - OpenMPTargetInternal() = default; - OpenMPTargetInternal(const OpenMPTargetInternal&) = default; - OpenMPTargetInternal& operator=(const OpenMPTargetInternal&) = default; + OpenMPTargetInternal() = default; + OpenMPTargetInternal(const OpenMPTargetInternal&) = delete; + OpenMPTargetInternal& operator=(const OpenMPTargetInternal&) = delete; public: void fence(openmp_fence_is_static is_static = openmp_fence_is_static::no); @@ -55,6 +53,19 @@ class OpenMPTargetInternal { static OpenMPTargetInternal* impl_singleton(); + static void verify_is_process(const char* const); + static void verify_initialized(const char* const); + + void* get_scratch_ptr(); + void clear_scratch(); + void resize_scratch(int64_t team_reduce_bytes, int64_t team_shared_bytes, + int64_t thread_local_bytes, int64_t league_size); + + void* m_scratch_ptr = nullptr; + std::mutex m_mutex_scratch_ptr; + int64_t m_scratch_size = 0; + uint32_t* m_uniquetoken_ptr = nullptr; + private: bool m_is_initialized = false; uint32_t m_instance_id = Kokkos::Tools::Experimental::Impl::idForInstance< diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp index e222d6525010..f71f8887135e 100644 --- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp @@ -116,8 +116,8 @@ class OpenMPTargetExecTeamMember { // FIXME_OPENMPTARGET this function currently ignores the reducer passed. template KOKKOS_INLINE_FUNCTION std::enable_if_t::value> - team_reduce(ReducerType const&, typename ReducerType::value_type& value) const - noexcept { + team_reduce(ReducerType const&, + typename ReducerType::value_type& value) const noexcept { #pragma omp barrier using value_type = typename ReducerType::value_type; @@ -741,43 +741,6 @@ struct TeamVectorRangeBoundariesStruct { } // namespace Impl -} // namespace Kokkos -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- -namespace Kokkos { -namespace Impl { - -//---------------------------------------------------------------------------- -/** \brief Data for OpenMPTarget thread execution */ - -class OpenMPTargetExec { - public: - // FIXME_OPENMPTARGET - Currently the maximum number of - // teams possible is calculated based on NVIDIA's Volta GPU. In - // future this value should be based on the chosen architecture for the - // OpenMPTarget backend. - static int MAX_ACTIVE_THREADS; - - private: - static void* scratch_ptr; - - public: - static void verify_is_process(const char* const); - static void verify_initialized(const char* const); - - static void* get_scratch_ptr(); - static void clear_scratch(); - static void resize_scratch(int64_t team_reduce_bytes, - int64_t team_shared_bytes, - int64_t thread_local_bytes, int64_t league_size); - - static void* m_scratch_ptr; - static std::mutex m_mutex_scratch_ptr; - static int64_t m_scratch_size; - static uint32_t* m_uniquetoken_ptr; -}; - -} // namespace Impl } // namespace Kokkos #endif /* KOKKOS_OPENMPTARGET_PARALLEL_HPP */ diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_MDRange.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_MDRange.hpp index bd7d3eef5d73..38ed7c5681a1 100644 --- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_MDRange.hpp +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_MDRange.hpp @@ -20,6 +20,8 @@ #include #include #include "Kokkos_OpenMPTarget_MDRangePolicy.hpp" +#include "Kokkos_OpenMPTarget_Instance.hpp" +#include "Kokkos_OpenMPTarget_FunctorAdapter.hpp" //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- @@ -31,38 +33,38 @@ template class ParallelFor, Kokkos::Experimental::OpenMPTarget> { private: - using Policy = Kokkos::MDRangePolicy; - using WorkTag = typename Policy::work_tag; - using Member = typename Policy::member_type; - using Index = typename Policy::index_type; + using Policy = Kokkos::MDRangePolicy; + using Member = typename Policy::member_type; + using Index = typename Policy::index_type; + + using FunctorAdapter = + Kokkos::Experimental::Impl::FunctorAdapter; + const FunctorAdapter m_functor; - const FunctorType m_functor; const Policy m_policy; public: inline void execute() const { - OpenMPTargetExec::verify_is_process( + Experimental::Impl::OpenMPTargetInternal::verify_is_process( "Kokkos::Experimental::OpenMPTarget parallel_for"); - OpenMPTargetExec::verify_initialized( + Experimental::Impl::OpenMPTargetInternal::verify_initialized( "Kokkos::Experimental::OpenMPTarget parallel_for"); - FunctorType functor(m_functor); + Policy policy = m_policy; - typename Policy::point_type unused; static_assert(1 < Policy::rank && Policy::rank < 7); static_assert(Policy::inner_direction == Iterate::Left || Policy::inner_direction == Iterate::Right); execute_tile( - unused, functor, policy, + m_functor, policy, std::integral_constant()); } template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy, OpenMPTargetIterateRight) const { - (void)offset; + const FunctorAdapter& functor, const Policy& policy, + OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; @@ -72,18 +74,14 @@ class ParallelFor, #pragma omp target teams distribute parallel for collapse(2) map(to : functor) for (auto i0 = begin_0; i0 < end_0; ++i0) for (auto i1 = begin_1; i1 < end_1; ++i1) { - if constexpr (std::is_void::value) - functor(i0, i1); - else - functor(typename Policy::work_tag(), i0, i1); + functor(i0, i1); } } template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy, OpenMPTargetIterateRight) const { - (void)offset; + const FunctorAdapter& functor, const Policy& policy, + OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[2]; @@ -96,10 +94,7 @@ class ParallelFor, for (auto i0 = begin_0; i0 < end_0; ++i0) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i2 = begin_2; i2 < end_2; ++i2) { - if constexpr (std::is_void::value) - functor(i0, i1, i2); - else - functor(typename Policy::work_tag(), i0, i1, i2); + functor(i0, i1, i2); } } } @@ -107,9 +102,8 @@ class ParallelFor, template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy, OpenMPTargetIterateRight) const { - (void)offset; + const FunctorAdapter& functor, const Policy& policy, + OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[2]; @@ -125,10 +119,7 @@ class ParallelFor, for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i3 = begin_3; i3 < end_3; ++i3) { - if constexpr (std::is_void::value) - functor(i0, i1, i2, i3); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3); + functor(i0, i1, i2, i3); } } } @@ -137,9 +128,8 @@ class ParallelFor, template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy, OpenMPTargetIterateRight) const { - (void)offset; + const FunctorAdapter& functor, const Policy& policy, + OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[2]; @@ -158,11 +148,7 @@ class ParallelFor, for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i3 = begin_3; i3 < end_3; ++i3) { for (auto i4 = begin_4; i4 < end_4; ++i4) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4); + functor(i0, i1, i2, i3, i4); } } } @@ -172,9 +158,8 @@ class ParallelFor, template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy, OpenMPTargetIterateRight) const { - (void)offset; + const FunctorAdapter& functor, const Policy& policy, + OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[2]; @@ -197,12 +182,7 @@ class ParallelFor, for (auto i4 = begin_4; i4 < end_4; ++i4) { for (auto i5 = begin_5; i5 < end_5; ++i5) { { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4, i5); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, - i5); + functor(i0, i1, i2, i3, i4, i5); } } } @@ -214,9 +194,8 @@ class ParallelFor, template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy, OpenMPTargetIterateLeft) const { - (void)offset; + const FunctorAdapter& functor, const Policy& policy, + OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; @@ -226,18 +205,14 @@ class ParallelFor, #pragma omp target teams distribute parallel for collapse(2) map(to : functor) for (auto i1 = begin_1; i1 < end_1; ++i1) for (auto i0 = begin_0; i0 < end_0; ++i0) { - if constexpr (std::is_void::value) - functor(i0, i1); - else - functor(typename Policy::work_tag(), i0, i1); + functor(i0, i1); } } template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy, OpenMPTargetIterateLeft) const { - (void)offset; + const FunctorAdapter& functor, const Policy& policy, + OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[2]; @@ -250,10 +225,7 @@ class ParallelFor, for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i0 = begin_0; i0 < end_0; ++i0) { - if constexpr (std::is_void::value) - functor(i0, i1, i2); - else - functor(typename Policy::work_tag(), i0, i1, i2); + functor(i0, i1, i2); } } } @@ -261,9 +233,8 @@ class ParallelFor, template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy, OpenMPTargetIterateLeft) const { - (void)offset; + const FunctorAdapter& functor, const Policy& policy, + OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[2]; @@ -279,10 +250,7 @@ class ParallelFor, for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i0 = begin_0; i0 < end_0; ++i0) { - if constexpr (std::is_void::value) - functor(i0, i1, i2, i3); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3); + functor(i0, i1, i2, i3); } } } @@ -291,9 +259,8 @@ class ParallelFor, template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy, OpenMPTargetIterateLeft) const { - (void)offset; + const FunctorAdapter& functor, const Policy& policy, + OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[2]; @@ -312,11 +279,7 @@ class ParallelFor, for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i0 = begin_0; i0 < end_0; ++i0) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4); + functor(i0, i1, i2, i3, i4); } } } @@ -326,9 +289,8 @@ class ParallelFor, template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy, OpenMPTargetIterateLeft) const { - (void)offset; + const FunctorAdapter& functor, const Policy& policy, + OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[2]; @@ -351,12 +313,7 @@ class ParallelFor, for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i0 = begin_0; i0 < end_0; ++i0) { { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4, i5); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, - i5); + functor(i0, i1, i2, i3, i4, i5); } } } diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Range.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Range.hpp index a674637a3b1a..502461cc5e08 100644 --- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Range.hpp +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Range.hpp @@ -20,6 +20,8 @@ #include #include #include +#include "Kokkos_OpenMPTarget_Instance.hpp" +#include "Kokkos_OpenMPTarget_FunctorAdapter.hpp" namespace Kokkos { namespace Impl { @@ -28,36 +30,30 @@ template class ParallelFor, Kokkos::Experimental::OpenMPTarget> { private: - using Policy = Kokkos::RangePolicy; - using WorkTag = typename Policy::work_tag; - using Member = typename Policy::member_type; + using Policy = Kokkos::RangePolicy; + using Member = typename Policy::member_type; - const FunctorType m_functor; + Kokkos::Experimental::Impl::FunctorAdapter m_functor; const Policy m_policy; public: - void execute() const { execute_impl(); } + void execute() const { execute_impl(); } - template void execute_impl() const { - OpenMPTargetExec::verify_is_process( + Experimental::Impl::OpenMPTargetInternal::verify_is_process( "Kokkos::Experimental::OpenMPTarget parallel_for"); - OpenMPTargetExec::verify_initialized( + Experimental::Impl::OpenMPTargetInternal::verify_initialized( "Kokkos::Experimental::OpenMPTarget parallel_for"); const auto begin = m_policy.begin(); const auto end = m_policy.end(); if (end <= begin) return; - FunctorType a_functor(m_functor); + auto const a_functor(m_functor); #pragma omp target teams distribute parallel for map(to : a_functor) for (auto i = begin; i < end; ++i) { - if constexpr (std::is_void::value) { - a_functor(i); - } else { - a_functor(TagType(), i); - } + a_functor(i); } } diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Team.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Team.hpp index 26085f11400f..77dc71a87b78 100644 --- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Team.hpp +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Team.hpp @@ -22,6 +22,7 @@ #include #include #include +#include namespace Kokkos { @@ -76,28 +77,27 @@ class ParallelFor, using Policy = Kokkos::Impl::TeamPolicyInternal; - using WorkTag = typename Policy::work_tag; - using Member = typename Policy::member_type; + using Member = typename Policy::member_type; + + Kokkos::Experimental::Impl::FunctorAdapter m_functor; - const FunctorType m_functor; const Policy m_policy; const size_t m_shmem_size; public: void execute() const { - OpenMPTargetExec::verify_is_process( + Experimental::Impl::OpenMPTargetInternal::verify_is_process( "Kokkos::Experimental::OpenMPTarget parallel_for"); - OpenMPTargetExec::verify_initialized( + Experimental::Impl::OpenMPTargetInternal::verify_initialized( "Kokkos::Experimental::OpenMPTarget parallel_for"); - execute_impl(); + execute_impl(); } private: - template void execute_impl() const { - OpenMPTargetExec::verify_is_process( + Experimental::Impl::OpenMPTargetInternal::verify_is_process( "Kokkos::Experimental::OpenMPTarget parallel_for"); - OpenMPTargetExec::verify_initialized( + Experimental::Impl::OpenMPTargetInternal::verify_initialized( "Kokkos::Experimental::OpenMPTarget parallel_for"); const auto league_size = m_policy.league_size(); const auto team_size = m_policy.team_size(); @@ -105,11 +105,12 @@ class ParallelFor, const size_t shmem_size_L0 = m_policy.scratch_size(0, team_size); const size_t shmem_size_L1 = m_policy.scratch_size(1, team_size); - OpenMPTargetExec::resize_scratch(team_size, shmem_size_L0, shmem_size_L1, - league_size); + m_policy.space().impl_internal_space_instance()->resize_scratch( + team_size, shmem_size_L0, shmem_size_L1, league_size); - void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr(); - FunctorType a_functor(m_functor); + void* scratch_ptr = + m_policy.space().impl_internal_space_instance()->get_scratch_ptr(); + auto const a_functor(m_functor); // FIXME_OPENMPTARGET - If the team_size is not a multiple of 32, the // scratch implementation does not work in the Release or RelWithDebugInfo @@ -122,7 +123,7 @@ class ParallelFor, int max_active_teams = omp_get_max_teams(); #else int max_active_teams = - std::min(OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size, league_size); + std::min(m_policy.space().concurrency() / team_size, league_size); #endif // FIXME_OPENMPTARGET: Although the maximum number of teams is set using the @@ -161,16 +162,13 @@ class ParallelFor, typename Policy::member_type team(league_id, league_size, team_size, vector_length, scratch_ptr, blockIdx, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void_v) - m_functor(team); - else - m_functor(TagType(), team); + a_functor(team); } } #else #pragma omp target teams distribute firstprivate(a_functor) \ is_device_ptr(scratch_ptr) num_teams(max_active_teams) \ - thread_limit(team_size) + thread_limit(team_size) for (int i = 0; i < league_size; i++) { #pragma omp parallel { @@ -180,10 +178,7 @@ class ParallelFor, typename Policy::member_type team(i, league_size, team_size, vector_length, scratch_ptr, i, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void_v) - m_functor(team); - else - m_functor(TagType(), team); + a_functor(team); } } #endif diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_MDRange.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_MDRange.hpp index e86a12197497..bee604834c78 100644 --- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_MDRange.hpp +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_MDRange.hpp @@ -37,9 +37,8 @@ class ParallelReduce; + public: inline void execute() const { // Only let one ParallelReduce instance at a time use the scratch memory. std::scoped_lock scratch_memory_lock( - OpenMPTargetExec::m_mutex_scratch_ptr); + m_policy.space().impl_internal_space_instance()->m_mutex_scratch_ptr); + + auto const functor = FunctorAdapter(m_functor_reducer.get_functor()); execute_tile( - m_functor_reducer.get_functor(), m_policy, m_result_ptr, + functor, m_policy, m_result_ptr, std::integral_constant()); } @@ -77,7 +81,7 @@ class ParallelReduce inline std::enable_if_t execute_tile( - const FunctorType& functor, const Policy& policy, pointer_type ptr, + const FunctorAdapter& functor, const Policy& policy, pointer_type ptr, OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; @@ -90,32 +94,23 @@ class ParallelReduce::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) - -#pragma omp target teams distribute parallel for collapse(2) map(to \ - : functor) \ - reduction(custom \ - : result) +#pragma omp declare reduction(custom \ +:ValueType : OpenMPTargetReducerWrapper::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper::init(omp_priv)) + +#pragma omp target teams distribute parallel for collapse(2) map(to : functor) \ + reduction(custom : result) for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i0 = begin_0; i0 < end_0; ++i0) { - if constexpr (std::is_void::value) - functor(i0, i1, result); - else - functor(typename Policy::work_tag(), i0, i1, result); + functor(i0, i1, result); } } } else { #pragma omp target teams distribute parallel for collapse(2) map(to : functor) \ -reduction(+:result) + reduction(+ : result) for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i0 = begin_0; i0 < end_0; ++i0) { - if constexpr (std::is_void::value) - functor(i0, i1, result); - else - functor(typename Policy::work_tag(), i0, i1, result); + functor(i0, i1, result); } } } @@ -126,7 +121,7 @@ reduction(+:result) template inline std::enable_if_t execute_tile( - const FunctorType& functor, const Policy& policy, pointer_type ptr, + const FunctorAdapter& functor, const Policy& policy, pointer_type ptr, OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; @@ -141,38 +136,29 @@ reduction(+:result) // FIXME_OPENMPTARGET: Unable to separate directives and their companion // loops which leads to code duplication for different reduction types. if constexpr (UseReducer) { -#pragma omp declare reduction( \ - custom:ValueType \ - : OpenMPTargetReducerWrapper ::join( \ - omp_out, omp_in)) \ - initializer( \ - OpenMPTargetReducerWrapper ::init( \ - omp_priv)) - -#pragma omp target teams distribute parallel for collapse(3) map(to \ - : functor) \ - reduction(custom \ - : result) +#pragma omp declare reduction( \ + custom \ +:ValueType : OpenMPTargetReducerWrapper< \ + typename ReducerType::functor_type>::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper< \ + typename ReducerType::functor_type>::init(omp_priv)) + +#pragma omp target teams distribute parallel for collapse(3) map(to : functor) \ + reduction(custom : result) for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i0 = begin_0; i0 < end_0; ++i0) { - if constexpr (std::is_void::value) - functor(i0, i1, i2, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, result); + functor(i0, i1, i2, result); } } } } else { #pragma omp target teams distribute parallel for collapse(3) map(to : functor) \ -reduction(+:result) + reduction(+ : result) for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i0 = begin_0; i0 < end_0; ++i0) { - if constexpr (std::is_void::value) - functor(i0, i1, i2, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, result); + functor(i0, i1, i2, result); } } } @@ -184,7 +170,7 @@ reduction(+:result) template inline std::enable_if_t execute_tile( - const FunctorType& functor, const Policy& policy, pointer_type ptr, + const FunctorAdapter& functor, const Policy& policy, pointer_type ptr, OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; @@ -201,40 +187,29 @@ reduction(+:result) // FIXME_OPENMPTARGET: Unable to separate directives and their companion // loops which leads to code duplication for different reduction types. if constexpr (UseReducer) { -#pragma omp declare reduction( \ - custom:ValueType \ - : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) - -#pragma omp target teams distribute parallel for collapse(4) map(to \ - : functor) \ - reduction(custom \ - : result) +#pragma omp declare reduction(custom \ +:ValueType : OpenMPTargetReducerWrapper::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper::init(omp_priv)) + +#pragma omp target teams distribute parallel for collapse(4) map(to : functor) \ + reduction(custom : result) for (auto i3 = begin_3; i3 < end_3; ++i3) { for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i0 = begin_0; i0 < end_0; ++i0) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, result); + functor(i0, i1, i2, i3, result); } } } } } else { #pragma omp target teams distribute parallel for collapse(4) map(to : functor) \ -reduction(+:result) + reduction(+ : result) for (auto i3 = begin_3; i3 < end_3; ++i3) { for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i0 = begin_0; i0 < end_0; ++i0) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, result); + functor(i0, i1, i2, i3, result); } } } @@ -247,7 +222,7 @@ reduction(+:result) template inline std::enable_if_t execute_tile( - const FunctorType& functor, const Policy& policy, pointer_type ptr, + const FunctorAdapter& functor, const Policy& policy, pointer_type ptr, OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; @@ -266,26 +241,18 @@ reduction(+:result) // FIXME_OPENMPTARGET: Unable to separate directives and their companion // loops which leads to code duplication for different reduction types. if constexpr (UseReducer) { -#pragma omp declare reduction( \ - custom:ValueType \ - : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) - -#pragma omp target teams distribute parallel for collapse(5) map(to \ - : functor) \ - reduction(custom \ - : result) +#pragma omp declare reduction(custom \ +:ValueType : OpenMPTargetReducerWrapper::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper::init(omp_priv)) + +#pragma omp target teams distribute parallel for collapse(5) map(to : functor) \ + reduction(custom : result) for (auto i4 = begin_4; i4 < end_4; ++i4) { for (auto i3 = begin_3; i3 < end_3; ++i3) { for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i0 = begin_0; i0 < end_0; ++i0) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, - result); + functor(i0, i1, i2, i3, i4, result); } } } @@ -293,18 +260,13 @@ reduction(+:result) } } else { #pragma omp target teams distribute parallel for collapse(5) map(to : functor) \ -reduction(+:result) + reduction(+ : result) for (auto i4 = begin_4; i4 < end_4; ++i4) { for (auto i3 = begin_3; i3 < end_3; ++i3) { for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i0 = begin_0; i0 < end_0; ++i0) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, - result); + functor(i0, i1, i2, i3, i4, result); } } } @@ -318,7 +280,7 @@ reduction(+:result) template inline std::enable_if_t execute_tile( - const FunctorType& functor, const Policy& policy, pointer_type ptr, + const FunctorAdapter& functor, const Policy& policy, pointer_type ptr, OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; @@ -339,27 +301,19 @@ reduction(+:result) // FIXME_OPENMPTARGET: Unable to separate directives and their companion // loops which leads to code duplication for different reduction types. if constexpr (UseReducer) { -#pragma omp declare reduction( \ - custom:ValueType \ - : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) - -#pragma omp target teams distribute parallel for collapse(6) map(to \ - : functor) \ - reduction(custom \ - : result) +#pragma omp declare reduction(custom \ +:ValueType : OpenMPTargetReducerWrapper::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper::init(omp_priv)) + +#pragma omp target teams distribute parallel for collapse(6) map(to : functor) \ + reduction(custom : result) for (auto i5 = begin_5; i5 < end_5; ++i5) { for (auto i4 = begin_4; i4 < end_4; ++i4) { for (auto i3 = begin_3; i3 < end_3; ++i3) { for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i0 = begin_0; i0 < end_0; ++i0) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4, i5, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, i5, - result); + functor(i0, i1, i2, i3, i4, i5, result); } } } @@ -368,19 +322,14 @@ reduction(+:result) } } else { #pragma omp target teams distribute parallel for collapse(6) map(to : functor) \ -reduction(+:result) + reduction(+ : result) for (auto i5 = begin_5; i5 < end_5; ++i5) { for (auto i4 = begin_4; i4 < end_4; ++i4) { for (auto i3 = begin_3; i3 < end_3; ++i3) { for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i0 = begin_0; i0 < end_0; ++i0) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4, i5, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, i5, - result); + functor(i0, i1, i2, i3, i4, i5, result); } } } @@ -395,7 +344,7 @@ reduction(+:result) template inline std::enable_if_t execute_tile( - const FunctorType& functor, const Policy& policy, pointer_type ptr, + const FunctorAdapter& functor, const Policy& policy, pointer_type ptr, OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; @@ -408,32 +357,23 @@ reduction(+:result) // FIXME_OPENMPTARGET: Unable to separate directives and their companion // loops which leads to code duplication for different reduction types. if constexpr (UseReducer) { -#pragma omp declare reduction( \ - custom:ValueType \ - : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) - -#pragma omp target teams distribute parallel for collapse(2) map(to \ - : functor) \ - reduction(custom \ - : result) +#pragma omp declare reduction(custom \ +:ValueType : OpenMPTargetReducerWrapper::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper::init(omp_priv)) + +#pragma omp target teams distribute parallel for collapse(2) map(to : functor) \ + reduction(custom : result) for (auto i0 = begin_0; i0 < end_0; ++i0) { for (auto i1 = begin_1; i1 < end_1; ++i1) { - if constexpr (std::is_void::value) - functor(i0, i1, result); - else - functor(typename Policy::work_tag(), i0, i1, result); + functor(i0, i1, result); } } } else { #pragma omp target teams distribute parallel for collapse(2) map(to : functor) \ -reduction(+:result) + reduction(+ : result) for (auto i0 = begin_0; i0 < end_0; ++i0) { for (auto i1 = begin_1; i1 < end_1; ++i1) { - if constexpr (std::is_void::value) - functor(i0, i1, result); - else - functor(typename Policy::work_tag(), i0, i1, result); + functor(i0, i1, result); } } } @@ -444,7 +384,7 @@ reduction(+:result) template inline std::enable_if_t execute_tile( - const FunctorType& functor, const Policy& policy, pointer_type ptr, + const FunctorAdapter& functor, const Policy& policy, pointer_type ptr, OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; @@ -459,38 +399,29 @@ reduction(+:result) // FIXME_OPENMPTARGET: Unable to separate directives and their companion // loops which leads to code duplication for different reduction types. if constexpr (UseReducer) { -#pragma omp declare reduction( \ - custom:ValueType \ - : OpenMPTargetReducerWrapper ::join( \ - omp_out, omp_in)) \ - initializer( \ - OpenMPTargetReducerWrapper ::init( \ - omp_priv)) - -#pragma omp target teams distribute parallel for collapse(3) map(to \ - : functor) \ - reduction(custom \ - : result) +#pragma omp declare reduction( \ + custom \ +:ValueType : OpenMPTargetReducerWrapper< \ + typename ReducerType::functor_type>::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper< \ + typename ReducerType::functor_type>::init(omp_priv)) + +#pragma omp target teams distribute parallel for collapse(3) map(to : functor) \ + reduction(custom : result) for (auto i0 = begin_0; i0 < end_0; ++i0) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i2 = begin_2; i2 < end_2; ++i2) { - if constexpr (std::is_void::value) - functor(i0, i1, i2, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, result); + functor(i0, i1, i2, result); } } } } else { #pragma omp target teams distribute parallel for collapse(3) map(to : functor) \ -reduction(+:result) + reduction(+ : result) for (auto i0 = begin_0; i0 < end_0; ++i0) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i2 = begin_2; i2 < end_2; ++i2) { - if constexpr (std::is_void::value) - functor(i0, i1, i2, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, result); + functor(i0, i1, i2, result); } } } @@ -502,7 +433,7 @@ reduction(+:result) template inline std::enable_if_t execute_tile( - const FunctorType& functor, const Policy& policy, pointer_type ptr, + const FunctorAdapter& functor, const Policy& policy, pointer_type ptr, OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; @@ -519,40 +450,29 @@ reduction(+:result) // FIXME_OPENMPTARGET: Unable to separate directives and their companion // loops which leads to code duplication for different reduction types. if constexpr (UseReducer) { -#pragma omp declare reduction( \ - custom:ValueType \ - : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) - -#pragma omp target teams distribute parallel for collapse(4) map(to \ - : functor) \ - reduction(custom \ - : result) +#pragma omp declare reduction(custom \ +:ValueType : OpenMPTargetReducerWrapper::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper::init(omp_priv)) + +#pragma omp target teams distribute parallel for collapse(4) map(to : functor) \ + reduction(custom : result) for (auto i0 = begin_0; i0 < end_0; ++i0) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i3 = begin_3; i3 < end_3; ++i3) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, result); + functor(i0, i1, i2, i3, result); } } } } } else { #pragma omp target teams distribute parallel for collapse(4) map(to : functor) \ -reduction(+:result) + reduction(+ : result) for (auto i0 = begin_0; i0 < end_0; ++i0) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i3 = begin_3; i3 < end_3; ++i3) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, result); + functor(i0, i1, i2, i3, result); } } } @@ -565,7 +485,7 @@ reduction(+:result) template inline std::enable_if_t execute_tile( - const FunctorType& functor, const Policy& policy, pointer_type ptr, + const FunctorAdapter& functor, const Policy& policy, pointer_type ptr, OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; @@ -584,26 +504,18 @@ reduction(+:result) // FIXME_OPENMPTARGET: Unable to separate directives and their companion // loops which leads to code duplication for different reduction types. if constexpr (UseReducer) { -#pragma omp declare reduction( \ - custom:ValueType \ - : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) - -#pragma omp target teams distribute parallel for collapse(5) map(to \ - : functor) \ - reduction(custom \ - : result) +#pragma omp declare reduction(custom \ +:ValueType : OpenMPTargetReducerWrapper::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper::init(omp_priv)) + +#pragma omp target teams distribute parallel for collapse(5) map(to : functor) \ + reduction(custom : result) for (auto i0 = begin_0; i0 < end_0; ++i0) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i3 = begin_3; i3 < end_3; ++i3) { for (auto i4 = begin_4; i4 < end_4; ++i4) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, - result); + functor(i0, i1, i2, i3, i4, result); } } } @@ -611,18 +523,13 @@ reduction(+:result) } } else { #pragma omp target teams distribute parallel for collapse(5) map(to : functor) \ -reduction(+:result) + reduction(+ : result) for (auto i0 = begin_0; i0 < end_0; ++i0) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i3 = begin_3; i3 < end_3; ++i3) { for (auto i4 = begin_4; i4 < end_4; ++i4) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, - result); + functor(i0, i1, i2, i3, i4, result); } } } @@ -636,7 +543,7 @@ reduction(+:result) template inline std::enable_if_t execute_tile( - const FunctorType& functor, const Policy& policy, pointer_type ptr, + const FunctorAdapter& functor, const Policy& policy, pointer_type ptr, OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; @@ -657,27 +564,19 @@ reduction(+:result) // FIXME_OPENMPTARGET: Unable to separate directives and their companion // loops which leads to code duplication for different reduction types. if constexpr (UseReducer) { -#pragma omp declare reduction( \ - custom:ValueType \ - : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) - -#pragma omp target teams distribute parallel for collapse(6) map(to \ - : functor) \ - reduction(custom \ - : result) +#pragma omp declare reduction(custom \ +:ValueType : OpenMPTargetReducerWrapper::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper::init(omp_priv)) + +#pragma omp target teams distribute parallel for collapse(6) map(to : functor) \ + reduction(custom : result) for (auto i0 = begin_0; i0 < end_0; ++i0) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i3 = begin_3; i3 < end_3; ++i3) { for (auto i4 = begin_4; i4 < end_4; ++i4) { for (auto i5 = begin_5; i5 < end_5; ++i5) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4, i5, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, i5, - result); + functor(i0, i1, i2, i3, i4, i5, result); } } } @@ -686,19 +585,14 @@ reduction(+:result) } } else { #pragma omp target teams distribute parallel for collapse(6) map(to : functor) \ -reduction(+:result) + reduction(+ : result) for (auto i0 = begin_0; i0 < end_0; ++i0) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i3 = begin_3; i3 < end_3; ++i3) { for (auto i4 = begin_4; i4 < end_4; ++i4) { for (auto i5 = begin_5; i5 < end_5; ++i5) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4, i5, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, i5, - result); + functor(i0, i1, i2, i3, i4, i5, result); } } } diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp index 4a112ed11d06..b7c8abcb4495 100644 --- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp @@ -21,6 +21,7 @@ #include #include #include +#include namespace Kokkos { namespace Impl { @@ -33,8 +34,6 @@ class ParallelReduce, using FunctorType = typename CombinedFunctorReducerType::functor_type; using ReducerType = typename CombinedFunctorReducerType::reducer_type; - using WorkTag = typename Policy::work_tag; - using pointer_type = typename ReducerType::pointer_type; using reference_type = typename ReducerType::reference_type; @@ -55,14 +54,17 @@ class ParallelReduce, const pointer_type m_result_ptr; bool m_result_ptr_on_device; const int m_result_ptr_num_elems; - using TagType = typename Policy::work_tag; public: void execute() const { // Only let one ParallelReduce instance at a time use the scratch memory. std::scoped_lock scratch_memory_lock( - OpenMPTargetExec::m_mutex_scratch_ptr); - const FunctorType& functor = m_functor_reducer.get_functor(); + m_policy.space().impl_internal_space_instance()->m_mutex_scratch_ptr); + + auto const functor = + Kokkos::Experimental::Impl::FunctorAdapter( + m_functor_reducer.get_functor()); + if constexpr (FunctorHasJoin) { // Enter this loop if the Functor has a init-join. ParReduceSpecialize::execute_init_join(functor, m_policy, m_result_ptr, @@ -75,26 +77,26 @@ class ParallelReduce, // Enter this loop if the reduction is on an array and the routine is // templated over the size of the array. if (m_result_ptr_num_elems <= 2) { - ParReduceSpecialize::template execute_array( + ParReduceSpecialize::template execute_array<2>( functor, m_policy, m_result_ptr, m_result_ptr_on_device); } else if (m_result_ptr_num_elems <= 4) { - ParReduceSpecialize::template execute_array( + ParReduceSpecialize::template execute_array<4>( functor, m_policy, m_result_ptr, m_result_ptr_on_device); } else if (m_result_ptr_num_elems <= 8) { - ParReduceSpecialize::template execute_array( + ParReduceSpecialize::template execute_array<8>( functor, m_policy, m_result_ptr, m_result_ptr_on_device); } else if (m_result_ptr_num_elems <= 16) { - ParReduceSpecialize::template execute_array( + ParReduceSpecialize::template execute_array<16>( functor, m_policy, m_result_ptr, m_result_ptr_on_device); } else if (m_result_ptr_num_elems <= 32) { - ParReduceSpecialize::template execute_array( + ParReduceSpecialize::template execute_array<32>( functor, m_policy, m_result_ptr, m_result_ptr_on_device); } else { Kokkos::abort("array reduction length must be <= 32"); } } else { // This loop handles the basic scalar reduction. - ParReduceSpecialize::template execute_array( + ParReduceSpecialize::template execute_array<1>( functor, m_policy, m_result_ptr, m_result_ptr_on_device); } } diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp index 16c0eedb8185..b81e3aa7ed0b 100644 --- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp @@ -59,7 +59,7 @@ parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< #pragma omp barrier if constexpr (std::is_arithmetic::value) { -#pragma omp for reduction(+ : TeamThread_scratch[:1]) +#pragma omp for reduction(+ : TeamThread_scratch[ : 1]) for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { ValueType tmp = ValueType(); lambda(i, tmp); @@ -68,7 +68,7 @@ parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< } else { #pragma omp declare reduction(custom:ValueType : omp_out += omp_in) -#pragma omp for reduction(custom : TeamThread_scratch[:1]) +#pragma omp for reduction(custom : TeamThread_scratch[ : 1]) for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { ValueType tmp = ValueType(); lambda(i, tmp); @@ -90,11 +90,10 @@ parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< const Lambda& lambda, ReducerType result) { using ValueType = typename ReducerType::value_type; -#pragma omp declare reduction( \ - custominner:ValueType \ - : Impl::OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer( \ - Impl::OpenMPTargetReducerWrapper ::init(omp_priv)) +#pragma omp declare reduction(custominner \ +:ValueType : Impl::OpenMPTargetReducerWrapper::join(omp_out, \ + omp_in)) \ + initializer(Impl::OpenMPTargetReducerWrapper::init(omp_priv)) // FIXME_OPENMPTARGET - Make sure that if its an array reduction, number of // elements in the array <= 32. For reduction we allocate, 16 bytes per @@ -109,7 +108,7 @@ parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< Impl::OpenMPTargetReducerWrapper::init(TeamThread_scratch[0]); #pragma omp barrier -#pragma omp for reduction(custominner : TeamThread_scratch[:1]) +#pragma omp for reduction(custominner : TeamThread_scratch[ : 1]) for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { lambda(i, TeamThread_scratch[0]); } @@ -132,11 +131,10 @@ parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< ValueType* TeamThread_scratch = static_cast(loop_boundaries.team.impl_reduce_scratch()); -#pragma omp declare reduction( \ - omp_red_teamthread_reducer:ValueType \ - : Impl::OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer( \ - Impl::OpenMPTargetReducerWrapper ::init(omp_priv)) +#pragma omp declare reduction(omp_red_teamthread_reducer \ +:ValueType : Impl::OpenMPTargetReducerWrapper::join(omp_out, \ + omp_in)) \ + initializer(Impl::OpenMPTargetReducerWrapper::init(omp_priv)) #pragma omp barrier ValueType tmp; @@ -145,8 +143,9 @@ parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< #pragma omp barrier iType team_size = iType(omp_get_num_threads()); -#pragma omp for reduction(omp_red_teamthread_reducer \ - : TeamThread_scratch[:1]) schedule(static, 1) +#pragma omp for reduction( \ + omp_red_teamthread_reducer : TeamThread_scratch[ : 1]) \ + schedule(static, 1) for (iType t = 0; t < team_size; t++) { ValueType tmp2; result.init(tmp2); @@ -259,11 +258,10 @@ parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct< const Lambda& lambda, ReducerType const& result) { using ValueType = typename ReducerType::value_type; -#pragma omp declare reduction( \ - custom:ValueType \ - : Impl::OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer( \ - Impl::OpenMPTargetReducerWrapper ::init(omp_priv)) +#pragma omp declare reduction(custom \ +:ValueType : Impl::OpenMPTargetReducerWrapper::join(omp_out, \ + omp_in)) \ + initializer(Impl::OpenMPTargetReducerWrapper::init(omp_priv)) ValueType vector_reduce; Impl::OpenMPTargetReducerWrapper::init(vector_reduce); @@ -329,7 +327,7 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( #pragma omp barrier if constexpr (std::is_arithmetic::value) { -#pragma omp for simd reduction(+ : TeamVector_scratch[:1]) +#pragma omp for simd reduction(+ : TeamVector_scratch[ : 1]) for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { ValueType tmp = ValueType(); lambda(i, tmp); @@ -338,7 +336,7 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( } else { #pragma omp declare reduction(custom:ValueType : omp_out += omp_in) -#pragma omp for simd reduction(custom : TeamVector_scratch[:1]) +#pragma omp for simd reduction(custom : TeamVector_scratch[ : 1]) for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { ValueType tmp = ValueType(); lambda(i, tmp); @@ -363,11 +361,10 @@ parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< static_assert(sizeof(ValueType) <= Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE); -#pragma omp declare reduction( \ - custom:ValueType \ - : Impl::OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer( \ - Impl::OpenMPTargetReducerWrapper ::init(omp_priv)) +#pragma omp declare reduction(custom \ +:ValueType : Impl::OpenMPTargetReducerWrapper::join(omp_out, \ + omp_in)) \ + initializer(Impl::OpenMPTargetReducerWrapper::init(omp_priv)) ValueType* TeamVector_scratch = static_cast(loop_boundaries.team.impl_reduce_scratch()); @@ -376,7 +373,7 @@ parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< Impl::OpenMPTargetReducerWrapper::init(TeamVector_scratch[0]); #pragma omp barrier -#pragma omp for simd reduction(custom : TeamVector_scratch[:1]) +#pragma omp for simd reduction(custom : TeamVector_scratch[ : 1]) for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { lambda(i, TeamVector_scratch[0]); } @@ -400,11 +397,10 @@ parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< ValueType* TeamVector_scratch = static_cast(loop_boundaries.team.impl_reduce_scratch()); -#pragma omp declare reduction( \ - omp_red_teamthread_reducer:ValueType \ - : Impl::OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer( \ - Impl::OpenMPTargetReducerWrapper ::init(omp_priv)) +#pragma omp declare reduction(omp_red_teamthread_reducer \ +:ValueType : Impl::OpenMPTargetReducerWrapper::join(omp_out, \ + omp_in)) \ + initializer(Impl::OpenMPTargetReducerWrapper::init(omp_priv)) #pragma omp barrier ValueType tmp; @@ -413,8 +409,9 @@ parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< #pragma omp barrier iType team_size = iType(omp_get_num_threads()); -#pragma omp for simd reduction(omp_red_teamthread_reducer \ - : TeamVector_scratch[:1]) schedule(static, 1) +#pragma omp for simd reduction( \ + omp_red_teamthread_reducer : TeamVector_scratch[ : 1]) \ + schedule(static, 1) for (iType t = 0; t < team_size; t++) { ValueType tmp2; result.init(tmp2); @@ -443,8 +440,7 @@ class ParallelReduce scratch_memory_lock( - OpenMPTargetExec::m_mutex_scratch_ptr); - const FunctorType& functor = m_functor_reducer.get_functor(); + m_policy.space().impl_internal_space_instance()->m_mutex_scratch_ptr); + auto const functor = + Kokkos::Experimental::Impl::FunctorAdapter( + m_functor_reducer.get_functor()); if constexpr (FunctorHasJoin) { ParReduceSpecialize::execute_init_join(functor, m_policy, m_result_ptr, m_result_ptr_on_device); diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp index b0d693280243..ec8a96cb2f36 100644 --- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp @@ -20,6 +20,7 @@ #include #include #include +#include namespace Kokkos { namespace Impl { @@ -30,7 +31,6 @@ class ParallelScan, protected: using Policy = Kokkos::RangePolicy; - using WorkTag = typename Policy::work_tag; using Member = typename Policy::member_type; using idx_type = typename Policy::index_type; @@ -48,18 +48,8 @@ class ParallelScan, value_type* m_result_ptr; const bool m_result_ptr_device_accessible; - template - std::enable_if_t::value> call_with_tag( - const FunctorType& f, const idx_type& idx, value_type& val, - const bool& is_final) const { - f(idx, val, is_final); - } - template - std::enable_if_t::value> call_with_tag( - const FunctorType& f, const idx_type& idx, value_type& val, - const bool& is_final) const { - f(WorkTag(), idx, val, is_final); - } + using FunctorAdapter = + Kokkos::Experimental::Impl::FunctorAdapter; public: void impl_execute( @@ -77,8 +67,10 @@ class ParallelScan, idx_type team_size = 128; auto a_functor_reducer = m_functor_reducer; -#pragma omp target teams distribute map(to \ - : a_functor_reducer) num_teams(nteams) + auto a_functor = FunctorAdapter(m_functor_reducer.get_functor()); + +#pragma omp target teams distribute map(to : a_functor_reducer, a_functor) \ + num_teams(nteams) for (idx_type team_id = 0; team_id < n_chunks; ++team_id) { const typename Analysis::Reducer& reducer = a_functor_reducer.get_reducer(); @@ -91,9 +83,8 @@ class ParallelScan, const idx_type idx = local_offset + i; value_type val; reducer.init(&val); - if (idx < N) - call_with_tag(a_functor_reducer.get_functor(), idx, val, - false); + if (idx < N) a_functor(idx, val, false); + element_values(team_id, i) = val; } #pragma omp barrier @@ -120,9 +111,8 @@ class ParallelScan, } } -#pragma omp target teams distribute map(to \ - : a_functor_reducer) num_teams(nteams) \ - thread_limit(team_size) +#pragma omp target teams distribute map(to : a_functor_reducer, a_functor) \ + num_teams(nteams) thread_limit(team_size) for (idx_type team_id = 0; team_id < n_chunks; ++team_id) { const typename Analysis::Reducer& reducer = a_functor_reducer.get_reducer(); @@ -143,14 +133,9 @@ class ParallelScan, local_offset_value = element_values(team_id, i - 1); // FIXME_OPENMPTARGET We seem to access memory illegaly on AMD GPUs #if defined(KOKKOS_ARCH_AMD_GPU) && !defined(KOKKOS_ARCH_AMD_GFX1030) && \ - !defined(KOKKOS_ARCH_AMD_GFX1100) + !defined(KOKKOS_ARCH_AMD_GFX1100) && !defined(KOKKOS_ARCH_AMD_GFX1103) if constexpr (Analysis::Reducer::has_join_member_function()) { - if constexpr (std::is_void_v) - a_functor_reducer.get_functor().join(local_offset_value, - offset_value); - else - a_functor_reducer.get_functor().join( - WorkTag{}, local_offset_value, offset_value); + a_functor.get_functor().join(local_offset_value, offset_value); } else local_offset_value += offset_value; #else @@ -158,9 +143,8 @@ class ParallelScan, #endif } else local_offset_value = offset_value; - if (idx < N) - call_with_tag(a_functor_reducer.get_functor(), idx, - local_offset_value, true); + if (idx < N) a_functor(idx, local_offset_value, true); + if (idx == N - 1 && m_result_ptr_device_accessible) *m_result_ptr = local_offset_value; } @@ -169,9 +153,9 @@ class ParallelScan, } void execute() const { - OpenMPTargetExec::verify_is_process( + Experimental::Impl::OpenMPTargetInternal::verify_is_process( "Kokkos::Experimental::OpenMPTarget parallel_for"); - OpenMPTargetExec::verify_initialized( + Experimental::Impl::OpenMPTargetInternal::verify_initialized( "Kokkos::Experimental::OpenMPTarget parallel_for"); const idx_type N = m_policy.end() - m_policy.begin(); const idx_type chunk_size = 128; @@ -179,7 +163,7 @@ class ParallelScan, // Only let one ParallelReduce instance at a time use the scratch memory. std::scoped_lock scratch_memory_lock( - OpenMPTargetExec::m_mutex_scratch_ptr); + m_policy.space().impl_internal_space_instance()->m_mutex_scratch_ptr); // This could be scratch memory per team Kokkos::View, public: void execute() const { - OpenMPTargetExec::verify_is_process( + Experimental::Impl::OpenMPTargetInternal::verify_is_process( "Kokkos::Experimental::OpenMPTarget parallel_for"); - OpenMPTargetExec::verify_initialized( + Experimental::Impl::OpenMPTargetInternal::verify_initialized( "Kokkos::Experimental::OpenMPTarget parallel_for"); const int64_t N = base_t::m_policy.end() - base_t::m_policy.begin(); const int chunk_size = 128; @@ -231,7 +215,9 @@ class ParallelScanWithTotal, if (N > 0) { // Only let one ParallelReduce instance at a time use the scratch memory. std::scoped_lock scratch_memory_lock( - OpenMPTargetExec::m_mutex_scratch_ptr); + base_t::m_policy.space() + .impl_internal_space_instance() + ->m_mutex_scratch_ptr); // This could be scratch memory per team Kokkos::View #include #include +#include namespace Kokkos { namespace Impl { @@ -72,7 +73,6 @@ template , ReducerType, PointerType, ValueType> { using PolicyType = Kokkos::RangePolicy; - using TagType = typename PolicyType::work_tag; using ReducerTypeFwd = std::conditional_t::value, FunctorType, ReducerType>; @@ -82,12 +82,15 @@ struct ParallelReduceSpecialize, using ParReduceCopy = ParallelReduceCopy; - static void execute_reducer(const FunctorType& f, const PolicyType& p, + using FunctorAdapter = + Kokkos::Experimental::Impl::FunctorAdapter; + + static void execute_reducer(const FunctorAdapter& f, const PolicyType& p, PointerType result_ptr, bool ptr_on_device) { - OpenMPTargetExec::verify_is_process( + Experimental::Impl::OpenMPTargetInternal::verify_is_process( "Kokkos::Experimental::OpenMPTarget RangePolicy " "parallel_reduce:reducer"); - OpenMPTargetExec::verify_initialized( + Experimental::Impl::OpenMPTargetInternal::verify_initialized( "Kokkos::Experimental::OpenMPTarget RangePolicy " "parallel_reduce:reducer"); const auto begin = p.begin(); @@ -104,33 +107,27 @@ struct ParallelReduceSpecialize, return; } -#pragma omp declare reduction( \ - custom:ValueType \ - : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) +#pragma omp declare reduction(custom \ +:ValueType : OpenMPTargetReducerWrapper::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper::init(omp_priv)) -#pragma omp target teams distribute parallel for map(to \ - : f) reduction(custom \ - : result) +#pragma omp target teams distribute parallel for map(to : f) \ + reduction(custom : result) for (auto i = begin; i < end; ++i) { - if constexpr (std::is_void_v) { - f(i, result); - } else { - f(TagType(), i, result); - } + f(i, result); } ParReduceCopy::memcpy_result(result_ptr, &result, sizeof(ValueType), ptr_on_device); } - template - static void execute_array(const FunctorType& f, const PolicyType& p, + template + static void execute_array(const FunctorAdapter& f, const PolicyType& p, PointerType result_ptr, bool ptr_on_device) { - OpenMPTargetExec::verify_is_process( + Experimental::Impl::OpenMPTargetInternal::verify_is_process( "Kokkos::Experimental::OpenMPTarget RangePolicy " "parallel_reduce:array_reduction"); - OpenMPTargetExec::verify_initialized( + Experimental::Impl::OpenMPTargetInternal::verify_initialized( "Kokkos::Experimental::OpenMPTarget RangePolicy " "parallel_reduce:array_reduction"); const auto begin = p.begin(); @@ -150,27 +147,14 @@ struct ParallelReduceSpecialize, // Case where reduction is on a native data type. if constexpr (std::is_arithmetic::value) { -#pragma omp target teams distribute parallel for \ - map(to:f) reduction(+: result) - for (auto i = begin; i < end; ++i) - - if constexpr (std::is_void_v) { - f(i, result); - } else { - f(TagType(), i, result); - } +#pragma omp target teams distribute parallel for map(to : f) \ + reduction(+ : result) + for (auto i = begin; i < end; ++i) f(i, result); } else { #pragma omp declare reduction(custom:ValueType : omp_out += omp_in) -#pragma omp target teams distribute parallel for map(to \ - : f) reduction(custom \ - : result) - for (auto i = begin; i < end; ++i) - - if constexpr (std::is_void_v) { - f(i, result); - } else { - f(TagType(), i, result); - } +#pragma omp target teams distribute parallel for map(to : f) \ + reduction(custom : result) + for (auto i = begin; i < end; ++i) f(i, result); } ParReduceCopy::memcpy_result(result_ptr, &result, sizeof(ValueType), @@ -186,13 +170,10 @@ struct ParallelReduceSpecialize, ptr_on_device); return; } -#pragma omp target teams distribute parallel for map(to:f) reduction(+:result[:NumReductions]) +#pragma omp target teams distribute parallel for map(to : f) \ + reduction(+ : result[ : NumReductions]) for (auto i = begin; i < end; ++i) { - if constexpr (std::is_void_v) { - f(i, result); - } else { - f(TagType(), i, result); - } + f(i, result); } ParReduceCopy::memcpy_result( @@ -200,12 +181,12 @@ struct ParallelReduceSpecialize, } } - static void execute_init_join(const FunctorType& f, const PolicyType& p, + static void execute_init_join(const FunctorAdapter& f, const PolicyType& p, PointerType ptr, const bool ptr_on_device) { - OpenMPTargetExec::verify_is_process( + Experimental::Impl::OpenMPTargetInternal::verify_is_process( "Kokkos::Experimental::OpenMPTarget RangePolicy " "parallel_reduce:init_join"); - OpenMPTargetExec::verify_initialized( + Experimental::Impl::OpenMPTargetInternal::verify_initialized( "Kokkos::Experimental::OpenMPTarget RangePolicy " "parallel_reduce:init_join"); const auto begin = p.begin(); @@ -219,23 +200,25 @@ struct ParallelReduceSpecialize, const auto size = end - begin; - // FIXME_OPENMPTARGET: The team size and MAX_ACTIVE_THREADS are currently + // FIXME_OPENMPTARGET: The team size and concurrency are currently // based on NVIDIA-V100 and should be modifid to be based on the // architecture in the future. const int max_team_threads = 32; const int max_teams = - OpenMPTargetExec::MAX_ACTIVE_THREADS / max_team_threads; + p.space().impl_internal_space_instance()->concurrency() / + max_team_threads; // Number of elements in the reduction - const auto value_count = FunctorAnalysis::value_count(f); + const auto value_count = FunctorAnalysis::value_count(f.get_functor()); // Allocate scratch per active thread. Achieved by setting the first // parameter of `resize_scratch=1`. - OpenMPTargetExec::resize_scratch(1, 0, value_count * sizeof(ValueType), - std::numeric_limits::max()); - ValueType* scratch_ptr = - static_cast(OpenMPTargetExec::get_scratch_ptr()); + p.space().impl_internal_space_instance()->resize_scratch( + 1, 0, value_count * sizeof(ValueType), + std::numeric_limits::max()); + ValueType* scratch_ptr = static_cast( + p.space().impl_internal_space_instance()->get_scratch_ptr()); - typename FunctorAnalysis::Reducer final_reducer(f); + typename FunctorAnalysis::Reducer final_reducer(f.get_functor()); if (end <= begin) { #pragma omp target map(to : final_reducer) is_device_ptr(scratch_ptr) @@ -260,8 +243,7 @@ struct ParallelReduceSpecialize, } #pragma omp target teams num_teams(max_teams) thread_limit(max_team_threads) \ - map(to \ - : final_reducer) is_device_ptr(scratch_ptr) + map(to : final_reducer) is_device_ptr(scratch_ptr) { #pragma omp parallel { @@ -279,11 +261,7 @@ struct ParallelReduceSpecialize, // Accumulate partial results in thread specific storage. #pragma omp for simd for (auto i = team_begin; i < team_end; ++i) { - if constexpr (std::is_void_v) { - f(i, result); - } else { - f(TagType(), i, result); - } + f(i, result); } // Reduce all paritial results within a team. @@ -304,8 +282,7 @@ struct ParallelReduceSpecialize, int tree_neighbor_offset = 1; do { -#pragma omp target teams distribute parallel for simd map(to \ - : f) \ +#pragma omp target teams distribute parallel for simd map(to : f) \ is_device_ptr(scratch_ptr) for (int i = 0; i < max_teams - tree_neighbor_offset; i += 2 * tree_neighbor_offset) { @@ -344,7 +321,6 @@ template , ReducerType, PointerType, ValueType> { using PolicyType = TeamPolicyInternal; - using TagType = typename PolicyType::work_tag; using ReducerTypeFwd = std::conditional_t::value, FunctorType, ReducerType>; @@ -355,12 +331,15 @@ struct ParallelReduceSpecialize, using ParReduceCopy = ParallelReduceCopy; - static void execute_reducer(const FunctorType& f, const PolicyType& p, + using FunctorAdapter = + Kokkos::Experimental::Impl::FunctorAdapter; + + static void execute_reducer(const FunctorAdapter& f, const PolicyType& p, PointerType result_ptr, bool ptr_on_device) { - OpenMPTargetExec::verify_is_process( + Experimental::Impl::OpenMPTargetInternal::verify_is_process( "Kokkos::Experimental::OpenMPTarget TeamPolicy " "parallel_reduce:reducer"); - OpenMPTargetExec::verify_initialized( + Experimental::Impl::OpenMPTargetInternal::verify_initialized( "Kokkos::Experimental::OpenMPTarget TeamPolicy " "parallel_reduce:reducer"); @@ -370,9 +349,11 @@ struct ParallelReduceSpecialize, const size_t shmem_size_L0 = p.scratch_size(0, team_size); const size_t shmem_size_L1 = p.scratch_size(1, team_size); - OpenMPTargetExec::resize_scratch(PolicyType::member_type::TEAM_REDUCE_SIZE, - shmem_size_L0, shmem_size_L1, league_size); - void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr(); + p.space().impl_internal_space_instance()->resize_scratch( + PolicyType::member_type::TEAM_REDUCE_SIZE, shmem_size_L0, shmem_size_L1, + league_size); + void* scratch_ptr = + p.space().impl_internal_space_instance()->get_scratch_ptr(); ValueType result = ValueType(); @@ -383,16 +364,15 @@ struct ParallelReduceSpecialize, int max_active_teams = omp_get_max_teams(); #else int max_active_teams = - std::min(OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size, league_size); + std::min(p.space().concurrency() / team_size, league_size); #endif // If the league size is <=0, do not launch the kernel. if (max_active_teams <= 0) return; -#pragma omp declare reduction( \ - custom:ValueType \ - : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) +#pragma omp declare reduction(custom \ +:ValueType : OpenMPTargetReducerWrapper::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper::init(omp_priv)) #if !defined(KOKKOS_IMPL_OPENMPTARGET_HIERARCHICAL_INTEL_GPU) KOKKOS_IMPL_OMPTARGET_PRAGMA( @@ -414,16 +394,13 @@ struct ParallelReduceSpecialize, typename PolicyType::member_type team( league_id, league_size, team_size, vector_length, scratch_ptr, blockIdx, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void_v) - f(team, result); - else - f(TagType(), team, result); + f(team, result); } } #else #pragma omp target teams distribute firstprivate(f) is_device_ptr(scratch_ptr) \ - num_teams(max_active_teams) thread_limit(team_size) reduction(custom \ - : result) + num_teams(max_active_teams) thread_limit(team_size) \ + reduction(custom : result) for (int i = 0; i < league_size; i++) { #pragma omp parallel reduction(custom : result) { @@ -433,10 +410,7 @@ struct ParallelReduceSpecialize, typename PolicyType::member_type team(i, league_size, team_size, vector_length, scratch_ptr, i, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void_v) - f(team, result); - else - f(TagType(), team, result); + f(team, result); } } #endif @@ -447,12 +421,12 @@ struct ParallelReduceSpecialize, } template - static void execute_array(const FunctorType& f, const PolicyType& p, + static void execute_array(const FunctorAdapter& f, const PolicyType& p, PointerType result_ptr, bool ptr_on_device) { - OpenMPTargetExec::verify_is_process( + Experimental::Impl::OpenMPTargetInternal::verify_is_process( "Kokkos::Experimental::OpenMPTarget TeamPolicy " "parallel_reduce:array_reduction"); - OpenMPTargetExec::verify_initialized( + Experimental::Impl::OpenMPTargetInternal::verify_initialized( "Kokkos::Experimental::OpenMPTarget TeamPolicy " "parallel_reduce:array_reduction"); @@ -462,9 +436,11 @@ struct ParallelReduceSpecialize, const size_t shmem_size_L0 = p.scratch_size(0, team_size); const size_t shmem_size_L1 = p.scratch_size(1, team_size); - OpenMPTargetExec::resize_scratch(PolicyType::member_type::TEAM_REDUCE_SIZE, - shmem_size_L0, shmem_size_L1, league_size); - void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr(); + p.space().impl_internal_space_instance()->resize_scratch( + PolicyType::member_type::TEAM_REDUCE_SIZE, shmem_size_L0, shmem_size_L1, + league_size); + void* scratch_ptr = + p.space().impl_internal_space_instance()->get_scratch_ptr(); // Maximum active teams possible. // FIXME_OPENMPTARGET: Cray compiler did not yet implement @@ -473,7 +449,7 @@ struct ParallelReduceSpecialize, int max_active_teams = omp_get_max_teams(); #else int max_active_teams = - std::min(OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size, league_size); + std::min(p.space().concurrency() / team_size, league_size); #endif // If the league size is <=0, do not launch the kernel. @@ -504,19 +480,14 @@ struct ParallelReduceSpecialize, typename PolicyType::member_type team( league_id, league_size, team_size, vector_length, scratch_ptr, blockIdx, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void_v) - f(team, result); - else - f(TagType(), team, result); + f(team, result); } } } else { // Case where the reduction is on a non-native data type. #pragma omp declare reduction(custom:ValueType : omp_out += omp_in) #pragma omp target teams num_teams(max_active_teams) thread_limit(team_size) \ - map(to \ - : f) is_device_ptr(scratch_ptr) reduction(custom \ - : result) + map(to : f) is_device_ptr(scratch_ptr) reduction(custom : result) #pragma omp parallel reduction(custom : result) { if (omp_get_num_teams() > max_active_teams) @@ -531,10 +502,7 @@ struct ParallelReduceSpecialize, typename PolicyType::member_type team( league_id, league_size, team_size, vector_length, scratch_ptr, blockIdx, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void_v) - f(team, result); - else - f(TagType(), team, result); + f(team, result); } } } @@ -545,10 +513,10 @@ struct ParallelReduceSpecialize, } else { ValueType result[NumReductions] = {}; // Case where the reduction is on an array. -#pragma omp target teams num_teams(max_active_teams) thread_limit(team_size) map(to \ - : f) \ - is_device_ptr(scratch_ptr) reduction(+ : result[:NumReductions]) -#pragma omp parallel reduction(+ : result[:NumReductions]) +#pragma omp target teams num_teams(max_active_teams) thread_limit(team_size) \ + map(to : f) is_device_ptr(scratch_ptr) \ + reduction(+ : result[ : NumReductions]) +#pragma omp parallel reduction(+ : result[ : NumReductions]) { if (omp_get_num_teams() > max_active_teams) Kokkos::abort("`omp_set_num_teams` call was not respected.\n"); @@ -562,10 +530,7 @@ struct ParallelReduceSpecialize, typename PolicyType::member_type team( league_id, league_size, team_size, vector_length, scratch_ptr, blockIdx, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void_v) - f(team, result); - else - f(TagType(), team, result); + f(team, result); } } @@ -577,12 +542,12 @@ struct ParallelReduceSpecialize, // FIXME_OPENMPTARGET : This routine is a copy from `parallel_reduce` over // RangePolicy. Need a new implementation. - static void execute_init_join(const FunctorType& f, const PolicyType& p, + static void execute_init_join(const FunctorAdapter& f, const PolicyType& p, PointerType ptr, const bool ptr_on_device) { - OpenMPTargetExec::verify_is_process( + Experimental::Impl::OpenMPTargetInternal::verify_is_process( "Kokkos::Experimental::OpenMPTarget TeamPolicy " "parallel_reduce:init_join "); - OpenMPTargetExec::verify_initialized( + Experimental::Impl::OpenMPTargetInternal::verify_initialized( "Kokkos::Experimental::OpenMPTarget TeamPolicy " "parallel_reduce:init_join"); using FunctorAnalysis = @@ -611,13 +576,14 @@ struct ParallelReduceSpecialize, const auto nteams = league_size; // Number of elements in the reduction - const auto value_count = FunctorAnalysis::value_count(f); + const auto value_count = FunctorAnalysis::value_count(f.get_functor()); // Allocate scratch per active thread. - OpenMPTargetExec::resize_scratch(1, 0, value_count * sizeof(ValueType), - league_size); - void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr(); - typename FunctorAnalysis::Reducer final_reducer(f); + p.space().impl_internal_space_instance()->resize_scratch( + 1, 0, value_count * sizeof(ValueType), league_size); + void* scratch_ptr = + p.space().impl_internal_space_instance()->get_scratch_ptr(); + typename FunctorAnalysis::Reducer final_reducer(f.get_functor()); if (end <= begin) { // If there is no work to be done, copy back the initialized values and @@ -661,11 +627,7 @@ struct ParallelReduceSpecialize, typename PolicyType::member_type team( league_id, league_size, team_size, vector_length, scratch_ptr, team_num, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void_v) { - f(team, result); - } else { - f(TagType(), team, result); - } + f(team, result); } } // end parallel } // end target @@ -673,7 +635,7 @@ struct ParallelReduceSpecialize, int tree_neighbor_offset = 1; do { #pragma omp target teams distribute parallel for simd firstprivate( \ - final_reducer) is_device_ptr(scratch_ptr) + final_reducer) is_device_ptr(scratch_ptr) for (int i = 0; i < nteams - tree_neighbor_offset; i += 2 * tree_neighbor_offset) { ValueType* team_scratch = static_cast(scratch_ptr); diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Reducer.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Reducer.hpp index 9b578aca1129..4308fb042a34 100644 --- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Reducer.hpp +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Reducer.hpp @@ -34,9 +34,6 @@ struct OpenMPTargetReducerWrapper { KOKKOS_INLINE_FUNCTION static void join(value_type&, const value_type&) = delete; - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type&, const volatile value_type&) = delete; - KOKKOS_INLINE_FUNCTION static void init(value_type&) = delete; }; @@ -51,11 +48,6 @@ struct OpenMPTargetReducerWrapper> { KOKKOS_INLINE_FUNCTION static void join(value_type& dest, const value_type& src) { dest += src; } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - dest += src; - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val = reduction_identity::sum(); @@ -72,11 +64,6 @@ struct OpenMPTargetReducerWrapper> { KOKKOS_INLINE_FUNCTION static void join(value_type& dest, const value_type& src) { dest *= src; } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - dest *= src; - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val = reduction_identity::prod(); @@ -95,11 +82,6 @@ struct OpenMPTargetReducerWrapper> { if (src < dest) dest = src; } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - if (src < dest) dest = src; - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val = reduction_identity::min(); @@ -118,11 +100,6 @@ struct OpenMPTargetReducerWrapper> { if (src > dest) dest = src; } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - if (src > dest) dest = src; - } - // Required KOKKOS_INLINE_FUNCTION static void init(value_type& val) { @@ -141,11 +118,6 @@ struct OpenMPTargetReducerWrapper> { dest = dest && src; } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - dest = dest && src; - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val = reduction_identity::land(); @@ -166,11 +138,6 @@ struct OpenMPTargetReducerWrapper> { dest = dest || src; } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - dest = dest || src; - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val = reduction_identity::lor(); @@ -189,11 +156,6 @@ struct OpenMPTargetReducerWrapper> { dest = dest & src; } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - dest = dest & src; - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val = reduction_identity::band(); @@ -212,11 +174,6 @@ struct OpenMPTargetReducerWrapper> { dest = dest | src; } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - dest = dest | src; - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val = reduction_identity::bor(); @@ -236,12 +193,12 @@ struct OpenMPTargetReducerWrapper> { // Required KOKKOS_INLINE_FUNCTION static void join(value_type& dest, const value_type& src) { - if (src.val < dest.val) dest = src; - } - - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - if (src.val < dest.val) dest = src; + if (src.val < dest.val) + dest = src; + else if (src.val == dest.val && + dest.loc == reduction_identity::min()) { + dest.loc = src.loc; + } } KOKKOS_INLINE_FUNCTION @@ -263,12 +220,12 @@ struct OpenMPTargetReducerWrapper> { KOKKOS_INLINE_FUNCTION static void join(value_type& dest, const value_type& src) { - if (src.val > dest.val) dest = src; - } - - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - if (src.val > dest.val) dest = src; + if (src.val > dest.val) + dest = src; + else if (src.val == dest.val && + dest.loc == reduction_identity::min()) { + dest.loc = src.loc; + } } KOKKOS_INLINE_FUNCTION @@ -298,16 +255,6 @@ struct OpenMPTargetReducerWrapper> { } } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - if (src.min_val < dest.min_val) { - dest.min_val = src.min_val; - } - if (src.max_val > dest.max_val) { - dest.max_val = src.max_val; - } - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val.max_val = reduction_identity::max(); @@ -331,22 +278,16 @@ struct OpenMPTargetReducerWrapper> { if (src.min_val < dest.min_val) { dest.min_val = src.min_val; dest.min_loc = src.min_loc; - } - if (src.max_val > dest.max_val) { - dest.max_val = src.max_val; - dest.max_loc = src.max_loc; - } - } - - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - if (src.min_val < dest.min_val) { - dest.min_val = src.min_val; + } else if (dest.min_val == src.min_val && + dest.min_loc == reduction_identity::min()) { dest.min_loc = src.min_loc; } if (src.max_val > dest.max_val) { dest.max_val = src.max_val; dest.max_loc = src.max_loc; + } else if (dest.max_val == src.max_val && + dest.max_loc == reduction_identity::min()) { + dest.max_loc = src.max_loc; } } @@ -385,15 +326,6 @@ struct OpenMPTargetReducerWrapper> { } } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - if (dest.val < src.val) { - dest = src; - } else if (!(src.val < dest.val)) { - dest.loc = (src.loc < dest.loc) ? src.loc : dest.loc; - } - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val.val = reduction_identity::max(); @@ -428,15 +360,6 @@ struct OpenMPTargetReducerWrapper> { } } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - if (src.val < dest.val) { - dest = src; - } else if (!(dest.val < src.val)) { - dest.loc = (src.loc < dest.loc) ? src.loc : dest.loc; - } - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val.val = reduction_identity::min(); @@ -480,23 +403,6 @@ struct OpenMPTargetReducerWrapper> { } } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - if (src.min_val < dest.min_val) { - dest.min_val = src.min_val; - dest.min_loc = src.min_loc; - } else if (!(dest.min_val < src.min_val)) { - dest.min_loc = (src.min_loc < dest.min_loc) ? src.min_loc : dest.min_loc; - } - - if (dest.max_val < src.max_val) { - dest.max_val = src.max_val; - dest.max_loc = src.max_loc; - } else if (!(src.max_val < dest.max_val)) { - dest.max_loc = (src.max_loc > dest.max_loc) ? src.max_loc : dest.max_loc; - } - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val.max_val = reduction_identity::max(); @@ -531,13 +437,6 @@ struct OpenMPTargetReducerWrapper> { : dest.min_loc_true; } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - dest.min_loc_true = (src.min_loc_true < dest.min_loc_true) - ? src.min_loc_true - : dest.min_loc_true; - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val.min_loc_true = reduction_identity::min(); @@ -569,13 +468,6 @@ struct OpenMPTargetReducerWrapper> { : dest.max_loc_true; } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - dest.max_loc_true = (src.max_loc_true > dest.max_loc_true) - ? src.max_loc_true - : dest.max_loc_true; - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val.max_loc_true = reduction_identity::max(); @@ -611,17 +503,6 @@ struct OpenMPTargetReducerWrapper> { : src.min_loc_false; } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - dest.max_loc_true = (dest.max_loc_true < src.max_loc_true) - ? src.max_loc_true - : dest.max_loc_true; - - dest.min_loc_false = (dest.min_loc_false < src.min_loc_false) - ? dest.min_loc_false - : src.min_loc_false; - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val.max_loc_true = ::Kokkos::reduction_identity::max(); @@ -654,13 +535,6 @@ struct OpenMPTargetReducerWrapper> { : src.min_loc_false; } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - dest.min_loc_false = (dest.min_loc_false < src.min_loc_false) - ? dest.min_loc_false - : src.min_loc_false; - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val.min_loc_false = ::Kokkos::reduction_identity::min(); diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp deleted file mode 100644 index 458c4c9a43e6..000000000000 --- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp +++ /dev/null @@ -1,251 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE -#define KOKKOS_IMPL_PUBLIC_INCLUDE -#endif - -#include - -#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ENABLE_TASKPOLICY) - -#include - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -template class TaskQueue; - -//---------------------------------------------------------------------------- - -TaskExec::TaskExec() - : m_self_exec(0), - m_team_exec(0), - m_sync_mask(0), - m_sync_value(0), - m_sync_step(0), - m_group_rank(0), - m_team_rank(0), - m_team_size(1) {} - -TaskExec::TaskExec( - Kokkos::Impl::OpenMPTargetExec &arg_exec, int const arg_team_size) - : m_self_exec(&arg_exec), - m_team_exec(arg_exec.pool_rev(arg_exec.pool_rank_rev() / arg_team_size)), - m_sync_mask(0), - m_sync_value(0), - m_sync_step(0), - m_group_rank(arg_exec.pool_rank_rev() / arg_team_size), - m_team_rank(arg_exec.pool_rank_rev() % arg_team_size), - m_team_size(arg_team_size) { - // This team spans - // m_self_exec->pool_rev( team_size * group_rank ) - // m_self_exec->pool_rev( team_size * ( group_rank + 1 ) - 1 ) - - int64_t volatile *const sync = (int64_t *)m_self_exec->scratch_reduce(); - - sync[0] = int64_t(0); - sync[1] = int64_t(0); - - for (int i = 0; i < m_team_size; ++i) { - m_sync_value |= int64_t(1) << (8 * i); - m_sync_mask |= int64_t(3) << (8 * i); - } - - Kokkos::memory_fence(); -} - -void TaskExec::team_barrier_impl() const { - if (m_team_exec->scratch_reduce_size() < int(2 * sizeof(int64_t))) { - Kokkos::abort("TaskQueue scratch_reduce memory too small"); - } - - // Use team shared memory to synchronize. - // Alternate memory locations between barriers to avoid a sequence - // of barriers overtaking one another. - - int64_t volatile *const sync = - ((int64_t *)m_team_exec->scratch_reduce()) + (m_sync_step & 0x01); - - // This team member sets one byte within the sync variable - int8_t volatile *const sync_self = ((int8_t *)sync) + m_team_rank; - - *sync_self = int8_t(m_sync_value & 0x03); // signal arrival - - while (m_sync_value != *sync) - ; // wait for team to arrive - - ++m_sync_step; - - if (0 == (0x01 & m_sync_step)) { // Every other step - m_sync_value ^= m_sync_mask; - if (1000 < m_sync_step) m_sync_step = 0; - } -} - -//---------------------------------------------------------------------------- - -void TaskQueueSpecialization::execute( - TaskQueue *const queue) { - using execution_space = Kokkos::Experimental::OpenMPTarget; - using queue_type = TaskQueue; - using task_root_type = TaskBase; - using PoolExec = Kokkos::Impl::OpenMPTargetExec; - using Member = TaskExec; - - task_root_type *const end = (task_root_type *)task_root_type::EndTag; - - // Required: team_size <= 8 - - const int team_size = PoolExec::pool_size(2); // Threads per core - // const int team_size = PoolExec::pool_size(1); // Threads per NUMA - - if (8 < team_size) { - Kokkos::abort("TaskQueue unsupported team size"); - } - -#pragma omp parallel - { - PoolExec &self = *PoolExec::get_thread_omp(); - - Member single_exec; - Member team_exec(self, team_size); - - // Team shared memory - task_root_type *volatile *const task_shared = - (task_root_type **)team_exec.m_team_exec->scratch_thread(); - -// Barrier across entire OpenMPTarget thread pool to insure initialization -#pragma omp barrier - - // Loop until all queues are empty and no tasks in flight - - do { - task_root_type *task = 0; - - // Each team lead attempts to acquire either a thread team task - // or a single thread task for the team. - - if (0 == team_exec.team_rank()) { - task = 0 < *((volatile int *)&queue->m_ready_count) ? end : 0; - - // Loop by priority and then type - for (int i = 0; i < queue_type::NumQueue && end == task; ++i) { - for (int j = 0; j < 2 && end == task; ++j) { - task = queue_type::pop_task(&queue->m_ready[i][j]); - } - } - } - - // Team lead broadcast acquired task to team members: - - if (1 < team_exec.team_size()) { - if (0 == team_exec.team_rank()) *task_shared = task; - - // Fence to be sure task_shared is stored before the barrier - Kokkos::memory_fence(); - - // Whole team waits for every team member to reach this statement - team_exec.team_barrier(); - - // Fence to be sure task_shared is stored - Kokkos::memory_fence(); - - task = *task_shared; - } - - if (0 == task) break; // 0 == m_ready_count - - if (end == task) { - // All team members wait for whole team to reach this statement. - // Is necessary to prevent task_shared from being updated - // before it is read by all threads. - team_exec.team_barrier(); - } else if (task_root_type::TaskTeam == task->m_task_type) { - // Thread Team Task - (*task->m_apply)(task, &team_exec); - - // The m_apply function performs a barrier - - if (0 == team_exec.team_rank()) { - // team member #0 completes the task, which may delete the task - queue->complete(task); - } - } else { - // Single Thread Task - - if (0 == team_exec.team_rank()) { - (*task->m_apply)(task, &single_exec); - - queue->complete(task); - } - - // All team members wait for whole team to reach this statement. - // Not necessary to complete the task. - // Is necessary to prevent task_shared from being updated - // before it is read by all threads. - team_exec.team_barrier(); - } - } while (1); - } - // END #pragma omp parallel -} - -void TaskQueueSpecialization:: - iff_single_thread_recursive_execute( - TaskQueue *const queue) { - using execution_space = Kokkos::Experimental::OpenMPTarget; - using queue_type = TaskQueue; - using task_root_type = TaskBase; - using Member = TaskExec; - - if (1 == omp_get_num_threads()) { - task_root_type *const end = (task_root_type *)task_root_type::EndTag; - - Member single_exec; - - task_root_type *task = end; - - do { - task = end; - - // Loop by priority and then type - for (int i = 0; i < queue_type::NumQueue && end == task; ++i) { - for (int j = 0; j < 2 && end == task; ++j) { - task = queue_type::pop_task(&queue->m_ready[i][j]); - } - } - - if (end == task) break; - - (*task->m_apply)(task, &single_exec); - - queue->complete(task); - - } while (1); - } -} - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- - -#endif /* #if defined( KOKKOS_ENABLE_OPENMPTARGET ) && defined( \ - KOKKOS_ENABLE_TASKPOLICY ) */ diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.hpp deleted file mode 100644 index c9aa7b128f17..000000000000 --- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.hpp +++ /dev/null @@ -1,319 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_IMPL_OPENMP_TASK_HPP -#define KOKKOS_IMPL_OPENMP_TASK_HPP - -#if defined(KOKKOS_ENABLE_TASKPOLICY) - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -template <> -class TaskQueueSpecialization { - public: - using execution_space = Kokkos::Experimental::OpenMPTarget; - using queue_type = Kokkos::Impl::TaskQueue; - using task_base_type = Kokkos::Impl::TaskBase; - - // Must specify memory space - using memory_space = Kokkos::HostSpace; - - static void iff_single_thread_recursive_execute(queue_type* const); - - // Must provide task queue execution function - static void execute(queue_type* const); - - // Must provide mechanism to set function pointer in - // execution space from the host process. - template - static void proc_set_apply(task_base_type::function_type* ptr) { - using TaskType = TaskBase; - *ptr = TaskType::apply; - } -}; - -extern template class TaskQueue; - -//---------------------------------------------------------------------------- - -template <> -class TaskExec { - private: - TaskExec(TaskExec&&) = delete; - TaskExec(TaskExec const&) = delete; - TaskExec& operator=(TaskExec&&) = delete; - TaskExec& operator=(TaskExec const&) = delete; - - using PoolExec = Kokkos::Impl::OpenMPTargetExec; - - friend class Kokkos::Impl::TaskQueue; - friend class Kokkos::Impl::TaskQueueSpecialization< - Kokkos::Experimental::OpenMPTarget>; - - PoolExec* const m_self_exec; ///< This thread's thread pool data structure - PoolExec* const m_team_exec; ///< Team thread's thread pool data structure - int64_t m_sync_mask; - int64_t mutable m_sync_value; - int mutable m_sync_step; - int m_group_rank; ///< Which "team" subset of thread pool - int m_team_rank; ///< Which thread within a team - int m_team_size; - - TaskExec(); - TaskExec(PoolExec& arg_exec, int arg_team_size); - - void team_barrier_impl() const; - - public: - KOKKOS_FUNCTION void* team_shared() const { - KOKKOS_IF_ON_HOST( - (return m_team_exec ? m_team_exec->scratch_thread() : nullptr;)) - - KOKKOS_IF_ON_DEVICE((return nullptr;)) - } - - KOKKOS_FUNCTION int team_shared_size() const { - KOKKOS_IF_ON_HOST( - (return m_team_exec ? m_team_exec->scratch_thread_size() : 0;)) - - KOKKOS_IF_ON_DEVICE((return 0;)) - } - - /**\brief Whole team enters this function call - * before any teeam member returns from - * this function call. - */ - KOKKOS_FUNCTION void team_barrier() const { - KOKKOS_IF_ON_HOST((if (1 < m_team_size) { team_barrier_impl(); })) - } - - KOKKOS_INLINE_FUNCTION - int team_rank() const { return m_team_rank; } - - KOKKOS_INLINE_FUNCTION - int team_size() const { return m_team_size; } -}; - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { - -template -KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct< - iType, Impl::TaskExec > -TeamThreadRange(Impl::TaskExec& thread, - const iType& count) { - return Impl::TeamThreadRangeBoundariesStruct< - iType, Impl::TaskExec >(thread, - count); -} - -template -KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct< - iType, Impl::TaskExec > -TeamThreadRange(Impl::TaskExec& thread, - const iType& start, const iType& end) { - return Impl::TeamThreadRangeBoundariesStruct< - iType, Impl::TaskExec >(thread, start, - end); -} - -/** \brief Inter-thread parallel_for. Executes lambda(iType i) for each - * i=0..N-1. - * - * The range i=0..N-1 is mapped to all threads of the the calling thread team. - */ -template -KOKKOS_INLINE_FUNCTION void parallel_for( - const Impl::TeamThreadRangeBoundariesStruct< - iType, Impl::TaskExec >& - loop_boundaries, - const Lambda& lambda) { - for (iType i = loop_boundaries.start; i < loop_boundaries.end; - i += loop_boundaries.increment) { - lambda(i); - } -} - -template -KOKKOS_INLINE_FUNCTION void parallel_reduce( - const Impl::TeamThreadRangeBoundariesStruct< - iType, Impl::TaskExec >& - loop_boundaries, - const Lambda& lambda, ValueType& initialized_result) { - int team_rank = - loop_boundaries.thread.team_rank(); // member num within the team - ValueType result = initialized_result; - - for (iType i = loop_boundaries.start; i < loop_boundaries.end; - i += loop_boundaries.increment) { - lambda(i, result); - } - - if (1 < loop_boundaries.thread.team_size()) { - ValueType* shared = (ValueType*)loop_boundaries.thread.team_shared(); - - loop_boundaries.thread.team_barrier(); - shared[team_rank] = result; - - loop_boundaries.thread.team_barrier(); - - // reduce across threads to thread 0 - if (team_rank == 0) { - for (int i = 1; i < loop_boundaries.thread.team_size(); i++) { - shared[0] += shared[i]; - } - } - - loop_boundaries.thread.team_barrier(); - - // broadcast result - initialized_result = shared[0]; - } else { - initialized_result = result; - } -} - -template -KOKKOS_INLINE_FUNCTION void parallel_reduce( - const Impl::TeamThreadRangeBoundariesStruct< - iType, Impl::TaskExec >& - loop_boundaries, - const Lambda& lambda, const JoinType& join, ValueType& initialized_result) { - int team_rank = - loop_boundaries.thread.team_rank(); // member num within the team - ValueType result = initialized_result; - - for (iType i = loop_boundaries.start; i < loop_boundaries.end; - i += loop_boundaries.increment) { - lambda(i, result); - } - - if (1 < loop_boundaries.thread.team_size()) { - ValueType* shared = (ValueType*)loop_boundaries.thread.team_shared(); - - loop_boundaries.thread.team_barrier(); - shared[team_rank] = result; - - loop_boundaries.thread.team_barrier(); - - // reduce across threads to thread 0 - if (team_rank == 0) { - for (int i = 1; i < loop_boundaries.thread.team_size(); i++) { - join(shared[0], shared[i]); - } - } - - loop_boundaries.thread.team_barrier(); - - // broadcast result - initialized_result = shared[0]; - } else { - initialized_result = result; - } -} - -// placeholder for future function -template -KOKKOS_INLINE_FUNCTION void parallel_reduce( - const Impl::ThreadVectorRangeBoundariesStruct< - iType, Impl::TaskExec >& - loop_boundaries, - const Lambda& lambda, ValueType& initialized_result) {} - -// placeholder for future function -template -KOKKOS_INLINE_FUNCTION void parallel_reduce( - const Impl::ThreadVectorRangeBoundariesStruct< - iType, Impl::TaskExec >& - loop_boundaries, - const Lambda& lambda, const JoinType& join, ValueType& initialized_result) { -} - -template -KOKKOS_INLINE_FUNCTION void parallel_scan( - const Impl::TeamThreadRangeBoundariesStruct< - iType, Impl::TaskExec >& - loop_boundaries, - const Lambda& lambda) { - ValueType accum = 0; - ValueType val, local_total; - ValueType* shared = (ValueType*)loop_boundaries.thread.team_shared(); - int team_size = loop_boundaries.thread.team_size(); - int team_rank = - loop_boundaries.thread.team_rank(); // member num within the team - - // Intra-member scan - for (iType i = loop_boundaries.start; i < loop_boundaries.end; - i += loop_boundaries.increment) { - local_total = 0; - lambda(i, local_total, false); - val = accum; - lambda(i, val, true); - accum += local_total; - } - - shared[team_rank] = accum; - loop_boundaries.thread.team_barrier(); - - // Member 0 do scan on accumulated totals - if (team_rank == 0) { - for (iType i = 1; i < team_size; i += 1) { - shared[i] += shared[i - 1]; - } - accum = 0; // Member 0 set accum to 0 in preparation for inter-member scan - } - - loop_boundaries.thread.team_barrier(); - - // Inter-member scan adding in accumulated totals - if (team_rank != 0) { - accum = shared[team_rank - 1]; - } - for (iType i = loop_boundaries.start; i < loop_boundaries.end; - i += loop_boundaries.increment) { - local_total = 0; - lambda(i, local_total, false); - val = accum; - lambda(i, val, true); - accum += local_total; - } -} - -// placeholder for future function -template -KOKKOS_INLINE_FUNCTION void parallel_scan( - const Impl::ThreadVectorRangeBoundariesStruct< - iType, Impl::TaskExec >& - loop_boundaries, - const Lambda& lambda) {} - -} /* namespace Kokkos */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */ -#endif /* #ifndef KOKKOS_IMPL_OPENMP_TASK_HPP */ diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL.cpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL.cpp index 4de6931918e4..2583a1cdc047 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL.cpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL.cpp @@ -46,7 +46,6 @@ struct Container { } // namespace namespace Kokkos { -namespace Experimental { SYCL::SYCL() : m_space_instance(&Impl::SYCLInternal::singleton(), [](Impl::SYCLInternal*) {}) { @@ -100,6 +99,11 @@ void SYCL::print_configuration(std::ostream& os, bool verbose) const { #else os << "macro KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED : undefined\n"; #endif +#ifdef KOKKOS_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE + os << "macro KOKKOS_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE : defined\n"; +#else + os << "macro KOKKOS_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE : undefined\n"; +#endif #ifdef SYCL_EXT_ONEAPI_DEVICE_GLOBAL os << "macro SYCL_EXT_ONEAPI_DEVICE_GLOBAL : defined\n"; #else @@ -172,8 +176,7 @@ void SYCL::fence(const std::string& name) const { } void SYCL::impl_static_fence(const std::string& name) { - Kokkos::Tools::Experimental::Impl::profile_fence_event< - Kokkos::Experimental::SYCL>( + Kokkos::Tools::Experimental::Impl::profile_fence_event( name, Kokkos::Tools::Experimental::SpecialSynchronizationCases:: GlobalDeviceSynchronization, @@ -261,8 +264,6 @@ std::ostream& SYCL::impl_sycl_info(std::ostream& os, << device.get_info() << "\nImage Max Buffer Size: " << device.get_info() - << "\nImage Max Array Size: " - << device.get_info() << "\nMax Samplers: " << device.get_info() << "\nMax Parameter Size: " << device.get_info() @@ -317,5 +318,4 @@ int g_sycl_space_factory_initialized = Kokkos::Impl::initialize_space_factory("170_SYCL"); } // namespace Impl -} // namespace Experimental } // namespace Kokkos diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL.hpp index 0f3d1f0994df..937dcceab483 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL.hpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL.hpp @@ -39,7 +39,6 @@ static_assert(false, #include namespace Kokkos { -namespace Experimental { namespace Impl { class SYCLInternal; } @@ -91,9 +90,8 @@ class SYCL { /** \brief Wait until all dispatched functors complete. A noop for OpenMP. */ static void impl_static_fence(const std::string& name); - void fence( - const std::string& name = - "Kokkos::Experimental::SYCL::fence: Unnamed Instance Fence") const; + void fence(const std::string& name = + "Kokkos::SYCL::fence: Unnamed Instance Fence") const; /// \brief Print configuration information to the given output stream. void print_configuration(std::ostream& os, bool verbose = false) const; @@ -131,15 +129,13 @@ class SYCL { Kokkos::Impl::HostSharedPtr m_space_instance; }; -} // namespace Experimental - namespace Tools { namespace Experimental { template <> -struct DeviceTypeTraits { +struct DeviceTypeTraits { /// \brief An ID to differentiate (for example) Serial from OpenMP in Tooling static constexpr DeviceType id = DeviceType::SYCL; - static int device_id(const Kokkos::Experimental::SYCL& exec) { + static int device_id(const Kokkos::SYCL& exec) { return exec.impl_internal_space_instance()->m_syclDev; } }; @@ -185,10 +181,11 @@ std::vector partition_space(const SYCL& sycl_space, return instances; } +} // namespace Experimental + namespace Impl { std::vector get_sycl_devices(); } // namespace Impl -} // namespace Experimental } // namespace Kokkos diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_DeepCopy.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_DeepCopy.hpp index afc7eebd3881..a9e2eca4fb3a 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_DeepCopy.hpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_DeepCopy.hpp @@ -28,37 +28,34 @@ namespace Kokkos { namespace Impl { void DeepCopySYCL(void* dst, const void* src, size_t n); -void DeepCopyAsyncSYCL(const Kokkos::Experimental::SYCL& instance, void* dst, - const void* src, size_t n); +void DeepCopyAsyncSYCL(const Kokkos::SYCL& instance, void* dst, const void* src, + size_t n); void DeepCopyAsyncSYCL(void* dst, const void* src, size_t n); template -struct DeepCopy::value>> { DeepCopy(void* dst, const void* src, size_t n) { DeepCopySYCL(dst, src, n); } - DeepCopy(const Kokkos::Experimental::SYCL& instance, void* dst, - const void* src, size_t n) { + DeepCopy(const Kokkos::SYCL& instance, void* dst, const void* src, size_t n) { DeepCopyAsyncSYCL(instance, dst, src, n); } }; template -struct DeepCopy::value>> { DeepCopy(void* dst, const void* src, size_t n) { DeepCopySYCL(dst, src, n); } - DeepCopy(const Kokkos::Experimental::SYCL& instance, void* dst, - const void* src, size_t n) { + DeepCopy(const Kokkos::SYCL& instance, void* dst, const void* src, size_t n) { DeepCopyAsyncSYCL(instance, dst, src, n); } }; template -struct DeepCopy::value && is_sycl_type_space::value>> { DeepCopy(void* dst, const void* src, size_t n) { DeepCopySYCL(dst, src, n); } - DeepCopy(const Kokkos::Experimental::SYCL& instance, void* dst, - const void* src, size_t n) { + DeepCopy(const Kokkos::SYCL& instance, void* dst, const void* src, size_t n) { DeepCopyAsyncSYCL(instance, dst, src, n); } }; @@ -66,10 +63,9 @@ struct DeepCopy struct DeepCopy< MemSpace1, MemSpace2, ExecutionSpace, - std::enable_if_t< - is_sycl_type_space::value && - is_sycl_type_space::value && - !std::is_same::value>> { + std::enable_if_t::value && + is_sycl_type_space::value && + !std::is_same::value>> { inline DeepCopy(void* dst, const void* src, size_t n) { DeepCopySYCL(dst, src, n); } @@ -93,9 +89,8 @@ struct DeepCopy< template struct DeepCopy< MemSpace, HostSpace, ExecutionSpace, - std::enable_if_t< - is_sycl_type_space::value && - !std::is_same::value>> { + std::enable_if_t::value && + !std::is_same::value>> { inline DeepCopy(void* dst, const void* src, size_t n) { DeepCopySYCL(dst, src, n); } @@ -118,9 +113,8 @@ struct DeepCopy< template struct DeepCopy< HostSpace, MemSpace, ExecutionSpace, - std::enable_if_t< - is_sycl_type_space::value && - !std::is_same::value>> { + std::enable_if_t::value && + !std::is_same::value>> { inline DeepCopy(void* dst, const void* src, size_t n) { DeepCopySYCL(dst, src, n); } diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_GraphNodeKernel.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_GraphNodeKernel.hpp index 9c39df941592..54ca64599532 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_GraphNodeKernel.hpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_GraphNodeKernel.hpp @@ -32,30 +32,29 @@ namespace Impl { template -class GraphNodeKernelImpl - : public PatternImplSpecializationFromTag< - PatternTag, Functor, PolicyType, Args..., - Kokkos::Experimental::SYCL>::type { +class GraphNodeKernelImpl + : public PatternImplSpecializationFromTag::type { public: using Policy = PolicyType; using graph_kernel = GraphNodeKernelImpl; - using base_t = typename PatternImplSpecializationFromTag< - PatternTag, Functor, Policy, Args..., Kokkos::Experimental::SYCL>::type; + using base_t = + typename PatternImplSpecializationFromTag::type; // TODO use the name and executionspace template - GraphNodeKernelImpl(std::string, Kokkos::Experimental::SYCL const&, - Functor arg_functor, PolicyDeduced&& arg_policy, - ArgsDeduced&&... args) - : base_t(std::move(arg_functor), (PolicyDeduced &&) arg_policy, - (ArgsDeduced &&) args...) {} + GraphNodeKernelImpl(std::string, Kokkos::SYCL const&, Functor arg_functor, + PolicyDeduced&& arg_policy, ArgsDeduced&&... args) + : base_t(std::move(arg_functor), (PolicyDeduced&&)arg_policy, + (ArgsDeduced&&)args...) {} template - GraphNodeKernelImpl(Kokkos::Experimental::SYCL const& exec_space, - Functor arg_functor, PolicyDeduced&& arg_policy) + GraphNodeKernelImpl(Kokkos::SYCL const& exec_space, Functor arg_functor, + PolicyDeduced&& arg_policy) : GraphNodeKernelImpl("", exec_space, std::move(arg_functor), - (PolicyDeduced &&) arg_policy) {} + (PolicyDeduced&&)arg_policy) {} void set_sycl_graph_ptr( sycl::ext::oneapi::experimental::command_graph< @@ -102,14 +101,14 @@ template ::type> struct get_graph_node_kernel_type - : type_identity> {}; + : type_identity< + GraphNodeKernelImpl> {}; template struct get_graph_node_kernel_type : type_identity, Kokkos::ParallelReduceTag>> {}; diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_GraphNode_Impl.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_GraphNode_Impl.hpp index 6bbe6711a2e8..828f1cacb4ac 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_GraphNode_Impl.hpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_GraphNode_Impl.hpp @@ -28,7 +28,7 @@ namespace Kokkos { namespace Impl { template <> -struct GraphNodeBackendSpecificDetails { +struct GraphNodeBackendSpecificDetails { std::optional node; explicit GraphNodeBackendSpecificDetails() = default; @@ -38,16 +38,16 @@ struct GraphNodeBackendSpecificDetails { }; template -struct GraphNodeBackendDetailsBeforeTypeErasure { +struct GraphNodeBackendDetailsBeforeTypeErasure { protected: GraphNodeBackendDetailsBeforeTypeErasure( - Kokkos::Experimental::SYCL const &, Kernel &, PredecessorRef const &, - GraphNodeBackendSpecificDetails &) noexcept {} + Kokkos::SYCL const &, Kernel &, PredecessorRef const &, + GraphNodeBackendSpecificDetails &) noexcept {} GraphNodeBackendDetailsBeforeTypeErasure( - Kokkos::Experimental::SYCL const &, _graph_node_is_root_ctor_tag, - GraphNodeBackendSpecificDetails &) noexcept {} + Kokkos::SYCL const &, _graph_node_is_root_ctor_tag, + GraphNodeBackendSpecificDetails &) noexcept {} }; } // namespace Impl diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Graph_Impl.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Graph_Impl.hpp index 1dc4a9c99739..dc63052dd7a7 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Graph_Impl.hpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Graph_Impl.hpp @@ -31,29 +31,28 @@ namespace Kokkos { namespace Impl { template <> -class GraphImpl { +class GraphImpl { public: - using node_details_t = - GraphNodeBackendSpecificDetails; - using root_node_impl_t = GraphNodeImpl; + using node_details_t = GraphNodeBackendSpecificDetails; + using root_node_impl_t = + GraphNodeImpl; using aggregate_kernel_impl_t = SYCLGraphNodeAggregateKernel; using aggregate_node_impl_t = - GraphNodeImpl; // Not movable or copyable; it spends its whole life as a shared_ptr in the // Graph object. - GraphImpl() = delete; - GraphImpl(GraphImpl const&) = delete; - GraphImpl(GraphImpl&&) = delete; + GraphImpl() = delete; + GraphImpl(GraphImpl const&) = delete; + GraphImpl(GraphImpl&&) = delete; GraphImpl& operator=(GraphImpl const&) = delete; - GraphImpl& operator=(GraphImpl&&) = delete; + GraphImpl& operator=(GraphImpl&&) = delete; ~GraphImpl(); - explicit GraphImpl(Kokkos::Experimental::SYCL instance); + explicit GraphImpl(Kokkos::SYCL instance); void add_node(std::shared_ptr const& arg_node_ptr); @@ -63,19 +62,25 @@ class GraphImpl { template void add_predecessor(NodeImplPtr arg_node_ptr, PredecessorRef arg_pred_ref); - void submit(); + void submit(const Kokkos::SYCL& exec); - Kokkos::Experimental::SYCL const& get_execution_space() const noexcept; + Kokkos::SYCL const& get_execution_space() const noexcept; auto create_root_node_ptr(); template auto create_aggregate_ptr(PredecessorRefs&&...); - private: - void instantiate_graph() { m_graph_exec = m_graph.finalize(); } + void instantiate() { + KOKKOS_EXPECTS(!m_graph_exec.has_value()); + m_graph_exec = m_graph.finalize(); + } - Kokkos::Experimental::SYCL m_execution_space; + auto& sycl_graph() { return m_graph; } + auto& sycl_graph_exec() { return m_graph_exec; } + + private: + Kokkos::SYCL m_execution_space; sycl::ext::oneapi::experimental::command_graph< sycl::ext::oneapi::experimental::graph_state::modifiable> m_graph; @@ -84,17 +89,16 @@ class GraphImpl { m_graph_exec; }; -inline GraphImpl::~GraphImpl() { +inline GraphImpl::~GraphImpl() { m_execution_space.fence("Kokkos::GraphImpl::~GraphImpl: Graph Destruction"); } -inline GraphImpl::GraphImpl( - Kokkos::Experimental::SYCL instance) +inline GraphImpl::GraphImpl(Kokkos::SYCL instance) : m_execution_space(std::move(instance)), m_graph(m_execution_space.sycl_queue().get_context(), m_execution_space.sycl_queue().get_device()) {} -inline void GraphImpl::add_node( +inline void GraphImpl::add_node( std::shared_ptr const& arg_node_ptr) { // add an empty node that needs to be set up before finalizing the graph arg_node_ptr->node_details_t::node = m_graph.add(); @@ -103,7 +107,7 @@ inline void GraphImpl::add_node( // Requires NodeImplPtr is a shared_ptr to specialization of GraphNodeImpl // Also requires that the kernel has the graph node tag in its policy template -inline void GraphImpl::add_node( +inline void GraphImpl::add_node( std::shared_ptr const& arg_node_ptr) { static_assert(NodeImpl::kernel_type::Policy::is_graph_kernel::value); KOKKOS_EXPECTS(arg_node_ptr); @@ -122,7 +126,7 @@ inline void GraphImpl::add_node( // already been added to this graph and NodeImpl is a specialization of // GraphNodeImpl that has already been added to this graph. template -inline void GraphImpl::add_predecessor( +inline void GraphImpl::add_predecessor( NodeImplPtr arg_node_ptr, PredecessorRef arg_pred_ref) { KOKKOS_EXPECTS(arg_node_ptr); auto pred_ptr = GraphAccess::get_node_ptr(arg_pred_ref); @@ -137,19 +141,19 @@ inline void GraphImpl::add_predecessor( m_graph.make_edge(*pred_node, *node); } -inline void GraphImpl::submit() { +inline void GraphImpl::submit(const Kokkos::SYCL& exec) { if (!m_graph_exec) { - instantiate_graph(); + instantiate(); } - m_execution_space.sycl_queue().ext_oneapi_graph(*m_graph_exec); + exec.sycl_queue().ext_oneapi_graph(*m_graph_exec); } -inline Kokkos::Experimental::SYCL const& -GraphImpl::get_execution_space() const noexcept { +inline Kokkos::SYCL const& GraphImpl::get_execution_space() + const noexcept { return m_execution_space; } -inline auto GraphImpl::create_root_node_ptr() { +inline auto GraphImpl::create_root_node_ptr() { KOKKOS_EXPECTS(!m_graph_exec); auto rv = std::make_shared(get_execution_space(), _graph_node_is_root_ctor_tag{}); @@ -158,7 +162,7 @@ inline auto GraphImpl::create_root_node_ptr() { } template -inline auto GraphImpl::create_aggregate_ptr( +inline auto GraphImpl::create_aggregate_ptr( PredecessorRefs&&...) { // The attachment to predecessors, which is all we really need, happens // in the generic layer, which calls through to add_predecessor for diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.cpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.cpp index 5843dca81239..5af1330d9390 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.cpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.cpp @@ -24,14 +24,12 @@ #include namespace Kokkos { -namespace Experimental { namespace Impl { namespace { // FIXME_SYCL Should be a multiple of the maximum subgroup size. -static constexpr auto sizeScratchGrain = - sizeof(Kokkos::Experimental::SYCL::size_type[32]); +static constexpr auto sizeScratchGrain = sizeof(Kokkos::SYCL::size_type[32]); std::size_t scratch_count(const std::size_t size) { return (size + sizeScratchGrain - 1) / sizeScratchGrain; @@ -55,8 +53,8 @@ Kokkos::View sycl_global_unique_token_locks( SYCLInternal::~SYCLInternal() { if (!was_finalized || m_scratchSpace || m_scratchHost || m_scratchFlags) { - std::cerr << "Kokkos::Experimental::SYCL ERROR: Failed to call " - "Kokkos::Experimental::SYCL::finalize()" + std::cerr << "Kokkos::SYCL ERROR: Failed to call " + "Kokkos::SYCL::finalize()" << std::endl; std::cerr.flush(); } @@ -64,7 +62,7 @@ SYCLInternal::~SYCLInternal() { int SYCLInternal::verify_is_initialized(const char* const label) const { if (!is_initialized()) { - Kokkos::abort((std::string("Kokkos::Experimental::SYCL::") + label + + Kokkos::abort((std::string("Kokkos::SYCL::") + label + " : ERROR device not initialized\n") .c_str()); } @@ -171,12 +169,12 @@ Kokkos::Impl::sycl_device_ptr SYCLInternal::resize_team_scratch_space( // Multiple ParallelFor/Reduce Teams can call this function at the same time // and invalidate the m_team_scratch_ptr. We use a pool to avoid any race // condition. - auto mem_space = Kokkos::Experimental::SYCLDeviceUSMSpace(*m_queue); + auto mem_space = Kokkos::SYCLDeviceUSMSpace(*m_queue); if (m_team_scratch_current_size[scratch_pool_id] == 0 && bytes > 0) { m_team_scratch_current_size[scratch_pool_id] = bytes; - m_team_scratch_ptr[scratch_pool_id] = mem_space.allocate( - "Kokkos::Experimental::SYCL::InternalTeamScratchMemory", - m_team_scratch_current_size[scratch_pool_id]); + m_team_scratch_ptr[scratch_pool_id] = + mem_space.allocate("Kokkos::SYCL::InternalTeamScratchMemory", + m_team_scratch_current_size[scratch_pool_id]); } if ((bytes > m_team_scratch_current_size[scratch_pool_id]) || ((bytes < m_team_scratch_current_size[scratch_pool_id]) && @@ -184,9 +182,9 @@ Kokkos::Impl::sycl_device_ptr SYCLInternal::resize_team_scratch_space( mem_space.deallocate(m_team_scratch_ptr[scratch_pool_id], m_team_scratch_current_size[scratch_pool_id]); m_team_scratch_current_size[scratch_pool_id] = bytes; - m_team_scratch_ptr[scratch_pool_id] = mem_space.allocate( - "Kokkos::Experimental::SYCL::InternalTeamScratchMemory", - m_team_scratch_current_size[scratch_pool_id]); + m_team_scratch_ptr[scratch_pool_id] = + mem_space.allocate("Kokkos::SYCL::InternalTeamScratchMemory", + m_team_scratch_current_size[scratch_pool_id]); } return m_team_scratch_ptr[scratch_pool_id]; } @@ -255,7 +253,7 @@ Kokkos::Impl::sycl_device_ptr SYCLInternal::scratch_space( const std::size_t size) { if (verify_is_initialized("scratch_space") && m_scratchSpaceCount < scratch_count(size)) { - auto mem_space = Kokkos::Experimental::SYCLDeviceUSMSpace(*m_queue); + auto mem_space = Kokkos::SYCLDeviceUSMSpace(*m_queue); if (nullptr != m_scratchSpace) mem_space.deallocate(m_scratchSpace, @@ -265,8 +263,8 @@ Kokkos::Impl::sycl_device_ptr SYCLInternal::scratch_space( std::size_t alloc_size = Kokkos::Impl::multiply_overflow_abort( m_scratchSpaceCount, sizeScratchGrain); - m_scratchSpace = static_cast(mem_space.allocate( - "Kokkos::Experimental::SYCL::InternalScratchSpace", alloc_size)); + m_scratchSpace = static_cast( + mem_space.allocate("Kokkos::SYCL::InternalScratchSpace", alloc_size)); } return m_scratchSpace; @@ -276,7 +274,7 @@ Kokkos::Impl::sycl_host_ptr SYCLInternal::scratch_host( const std::size_t size) { if (verify_is_initialized("scratch_unified") && m_scratchHostCount < scratch_count(size)) { - auto mem_space = Kokkos::Experimental::SYCLHostUSMSpace(*m_queue); + auto mem_space = Kokkos::SYCLHostUSMSpace(*m_queue); if (nullptr != m_scratchHost) mem_space.deallocate(m_scratchHost, @@ -286,8 +284,8 @@ Kokkos::Impl::sycl_host_ptr SYCLInternal::scratch_host( std::size_t alloc_size = Kokkos::Impl::multiply_overflow_abort( m_scratchHostCount, sizeScratchGrain); - m_scratchHost = static_cast(mem_space.allocate( - "Kokkos::Experimental::SYCL::InternalScratchHost", alloc_size)); + m_scratchHost = static_cast( + mem_space.allocate("Kokkos::SYCL::InternalScratchHost", alloc_size)); } return m_scratchHost; @@ -297,7 +295,7 @@ Kokkos::Impl::sycl_device_ptr SYCLInternal::scratch_flags( const std::size_t size) { if (verify_is_initialized("scratch_flags") && m_scratchFlagsCount < scratch_count(size)) { - auto mem_space = Kokkos::Experimental::SYCLDeviceUSMSpace(*m_queue); + auto mem_space = Kokkos::SYCLDeviceUSMSpace(*m_queue); if (nullptr != m_scratchFlags) mem_space.deallocate(m_scratchFlags, @@ -307,8 +305,8 @@ Kokkos::Impl::sycl_device_ptr SYCLInternal::scratch_flags( std::size_t alloc_size = Kokkos::Impl::multiply_overflow_abort( m_scratchFlagsCount, sizeScratchGrain); - m_scratchFlags = static_cast(mem_space.allocate( - "Kokkos::Experimental::SYCL::InternalScratchFlags", alloc_size)); + m_scratchFlags = static_cast( + mem_space.allocate("Kokkos::SYCL::InternalScratchFlags", alloc_size)); // We only zero-initialize the allocation when we actually allocate. // It's the responsibility of the features using scratch_flags, @@ -326,8 +324,7 @@ Kokkos::Impl::sycl_device_ptr SYCLInternal::scratch_flags( template void SYCLInternal::fence_helper(WAT& wat, const std::string& name, uint32_t instance_id) { - Kokkos::Tools::Experimental::Impl::profile_fence_event< - Kokkos::Experimental::SYCL>( + Kokkos::Tools::Experimental::Impl::profile_fence_event( name, Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{instance_id}, [&]() { try { @@ -364,8 +361,7 @@ size_t SYCLInternal::USMObjectMem::reserve(size_t n) { AllocationSpace alloc_space(*m_q); if (m_data) alloc_space.deallocate(m_data, m_capacity); - m_data = - alloc_space.allocate("Kokkos::Experimental::SYCL::USMObjectMem", n); + m_data = alloc_space.allocate("Kokkos::SYCL::USMObjectMem", n); if constexpr (sycl::usm::alloc::device == Kind) m_staging.reset(new char[n]); @@ -396,5 +392,4 @@ template class SYCLInternal::USMObjectMem; template class SYCLInternal::USMObjectMem; } // namespace Impl -} // namespace Experimental } // namespace Kokkos diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.hpp index 2d784ef8a5f0..c982154a9a82 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.hpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.hpp @@ -28,7 +28,6 @@ #include #include namespace Kokkos { -namespace Experimental { namespace Impl { class SYCLInternal { @@ -38,10 +37,10 @@ class SYCLInternal { SYCLInternal() = default; ~SYCLInternal(); - SYCLInternal(const SYCLInternal&) = delete; + SYCLInternal(const SYCLInternal&) = delete; SYCLInternal& operator=(const SYCLInternal&) = delete; - SYCLInternal& operator=(SYCLInternal&&) = delete; - SYCLInternal(SYCLInternal&&) = delete; + SYCLInternal& operator=(SYCLInternal&&) = delete; + SYCLInternal(SYCLInternal&&) = delete; Kokkos::Impl::sycl_device_ptr scratch_space(const std::size_t size); Kokkos::Impl::sycl_device_ptr scratch_flags(const std::size_t size); @@ -76,8 +75,9 @@ class SYCLInternal { mutable sycl::event m_team_scratch_event[m_n_team_scratch] = {}; mutable std::mutex m_team_scratch_mutex; - uint32_t m_instance_id = Kokkos::Tools::Experimental::Impl::idForInstance< - Kokkos::Experimental::SYCL>(reinterpret_cast(this)); + uint32_t m_instance_id = + Kokkos::Tools::Experimental::Impl::idForInstance( + reinterpret_cast(this)); std::optional m_queue; // Using std::vector> reveals a compiler bug when @@ -102,9 +102,9 @@ class SYCLInternal { explicit USMObjectMem(sycl::queue q, uint32_t instance_id) noexcept : m_q(std::move(q)), m_instance_id(instance_id) {} - USMObjectMem(USMObjectMem const&) = delete; - USMObjectMem(USMObjectMem&&) = delete; - USMObjectMem& operator=(USMObjectMem&&) = delete; + USMObjectMem(USMObjectMem const&) = delete; + USMObjectMem(USMObjectMem&&) = delete; + USMObjectMem& operator=(USMObjectMem&&) = delete; USMObjectMem& operator=(USMObjectMem const&) = delete; ~USMObjectMem() { reset(); }; @@ -119,12 +119,12 @@ class SYCLInternal { size_t reserve(size_t n); private: - using AllocationSpace = std::conditional_t< - Kind == sycl::usm::alloc::device, - Kokkos::Experimental::SYCLDeviceUSMSpace, - std::conditional_t>; + using AllocationSpace = + std::conditional_t>; public: // Performs either sycl::memcpy (for USM device memory) or std::memcpy @@ -144,11 +144,10 @@ class SYCLInternal { } void fence() { - SYCLInternal::fence( - m_last_event, - "Kokkos::Experimental::SYCLInternal::USMObject fence to wait for " - "last event to finish", - m_instance_id); + SYCLInternal::fence(m_last_event, + "Kokkos::SYCLInternal::USMObject fence to wait for " + "last event to finish", + m_instance_id); } void register_event(sycl::event event) { @@ -324,13 +323,12 @@ auto make_sycl_function_wrapper(const Functor& functor, Storage& storage) { return SYCLFunctionWrapper(functor, storage); } } // namespace Impl -} // namespace Experimental } // namespace Kokkos #if defined(SYCL_DEVICE_COPYABLE) && defined(KOKKOS_ARCH_INTEL_GPU) template struct sycl::is_device_copyable< - Kokkos::Experimental::Impl::SYCLFunctionWrapper> + Kokkos::Impl::SYCLFunctionWrapper> : std::true_type {}; #if (defined(__INTEL_LLVM_COMPILER) && __INTEL_LLVM_COMPILER < 20240000) || \ @@ -352,8 +350,7 @@ static_assert( template struct sycl::is_device_copyable< - const Kokkos::Experimental::Impl::SYCLFunctionWrapper, + const Kokkos::Impl::SYCLFunctionWrapper, std::enable_if_t>>> : std::true_type {}; diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_MDRangePolicy.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_MDRangePolicy.hpp index d212e2dacc3a..9498513a3e8c 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_MDRangePolicy.hpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_MDRangePolicy.hpp @@ -22,13 +22,13 @@ namespace Kokkos { template <> -struct default_outer_direction { +struct default_outer_direction { using type = Iterate; static constexpr Iterate value = Iterate::Left; }; template <> -struct default_inner_direction { +struct default_inner_direction { using type = Iterate; static constexpr Iterate value = Iterate::Left; }; @@ -37,8 +37,8 @@ namespace Impl { // Settings for MDRangePolicy template <> -inline TileSizeProperties get_tile_size_properties( - const Kokkos::Experimental::SYCL& space) { +inline TileSizeProperties get_tile_size_properties( + const Kokkos::SYCL& space) { TileSizeProperties properties; properties.max_threads = space.impl_internal_space_instance()->m_maxWorkgroupSize; @@ -50,8 +50,7 @@ inline TileSizeProperties get_tile_size_properties( // Settings for TeamMDRangePolicy template -struct ThreadAndVectorNestLevel +struct ThreadAndVectorNestLevel : AcceleratorBasedNestLevel {}; } // namespace Impl diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp index cb7b1048da35..3dbd63d81ad5 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp @@ -25,7 +25,7 @@ template class Kokkos::Impl::ParallelFor, - Kokkos::Experimental::SYCL> { + Kokkos::SYCL> { public: using Policy = Kokkos::MDRangePolicy; @@ -54,7 +54,7 @@ class Kokkos::Impl::ParallelFor, const typename Policy::index_type m_num_tiles; static constexpr Iterate inner_direction = Policy::inner_direction; } m_policy; - const Kokkos::Experimental::SYCL& m_space; + const Kokkos::SYCL& m_space; sycl::nd_range<3> compute_ranges() const { const auto& m_tile = m_policy.m_tile; @@ -180,12 +180,11 @@ class Kokkos::Impl::ParallelFor, } void execute() const { - Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem& - indirectKernelMem = - m_space.impl_internal_space_instance()->get_indirect_kernel_mem(); + Kokkos::Impl::SYCLInternal::IndirectKernelMem& indirectKernelMem = + m_space.impl_internal_space_instance()->get_indirect_kernel_mem(); - auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper( - m_functor, indirectKernelMem); + auto functor_wrapper = + Impl::make_sycl_function_wrapper(m_functor, indirectKernelMem); sycl::event event = sycl_direct_launch(functor_wrapper, functor_wrapper.get_copy_event()); functor_wrapper.register_event(event); diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp index 8ef43d392c6a..da75f3e901d1 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp @@ -67,7 +67,7 @@ struct FunctorWrapperRangePolicyParallelForCustom { template class Kokkos::Impl::ParallelFor, - Kokkos::Experimental::SYCL> { + Kokkos::SYCL> { public: using Policy = Kokkos::RangePolicy; @@ -82,8 +82,8 @@ class Kokkos::Impl::ParallelFor, sycl::event sycl_direct_launch(const Policy& policy, const Functor& functor, const sycl::event& memcpy_event) const { // Convenience references - const Kokkos::Experimental::SYCL& space = policy.space(); - sycl::queue& q = space.sycl_queue(); + const Kokkos::SYCL& space = policy.space(); + sycl::queue& q = space.sycl_queue(); desul::ensure_sycl_lock_arrays_on_device(q); @@ -160,13 +160,13 @@ class Kokkos::Impl::ParallelFor, void execute() const { if (m_policy.begin() == m_policy.end()) return; - Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem& - indirectKernelMem = m_policy.space() - .impl_internal_space_instance() - ->get_indirect_kernel_mem(); + Kokkos::Impl::SYCLInternal::IndirectKernelMem& indirectKernelMem = + m_policy.space() + .impl_internal_space_instance() + ->get_indirect_kernel_mem(); - auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper( - m_functor, indirectKernelMem); + auto functor_wrapper = + Impl::make_sycl_function_wrapper(m_functor, indirectKernelMem); sycl::event event = sycl_direct_launch(m_policy, functor_wrapper, functor_wrapper.get_copy_event()); functor_wrapper.register_event(event); diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp index cf7f582bc79f..d8859cda9f3e 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp @@ -27,11 +27,11 @@ template class Kokkos::Impl::ParallelFor, - Kokkos::Experimental::SYCL> { + Kokkos::SYCL> { public: using Policy = TeamPolicy; using functor_type = FunctorType; - using size_type = ::Kokkos::Experimental::SYCL::size_type; + using size_type = ::Kokkos::SYCL::size_type; private: using member_type = typename Policy::member_type; @@ -52,8 +52,8 @@ class Kokkos::Impl::ParallelFor, const FunctorWrapper& functor_wrapper, const sycl::event& memcpy_event) const { // Convenience references - const Kokkos::Experimental::SYCL& space = m_policy.space(); - sycl::queue& q = space.sycl_queue(); + const Kokkos::SYCL& space = m_policy.space(); + sycl::queue& q = space.sycl_queue(); desul::ensure_sycl_lock_arrays_on_device(q); @@ -146,11 +146,11 @@ class Kokkos::Impl::ParallelFor, scratch_pool_id, static_cast(m_scratch_size[1]) * m_league_size)); - Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem& - indirectKernelMem = instance.get_indirect_kernel_mem(); + Kokkos::Impl::SYCLInternal::IndirectKernelMem& indirectKernelMem = + instance.get_indirect_kernel_mem(); - auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper( - m_functor, indirectKernelMem); + auto functor_wrapper = + Impl::make_sycl_function_wrapper(m_functor, indirectKernelMem); sycl::event event = sycl_direct_launch(global_scratch_ptr, functor_wrapper, functor_wrapper.get_copy_event()); @@ -164,10 +164,14 @@ class Kokkos::Impl::ParallelFor, m_league_size(arg_policy.league_size()), m_team_size(arg_policy.team_size()), m_vector_size(arg_policy.impl_vector_length()) { - // FIXME_SYCL optimize - if (m_team_size < 0) + if (m_team_size < 0) { m_team_size = m_policy.team_size_recommended(arg_functor, ParallelForTag{}); + if (m_team_size <= 0) + Kokkos::Impl::throw_runtime_exception( + "Kokkos::Impl::ParallelFor could not find a " + "valid execution configuration."); + } m_shmem_begin = (sizeof(double) * (m_team_size + 2)); m_shmem_size = diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp index 0774b24bca16..1e313549757b 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp @@ -30,7 +30,7 @@ template class Kokkos::Impl::ParallelReduce, - Kokkos::Experimental::SYCL> { + Kokkos::SYCL> { public: using Policy = Kokkos::MDRangePolicy; using FunctorType = typename CombinedFunctorReducerType::functor_type; @@ -76,7 +76,7 @@ class Kokkos::Impl::ParallelReduce::accessible) {} private: @@ -85,7 +85,7 @@ class Kokkos::Impl::ParallelReduce scratch_buffers_lock( instance.m_mutexScratchSpace); - using IndirectKernelMem = - Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem; + using IndirectKernelMem = Kokkos::Impl::SYCLInternal::IndirectKernelMem; IndirectKernelMem& indirectKernelMem = instance.get_indirect_kernel_mem(); auto functor_reducer_wrapper = - Experimental::Impl::make_sycl_function_wrapper(m_functor_reducer, - indirectKernelMem); + Impl::make_sycl_function_wrapper(m_functor_reducer, indirectKernelMem); sycl::event event = sycl_direct_launch( functor_reducer_wrapper, functor_reducer_wrapper.get_copy_event()); @@ -370,7 +368,7 @@ class Kokkos::Impl::ParallelReduce template -class Kokkos::Impl::ParallelReduce, - Kokkos::Experimental::SYCL> { +class Kokkos::Impl::ParallelReduce< + CombinedFunctorReducerType, Kokkos::RangePolicy, Kokkos::SYCL> { public: using Policy = Kokkos::RangePolicy; using FunctorType = typename CombinedFunctorReducerType::functor_type; @@ -49,7 +48,7 @@ class Kokkos::Impl::ParallelReduce::accessible) {} private: @@ -59,8 +58,8 @@ class Kokkos::Impl::ParallelReduce scratch_buffers_lock( instance.m_mutexScratchSpace); - using IndirectKernelMem = - Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem; + using IndirectKernelMem = Kokkos::Impl::SYCLInternal::IndirectKernelMem; IndirectKernelMem& indirectKernelMem = instance.get_indirect_kernel_mem(); auto functor_reducer_wrapper = - Experimental::Impl::make_sycl_function_wrapper(m_functor_reducer, - indirectKernelMem); + Impl::make_sycl_function_wrapper(m_functor_reducer, indirectKernelMem); sycl::event event = sycl_direct_launch(m_policy, functor_reducer_wrapper, diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp index b443bcbf9023..8f5310cbb21c 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp @@ -29,7 +29,7 @@ template class Kokkos::Impl::ParallelReduce, - Kokkos::Experimental::SYCL> { + Kokkos::SYCL> { public: using Policy = TeamPolicy; using FunctorType = typename CombinedFunctorReducerType::functor_type; @@ -46,7 +46,7 @@ class Kokkos::Impl::ParallelReduce(m_scratch_size[1]) * m_league_size)); - using IndirectKernelMem = - Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem; + using IndirectKernelMem = Kokkos::Impl::SYCLInternal::IndirectKernelMem; IndirectKernelMem& indirectKernelMem = instance.get_indirect_kernel_mem(); auto functor_reducer_wrapper = - Experimental::Impl::make_sycl_function_wrapper(m_functor_reducer, - indirectKernelMem); + Impl::make_sycl_function_wrapper(m_functor_reducer, indirectKernelMem); sycl::event event = sycl_direct_launch(global_scratch_ptr, functor_reducer_wrapper, @@ -436,16 +434,21 @@ class Kokkos::Impl::ParallelReduce::accessible), m_league_size(arg_policy.league_size()), m_team_size(arg_policy.team_size()), m_vector_size(arg_policy.impl_vector_length()) { - // FIXME_SYCL optimize - if (m_team_size < 0) + if (m_team_size < 0) { m_team_size = m_policy.team_size_recommended( m_functor_reducer.get_functor(), m_functor_reducer.get_reducer(), ParallelReduceTag{}); + if (m_team_size <= 0) + Kokkos::Impl::throw_runtime_exception( + "Kokkos::Impl::ParallelReduce could not find a " + "valid execution configuration."); + } + // Must be a power of two greater than two, get the one not bigger than the // requested one. if ((m_team_size & m_team_size - 1) || m_team_size < 2) { @@ -461,7 +464,7 @@ class Kokkos::Impl::ParallelReduce(instance.m_maxShmemPerBlock) < m_shmem_size - m_shmem_begin) { diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp index bdb5b8837705..ed7cee2805d9 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp @@ -145,7 +145,7 @@ class ParallelScanSYCLBase { using value_type = typename Analysis::value_type; using reference_type = typename Analysis::reference_type; using functor_type = FunctorType; - using size_type = Kokkos::Experimental::SYCL::size_type; + using size_type = Kokkos::SYCL::size_type; using index_type = typename Policy::index_type; protected: @@ -161,8 +161,8 @@ class ParallelScanSYCLBase { sycl::event sycl_direct_launch(const FunctorWrapper& functor_wrapper, sycl::event memcpy_event) { // Convenience references - const Kokkos::Experimental::SYCL& space = m_policy.space(); - Kokkos::Experimental::Impl::SYCLInternal& instance = + const Kokkos::SYCL& space = m_policy.space(); + Kokkos::Impl::SYCLInternal& instance = *space.impl_internal_space_instance(); sycl::queue& q = space.sycl_queue(); @@ -374,11 +374,11 @@ class ParallelScanSYCLBase { std::scoped_lock scratch_buffers_lock( instance.m_mutexScratchSpace); - Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem& - indirectKernelMem = instance.get_indirect_kernel_mem(); + Kokkos::Impl::SYCLInternal::IndirectKernelMem& indirectKernelMem = + instance.get_indirect_kernel_mem(); - auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper( - m_functor_reducer, indirectKernelMem); + auto functor_wrapper = + Impl::make_sycl_function_wrapper(m_functor_reducer, indirectKernelMem); sycl::event event = sycl_direct_launch(functor_wrapper, functor_wrapper.get_copy_event()); @@ -399,7 +399,7 @@ class ParallelScanSYCLBase { template class Kokkos::Impl::ParallelScan, - Kokkos::Experimental::SYCL> + Kokkos::SYCL> : private ParallelScanSYCLBase { public: using Base = ParallelScanSYCLBase; @@ -417,13 +417,12 @@ class Kokkos::Impl::ParallelScan, template class Kokkos::Impl::ParallelScanWithTotal< - FunctorType, Kokkos::RangePolicy, ReturnType, - Kokkos::Experimental::SYCL> + FunctorType, Kokkos::RangePolicy, ReturnType, Kokkos::SYCL> : public ParallelScanSYCLBase { public: using Base = ParallelScanSYCLBase; - const Kokkos::Experimental::SYCL& m_exec; + const Kokkos::SYCL& m_exec; inline void execute() { Base::impl_execute([&]() { @@ -445,7 +444,7 @@ class Kokkos::Impl::ParallelScanWithTotal< const typename Base::Policy& arg_policy, const ViewType& arg_result_view) : Base(arg_functor, arg_policy, arg_result_view.data(), - MemorySpaceAccess::accessible), m_exec(arg_policy.space()) {} }; diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Space.cpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Space.cpp index 19fad29150e5..022f88e0a812 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Space.cpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Space.cpp @@ -33,11 +33,11 @@ namespace Kokkos { namespace Impl { void DeepCopySYCL(void* dst, const void* src, size_t n) { - Experimental::Impl::SYCLInternal::singleton().m_queue->memcpy(dst, src, n); + Impl::SYCLInternal::singleton().m_queue->memcpy(dst, src, n); } -void DeepCopyAsyncSYCL(const Kokkos::Experimental::SYCL& instance, void* dst, - const void* src, size_t n) { +void DeepCopyAsyncSYCL(const Kokkos::SYCL& instance, void* dst, const void* src, + size_t n) { sycl::queue& q = *instance.impl_internal_space_instance()->m_queue; auto event = q.memcpy(dst, src, n); #ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES @@ -46,9 +46,8 @@ void DeepCopyAsyncSYCL(const Kokkos::Experimental::SYCL& instance, void* dst, } void DeepCopyAsyncSYCL(void* dst, const void* src, size_t n) { - Experimental::Impl::SYCLInternal::singleton().m_queue->memcpy(dst, src, n); - Experimental::SYCL().fence( - "Kokkos::Impl::DeepCopyAsyncSYCL: fence after memcpy"); + Impl::SYCLInternal::singleton().m_queue->memcpy(dst, src, n); + SYCL().fence("Kokkos::Impl::DeepCopyAsyncSYCL: fence after memcpy"); } } // namespace Impl @@ -60,12 +59,9 @@ namespace { std::string_view get_memory_space_name(sycl::usm::alloc allocation_kind) { switch (allocation_kind) { - case sycl::usm::alloc::host: - return Kokkos::Experimental::SYCLHostUSMSpace::name(); - case sycl::usm::alloc::device: - return Kokkos::Experimental::SYCLDeviceUSMSpace::name(); - case sycl::usm::alloc::shared: - return Kokkos::Experimental::SYCLSharedUSMSpace::name(); + case sycl::usm::alloc::host: return Kokkos::SYCLHostUSMSpace::name(); + case sycl::usm::alloc::device: return Kokkos::SYCLDeviceUSMSpace::name(); + case sycl::usm::alloc::shared: return Kokkos::SYCLSharedUSMSpace::name(); default: Kokkos::abort("bug: unknown sycl allocation type"); return "unreachable"; @@ -75,7 +71,6 @@ std::string_view get_memory_space_name(sycl::usm::alloc allocation_kind) { } // namespace namespace Kokkos { -namespace Experimental { SYCLDeviceUSMSpace::SYCLDeviceUSMSpace() : m_queue(*SYCL().impl_internal_space_instance()->m_queue) {} @@ -114,12 +109,12 @@ void* allocate_sycl(const char* arg_label, const size_t arg_alloc_size, return hostPtr; } -void* SYCLDeviceUSMSpace::allocate(const Kokkos::Experimental::SYCL& exec_space, +void* SYCLDeviceUSMSpace::allocate(const Kokkos::SYCL& exec_space, const size_t arg_alloc_size) const { return allocate(exec_space, "[unlabeled]", arg_alloc_size); } -void* SYCLDeviceUSMSpace::allocate(const Kokkos::Experimental::SYCL& exec_space, +void* SYCLDeviceUSMSpace::allocate(const Kokkos::SYCL& exec_space, const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size) const { @@ -244,7 +239,6 @@ void SYCLHostUSMSpace::deallocate(const char* arg_label, Kokkos::Tools::make_space_handle(name()), m_queue); } -} // namespace Experimental } // namespace Kokkos //============================================================================== @@ -253,11 +247,11 @@ void SYCLHostUSMSpace::deallocate(const char* arg_label, #include KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( - Kokkos::Experimental::SYCLDeviceUSMSpace); + Kokkos::SYCLDeviceUSMSpace); KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( - Kokkos::Experimental::SYCLSharedUSMSpace); + Kokkos::SYCLSharedUSMSpace); KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( - Kokkos::Experimental::SYCLHostUSMSpace); + Kokkos::SYCLHostUSMSpace); // end Explicit instantiations of CRTP Base classes }}}1 //============================================================================== diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Space.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Space.hpp index b86cfca413c0..5a37da130caf 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Space.hpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Space.hpp @@ -39,8 +39,6 @@ template struct is_sycl_type_space : public std::false_type {}; } // namespace Impl -namespace Experimental { - class SYCLDeviceUSMSpace { public: using execution_space = SYCL; @@ -154,45 +152,40 @@ class SYCLHostUSMSpace { sycl::queue m_queue; }; -} // namespace Experimental - namespace Impl { template <> -struct is_sycl_type_space - : public std::true_type {}; +struct is_sycl_type_space : public std::true_type { +}; template <> -struct is_sycl_type_space - : public std::true_type {}; +struct is_sycl_type_space : public std::true_type { +}; template <> -struct is_sycl_type_space - : public std::true_type {}; +struct is_sycl_type_space : public std::true_type {}; -static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLDeviceUSMSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable); +static_assert( + Kokkos::Impl::MemorySpaceAccess::assignable); -static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLSharedUSMSpace, - Kokkos::Experimental::SYCLSharedUSMSpace>::assignable); +static_assert( + Kokkos::Impl::MemorySpaceAccess::assignable); -static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLDeviceUSMSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable); +static_assert( + Kokkos::Impl::MemorySpaceAccess::assignable); template <> -struct MemorySpaceAccess { +struct MemorySpaceAccess { enum : bool { assignable = false }; enum : bool { accessible = false }; enum : bool { deepcopy = true }; }; template <> -struct MemorySpaceAccess { +struct MemorySpaceAccess { // HostSpace::execution_space != SYCLSharedUSMSpace::execution_space enum : bool { assignable = false }; enum : bool { accessible = true }; @@ -200,26 +193,24 @@ struct MemorySpaceAccess -struct MemorySpaceAccess { +struct MemorySpaceAccess { // HostSpace::execution_space == - // Experimental::SYCLHostUSMSpace::execution_space + // SYCLHostUSMSpace::execution_space enum : bool { assignable = true }; enum : bool { accessible = true }; enum : bool { deepcopy = true }; }; template <> -struct MemorySpaceAccess { +struct MemorySpaceAccess { enum : bool { assignable = false }; enum : bool { accessible = false }; enum : bool { deepcopy = true }; }; template <> -struct MemorySpaceAccess { +struct MemorySpaceAccess { // SYCLDeviceUSMSpace::execution_space == SYCLSharedUSMSpace::execution_space enum : bool { assignable = true }; enum : bool { accessible = true }; @@ -227,14 +218,11 @@ struct MemorySpaceAccess -struct MemorySpaceAccess { - // Experimental::SYCLDeviceUSMSpace::execution_space != - // Experimental::SYCLHostUSMSpace::execution_space +struct MemorySpaceAccess { + // SYCLDeviceUSMSpace::execution_space != + // SYCLHostUSMSpace::execution_space enum : bool { assignable = false }; - enum : bool { - accessible = true - }; // Experimental::SYCLDeviceUSMSpace::execution_space + enum : bool { accessible = true }; // SYCLDeviceUSMSpace::execution_space enum : bool { deepcopy = true }; }; @@ -243,16 +231,15 @@ struct MemorySpaceAccess -struct MemorySpaceAccess { +struct MemorySpaceAccess { enum : bool { assignable = false }; enum : bool { accessible = false }; // SYCL cannot access HostSpace enum : bool { deepcopy = true }; }; template <> -struct MemorySpaceAccess { +struct MemorySpaceAccess { // SYCLSharedUSMSpace::execution_space == SYCLDeviceUSMSpace::execution_space // Can access SYCLSharedUSMSpace from Host but cannot access // SYCLDeviceUSMSpace from Host @@ -264,47 +251,38 @@ struct MemorySpaceAccess -struct MemorySpaceAccess { - // Experimental::SYCLSharedUSMSpace::execution_space != - // Experimental::SYCLHostUSMSpace::execution_space +struct MemorySpaceAccess { + // SYCLSharedUSMSpace::execution_space != + // SYCLHostUSMSpace::execution_space enum : bool { assignable = false }; - enum : bool { - accessible = true - }; // Experimental::SYCLSharedUSMSpace::execution_space + enum : bool { accessible = true }; // SYCLSharedUSMSpace::execution_space enum : bool { deepcopy = true }; }; template <> -struct MemorySpaceAccess { +struct MemorySpaceAccess { enum : bool { assignable = false }; // Cannot access from SYCL - enum : bool { - accessible = true - }; // Experimental::SYCLHostUSMSpace::execution_space + enum : bool { accessible = true }; // SYCLHostUSMSpace::execution_space enum : bool { deepcopy = true }; }; template <> -struct MemorySpaceAccess { +struct MemorySpaceAccess { enum : bool { assignable = false }; // Cannot access from Host enum : bool { accessible = false }; enum : bool { deepcopy = true }; }; template <> -struct MemorySpaceAccess { +struct MemorySpaceAccess { enum : bool { assignable = false }; // different execution_space enum : bool { accessible = true }; // same accessibility enum : bool { deepcopy = true }; }; template <> -struct MemorySpaceAccess< - Kokkos::Experimental::SYCLDeviceUSMSpace, - Kokkos::ScratchMemorySpace> { +struct MemorySpaceAccess> { enum : bool { assignable = false }; enum : bool { accessible = true }; enum : bool { deepcopy = false }; @@ -315,11 +293,9 @@ struct MemorySpaceAccess< } // namespace Kokkos KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_SPECIALIZATION( - Kokkos::Experimental::SYCLDeviceUSMSpace); -KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION( - Kokkos::Experimental::SYCLSharedUSMSpace); -KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION( - Kokkos::Experimental::SYCLHostUSMSpace); + Kokkos::SYCLDeviceUSMSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::SYCLSharedUSMSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::SYCLHostUSMSpace); #endif #endif diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp index 1e42faa5a833..6359e4a2d9e7 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp @@ -34,7 +34,7 @@ namespace Impl { */ class SYCLTeamMember { public: - using execution_space = Kokkos::Experimental::SYCL; + using execution_space = Kokkos::SYCL; using scratch_memory_space = execution_space::scratch_memory_space; using team_handle = SYCLTeamMember; @@ -126,6 +126,20 @@ class SYCLTeamMember { team_reduce(ReducerType const& reducer, typename ReducerType::value_type& value) const noexcept { using value_type = typename ReducerType::value_type; + using wrapped_reducer_type = + typename Impl::FunctorAnalysis, ReducerType, + value_type>::Reducer; + impl_team_reduce(wrapped_reducer_type(reducer), value); + reducer.reference() = value; + } + + template + KOKKOS_INLINE_FUNCTION std::enable_if_t::value> + impl_team_reduce( + WrappedReducerType const& wrapped_reducer, + typename WrappedReducerType::value_type& value) const noexcept { + using value_type = typename WrappedReducerType::value_type; auto sg = m_item.get_sub_group(); const auto sub_group_range = sg.get_local_range()[0]; @@ -139,7 +153,7 @@ class SYCLTeamMember { if (vector_range * shift < sub_group_range) { const value_type tmp = Kokkos::Impl::SYCLReduction::shift_group_left( sg, value, vector_range * shift); - if (team_rank_ + shift < team_size_) reducer.join(value, tmp); + if (team_rank_ + shift < team_size_) wrapped_reducer.join(&value, &tmp); } }; shuffle_combine(1); @@ -153,14 +167,13 @@ class SYCLTeamMember { shift <<= 1) { auto tmp = Kokkos::Impl::SYCLReduction::shift_group_left( sg, value, vector_range * shift); - if (team_rank_ + shift < team_size_) reducer.join(value, tmp); + if (team_rank_ + shift < team_size_) wrapped_reducer.join(&value, &tmp); } #endif value = Kokkos::Impl::SYCLReduction::select_from_group(sg, value, 0); const int n_subgroups = sg.get_group_range()[0]; if (n_subgroups == 1) { - reducer.reference() = value; return; } @@ -187,16 +200,15 @@ class SYCLTeamMember { for (int start = step_width; start < n_subgroups; start += step_width) { if (id_in_sg == 0 && group_id >= start && group_id < std::min(start + step_width, n_subgroups)) - reducer.join(reduction_array[group_id - start], value); + wrapped_reducer.join(&reduction_array[group_id - start], &value); sycl::group_barrier(m_item.get_group()); } // Do the final reduction for all threads redundantly value = reduction_array[0]; for (int i = 1; i < std::min(step_width, n_subgroups); ++i) - reducer.join(value, reduction_array[i]); + wrapped_reducer.join(&value, &reduction_array[i]); - reducer.reference() = value; // Make sure that every thread is done using the reduction array. sycl::group_barrier(m_item.get_group()); } @@ -271,8 +283,8 @@ class SYCLTeamMember { const auto update = Kokkos::Impl::SYCLReduction::shift_group_right(sg, value, vector_range); - Type intermediate = (group_id > 0 ? base_data[group_id - 1] : 0) + - (id_in_sg >= vector_range ? update : 0); + Type intermediate = (group_id > 0 ? base_data[group_id - 1] : Type{0}) + + (id_in_sg >= vector_range ? update : Type{0}); if (global_accum) { if (id_in_sg == sub_group_range - 1 && @@ -311,6 +323,19 @@ class SYCLTeamMember { KOKKOS_INLINE_FUNCTION std::enable_if_t::value> vector_reduce(ReducerType const& reducer, typename ReducerType::value_type& value) const { + using value_type = typename ReducerType::value_type; + using wrapped_reducer_type = + typename Impl::FunctorAnalysis, ReducerType, + value_type>::Reducer; + impl_vector_reduce(wrapped_reducer_type(reducer), value); + reducer.reference() = value; + } + + template + KOKKOS_INLINE_FUNCTION std::enable_if_t::value> + impl_vector_reduce(WrappedReducerType const& wrapped_reducer, + typename WrappedReducerType::value_type& value) const { const auto tidx1 = m_item.get_local_id(1); const auto grange1 = m_item.get_local_range(1); @@ -319,13 +344,13 @@ class SYCLTeamMember { if (grange1 == 1) return; // Intra vector lane shuffle reduction: - typename ReducerType::value_type tmp(value); - typename ReducerType::value_type tmp2 = tmp; + typename WrappedReducerType::value_type tmp(value); + typename WrappedReducerType::value_type tmp2 = tmp; for (int i = grange1; (i >>= 1);) { tmp2 = Kokkos::Impl::SYCLReduction::shift_group_left(sg, tmp, i); if (static_cast(tidx1) < i) { - reducer.join(tmp, tmp2); + wrapped_reducer.join(&tmp, &tmp2); } } @@ -336,8 +361,7 @@ class SYCLTeamMember { tmp2 = Kokkos::Impl::SYCLReduction::select_from_group( sg, tmp, (sg.get_local_id() / grange1) * grange1); - value = tmp2; - reducer.reference() = tmp2; + value = tmp2; } //---------------------------------------- @@ -531,8 +555,16 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value> parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< iType, Impl::SYCLTeamMember>& loop_boundaries, const Closure& closure, const ReducerType& reducer) { - typename ReducerType::value_type value; - reducer.init(value); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, ReducerType, + value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); + value_type value; + wrapped_reducer.init(&value); for (iType i = loop_boundaries.start + loop_boundaries.member.item().get_local_id(0); @@ -541,7 +573,9 @@ parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< closure(i, value); } - loop_boundaries.member.team_reduce(reducer, value); + loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); + wrapped_reducer.final(&value); + reducer.reference() = value; } /** \brief Inter-thread parallel_reduce assuming summation. @@ -557,20 +591,28 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value> parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< iType, Impl::SYCLTeamMember>& loop_boundaries, const Closure& closure, ValueType& result) { - ValueType val; - Kokkos::Sum reducer(val); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Closure, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; - reducer.init(reducer.reference()); + wrapped_reducer_type wrapped_reducer(closure); + value_type value; + wrapped_reducer.init(&value); for (iType i = loop_boundaries.start + loop_boundaries.member.item().get_local_id(0); i < loop_boundaries.end; i += loop_boundaries.member.item().get_local_range(0)) { - closure(i, val); + closure(i, value); } - loop_boundaries.member.team_reduce(reducer, val); - result = reducer.reference(); + loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); + + wrapped_reducer.final(&value); + result = value; } /** \brief Inter-thread parallel exclusive prefix sum. @@ -657,8 +699,16 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value> parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< iType, Impl::SYCLTeamMember>& loop_boundaries, const Closure& closure, const ReducerType& reducer) { - typename ReducerType::value_type value; - reducer.init(value); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, ReducerType, + value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); + value_type value; + wrapped_reducer.init(&value); const iType tidx0 = loop_boundaries.member.item().get_local_id(0); const iType tidx1 = loop_boundaries.member.item().get_local_id(1); @@ -670,8 +720,11 @@ parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< i < loop_boundaries.end; i += grange0 * grange1) closure(i, value); - loop_boundaries.member.vector_reduce(reducer, value); - loop_boundaries.member.team_reduce(reducer, value); + loop_boundaries.member.impl_vector_reduce(wrapped_reducer, value); + loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); + + wrapped_reducer.final(&value); + reducer.reference() = value; } template @@ -679,10 +732,16 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value> parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< iType, Impl::SYCLTeamMember>& loop_boundaries, const Closure& closure, ValueType& result) { - ValueType val; - Kokkos::Sum reducer(val); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Closure, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; - reducer.init(reducer.reference()); + wrapped_reducer_type wrapped_reducer(closure); + value_type value; + wrapped_reducer.init(&value); const iType tidx0 = loop_boundaries.member.item().get_local_id(0); const iType tidx1 = loop_boundaries.member.item().get_local_id(1); @@ -692,11 +751,13 @@ parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< for (iType i = loop_boundaries.start + tidx0 * grange1 + tidx1; i < loop_boundaries.end; i += grange0 * grange1) - closure(i, val); + closure(i, value); + + loop_boundaries.member.impl_vector_reduce(wrapped_reducer, value); + loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); - loop_boundaries.member.vector_reduce(reducer); - loop_boundaries.member.team_reduce(reducer); - result = reducer.reference(); + wrapped_reducer.final(&value); + result = value; } //---------------------------------------------------------------------------- @@ -746,16 +807,27 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value> parallel_reduce(Impl::ThreadVectorRangeBoundariesStruct< iType, Impl::SYCLTeamMember> const& loop_boundaries, Closure const& closure, ReducerType const& reducer) { - reducer.init(reducer.reference()); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, ReducerType, + value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); + value_type value; + wrapped_reducer.init(&value); const iType tidx1 = loop_boundaries.member.item().get_local_id(1); const iType grange1 = loop_boundaries.member.item().get_local_range(1); for (iType i = loop_boundaries.start + tidx1; i < loop_boundaries.end; i += grange1) - closure(i, reducer.reference()); + closure(i, value); - loop_boundaries.member.vector_reduce(reducer); + loop_boundaries.member.impl_vector_reduce(wrapped_reducer, value); + wrapped_reducer.final(&value); + reducer.reference() = value; } /** \brief Intra-thread vector parallel_reduce. @@ -774,16 +846,27 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value> parallel_reduce(Impl::ThreadVectorRangeBoundariesStruct< iType, Impl::SYCLTeamMember> const& loop_boundaries, Closure const& closure, ValueType& result) { - result = ValueType(); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Closure, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(closure); + value_type value; + wrapped_reducer.init(&value); const iType tidx1 = loop_boundaries.member.item().get_local_id(1); const int grange1 = loop_boundaries.member.item().get_local_range(1); for (iType i = loop_boundaries.start + tidx1; i < loop_boundaries.end; i += grange1) - closure(i, result); + closure(i, value); - loop_boundaries.member.vector_reduce(Kokkos::Sum(result)); + loop_boundaries.member.impl_vector_reduce(wrapped_reducer, value); + wrapped_reducer.final(&value); + result = value; } //---------------------------------------------------------------------------- diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_TeamPolicy.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_TeamPolicy.hpp index 17ce59058bdd..556ca0d28186 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_TeamPolicy.hpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_TeamPolicy.hpp @@ -22,8 +22,7 @@ #include template -class Kokkos::Impl::TeamPolicyInternal +class Kokkos::Impl::TeamPolicyInternal : public PolicyTraits { public: using execution_policy = TeamPolicyInternal; @@ -45,7 +44,7 @@ class Kokkos::Impl::TeamPolicyInternal TeamPolicyInternal(TeamPolicyInternal const& p) { diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_UniqueToken.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_UniqueToken.hpp index d55fc6a84ba4..79d9e8a8d482 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_UniqueToken.hpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_UniqueToken.hpp @@ -22,13 +22,14 @@ #include namespace Kokkos { -namespace Experimental { namespace Impl { Kokkos::View sycl_global_unique_token_locks( bool deallocate = false); } +namespace Experimental { + // both global and instance Unique Tokens are implemented in the same way // the global version has one shared static lock array underneath // but it can't be a static member variable since we need to acces it on device @@ -42,7 +43,7 @@ class UniqueToken { using size_type = int32_t; explicit UniqueToken(execution_space const& = execution_space()) - : m_locks(Impl::sycl_global_unique_token_locks()) {} + : m_locks(Kokkos::Impl::sycl_global_unique_token_locks()) {} KOKKOS_DEFAULTED_FUNCTION UniqueToken(const UniqueToken&) = default; @@ -75,11 +76,15 @@ class UniqueToken { /// \brief acquire value such that 0 <= value < size() KOKKOS_INLINE_FUNCTION size_type impl_acquire() const { +#if defined(__INTEL_LLVM_COMPILER) && __INTEL_LLVM_COMPILER >= 20250000 + auto item = sycl::ext::oneapi::this_work_item::get_nd_item<3>(); +#else auto item = sycl::ext::oneapi::experimental::this_nd_item<3>(); +#endif std::size_t threadIdx[3] = {item.get_local_id(2), item.get_local_id(1), item.get_local_id(0)}; std::size_t blockIdx[3] = {item.get_group(2), item.get_group(1), - item.get_group(0)}; + item.get_group(0)}; std::size_t blockDim[3] = {item.get_local_range(2), item.get_local_range(1), item.get_local_range(0)}; @@ -122,11 +127,11 @@ class UniqueToken public: UniqueToken() : UniqueToken( - Kokkos::Experimental::SYCL().concurrency()) {} + Kokkos::SYCL().concurrency()) {} explicit UniqueToken(execution_space const& arg) : UniqueToken( - Kokkos::Experimental::SYCL().concurrency(), arg) {} + Kokkos::SYCL().concurrency(), arg) {} explicit UniqueToken(size_type max_size) : UniqueToken(max_size) {} diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ZeroMemset.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ZeroMemset.hpp index 61db6b34aac0..2905733a4de9 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ZeroMemset.hpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ZeroMemset.hpp @@ -23,12 +23,11 @@ namespace Kokkos { namespace Impl { -template -struct ZeroMemset> { - ZeroMemset(const Kokkos::Experimental::SYCL& exec_space, - const View& dst) { - auto event = exec_space.impl_internal_space_instance()->m_queue->memset( - dst.data(), 0, dst.size() * sizeof(typename View::value_type)); +template <> +struct ZeroMemset { + ZeroMemset(const Kokkos::SYCL& exec_space, void* dst, size_t cnt) { + auto event = + exec_space.impl_internal_space_instance()->m_queue->memset(dst, 0, cnt); #ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES exec_space.impl_internal_space_instance() ->m_queue->ext_oneapi_submit_barrier(std::vector{event}); diff --git a/packages/kokkos/core/src/Serial/Kokkos_Serial.hpp b/packages/kokkos/core/src/Serial/Kokkos_Serial.hpp index 81d43b31b35b..a1fa9e43e083 100644 --- a/packages/kokkos/core/src/Serial/Kokkos_Serial.hpp +++ b/packages/kokkos/core/src/Serial/Kokkos_Serial.hpp @@ -34,7 +34,6 @@ static_assert(false, #include #include #include -#include #include #include #include @@ -267,7 +266,7 @@ template std::vector partition_space(const Serial&, std::vector const& weights) { static_assert( - std::is_arithmetic::value, + std::is_arithmetic_v, "Kokkos Error: partitioning arguments must be integers or floats"); // We only care about the number of instances to create and ignore weights @@ -284,7 +283,9 @@ std::vector partition_space(const Serial&, #include #include #include +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 #include +#endif #include #endif // defined( KOKKOS_ENABLE_SERIAL ) diff --git a/packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp b/packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp index 34e115eca9b7..addcaba009fa 100644 --- a/packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp +++ b/packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp @@ -44,11 +44,16 @@ class ParallelFor, public: inline void execute() const { + // caused a possibly codegen-related slowdown, especially in GCC 9-11 + // with KOKKOS_ARCH_NATIVE + // https://github.com/kokkos/kokkos/issues/7268 +#ifndef KOKKOS_ENABLE_ATOMICS_BYPASS // Make sure kernels are running sequentially even when using multiple // threads auto* internal_instance = m_iter.m_rp.space().impl_internal_space_instance(); std::lock_guard lock(internal_instance->m_instance_mutex); +#endif this->exec(); } template @@ -112,10 +117,15 @@ class ParallelReduce instance_lock( internal_instance->m_instance_mutex); +#endif internal_instance->resize_thread_team_data( pool_reduce_size, team_reduce_size, team_shared_size, thread_local_size); diff --git a/packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp b/packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp index 80faec9041d5..2ab7b7f80348 100644 --- a/packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp +++ b/packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp @@ -31,7 +31,7 @@ class ParallelFor, Kokkos::Serial> { const Policy m_policy; template - std::enable_if_t::value> exec() const { + std::enable_if_t> exec() const { const typename Policy::member_type e = m_policy.end(); for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) { m_functor(i); @@ -39,7 +39,7 @@ class ParallelFor, Kokkos::Serial> { } template - std::enable_if_t::value> exec() const { + std::enable_if_t> exec() const { const TagType t{}; const typename Policy::member_type e = m_policy.end(); for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) { @@ -49,10 +49,15 @@ class ParallelFor, Kokkos::Serial> { public: inline void execute() const { + // caused a possibly codegen-related slowdown, especially in GCC 9-11 + // with KOKKOS_ARCH_NATIVE + // https://github.com/kokkos/kokkos/issues/7268 +#ifndef KOKKOS_ENABLE_ATOMICS_BYPASS // Make sure kernels are running sequentially even when using multiple // threads auto* internal_instance = m_policy.space().impl_internal_space_instance(); std::lock_guard lock(internal_instance->m_instance_mutex); +#endif this->template exec(); } @@ -79,7 +84,7 @@ class ParallelReduce, const pointer_type m_result_ptr; template - inline std::enable_if_t::value> exec( + inline std::enable_if_t> exec( reference_type update) const { const typename Policy::member_type e = m_policy.end(); for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) { @@ -88,7 +93,7 @@ class ParallelReduce, } template - inline std::enable_if_t::value> exec( + inline std::enable_if_t> exec( reference_type update) const { const TagType t{}; @@ -108,10 +113,15 @@ class ParallelReduce, auto* internal_instance = m_policy.space().impl_internal_space_instance(); + // caused a possibly codegen-related slowdown, especially in GCC 9-11 + // with KOKKOS_ARCH_NATIVE + // https://github.com/kokkos/kokkos/issues/7268 +#ifndef KOKKOS_ENABLE_ATOMICS_BYPASS // Make sure kernels are running sequentially even when using multiple // threads, lock resize_thread_team_data std::lock_guard instance_lock( internal_instance->m_instance_mutex); +#endif internal_instance->resize_thread_team_data( pool_reduce_size, team_reduce_size, team_shared_size, thread_local_size); @@ -166,7 +176,7 @@ class ParallelScan, const Policy m_policy; template - inline std::enable_if_t::value> exec( + inline std::enable_if_t> exec( reference_type update) const { const typename Policy::member_type e = m_policy.end(); for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) { @@ -175,7 +185,7 @@ class ParallelScan, } template - inline std::enable_if_t::value> exec( + inline std::enable_if_t> exec( reference_type update) const { const TagType t{}; const typename Policy::member_type e = m_policy.end(); @@ -194,10 +204,16 @@ class ParallelScan, const size_t thread_local_size = 0; // Never shrinks auto* internal_instance = m_policy.space().impl_internal_space_instance(); + + // caused a possibly codegen-related slowdown, especially in GCC 9-11 + // with KOKKOS_ARCH_NATIVE + // https://github.com/kokkos/kokkos/issues/7268 +#ifndef KOKKOS_ENABLE_ATOMICS_BYPASS // Make sure kernels are running sequentially even when using multiple // threads, lock resize_thread_team_data std::lock_guard instance_lock( internal_instance->m_instance_mutex); +#endif internal_instance->resize_thread_team_data( pool_reduce_size, team_reduce_size, team_shared_size, @@ -235,7 +251,7 @@ class ParallelScanWithTotal, const pointer_type m_result_ptr; template - inline std::enable_if_t::value> exec( + inline std::enable_if_t> exec( reference_type update) const { const typename Policy::member_type e = m_policy.end(); for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) { @@ -244,7 +260,7 @@ class ParallelScanWithTotal, } template - inline std::enable_if_t::value> exec( + inline std::enable_if_t> exec( reference_type update) const { const TagType t{}; const typename Policy::member_type e = m_policy.end(); @@ -262,10 +278,16 @@ class ParallelScanWithTotal, const size_t thread_local_size = 0; // Never shrinks auto* internal_instance = m_policy.space().impl_internal_space_instance(); + + // caused a possibly codegen-related slowdown, especially in GCC 9-11 + // with KOKKOS_ARCH_NATIVE + // https://github.com/kokkos/kokkos/issues/7268 +#ifndef KOKKOS_ENABLE_ATOMICS_BYPASS // Make sure kernels are running sequentially even when using multiple // threads, lock resize_thread_team_data std::lock_guard instance_lock( internal_instance->m_instance_mutex); +#endif internal_instance->resize_thread_team_data( pool_reduce_size, team_reduce_size, team_shared_size, diff --git a/packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp b/packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp index a523cc86c97b..7a6faf3d9fb5 100644 --- a/packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp +++ b/packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp @@ -223,7 +223,7 @@ class ParallelFor, const size_t m_shared; template - inline std::enable_if_t::value> exec( + inline std::enable_if_t> exec( HostThreadTeamData& data) const { for (int ileague = 0; ileague < m_league; ++ileague) { m_functor(Member(data, ileague, m_league)); @@ -231,7 +231,7 @@ class ParallelFor, } template - inline std::enable_if_t::value> exec( + inline std::enable_if_t> exec( HostThreadTeamData& data) const { const TagType t{}; for (int ileague = 0; ileague < m_league; ++ileague) { @@ -247,10 +247,16 @@ class ParallelFor, const size_t thread_local_size = 0; // Never shrinks auto* internal_instance = m_policy.space().impl_internal_space_instance(); + + // caused a possibly codegen-related slowdown, especially in GCC 9-11 + // with KOKKOS_ARCH_NATIVE + // https://github.com/kokkos/kokkos/issues/7268 +#ifndef KOKKOS_ENABLE_ATOMICS_BYPASS // Make sure kernels are running sequentially even when using multiple // threads, lock resize_thread_team_data std::lock_guard instance_lock( internal_instance->m_instance_mutex); +#endif internal_instance->resize_thread_team_data( pool_reduce_size, team_reduce_size, team_shared_size, @@ -293,7 +299,7 @@ class ParallelReduce - inline std::enable_if_t::value> exec( + inline std::enable_if_t> exec( HostThreadTeamData& data, reference_type update) const { for (int ileague = 0; ileague < m_league; ++ileague) { m_functor_reducer.get_functor()(Member(data, ileague, m_league), update); @@ -301,7 +307,7 @@ class ParallelReduce - inline std::enable_if_t::value> exec( + inline std::enable_if_t> exec( HostThreadTeamData& data, reference_type update) const { const TagType t{}; @@ -321,10 +327,16 @@ class ParallelReduce instance_lock( internal_instance->m_instance_mutex); +#endif internal_instance->resize_thread_team_data( pool_reduce_size, team_reduce_size, team_shared_size, diff --git a/packages/kokkos/core/src/Serial/Kokkos_Serial_Task.hpp b/packages/kokkos/core/src/Serial/Kokkos_Serial_Task.hpp index 5905d6d32e14..678d18250474 100644 --- a/packages/kokkos/core/src/Serial/Kokkos_Serial_Task.hpp +++ b/packages/kokkos/core/src/Serial/Kokkos_Serial_Task.hpp @@ -25,10 +25,16 @@ #include #include #include +#include //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +// We allow using deprecated classes in this file +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() +#endif + namespace Kokkos { namespace Impl { @@ -102,9 +108,8 @@ class TaskQueueSpecialization> { template class TaskQueueSpecializationConstrained< - Scheduler, - std::enable_if_t::value>> { + Scheduler, std::enable_if_t>> { public: // Note: Scheduler may be an incomplete type at class scope (but not inside // of the methods, obviously) @@ -215,6 +220,10 @@ extern template class TaskQueue, FunctorType m_functor; template - std::enable_if_t::value> exec_one( + std::enable_if_t> exec_one( const std::int32_t w) const noexcept { m_functor(w); } template - std::enable_if_t::value> exec_one( + std::enable_if_t> exec_one( const std::int32_t w) const noexcept { const TagType t{}; m_functor(t, w); diff --git a/packages/kokkos/core/src/Serial/Kokkos_Serial_ZeroMemset.hpp b/packages/kokkos/core/src/Serial/Kokkos_Serial_ZeroMemset.hpp index 6ad6aabc5a7c..527e09407989 100644 --- a/packages/kokkos/core/src/Serial/Kokkos_Serial_ZeroMemset.hpp +++ b/packages/kokkos/core/src/Serial/Kokkos_Serial_ZeroMemset.hpp @@ -31,15 +31,11 @@ namespace Impl { // parallel execution space since the specialization for // DefaultHostExecutionSpace is defined elsewhere. struct DummyExecutionSpace; -template +template <> struct ZeroMemset< - std::conditional_t::value, - Serial, DummyExecutionSpace>, - View> { - ZeroMemset(const Serial&, const View& dst) { - using ValueType = typename View::value_type; - std::memset(dst.data(), 0, sizeof(ValueType) * dst.size()); - } + std::conditional_t, + Serial, DummyExecutionSpace>> { + ZeroMemset(const Serial&, void* dst, size_t cnt) { std::memset(dst, 0, cnt); } }; } // namespace Impl diff --git a/packages/kokkos/core/src/Threads/Kokkos_Threads_Instance.cpp b/packages/kokkos/core/src/Threads/Kokkos_Threads_Instance.cpp index 3842966cd77b..edc9489f67e7 100644 --- a/packages/kokkos/core/src/Threads/Kokkos_Threads_Instance.cpp +++ b/packages/kokkos/core/src/Threads/Kokkos_Threads_Instance.cpp @@ -67,8 +67,9 @@ std::pair int s_thread_pool_size[3] = {0, 0, 0}; -void (*volatile s_current_function)(ThreadsInternal &, const void *); -const void *volatile s_current_function_arg = nullptr; +using s_current_function_type = void (*)(ThreadsInternal &, const void *); +std::atomic s_current_function; +std::atomic s_current_function_arg = nullptr; inline unsigned fan_size(const unsigned rank, const unsigned size) { const unsigned rank_rev = size - (rank + 1); @@ -79,7 +80,7 @@ inline unsigned fan_size(const unsigned rank, const unsigned size) { return count; } -void wait_yield(volatile ThreadState &flag, const ThreadState value) { +void wait_yield(std::atomic &flag, const ThreadState value) { while (value == flag) { std::this_thread::yield(); } @@ -135,11 +136,12 @@ ThreadsInternal::ThreadsInternal() ThreadsInternal *const nil = nullptr; // Which entry in 's_threads_exec', possibly determined from hwloc binding - const int entry = reinterpret_cast(s_current_function_arg) < - size_t(s_thread_pool_size[0]) - ? reinterpret_cast(s_current_function_arg) - : size_t(Kokkos::hwloc::bind_this_thread( - s_thread_pool_size[0], s_threads_coord)); + const int entry = + reinterpret_cast(s_current_function_arg.load()) < + size_t(s_thread_pool_size[0]) + ? reinterpret_cast(s_current_function_arg.load()) + : size_t(Kokkos::hwloc::bind_this_thread(s_thread_pool_size[0], + s_threads_coord)); // Given a good entry set this thread in the 's_threads_exec' array if (entry < s_thread_pool_size[0] && @@ -543,7 +545,7 @@ void ThreadsInternal::initialize(int thread_count_arg) { for (unsigned ith = 1; ith < thread_count; ++ith) { // Try to protect against cache coherency failure by casting to volatile. ThreadsInternal *const th = - ((ThreadsInternal * volatile *)s_threads_exec)[ith]; + ((ThreadsInternal *volatile *)s_threads_exec)[ith]; if (th) { wait_yield(th->m_pool_state, ThreadState::Active); } else { diff --git a/packages/kokkos/core/src/Threads/Kokkos_Threads_Instance.hpp b/packages/kokkos/core/src/Threads/Kokkos_Threads_Instance.hpp index a5eb231cb011..130b3433d026 100644 --- a/packages/kokkos/core/src/Threads/Kokkos_Threads_Instance.hpp +++ b/packages/kokkos/core/src/Threads/Kokkos_Threads_Instance.hpp @@ -60,7 +60,7 @@ class ThreadsInternal { int m_pool_rank_rev; int m_pool_size; int m_pool_fan_size; - ThreadState volatile m_pool_state; ///< State for global synchronizations + std::atomic m_pool_state; ///< State for global synchronizations // Members for dynamic scheduling // Which thread am I stealing from currently @@ -96,7 +96,7 @@ class ThreadsInternal { return reinterpret_cast(m_scratch) + m_scratch_reduce_end; } - KOKKOS_INLINE_FUNCTION ThreadState volatile &state() { return m_pool_state; } + KOKKOS_INLINE_FUNCTION auto &state() { return m_pool_state; } KOKKOS_INLINE_FUNCTION ThreadsInternal *const *pool_base() const { return m_pool_base; } @@ -225,7 +225,7 @@ class ThreadsInternal { // to inactive triggers another thread to exit a spinwait // and read the 'reduce_memory'. // Must 'memory_fence()' to guarantee that storing the update to - // 'reduce_memory()' will complete before storing the the update to + // 'reduce_memory()' will complete before storing the update to // 'm_pool_state'. memory_fence(); @@ -403,7 +403,7 @@ class ThreadsInternal { static void start(void (*)(ThreadsInternal &, const void *), const void *); #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - KOKKOS_DEPRECATED static int in_parallel(); + static int in_parallel(); #endif static void fence(); static void fence(const std::string &); diff --git a/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_MDRange.hpp b/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_MDRange.hpp index 59577609ab78..711b1b69261f 100644 --- a/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_MDRange.hpp +++ b/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_MDRange.hpp @@ -51,7 +51,7 @@ class ParallelFor, } template - static std::enable_if_t::value> + static std::enable_if_t> exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelFor &self = *((const ParallelFor *)arg); @@ -65,7 +65,7 @@ class ParallelFor, } template - static std::enable_if_t::value> + static std::enable_if_t> exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelFor &self = *((const ParallelFor *)arg); diff --git a/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Range.hpp b/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Range.hpp index 4a89c4fad823..25aab9ebfbc1 100644 --- a/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Range.hpp +++ b/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Range.hpp @@ -35,7 +35,7 @@ class ParallelFor, const Policy m_policy; template - inline static std::enable_if_t::value> exec_range( + inline static std::enable_if_t> exec_range( const FunctorType &functor, const Member ibeg, const Member iend) { #if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \ defined(KOKKOS_ENABLE_PRAGMA_IVDEP) @@ -47,7 +47,7 @@ class ParallelFor, } template - inline static std::enable_if_t::value> exec_range( + inline static std::enable_if_t> exec_range( const FunctorType &functor, const Member ibeg, const Member iend) { const TagType t{}; #if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \ @@ -64,7 +64,7 @@ class ParallelFor, } template - static std::enable_if_t::value> + static std::enable_if_t> exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelFor &self = *((const ParallelFor *)arg); @@ -77,7 +77,7 @@ class ParallelFor, } template - static std::enable_if_t::value> + static std::enable_if_t> exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelFor &self = *((const ParallelFor *)arg); diff --git a/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Team.hpp b/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Team.hpp index f927d7c6a67e..40be3884c3d4 100644 --- a/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Team.hpp +++ b/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Team.hpp @@ -36,8 +36,8 @@ class ParallelFor, const size_t m_shared; template - inline static std::enable_if_t::value && - std::is_same::value> + inline static std::enable_if_t && + std::is_same_v> exec_team(const FunctorType &functor, Member member) { for (; member.valid_static(); member.next_static()) { functor(member); @@ -45,8 +45,8 @@ class ParallelFor, } template - inline static std::enable_if_t::value && - std::is_same::value> + inline static std::enable_if_t && + std::is_same_v> exec_team(const FunctorType &functor, Member member) { const TagType t{}; for (; member.valid_static(); member.next_static()) { @@ -55,8 +55,8 @@ class ParallelFor, } template - inline static std::enable_if_t::value && - std::is_same::value> + inline static std::enable_if_t && + std::is_same_v> exec_team(const FunctorType &functor, Member member) { for (; member.valid_dynamic(); member.next_dynamic()) { functor(member); @@ -64,8 +64,8 @@ class ParallelFor, } template - inline static std::enable_if_t::value && - std::is_same::value> + inline static std::enable_if_t && + std::is_same_v> exec_team(const FunctorType &functor, Member member) { const TagType t{}; for (; member.valid_dynamic(); member.next_dynamic()) { @@ -88,8 +88,12 @@ class ParallelFor, policy.impl_set_vector_length(1); } if (policy.team_size() < 0) { - policy.impl_set_team_size( - policy.team_size_recommended(m_functor, ParallelForTag{})); + int team_size = policy.team_size_recommended(m_functor, ParallelForTag{}); + if (team_size <= 0) + Kokkos::Impl::throw_runtime_exception( + "Kokkos::Impl::ParallelFor could not find a " + "valid execution configuration."); + policy.impl_set_team_size(team_size); } return policy; } diff --git a/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_MDRange.hpp b/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_MDRange.hpp index fa63215a9e5d..9f28f9bbfcc2 100644 --- a/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_MDRange.hpp +++ b/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_MDRange.hpp @@ -59,7 +59,7 @@ class ParallelReduce - static std::enable_if_t::value> + static std::enable_if_t> exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelReduce &self = *((const ParallelReduce *)arg); @@ -76,7 +76,7 @@ class ParallelReduce - static std::enable_if_t::value> + static std::enable_if_t> exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelReduce &self = *((const ParallelReduce *)arg); @@ -91,8 +91,8 @@ class ParallelReduce(instance.reduce_memory())); + reference_type update = + reducer.init(static_cast(instance.reduce_memory())); while (work_index != -1) { const Member begin = static_cast(work_index); const Member end = begin + 1 < num_tiles ? begin + 1 : num_tiles; @@ -100,7 +100,7 @@ class ParallelReduce, const pointer_type m_result_ptr; template - inline static std::enable_if_t::value> exec_range( + inline static std::enable_if_t> exec_range( const FunctorType &functor, const Member &ibeg, const Member &iend, reference_type update) { #if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \ @@ -55,7 +55,7 @@ class ParallelReduce, } template - inline static std::enable_if_t::value> exec_range( + inline static std::enable_if_t> exec_range( const FunctorType &functor, const Member &ibeg, const Member &iend, reference_type update) { const TagType t{}; @@ -73,7 +73,7 @@ class ParallelReduce, } template - static std::enable_if_t::value> + static std::enable_if_t> exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelReduce &self = *((const ParallelReduce *)arg); const WorkRange range(self.m_policy, instance.pool_rank(), @@ -89,7 +89,7 @@ class ParallelReduce, } template - static std::enable_if_t::value> + static std::enable_if_t> exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelReduce &self = *((const ParallelReduce *)arg); const WorkRange range(self.m_policy, instance.pool_rank(), diff --git a/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_Team.hpp b/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_Team.hpp index 4db310701f9f..69527ee3e65e 100644 --- a/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_Team.hpp +++ b/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_Team.hpp @@ -42,7 +42,7 @@ class ParallelReduce - inline static std::enable_if_t::value> exec_team( + inline static std::enable_if_t> exec_team( const FunctorType &functor, Member member, reference_type update) { for (; member.valid_static(); member.next_static()) { functor(member, update); @@ -50,7 +50,7 @@ class ParallelReduce - inline static std::enable_if_t::value> exec_team( + inline static std::enable_if_t> exec_team( const FunctorType &functor, Member member, reference_type update) { const TagType t{}; for (; member.valid_static(); member.next_static()) { @@ -106,9 +106,14 @@ class ParallelReduce could not find " + "a valid execution configuration."); + policy.impl_set_team_size(team_size); } return policy; } diff --git a/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelScan_Range.hpp b/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelScan_Range.hpp index 62f34d741ff4..d54f4ca952e6 100644 --- a/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelScan_Range.hpp +++ b/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelScan_Range.hpp @@ -39,7 +39,7 @@ class ParallelScan, const Policy m_policy; template - inline static std::enable_if_t::value> exec_range( + inline static std::enable_if_t> exec_range( const FunctorType &functor, const Member &ibeg, const Member &iend, reference_type update, const bool final) { #if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \ @@ -52,7 +52,7 @@ class ParallelScan, } template - inline static std::enable_if_t::value> exec_range( + inline static std::enable_if_t> exec_range( const FunctorType &functor, const Member &ibeg, const Member &iend, reference_type update, const bool final) { const TagType t{}; @@ -119,7 +119,7 @@ class ParallelScanWithTotal, const pointer_type m_result_ptr; template - inline static std::enable_if_t::value> exec_range( + inline static std::enable_if_t> exec_range( const FunctorType &functor, const Member &ibeg, const Member &iend, reference_type update, const bool final) { #if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \ @@ -132,7 +132,7 @@ class ParallelScanWithTotal, } template - inline static std::enable_if_t::value> exec_range( + inline static std::enable_if_t> exec_range( const FunctorType &functor, const Member &ibeg, const Member &iend, reference_type update, const bool final) { const TagType t{}; diff --git a/packages/kokkos/core/src/Threads/Kokkos_Threads_Spinwait.cpp b/packages/kokkos/core/src/Threads/Kokkos_Threads_Spinwait.cpp index 3df9dc07bf43..0f9a77f2afa9 100644 --- a/packages/kokkos/core/src/Threads/Kokkos_Threads_Spinwait.cpp +++ b/packages/kokkos/core/src/Threads/Kokkos_Threads_Spinwait.cpp @@ -108,7 +108,7 @@ void host_thread_yield(const uint32_t i, const WaitMode mode) { #endif /* defined( KOKKOS_ENABLE_ASM ) */ } -void spinwait_while_equal(ThreadState const volatile& flag, +void spinwait_while_equal(std::atomic const& flag, ThreadState const value) { Kokkos::store_fence(); uint32_t i = 0; diff --git a/packages/kokkos/core/src/Threads/Kokkos_Threads_Spinwait.hpp b/packages/kokkos/core/src/Threads/Kokkos_Threads_Spinwait.hpp index b98b6dbb73bc..7ab43cdb7af6 100644 --- a/packages/kokkos/core/src/Threads/Kokkos_Threads_Spinwait.hpp +++ b/packages/kokkos/core/src/Threads/Kokkos_Threads_Spinwait.hpp @@ -20,6 +20,7 @@ #include #include +#include namespace Kokkos { namespace Impl { @@ -34,7 +35,7 @@ enum class WaitMode : int { void host_thread_yield(const uint32_t i, const WaitMode mode); -void spinwait_while_equal(ThreadState const volatile& flag, +void spinwait_while_equal(std::atomic const& flag, ThreadState const value); } // namespace Impl diff --git a/packages/kokkos/core/src/Threads/Kokkos_Threads_Team.hpp b/packages/kokkos/core/src/Threads/Kokkos_Threads_Team.hpp index a3501a437d29..f627e0d47a51 100644 --- a/packages/kokkos/core/src/Threads/Kokkos_Threads_Team.hpp +++ b/packages/kokkos/core/src/Threads/Kokkos_Threads_Team.hpp @@ -143,8 +143,8 @@ class ThreadsExecTeamMember { KOKKOS_IF_ON_HOST(( // Make sure there is enough scratch space: - using type = typename if_c::type; + using type = std::conditional_t; if (m_team_base) { type* const local_value = ((type*)m_team_base[0]->scratch_memory()); @@ -164,8 +164,8 @@ class ThreadsExecTeamMember { KOKKOS_IF_ON_HOST(( // Make sure there is enough scratch space: - using type = typename if_c::type; + using type = std::conditional_t; f(value); if (m_team_base) { type* const local_value = ((type*)m_team_base[0]->scratch_memory()); memory_fence(); @@ -186,7 +186,7 @@ class ThreadsExecTeamMember { KOKKOS_IF_ON_HOST(( // Make sure there is enough scratch space: using type = - typename if_c::type; + std::conditional_t; if (team_rank() != team_size() - 1) * ((volatile type*)m_instance->scratch_memory()) = value; @@ -215,52 +215,65 @@ class ThreadsExecTeamMember { } template - KOKKOS_INLINE_FUNCTION - std::enable_if_t::value> - team_reduce(const ReducerType& reducer, - const typename ReducerType::value_type contribution) const { + KOKKOS_INLINE_FUNCTION std::enable_if_t::value> + team_reduce(const ReducerType& reducer, + typename ReducerType::value_type& contribution) const { KOKKOS_IF_ON_DEVICE(((void)reducer; (void)contribution;)) - KOKKOS_IF_ON_HOST(( - using value_type = typename ReducerType::value_type; - // Make sure there is enough scratch space: - using type = typename if_c::type; - - type* const local_value = ((type*)m_instance->scratch_memory()); + KOKKOS_IF_ON_HOST( + (using value_type = typename ReducerType::value_type; + using wrapped_reducer_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, TeamPolicy, + ReducerType, value_type>::Reducer; + impl_team_reduce(wrapped_reducer_type(reducer), contribution); + reducer.reference() = contribution;)) + } - // Set this thread's contribution - if (team_rank() != team_size() - 1) { *local_value = contribution; } + template + KOKKOS_INLINE_FUNCTION std::enable_if_t::value> + impl_team_reduce( + const WrappedReducerType& wrapped_reducer, + typename WrappedReducerType::value_type& contribution) const { + using value_type = typename WrappedReducerType::value_type; + // Make sure there is enough scratch space: + using type = std::conditional_t; + + type* const local_value = ((type*)m_instance->scratch_memory()); + + // Set this thread's contribution + if (team_rank() != team_size() - 1) { + *local_value = contribution; + } - // Fence to make sure the base team member has access: - memory_fence(); + // Fence to make sure the base team member has access: + memory_fence(); - if (team_fan_in()) { - // The last thread to synchronize returns true, all other threads - // wait for team_fan_out() - type* const team_value = ((type*)m_team_base[0]->scratch_memory()); + if (team_fan_in()) { + // The last thread to synchronize returns true, all other threads + // wait for team_fan_out() + type* const team_value = ((type*)m_team_base[0]->scratch_memory()); - *team_value = contribution; - // Join to the team value: - for (int i = 1; i < m_team_size; ++i) { - reducer.join(*team_value, - *((type*)m_team_base[i]->scratch_memory())); - } + *team_value = contribution; + // Join to the team value: + for (int i = 1; i < m_team_size; ++i) { + wrapped_reducer.join(team_value, + ((type*)m_team_base[i]->scratch_memory())); + } - // Team base thread may "lap" member threads so copy out to their - // local value. - for (int i = 1; i < m_team_size; ++i) { - *((type*)m_team_base[i]->scratch_memory()) = *team_value; - } + // Team base thread may "lap" member threads so copy out to their + // local value. + for (int i = 1; i < m_team_size; ++i) { + *((type*)m_team_base[i]->scratch_memory()) = *team_value; + } - // Fence to make sure all team members have access - memory_fence(); - } + // Fence to make sure all team members have access + memory_fence(); + } - team_fan_out(); + team_fan_out(); - // Value was changed by the team base - reducer.reference() = *local_value;)) + contribution = *local_value; } /** \brief Intra-team exclusive prefix sum with team_rank() ordering @@ -278,8 +291,8 @@ class ThreadsExecTeamMember { KOKKOS_IF_ON_DEVICE(((void)global_accum; return value;)) KOKKOS_IF_ON_HOST(( // Make sure there is enough scratch space: - using type = typename if_c::type; + using type = std::conditional_t; volatile type* const work_value = ((type*)m_instance->scratch_memory()); @@ -887,19 +900,25 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value> parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< iType, Impl::ThreadsExecTeamMember>& loop_boundaries, const Lambda& lambda, ValueType& result) { - ValueType intermediate; - Sum sum(intermediate); - sum.init(intermediate); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Lambda, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(lambda); + value_type value; + wrapped_reducer.init(&value); for (iType i = loop_boundaries.start; i < loop_boundaries.end; i += loop_boundaries.increment) { - ValueType tmp = ValueType(); - lambda(i, tmp); - intermediate += tmp; + lambda(i, value); } - loop_boundaries.thread.team_reduce(sum, intermediate); - result = sum.reference(); + loop_boundaries.thread.impl_team_reduce(wrapped_reducer, value); + wrapped_reducer.final(&value); + result = value; } template @@ -907,15 +926,25 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value> parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< iType, Impl::ThreadsExecTeamMember>& loop_boundaries, const Lambda& lambda, const ReducerType& reducer) { - typename ReducerType::value_type value; - reducer.init(value); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, + ReducerType, value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); + value_type value; + wrapped_reducer.init(&value); for (iType i = loop_boundaries.start; i < loop_boundaries.end; i += loop_boundaries.increment) { lambda(i, value); } - loop_boundaries.thread.team_reduce(reducer, value); + loop_boundaries.thread.impl_team_reduce(wrapped_reducer, value); + wrapped_reducer.final(&value); + reducer.reference() = value; } } // namespace Kokkos @@ -950,11 +979,24 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value> parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct< iType, Impl::ThreadsExecTeamMember>& loop_boundaries, const Lambda& lambda, ValueType& result) { - result = ValueType(); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Lambda, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(lambda); + value_type value; + wrapped_reducer.init(&value); + for (iType i = loop_boundaries.start; i < loop_boundaries.end; i += loop_boundaries.increment) { - lambda(i, result); + lambda(i, value); } + + wrapped_reducer.final(&value); + result = value; } template @@ -962,11 +1004,24 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value> parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct< iType, Impl::ThreadsExecTeamMember>& loop_boundaries, const Lambda& lambda, const ReducerType& reducer) { - reducer.init(reducer.reference()); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, + ReducerType, value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); + value_type value; + wrapped_reducer.init(&value); + for (iType i = loop_boundaries.start; i < loop_boundaries.end; i += loop_boundaries.increment) { - lambda(i, reducer.reference()); + lambda(i, value); } + + wrapped_reducer.final(&value); + reducer.reference() = value; } /** \brief Inter-thread parallel exclusive prefix sum. Executes @@ -1049,7 +1104,7 @@ KOKKOS_INLINE_FUNCTION void parallel_scan( typename Impl::FunctorAnalysis, FunctorType, void>::value_type; - static_assert(std::is_same::value, + static_assert(std::is_same_v, "Non-matching value types of closure and return type"); ValueType scan_val = ValueType(); diff --git a/packages/kokkos/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp b/packages/kokkos/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp index c88d66db5f9a..5fed92db26de 100644 --- a/packages/kokkos/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp +++ b/packages/kokkos/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp @@ -36,13 +36,13 @@ class ParallelFor, FunctorType m_functor; template - std::enable_if_t::value> exec_one( + std::enable_if_t> exec_one( const std::int32_t w) const noexcept { m_functor(w); } template - std::enable_if_t::value> exec_one( + std::enable_if_t> exec_one( const std::int32_t w) const noexcept { const TagType t{}; m_functor(t, w); diff --git a/packages/kokkos/core/src/View/Kokkos_BasicView.hpp b/packages/kokkos/core/src/View/Kokkos_BasicView.hpp new file mode 100644 index 000000000000..29eafca62eef --- /dev/null +++ b/packages/kokkos/core/src/View/Kokkos_BasicView.hpp @@ -0,0 +1,652 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE +static_assert(false, + "Including non-public Kokkos header files is not allowed."); +#endif + +#ifndef KOKKOS_BASIC_VIEW_HPP +#define KOKKOS_BASIC_VIEW_HPP +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +// FIXME: we need to make this work for not using our mdspan impl +#define KOKKOS_IMPL_NO_UNIQUE_ADDRESS _MDSPAN_NO_UNIQUE_ADDRESS +namespace Kokkos::Impl { + +constexpr inline struct SubViewCtorTag { + explicit SubViewCtorTag() = default; +} subview_ctor_tag{}; + +template +struct KokkosSliceToMDSpanSliceImpl { + using type = T; + KOKKOS_FUNCTION + static constexpr decltype(auto) transform(const T &s) { return s; } +}; + +template <> +struct KokkosSliceToMDSpanSliceImpl { + using type = full_extent_t; + KOKKOS_FUNCTION + static constexpr decltype(auto) transform(Kokkos::ALL_t) { + return full_extent; + } +}; + +template +using kokkos_slice_to_mdspan_slice = + typename KokkosSliceToMDSpanSliceImpl::type; + +template +KOKKOS_INLINE_FUNCTION constexpr decltype(auto) +transform_kokkos_slice_to_mdspan_slice(const T &s) { + return KokkosSliceToMDSpanSliceImpl::transform(s); +} + +// We do have implementation detail versions of these in our mdspan impl +// However they are not part of the public standard interface +template +struct is_layout_right_padded : public std::false_type {}; + +template +struct is_layout_right_padded> + : public std::true_type {}; + +template +struct is_layout_left_padded : public std::false_type {}; + +template +struct is_layout_left_padded> + : public std::true_type {}; + +template +class BasicView { + public: + using mdspan_type = + mdspan; + using extents_type = typename mdspan_type::extents_type; + using layout_type = typename mdspan_type::layout_type; + using accessor_type = typename mdspan_type::accessor_type; + using mapping_type = typename mdspan_type::mapping_type; + using element_type = typename mdspan_type::element_type; + using value_type = typename mdspan_type::value_type; + using index_type = typename mdspan_type::index_type; + using size_type = typename mdspan_type::size_type; + using rank_type = typename mdspan_type::rank_type; + using data_handle_type = typename mdspan_type::data_handle_type; + using reference = typename mdspan_type::reference; + using memory_space = typename accessor_type::memory_space; + using execution_space = typename memory_space::execution_space; + + // For now View and BasicView will have a restriction that the data handle + // needs to be convertible to element_type* and vice versa + static_assert(std::is_constructible_v); + static_assert(std::is_constructible_v); + + KOKKOS_FUNCTION static constexpr rank_type rank() noexcept { + return extents_type::rank(); + } + KOKKOS_FUNCTION static constexpr rank_type rank_dynamic() noexcept { + return extents_type::rank_dynamic(); + } + KOKKOS_FUNCTION static constexpr size_t static_extent(rank_type r) noexcept { + return extents_type::static_extent(r); + } + KOKKOS_FUNCTION constexpr index_type extent(rank_type r) const noexcept { + return m_map.extents().extent(r); + }; + + protected: + // These are pre-condition checks which are unconditionally (i.e. in release + // mode) enabled in Kokkos::View 4.4 + template + KOKKOS_FUNCTION static constexpr void check_basic_view_constructibility( + [[maybe_unused]] const OtherMapping &rhs) { + using src_t = typename OtherMapping::layout_type; + using dst_t = layout_type; + constexpr size_t rnk = mdspan_type::rank(); + if constexpr (!std::is_same_v) { + if constexpr (Impl::is_layout_left_padded::value) { + if constexpr (std::is_same_v) { + index_type stride = 1; + for (size_t r = 0; r < rnk; r++) { + if (rhs.stride(r) != stride) + Kokkos::abort("View assignment must have compatible layouts"); + if constexpr (rnk > 1) + stride *= (r == 0 ? rhs.stride(1) : rhs.extents().extent(r)); + } + } + } + if constexpr (Impl::is_layout_right_padded::value) { + if constexpr (std::is_same_v) { + index_type stride = 1; + if constexpr (rnk > 0) { + for (size_t r = rnk; r > 0; r--) { + if (rhs.stride(r - 1) != stride) + Kokkos::abort("View assignment must have compatible layouts"); + if constexpr (rnk > 1) + stride *= (r == rnk ? rhs.stride(r - 2) + : rhs.extents().extent(r - 1)); + } + } + } + } + if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { + index_type stride = 1; + for (size_t r = 0; r < rnk; r++) { + if (rhs.stride(r) != stride) + Kokkos::abort("View assignment must have compatible layouts"); + stride *= rhs.extents().extent(r); + } + } else if constexpr (Impl::is_layout_left_padded::value && + rnk > 1) { + if (rhs.stride(1) != rhs.extents().extent(0)) + Kokkos::abort("View assignment must have compatible layouts"); + } + } + if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { + index_type stride = 1; + if constexpr (rnk > 0) { + for (size_t r = rnk; r > 0; r--) { + if (rhs.stride(r - 1) != stride) + Kokkos::abort("View assignment must have compatible layouts"); + stride *= rhs.extents().extent(r - 1); + } + } + } else if constexpr (Impl::is_layout_right_padded::value && + rnk > 1) { + if (rhs.stride(rnk - 2) != rhs.extents().extent(rnk - 1)) + Kokkos::abort("View assignment must have compatible layouts"); + } + } + } + } + + public: + KOKKOS_DEFAULTED_FUNCTION constexpr BasicView() = default; + + KOKKOS_FUNCTION constexpr BasicView(const mdspan_type &other) + : m_ptr(other.data_handle()), + m_map(other.mapping()), + m_acc(other.accessor()){}; + KOKKOS_FUNCTION constexpr BasicView(mdspan_type &&other) + : m_ptr(std::move(other.data_handle())), + m_map(std::move(other.mapping())), + m_acc(std::move(other.accessor())){}; + + template + // requires(std::is_constructible_v) + KOKKOS_FUNCTION explicit constexpr BasicView( + std::enable_if_t, + data_handle_type> + p, + OtherIndexTypes... exts) + : m_ptr(std::move(p)), + m_map(extents_type(static_cast(std::move(exts))...)), + m_acc{} {} + + template + // When doing C++20 we should switch to this, the conditional explicit we + // can't do in 17 + // requires(std::is_constructible_v>) + // explicit(Size != rank_dynamic()) + KOKKOS_FUNCTION constexpr BasicView( + std::enable_if_t< + std::is_constructible_v>, + data_handle_type> + p, + const Array &exts) + : m_ptr(std::move(p)), m_map(extents_type(exts)), m_acc{} {} + + KOKKOS_FUNCTION constexpr BasicView(data_handle_type p, + const extents_type &exts) +// Compilation will simply fail in C++17 and overload set should not be an issue +#ifndef KOKKOS_ENABLE_CXX17 + requires(std::is_default_constructible_v && + std::is_constructible_v) +#endif + : m_ptr(std::move(p)), m_map(exts), m_acc{} { + } + + KOKKOS_FUNCTION constexpr BasicView(data_handle_type p, const mapping_type &m) +// Compilation will simply fail in C++17 and overload set should not be an issue +#ifndef KOKKOS_ENABLE_CXX17 + requires(std::is_default_constructible_v) +#endif + : m_ptr(std::move(p)), m_map(m), m_acc{} { + } + + KOKKOS_FUNCTION constexpr BasicView(data_handle_type p, const mapping_type &m, + const accessor_type &a) + : m_ptr(std::move(p)), m_map(m), m_acc(a) {} + + template +// requires(std::is_constructible_v::mdspan_type>) +#ifndef KOKKOS_ENABLE_CXX17 + explicit( + !std::is_convertible_v &, + mapping_type> || + !std::is_convertible_v) +#endif + KOKKOS_INLINE_FUNCTION + BasicView(const BasicView &other, + std::enable_if_t< + std::is_constructible_v< + mdspan_type, typename BasicView::mdspan_type>, + void *> = nullptr) + : m_ptr(other.m_ptr), m_map(other.m_map), m_acc(other.m_acc) { + // Kokkos View precondition checks happen in release builds + check_basic_view_constructibility(other.mapping()); + + static_assert( + std::is_constructible_v, + "Kokkos::View: incompatible data_handle_type for View construction"); + static_assert(std::is_constructible_v, + "Kokkos::View: incompatible extents for View construction"); + } + + template +// requires(std::is_constructible_v>) +#ifndef KOKKOS_ENABLE_CXX17 + explicit( + !std::is_convertible_v &, + mapping_type> || + !std::is_convertible_v) +#endif + KOKKOS_INLINE_FUNCTION + BasicView(const mdspan &other, + std::enable_if_t< + std::is_constructible_v< + mdspan_type, mdspan>, + void *> = nullptr) + : m_ptr(other.data_handle()), + m_map(other.mapping()), + m_acc(other.accessor()) { + // Kokkos View precondition checks happen in release builds + check_basic_view_constructibility(other.mapping()); + + static_assert( + std::is_constructible_v, + "Kokkos::View: incompatible data_handle_type for View construction"); + static_assert(std::is_constructible_v, + "Kokkos::View: incompatible extents for View construction"); + } + + // Allocating constructors specific to BasicView + /// + /// Construct from a given mapping + /// + explicit constexpr BasicView(const std::string &label, + const mapping_type &mapping) + : BasicView(view_alloc(label), mapping) {} + + /// + /// Construct from a given extents + /// + explicit constexpr BasicView(const std::string &label, + const extents_type &ext) + : BasicView(view_alloc(label), mapping_type{ext}) {} + + private: + template + data_handle_type create_data_handle( + const Impl::ViewCtorProp &arg_prop, + const typename mdspan_type::mapping_type &arg_mapping) { + constexpr bool has_exec = Impl::ViewCtorProp::has_execution_space; + // Copy the input allocation properties with possibly defaulted properties + // We need to split it in two to avoid MSVC compiler errors + auto prop_copy_tmp = + Impl::with_properties_if_unset(arg_prop, std::string{}); + auto prop_copy = Impl::with_properties_if_unset( + prop_copy_tmp, memory_space{}, execution_space{}); + using alloc_prop = decltype(prop_copy); + + if (alloc_prop::initialize && + !alloc_prop::execution_space::impl_is_initialized()) { + // If initializing view data then + // the execution space must be initialized. + Kokkos::Impl::throw_runtime_exception( + "Constructing View and initializing data with uninitialized " + "execution space"); + } + return data_handle_type(Impl::make_shared_allocation_record( + arg_mapping.required_span_size(), + Impl::get_property(prop_copy), + Impl::get_property(prop_copy), + has_exec ? std::optional{Impl::get_property< + Impl::ExecutionSpaceTag>(prop_copy)} + : std::optional{std::nullopt}, + std::integral_constant(), + std::integral_constant())); + } + + public: + template + // requires(!Impl::ViewCtorProp::has_pointer) + explicit inline BasicView( + const Impl::ViewCtorProp &arg_prop, + std::enable_if_t::has_pointer, + typename mdspan_type::mapping_type> const &arg_mapping) + : BasicView(create_data_handle(arg_prop, arg_mapping), arg_mapping) {} + + template + // requires(Impl::ViewCtorProp::has_pointer) + KOKKOS_FUNCTION explicit inline BasicView( + const Impl::ViewCtorProp &arg_prop, + std::enable_if_t::has_pointer, + typename mdspan_type::mapping_type> const &arg_mapping) + : BasicView( + data_handle_type(Impl::get_property(arg_prop)), + arg_mapping) {} + + protected: + template + KOKKOS_INLINE_FUNCTION BasicView( + Impl::SubViewCtorTag, + const BasicView &src_view, + SliceSpecifiers... slices) + : BasicView(submdspan( + src_view.to_mdspan(), + Impl::transform_kokkos_slice_to_mdspan_slice(slices)...)) {} + + public: + //---------------------------------------- + // Conversion to MDSpan + template , + mdspan_type>>> + KOKKOS_INLINE_FUNCTION constexpr + operator mdspan() const { + return mdspan_type(m_ptr, m_map, m_acc); + } + + // Here we use an overload instead of a default parameter as a workaround + // to a potential compiler bug with clang 17. It may be present in other + // compilers + template >> + KOKKOS_INLINE_FUNCTION constexpr auto to_mdspan() const { + using ret_mdspan_type = + mdspan; + return ret_mdspan_type( + static_cast( + data_handle()), + mapping(), static_cast(accessor())); + } + + template < + class OtherAccessorType = AccessorPolicy, + typename = std::enable_if_t>> + KOKKOS_INLINE_FUNCTION constexpr auto to_mdspan( + const OtherAccessorType &other_accessor) const { + using ret_mdspan_type = + mdspan; + return ret_mdspan_type( + static_cast( + data_handle()), + mapping(), other_accessor); + } + + KOKKOS_FUNCTION void assign_data(element_type *ptr) { m_ptr = ptr; } + + // ========================= mdspan ================================= + + // [mdspan.mdspan.members], members + +// Introducing the C++20 and C++23 variants of the operators already +#ifndef KOKKOS_ENABLE_CXX17 +#ifndef KOKKOS_ENABLE_CXX20 + // C++23 only operator[] + template + requires((std::is_convertible_v && ...) && + (std::is_nothrow_constructible_v && + ...) && + (sizeof...(OtherIndexTypes) == rank())) + KOKKOS_FUNCTION constexpr reference operator[]( + OtherIndexTypes... indices) const { + return m_acc.access(m_ptr, + m_map(static_cast(std::move(indices))...)); + } + + template + requires( + std::is_convertible_v && + std::is_nothrow_constructible_v) + KOKKOS_FUNCTION constexpr reference operator[]( + const Array &indices) const { + return m_acc.access(m_ptr, + [&](std::index_sequence) { + return m_map(indices[Idxs]...); + }(std::make_index_sequence())); + } + + template + requires( + std::is_convertible_v && + std::is_nothrow_constructible_v) + KOKKOS_FUNCTION constexpr reference operator[]( + std::span indices) const { + return m_acc.access(m_ptr, + [&](std::index_sequence) { + return m_map(indices[Idxs]...); + }(std::make_index_sequence())); + } +#endif + + // C++20 operator() + template + requires((std::is_convertible_v && ...) && + (std::is_nothrow_constructible_v && + ...) && + (sizeof...(OtherIndexTypes) == rank())) + KOKKOS_FUNCTION constexpr reference operator()( + OtherIndexTypes... indices) const { + return m_acc.access(m_ptr, + m_map(static_cast(std::move(indices))...)); + } + + template + requires( + std::is_convertible_v && + std::is_nothrow_constructible_v) + KOKKOS_FUNCTION constexpr reference operator()( + const Array &indices) const { + return m_acc.access(m_ptr, + [&](std::index_sequence) { + return m_map(indices[Idxs]...); + }(std::make_index_sequence())); + } + + template + requires( + std::is_convertible_v && + std::is_nothrow_constructible_v) + KOKKOS_FUNCTION constexpr reference operator()( + std::span indices) const { + return m_acc.access(m_ptr, + [&](std::index_sequence) { + return m_map(indices[Idxs]...); + }(std::make_index_sequence())); + } +#else + // C++17 variant of operator() + + // Some weird unexplained issue in compiling the SFINAE version with CUDA/MSVC + // So we just use post factor check here with static_assert +#if defined(KOKKOS_ENABLE_CUDA) && defined(_WIN32) + template + KOKKOS_FUNCTION constexpr reference operator()( + OtherIndexTypes... indices) const { + static_assert((std::is_convertible_v && ...)); + static_assert( + (std::is_nothrow_constructible_v && ...)); + static_assert((sizeof...(OtherIndexTypes)) == rank()); + return m_acc.access(m_ptr, + m_map(static_cast(std::move(indices))...)); + } +#else + template + KOKKOS_FUNCTION constexpr std::enable_if_t< + ((std::is_convertible_v && ...)) && + ((std::is_nothrow_constructible_v && + ...)) && + ((sizeof...(OtherIndexTypes)) == rank()), + reference> + operator()(OtherIndexTypes... indices) const { + return m_acc.access(m_ptr, + m_map(static_cast(std::move(indices))...)); + } +#endif +#endif + + private: + // FIXME_CXX20: could use inline templated lambda in C++20 mode inside size() + template + KOKKOS_FUNCTION constexpr size_type size_impl( + std::index_sequence) const noexcept { + // Note we restrict data_handle to be convertible to element_type* for now. + // This is also different from mdspan: mdspan can NOT be legally in a state + // where m_ptr is nullptr and the product of extents is non-zero + // The default constructor of mdspan is constrained to dynamic_rank > 0 + // For View we do not have that constraint today + if (data_handle() == nullptr) return 0u; + return ((static_cast(m_map.extents().extent(Idxs))) * ... * + size_type(1)); + } + + public: + KOKKOS_FUNCTION constexpr size_type size() const noexcept { + return size_impl(std::make_index_sequence()); + } + + private: + // FIXME_CXX20: could use inline templated lambda in C++20 mode inside empty() + template + KOKKOS_FUNCTION constexpr bool empty_impl( + std::index_sequence) const noexcept { + // Note we restrict data_handle to be convertible to element_type* for now. + // This is also different from mdspan: mdspan can NOT be legally in a state + // where m_ptr is nullptr and the product of extents is non-zero + // The default constructor of mdspan is constrained to dynamic_rank > 0 + // For View we do not have that constraint today + if (data_handle() == nullptr) return true; + return (rank() > 0) && + ((m_map.extents().extent(Idxs) == index_type(0)) || ... || false); + } + + public: + [[nodiscard]] KOKKOS_FUNCTION constexpr bool empty() const noexcept { + return empty_impl(std::make_index_sequence()); + } + + KOKKOS_FUNCTION friend constexpr void swap(BasicView &x, + BasicView &y) noexcept { + kokkos_swap(x.m_ptr, y.m_ptr); + kokkos_swap(x.m_map, y.m_map); + kokkos_swap(x.m_acc, y.m_acc); + } + + KOKKOS_FUNCTION constexpr const extents_type &extents() const noexcept { + return m_map.extents(); + }; + KOKKOS_FUNCTION constexpr const data_handle_type &data_handle() + const noexcept { + return m_ptr; + }; + KOKKOS_FUNCTION constexpr const mapping_type &mapping() const noexcept { + return m_map; + }; + KOKKOS_FUNCTION constexpr const accessor_type &accessor() const noexcept { + return m_acc; + }; + + KOKKOS_FUNCTION static constexpr bool is_always_unique() noexcept { + return mapping_type::is_always_unique(); + }; + KOKKOS_FUNCTION static constexpr bool is_always_exhaustive() noexcept { + return mapping_type::is_always_exhaustive(); + }; + KOKKOS_FUNCTION static constexpr bool is_always_strided() noexcept { + return mapping_type::is_always_strided(); + }; + + KOKKOS_FUNCTION constexpr bool is_unique() const { + return m_map.is_unique(); + }; + KOKKOS_FUNCTION constexpr bool is_exhaustive() const { + return m_map.is_exhaustive(); + }; + KOKKOS_FUNCTION constexpr bool is_strided() const { + return m_map.is_strided(); + }; + KOKKOS_FUNCTION constexpr index_type stride(rank_type r) const { + return m_map.stride(r); + }; + + protected: +#ifndef __NVCC__ + KOKKOS_IMPL_NO_UNIQUE_ADDRESS data_handle_type m_ptr{}; + KOKKOS_IMPL_NO_UNIQUE_ADDRESS mapping_type m_map{}; + KOKKOS_IMPL_NO_UNIQUE_ADDRESS accessor_type m_acc{}; +#else + data_handle_type m_ptr{}; + mapping_type m_map{}; + accessor_type m_acc{}; +#endif + + template + friend class BasicView; +}; +} // namespace Kokkos::Impl + +#endif diff --git a/packages/kokkos/core/src/View/Kokkos_ViewAlloc.hpp b/packages/kokkos/core/src/View/Kokkos_ViewAlloc.hpp index 1ade75692f1f..eb11630b21b8 100644 --- a/packages/kokkos/core/src/View/Kokkos_ViewAlloc.hpp +++ b/packages/kokkos/core/src/View/Kokkos_ViewAlloc.hpp @@ -26,6 +26,7 @@ static_assert(false, #include #include #include +#include #include #include @@ -41,22 +42,8 @@ bool is_zero_byte(const T& x) { return std::memcmp(&x, all_zeroes, sizeof(T)) == 0; } -//---------------------------------------------------------------------------- - -/* - * The construction, assignment to default, and destruction - * are merged into a single functor. - * Primarily to work around an unresolved CUDA back-end bug - * that would lose the destruction cuda device function when - * called from the shared memory tracking destruction. - * Secondarily to have two fewer partial specializations. - */ -template ::value> -struct ViewValueFunctor; - template -struct ViewValueFunctor { +struct ViewValueFunctor { using ExecSpace = typename DeviceType::execution_space; struct DestroyTag {}; @@ -68,20 +55,31 @@ struct ViewValueFunctor { std::string name; bool default_exec_space; - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t::value> - operator()(ConstructTag const&, const size_t i) const { + template + KOKKOS_FUNCTION + std::enable_if_t> + operator()(ConstructTag, const size_t i) const { new (ptr + i) ValueType(); } - KOKKOS_INLINE_FUNCTION void operator()(DestroyTag const&, - const size_t i) const { + KOKKOS_FUNCTION void operator()(DestroyTag, const size_t i) const { + // When instantiating a View on host execution space with a host only + // destructor the workaround for CUDA device symbol instantiation tries to + // still compile a destruction kernel for the device, and issues a warning + // for host from host-device +#ifdef KOKKOS_ENABLE_CUDA + if constexpr (std::is_same_v) { + KOKKOS_IF_ON_DEVICE(((ptr + i)->~ValueType();)) + } else { + KOKKOS_IF_ON_HOST(((ptr + i)->~ValueType();)) + } +#else (ptr + i)->~ValueType(); +#endif } - ViewValueFunctor() = default; - ViewValueFunctor(const ViewValueFunctor&) = default; + ViewValueFunctor() = default; + ViewValueFunctor(const ViewValueFunctor&) = default; ViewValueFunctor& operator=(const ViewValueFunctor&) = default; ViewValueFunctor(ExecSpace const& arg_space, ValueType* const arg_ptr, @@ -104,49 +102,6 @@ struct ViewValueFunctor { functor_instantiate_workaround(); } - template - std::enable_if_t::value && - std::is_trivially_copy_assignable::value> - construct_dispatch() { - ValueType value{}; -// On A64FX memset seems to do the wrong thing with regards to first touch -// leading to the significant performance issues -#ifndef KOKKOS_ARCH_A64FX - if (Impl::is_zero_byte(value)) { - uint64_t kpID = 0; - if (Kokkos::Profiling::profileLibraryLoaded()) { - // We are not really using parallel_for here but using beginParallelFor - // instead of begin_parallel_for (and adding "via memset") is the best - // we can do to indicate that this is not supposed to be tunable (and - // doesn't really execute a parallel_for). - Kokkos::Profiling::beginParallelFor( - "Kokkos::View::initialization [" + name + "] via memset", - Kokkos::Profiling::Experimental::device_id(space), &kpID); - } - (void)ZeroMemset( - space, Kokkos::View>(ptr, n)); - - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::endParallelFor(kpID); - } - if (default_exec_space) - space.fence("Kokkos::Impl::ViewValueFunctor: View init/destroy fence"); - } else { -#endif - parallel_for_implementation(); -#ifndef KOKKOS_ARCH_A64FX - } -#endif - } - - template - std::enable_if_t::value && - std::is_trivially_copy_assignable::value)> - construct_dispatch() { - parallel_for_implementation(); - } - template void parallel_for_implementation() { using PolicyType = @@ -172,24 +127,62 @@ struct ViewValueFunctor { const Kokkos::Impl::ParallelFor closure( *this, policy); closure.execute(); - if (default_exec_space || std::is_same_v) - space.fence("Kokkos::Impl::ViewValueFunctor: View init/destroy fence"); if (Kokkos::Profiling::profileLibraryLoaded()) { Kokkos::Profiling::endParallelFor(kpID); } + if (default_exec_space || std::is_same_v) { + space.fence(std::is_same_v + ? "Kokkos::View::destruction before deallocate" + : "Kokkos::View::initialization"); + } } - void construct_shared_allocation() { construct_dispatch(); } + // Shortcut for zero initialization + void zero_memset_implementation() { + uint64_t kpID = 0; + if (Kokkos::Profiling::profileLibraryLoaded()) { + // We are not really using parallel_for here but using beginParallelFor + // instead of begin_parallel_for (and adding "via memset") is the best + // we can do to indicate that this is not supposed to be tunable (and + // doesn't really execute a parallel_for). + Kokkos::Profiling::beginParallelFor( + "Kokkos::View::initialization [" + name + "] via memset", + Kokkos::Profiling::Experimental::device_id(space), &kpID); + } + + (void)ZeroMemset(space, ptr, n * sizeof(ValueType)); + + if (Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Profiling::endParallelFor(kpID); + } + if (default_exec_space) { + space.fence("Kokkos::View::initialization via memset"); + } + } + + void construct_shared_allocation() { +// On A64FX memset seems to do the wrong thing with regards to first touch +// leading to the significant performance issues +#ifndef KOKKOS_ARCH_A64FX + if constexpr (std::is_trivial_v) { + // value-initialization is equivalent to filling with zeros + zero_memset_implementation(); + } else +#endif + parallel_for_implementation(); + } void destroy_shared_allocation() { + if constexpr (std::is_trivially_destructible_v) { + // do nothing, don't bother calling the destructor + } else { #ifdef KOKKOS_ENABLE_IMPL_VIEW_OF_VIEWS_DESTRUCTOR_PRECONDITION_VIOLATION_WORKAROUND - if constexpr (std::is_same_v) - for (size_t i = 0; i < n; ++i) (ptr + i)->~ValueType(); - else + if constexpr (std::is_same_v) + for (size_t i = 0; i < n; ++i) (ptr + i)->~ValueType(); + else #endif - { - parallel_for_implementation(); + parallel_for_implementation(); } } @@ -206,114 +199,6 @@ struct ViewValueFunctor { } }; -template -struct ViewValueFunctor { - using ExecSpace = typename DeviceType::execution_space; - using PolicyType = Kokkos::RangePolicy>; - - ExecSpace space; - ValueType* ptr; - size_t n; - std::string name; - bool default_exec_space; - - KOKKOS_INLINE_FUNCTION - void operator()(const size_t i) const { ptr[i] = ValueType(); } - - ViewValueFunctor() = default; - ViewValueFunctor(const ViewValueFunctor&) = default; - ViewValueFunctor& operator=(const ViewValueFunctor&) = default; - - ViewValueFunctor(ExecSpace const& arg_space, ValueType* const arg_ptr, - size_t const arg_n, std::string arg_name) - : space(arg_space), - ptr(arg_ptr), - n(arg_n), - name(std::move(arg_name)), - default_exec_space(false) {} - - ViewValueFunctor(ValueType* const arg_ptr, size_t const arg_n, - std::string arg_name) - : space(ExecSpace{}), - ptr(arg_ptr), - n(arg_n), - name(std::move(arg_name)), - default_exec_space(true) {} - - template - std::enable_if_t::value && - std::is_trivially_copy_assignable::value> - construct_shared_allocation() { - // Shortcut for zero initialization -// On A64FX memset seems to do the wrong thing with regards to first touch -// leading to the significant performance issues -#ifndef KOKKOS_ARCH_A64FX - ValueType value{}; - if (Impl::is_zero_byte(value)) { - uint64_t kpID = 0; - if (Kokkos::Profiling::profileLibraryLoaded()) { - // We are not really using parallel_for here but using beginParallelFor - // instead of begin_parallel_for (and adding "via memset") is the best - // we can do to indicate that this is not supposed to be tunable (and - // doesn't really execute a parallel_for). - Kokkos::Profiling::beginParallelFor( - "Kokkos::View::initialization [" + name + "] via memset", - Kokkos::Profiling::Experimental::device_id(space), &kpID); - } - - (void)ZeroMemset( - space, Kokkos::View>(ptr, n)); - - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::endParallelFor(kpID); - } - if (default_exec_space) - space.fence("Kokkos::Impl::ViewValueFunctor: View init/destroy fence"); - } else { -#endif - parallel_for_implementation(); -#ifndef KOKKOS_ARCH_A64FX - } -#endif - } - - template - std::enable_if_t::value && - std::is_trivially_copy_assignable::value)> - construct_shared_allocation() { - parallel_for_implementation(); - } - - void parallel_for_implementation() { - PolicyType policy(space, 0, n); - uint64_t kpID = 0; - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::beginParallelFor( - "Kokkos::View::initialization [" + name + "]", - Kokkos::Profiling::Experimental::device_id(space), &kpID); - } -#ifdef KOKKOS_ENABLE_CUDA - if (std::is_same::value) { - Kokkos::Impl::cuda_prefetch_pointer(space, ptr, sizeof(ValueType) * n, - true); - } -#endif - const Kokkos::Impl::ParallelFor closure( - *this, policy); - closure.execute(); - if (default_exec_space) - space.fence( - "Kokkos::Impl::ViewValueFunctor: Fence after setting values in " - "view"); - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::endParallelFor(kpID); - } - } - - void destroy_shared_allocation() {} -}; - template struct ViewValueFunctorSequentialHostInit { using ExecSpace = typename DeviceType::execution_space; @@ -358,6 +243,63 @@ struct ViewValueFunctorSequentialHostInit { } }; +template +Kokkos::Impl::SharedAllocationRecord* make_shared_allocation_record( + const size_t& required_span_size, std::string_view label, + const MemorySpace& memory_space, + const std::optional exec_space, + std::bool_constant, std::bool_constant) { + static_assert(SpaceAccessibility::accessible); + + // Use this for constructing and destroying the view + using device_type = Kokkos::Device; + using functor_type = std::conditional_t< + SequentialInit, + ViewValueFunctorSequentialHostInit, + ViewValueFunctor>; + using record_type = + Kokkos::Impl::SharedAllocationRecord; + + /* Force alignment of allocations on on 8 byte boundaries even for + * element types smaller than 8 bytes */ + static constexpr std::size_t align_mask = 0x7; + + // Calculate the total size of the memory, in bytes, and make sure it is + // byte-aligned + const std::size_t alloc_size = + (required_span_size * sizeof(ElementType) + align_mask) & ~align_mask; + + auto* record = + exec_space + ? record_type::allocate(*exec_space, memory_space, std::string{label}, + alloc_size) + : record_type::allocate(memory_space, std::string{label}, alloc_size); + + auto ptr = static_cast(record->data()); + + auto functor = + exec_space ? functor_type(*exec_space, ptr, required_span_size, + std::string{label}) + : functor_type(ptr, required_span_size, std::string{label}); + + // Only initialize if the allocation is non-zero. + // May be zero if one of the dimensions is zero. + if constexpr (Initialize) { + if (alloc_size) { + // Assume destruction is only required when construction is requested. + // The ViewValueFunctor has both value construction and destruction + // operators. + record->m_destroy = std::move(functor); + + // Construct values + record->m_destroy.construct_shared_allocation(); + } + } + + return record; +} + } // namespace Kokkos::Impl #endif // KOKKOS_VIEW_ALLOC_HPP diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_View.hpp b/packages/kokkos/core/src/View/Kokkos_ViewAtomic.hpp similarity index 96% rename from packages/kokkos/core/src/impl/Kokkos_Atomic_View.hpp rename to packages/kokkos/core/src/View/Kokkos_ViewAtomic.hpp index 23d4c2524c79..f77066b70f57 100644 --- a/packages/kokkos/core/src/impl/Kokkos_Atomic_View.hpp +++ b/packages/kokkos/core/src/View/Kokkos_ViewAtomic.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef KOKKOS_ATOMIC_VIEW_HPP -#define KOKKOS_ATOMIC_VIEW_HPP +#ifndef KOKKOS_VIEWATOMIC_HPP +#define KOKKOS_VIEWATOMIC_HPP #include #include @@ -44,10 +44,10 @@ class AtomicDataElement { } KOKKOS_INLINE_FUNCTION - void inc() const { Kokkos::atomic_increment(ptr); } + void inc() const { Kokkos::atomic_inc(ptr); } KOKKOS_INLINE_FUNCTION - void dec() const { Kokkos::atomic_decrement(ptr); } + void dec() const { Kokkos::atomic_dec(ptr); } KOKKOS_INLINE_FUNCTION const_value_type operator++() const { @@ -215,7 +215,7 @@ class AtomicViewDataHandle { } KOKKOS_INLINE_FUNCTION - operator typename ViewTraits::value_type*() const { return ptr; } + operator typename ViewTraits::value_type *() const { return ptr; } }; } // namespace Impl diff --git a/packages/kokkos/core/src/impl/Kokkos_ViewCtor.hpp b/packages/kokkos/core/src/View/Kokkos_ViewCtor.hpp similarity index 84% rename from packages/kokkos/core/src/impl/Kokkos_ViewCtor.hpp rename to packages/kokkos/core/src/View/Kokkos_ViewCtor.hpp index 379180ae6435..f08047471728 100644 --- a/packages/kokkos/core/src/impl/Kokkos_ViewCtor.hpp +++ b/packages/kokkos/core/src/View/Kokkos_ViewCtor.hpp @@ -72,8 +72,8 @@ struct ViewCtorProp {}; */ template struct ViewCtorProp> { - ViewCtorProp() = default; - ViewCtorProp(const ViewCtorProp &) = default; + ViewCtorProp() = default; + ViewCtorProp(const ViewCtorProp &) = default; ViewCtorProp &operator=(const ViewCtorProp &) = default; using type = CommonViewAllocProp; @@ -92,8 +92,8 @@ struct ViewCtorProp || std::is_same_v || std::is_same_v>, P> { - ViewCtorProp() = default; - ViewCtorProp(const ViewCtorProp &) = default; + ViewCtorProp() = default; + ViewCtorProp(const ViewCtorProp &) = default; ViewCtorProp &operator=(const ViewCtorProp &) = default; using type = P; @@ -106,14 +106,14 @@ struct ViewCtorProp || /* Map input label type to std::string */ template struct ViewCtorProp::value>, Label> { - ViewCtorProp() = default; - ViewCtorProp(const ViewCtorProp &) = default; + ViewCtorProp() = default; + ViewCtorProp(const ViewCtorProp &) = default; ViewCtorProp &operator=(const ViewCtorProp &) = default; using type = std::string; ViewCtorProp(const type &arg) : value(arg) {} - ViewCtorProp(type &&arg) : value(arg) {} + ViewCtorProp(type &&arg) : value(std::move(arg)) {} type value; }; @@ -122,8 +122,8 @@ template struct ViewCtorProp::value || Kokkos::is_execution_space::value>, Space> { - ViewCtorProp() = default; - ViewCtorProp(const ViewCtorProp &) = default; + ViewCtorProp() = default; + ViewCtorProp(const ViewCtorProp &) = default; ViewCtorProp &operator=(const ViewCtorProp &) = default; using type = Space; @@ -135,8 +135,8 @@ struct ViewCtorProp::value || template struct ViewCtorProp { - ViewCtorProp() = default; - ViewCtorProp(const ViewCtorProp &) = default; + ViewCtorProp() = default; + ViewCtorProp(const ViewCtorProp &) = default; ViewCtorProp &operator=(const ViewCtorProp &) = default; using type = T *; @@ -213,14 +213,19 @@ struct ViewCtorProp : public ViewCtorProp... { using execution_space = typename var_execution_space::type; using pointer_type = typename var_pointer::type; - /* Copy from a matching argument list. - * Requires std::is_same< P , ViewCtorProp< void , Args >::value ... - */ - template - inline ViewCtorProp(Args const &... args) : ViewCtorProp(args)... {} + // Construct from a matching argument list. + // + // Note that if P is empty, this constructor is the default constructor. + // On the other hand, if P is not empty, the constraint implies that + // there is no default constructor. + template , Args &&>...>>> + ViewCtorProp(Args &&...args) + : ViewCtorProp(std::forward(args))... {} template - KOKKOS_FUNCTION ViewCtorProp(pointer_type arg0, Args const &... args) + KOKKOS_FUNCTION ViewCtorProp(pointer_type arg0, Args const &...args) : ViewCtorProp(arg0), ViewCtorProp::type>(args)... {} @@ -252,7 +257,7 @@ auto with_properties_if_unset(const ViewCtorProp &view_ctor_prop) { template auto with_properties_if_unset(const ViewCtorProp &view_ctor_prop, [[maybe_unused]] const Property &property, - const Properties &... properties) { + const Properties &...properties) { if constexpr ((is_execution_space::value && !ViewCtorProp::has_execution_space) || (is_memory_space::value && @@ -302,7 +307,7 @@ template struct WithPropertiesIfUnset, Property, Properties...> { static constexpr auto apply_prop(const ViewCtorProp &view_ctor_prop, const Property &prop, - const Properties &... properties) { + const Properties &...properties) { if constexpr ((is_execution_space::value && !ViewCtorProp::has_execution_space) || (is_memory_space::value && @@ -328,7 +333,7 @@ struct WithPropertiesIfUnset, Property, Properties...> { template auto with_properties_if_unset(const ViewCtorProp &view_ctor_prop, - const Properties &... properties) { + const Properties &...properties) { return WithPropertiesIfUnset, Properties...>::apply_prop( view_ctor_prop, properties...); } @@ -437,6 +442,48 @@ using ViewAllocateWithoutInitializing = Impl::ViewCtorProp; +inline constexpr Kokkos::Impl::SequentialHostInit_t SequentialHostInit{}; + +inline constexpr Kokkos::Impl::WithoutInitializing_t WithoutInitializing{}; + +inline constexpr Kokkos::Impl::AllowPadding_t AllowPadding{}; + +/** \brief Create View allocation parameter bundle from argument list. + * + * Valid argument list members are: + * 1) label as a "string" or std::string + * 2) memory space instance of the View::memory_space type + * 3) execution space instance compatible with the View::memory_space + * 4) Kokkos::WithoutInitializing to bypass initialization + * 4) Kokkos::AllowPadding to allow allocation to pad dimensions for memory + * alignment + */ +template +auto view_alloc(Args &&...args) { + using return_type = Impl::ViewCtorProp>::type...>; + + static_assert(!return_type::has_pointer, + "Cannot give pointer-to-memory for view allocation"); + + return return_type(std::forward(args)...); +} + +template +KOKKOS_INLINE_FUNCTION + Impl::ViewCtorProp::type...> + view_wrap(Args const &...args) { + using return_type = + Impl::ViewCtorProp::type...>; + + static_assert(!return_type::has_memory_space && + !return_type::has_execution_space && + !return_type::has_label && return_type::has_pointer, + "Must only give pointer-to-memory for view wrapping"); + + return return_type(args...); +} + } /* namespace Kokkos */ //---------------------------------------------------------------------------- diff --git a/packages/kokkos/core/src/impl/Kokkos_ViewDataAnalysis.hpp b/packages/kokkos/core/src/View/Kokkos_ViewDataAnalysis.hpp similarity index 96% rename from packages/kokkos/core/src/impl/Kokkos_ViewDataAnalysis.hpp rename to packages/kokkos/core/src/View/Kokkos_ViewDataAnalysis.hpp index 04c0c9aeede7..37b6e2802fc9 100644 --- a/packages/kokkos/core/src/impl/Kokkos_ViewDataAnalysis.hpp +++ b/packages/kokkos/core/src/View/Kokkos_ViewDataAnalysis.hpp @@ -60,8 +60,8 @@ struct rank_dynamic { static constexpr size_t ArgN##R = (V != KOKKOS_INVALID_INDEX ? V : 1); \ static constexpr size_t N##R = (V != KOKKOS_INVALID_INDEX ? V : 1); \ KOKKOS_INLINE_FUNCTION explicit ViewDimension##R(size_t) {} \ - ViewDimension##R() = default; \ - ViewDimension##R(const ViewDimension##R&) = default; \ + ViewDimension##R() = default; \ + ViewDimension##R(const ViewDimension##R&) = default; \ ViewDimension##R& operator=(const ViewDimension##R&) = default; \ }; \ template \ @@ -72,8 +72,8 @@ struct rank_dynamic { struct ViewDimension##R<0u, RD> { \ static constexpr size_t ArgN##R = 0; \ std::conditional_t<(RD < 3), size_t, unsigned> N##R; \ - ViewDimension##R() = default; \ - ViewDimension##R(const ViewDimension##R&) = default; \ + ViewDimension##R() = default; \ + ViewDimension##R(const ViewDimension##R&) = default; \ ViewDimension##R& operator=(const ViewDimension##R&) = default; \ KOKKOS_INLINE_FUNCTION explicit ViewDimension##R(size_t V) : N##R(V) {} \ }; \ @@ -149,8 +149,8 @@ struct KOKKOS_IMPL_ENFORCE_EMPTY_BASE_OPTIMIZATION ViewDimension static constexpr unsigned rank = sizeof...(Vals); static constexpr unsigned rank_dynamic = Impl::rank_dynamic::value; - ViewDimension() = default; - ViewDimension(const ViewDimension&) = default; + ViewDimension() = default; + ViewDimension(const ViewDimension&) = default; ViewDimension& operator=(const ViewDimension&) = default; KOKKOS_INLINE_FUNCTION @@ -370,8 +370,7 @@ struct ViewDataAnalysis { // ValueType is opportunity for partial specialization. // Must match array analysis when this default template is used. static_assert( - std::is_same::value); + std::is_same_v); public: using specialize = void; // No specialization diff --git a/packages/kokkos/core/src/View/Kokkos_ViewLegacy.hpp b/packages/kokkos/core/src/View/Kokkos_ViewLegacy.hpp new file mode 100644 index 000000000000..fd406d58ccae --- /dev/null +++ b/packages/kokkos/core/src/View/Kokkos_ViewLegacy.hpp @@ -0,0 +1,1604 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE +#include +static_assert(false, + "Including non-public Kokkos header files is not allowed."); +#endif +#ifndef KOKKOS_VIEWLEGACY_HPP +#define KOKKOS_VIEWLEGACY_HPP + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +#ifdef KOKKOS_ENABLE_IMPL_MDSPAN +#include +#include +#include +#endif +#include + +#include + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +/** \class View + * \brief View to an array of data. + * + * A View represents an array of one or more dimensions. + * For details, please refer to Kokkos' tutorial materials. + * + * \section Kokkos_View_TemplateParameters Template parameters + * + * This class has both required and optional template parameters. The + * \c DataType parameter must always be provided, and must always be + * first. The parameters \c Arg1Type, \c Arg2Type, and \c Arg3Type are + * placeholders for different template parameters. The default value + * of the fifth template parameter \c Specialize suffices for most use + * cases. When explaining the template parameters, we won't refer to + * \c Arg1Type, \c Arg2Type, and \c Arg3Type; instead, we will refer + * to the valid categories of template parameters, in whatever order + * they may occur. + * + * Valid ways in which template arguments may be specified: + * - View< DataType > + * - View< DataType , Layout > + * - View< DataType , Layout , Space > + * - View< DataType , Layout , Space , MemoryTraits > + * - View< DataType , Space > + * - View< DataType , Space , MemoryTraits > + * - View< DataType , MemoryTraits > + * + * \tparam DataType (required) This indicates both the type of each + * entry of the array, and the combination of compile-time and + * run-time array dimension(s). For example, double* + * indicates a one-dimensional array of \c double with run-time + * dimension, and int*[3] a two-dimensional array of \c int + * with run-time first dimension and compile-time second dimension + * (of 3). In general, the run-time dimensions (if any) must go + * first, followed by zero or more compile-time dimensions. For + * more examples, please refer to the tutorial materials. + * + * \tparam Space (required) The memory space. + * + * \tparam Layout (optional) The array's layout in memory. For + * example, LayoutLeft indicates a column-major (Fortran style) + * layout, and LayoutRight a row-major (C style) layout. If not + * specified, this defaults to the preferred layout for the + * Space. + * + * \tparam MemoryTraits (optional) Assertion of the user's intended + * access behavior. For example, RandomAccess indicates read-only + * access with limited spatial locality, and Unmanaged lets users + * wrap externally allocated memory in a View without automatic + * deallocation. + * + * \section Kokkos_View_MT MemoryTraits discussion + * + * \subsection Kokkos_View_MT_Interp MemoryTraits interpretation depends on + * Space + * + * Some \c MemoryTraits options may have different interpretations for + * different \c Space types. For example, with the Cuda device, + * \c RandomAccess tells Kokkos to fetch the data through the texture + * cache, whereas the non-GPU devices have no such hardware construct. + * + * \subsection Kokkos_View_MT_PrefUse Preferred use of MemoryTraits + * + * Users should defer applying the optional \c MemoryTraits parameter + * until the point at which they actually plan to rely on it in a + * computational kernel. This minimizes the number of template + * parameters exposed in their code, which reduces the cost of + * compilation. Users may always assign a View without specified + * \c MemoryTraits to a compatible View with that specification. + * For example: + * \code + * // Pass in the simplest types of View possible. + * void + * doSomething (View out, + * View in) + * { + * // Assign the "generic" View in to a RandomAccess View in_rr. + * // Note that RandomAccess View objects must have const data. + * View in_rr = in; + * // ... do something with in_rr and out ... + * } + * \endcode + */ + +} // namespace Kokkos + +namespace Kokkos { + +template +struct is_always_assignable_impl; + +template +struct is_always_assignable_impl, + Kokkos::View> { + using mapping_type = Kokkos::Impl::ViewMapping< + typename Kokkos::View::traits, + typename Kokkos::View::traits, + typename Kokkos::View::traits::specialize>; + + constexpr static bool value = + mapping_type::is_assignable && + static_cast(Kokkos::View::rank_dynamic) >= + static_cast(Kokkos::View::rank_dynamic); +}; + +template +using is_always_assignable = is_always_assignable_impl< + std::remove_reference_t, + std::remove_const_t>>; + +template +inline constexpr bool is_always_assignable_v = + is_always_assignable::value; + +template +constexpr bool is_assignable(const Kokkos::View& dst, + const Kokkos::View& src) { + using DstTraits = typename Kokkos::View::traits; + using SrcTraits = typename Kokkos::View::traits; + using mapping_type = + Kokkos::Impl::ViewMapping; + + return is_always_assignable_v, + Kokkos::View> || + (mapping_type::is_assignable && + ((DstTraits::dimension::rank_dynamic >= 1) || + (dst.static_extent(0) == src.extent(0))) && + ((DstTraits::dimension::rank_dynamic >= 2) || + (dst.static_extent(1) == src.extent(1))) && + ((DstTraits::dimension::rank_dynamic >= 3) || + (dst.static_extent(2) == src.extent(2))) && + ((DstTraits::dimension::rank_dynamic >= 4) || + (dst.static_extent(3) == src.extent(3))) && + ((DstTraits::dimension::rank_dynamic >= 5) || + (dst.static_extent(4) == src.extent(4))) && + ((DstTraits::dimension::rank_dynamic >= 6) || + (dst.static_extent(5) == src.extent(5))) && + ((DstTraits::dimension::rank_dynamic >= 7) || + (dst.static_extent(6) == src.extent(6))) && + ((DstTraits::dimension::rank_dynamic >= 8) || + (dst.static_extent(7) == src.extent(7)))); +} + +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#include + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +template +class View; + +template +struct is_view : public std::false_type {}; + +template +struct is_view> : public std::true_type {}; + +template +struct is_view> : public std::true_type {}; + +template +inline constexpr bool is_view_v = is_view::value; + +template +class View : public ViewTraits { + private: + template + friend class View; + template + friend class Kokkos::Impl::ViewMapping; + + using view_tracker_type = Kokkos::Impl::ViewTracker; + + public: + using traits = ViewTraits; + + private: + using map_type = + Kokkos::Impl::ViewMapping; + template + friend struct Kokkos::Impl::ViewTracker; + using hooks_policy = typename traits::hooks_policy; + + view_tracker_type m_track; + map_type m_map; + + public: + //---------------------------------------- + /** \brief Compatible view of array of scalar types */ + using array_type = + View; + + /** \brief Compatible view of const data type */ + using const_type = + View; + + /** \brief Compatible view of non-const data type */ + using non_const_type = + View; + + /** \brief Compatible host mirror view */ + using host_mirror_type = + View, + typename traits::hooks_policy>; + + /** \brief Compatible host mirror view */ + using HostMirror = host_mirror_type; + + /** \brief Unified types */ + using uniform_type = typename Impl::ViewUniformType::type; + using uniform_const_type = + typename Impl::ViewUniformType::const_type; + using uniform_runtime_type = + typename Impl::ViewUniformType::runtime_type; + using uniform_runtime_const_type = + typename Impl::ViewUniformType::runtime_const_type; + using uniform_nomemspace_type = + typename Impl::ViewUniformType::nomemspace_type; + using uniform_const_nomemspace_type = + typename Impl::ViewUniformType::const_nomemspace_type; + using uniform_runtime_nomemspace_type = + typename Impl::ViewUniformType::runtime_nomemspace_type; + using uniform_runtime_const_nomemspace_type = + typename Impl::ViewUniformType::runtime_const_nomemspace_type; + + using reference_type = typename map_type::reference_type; + using pointer_type = typename map_type::pointer_type; + + // Typedefs from mdspan + // using extents_type -> not applicable + // Defining layout_type here made MSVC+CUDA fail + // using layout_type = typename traits::array_layout; + // using accessor_type -> not applicable + // using mapping_type -> not applicable + using element_type = typename traits::value_type; + // using value_type -> conflicts with traits::value_type + using index_type = typename traits::memory_space::size_type; + // using size_type -> already from traits::size_type; where it is + // memory_space::size_type + using rank_type = size_t; + using data_handle_type = pointer_type; + using reference = reference_type; + + //---------------------------------------- + // Domain rank and extents + + static constexpr Impl::integral_constant + rank = {}; + static constexpr Impl::integral_constant + rank_dynamic = {}; +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + enum {Rank KOKKOS_DEPRECATED_WITH_COMMENT("Use rank instead.") = + map_type::Rank}; +#endif + + template + KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t, + size_t> + extent(const iType& r) const noexcept { + return m_map.extent(r); + } + + static KOKKOS_INLINE_FUNCTION constexpr size_t static_extent( + const unsigned r) noexcept { + return map_type::static_extent(r); + } + + template + KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t, + int> + extent_int(const iType& r) const noexcept { + return static_cast(m_map.extent(r)); + } + + KOKKOS_INLINE_FUNCTION constexpr typename traits::array_layout layout() + const { + return m_map.layout(); + } + + //---------------------------------------- + /* Deprecate all 'dimension' functions in favor of + * ISO/C++ vocabulary 'extent'. + */ + + KOKKOS_INLINE_FUNCTION constexpr size_t size() const { + return m_map.dimension_0() * m_map.dimension_1() * m_map.dimension_2() * + m_map.dimension_3() * m_map.dimension_4() * m_map.dimension_5() * + m_map.dimension_6() * m_map.dimension_7(); + } + + KOKKOS_INLINE_FUNCTION constexpr size_t stride_0() const { + return m_map.stride_0(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_1() const { + return m_map.stride_1(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_2() const { + return m_map.stride_2(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_3() const { + return m_map.stride_3(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_4() const { + return m_map.stride_4(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_5() const { + return m_map.stride_5(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_6() const { + return m_map.stride_6(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_7() const { + return m_map.stride_7(); + } + + template + KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t, + size_t> + stride(iType r) const { + return ( + r == 0 + ? m_map.stride_0() + : (r == 1 + ? m_map.stride_1() + : (r == 2 + ? m_map.stride_2() + : (r == 3 + ? m_map.stride_3() + : (r == 4 + ? m_map.stride_4() + : (r == 5 + ? m_map.stride_5() + : (r == 6 + ? m_map.stride_6() + : m_map.stride_7()))))))); + } + + template + KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { + m_map.stride(s); + } + + //---------------------------------------- + // Range span is the span which contains all members. + + enum { + reference_type_is_lvalue_reference = + std::is_lvalue_reference_v + }; + + KOKKOS_INLINE_FUNCTION constexpr size_t span() const { return m_map.span(); } + KOKKOS_INLINE_FUNCTION bool span_is_contiguous() const { + return m_map.span_is_contiguous(); + } + KOKKOS_INLINE_FUNCTION constexpr bool is_allocated() const { + return m_map.data() != nullptr; + } + KOKKOS_INLINE_FUNCTION constexpr pointer_type data() const { + return m_map.data(); + } + + //---------------------------------------- + // Allow specializations to query their specialized map + + KOKKOS_INLINE_FUNCTION + const Kokkos::Impl::ViewMapping& + impl_map() const { + return m_map; + } + KOKKOS_INLINE_FUNCTION + const Kokkos::Impl::SharedAllocationTracker& impl_track() const { + return m_track.m_tracker; + } + //---------------------------------------- + + private: + static constexpr bool is_layout_left = + std::is_same_v; + + static constexpr bool is_layout_right = + std::is_same_v; + + static constexpr bool is_layout_stride = + std::is_same_v; + + static constexpr bool is_default_map = + std::is_void_v && + (is_layout_left || is_layout_right || is_layout_stride); + +#if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK) + +#define KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(...) \ + Kokkos::Impl::runtime_check_memory_access_violation< \ + typename traits::memory_space>( \ + "Kokkos::View ERROR: attempt to access inaccessible memory space", \ + __VA_ARGS__); \ + Kokkos::Impl::view_verify_operator_bounds( \ + __VA_ARGS__); + +#else + +#define KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(...) \ + Kokkos::Impl::runtime_check_memory_access_violation< \ + typename traits::memory_space>( \ + "Kokkos::View ERROR: attempt to access inaccessible memory space", \ + __VA_ARGS__); + +#endif + + template + static KOKKOS_FUNCTION void check_access_member_function_valid_args(Is...) { + static_assert(rank <= sizeof...(Is)); + static_assert(sizeof...(Is) <= 8); + static_assert(Kokkos::Impl::are_integral::value); + } + + template + static KOKKOS_FUNCTION void check_operator_parens_valid_args(Is...) { + static_assert(rank == sizeof...(Is)); + static_assert(Kokkos::Impl::are_integral::value); + } + + public: + //------------------------------ + // Rank 1 default map operator() + + template + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true::value && // + (1 == rank) && is_default_map && !is_layout_stride), + reference_type> + operator()(I0 i0) const { + check_operator_parens_valid_args(i0); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0) + return m_map.m_impl_handle[i0]; + } + + template + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true::value && // + (1 == rank) && is_default_map && is_layout_stride), + reference_type> + operator()(I0 i0) const { + check_operator_parens_valid_args(i0); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0) + return m_map.m_impl_handle[m_map.m_impl_offset.m_stride.S0 * i0]; + } + + //------------------------------ + // Rank 1 operator[] + + template + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + ((1 == rank) && Kokkos::Impl::are_integral::value && !is_default_map), + reference_type> + operator[](I0 i0) const { + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0) + return m_map.reference(i0); + } + + template + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<((1 == rank) && Kokkos::Impl::are_integral::value && + is_default_map && !is_layout_stride), + reference_type> + operator[](I0 i0) const { + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0) + return m_map.m_impl_handle[i0]; + } + + template + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<((1 == rank) && Kokkos::Impl::are_integral::value && + is_default_map && is_layout_stride), + reference_type> + operator[](I0 i0) const { + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0) + return m_map.m_impl_handle[m_map.m_impl_offset.m_stride.S0 * i0]; + } + + //------------------------------ + // Rank 2 default map operator() + + template + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true::value && // + (2 == rank) && is_default_map && + (is_layout_left || is_layout_right || is_layout_stride)), + reference_type> + operator()(I0 i0, I1 i1) const { + check_operator_parens_valid_args(i0, i1); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1) + if constexpr (is_layout_left) { + if constexpr (rank_dynamic == 0) + return m_map.m_impl_handle[i0 + m_map.m_impl_offset.m_dim.N0 * i1]; + else + return m_map.m_impl_handle[i0 + m_map.m_impl_offset.m_stride * i1]; + } else if constexpr (is_layout_right) { + if constexpr (rank_dynamic == 0) + return m_map.m_impl_handle[i1 + m_map.m_impl_offset.m_dim.N1 * i0]; + else + return m_map.m_impl_handle[i1 + m_map.m_impl_offset.m_stride * i0]; + } else { + static_assert(is_layout_stride); + return m_map.m_impl_handle[i0 * m_map.m_impl_offset.m_stride.S0 + + i1 * m_map.m_impl_offset.m_stride.S1]; + } +#if defined KOKKOS_COMPILER_INTEL + __builtin_unreachable(); +#endif + } + + // Rank 0 -> 8 operator() except for rank-1 and rank-2 with default map which + // have "inlined" versions above + + template + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true::value && // + (2 != rank) && (1 != rank) && (0 != rank) && is_default_map), + reference_type> + operator()(Is... indices) const { + check_operator_parens_valid_args(indices...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, indices...) + return m_map.m_impl_handle[m_map.m_impl_offset(indices...)]; + } + + template + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true::value && // + ((0 == rank) || !is_default_map)), + reference_type> + operator()(Is... indices) const { + check_operator_parens_valid_args(indices...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, indices...) + return m_map.reference(indices...); + } + + //------------------------------ + // Rank 0 + + template + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true::value && (0 == rank)), reference_type> + access(Is... extra) const { + check_access_member_function_valid_args(extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, extra...) + return m_map.reference(); + } + + //------------------------------ + // Rank 1 + + template + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true::value && + (1 == rank) && !is_default_map), + reference_type> + access(I0 i0, Is... extra) const { + check_access_member_function_valid_args(i0, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, extra...) + return m_map.reference(i0); + } + + template + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true::value && + (1 == rank) && is_default_map && !is_layout_stride), + reference_type> + access(I0 i0, Is... extra) const { + check_access_member_function_valid_args(i0, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, extra...) + return m_map.m_impl_handle[i0]; + } + + template + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true::value && + (1 == rank) && is_default_map && is_layout_stride), + reference_type> + access(I0 i0, Is... extra) const { + check_access_member_function_valid_args(i0, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, extra...) + return m_map.m_impl_handle[m_map.m_impl_offset.m_stride.S0 * i0]; + } + + //------------------------------ + // Rank 2 + + template + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true::value && + (2 == rank) && !is_default_map), + reference_type> + access(I0 i0, I1 i1, Is... extra) const { + check_access_member_function_valid_args(i0, i1, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, extra...) + return m_map.reference(i0, i1); + } + + template + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true::value && (2 == rank) && + is_default_map && + (is_layout_left || is_layout_right || is_layout_stride)), + reference_type> + access(I0 i0, I1 i1, Is... extra) const { + check_access_member_function_valid_args(i0, i1, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, extra...) + if constexpr (is_layout_left) { + if constexpr (rank_dynamic == 0) + return m_map.m_impl_handle[i0 + m_map.m_impl_offset.m_dim.N0 * i1]; + else + return m_map.m_impl_handle[i0 + m_map.m_impl_offset.m_stride * i1]; + } else if constexpr (is_layout_right) { + if constexpr (rank_dynamic == 0) + return m_map.m_impl_handle[i1 + m_map.m_impl_offset.m_dim.N1 * i0]; + else + return m_map.m_impl_handle[i1 + m_map.m_impl_offset.m_stride * i0]; + } else { + static_assert(is_layout_stride); + return m_map.m_impl_handle[i0 * m_map.m_impl_offset.m_stride.S0 + + i1 * m_map.m_impl_offset.m_stride.S1]; + } +#if defined KOKKOS_COMPILER_INTEL + __builtin_unreachable(); +#endif + } + + //------------------------------ + // Rank 3 + + template + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true::value && + (3 == rank) && is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, Is... extra) const { + check_access_member_function_valid_args(i0, i1, i2, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, extra...) + return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2)]; + } + + template + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true::value && + (3 == rank) && !is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, Is... extra) const { + check_access_member_function_valid_args(i0, i1, i2, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, extra...) + return m_map.reference(i0, i1, i2); + } + + //------------------------------ + // Rank 4 + + template + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true::value && (4 == rank) && + is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, I3 i3, Is... extra) const { + check_access_member_function_valid_args(i0, i1, i2, i3, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, extra...) + return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3)]; + } + + template + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true::value && (4 == rank) && + !is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, I3 i3, Is... extra) const { + check_access_member_function_valid_args(i0, i1, i2, i3, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, extra...) + return m_map.reference(i0, i1, i2, i3); + } + + //------------------------------ + // Rank 5 + + template + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true::value && + (5 == rank) && is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, Is... extra) const { + check_access_member_function_valid_args(i0, i1, i2, i3, i4, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, + extra...) + return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3, i4)]; + } + + template + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true::value && + (5 == rank) && !is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, Is... extra) const { + check_access_member_function_valid_args(i0, i1, i2, i3, i4, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, + extra...) + return m_map.reference(i0, i1, i2, i3, i4); + } + + //------------------------------ + // Rank 6 + + template + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true::value && + (6 == rank) && is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, Is... extra) const { + check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, + extra...) + return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3, i4, i5)]; + } + + template + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true::value && + (6 == rank) && !is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, Is... extra) const { + check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, + extra...) + return m_map.reference(i0, i1, i2, i3, i4, i5); + } + + //------------------------------ + // Rank 7 + + template + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true::value && + (7 == rank) && is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6, Is... extra) const { + check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, i6, + extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, i6, + extra...) + return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3, i4, i5, i6)]; + } + + template + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true::value && + (7 == rank) && !is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6, Is... extra) const { + check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, i6, + extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, i6, + extra...) + return m_map.reference(i0, i1, i2, i3, i4, i5, i6); + } + + //------------------------------ + // Rank 8 + + template + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true::value && + (8 == rank) && is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6, I7 i7, + Is... extra) const { + check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, i6, i7, + extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, i6, + i7, extra...) + return m_map + .m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3, i4, i5, i6, i7)]; + } + + template + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true::value && + (8 == rank) && !is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6, I7 i7, + Is... extra) const { + check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, i6, i7, + extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, i6, + i7, extra...) + return m_map.reference(i0, i1, i2, i3, i4, i5, i6, i7); + } + +#undef KOKKOS_IMPL_VIEW_OPERATOR_VERIFY + + //---------------------------------------- + // Standard destructor, constructors, and assignment operators + + KOKKOS_DEFAULTED_FUNCTION + ~View() = default; + + KOKKOS_DEFAULTED_FUNCTION + View() = default; + + KOKKOS_FUNCTION + View(const View& other) : m_track(other.m_track), m_map(other.m_map) { + KOKKOS_IF_ON_HOST((hooks_policy::copy_construct(*this, other);)) + } + + KOKKOS_FUNCTION + View(View&& other) + : m_track{std::move(other.m_track)}, m_map{std::move(other.m_map)} { + KOKKOS_IF_ON_HOST((hooks_policy::move_construct(*this, other);)) + } + + KOKKOS_FUNCTION + View& operator=(const View& other) { + m_map = other.m_map; + m_track = other.m_track; + + KOKKOS_IF_ON_HOST((hooks_policy::copy_assign(*this, other);)) + + return *this; + } + + KOKKOS_FUNCTION + View& operator=(View&& other) { + m_map = std::move(other.m_map); + m_track = std::move(other.m_track); + + KOKKOS_IF_ON_HOST((hooks_policy::move_assign(*this, other);)) + + return *this; + } + + //---------------------------------------- + // Compatible view copy constructor and assignment + // may assign unmanaged from managed. + + template + KOKKOS_INLINE_FUNCTION View( + const View& rhs, + std::enable_if_t::traits, + typename traits::specialize>::is_assignable_data_type>* = nullptr) + : m_track(rhs), m_map() { + using SrcTraits = typename View::traits; + using Mapping = Kokkos::Impl::ViewMapping; + static_assert(Mapping::is_assignable, + "Incompatible View copy construction"); + Mapping::assign(m_map, rhs.m_map, rhs.m_track.m_tracker); + } + + template + KOKKOS_INLINE_FUNCTION std::enable_if_t< + Kokkos::Impl::ViewMapping< + traits, typename View::traits, + typename traits::specialize>::is_assignable_data_type, + View>& + operator=(const View& rhs) { + using SrcTraits = typename View::traits; + using Mapping = Kokkos::Impl::ViewMapping; + static_assert(Mapping::is_assignable, "Incompatible View copy assignment"); + Mapping::assign(m_map, rhs.m_map, rhs.m_track.m_tracker); + m_track.assign(rhs); + return *this; + } + + //---------------------------------------- + // Compatible subview constructor + // may assign unmanaged from managed. + + template + KOKKOS_INLINE_FUNCTION View(const View& src_view, const Arg0 arg0, + Args... args) + : m_track(src_view), m_map() { + using SrcType = View; + + using Mapping = Kokkos::Impl::ViewMapping; + + using DstType = typename Mapping::type; + + static_assert( + Kokkos::Impl::ViewMapping::is_assignable, + "Subview construction requires compatible view and subview arguments"); + + Mapping::assign(m_map, src_view.m_map, arg0, args...); + } + + //---------------------------------------- + // Allocation tracking properties + + KOKKOS_INLINE_FUNCTION + int use_count() const { return m_track.m_tracker.use_count(); } + + inline const std::string label() const { + return m_track.m_tracker + .template get_label(); + } + + public: + //---------------------------------------- + // Allocation according to allocation properties and array layout + + template + explicit inline View( + const Impl::ViewCtorProp& arg_prop, + std::enable_if_t::has_pointer, + typename traits::array_layout> const& arg_layout) + : m_track(), m_map() { + // Copy the input allocation properties with possibly defaulted properties + // We need to split it in two to avoid MSVC compiler errors + auto prop_copy_tmp = + Impl::with_properties_if_unset(arg_prop, std::string{}); + auto prop_copy = Impl::with_properties_if_unset( + prop_copy_tmp, typename traits::device_type::memory_space{}, + typename traits::device_type::execution_space{}); + using alloc_prop = decltype(prop_copy); + + static_assert(traits::is_managed, + "View allocation constructor requires managed memory"); + + if (alloc_prop::initialize && + !alloc_prop::execution_space::impl_is_initialized()) { + // If initializing view data then + // the execution space must be initialized. + Kokkos::Impl::throw_runtime_exception( + "Constructing View and initializing data with uninitialized " + "execution space"); + } + +#ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK + if constexpr (std::is_same_v || + std::is_same_v || + std::is_same_v) { + size_t i0 = arg_layout.dimension[0]; + size_t i1 = arg_layout.dimension[1]; + size_t i2 = arg_layout.dimension[2]; + size_t i3 = arg_layout.dimension[3]; + size_t i4 = arg_layout.dimension[4]; + size_t i5 = arg_layout.dimension[5]; + size_t i6 = arg_layout.dimension[6]; + size_t i7 = arg_layout.dimension[7]; + + const std::string& alloc_name = + Impl::get_property(prop_copy); + Impl::runtime_check_rank( + *this, std::is_same::value, i0, i1, + i2, i3, i4, i5, i6, i7, alloc_name.c_str()); + } +#endif + + Kokkos::Impl::SharedAllocationRecord<>* record = m_map.allocate_shared( + prop_copy, arg_layout, Impl::ViewCtorProp::has_execution_space); + + // Setup and initialization complete, start tracking + m_track.m_tracker.assign_allocated_record_to_uninitialized(record); + } + + KOKKOS_INLINE_FUNCTION + void assign_data(pointer_type arg_data) { + m_track.m_tracker.clear(); + m_map.assign_data(arg_data); + } + + // Wrap memory according to properties and array layout + template + explicit KOKKOS_INLINE_FUNCTION View( + const Impl::ViewCtorProp& arg_prop, + std::enable_if_t::has_pointer, + typename traits::array_layout> const& arg_layout) + : m_track() // No memory tracking + , + m_map(arg_prop, arg_layout) { + static_assert( + std::is_same::pointer_type>::value, + "Constructing View to wrap user memory must supply matching pointer " + "type"); + +#ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK + if constexpr (std::is_same_v || + std::is_same_v || + std::is_same_v) { + size_t i0 = arg_layout.dimension[0]; + size_t i1 = arg_layout.dimension[1]; + size_t i2 = arg_layout.dimension[2]; + size_t i3 = arg_layout.dimension[3]; + size_t i4 = arg_layout.dimension[4]; + size_t i5 = arg_layout.dimension[5]; + size_t i6 = arg_layout.dimension[6]; + size_t i7 = arg_layout.dimension[7]; + + Impl::runtime_check_rank( + *this, std::is_same::value, i0, i1, + i2, i3, i4, i5, i6, i7, "UNMANAGED"); + } +#endif + } + + // Simple dimension-only layout + template + explicit inline View( + const Impl::ViewCtorProp& arg_prop, + std::enable_if_t::has_pointer, size_t> const + arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) + : View(arg_prop, + typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, + arg_N4, arg_N5, arg_N6, arg_N7)) { + static_assert(traits::array_layout::is_extent_constructible, + "Layout is not constructible from extent arguments. Use " + "overload taking a layout object instead."); + } + + template + explicit KOKKOS_INLINE_FUNCTION View( + const Impl::ViewCtorProp& arg_prop, + std::enable_if_t::has_pointer, size_t> const + arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) + : View(arg_prop, + typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, + arg_N4, arg_N5, arg_N6, arg_N7)) { + static_assert(traits::array_layout::is_extent_constructible, + "Layout is not constructible from extent arguments. Use " + "overload taking a layout object instead."); + } + + // Allocate with label and layout + template + explicit inline View( + const Label& arg_label, + std::enable_if_t::value, + typename traits::array_layout> const& arg_layout) + : View(Impl::ViewCtorProp(arg_label), arg_layout) {} + + // Allocate label and layout, must disambiguate from subview constructor. + template + explicit inline View( + const Label& arg_label, + std::enable_if_t::value, const size_t> + arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) + : View(Impl::ViewCtorProp(arg_label), + typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, + arg_N4, arg_N5, arg_N6, arg_N7)) { + static_assert(traits::array_layout::is_extent_constructible, + "Layout is not constructible from extent arguments. Use " + "overload taking a layout object instead."); + } + + // Construct view from ViewTracker and map + // This should be the preferred method because future extensions may need to + // use the ViewTracker class. + template + KOKKOS_INLINE_FUNCTION View( + const view_tracker_type& track, + const Kokkos::Impl::ViewMapping& map) + : m_track(track), m_map() { + using Mapping = + Kokkos::Impl::ViewMapping; + static_assert(Mapping::is_assignable, + "Incompatible View copy construction"); + Mapping::assign(m_map, map, track.m_tracker); + } + + // Construct View from internal shared allocation tracker object and map + // This is here for backwards compatibility for classes that derive from + // Kokkos::View + template + KOKKOS_INLINE_FUNCTION View( + const typename view_tracker_type::track_type& track, + const Kokkos::Impl::ViewMapping& map) + : m_track(track), m_map() { + using Mapping = + Kokkos::Impl::ViewMapping; + static_assert(Mapping::is_assignable, + "Incompatible View copy construction"); + Mapping::assign(m_map, map, track); + } + + //---------------------------------------- + // Memory span required to wrap these dimensions. + static constexpr size_t required_allocation_size( + typename traits::array_layout const& layout) { + return map_type::memory_span(layout); + } + + static constexpr size_t required_allocation_size( + const size_t arg_N0 = 0, const size_t arg_N1 = 0, const size_t arg_N2 = 0, + const size_t arg_N3 = 0, const size_t arg_N4 = 0, const size_t arg_N5 = 0, + const size_t arg_N6 = 0, const size_t arg_N7 = 0) { + static_assert(traits::array_layout::is_extent_constructible, + "Layout is not constructible from extent arguments. Use " + "overload taking a layout object instead."); + return map_type::memory_span(typename traits::array_layout( + arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7)); + } + + explicit KOKKOS_INLINE_FUNCTION View( + pointer_type arg_ptr, const size_t arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) + : View(Impl::ViewCtorProp(arg_ptr), + typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, + arg_N4, arg_N5, arg_N6, arg_N7)) { + static_assert(traits::array_layout::is_extent_constructible, + "Layout is not constructible from extent arguments. Use " + "overload taking a layout object instead."); + } + + explicit KOKKOS_INLINE_FUNCTION View( + pointer_type arg_ptr, const typename traits::array_layout& arg_layout) + : View(Impl::ViewCtorProp(arg_ptr), arg_layout) {} + + //---------------------------------------- + // Shared scratch memory constructor + + static KOKKOS_INLINE_FUNCTION size_t + shmem_size(const size_t arg_N0 = KOKKOS_INVALID_INDEX, + const size_t arg_N1 = KOKKOS_INVALID_INDEX, + const size_t arg_N2 = KOKKOS_INVALID_INDEX, + const size_t arg_N3 = KOKKOS_INVALID_INDEX, + const size_t arg_N4 = KOKKOS_INVALID_INDEX, + const size_t arg_N5 = KOKKOS_INVALID_INDEX, + const size_t arg_N6 = KOKKOS_INVALID_INDEX, + const size_t arg_N7 = KOKKOS_INVALID_INDEX) { + static_assert(traits::array_layout::is_extent_constructible, + "Layout is not constructible from extent arguments. Use " + "overload taking a layout object instead."); + const size_t num_passed_args = Impl::count_valid_integers( + arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7); + + if (std::is_void_v && + num_passed_args != rank_dynamic) { + Kokkos::abort( + "Kokkos::View::shmem_size() rank_dynamic != number of arguments.\n"); + } + + return View::shmem_size(typename traits::array_layout( + arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7)); + } + + private: + // Want to be able to align to minimum scratch alignment or sizeof or alignof + // elements + static constexpr size_t scratch_value_alignment = + max({sizeof(typename traits::value_type), + alignof(typename traits::value_type), + static_cast( + traits::execution_space::scratch_memory_space::ALIGN)}); + + public: + static KOKKOS_INLINE_FUNCTION size_t + shmem_size(typename traits::array_layout const& arg_layout) { + return map_type::memory_span(arg_layout) + scratch_value_alignment; + } + + explicit KOKKOS_INLINE_FUNCTION View( + const typename traits::execution_space::scratch_memory_space& arg_space, + const typename traits::array_layout& arg_layout) + : View(Impl::ViewCtorProp(reinterpret_cast( + arg_space.get_shmem_aligned(map_type::memory_span(arg_layout), + scratch_value_alignment))), + arg_layout) {} + + explicit KOKKOS_INLINE_FUNCTION View( + const typename traits::execution_space::scratch_memory_space& arg_space, + const size_t arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) + : View(Impl::ViewCtorProp( + reinterpret_cast(arg_space.get_shmem_aligned( + map_type::memory_span(typename traits::array_layout( + arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, + arg_N7)), + scratch_value_alignment))), + typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, + arg_N4, arg_N5, arg_N6, arg_N7)) { + static_assert(traits::array_layout::is_extent_constructible, + "Layout is not constructible from extent arguments. Use " + "overload taking a layout object instead."); + } + + //---------------------------------------- + // MDSpan converting constructors +#ifdef KOKKOS_ENABLE_IMPL_MDSPAN + template ::mdspan_type> + KOKKOS_INLINE_FUNCTION +#ifndef KOKKOS_ENABLE_CXX17 + explicit(traits::is_managed) +#endif + View(const typename Impl::MDSpanViewTraits::mdspan_type& mds, + std::enable_if_t< + !std::is_same_v>* = + nullptr) + : View(mds.data_handle(), + Impl::array_layout_from_mapping< + typename traits::array_layout, + typename Impl::MDSpanViewTraits::mdspan_type>( + mds.mapping())) { + } + + template + KOKKOS_INLINE_FUNCTION +#ifndef KOKKOS_ENABLE_CXX17 + explicit(!std::is_convertible_v< + Kokkos::mdspan, + typename Impl::MDSpanViewTraits::mdspan_type>) +#endif + View(const Kokkos::mdspan& mds) + : View(typename Impl::MDSpanViewTraits::mdspan_type(mds)) { + } + + //---------------------------------------- + // Conversion to MDSpan + template ::mdspan_type, + typename = std::enable_if_t, + std::false_type, + std::is_assignable, + ImplNaturalMDSpanType>>::value>> + KOKKOS_INLINE_FUNCTION constexpr operator mdspan< + OtherElementType, OtherExtents, OtherLayoutPolicy, OtherAccessor>() { + using mdspan_type = typename Impl::MDSpanViewTraits::mdspan_type; + return mdspan_type{data(), + Impl::mapping_from_view_mapping(m_map)}; + } + + template >, + typename = std::enable_if_t>> + KOKKOS_INLINE_FUNCTION constexpr auto to_mdspan( + const OtherAccessorType& other_accessor = + typename Impl::MDSpanViewTraits::accessor_type()) { + using mdspan_type = typename Impl::MDSpanViewTraits::mdspan_type; + using ret_mdspan_type = + mdspan; + return ret_mdspan_type{data(), + Impl::mapping_from_view_mapping(m_map), + other_accessor}; + } +#endif // KOKKOS_ENABLE_IMPL_MDSPAN +}; + +template +KOKKOS_INLINE_FUNCTION constexpr unsigned rank(const View&) { + return View::rank(); +} + +namespace Impl { + +template +struct RankDataType { + using type = typename RankDataType::type*; +}; + +template +struct RankDataType { + using type = ValueType; +}; + +template +KOKKOS_FUNCTION std::enable_if_t< + N == View::rank() && + std::is_same_v::specialize, void>, + View> +as_view_of_rank_n(View v) { + return v; +} + +// Placeholder implementation to compile generic code for DynRankView; should +// never be called +template +KOKKOS_FUNCTION std::enable_if_t< + N != View::rank() && + std::is_same_v::specialize, void>, + View::value_type, N>::type, + Args...>> +as_view_of_rank_n(View) { + Kokkos::abort("Trying to get at a View of the wrong rank"); + return {}; +} + +template +void apply_to_view_of_static_rank(Function&& f, View a) { + f(a); +} + +} // namespace Impl + +template +KOKKOS_INLINE_FUNCTION auto subview(const View& src, Args... args) { + static_assert(View::rank == sizeof...(Args), + "subview requires one argument for each source View rank"); + + return typename Kokkos::Impl::ViewMapping< + void /* deduce subview type from source view traits */ + , + typename Impl::RemoveAlignedMemoryTrait::type, + Args...>::type(src, args...); +} + +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +template +KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION auto subview(const View& src, + Args... args) { + static_assert(View::rank == sizeof...(Args), + "subview requires one argument for each source View rank"); + static_assert(Kokkos::is_memory_traits::value); + + return typename Kokkos::Impl::ViewMapping< + void /* deduce subview type from source view traits */ + , + typename Impl::RemoveAlignedMemoryTrait::type, + Args...>::type(src, args...); +} +#endif + +template +using Subview = decltype(subview(std::declval(), std::declval()...)); + +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +template +KOKKOS_INLINE_FUNCTION bool operator==(const View& lhs, + const View& rhs) { + // Same data, layout, dimensions + using lhs_traits = ViewTraits; + using rhs_traits = ViewTraits; + + return std::is_same_v && + std::is_same_v && + std::is_same_v && + View::rank() == View::rank() && + lhs.data() == rhs.data() && lhs.span() == rhs.span() && + lhs.extent(0) == rhs.extent(0) && lhs.extent(1) == rhs.extent(1) && + lhs.extent(2) == rhs.extent(2) && lhs.extent(3) == rhs.extent(3) && + lhs.extent(4) == rhs.extent(4) && lhs.extent(5) == rhs.extent(5) && + lhs.extent(6) == rhs.extent(6) && lhs.extent(7) == rhs.extent(7); +} + +template +KOKKOS_INLINE_FUNCTION bool operator!=(const View& lhs, + const View& rhs) { + return !(operator==(lhs, rhs)); +} + +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template +struct CommonViewValueType; + +template +struct CommonViewValueType { + using value_type = std::common_type_t; +}; + +template +struct CommonViewAllocProp; + +template +struct CommonViewAllocProp { + using value_type = ValueType; + using scalar_array_type = ValueType; + + template + KOKKOS_INLINE_FUNCTION CommonViewAllocProp(const Views&...) {} +}; + +template +struct DeduceCommonViewAllocProp; + +// Base case must provide types for: +// 1. specialize 2. value_type 3. is_view 4. prop_type +template +struct DeduceCommonViewAllocProp { + using specialize = typename FirstView::traits::specialize; + + using value_type = typename FirstView::traits::value_type; + + enum : bool { is_view = is_view::value }; + + using prop_type = CommonViewAllocProp; +}; + +template +struct DeduceCommonViewAllocProp { + using NextTraits = DeduceCommonViewAllocProp; + + using first_specialize = typename FirstView::traits::specialize; + using first_value_type = typename FirstView::traits::value_type; + + enum : bool { first_is_view = is_view::value }; + + using next_specialize = typename NextTraits::specialize; + using next_value_type = typename NextTraits::value_type; + + enum : bool { next_is_view = NextTraits::is_view }; + + // common types + + // determine specialize type + // if first and next specialize differ, but are not the same specialize, error + // out + static_assert(!(!std::is_same_v && + !std::is_void_v && + !std::is_void_v), + "Kokkos DeduceCommonViewAllocProp ERROR: Only one non-void " + "specialize trait allowed"); + + // otherwise choose non-void specialize if either/both are non-void + using specialize = + std::conditional_t, + first_specialize, + std::conditional_t<(std::is_void_v && + !std::is_void_v), + next_specialize, first_specialize>>; + + using value_type = typename CommonViewValueType::value_type; + + enum : bool { is_view = (first_is_view && next_is_view) }; + + using prop_type = CommonViewAllocProp; +}; + +} // end namespace Impl + +template +using DeducedCommonPropsType = + typename Impl::DeduceCommonViewAllocProp::prop_type; + +// This function is required in certain scenarios where users customize +// Kokkos View internals. One example are dynamic length embedded ensemble +// types. The function is used to propagate necessary information +// (like the ensemble size) when creating new views. +// However, most of the time it is called with a single view. +// Furthermore, the propagated information is not just for view allocations. +// From what I can tell, the type of functionality provided by +// common_view_alloc_prop is the equivalent of propagating accessors in mdspan, +// a mechanism we will eventually use to replace this clunky approach here, when +// we are finally mdspan based. +// TODO: get rid of this when we have mdspan +template +KOKKOS_INLINE_FUNCTION DeducedCommonPropsType common_view_alloc_prop( + Views const&... views) { + return DeducedCommonPropsType(views...); +} + +} // namespace Kokkos + +#include +#include + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* #ifndef KOKKOS_VIEWLEGACY_HPP */ diff --git a/packages/kokkos/core/src/impl/Kokkos_ViewMapping.hpp b/packages/kokkos/core/src/View/Kokkos_ViewMapping.hpp similarity index 90% rename from packages/kokkos/core/src/impl/Kokkos_ViewMapping.hpp rename to packages/kokkos/core/src/View/Kokkos_ViewMapping.hpp index 10aaa63b7c82..ecc19eaf5e25 100644 --- a/packages/kokkos/core/src/impl/Kokkos_ViewMapping.hpp +++ b/packages/kokkos/core/src/View/Kokkos_ViewMapping.hpp @@ -28,61 +28,41 @@ #include #include #include -#include -#include -#include +#include +#include +#include +#include #include #include #include -#include +#include #include //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- -namespace Kokkos { - -struct ALL_t { - KOKKOS_INLINE_FUNCTION - constexpr const ALL_t& operator()() const { return *this; } - - KOKKOS_INLINE_FUNCTION - constexpr bool operator==(const ALL_t&) const { return true; } -}; - -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 -namespace Impl { -// TODO This alias declaration forces us to fully qualify ALL_t inside the -// Kokkos::Impl namespace to avoid deprecation warnings. Replace the -// fully-qualified name when we remove Kokkos::Impl::ALL_t. -using ALL_t KOKKOS_DEPRECATED_WITH_COMMENT("Use Kokkos::ALL_t instead!") = - Kokkos::ALL_t; -} // namespace Impl -#endif -} // namespace Kokkos - namespace Kokkos { namespace Impl { template struct is_integral_extent_type { - enum : bool { value = std::is_same::value ? 1 : 0 }; + enum : bool { value = std::is_same_v ? 1 : 0 }; }; template struct is_integral_extent_type> { - enum : bool { value = std::is_integral::value ? 1 : 0 }; + enum : bool { value = std::is_integral_v ? 1 : 0 }; }; template struct is_integral_extent_type> { - enum : bool { value = std::is_integral::value ? 1 : 0 }; + enum : bool { value = std::is_integral_v ? 1 : 0 }; }; // Assuming '2 == initializer_list::size()' template struct is_integral_extent_type> { - enum : bool { value = std::is_integral::value ? 1 : 0 }; + enum : bool { value = std::is_integral_v ? 1 : 0 }; }; template @@ -93,8 +73,7 @@ struct is_integral_extent { enum : bool { value = is_integral_extent_type::value }; - static_assert(value || std::is_integral::value || - std::is_void::value, + static_assert(value || std::is_integral_v || std::is_void_v, "subview argument must be either integral or integral extent"); }; @@ -112,16 +91,16 @@ struct SubviewLegalArgsCompileTime { enum { - value = (((CurrentArg == RankDest - 1) && - (Kokkos::Impl::is_integral_extent_type::value)) || - ((CurrentArg >= RankDest) && (std::is_integral::value)) || - ((CurrentArg < RankDest) && - (std::is_same::value)) || - ((CurrentArg == 0) && - (Kokkos::Impl::is_integral_extent_type::value))) && - (SubviewLegalArgsCompileTime::value) + value = + (((CurrentArg == RankDest - 1) && + (Kokkos::Impl::is_integral_extent_type::value)) || + ((CurrentArg >= RankDest) && (std::is_integral_v)) || + ((CurrentArg < RankDest) && (std::is_same_v)) || + ((CurrentArg == 0) && + (Kokkos::Impl::is_integral_extent_type::value))) && + (SubviewLegalArgsCompileTime::value) }; }; @@ -129,7 +108,7 @@ template struct SubviewLegalArgsCompileTime { enum { - value = ((CurrentArg == RankDest - 1) || (std::is_integral::value)) && + value = ((CurrentArg == RankDest - 1) || (std::is_integral_v)) && (CurrentArg == RankSrc - 1) }; }; @@ -144,10 +123,9 @@ struct SubviewLegalArgsCompileTime::value)) || - ((CurrentArg < RankSrc - RankDest) && - (std::is_integral::value)) || + ((CurrentArg < RankSrc - RankDest) && (std::is_integral_v)) || ((CurrentArg >= RankSrc - RankDest) && - (std::is_same::value))) && + (std::is_same_v))) && (SubviewLegalArgsCompileTime::value) @@ -158,8 +136,8 @@ template struct SubviewLegalArgsCompileTime { enum { - value = ((CurrentArg == RankSrc - 1) && - (std::is_same::value)) + value = + ((CurrentArg == RankSrc - 1) && (std::is_same_v)) }; }; @@ -392,7 +370,7 @@ struct SubviewExtents { const int n = snprintf(buffer, LEN, "Kokkos::subview bounds error ("); error(buffer + n, LEN - n, 0, 0, dim, args...); - Kokkos::Impl::throw_runtime_exception(std::string(buffer));)) + Kokkos::abort(buffer);)) KOKKOS_IF_ON_DEVICE(((void)dim; Kokkos::abort("Kokkos::subview bounds error"); @@ -718,8 +696,8 @@ struct ViewOffset< return *this; } #else - ViewOffset() = default; - ViewOffset(const ViewOffset&) = default; + ViewOffset() = default; + ViewOffset(const ViewOffset&) = default; ViewOffset& operator=(const ViewOffset&) = default; #endif @@ -885,14 +863,17 @@ struct ViewOffset< KOKKOS_INLINE_FUNCTION constexpr array_layout layout() const { constexpr auto r = dimension_type::rank; - return array_layout((r > 0 ? m_dim.N0 : KOKKOS_INVALID_INDEX), - (r > 1 ? m_dim.N1 : KOKKOS_INVALID_INDEX), - (r > 2 ? m_dim.N2 : KOKKOS_INVALID_INDEX), - (r > 3 ? m_dim.N3 : KOKKOS_INVALID_INDEX), - (r > 4 ? m_dim.N4 : KOKKOS_INVALID_INDEX), - (r > 5 ? m_dim.N5 : KOKKOS_INVALID_INDEX), - (r > 6 ? m_dim.N6 : KOKKOS_INVALID_INDEX), - (r > 7 ? m_dim.N7 : KOKKOS_INVALID_INDEX)); + array_layout l((r > 0 ? m_dim.N0 : KOKKOS_INVALID_INDEX), + (r > 1 ? m_dim.N1 : KOKKOS_INVALID_INDEX), + (r > 2 ? m_dim.N2 : KOKKOS_INVALID_INDEX), + (r > 3 ? m_dim.N3 : KOKKOS_INVALID_INDEX), + (r > 4 ? m_dim.N4 : KOKKOS_INVALID_INDEX), + (r > 5 ? m_dim.N5 : KOKKOS_INVALID_INDEX), + (r > 6 ? m_dim.N6 : KOKKOS_INVALID_INDEX), + (r > 7 ? m_dim.N7 : KOKKOS_INVALID_INDEX)); + // Without span_is_contiguous Sacado hidden dimensions get messed up + l.stride = span_is_contiguous() ? KOKKOS_IMPL_CTOR_DEFAULT_ARG : m_stride; + return l; } KOKKOS_INLINE_FUNCTION constexpr size_type dimension_0() const { @@ -1071,8 +1052,8 @@ struct ViewOffset< } #else - ViewOffset() = default; - ViewOffset(const ViewOffset&) = default; + ViewOffset() = default; + ViewOffset(const ViewOffset&) = default; ViewOffset& operator=(const ViewOffset&) = default; #endif @@ -1086,7 +1067,11 @@ struct ViewOffset< arg_layout.dimension[2], arg_layout.dimension[3], arg_layout.dimension[4], arg_layout.dimension[5], arg_layout.dimension[6], arg_layout.dimension[7]), - m_stride(Padding::stride(arg_layout.dimension[0])) {} + m_stride( + arg_layout.stride != KOKKOS_IMPL_CTOR_DEFAULT_ARG + ? arg_layout.stride + : Padding::stride(arg_layout.dimension[0])) { + } template KOKKOS_INLINE_FUNCTION constexpr ViewOffset( @@ -1407,8 +1392,8 @@ struct ViewOffset< } #else - ViewOffset() = default; - ViewOffset(const ViewOffset&) = default; + ViewOffset() = default; + ViewOffset(const ViewOffset&) = default; ViewOffset& operator=(const ViewOffset&) = default; #endif @@ -1565,14 +1550,17 @@ struct ViewOffset< KOKKOS_INLINE_FUNCTION constexpr array_layout layout() const { constexpr auto r = dimension_type::rank; - return array_layout((r > 0 ? m_dim.N0 : KOKKOS_INVALID_INDEX), - (r > 1 ? m_dim.N1 : KOKKOS_INVALID_INDEX), - (r > 2 ? m_dim.N2 : KOKKOS_INVALID_INDEX), - (r > 3 ? m_dim.N3 : KOKKOS_INVALID_INDEX), - (r > 4 ? m_dim.N4 : KOKKOS_INVALID_INDEX), - (r > 5 ? m_dim.N5 : KOKKOS_INVALID_INDEX), - (r > 6 ? m_dim.N6 : KOKKOS_INVALID_INDEX), - (r > 7 ? m_dim.N7 : KOKKOS_INVALID_INDEX)); + array_layout l((r > 0 ? m_dim.N0 : KOKKOS_INVALID_INDEX), + (r > 1 ? m_dim.N1 : KOKKOS_INVALID_INDEX), + (r > 2 ? m_dim.N2 : KOKKOS_INVALID_INDEX), + (r > 3 ? m_dim.N3 : KOKKOS_INVALID_INDEX), + (r > 4 ? m_dim.N4 : KOKKOS_INVALID_INDEX), + (r > 5 ? m_dim.N5 : KOKKOS_INVALID_INDEX), + (r > 6 ? m_dim.N6 : KOKKOS_INVALID_INDEX), + (r > 7 ? m_dim.N7 : KOKKOS_INVALID_INDEX)); + // Without span_is_contiguous Sacado hidden dimensions get messed up + l.stride = span_is_contiguous() ? KOKKOS_IMPL_CTOR_DEFAULT_ARG : m_stride; + return l; } KOKKOS_INLINE_FUNCTION constexpr size_type dimension_0() const { @@ -1614,8 +1602,8 @@ struct ViewOffset< } KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const { - return m_stride == m_dim.N7 * m_dim.N6 * m_dim.N5 * m_dim.N4 * m_dim.N3 * - m_dim.N2 * m_dim.N1; + return m_stride == static_cast(m_dim.N7) * m_dim.N6 * m_dim.N5 * + m_dim.N4 * m_dim.N3 * m_dim.N2 * m_dim.N1; } /* Strides of dimensions */ @@ -1624,19 +1612,21 @@ struct ViewOffset< return m_dim.N7; } KOKKOS_INLINE_FUNCTION constexpr size_type stride_5() const { - return m_dim.N7 * m_dim.N6; + return static_cast(m_dim.N7) * m_dim.N6; } KOKKOS_INLINE_FUNCTION constexpr size_type stride_4() const { - return m_dim.N7 * m_dim.N6 * m_dim.N5; + return static_cast(m_dim.N7) * m_dim.N6 * m_dim.N5; } KOKKOS_INLINE_FUNCTION constexpr size_type stride_3() const { - return m_dim.N7 * m_dim.N6 * m_dim.N5 * m_dim.N4; + return static_cast(m_dim.N7) * m_dim.N6 * m_dim.N5 * m_dim.N4; } KOKKOS_INLINE_FUNCTION constexpr size_type stride_2() const { - return m_dim.N7 * m_dim.N6 * m_dim.N5 * m_dim.N4 * m_dim.N3; + return static_cast(m_dim.N7) * m_dim.N6 * m_dim.N5 * m_dim.N4 * + m_dim.N3; } KOKKOS_INLINE_FUNCTION constexpr size_type stride_1() const { - return m_dim.N7 * m_dim.N6 * m_dim.N5 * m_dim.N4 * m_dim.N3 * m_dim.N2; + return static_cast(m_dim.N7) * m_dim.N6 * m_dim.N5 * m_dim.N4 * + m_dim.N3 * m_dim.N2; } KOKKOS_INLINE_FUNCTION constexpr size_type stride_0() const { return m_stride; @@ -1749,13 +1739,31 @@ struct ViewOffset< } #else - ViewOffset() = default; - ViewOffset(const ViewOffset&) = default; + ViewOffset() = default; + ViewOffset(const ViewOffset&) = default; ViewOffset& operator=(const ViewOffset&) = default; #endif /* Enable padding for trivial scalar types with non-zero trivial scalar size. */ + + private: + template + KOKKOS_FUNCTION constexpr size_type compute_stride( + const Kokkos::LayoutRight& arg_layout) { + if (arg_layout.stride != KOKKOS_IMPL_CTOR_DEFAULT_ARG) + return arg_layout.stride; + size_type value = m_dim.N1; + if constexpr (dimension_type::rank > 2) value *= m_dim.N2; + if constexpr (dimension_type::rank > 3) value *= m_dim.N3; + if constexpr (dimension_type::rank > 4) value *= m_dim.N4; + if constexpr (dimension_type::rank > 5) value *= m_dim.N5; + if constexpr (dimension_type::rank > 6) value *= m_dim.N6; + if constexpr (dimension_type::rank > 7) value *= m_dim.N7; + return Padding::stride(value); + } + + public: template KOKKOS_INLINE_FUNCTION constexpr ViewOffset( std::integral_constant const&, @@ -1764,37 +1772,7 @@ struct ViewOffset< arg_layout.dimension[2], arg_layout.dimension[3], arg_layout.dimension[4], arg_layout.dimension[5], arg_layout.dimension[6], arg_layout.dimension[7]), - m_stride( - Padding:: - stride(/* 2 <= rank */ - m_dim.N1 * - (dimension_type::rank == 2 - ? size_t(1) - : m_dim.N2 * - (dimension_type::rank == 3 - ? size_t(1) - : m_dim.N3 * - (dimension_type::rank == 4 - ? size_t(1) - : m_dim.N4 * - (dimension_type::rank == - 5 - ? size_t(1) - : m_dim.N5 * - (dimension_type:: - rank == - 6 - ? size_t( - 1) - : m_dim.N6 * - (dimension_type:: - rank == - 7 - ? size_t( - 1) - : m_dim - .N7)))))))) { - } + m_stride(compute_stride(arg_layout)) {} template KOKKOS_INLINE_FUNCTION constexpr ViewOffset( @@ -1886,8 +1864,8 @@ struct ViewStride<0> { static constexpr size_t S0 = 0, S1 = 0, S2 = 0, S3 = 0, S4 = 0, S5 = 0, S6 = 0, S7 = 0; - ViewStride() = default; - ViewStride(const ViewStride&) = default; + ViewStride() = default; + ViewStride(const ViewStride&) = default; ViewStride& operator=(const ViewStride&) = default; KOKKOS_INLINE_FUNCTION @@ -1901,8 +1879,8 @@ struct ViewStride<1> { static constexpr size_t S1 = 0, S2 = 0, S3 = 0, S4 = 0, S5 = 0, S6 = 0, S7 = 0; - ViewStride() = default; - ViewStride(const ViewStride&) = default; + ViewStride() = default; + ViewStride(const ViewStride&) = default; ViewStride& operator=(const ViewStride&) = default; KOKKOS_INLINE_FUNCTION @@ -1916,8 +1894,8 @@ struct ViewStride<2> { size_t S0, S1; static constexpr size_t S2 = 0, S3 = 0, S4 = 0, S5 = 0, S6 = 0, S7 = 0; - ViewStride() = default; - ViewStride(const ViewStride&) = default; + ViewStride() = default; + ViewStride(const ViewStride&) = default; ViewStride& operator=(const ViewStride&) = default; KOKKOS_INLINE_FUNCTION @@ -1931,8 +1909,8 @@ struct ViewStride<3> { size_t S0, S1, S2; static constexpr size_t S3 = 0, S4 = 0, S5 = 0, S6 = 0, S7 = 0; - ViewStride() = default; - ViewStride(const ViewStride&) = default; + ViewStride() = default; + ViewStride(const ViewStride&) = default; ViewStride& operator=(const ViewStride&) = default; KOKKOS_INLINE_FUNCTION @@ -1946,8 +1924,8 @@ struct ViewStride<4> { size_t S0, S1, S2, S3; static constexpr size_t S4 = 0, S5 = 0, S6 = 0, S7 = 0; - ViewStride() = default; - ViewStride(const ViewStride&) = default; + ViewStride() = default; + ViewStride(const ViewStride&) = default; ViewStride& operator=(const ViewStride&) = default; KOKKOS_INLINE_FUNCTION @@ -1961,8 +1939,8 @@ struct ViewStride<5> { size_t S0, S1, S2, S3, S4; static constexpr size_t S5 = 0, S6 = 0, S7 = 0; - ViewStride() = default; - ViewStride(const ViewStride&) = default; + ViewStride() = default; + ViewStride(const ViewStride&) = default; ViewStride& operator=(const ViewStride&) = default; KOKKOS_INLINE_FUNCTION @@ -1976,8 +1954,8 @@ struct ViewStride<6> { size_t S0, S1, S2, S3, S4, S5; static constexpr size_t S6 = 0, S7 = 0; - ViewStride() = default; - ViewStride(const ViewStride&) = default; + ViewStride() = default; + ViewStride(const ViewStride&) = default; ViewStride& operator=(const ViewStride&) = default; KOKKOS_INLINE_FUNCTION @@ -1991,8 +1969,8 @@ struct ViewStride<7> { size_t S0, S1, S2, S3, S4, S5, S6; static constexpr size_t S7 = 0; - ViewStride() = default; - ViewStride(const ViewStride&) = default; + ViewStride() = default; + ViewStride(const ViewStride&) = default; ViewStride& operator=(const ViewStride&) = default; KOKKOS_INLINE_FUNCTION @@ -2005,8 +1983,8 @@ template <> struct ViewStride<8> { size_t S0, S1, S2, S3, S4, S5, S6, S7; - ViewStride() = default; - ViewStride(const ViewStride&) = default; + ViewStride() = default; + ViewStride(const ViewStride&) = default; ViewStride& operator=(const ViewStride&) = default; KOKKOS_INLINE_FUNCTION @@ -2283,8 +2261,8 @@ struct ViewOffset { } #else - ViewOffset() = default; - ViewOffset(const ViewOffset&) = default; + ViewOffset() = default; + ViewOffset(const ViewOffset&) = default; ViewOffset& operator=(const ViewOffset&) = default; #endif @@ -2398,9 +2376,9 @@ struct ViewDataHandle { template struct ViewDataHandle< Traits, - std::enable_if_t<(std::is_same::value && - std::is_void::value && + std::enable_if_t<(std::is_same_v && + std::is_void_v && Traits::memory_traits::is_atomic)>> { using value_type = typename Traits::value_type; using handle_type = typename Kokkos::Impl::AtomicViewDataHandle; @@ -2422,11 +2400,10 @@ struct ViewDataHandle< template struct ViewDataHandle< - Traits, - std::enable_if_t<(std::is_void::value && - (!Traits::memory_traits::is_aligned) && - Traits::memory_traits::is_restrict && - (!Traits::memory_traits::is_atomic))>> { + Traits, std::enable_if_t<(std::is_void_v && + (!Traits::memory_traits::is_aligned) && + Traits::memory_traits::is_restrict && + (!Traits::memory_traits::is_atomic))>> { using value_type = typename Traits::value_type; using handle_type = typename Traits::value_type* KOKKOS_RESTRICT; using return_type = typename Traits::value_type& KOKKOS_RESTRICT; @@ -2446,11 +2423,10 @@ struct ViewDataHandle< template struct ViewDataHandle< - Traits, - std::enable_if_t<(std::is_void::value && - Traits::memory_traits::is_aligned && - (!Traits::memory_traits::is_restrict) && - (!Traits::memory_traits::is_atomic))>> { + Traits, std::enable_if_t<(std::is_void_v && + Traits::memory_traits::is_aligned && + (!Traits::memory_traits::is_restrict) && + (!Traits::memory_traits::is_atomic))>> { using value_type = typename Traits::value_type; // typedef work-around for intel compilers error #3186: expected typedef // declaration @@ -2485,11 +2461,10 @@ struct ViewDataHandle< template struct ViewDataHandle< - Traits, - std::enable_if_t<(std::is_void::value && - Traits::memory_traits::is_aligned && - Traits::memory_traits::is_restrict && - (!Traits::memory_traits::is_atomic))>> { + Traits, std::enable_if_t<(std::is_void_v && + Traits::memory_traits::is_aligned && + Traits::memory_traits::is_restrict && + (!Traits::memory_traits::is_atomic))>> { using value_type = typename Traits::value_type; // typedef work-around for intel compilers error #3186: expected typedef // declaration @@ -2533,11 +2508,10 @@ namespace Impl { /** \brief View mapping for non-specialized data type and standard layout */ template class ViewMapping< - Traits, - std::enable_if_t<( - std::is_void::value && - ViewOffset::is_mapping_plugin::value)>> { + Traits, std::enable_if_t<(std::is_void_v && + ViewOffset::is_mapping_plugin::value)>> { public: using offset_type = ViewOffset; @@ -2680,28 +2654,26 @@ class ViewMapping< reference_type reference() const { return m_impl_handle[0]; } template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(std::is_integral::value && - // if layout is neither stride nor irregular, - // then just use the handle directly - !(std::is_same::value || - !is_regular::value)), - reference_type> - reference(const I0& i0) const { + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (std::is_integral_v && + // if layout is neither stride nor irregular, + // then just use the handle directly + !(std::is_same_v || + !is_regular::value)), + reference_type> + reference(const I0& i0) const { return m_impl_handle[i0]; } template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(std::is_integral::value && - // if the layout is strided or irregular, then - // we have to use the offset - (std::is_same::value || - !is_regular::value)), - reference_type> - reference(const I0& i0) const { + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (std::is_integral_v && + // if the layout is strided or irregular, then + // we have to use the offset + (std::is_same_v || + !is_regular::value)), + reference_type> + reference(const I0& i0) const { return m_impl_handle[m_impl_offset(i0)]; } @@ -2780,7 +2752,7 @@ class ViewMapping< KOKKOS_DEFAULTED_FUNCTION ViewMapping& operator=(const ViewMapping&) = default; - KOKKOS_DEFAULTED_FUNCTION ViewMapping(ViewMapping&&) = default; + KOKKOS_DEFAULTED_FUNCTION ViewMapping(ViewMapping&&) = default; KOKKOS_DEFAULTED_FUNCTION ViewMapping& operator=(ViewMapping&&) = default; //---------------------------------------- @@ -2894,29 +2866,34 @@ template class ViewMapping< DstTraits, SrcTraits, std::enable_if_t<( - !(std::is_same:: - value) && // Added to have a new specialization for SrcType of - // LayoutStride + !(std::is_same_v)&& // Added to have a new + // specialization for + // SrcType of + // LayoutStride // default mappings - std::is_void::value && - std::is_void::value && + std::is_void_v && + std::is_void_v && ( // same layout - std::is_same::value || + std::is_same_v || // known layout - ((std::is_same::value || - std::is_same::value || - std::is_same::value) && - (std::is_same::value || - std::is_same::value || - std::is_same::value))))>> { + ((std::is_same_v || + std::is_same_v || + std::is_same_v< + typename DstTraits::array_layout, + Kokkos::LayoutStride>)&&(std::is_same_v || + std::is_same_v< + typename SrcTraits::array_layout, + Kokkos::LayoutRight> || + std::is_same_v< + typename SrcTraits::array_layout, + Kokkos::LayoutStride>))))>> { private: enum { is_assignable_space = Kokkos::Impl::MemorySpaceAccess< @@ -2926,10 +2903,10 @@ class ViewMapping< enum { is_assignable_value_type = - std::is_same::value || - std::is_same::value + std::is_same_v || + std::is_same_v }; enum { @@ -2939,12 +2916,12 @@ class ViewMapping< }; enum { - is_assignable_layout = - std::is_same::value || - std::is_same::value || - (DstTraits::dimension::rank == 0) || (DstTraits::dimension::rank == 1) + is_assignable_layout = std::is_same_v || + std::is_same_v || + (DstTraits::dimension::rank == 0) || + (DstTraits::dimension::rank == 1) }; public: @@ -3032,22 +3009,21 @@ class ViewMapping< template class ViewMapping< DstTraits, SrcTraits, - std::enable_if_t<( - std::is_same::value && - std::is_void::value && - std::is_void::value && - ( - // same layout - std::is_same::value || - // known layout - (std::is_same::value || - std::is_same::value || - std::is_same::value)))>> { + std::enable_if_t<(std::is_same_v && + std::is_void_v && + std::is_void_v && + ( + // same layout + std::is_same_v || + // known layout + (std::is_same_v || + std::is_same_v || + std::is_same_v)))>> { private: enum { is_assignable_space = Kokkos::Impl::MemorySpaceAccess< @@ -3057,10 +3033,10 @@ class ViewMapping< enum { is_assignable_value_type = - std::is_same::value || - std::is_same::value + std::is_same_v || + std::is_same_v }; enum { @@ -3091,8 +3067,7 @@ class ViewMapping< bool assignable = true; src.stride(strides); size_t exp_stride = 1; - if (std::is_same::value) { + if (std::is_same_v) { for (int i = 0; i < (int)src.Rank; i++) { if (i > 0) exp_stride *= src.extent(i - 1); if (strides[i] != exp_stride) { @@ -3100,8 +3075,8 @@ class ViewMapping< break; } } - } else if (std::is_same::value) { + } else if (std::is_same_v) { for (int i = 0; i < (int)src.Rank; i++) { if (i > 0) exp_stride *= src.extent(src.Rank - i); if (strides[src.Rank - 1 - i] != exp_stride) { @@ -3197,8 +3172,8 @@ struct SubViewDataTypeImpl> { template struct SubViewDataTypeImpl< - std::enable_if_t>::value>, - ValueType, Kokkos::Experimental::Extents, Integral, Args...> + std::enable_if_t>>, ValueType, + Kokkos::Experimental::Extents, Integral, Args...> : SubViewDataTypeImpl, Args...> {}; @@ -3230,13 +3205,13 @@ struct SubViewDataType : SubViewDataTypeImpl {}; template class ViewMapping< - std::enable_if_t<(std::is_void::value && - (std::is_same::value || - std::is_same::value || - std::is_same::value))>, + std::enable_if_t<( + std::is_void_v && + (std::is_same_v || + std::is_same_v || + std::is_same_v))>, SrcTraits, Args...> { private: static_assert(SrcTraits::rank == sizeof...(Args), @@ -3292,14 +3267,14 @@ class ViewMapping< // OutputRank 1 or 2, InputLayout Left, Interval 0 // because single stride one or second index has a stride. (rank <= 2 && R0 && - std::is_same::value) // replace with input rank + std::is_same_v) // replace with input rank || // OutputRank 1 or 2, InputLayout Right, Interval [InputRank-1] // because single stride one or second index has a stride. (rank <= 2 && R0_rev && - std::is_same::value) // replace input rank + std::is_same_v) // replace input rank ), typename SrcTraits::array_layout, Kokkos::LayoutStride>; diff --git a/packages/kokkos/core/src/impl/Kokkos_ViewTracker.hpp b/packages/kokkos/core/src/View/Kokkos_ViewTracker.hpp similarity index 100% rename from packages/kokkos/core/src/impl/Kokkos_ViewTracker.hpp rename to packages/kokkos/core/src/View/Kokkos_ViewTracker.hpp diff --git a/packages/kokkos/core/src/View/Kokkos_ViewTraits.hpp b/packages/kokkos/core/src/View/Kokkos_ViewTraits.hpp new file mode 100644 index 000000000000..5eddfc68e070 --- /dev/null +++ b/packages/kokkos/core/src/View/Kokkos_ViewTraits.hpp @@ -0,0 +1,457 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE +#include +static_assert(false, + "Including non-public Kokkos header files is not allowed."); +#endif +#ifndef KOKKOS_VIEWTRAITS_HPP +#define KOKKOS_VIEWTRAITS_HPP + +#include +#include +#include +#include +#include +#ifdef KOKKOS_ENABLE_IMPL_MDSPAN +#include +#include +#endif + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +struct ALL_t { + KOKKOS_FUNCTION + constexpr const ALL_t& operator()() const { return *this; } + + KOKKOS_FUNCTION + constexpr bool operator==(const ALL_t&) const { return true; } +}; + +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +namespace Impl { +// TODO This alias declaration forces us to fully qualify ALL_t inside the +// Kokkos::Impl namespace to avoid deprecation warnings. Replace the +// fully-qualified name when we remove Kokkos::Impl::ALL_t. +using ALL_t KOKKOS_DEPRECATED_WITH_COMMENT("Use Kokkos::ALL_t instead!") = + Kokkos::ALL_t; +} // namespace Impl +#endif + +// FIXME_OPENMPTARGET - The `declare target` is needed for the Intel GPUs with +// the OpenMPTarget backend +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_COMPILER_INTEL_LLVM) +#pragma omp declare target +#endif + +inline constexpr Kokkos::ALL_t ALL{}; + +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_COMPILER_INTEL_LLVM) +#pragma omp end declare target +#endif + +namespace Impl { + +template +struct ViewArrayAnalysis; + +template ::non_const_value_type> +struct ViewDataAnalysis; + +template +class ViewMapping { + public: + enum : bool { is_assignable_data_type = false }; + enum : bool { is_assignable = false }; +}; + +template +constexpr KOKKOS_INLINE_FUNCTION std::size_t count_valid_integers( + const IntType i0, const IntType i1, const IntType i2, const IntType i3, + const IntType i4, const IntType i5, const IntType i6, const IntType i7) { + static_assert(std::is_integral_v, + "count_valid_integers() must have integer arguments."); + + return (i0 != KOKKOS_INVALID_INDEX) + (i1 != KOKKOS_INVALID_INDEX) + + (i2 != KOKKOS_INVALID_INDEX) + (i3 != KOKKOS_INVALID_INDEX) + + (i4 != KOKKOS_INVALID_INDEX) + (i5 != KOKKOS_INVALID_INDEX) + + (i6 != KOKKOS_INVALID_INDEX) + (i7 != KOKKOS_INVALID_INDEX); +} + +// FIXME Ideally, we would not instantiate this function for every possible View +// type. We should be able to only pass "extent" when we use mdspan. +template +KOKKOS_INLINE_FUNCTION void runtime_check_rank( + const View&, const bool is_void_spec, const size_t i0, const size_t i1, + const size_t i2, const size_t i3, const size_t i4, const size_t i5, + const size_t i6, const size_t i7, const char* label) { + (void)(label); + + if (is_void_spec) { + const size_t num_passed_args = + count_valid_integers(i0, i1, i2, i3, i4, i5, i6, i7); + // We either allow to pass as many extents as the dynamic rank is, or + // as many extents as the total rank is. In the latter case, the given + // extents for the static dimensions must match the + // compile-time extents. + constexpr int rank = View::rank(); + constexpr int dyn_rank = View::rank_dynamic(); + const bool n_args_is_dyn_rank = num_passed_args == dyn_rank; + const bool n_args_is_rank = num_passed_args == rank; + + if constexpr (rank != dyn_rank) { + if (n_args_is_rank) { + size_t new_extents[8] = {i0, i1, i2, i3, i4, i5, i6, i7}; + for (int i = dyn_rank; i < rank; ++i) + if (new_extents[i] != View::static_extent(i)) { + KOKKOS_IF_ON_HOST( + const std::string message = + "The specified run-time extent for Kokkos::View '" + + std::string(label) + + "' does not match the compile-time extent in dimension " + + std::to_string(i) + ". The given extent is " + + std::to_string(new_extents[i]) + " but should be " + + std::to_string(View::static_extent(i)) + ".\n"; + Kokkos::abort(message.c_str());) + KOKKOS_IF_ON_DEVICE( + Kokkos::abort( + "The specified run-time extents for a Kokkos::View " + "do not match the compile-time extents.");) + } + } + } + + if (!n_args_is_dyn_rank && !n_args_is_rank) { + KOKKOS_IF_ON_HOST( + const std::string message = + "Constructor for Kokkos::View '" + std::string(label) + + "' has mismatched number of arguments. The number " + "of arguments = " + + std::to_string(num_passed_args) + + " neither matches the dynamic rank = " + + std::to_string(dyn_rank) + + " nor the total rank = " + std::to_string(rank) + "\n"; + Kokkos::abort(message.c_str());) + KOKKOS_IF_ON_DEVICE(Kokkos::abort("Constructor for Kokkos View has " + "mismatched number of arguments.");) + } + } +} + +} /* namespace Impl */ +} /* namespace Kokkos */ + +// Class to provide a uniform type +namespace Kokkos { +namespace Impl { +template +struct ViewUniformType; +} +} // namespace Kokkos + +namespace Kokkos { + +#ifdef KOKKOS_ENABLE_IMPL_MDSPAN +namespace Impl { +struct UnsupportedKokkosArrayLayout; + +template +struct MDSpanViewTraits { + using mdspan_type = UnsupportedKokkosArrayLayout; +}; + +// "Natural" mdspan for a view if the View's ArrayLayout is supported. +template +struct MDSpanViewTraits::type>> { + using index_type = std::size_t; + using extents_type = + typename Impl::ExtentsFromDataType::type; + using mdspan_layout_type = + typename LayoutFromArrayLayout::type; + using accessor_type = + SpaceAwareAccessor>; + using mdspan_type = mdspan; +}; +} // namespace Impl +#endif // KOKKOS_ENABLE_IMPL_MDSPAN + +/** \class ViewTraits + * \brief Traits class for accessing attributes of a View. + * + * This is an implementation detail of View. It is only of interest + * to developers implementing a new specialization of View. + * + * Template argument options: + * - View< DataType > + * - View< DataType , Space > + * - View< DataType , Space , MemoryTraits > + * - View< DataType , ArrayLayout > + * - View< DataType , ArrayLayout , Space > + * - View< DataType , ArrayLayout , MemoryTraits > + * - View< DataType , ArrayLayout , Space , MemoryTraits > + * - View< DataType , MemoryTraits > + */ + +template +struct ViewTraits; + +template <> +struct ViewTraits { + using execution_space = void; + using memory_space = void; + using HostMirrorSpace = void; + using array_layout = void; + using memory_traits = void; + using specialize = void; + using hooks_policy = void; +}; + +template +struct ViewTraits { + // Ignore an extraneous 'void' + using execution_space = typename ViewTraits::execution_space; + using memory_space = typename ViewTraits::memory_space; + using HostMirrorSpace = typename ViewTraits::HostMirrorSpace; + using array_layout = typename ViewTraits::array_layout; + using memory_traits = typename ViewTraits::memory_traits; + using specialize = typename ViewTraits::specialize; + using hooks_policy = typename ViewTraits::hooks_policy; +}; + +template +struct ViewTraits< + std::enable_if_t::value>, + HooksPolicy, Prop...> { + using execution_space = typename ViewTraits::execution_space; + using memory_space = typename ViewTraits::memory_space; + using HostMirrorSpace = typename ViewTraits::HostMirrorSpace; + using array_layout = typename ViewTraits::array_layout; + using memory_traits = typename ViewTraits::memory_traits; + using specialize = typename ViewTraits::specialize; + using hooks_policy = HooksPolicy; +}; + +template +struct ViewTraits::value>, + ArrayLayout, Prop...> { + // Specify layout, keep subsequent space and memory traits arguments + + using execution_space = typename ViewTraits::execution_space; + using memory_space = typename ViewTraits::memory_space; + using HostMirrorSpace = typename ViewTraits::HostMirrorSpace; + using array_layout = ArrayLayout; + using memory_traits = typename ViewTraits::memory_traits; + using specialize = typename ViewTraits::specialize; + using hooks_policy = typename ViewTraits::hooks_policy; +}; + +template +struct ViewTraits::value>, Space, + Prop...> { + // Specify Space, memory traits should be the only subsequent argument. + + static_assert( + std::is_same_v::execution_space, + void> && + std::is_same_v::memory_space, + void> && + std::is_same_v::HostMirrorSpace, + void> && + std::is_same_v::array_layout, + void>, + "Only one View Execution or Memory Space template argument"); + + using execution_space = typename Space::execution_space; + using memory_space = typename Space::memory_space; + using HostMirrorSpace = + typename Kokkos::Impl::HostMirror::Space::memory_space; + using array_layout = typename execution_space::array_layout; + using memory_traits = typename ViewTraits::memory_traits; + using specialize = typename ViewTraits::specialize; + using hooks_policy = typename ViewTraits::hooks_policy; +}; + +template +struct ViewTraits< + std::enable_if_t::value>, + MemoryTraits, Prop...> { + // Specify memory trait, should not be any subsequent arguments + + static_assert( + std::is_same_v::execution_space, + void> && + std::is_same_v::memory_space, + void> && + std::is_same_v::array_layout, + void> && + std::is_same_v::memory_traits, + void> && + std::is_same_v::hooks_policy, + void>, + "MemoryTrait is the final optional template argument for a View"); + + using execution_space = void; + using memory_space = void; + using HostMirrorSpace = void; + using array_layout = void; + using memory_traits = MemoryTraits; + using specialize = void; + using hooks_policy = void; +}; + +template +struct ViewTraits { + private: + // Unpack the properties arguments + using prop = ViewTraits; + + using ExecutionSpace = + std::conditional_t, + typename prop::execution_space, + Kokkos::DefaultExecutionSpace>; + + using MemorySpace = + std::conditional_t, + typename prop::memory_space, + typename ExecutionSpace::memory_space>; + + using ArrayLayout = + std::conditional_t, + typename prop::array_layout, + typename ExecutionSpace::array_layout>; + + using HostMirrorSpace = std::conditional_t< + !std::is_void_v, + typename prop::HostMirrorSpace, + typename Kokkos::Impl::HostMirror::Space>; + + using MemoryTraits = + std::conditional_t, + typename prop::memory_traits, + typename Kokkos::MemoryManaged>; + + using HooksPolicy = + std::conditional_t, + typename prop::hooks_policy, + Kokkos::Experimental::DefaultViewHooks>; + + // Analyze data type's properties, + // May be specialized based upon the layout and value type + using data_analysis = Kokkos::Impl::ViewDataAnalysis; + + public: + //------------------------------------ + // Data type traits: + + using data_type = typename data_analysis::type; + using const_data_type = typename data_analysis::const_type; + using non_const_data_type = typename data_analysis::non_const_type; + + //------------------------------------ + // Compatible array of trivial type traits: + + using scalar_array_type = typename data_analysis::scalar_array_type; + using const_scalar_array_type = + typename data_analysis::const_scalar_array_type; + using non_const_scalar_array_type = + typename data_analysis::non_const_scalar_array_type; + + //------------------------------------ + // Value type traits: + + using value_type = typename data_analysis::value_type; + using const_value_type = typename data_analysis::const_value_type; + using non_const_value_type = typename data_analysis::non_const_value_type; + + //------------------------------------ + // Mapping traits: + + using array_layout = ArrayLayout; + using dimension = typename data_analysis::dimension; + + using specialize = std::conditional_t< + std::is_void_v, + typename prop::specialize, + typename data_analysis::specialize>; /* mapping specialization tag */ + + static constexpr unsigned rank = dimension::rank; + static constexpr unsigned rank_dynamic = dimension::rank_dynamic; + + //------------------------------------ + // Execution space, memory space, memory access traits, and host mirror space. + + using execution_space = ExecutionSpace; + using memory_space = MemorySpace; + using device_type = Kokkos::Device; + using memory_traits = MemoryTraits; + using host_mirror_space = HostMirrorSpace; + using hooks_policy = HooksPolicy; + + using size_type = typename MemorySpace::size_type; + + enum { is_hostspace = std::is_same_v }; + enum { is_managed = MemoryTraits::is_unmanaged == 0 }; + enum { is_random_access = MemoryTraits::is_random_access == 1 }; + + //------------------------------------ +}; + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Impl { +template +struct TypeListToViewTraits; + +template +struct TypeListToViewTraits> { + using type = ViewTraits; +}; + +// It is not safe to assume that subviews of views with the Aligned memory trait +// are also aligned. Hence, just remove that attribute for subviews. +template +struct RemoveAlignedMemoryTrait { + private: + using type_list_in = Kokkos::Impl::type_list; + using memory_traits = typename ViewTraits::memory_traits; + using type_list_in_wo_memory_traits = + typename Kokkos::Impl::type_list_remove_first::type; + using new_memory_traits = + Kokkos::MemoryTraits; + using new_type_list = typename Kokkos::Impl::concat_type_list< + type_list_in_wo_memory_traits, + Kokkos::Impl::type_list>::type; + + public: + using type = typename TypeListToViewTraits::type; +}; +} // namespace Impl + +} /* namespace Kokkos */ + +#endif /* KOKKOS_VIEWTRAITS_HPP */ diff --git a/packages/kokkos/core/src/impl/Kokkos_ViewUniformType.hpp b/packages/kokkos/core/src/View/Kokkos_ViewUniformType.hpp similarity index 88% rename from packages/kokkos/core/src/impl/Kokkos_ViewUniformType.hpp rename to packages/kokkos/core/src/View/Kokkos_ViewUniformType.hpp index 7de2869a0d88..1e476132858c 100644 --- a/packages/kokkos/core/src/impl/Kokkos_ViewUniformType.hpp +++ b/packages/kokkos/core/src/View/Kokkos_ViewUniformType.hpp @@ -24,11 +24,14 @@ namespace Impl { template struct ViewScalarToDataType { using type = typename ViewScalarToDataType::type *; + using const_type = + typename ViewScalarToDataType::const_type *; }; template struct ViewScalarToDataType { - using type = ScalarType; + using type = ScalarType; + using const_type = const ScalarType; }; template @@ -49,12 +52,13 @@ struct ViewUniformLayout { template struct ViewUniformType { using data_type = typename ViewType::data_type; - using const_data_type = std::add_const_t; + using const_data_type = typename ViewType::const_data_type; using runtime_data_type = typename ViewScalarToDataType::type; - using runtime_const_data_type = typename ViewScalarToDataType< - std::add_const_t, ViewType::rank>::type; + using runtime_const_data_type = + typename ViewScalarToDataType::const_type; using array_layout = typename ViewUniformLayout { } KOKKOS_FUNCTION - constexpr typename offset_policy::data_handle_type offset(data_handle_type p, - size_t i) const - noexcept { + constexpr typename offset_policy::data_handle_type offset( + data_handle_type p, size_t i) const noexcept { return nested_acc.offset(p, i); } @@ -214,6 +212,199 @@ struct AtomicAccessorRelaxed { } }; +//===================================================================== +//============= Reference Counted Accessor and DataHandle ============= +//===================================================================== + +template +class ReferenceCountedDataHandle { + public: + using value_type = ElementType; + using pointer = value_type*; + using reference = value_type&; + using memory_space = MemorySpace; + + KOKKOS_DEFAULTED_FUNCTION + ReferenceCountedDataHandle() = default; + + // this only ever works on host + explicit ReferenceCountedDataHandle(SharedAllocationRecord* rec) { + m_tracker.assign_allocated_record_to_uninitialized(rec); + m_handle = static_cast(get_record()->data()); + } + + KOKKOS_FUNCTION + ReferenceCountedDataHandle(const SharedAllocationTracker& tracker, + pointer data_handle) + : m_tracker(tracker), m_handle(data_handle) {} + + // unmanaged ctor + template >> + KOKKOS_FUNCTION ReferenceCountedDataHandle(OtherElementType* ptr) + : m_tracker(), m_handle(ptr) {} + + // subview ctor + template >> + KOKKOS_FUNCTION ReferenceCountedDataHandle( + const ReferenceCountedDataHandle& other, OtherElementType* ptr) + : m_tracker(other.m_tracker), m_handle(ptr) {} + + // converting ctor + template >> + KOKKOS_FUNCTION ReferenceCountedDataHandle( + const ReferenceCountedDataHandle& other) + : m_tracker(other.m_tracker), m_handle(other.m_handle) {} + + template < + class OtherElementType, class OtherSpace, + class = std::enable_if_t< + std::is_convertible_v && + (std::is_same_v || + std::is_same_v)>> + KOKKOS_FUNCTION ReferenceCountedDataHandle( + const ReferenceCountedDataHandle& other) + : m_tracker(other.m_tracker), m_handle(other.m_handle) {} + + KOKKOS_FUNCTION + pointer get() const noexcept { return m_handle; } + KOKKOS_FUNCTION + explicit operator pointer() const noexcept { return m_handle; } + + bool has_record() const { return m_tracker.has_record(); } + auto* get_record() const { return m_tracker.get_record(); } + int use_count() const noexcept { return m_tracker.use_count(); } + + std::string get_label() const { return m_tracker.get_label(); } + KOKKOS_FUNCTION + const SharedAllocationTracker& tracker() const noexcept { return m_tracker; } + + KOKKOS_FUNCTION + friend bool operator==(const ReferenceCountedDataHandle& lhs, + const value_type* rhs) { + return lhs.m_handle == rhs; + } + + KOKKOS_FUNCTION + friend bool operator==(const value_type* lhs, + const ReferenceCountedDataHandle& rhs) { + return lhs == rhs.m_handle; + } + + private: + template + friend class ReferenceCountedDataHandle; + + template + friend class ReferenceCountedAccessor; + + SharedAllocationTracker m_tracker; + pointer m_handle = nullptr; +}; + +template +class ReferenceCountedAccessor; + +template +struct IsReferenceCountedAccessor : std::false_type {}; + +template +struct IsReferenceCountedAccessor< + ReferenceCountedAccessor> + : std::true_type {}; + +template +class ReferenceCountedAccessor { + public: + using element_type = ElementType; + using data_handle_type = ReferenceCountedDataHandle; + using reference = typename NestedAccessor::reference; + using offset_policy = + ReferenceCountedAccessor; + using memory_space = MemorySpace; + + KOKKOS_DEFAULTED_FUNCTION + constexpr ReferenceCountedAccessor() noexcept = default; + + template < + class OtherElementType, class OtherNestedAccessor, + class = std::enable_if_t< + std::is_convertible_v && + std::is_constructible_v>> + KOKKOS_FUNCTION constexpr ReferenceCountedAccessor( + const ReferenceCountedAccessor&) {} + + template < + class OtherElementType, class OtherSpace, class OtherNestedAccessor, + class = std::enable_if_t< + std::is_convertible_v && + (std::is_same_v || + std::is_same_v)&&std:: + is_constructible_v>> + KOKKOS_FUNCTION constexpr ReferenceCountedAccessor( + const ReferenceCountedAccessor&) {} + + template >> + KOKKOS_FUNCTION constexpr ReferenceCountedAccessor( + const default_accessor&) {} + + template ::value && + std::is_convertible_v>> + KOKKOS_FUNCTION operator DstAccessor() const { + return m_nested_acc; + } + + KOKKOS_FUNCTION + constexpr reference access(data_handle_type p, size_t i) const { + return m_nested_acc.access(p.get(), i); + } + + KOKKOS_FUNCTION + constexpr data_handle_type offset(data_handle_type p, size_t i) const { + return data_handle_type(p, m_nested_acc.offset(p.get(), i)); + } + + KOKKOS_FUNCTION + constexpr auto nested_accessor() const { return m_nested_acc; } + + private: +#ifdef _MDSPAN_NO_UNIQUE_ADDRESS + _MDSPAN_NO_UNIQUE_ADDRESS +#else + [[no_unique_address]] +#endif + NestedAccessor m_nested_acc; +}; + +template +using CheckedReferenceCountedAccessor = + SpaceAwareAccessor>>; + +template +using CheckedRelaxedAtomicAccessor = + SpaceAwareAccessor>; + +template +using CheckedReferenceCountedRelaxedAtomicAccessor = SpaceAwareAccessor< + MemorySpace, ReferenceCountedAccessor>>; + } // namespace Impl } // namespace Kokkos diff --git a/packages/kokkos/core/src/View/MDSpan/Kokkos_MDSpan_Layout.hpp b/packages/kokkos/core/src/View/MDSpan/Kokkos_MDSpan_Layout.hpp index 089628137d75..f990d158bfad 100644 --- a/packages/kokkos/core/src/View/MDSpan/Kokkos_MDSpan_Layout.hpp +++ b/packages/kokkos/core/src/View/MDSpan/Kokkos_MDSpan_Layout.hpp @@ -23,7 +23,11 @@ static_assert(false, #define KOKKOS_EXPERIMENTAL_MDSPAN_LAYOUT_HPP #include "Kokkos_MDSpan_Extents.hpp" -#include +#include + +// The difference between a legacy Kokkos array layout and an +// mdspan layout is that the array layouts can have state, but don't have the +// nested mapping. This file provides interoperability helpers. namespace Kokkos::Impl { @@ -77,32 +81,7 @@ KOKKOS_INLINE_FUNCTION auto array_layout_from_mapping( rank > 7 ? mapping.stride(7) : 0, }; } else { - // FIXME: Kokkos Layouts don't store stride (it's in the mapping) - // We could conceivably fix this by adding an extra ViewCtorProp for - // an abritrary padding. For now we will check for this. - if constexpr (rank > 1 && - (std::is_same_v> || - std::is_same_v>)) { - [[maybe_unused]] constexpr size_t strided_index = - std::is_same_v< - typename mapping_type::layout_type, - Kokkos::Experimental::layout_left_padded> - ? 1 - : rank - 2; - [[maybe_unused]] constexpr size_t extent_index = - std::is_same_v< - typename mapping_type::layout_type, - Kokkos::Experimental::layout_left_padded> - ? 0 - : rank - 1; - KOKKOS_ASSERT(mapping.stride(strided_index) == ext.extent(extent_index)); - } - - return ArrayLayout{rank > 0 ? ext.extent(0) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + ArrayLayout layout{rank > 0 ? ext.extent(0) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, rank > 1 ? ext.extent(1) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, rank > 2 ? ext.extent(2) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, rank > 3 ? ext.extent(3) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, @@ -110,12 +89,98 @@ KOKKOS_INLINE_FUNCTION auto array_layout_from_mapping( rank > 5 ? ext.extent(5) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, rank > 6 ? ext.extent(6) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, rank > 7 ? ext.extent(7) : KOKKOS_IMPL_CTOR_DEFAULT_ARG}; + + if constexpr (rank > 1 && + std::is_same_v>) { + layout.stride = mapping.stride(1); + } + if constexpr (std::is_same_v>) { + if constexpr (rank == 2) { + layout.stride = mapping.stride(0); + } + if constexpr (rank > 2) { + if (mapping.stride(rank - 2) != mapping.extents().extent(rank - 1)) + Kokkos::abort( + "Invalid conversion from layout_right_padded to LayoutRight"); + } + } + return layout; } #ifdef KOKKOS_COMPILER_INTEL __builtin_unreachable(); #endif } +template +KOKKOS_INLINE_FUNCTION auto mapping_from_array_layout_impl( + ArrayLayout layout, std::index_sequence) { + using index_type = typename MappingType::index_type; + using extents_type = typename MappingType::extents_type; + if constexpr (std::is_same_v || + std::is_same_v) { + return MappingType{ + extents_type{dextents{ + layout.dimension[Idx]...}}}; + } else { + if (layout.stride == KOKKOS_IMPL_CTOR_DEFAULT_ARG || + extents_type::rank() < 2) { + return MappingType{ + extents_type{dextents{ + layout.dimension[Idx]...}}}; + } else { + if constexpr (std::is_same_v && + extents_type::rank() > 2) { + size_t product_of_dimensions = 1; + for (size_t r = 1; r < extents_type::rank(); r++) + product_of_dimensions *= layout.dimension[r]; + if (product_of_dimensions != layout.stride) + Kokkos::abort( + "Invalid conversion from LayoutRight to layout_right_padded"); + } else { + return MappingType{ + extents_type{ + dextents{ + layout.dimension[Idx]...}}, + layout.stride}; + } + } + } +} +template +KOKKOS_INLINE_FUNCTION auto mapping_from_array_layout_impl( + LayoutStride layout, std::index_sequence) { + static_assert( + std::is_same_v); + using index_type = typename MappingType::index_type; + index_type strides[MappingType::extents_type::rank()] = { + layout.stride[Idx]...}; + return MappingType{ + mdspan_non_standard_tag(), + static_cast( + dextents{ + layout.dimension[Idx]...}), + strides}; +} + +// specialization for rank 0 to avoid empty array +template +KOKKOS_INLINE_FUNCTION auto mapping_from_array_layout_impl( + LayoutStride, std::index_sequence<>) { + return MappingType{}; +} + +template +KOKKOS_INLINE_FUNCTION auto mapping_from_array_layout(ArrayLayout layout) { + return mapping_from_array_layout_impl( + layout, std::make_index_sequence()); +} + template KOKKOS_INLINE_FUNCTION auto mapping_from_view_mapping(const VM &view_mapping) { using mapping_type = typename MDSpanType::mapping_type; diff --git a/packages/kokkos/core/src/decl/Kokkos_Declare_CUDA.hpp b/packages/kokkos/core/src/decl/Kokkos_Declare_CUDA.hpp index ebdf2c8211fe..79c137bfddd4 100644 --- a/packages/kokkos/core/src/decl/Kokkos_Declare_CUDA.hpp +++ b/packages/kokkos/core/src/decl/Kokkos_Declare_CUDA.hpp @@ -28,7 +28,9 @@ #include #include #include +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 #include +#endif #include #include #include diff --git a/packages/kokkos/core/src/decl/Kokkos_Declare_SYCL.hpp b/packages/kokkos/core/src/decl/Kokkos_Declare_SYCL.hpp index d13c90825c5a..3570ed2b6e14 100644 --- a/packages/kokkos/core/src/decl/Kokkos_Declare_SYCL.hpp +++ b/packages/kokkos/core/src/decl/Kokkos_Declare_SYCL.hpp @@ -35,6 +35,16 @@ #include #include #include + +namespace Kokkos { +namespace Experimental { +using SYCLDeviceUSMSpace = ::Kokkos::SYCLDeviceUSMSpace; +using SYCLHostUSMSpace = ::Kokkos::SYCLHostUSMSpace; +using SYCLSharedUSMSpace = ::Kokkos::SYCLSharedUSMSpace; +using SYCL = ::Kokkos::SYCL; +} // namespace Experimental +} // namespace Kokkos + #endif #endif diff --git a/packages/kokkos/core/src/fwd/Kokkos_Fwd_SYCL.hpp b/packages/kokkos/core/src/fwd/Kokkos_Fwd_SYCL.hpp index 400794f86591..399b986041e9 100644 --- a/packages/kokkos/core/src/fwd/Kokkos_Fwd_SYCL.hpp +++ b/packages/kokkos/core/src/fwd/Kokkos_Fwd_SYCL.hpp @@ -19,7 +19,6 @@ #if defined(KOKKOS_ENABLE_SYCL) namespace Kokkos { -namespace Experimental { class SYCLDeviceUSMSpace; ///< Memory space on SYCL device, not accessible from ///< the host class SYCLSharedUSMSpace; ///< Memory space accessible from both the SYCL @@ -27,7 +26,6 @@ class SYCLSharedUSMSpace; ///< Memory space accessible from both the SYCL class SYCLHostUSMSpace; ///< Memory space accessible from both the SYCL ///< device and the host (host pinned) class SYCL; ///< Execution space for SYCL -} // namespace Experimental } // namespace Kokkos #endif #endif diff --git a/packages/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp b/packages/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp index a44ffefa6b72..a9db2c4cf4a3 100644 --- a/packages/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp +++ b/packages/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp @@ -1458,7 +1458,7 @@ struct Tile_Loop_Type<8, IsLeft, IType, void, void> { template struct Tile_Loop_Type<1, IsLeft, IType, Tagged, - std::enable_if_t::value>> { + std::enable_if_t>> { template static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { @@ -1477,7 +1477,7 @@ struct Tile_Loop_Type<1, IsLeft, IType, Tagged, template struct Tile_Loop_Type<2, IsLeft, IType, Tagged, - std::enable_if_t::value>> { + std::enable_if_t>> { template static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { @@ -1496,7 +1496,7 @@ struct Tile_Loop_Type<2, IsLeft, IType, Tagged, template struct Tile_Loop_Type<3, IsLeft, IType, Tagged, - std::enable_if_t::value>> { + std::enable_if_t>> { template static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { @@ -1515,7 +1515,7 @@ struct Tile_Loop_Type<3, IsLeft, IType, Tagged, template struct Tile_Loop_Type<4, IsLeft, IType, Tagged, - std::enable_if_t::value>> { + std::enable_if_t>> { template static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { @@ -1534,7 +1534,7 @@ struct Tile_Loop_Type<4, IsLeft, IType, Tagged, template struct Tile_Loop_Type<5, IsLeft, IType, Tagged, - std::enable_if_t::value>> { + std::enable_if_t>> { template static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { @@ -1553,7 +1553,7 @@ struct Tile_Loop_Type<5, IsLeft, IType, Tagged, template struct Tile_Loop_Type<6, IsLeft, IType, Tagged, - std::enable_if_t::value>> { + std::enable_if_t>> { template static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { @@ -1572,7 +1572,7 @@ struct Tile_Loop_Type<6, IsLeft, IType, Tagged, template struct Tile_Loop_Type<7, IsLeft, IType, Tagged, - std::enable_if_t::value>> { + std::enable_if_t>> { template static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { @@ -1591,7 +1591,7 @@ struct Tile_Loop_Type<7, IsLeft, IType, Tagged, template struct Tile_Loop_Type<8, IsLeft, IType, Tagged, - std::enable_if_t::value>> { + std::enable_if_t>> { template static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { @@ -1616,7 +1616,7 @@ struct HostIterateTile; // For ParallelFor template struct HostIterateTile::value>> { + std::enable_if_t>> { using index_type = typename RP::index_type; using point_type = typename RP::point_type; @@ -1635,12 +1635,11 @@ struct HostIterateTile 0 - ? (m_rp.m_upper[i] - offset[i]) - : (m_rp.m_upper[i] - - m_rp.m_lower[i]); // when single tile encloses range + (m_rp.m_upper[i] - 1 - offset[i]) == 0 ? 1 + : (m_rp.m_upper[i] - m_rp.m_tile[i]) > 0 + ? (m_rp.m_upper[i] - offset[i]) + : (m_rp.m_upper[i] - + m_rp.m_lower[i]); // when single tile encloses range } } @@ -2000,30 +1999,28 @@ struct HostIterateTile - std::enable_if_t<(sizeof...(Args) == RP::rank && std::is_void::value), - void> + std::enable_if_t<(sizeof...(Args) == RP::rank && std::is_void_v), void> apply(Args&&... args) const { m_func(args...); } template - std::enable_if_t<(sizeof...(Args) == RP::rank && !std::is_void::value), - void> + std::enable_if_t<(sizeof...(Args) == RP::rank && !std::is_void_v), void> apply(Args&&... args) const { m_func(m_tag, args...); } RP const m_rp; Functor const m_func; - std::conditional_t::value, int, Tag> m_tag; + std::conditional_t, int, Tag> m_tag; }; // For ParallelReduce // ValueType - scalar: For reductions template struct HostIterateTile::value && - !std::is_array::value>> { + std::enable_if_t && + !std::is_array_v>> { using index_type = typename RP::index_type; using point_type = typename RP::point_type; @@ -2050,12 +2047,11 @@ struct HostIterateTile 0 - ? (m_rp.m_upper[i] - offset[i]) - : (m_rp.m_upper[i] - - m_rp.m_lower[i]); // when single tile encloses range + (m_rp.m_upper[i] - 1 - offset[i]) == 0 ? 1 + : (m_rp.m_upper[i] - m_rp.m_tile[i]) > 0 + ? (m_rp.m_upper[i] - offset[i]) + : (m_rp.m_upper[i] - + m_rp.m_lower[i]); // when single tile encloses range } } @@ -2430,7 +2426,7 @@ struct HostIterateTile::value, int, Tag> m_tag; + std::conditional_t, int, Tag> m_tag; }; // For ParallelReduce @@ -2438,8 +2434,8 @@ struct HostIterateTile struct HostIterateTile::value && - std::is_array::value>> { + std::enable_if_t && + std::is_array_v>> { using index_type = typename RP::index_type; using point_type = typename RP::point_type; @@ -2463,12 +2459,11 @@ struct HostIterateTile 0 - ? (m_rp.m_upper[i] - offset[i]) - : (m_rp.m_upper[i] - - m_rp.m_lower[i]); // when single tile encloses range + (m_rp.m_upper[i] - 1 - offset[i]) == 0 ? 1 + : (m_rp.m_upper[i] - m_rp.m_tile[i]) > 0 + ? (m_rp.m_upper[i] - offset[i]) + : (m_rp.m_upper[i] - + m_rp.m_lower[i]); // when single tile encloses range } } @@ -2842,7 +2837,7 @@ struct HostIterateTile::value, int, Tag> m_tag; + std::conditional_t, int, Tag> m_tag; }; // ------------------------------------------------------------------ // diff --git a/packages/kokkos/core/src/impl/KokkosExp_IterateTileGPU.hpp b/packages/kokkos/core/src/impl/KokkosExp_IterateTileGPU.hpp index e1273ab9e3bd..e6b2fcbef4bc 100644 --- a/packages/kokkos/core/src/impl/KokkosExp_IterateTileGPU.hpp +++ b/packages/kokkos/core/src/impl/KokkosExp_IterateTileGPU.hpp @@ -41,13 +41,13 @@ struct EmulateCUDADim3 { template KOKKOS_IMPL_FORCEINLINE_FUNCTION std::enable_if_t::value> _tag_invoke(Functor const& f, Args&&... args) { - f((Args &&) args...); + f((Args&&)args...); } template KOKKOS_IMPL_FORCEINLINE_FUNCTION std::enable_if_t::value> _tag_invoke(Functor const& f, Args&&... args) { - f(Tag{}, (Args &&) args...); + f(Tag{}, (Args&&)args...); } template , Args&&... args) { - _tag_invoke(f, vals[Idxs]..., (Args &&) args...); + _tag_invoke(f, vals[Idxs]..., (Args&&)args...); } template @@ -63,7 +63,7 @@ KOKKOS_IMPL_FORCEINLINE_FUNCTION void _tag_invoke_array(Functor const& f, T (&vals)[N], Args&&... args) { _tag_invoke_array_helper(f, vals, std::make_index_sequence{}, - (Args &&) args...); + (Args&&)args...); } // ------------------------------------------------------------------ // diff --git a/packages/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp b/packages/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp index d77ec0c7537f..b483653021a5 100644 --- a/packages/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp @@ -143,7 +143,7 @@ struct AnalyzeExecPolicyUseMatcher, Trait, Traits...> { static constexpr auto trigger_error_message = show_name_of_invalid_execution_policy_trait{}; static_assert( - /* always false: */ std::is_void::value, + /* always false: */ std::is_void_v, "Unknown execution policy trait. Search compiler output for " "'show_name_of_invalid_execution_policy_trait' to see the type of the " "invalid trait."); diff --git a/packages/kokkos/core/src/impl/Kokkos_ChaseLev.hpp b/packages/kokkos/core/src/impl/Kokkos_ChaseLev.hpp index d8ab77b20563..4ea0b8d343b5 100644 --- a/packages/kokkos/core/src/impl/Kokkos_ChaseLev.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_ChaseLev.hpp @@ -95,12 +95,12 @@ struct non_owning_variable_size_circular_buffer { non_owning_variable_size_circular_buffer( non_owning_variable_size_circular_buffer const&) = delete; non_owning_variable_size_circular_buffer( - non_owning_variable_size_circular_buffer&&) = default; - non_owning_variable_size_circular_buffer& operator =( + non_owning_variable_size_circular_buffer&&) = default; + non_owning_variable_size_circular_buffer& operator=( non_owning_variable_size_circular_buffer const&) = delete; - non_owning_variable_size_circular_buffer& operator =( + non_owning_variable_size_circular_buffer& operator=( non_owning_variable_size_circular_buffer&&) = default; - ~non_owning_variable_size_circular_buffer() = default; + ~non_owning_variable_size_circular_buffer() = default; KOKKOS_FORCEINLINE_FUNCTION constexpr size_type size() const noexcept { return m_size; } @@ -138,7 +138,7 @@ struct ChaseLevDeque { public: template ::value>> + std::is_default_constructible_v>> ChaseLevDeque() : m_array() {} explicit ChaseLevDeque(CircularBufferT buffer) : m_array(std::move(buffer)) {} @@ -165,7 +165,7 @@ struct ChaseLevDeque { #ifdef _WIN32 Kokkos::memory_fence(); bool const success = - Kokkos::atomic_compare_exchange_strong(&m_top, t, t + 1); + (t == Kokkos::atomic_compare_exchange(&m_top, t, t + 1)); Kokkos::memory_fence(); if (!success) { return_value = nullptr; @@ -226,7 +226,7 @@ struct ChaseLevDeque { #ifdef _WIN32 Kokkos::memory_fence(); bool const success = - Kokkos::atomic_compare_exchange_strong(&m_top, t, t + 1); + (t == Kokkos::atomic_compare_exchange(&m_top, t, t + 1)); Kokkos::memory_fence(); if (!success) { return_value = nullptr; diff --git a/packages/kokkos/core/src/impl/Kokkos_ClockTic.hpp b/packages/kokkos/core/src/impl/Kokkos_ClockTic.hpp index 6e3d99ebd685..ee53fd8bc6d4 100644 --- a/packages/kokkos/core/src/impl/Kokkos_ClockTic.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_ClockTic.hpp @@ -27,8 +27,9 @@ // To use OpenCL(TM) built-in intrinsics inside kernels, we have to // forward-declare their prototype, also see // https://github.com/intel/pti-gpu/blob/master/chapters/binary_instrumentation/OpenCLBuiltIn.md -#if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) && \ - defined(__SYCL_DEVICE_ONLY__) +#if defined(KOKKOS_ENABLE_SYCL) && \ + defined(KOKKOS_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE) && \ + defined(KOKKOS_ARCH_INTEL_GPU) && defined(__SYCL_DEVICE_ONLY__) extern SYCL_EXTERNAL unsigned long __attribute__((overloadable)) intel_get_cycle_counter(); #endif @@ -55,8 +56,10 @@ KOKKOS_IMPL_DEVICE_FUNCTION inline uint64_t clock_tic_device() noexcept { // Return value of 64-bit hi-res clock register. return clock64(); -#elif defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) && \ - defined(__SYCL_DEVICE_ONLY__) +// FIXME_SYCL We can only return something useful for Intel GPUs and with RDC +#elif defined(KOKKOS_ENABLE_SYCL) && \ + defined(KOKKOS_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE) && \ + defined(KOKKOS_ARCH_INTEL_GPU) && defined(__SYCL_DEVICE_ONLY__) return intel_get_cycle_counter(); diff --git a/packages/kokkos/core/src/impl/Kokkos_Combined_Reducer.hpp b/packages/kokkos/core/src/impl/Kokkos_Combined_Reducer.hpp index e6dd3c63391d..d7319e80c871 100644 --- a/packages/kokkos/core/src/impl/Kokkos_Combined_Reducer.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_Combined_Reducer.hpp @@ -93,7 +93,7 @@ struct CombinedReducerValueImpl, std::move(arg_values))... {} template - KOKKOS_INLINE_FUNCTION ValueType& get() & noexcept { + KOKKOS_INLINE_FUNCTION ValueType& get() & noexcept { return this->CombinedReducerValueItemImpl::ref(); } template @@ -181,7 +181,7 @@ struct CombinedReducerImpl, Space, KOKKOS_DEFAULTED_FUNCTION constexpr CombinedReducerImpl( CombinedReducerImpl const&) = default; KOKKOS_DEFAULTED_FUNCTION constexpr CombinedReducerImpl( - CombinedReducerImpl&&) = default; + CombinedReducerImpl&&) = default; KOKKOS_DEFAULTED_FUNCTION constexpr CombinedReducerImpl& operator=( CombinedReducerImpl const&) = default; KOKKOS_DEFAULTED_FUNCTION constexpr CombinedReducerImpl& operator=( @@ -192,8 +192,8 @@ struct CombinedReducerImpl, Space, template KOKKOS_FUNCTION constexpr explicit CombinedReducerImpl( value_type& value, ReducersDeduced&&... reducers) noexcept - : CombinedReducerStorageImpl((ReducersDeduced &&) - reducers)..., + : CombinedReducerStorageImpl( + (ReducersDeduced&&)reducers)..., m_value_view(&value) {} KOKKOS_FUNCTION constexpr void join(value_type& dest, @@ -348,8 +348,8 @@ struct CombinedReductionFunctorWrapperImpl< IndexOrMemberOrTagType1&& arg_first, IndexOrMemberTypesThenValueType&&... args) const { this->template _call_op_impl( - (IndexOrMemberOrTagType1 &&) arg_first, - (IndexOrMemberTypesThenValueType &&) args...); + (IndexOrMemberOrTagType1&&)arg_first, + (IndexOrMemberTypesThenValueType&&)args...); } // end call operator }}}2 @@ -369,19 +369,19 @@ struct CombinedReductionFunctorWrapperImpl< template KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - !std::is_same, value_type>::value> + !std::is_same_v, value_type>> _call_op_impl(IdxOrMemberTypes&&... idxs, IdxOrMemberType1&& idx, IdxOrMemberTypesThenValueType&&... args) const { this->template _call_op_impl( - (IdxOrMemberTypes &&) idxs..., (IdxOrMemberType1 &&) idx, - (IdxOrMemberTypesThenValueType &&) args...); + (IdxOrMemberTypes&&)idxs..., (IdxOrMemberType1&&)idx, + (IdxOrMemberTypesThenValueType&&)args...); } // base case template KOKKOS_FORCEINLINE_FUNCTION void _call_op_impl(IdxOrMemberTypes&&... idxs, value_type& out) const { - m_functor((IdxOrMemberTypes &&) idxs..., + m_functor((IdxOrMemberTypes&&)idxs..., out.template get()...); } }; @@ -464,8 +464,8 @@ KOKKOS_INLINE_FUNCTION constexpr auto make_combined_reducer_value( typename _reducer_from_arg_t::value_type...>{ // This helper function is now poorly named after refactoring. - _get_value_from_combined_reducer_ctor_arg((ReferencesOrViewsOrReducers &&) - args)...}; + _get_value_from_combined_reducer_ctor_arg( + (ReferencesOrViewsOrReducers&&)args)...}; //---------------------------------------- } @@ -480,7 +480,7 @@ KOKKOS_INLINE_FUNCTION constexpr auto make_combined_reducer( Space, _reducer_from_arg_t...>; return reducer_type(value, _reducer_from_arg_t{ - (ReferencesOrViewsOrReducers &&) args}...); + (ReferencesOrViewsOrReducers&&)args}...); //---------------------------------------- } diff --git a/packages/kokkos/core/src/impl/Kokkos_ConcurrentBitset.hpp b/packages/kokkos/core/src/impl/Kokkos_ConcurrentBitset.hpp index ca4edce5c388..9bde2f72a3ff 100644 --- a/packages/kokkos/core/src/impl/Kokkos_ConcurrentBitset.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_ConcurrentBitset.hpp @@ -110,15 +110,15 @@ struct concurrent_bitset { // when is full at the atomic_fetch_add(+1) // then a release occurs before the atomic_fetch_add(-1). - const uint32_t state = (uint32_t)Kokkos::atomic_fetch_add( - reinterpret_cast(buffer), 1); + const uint32_t state = + Kokkos::atomic_fetch_add(const_cast(buffer), 1); const uint32_t state_error = state_header != (state & state_header_mask); const uint32_t state_bit_used = state & state_used_mask; if (state_error || (bit_bound <= state_bit_used)) { - Kokkos::atomic_fetch_add(reinterpret_cast(buffer), -1); + Kokkos::atomic_fetch_sub(const_cast(buffer), 1); return state_error ? type(-2, -2) : type(-1, -1); } @@ -132,7 +132,8 @@ struct concurrent_bitset { while (1) { const uint32_t word = bit >> bits_per_int_lg2; const uint32_t mask = 1u << (bit & bits_per_int_mask); - const uint32_t prev = Kokkos::atomic_fetch_or(buffer + word + 1, mask); + const uint32_t prev = Kokkos::atomic_fetch_or( + const_cast(buffer) + word + 1, mask); if (!(prev & mask)) { // Successfully claimed 'result.first' by @@ -194,15 +195,15 @@ struct concurrent_bitset { // when is full at the atomic_fetch_add(+1) // then a release occurs before the atomic_fetch_add(-1). - const uint32_t state = (uint32_t)Kokkos::atomic_fetch_add( - reinterpret_cast(buffer), 1); + const uint32_t state = + Kokkos::atomic_fetch_add(const_cast(buffer), 1); const uint32_t state_error = state_header != (state & state_header_mask); const uint32_t state_bit_used = state & state_used_mask; if (state_error || (bit_bound <= state_bit_used)) { - Kokkos::atomic_fetch_add(reinterpret_cast(buffer), -1); + Kokkos::atomic_fetch_sub(const_cast(buffer), 1); return state_error ? type(-2, -2) : type(-1, -1); } @@ -216,7 +217,8 @@ struct concurrent_bitset { while (1) { const uint32_t word = bit >> bits_per_int_lg2; const uint32_t mask = 1u << (bit & bits_per_int_mask); - const uint32_t prev = Kokkos::atomic_fetch_or(buffer + word + 1, mask); + const uint32_t prev = Kokkos::atomic_fetch_or( + const_cast(buffer) + word + 1, mask); if (!(prev & mask)) { // Successfully claimed 'result.first' by @@ -262,8 +264,8 @@ struct concurrent_bitset { } const uint32_t mask = 1u << (bit & bits_per_int_mask); - const uint32_t prev = - Kokkos::atomic_fetch_and(buffer + (bit >> bits_per_int_lg2) + 1, ~mask); + const uint32_t prev = Kokkos::atomic_fetch_and( + const_cast(buffer) + (bit >> bits_per_int_lg2) + 1, ~mask); if (!(prev & mask)) { return -1; @@ -273,7 +275,7 @@ struct concurrent_bitset { Kokkos::memory_fence(); const int count = - Kokkos::atomic_fetch_add(reinterpret_cast(buffer), -1); + Kokkos::atomic_fetch_sub(const_cast(buffer), 1); // Flush the store-release Kokkos::memory_fence(); @@ -299,8 +301,8 @@ struct concurrent_bitset { } const uint32_t mask = 1u << (bit & bits_per_int_mask); - const uint32_t prev = - Kokkos::atomic_fetch_or(buffer + (bit >> bits_per_int_lg2) + 1, mask); + const uint32_t prev = Kokkos::atomic_fetch_or( + const_cast(buffer) + (bit >> bits_per_int_lg2) + 1, mask); if (!(prev & mask)) { return -1; @@ -310,7 +312,7 @@ struct concurrent_bitset { Kokkos::memory_fence(); const int count = - Kokkos::atomic_fetch_add(reinterpret_cast(buffer), -1); + Kokkos::atomic_fetch_sub(const_cast(buffer), 1); return (count & state_used_mask) - 1; } diff --git a/packages/kokkos/core/src/impl/Kokkos_Core.cpp b/packages/kokkos/core/src/impl/Kokkos_Core.cpp index 6f862718bcb0..72f33ffaab90 100644 --- a/packages/kokkos/core/src/impl/Kokkos_Core.cpp +++ b/packages/kokkos/core/src/impl/Kokkos_Core.cpp @@ -138,7 +138,7 @@ int get_device_count() { KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDeviceCount(&count)); return count; #elif defined(KOKKOS_ENABLE_SYCL) - return Kokkos::Experimental::Impl::get_sycl_devices().size(); + return Kokkos::Impl::get_sycl_devices().size(); #elif defined(KOKKOS_ENABLE_OPENACC) return acc_get_num_devices( Kokkos::Experimental::Impl::OpenACC_Traits::dev_type); @@ -183,7 +183,7 @@ std::vector const& Kokkos::Impl::get_visible_devices() { #elif defined(KOKKOS_ENABLE_OPENMPTARGET) int device = omp_get_default_device(); // FIXME_OPENMPTARGET #elif defined(KOKKOS_ENABLE_SYCL) - int device = Experimental::Impl::SYCLInternal::m_syclDev; + int device = Impl::SYCLInternal::m_syclDev; #else int device = -1; return device; @@ -271,7 +271,7 @@ int Kokkos::Impl::get_ctest_gpu(int local_rank) { ss << "Error: local rank " << local_rank << " is outside the bounds of resource groups provided by CTest. Raised" << " by Kokkos::Impl::get_ctest_gpu()."; - throw_runtime_exception(ss.str()); + abort(ss.str().c_str()); } // Get the resource types allocated to this resource group @@ -284,7 +284,7 @@ int Kokkos::Impl::get_ctest_gpu(int local_rank) { std::ostringstream ss; ss << "Error: " << ctest_resource_group_name << " is not specified. Raised" << " by Kokkos::Impl::get_ctest_gpu()."; - throw_runtime_exception(ss.str()); + abort(ss.str().c_str()); } // Look for the device type specified in CTEST_KOKKOS_DEVICE_TYPE @@ -308,7 +308,7 @@ int Kokkos::Impl::get_ctest_gpu(int local_rank) { ss << "Error: device type '" << ctest_kokkos_device_type << "' not included in " << ctest_resource_group_name << ". Raised by Kokkos::Impl::get_ctest_gpu()."; - throw_runtime_exception(ss.str()); + abort(ss.str().c_str()); } // Get the device ID @@ -324,7 +324,7 @@ int Kokkos::Impl::get_ctest_gpu(int local_rank) { std::ostringstream ss; ss << "Error: " << ctest_resource_group_id_name << " is not specified. Raised by Kokkos::Impl::get_ctest_gpu()."; - throw_runtime_exception(ss.str()); + abort(ss.str().c_str()); } auto const* comma = std::strchr(resource_str, ','); @@ -332,7 +332,7 @@ int Kokkos::Impl::get_ctest_gpu(int local_rank) { std::ostringstream ss; ss << "Error: invalid value of " << ctest_resource_group_id_name << ": '" << resource_str << "'. Raised by Kokkos::Impl::get_ctest_gpu()."; - throw_runtime_exception(ss.str()); + abort(ss.str().c_str()); } std::string id(resource_str + 3, comma - resource_str - 3); @@ -613,7 +613,7 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { #endif declare_configuration_metadata("architecture", "Default Device", - typeid(Kokkos::DefaultExecutionSpace).name()); + Kokkos::DefaultExecutionSpace::name()); #if defined(KOKKOS_ARCH_A64FX) declare_configuration_metadata("architecture", "CPU architecture", "A64FX"); @@ -666,6 +666,9 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { #elif defined(KOKKOS_ARCH_RISCV_SG2042) declare_configuration_metadata("architecture", "CPU architecture", "SG2042 (RISC-V)") +#elif defined(KOKKOS_ARCH_RISCV_RVA22V) + declare_configuration_metadata("architecture", "CPU architecture", + "RVA22V (RISC-V)") #else declare_configuration_metadata("architecture", "CPU architecture", "none"); #endif @@ -738,8 +741,8 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { declare_configuration_metadata("architecture", "GPU architecture", "HOPPER90"); #elif defined(KOKKOS_ARCH_AMD_GFX906) - declare_configuration_metadata("architecture", "GPU architecture", - "AMD_GFX906"); + declare_configuration_metadata("architecture", "GPU architecture", + "AMD_GFX906"); #elif defined(KOKKOS_ARCH_AMD_GFX908) declare_configuration_metadata("architecture", "GPU architecture", "AMD_GFX908"); @@ -752,6 +755,9 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { #elif defined(KOKKOS_ARCH_AMD_GFX1100) declare_configuration_metadata("architecture", "GPU architecture", "AMD_GFX1100"); +#elif defined(KOKKOS_ARCH_AMD_GFX1103) + declare_configuration_metadata("architecture", "GPU architecture", + "AMD_GFX1103"); #else declare_configuration_metadata("architecture", "GPU architecture", "none"); @@ -973,7 +979,7 @@ void Kokkos::Impl::parse_environment_variables( Tools::Impl::parse_environment_variables(tools_init_arguments); if (init_result.result == Tools::Impl::InitializationStatus::environment_argument_mismatch) { - Impl::throw_runtime_exception(init_result.error_message); + Kokkos::abort(init_result.error_message.c_str()); } combine(settings, tools_init_arguments); diff --git a/packages/kokkos/core/src/impl/Kokkos_Default_GraphNodeKernel.hpp b/packages/kokkos/core/src/impl/Kokkos_Default_GraphNodeKernel.hpp index c71c21d2ac98..cd00fdadebaf 100644 --- a/packages/kokkos/core/src/impl/Kokkos_Default_GraphNodeKernel.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_Default_GraphNodeKernel.hpp @@ -36,15 +36,22 @@ struct GraphNodeKernelDefaultImpl { // TODO @graphs decide if this should use vtable or intrusive erasure via // function pointers like in the rest of the graph interface virtual void execute_kernel() = 0; + + GraphNodeKernelDefaultImpl() = default; + + explicit GraphNodeKernelDefaultImpl(ExecutionSpace exec) + : m_execution_space(std::move(exec)) {} + + ExecutionSpace m_execution_space; }; // TODO Indicate that this kernel specialization is only for the Host somehow? template class GraphNodeKernelImpl - : public PatternImplSpecializationFromTag::type, - public GraphNodeKernelDefaultImpl { + : public GraphNodeKernelDefaultImpl, + public PatternImplSpecializationFromTag::type { public: using base_t = typename PatternImplSpecializationFromTag - GraphNodeKernelImpl(std::string const&, ExecutionSpace const&, - Functor arg_functor, PolicyDeduced&& arg_policy, - ArgsDeduced&&... args) - : base_t(std::move(arg_functor), (PolicyDeduced &&) arg_policy, - (ArgsDeduced &&) args...), - execute_kernel_vtable_base_t() {} + GraphNodeKernelImpl(std::string const &, ExecutionSpace const &, + Functor arg_functor, PolicyDeduced &&arg_policy, + ArgsDeduced &&...args) + : execute_kernel_vtable_base_t(arg_policy.space()), + base_t(std::move(arg_functor), (PolicyDeduced &&)arg_policy, + (ArgsDeduced &&)args...) {} // FIXME @graph Forward through the instance once that works in the backends template - GraphNodeKernelImpl(ExecutionSpace const& ex, Functor arg_functor, - PolicyDeduced&& arg_policy, ArgsDeduced&&... args) + GraphNodeKernelImpl(ExecutionSpace const &ex, Functor arg_functor, + PolicyDeduced &&arg_policy, ArgsDeduced &&...args) : GraphNodeKernelImpl("", ex, std::move(arg_functor), - (PolicyDeduced &&) arg_policy, - (ArgsDeduced &&) args...) {} + (PolicyDeduced &&)arg_policy, + (ArgsDeduced &&)args...) { + // FIXME This constructor seem unused. + } - void execute_kernel() final { this->base_t::execute(); } + void execute_kernel() override final { this->base_t::execute(); } }; // end GraphNodeKernelImpl }}}1 @@ -88,7 +97,7 @@ struct GraphNodeAggregateKernelDefaultImpl using is_graph_kernel = std::true_type; }; using graph_kernel = GraphNodeAggregateKernelDefaultImpl; - void execute_kernel() final {} + void execute_kernel() override final {} }; } // end namespace Impl diff --git a/packages/kokkos/core/src/impl/Kokkos_Default_GraphNode_Impl.hpp b/packages/kokkos/core/src/impl/Kokkos_Default_GraphNode_Impl.hpp index 223ae391ab40..31d147ea894b 100644 --- a/packages/kokkos/core/src/impl/Kokkos_Default_GraphNode_Impl.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_Default_GraphNode_Impl.hpp @@ -69,10 +69,10 @@ struct GraphNodeBackendSpecificDetails { GraphNodeBackendSpecificDetails(GraphNodeBackendSpecificDetails&&) noexcept = delete; - GraphNodeBackendSpecificDetails& operator =( + GraphNodeBackendSpecificDetails& operator=( GraphNodeBackendSpecificDetails const&) = delete; - GraphNodeBackendSpecificDetails& operator =( + GraphNodeBackendSpecificDetails& operator=( GraphNodeBackendSpecificDetails&&) noexcept = delete; ~GraphNodeBackendSpecificDetails() = default; @@ -92,6 +92,18 @@ struct GraphNodeBackendSpecificDetails { m_is_aggregate = true; } + // A node is awaitable if it can execute a kernel. + // A root node or an aggregate node cannot be waited for, because it does + // not launch anything. + bool awaitable() const { return (!m_is_root) && (!m_is_aggregate); } + + // Retrieve the execution space instance that has been passed to + // the kernel at construction phase. + const ExecutionSpace& get_execution_space() const { + KOKKOS_EXPECTS(m_kernel_ptr != nullptr) + return m_kernel_ptr->m_execution_space; + } + void set_predecessor( std::shared_ptr> arg_pred_impl) { @@ -104,7 +116,7 @@ struct GraphNodeBackendSpecificDetails { m_predecessors.push_back(std::move(arg_pred_impl)); } - void execute_node() { + void execute_node(const ExecutionSpace& exec) { // This node could have already been executed as the predecessor of some // other KOKKOS_EXPECTS(bool(m_kernel_ptr) || m_has_executed) @@ -115,8 +127,18 @@ struct GraphNodeBackendSpecificDetails { // supported semantics, but instinct I have feels like it should be... m_has_executed = true; for (auto const& predecessor : m_predecessors) { - predecessor->execute_node(); + predecessor->execute_node(exec); } + + // Before executing the kernel, be sure to fence the execution space + // instance of predecessors. + for (const auto& predecessor : m_predecessors) { + if (predecessor->awaitable() && + predecessor->get_execution_space() != this->get_execution_space()) + predecessor->get_execution_space().fence( + "Kokkos::DefaultGraphNode::execute_node: sync with predecessors"); + } + m_kernel_ptr->execute_kernel(); } KOKKOS_ENSURES(m_has_executed) diff --git a/packages/kokkos/core/src/impl/Kokkos_Default_Graph_Impl.hpp b/packages/kokkos/core/src/impl/Kokkos_Default_Graph_Impl.hpp index 05d485491932..8dfa19a178cf 100644 --- a/packages/kokkos/core/src/impl/Kokkos_Default_Graph_Impl.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_Default_Graph_Impl.hpp @@ -58,12 +58,12 @@ struct GraphImpl : private ExecutionSpaceInstanceStorage { // Not movable or copyable; it spends its whole live as a shared_ptr in the // Graph object - GraphImpl() = default; - GraphImpl(GraphImpl const&) = delete; - GraphImpl(GraphImpl&&) = delete; + GraphImpl() = default; + GraphImpl(GraphImpl const&) = delete; + GraphImpl(GraphImpl&&) = delete; GraphImpl& operator=(GraphImpl const&) = delete; - GraphImpl& operator=(GraphImpl&&) = delete; - ~GraphImpl() = default; + GraphImpl& operator=(GraphImpl&&) = delete; + ~GraphImpl() = default; explicit GraphImpl(ExecutionSpace arg_space) : execution_space_instance_storage_base_t(std::move(arg_space)) {} @@ -136,17 +136,40 @@ struct GraphImpl : private ExecutionSpaceInstanceStorage { return rv; } - void submit() { + void instantiate() { + KOKKOS_EXPECTS(!m_has_been_instantiated); + m_has_been_instantiated = true; + } + + void submit(const ExecutionSpace& exec) { + if (!m_has_been_instantiated) instantiate(); // This reset is gross, but for the purposes of our simple host // implementation... for (auto& sink : m_sinks) { sink->reset_has_executed(); } + + // We don't know where the nodes will execute, so we need to fence the given + // execution space instance before proceeding. This is the simplest way + // of guaranteeing that the kernels in the graph are correctly "enqueued". + exec.fence( + "Kokkos::DefaultGraph::submit: fencing before launching graph nodes"); + for (auto& sink : m_sinks) { - sink->execute_node(); + sink->execute_node(exec); + } + + // Once all sinks have been executed, we need to fence them. + for (const auto& sink : m_sinks) { + if (sink->awaitable() && sink->get_execution_space() != exec) + sink->get_execution_space().fence( + "Kokkos::DefaultGraph::submit: fencing before ending graph submit"); } } + private: + bool m_has_been_instantiated = false; + // end required customizations }}}2 //---------------------------------------------------------------------------- }; diff --git a/packages/kokkos/core/src/impl/Kokkos_EBO.hpp b/packages/kokkos/core/src/impl/Kokkos_EBO.hpp index 8ba94ba4ccc4..a8a4d6617bcd 100644 --- a/packages/kokkos/core/src/impl/Kokkos_EBO.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_EBO.hpp @@ -52,16 +52,16 @@ struct EBOBaseImpl; template class CtorNotOnDevice> struct EBOBaseImpl { template ::value && - std::is_constructible::value && + std::enable_if_t && + std::is_constructible_v && !CtorNotOnDevice::value, int> = 0> KOKKOS_FORCEINLINE_FUNCTION constexpr explicit EBOBaseImpl( Args&&...) noexcept {} template ::value && - std::is_constructible::value && + std::enable_if_t && + std::is_constructible_v && CtorNotOnDevice::value, long> = 0> inline constexpr explicit EBOBaseImpl(Args&&...) noexcept {} @@ -110,18 +110,18 @@ struct EBOBaseImpl { T m_ebo_object; template ::value && + std::enable_if_t && !CTorsNotOnDevice::value && - std::is_constructible::value, + std::is_constructible_v, int> = 0> KOKKOS_FORCEINLINE_FUNCTION constexpr explicit EBOBaseImpl( Args&&... args) noexcept(noexcept(T(std::forward(args)...))) : m_ebo_object(std::forward(args)...) {} template ::value && + std::enable_if_t && CTorsNotOnDevice::value && - std::is_constructible::value, + std::is_constructible_v, long> = 0> inline constexpr explicit EBOBaseImpl(Args&&... args) noexcept( noexcept(T(std::forward(args)...))) @@ -167,9 +167,9 @@ struct EBOBaseImpl { template class CtorsNotOnDevice = NoCtorsNotOnDevice> struct StandardLayoutNoUniqueAddressMemberEmulation - : EBOBaseImpl::value, CtorsNotOnDevice> { + : EBOBaseImpl, CtorsNotOnDevice> { private: - using ebo_base_t = EBOBaseImpl::value, CtorsNotOnDevice>; + using ebo_base_t = EBOBaseImpl, CtorsNotOnDevice>; public: using ebo_base_t::ebo_base_t; diff --git a/packages/kokkos/core/src/impl/Kokkos_ExecPolicy.cpp b/packages/kokkos/core/src/impl/Kokkos_ExecPolicy.cpp index 04c5e0bd22a2..58a5de2aa626 100644 --- a/packages/kokkos/core/src/impl/Kokkos_ExecPolicy.cpp +++ b/packages/kokkos/core/src/impl/Kokkos_ExecPolicy.cpp @@ -41,7 +41,7 @@ void team_policy_check_valid_storage_level_argument(int level) { std::stringstream ss; ss << "TeamPolicy::set_scratch_size(/*level*/ " << level << ", ...) storage level argument must be 0 or 1 to be valid\n"; - Impl::throw_runtime_exception(ss.str()); + abort(ss.str().c_str()); } } diff --git a/packages/kokkos/core/src/impl/Kokkos_ExecSpaceManager.hpp b/packages/kokkos/core/src/impl/Kokkos_ExecSpaceManager.hpp index 58ed54275a64..5805b78ee75b 100644 --- a/packages/kokkos/core/src/impl/Kokkos_ExecSpaceManager.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_ExecSpaceManager.hpp @@ -123,14 +123,14 @@ template struct ExecSpaceDerived : ExecSpaceBase { static_assert(check_valid_execution_space()); static_assert(check_is_regular()); - void initialize(InitializationSettings const& settings) final { + void initialize(InitializationSettings const& settings) override final { ExecutionSpace::impl_initialize(settings); } - void finalize() final { ExecutionSpace::impl_finalize(); } - void static_fence(std::string const& label) final { + void finalize() override final { ExecutionSpace::impl_finalize(); } + void static_fence(std::string const& label) override final { ExecutionSpace::impl_static_fence(label); } - void print_configuration(std::ostream& os, bool verbose) final { + void print_configuration(std::ostream& os, bool verbose) override final { ExecutionSpace().print_configuration(os, verbose); } }; diff --git a/packages/kokkos/core/src/impl/Kokkos_FixedBufferMemoryPool.hpp b/packages/kokkos/core/src/impl/Kokkos_FixedBufferMemoryPool.hpp deleted file mode 100644 index 4726a87b97cb..000000000000 --- a/packages/kokkos/core/src/impl/Kokkos_FixedBufferMemoryPool.hpp +++ /dev/null @@ -1,279 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_IMPL_KOKKOS_FIXEDBUFFERMEMORYPOOL_HPP -#define KOKKOS_IMPL_KOKKOS_FIXEDBUFFERMEMORYPOOL_HPP - -#include -#include - -#include -#include - -namespace Kokkos { -namespace Impl { - -template -class FixedBlockSizeMemoryPool - : private MemorySpaceInstanceStorage { - public: - using memory_space = typename DeviceType::memory_space; - using size_type = SizeType; - - private: - using memory_space_storage_base = - MemorySpaceInstanceStorage; - using tracker_type = Kokkos::Impl::SharedAllocationTracker; - using record_type = Kokkos::Impl::SharedAllocationRecord; - - struct alignas(Align) Block { - union { - char ignore; - char data[Size]; - }; - }; - - static constexpr auto actual_size = sizeof(Block); - - // TODO shared allocation tracker - // TODO @optimization put the index values on different cache lines (CPU) or - // pages (GPU)? - - tracker_type m_tracker = {}; - size_type m_num_blocks = 0; - size_type m_first_free_idx = 0; - size_type m_last_free_idx = 0; - Kokkos::OwningRawPtr m_first_block = nullptr; - Kokkos::OwningRawPtr m_free_indices = nullptr; - - enum : size_type { IndexInUse = ~size_type(0) }; - - public: - FixedBlockSizeMemoryPool(memory_space const& mem_space, size_type num_blocks) - : memory_space_storage_base(mem_space), - m_tracker(), - m_num_blocks(num_blocks), - m_first_free_idx(0), - m_last_free_idx(num_blocks) { - // TODO alignment? - auto block_record = record_type::allocate( - mem_space, "FixedBlockSizeMemPool_blocks", num_blocks * sizeof(Block)); - KOKKOS_ASSERT(intptr_t(block_record->data()) % Align == 0); - m_tracker.assign_allocated_record_to_uninitialized(block_record); - m_first_block = (Block*)block_record->data(); - - auto idx_record = - record_type::allocate(mem_space, "Kokkos::FixedBlockSizeMemPool_blocks", - num_blocks * sizeof(size_type)); - KOKKOS_ASSERT(intptr_t(idx_record->data()) % alignof(size_type) == 0); - m_tracker.assign_allocated_record_to_uninitialized(idx_record); - m_free_indices = (size_type*)idx_record->data(); - - for (size_type i = 0; i < num_blocks; ++i) { - m_free_indices[i] = i; - } - - Kokkos::memory_fence(); - } - - // For compatibility with MemoryPool<> - FixedBlockSizeMemoryPool(memory_space const& mem_space, - size_t mempool_capacity, unsigned, unsigned, - unsigned) - : FixedBlockSizeMemoryPool( - mem_space, mempool_capacity / - actual_size) { /* forwarding ctor, must be empty */ - } - - KOKKOS_DEFAULTED_FUNCTION FixedBlockSizeMemoryPool() = default; - KOKKOS_DEFAULTED_FUNCTION FixedBlockSizeMemoryPool( - FixedBlockSizeMemoryPool&&) = default; - KOKKOS_DEFAULTED_FUNCTION FixedBlockSizeMemoryPool( - FixedBlockSizeMemoryPool const&) = default; - KOKKOS_DEFAULTED_FUNCTION FixedBlockSizeMemoryPool& operator=( - FixedBlockSizeMemoryPool&&) = default; - KOKKOS_DEFAULTED_FUNCTION FixedBlockSizeMemoryPool& operator=( - FixedBlockSizeMemoryPool const&) = default; - - KOKKOS_INLINE_FUNCTION - void* allocate(size_type alloc_size) const noexcept { - (void)alloc_size; - KOKKOS_EXPECTS(alloc_size <= Size); - auto free_idx_counter = Kokkos::atomic_fetch_add( - (volatile size_type*)&m_first_free_idx, size_type(1)); - auto free_idx_idx = free_idx_counter % m_num_blocks; - - // We don't have exclusive access to m_free_indices[free_idx_idx] because - // the allocate counter might have lapped us since we incremented it - auto current_free_idx = m_free_indices[free_idx_idx]; - size_type free_idx = IndexInUse; - free_idx = Kokkos::atomic_compare_exchange(&m_free_indices[free_idx_idx], - current_free_idx, free_idx); - Kokkos::memory_fence(); - - // TODO figure out how to decrement here? - - if (free_idx == IndexInUse) { - return nullptr; - } else { - return (void*)&m_first_block[free_idx]; - } - } - - KOKKOS_INLINE_FUNCTION - void deallocate(void* ptr, size_type /*alloc_size*/) const noexcept { - // figure out which block we are - auto offset = intptr_t(ptr) - intptr_t(m_first_block); - - KOKKOS_EXPECTS(offset % actual_size == 0 && - offset / actual_size < m_num_blocks); - - Kokkos::memory_fence(); - auto last_idx_idx = Kokkos::atomic_fetch_add( - (volatile size_type*)&m_last_free_idx, size_type(1)); - last_idx_idx %= m_num_blocks; - m_free_indices[last_idx_idx] = offset / actual_size; - } -}; - -#if 0 -template < - class DeviceType, - size_t Size, - size_t Align=1, - class SizeType = typename DeviceType::execution_space::size_type -> -class FixedBlockSizeChaseLevMemoryPool - : private MemorySpaceInstanceStorage -{ -public: - - using memory_space = typename DeviceType::memory_space; - using size_type = SizeType; - -private: - - using memory_space_storage_base = MemorySpaceInstanceStorage; - using tracker_type = Kokkos::Impl::SharedAllocationTracker; - using record_type = Kokkos::Impl::SharedAllocationRecord; - - struct alignas(Align) Block { union { char ignore; char data[Size]; }; }; - - static constexpr auto actual_size = sizeof(Block); - - tracker_type m_tracker = { }; - size_type m_num_blocks = 0; - size_type m_first_free_idx = 0; - size_type m_last_free_idx = 0; - - - enum : size_type { IndexInUse = ~size_type(0) }; - -public: - - FixedBlockSizeMemoryPool( - memory_space const& mem_space, - size_type num_blocks - ) : memory_space_storage_base(mem_space), - m_tracker(), - m_num_blocks(num_blocks), - m_first_free_idx(0), - m_last_free_idx(num_blocks) - { - // TODO alignment? - auto block_record = record_type::allocate( - mem_space, "FixedBlockSizeMemPool_blocks", num_blocks * sizeof(Block) - ); - KOKKOS_ASSERT(intptr_t(block_record->data()) % Align == 0); - m_tracker.assign_allocated_record_to_uninitialized(block_record); - m_first_block = (Block*)block_record->data(); - - auto idx_record = record_type::allocate( - mem_space, "FixedBlockSizeMemPool_blocks", num_blocks * sizeof(size_type) - ); - KOKKOS_ASSERT(intptr_t(idx_record->data()) % alignof(size_type) == 0); - m_tracker.assign_allocated_record_to_uninitialized(idx_record); - m_free_indices = (size_type*)idx_record->data(); - - for(size_type i = 0; i < num_blocks; ++i) { - m_free_indices[i] = i; - } - - Kokkos::memory_fence(); - } - - // For compatibility with MemoryPool<> - FixedBlockSizeMemoryPool( - memory_space const& mem_space, - size_t mempool_capacity, - unsigned, unsigned, unsigned - ) : FixedBlockSizeMemoryPool(mem_space, mempool_capacity / actual_size) - { /* forwarding ctor, must be empty */ } - - KOKKOS_DEFAULTED_FUNCTION FixedBlockSizeMemoryPool() = default; - KOKKOS_DEFAULTED_FUNCTION FixedBlockSizeMemoryPool(FixedBlockSizeMemoryPool&&) = default; - KOKKOS_DEFAULTED_FUNCTION FixedBlockSizeMemoryPool(FixedBlockSizeMemoryPool const&) = default; - KOKKOS_DEFAULTED_FUNCTION FixedBlockSizeMemoryPool& operator=(FixedBlockSizeMemoryPool&&) = default; - KOKKOS_DEFAULTED_FUNCTION FixedBlockSizeMemoryPool& operator=(FixedBlockSizeMemoryPool const&) = default; - - - KOKKOS_INLINE_FUNCTION - void* allocate(size_type alloc_size) const noexcept - { - KOKKOS_EXPECTS(alloc_size <= Size); - auto free_idx_counter = Kokkos::atomic_fetch_add((volatile size_type*)&m_first_free_idx, size_type(1)); - auto free_idx_idx = free_idx_counter % m_num_blocks; - - // We don't have exclusive access to m_free_indices[free_idx_idx] because - // the allocate counter might have lapped us since we incremented it - auto current_free_idx = m_free_indices[free_idx_idx]; - size_type free_idx = IndexInUse; - free_idx = - Kokkos::atomic_compare_exchange(&m_free_indices[free_idx_idx], current_free_idx, free_idx); - Kokkos::memory_fence(); - - // TODO figure out how to decrement here? - - if(free_idx == IndexInUse) { - return nullptr; - } - else { - return (void*)&m_first_block[free_idx]; - } - } - - KOKKOS_INLINE_FUNCTION - void deallocate(void* ptr, size_type alloc_size) const noexcept - { - // figure out which block we are - auto offset = intptr_t(ptr) - intptr_t(m_first_block); - - KOKKOS_EXPECTS(offset % actual_size == 0 && offset/actual_size < m_num_blocks); - - Kokkos::memory_fence(); - auto last_idx_idx = Kokkos::atomic_fetch_add((volatile size_type*)&m_last_free_idx, size_type(1)); - last_idx_idx %= m_num_blocks; - m_free_indices[last_idx_idx] = offset / actual_size; - } - -}; -#endif - -} // end namespace Impl -} // end namespace Kokkos - -#endif // KOKKOS_IMPL_KOKKOS_FIXEDBUFFERMEMORYPOOL_HPP diff --git a/packages/kokkos/core/src/impl/Kokkos_FunctorAnalysis.hpp b/packages/kokkos/core/src/impl/Kokkos_FunctorAnalysis.hpp index e844a5295e50..29a365e6e418 100644 --- a/packages/kokkos/core/src/impl/Kokkos_FunctorAnalysis.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_FunctorAnalysis.hpp @@ -118,8 +118,8 @@ struct FunctorAnalysis { using functor_has_space = has_execution_space; static_assert(!policy_has_space::value || !functor_has_space::value || - std::is_same::value, + std::is_same_v, "Execution Policy and Functor execution space must match"); //---------------------------------------- @@ -136,9 +136,8 @@ struct FunctorAnalysis { typename std::is_void::type> { using type = typename F::value_type; - static_assert(!std::is_reference::value && - std::rank::value <= 1 && - std::extent::value == 0, + static_assert(!std::is_reference_v && std::rank_v <= 1 && + std::extent_v == 0, "Kokkos Functor::value_type is T or T[]"); }; @@ -149,7 +148,7 @@ struct FunctorAnalysis { template ::type, - bool T = std::is_void::value> + bool T = std::is_void_v> struct deduce_value_type { using type = V; }; @@ -290,8 +289,8 @@ struct FunctorAnalysis { using candidate_type = typename deduce_value_type::type; enum { - candidate_is_void = std::is_void::value, - candidate_is_array = std::rank::value == 1 + candidate_is_void = std::is_void_v, + candidate_is_array = std::rank_v == 1 }; //---------------------------------------- @@ -306,7 +305,7 @@ struct FunctorAnalysis { using value_type = std::remove_extent_t; - static_assert(!std::is_const::value, + static_assert(!std::is_const_v, "Kokkos functor operator reduce argument cannot be const"); private: @@ -614,21 +613,20 @@ struct FunctorAnalysis { }; template - struct DeduceJoinNoTag::value || - (!is_reducer::value && - std::is_void::value)) && - detected_join_no_tag::value>> + struct DeduceJoinNoTag< + F, std::enable_if_t<(is_reducer::value || + (!is_reducer::value && std::is_void_v)) && + detected_join_no_tag::value>> : public has_join_no_tag_function { enum : bool { value = true }; }; template struct DeduceJoinNoTag< - F, - std::enable_if_t<(is_reducer::value || - (!is_reducer::value && std::is_void::value)) && - (!detected_join_no_tag::value && - detected_volatile_join_no_tag::value)>> + F, std::enable_if_t<(is_reducer::value || + (!is_reducer::value && std::is_void_v)) && + (!detected_join_no_tag::value && + detected_volatile_join_no_tag::value)>> : public has_volatile_join_no_tag_function { enum : bool { value = true }; static_assert(Impl::dependent_false_v, @@ -735,8 +733,8 @@ struct FunctorAnalysis { template struct DeduceInitNoTag< - F, std::enable_if_t::value || (!is_reducer::value && - std::is_void::value), + F, std::enable_if_t::value || + (!is_reducer::value && std::is_void_v), decltype(has_init_no_tag_function::enable_if( &F::init))>> : public has_init_no_tag_function { @@ -835,8 +833,8 @@ struct FunctorAnalysis { template struct DeduceFinalNoTag< - F, std::enable_if_t::value || (!is_reducer::value && - std::is_void::value), + F, std::enable_if_t::value || + (!is_reducer::value && std::is_void_v), decltype(has_final_no_tag_function::enable_if( &F::final))>> : public has_final_no_tag_function { @@ -906,14 +904,14 @@ struct FunctorAnalysis { Functor m_functor; template - KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t len() const - noexcept { + KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t len() + const noexcept { return m_functor.value_count; } template - KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t len() const - noexcept { + KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t len() + const noexcept { return candidate_is_void ? 0 : 1; } @@ -973,8 +971,8 @@ struct FunctorAnalysis { DeduceJoin<>::join(&m_functor, dst, src); } - KOKKOS_INLINE_FUNCTION reference_type init(ValueType* const dst) const - noexcept { + KOKKOS_INLINE_FUNCTION reference_type + init(ValueType* const dst) const noexcept { DeduceInit<>::init(&m_functor, dst); return reference(dst); } @@ -987,11 +985,11 @@ struct FunctorAnalysis { KOKKOS_INLINE_FUNCTION const Functor& get_functor() const { return m_functor; } - Reducer(Reducer const&) = default; - Reducer(Reducer&&) = default; + Reducer(Reducer const&) = default; + Reducer(Reducer&&) = default; Reducer& operator=(Reducer const&) = delete; - Reducer& operator=(Reducer&&) = delete; - ~Reducer() = default; + Reducer& operator=(Reducer&&) = delete; + ~Reducer() = default; KOKKOS_INLINE_FUNCTION explicit constexpr Reducer( Functor const& arg_functor) noexcept diff --git a/packages/kokkos/core/src/impl/Kokkos_GraphImpl.hpp b/packages/kokkos/core/src/impl/Kokkos_GraphImpl.hpp index 56f95c814d88..6d3ebf64befc 100644 --- a/packages/kokkos/core/src/impl/Kokkos_GraphImpl.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_GraphImpl.hpp @@ -56,7 +56,7 @@ struct GraphAccess { static_assert( Kokkos::Impl::is_specialization_of::value, "Kokkos Internal Error in graph interface"); - return std::make_shared((Args &&) args...); + return std::make_shared((Args&&)args...); } template ::value, "Kokkos Internal Implementation error (bad argument to " "`GraphAccess::get_node_ptr()`)"); - return ((NodeRef &&) node_ref).get_node_ptr(); + return ((NodeRef&&)node_ref).get_node_ptr(); } template @@ -93,7 +93,7 @@ struct GraphAccess { Kokkos::Experimental::GraphNodeRef>::value, "Kokkos Internal Implementation error (bad argument to " "`GraphAccess::get_graph_weak_ptr()`)"); - return ((NodeRef &&) node_ref).get_graph_weak_ptr(); + return ((NodeRef&&)node_ref).get_graph_weak_ptr(); } // end accessors for private members of public interface }}}2 diff --git a/packages/kokkos/core/src/impl/Kokkos_GraphImpl_Utilities.hpp b/packages/kokkos/core/src/impl/Kokkos_GraphImpl_Utilities.hpp index 2ab05cb8e439..b02a26547223 100644 --- a/packages/kokkos/core/src/impl/Kokkos_GraphImpl_Utilities.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_GraphImpl_Utilities.hpp @@ -54,9 +54,9 @@ template