diff --git a/src/acc/PACKAGE b/src/acc/PACKAGE index 9b1679e7c76..d7d0347d84f 100644 --- a/src/acc/PACKAGE +++ b/src/acc/PACKAGE @@ -1,5 +1,5 @@ { "description": "Generic accelerator API", "archive": "libdbcsr", -"requires": ["../base", "cuda", "hip", "opencl", "libsmm_acc"] +"requires": ["../base", "../core", "cuda", "hip", "opencl", "libsmm_acc"] } diff --git a/src/acc/dbcsr_acc_init.F b/src/acc/dbcsr_acc_init.F index fe7d6725273..c27247dce20 100644 --- a/src/acc/dbcsr_acc_init.F +++ b/src/acc/dbcsr_acc_init.F @@ -12,6 +12,8 @@ MODULE dbcsr_acc_init #if defined (__DBCSR_ACC) USE ISO_C_BINDING, ONLY: C_INT, C_CHAR, C_PTR, C_NULL_PTR, C_NULL_CHAR, C_ASSOCIATED #endif + USE dbcsr_acc_device, ONLY: dbcsr_acc_set_active_device + USE dbcsr_config, ONLY: get_accdrv_active_device_id #include "base/dbcsr_base_uses.f90" IMPLICIT NONE @@ -47,9 +49,15 @@ SUBROUTINE acc_init() DBCSR_ABORT("__DBCSR_ACC not compiled in.") #else INTEGER :: istat + ! Set active device first + CALL dbcsr_acc_set_active_device(get_accdrv_active_device_id()) +!$OMP PARALLEL DEFAULT(NONE) PRIVATE(istat) +!$OMP MASTER istat = acc_interface_drv_init() IF (istat /= 0) & DBCSR_ABORT("acc_init failed") +!$OMP END MASTER +!$OMP END PARALLEL #endif END SUBROUTINE acc_init @@ -58,10 +66,14 @@ SUBROUTINE acc_finalize() #if ! defined (__DBCSR_ACC) DBCSR_ABORT("__DBCSR_ACC not compiled in.") #else - INTEGER :: istat + INTEGER :: istat +!$OMP PARALLEL DEFAULT(NONE) PRIVATE(istat) +!$OMP MASTER istat = acc_interface_drv_finalize() IF (istat /= 0) & DBCSR_ABORT("acc_finalize failed") +!$OMP END MASTER +!$OMP END PARALLEL #endif END SUBROUTINE acc_finalize diff --git a/src/core/dbcsr_lib.F b/src/core/dbcsr_lib.F index 34a7d913706..1de0d9882ca 100644 --- a/src/core/dbcsr_lib.F +++ b/src/core/dbcsr_lib.F @@ -11,9 +11,8 @@ MODULE dbcsr_lib !! 
Routines that affect the DBCSR library as a whole USE dbcsr_acc_init, ONLY: acc_finalize, acc_init - USE dbcsr_acc_device, ONLY: dbcsr_acc_get_ndevices, dbcsr_acc_set_active_device - USE dbcsr_config, ONLY: get_accdrv_active_device_id, & - set_accdrv_active_device_id, & + USE dbcsr_acc_device, ONLY: dbcsr_acc_get_ndevices + USE dbcsr_config, ONLY: set_accdrv_active_device_id, & reset_accdrv_active_device_id, & dbcsr_set_config, & has_acc @@ -204,24 +203,18 @@ SUBROUTINE dbcsr_init_lib_pre(mp_comm, io_unit, accdrv_active_device_id) ! Initialize Acc and set active device IF (has_acc) THEN -!$OMP PARALLEL -!$OMP MASTER - CALL acc_init() -!$OMP END MASTER -!$OMP END PARALLEL - IF (dbcsr_acc_get_ndevices() > 0) THEN - IF (PRESENT(accdrv_active_device_id)) THEN - CALL set_accdrv_active_device_id(accdrv_active_device_id) - ELSE - ! Use round-robin assignment per rank - CALL set_accdrv_active_device_id(MOD(mynode, dbcsr_acc_get_ndevices())) - END IF + IF (PRESENT(accdrv_active_device_id)) THEN + CALL set_accdrv_active_device_id(accdrv_active_device_id) + ELSEIF (dbcsr_acc_get_ndevices() > 0) THEN + ! Use round-robin assignment per rank + CALL set_accdrv_active_device_id(MOD(mynode, dbcsr_acc_get_ndevices())) + ELSE + DBCSR_ABORT("dbcsr_init_lib: No recognized GPU devices") END IF + CALL acc_init() END IF #if defined(__DBCSR_ACC) - CALL dbcsr_acc_set_active_device(get_accdrv_active_device_id()) - ! Checks related to DBCSR's GPU backend: check consistency in threading level libsmm_acc_thread_safe = libsmm_acc_is_thread_safe() ! 0: not threaded, 1: threaded dbcsr_thread_safe = 0 ! not threaded @@ -310,11 +303,7 @@ SUBROUTINE dbcsr_finalize_lib() ! Reset Acc ID CALL reset_accdrv_active_device_id() IF (has_acc) THEN -!$OMP PARALLEL -!$OMP MASTER CALL acc_finalize() -!$OMP END MASTER -!$OMP END PARALLEL END IF ! 
Check the number of communicators diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 2be5a7bdcc4..d768b69d3cb 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -241,14 +241,14 @@ if (USE_ACCEL MATCHES "cuda|hip") $<$:CUDA::cuda_driver> libsmm_acc) endforeach () - add_test(NAME libsmm_acc_unittest_multiply - COMMAND libsmm_acc_unittest_multiply) - add_test(NAME libsmm_acc_unittest_transpose - COMMAND libsmm_acc_unittest_transpose) - add_test(NAME libsmm_acc_timer_multiply-autotuned - COMMAND libsmm_acc_timer_multiply autotuned) - add_test(NAME libsmm_acc_timer_multiply-predicted - COMMAND libsmm_acc_timer_multiply predicted) + # ~~~ + # Disabled for the moment: these tests are not parallelized and are very slow. See + # https://github.com/cp2k/dbcsr/issues/427 + # add_test(NAME libsmm_acc_unittest_multiply COMMAND libsmm_acc_unittest_multiply) + # add_test(NAME libsmm_acc_unittest_transpose COMMAND libsmm_acc_unittest_transpose) + # add_test(NAME libsmm_acc_timer_multiply-autotuned COMMAND libsmm_acc_timer_multiply autotuned) + # add_test(NAME libsmm_acc_timer_multiply-predicted COMMAND libsmm_acc_timer_multiply predicted) + # ~~~ endif ()