Skip to content

Commit

Permalink
refact in GPU direct & fixed bug in read dipole integrals
Browse files Browse the repository at this point in the history
  • Loading branch information
AbdAmmar committed Nov 28, 2024
1 parent 6ff3fc2 commit e43a56e
Show file tree
Hide file tree
Showing 10 changed files with 259 additions and 39 deletions.
156 changes: 156 additions & 0 deletions src/GPU/phRRPA_GPU.f90
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
subroutine phRRPA_GPU(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO,nV,nR,nS,ENuc,ERHF,ERI,dipole_int,eHF)

use cu_quack_module

! Perform a direct random phase approximation calculation

implicit none
include 'parameters.h'
include 'quadrature.h'

! Input variables

logical,intent(in) :: dotest

logical,intent(in) :: TDA
logical,intent(in) :: doACFDT
logical,intent(in) :: exchange_kernel
logical,intent(in) :: singlet
logical,intent(in) :: triplet
integer,intent(in) :: nBas
integer,intent(in) :: nC
integer,intent(in) :: nO
integer,intent(in) :: nV
integer,intent(in) :: nR
integer,intent(in) :: nS
double precision,intent(in) :: ENuc
double precision,intent(in) :: ERHF
double precision,intent(in) :: eHF(nBas)
double precision,intent(in) :: ERI(nBas,nBas,nBas,nBas)
double precision,intent(in) :: dipole_int(nBas,nBas,ncart)

! Local variables

integer :: i
integer :: ispin
logical :: dRPA
double precision :: t1, t2
double precision :: lambda
double precision,allocatable :: Aph(:,:)
double precision,allocatable :: Bph(:,:)
double precision,allocatable :: Om(:)
double precision,allocatable :: XpY(:,:)
double precision,allocatable :: XmY(:,:)
! DEBUG
!double precision, allocatable :: XpY_gpu(:,:), XmY_gpu(:,:), Om_gpu(:)

double precision :: EcRPA(nspin)

! Hello world

write(*,*)
write(*,*)'*********************************'
write(*,*)'* Restricted ph-RPA Calculation *'
write(*,*)'*********************************'
write(*,*)

! TDA

if(TDA) then
write(*,*) 'Tamm-Dancoff approximation activated!'
write(*,*)
end if

! Initialization

dRPA = .true.
EcRPA(:) = 0d0
lambda = 1d0

! Memory allocation

allocate(Om(nS),XpY(nS,nS),XmY(nS,nS),Aph(nS,nS),Bph(nS,nS))

! Singlet manifold

if(singlet) then

if(TDA) then

call wall_time(t1)
call ph_drpa_tda_sing(nO, nBas, nS, eHF(1), ERI(1,1,1,1), Om(1), XpY(1,1))
call wall_time(t2)
print*, 'diag time on GPU (sec):', t2 - t1
stop
XmY(:,:) = XpY(:,:)

else

! TODO
!call ph_drpa_sing(nO, nBas, nS, eHF(1), ERI(1,1,1,1), Om(1), XpY(1,1))
!XmY(:,:) = XpY(:,:)

endif

call print_excitation_energies('phRPA@RHF','singlet',nS,Om)
call phLR_transition_vectors(.true.,nBas,nC,nO,nV,nR,nS,dipole_int,Om,XpY,XmY)

end if

! Triplet manifold

if(triplet) then

ispin = 2

call phLR_A(ispin,dRPA,nBas,nC,nO,nV,nR,nS,lambda,eHF,ERI,Aph)
if(.not.TDA) call phLR_B(ispin,dRPA,nBas,nC,nO,nV,nR,nS,lambda,ERI,Bph)

call phLR(TDA,nS,Aph,Bph,EcRPA(ispin),Om,XpY,XmY)
call print_excitation_energies('phRPA@RHF','triplet',nS,Om)
call phLR_transition_vectors(.false.,nBas,nC,nO,nV,nR,nS,dipole_int,Om,XpY,XmY)

end if

if(exchange_kernel) then

EcRPA(1) = 0.5d0*EcRPA(1)
EcRPA(2) = 1.5d0*EcRPA(2)

end if

write(*,*)
write(*,*)'-------------------------------------------------------------------------------'
write(*,'(2X,A50,F20.10,A3)') 'Tr@phRPA@RHF correlation energy (singlet) = ',EcRPA(1),' au'
write(*,'(2X,A50,F20.10,A3)') 'Tr@phRPA@RHF correlation energy (triplet) = ',EcRPA(2),' au'
write(*,'(2X,A50,F20.10,A3)') 'Tr@phRPA@RHF correlation energy = ',sum(EcRPA),' au'
write(*,'(2X,A50,F20.10,A3)') 'Tr@phRPA@RHF total energy = ',ENuc + ERHF + sum(EcRPA),' au'
write(*,*)'-------------------------------------------------------------------------------'
write(*,*)

deallocate(Om,XpY,XmY,Aph,Bph)

! Compute the correlation energy via the adiabatic connection

if(doACFDT) then

call phACFDT(exchange_kernel,dRPA,TDA,singlet,triplet,nBas,nC,nO,nV,nR,nS,ERI,eHF,EcRPA)

write(*,*)
write(*,*)'-------------------------------------------------------------------------------'
write(*,'(2X,A50,F20.10,A3)') 'AC@phRPA@RHF correlation energy (singlet) = ',EcRPA(1),' au'
write(*,'(2X,A50,F20.10,A3)') 'AC@phRPA@RHF correlation energy (triplet) = ',EcRPA(2),' au'
write(*,'(2X,A50,F20.10,A3)') 'AC@phRPA@RHF correlation energy = ',sum(EcRPA),' au'
write(*,'(2X,A50,F20.10,A3)') 'AC@phRPA@RHF total energy = ',ENuc + ERHF + sum(EcRPA),' au'
write(*,*)'-------------------------------------------------------------------------------'
write(*,*)

end if

if(dotest) then

call dump_test_value('R','phRPA correlation energy',sum(EcRPA))

end if

end subroutine
4 changes: 4 additions & 0 deletions src/LR/phLR.f90
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ subroutine phLR(TDA,nS,Aph,Bph,EcRPA,Om,XpY,XmY)
! Local variables

double precision :: trace_matrix
double precision :: t1, t2
double precision,allocatable :: ApB(:,:)
double precision,allocatable :: AmB(:,:)
double precision,allocatable :: AmBSq(:,:)
Expand All @@ -38,7 +39,10 @@ subroutine phLR(TDA,nS,Aph,Bph,EcRPA,Om,XpY,XmY)
if(TDA) then

XpY(:,:) = Aph(:,:)
!call wall_time(t1)
call diagonalize_matrix(nS,XpY,Om)
!call wall_time(t2)
!print*, 'diag time on CPU (sec):', t2 - t1
XpY(:,:) = transpose(XpY(:,:))
XmY(:,:) = XpY(:,:)

Expand Down
1 change: 0 additions & 1 deletion src/QuAcK/QuAcK.f90
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,6 @@ program QuAcK

call read_integrals(working_dir,nBas,S,T,V,Hc,ERI_AO)
call read_dipole_integrals(working_dir,nBas,dipole_int_AO)

call wall_time(end_int)

t_int = end_int - start_int
Expand Down
2 changes: 2 additions & 0 deletions src/RPA/RRPA.f90
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ subroutine RRPA(dotest,dophRPA,dophRPAx,docrRPA,doppRPA,TDA,doACFDT,exchange_ker
write(*,'(A65,1X,F9.3,A8)') 'Total wall time for RPA = ',t_RPA,' seconds'
write(*,*)

!call phRRPA_GPU(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO,nV,nR,nS,ENuc,ERHF,ERI,dipole_int,eHF)

end if

!------------------------------------------------------------------------
Expand Down
18 changes: 0 additions & 18 deletions src/RPA/phRRPA.f90
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
subroutine phRRPA(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO,nV,nR,nS,ENuc,ERHF,ERI,dipole_int,eHF)

! use cu_quack_module

! Perform a direct random phase approximation calculation

implicit none
Expand Down Expand Up @@ -40,8 +38,6 @@ subroutine phRRPA(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO,
double precision,allocatable :: Om(:)
double precision,allocatable :: XpY(:,:)
double precision,allocatable :: XmY(:,:)
! DEBUG
!double precision, allocatable :: XpY_gpu(:,:), XmY_gpu(:,:), Om_gpu(:)

double precision :: EcRPA(nspin)

Expand Down Expand Up @@ -80,20 +76,6 @@ subroutine phRRPA(dotest,TDA,doACFDT,exchange_kernel,singlet,triplet,nBas,nC,nO,
if(.not.TDA) call phLR_B(ispin,dRPA,nBas,nC,nO,nV,nR,nS,lambda,ERI,Bph)

call phLR(TDA,nS,Aph,Bph,EcRPA(ispin),Om,XpY,XmY)

!! DEBUG
!allocate(Om_gpu(nS), XpY_gpu(nS,nS), XmY_gpu(nS,nS))
!call ph_drpa_tda(nO, nBas, nS, eHF(1), ERI(1,1,1,1), Om_gpu(1), XpY_gpu(1,1))
!do i = 1, nS
! print *, i, Om(i), Om_gpu(i)
! if(dabs(Om(i) - Om_gpu(i)) .gt. 1d-13) then
! print *, 'GPU FAILED!'
! stop
! endif
!enddo
!print *, 'GPU DONE!'
!stop

call print_excitation_energies('phRPA@RHF','singlet',nS,Om)
call phLR_transition_vectors(.true.,nBas,nC,nO,nV,nR,nS,dipole_int,Om,XpY,XmY)

Expand Down
6 changes: 5 additions & 1 deletion src/cuda/src/ph_drpa_a_sing.cu
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ __global__ void ph_dRPA_A_sing_kernel(int nO, int nV, int nBas, int nS, double *
int i_A0, i_A1, i_A2;
int i_I0, i_I1, i_I2;

bool a_eq_b;

nVS = nV * nS;

nBas2 = nBas * nBas;
Expand All @@ -27,6 +29,8 @@ __global__ void ph_dRPA_A_sing_kernel(int nO, int nV, int nBas, int nS, double *
while(bb < nV) {
b = bb + nO;

a_eq_b = a == b;

i_A1 = i_A0 + bb;
i_I1 = i_I0 + b * nBas;

Expand All @@ -40,7 +44,7 @@ __global__ void ph_dRPA_A_sing_kernel(int nO, int nV, int nBas, int nS, double *
while(j < nO) {

A[i_A2 + j * nV] = 2.0 * ERI[i_I2 + j * nBas3];
if((a==b) && (i==j)) {
if(a_eq_b && (i==j)) {
A[i_A2 + j * nV] += eps[a] - eps[i];
}

Expand Down
53 changes: 40 additions & 13 deletions src/cuda/src/ph_drpa_tda.c → src/cuda/src/ph_drpa_tda_sing.c
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@
#include "utils.h"
#include "ph_rpa.h"

void ph_drpa_tda(int nO, int nBas, int nS, double *h_eps, double *h_ERI,
double *h_Omega, double *h_X) {
void ph_drpa_tda_sing(int nO, int nBas, int nS, double *h_eps, double *h_ERI,
double *h_Omega, double *h_X) {

double *d_eps = NULL;
double *d_ERI = NULL;
Expand All @@ -20,23 +20,39 @@ void ph_drpa_tda(int nO, int nBas, int nS, double *h_eps, double *h_ERI,
int nBas2 = nBas * nBas;
int nBas4 = nBas2 * nBas2;

float elapsedTime;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);



check_Cuda_Errors(cudaMalloc((void**)&d_eps, nBas * sizeof(double)),
"cudaMalloc", __FILE__, __LINE__);
check_Cuda_Errors(cudaMalloc((void**)&d_ERI, nBas4 * sizeof(double)),
"cudaMalloc", __FILE__, __LINE__);

cudaEventRecord(start, 0);
check_Cuda_Errors(cudaMemcpy(d_eps, h_eps, nBas * sizeof(double), cudaMemcpyHostToDevice),
"cudaMemcpy", __FILE__, __LINE__);
check_Cuda_Errors(cudaMemcpy(d_ERI, h_ERI, nBas4 * sizeof(double), cudaMemcpyHostToDevice),
"cudaMemcpy", __FILE__, __LINE__);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
printf("Time elapsed on CPU->GPU transfer = %f msec\n", elapsedTime);

// construct A
double *d_A = NULL;
check_Cuda_Errors(cudaMalloc((void**)&d_A, nS * nS * sizeof(double)), "cudaMalloc", __FILE__, __LINE__);

cudaEventRecord(start, 0);
ph_dRPA_A_sing(nO, nV, nBas, nS, d_eps, d_ERI, d_A);
check_Cuda_Errors(cudaGetLastError(), "cudaGetLastError", __FILE__, __LINE__);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
printf("Time elapsed on A kernel = %f msec\n", elapsedTime);


// diagonalize A
Expand All @@ -47,24 +63,35 @@ void ph_drpa_tda(int nO, int nBas, int nS, double *h_eps, double *h_ERI,
check_Cuda_Errors(cudaMalloc((void**)&d_Omega, nS * sizeof(double)),
"cudaMalloc", __FILE__, __LINE__);

cudaEventRecord(start, 0);
diag_dn_dsyevd(nS, d_info, d_Omega, d_A);
check_Cuda_Errors(cudaGetLastError(), "cudaGetLastError", __FILE__, __LINE__);

int info_gpu = 0;
check_Cuda_Errors(cudaMemcpy(&info_gpu, d_info, sizeof(int), cudaMemcpyDeviceToHost),
"cudaMemcpy", __FILE__, __LINE__);
if (info_gpu != 0) {
printf("Error: diag_dn_dsyevd returned error code %d\n", info_gpu);
exit(EXIT_FAILURE);
}


cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
printf("Time elapsed on diagonalization = %f msec\n", elapsedTime);

//int info_gpu = 0;
cudaEventRecord(start, 0);
//check_Cuda_Errors(cudaMemcpy(&info_gpu, d_info, sizeof(int), cudaMemcpyDeviceToHost),
// "cudaMemcpy", __FILE__, __LINE__);
//if (info_gpu != 0) {
// printf("Error: diag_dn_dsyevd returned error code %d\n", info_gpu);
// exit(EXIT_FAILURE);
//}
check_Cuda_Errors(cudaMemcpy(h_X, d_A, nS * nS * sizeof(double), cudaMemcpyDeviceToHost),
"cudaMemcpy", __FILE__, __LINE__);

check_Cuda_Errors(cudaMemcpy(h_Omega, d_Omega, nS * sizeof(double), cudaMemcpyDeviceToHost),
"cudaMemcpy", __FILE__, __LINE__);

cudaEventRecord(start, 0);
diag_dn_dsyevd(nS, d_info, d_Omega, d_A);
check_Cuda_Errors(cudaGetLastError(), "cudaGetLastError", __FILE__, __LINE__);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
printf("Time elapsed on GPU -> CPU transfer = %f msec\n", elapsedTime);

check_Cuda_Errors(cudaFree(d_info), "cudaFree", __FILE__, __LINE__);
check_Cuda_Errors(cudaFree(d_eps), "cudaFree", __FILE__, __LINE__);
check_Cuda_Errors(cudaFree(d_ERI), "cudaFree", __FILE__, __LINE__);
Expand Down
2 changes: 2 additions & 0 deletions src/make_ninja.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,8 @@ def check_compiler_exists(compiler):
x not in exe_dirs, os.listdir(".")))
i = lib_dirs.index("mod")
lib_dirs[0], lib_dirs[i] = lib_dirs[i], lib_dirs[0]
if not USE_GPU:
lib_dirs.remove("GPU")

def create_ninja_in_libdir(directory):
def write_rule(f, source_file, replace):
Expand Down
Loading

0 comments on commit e43a56e

Please sign in to comment.