feature: parallel solve subspace diagonalization in dav_subspace #5549

Open · wants to merge 10 commits into base: develop

Changes from all commits
14 changes: 13 additions & 1 deletion docs/advanced/input_files/input-main.md
@@ -39,6 +39,7 @@
- [pw\_diag\_thr](#pw_diag_thr)
- [pw\_diag\_nmax](#pw_diag_nmax)
- [pw\_diag\_ndim](#pw_diag_ndim)
- [diag\_subspace](#diag_subspace)
- [erf\_ecut](#erf_ecut)
- [fft\_mode](#fft_mode)
- [erf\_height](#erf_height)
@@ -784,7 +785,18 @@ These variables are used to control the plane wave related parameters.

- **Type**: Integer
- **Description**: Only relevant when `ks_solver = dav` or `ks_solver = dav_subspace`. It sets the dimension of the workspace (the number of wavefunction packets; at least 2 are needed) for the Davidson method. A larger value may reduce the number of iterations, but it uses more memory and more CPU time for the subspace diagonalization.
- **Default**: 4

### diag_subspace

- **Type**: Integer
- **Description**: The method used to diagonalize the subspace in the `dav_subspace` method. The available options are:
  - 0: LAPACK
  - 1: GenELPA
  - 2: ScaLAPACK

  LAPACK solves only on a single core, while GenELPA and ScaLAPACK solve in parallel. For small systems (e.g., fewer than 100 bands), LAPACK is recommended. For large systems run with MPI parallelization, GenELPA or ScaLAPACK is recommended, and GenELPA usually performs better. For GenELPA and ScaLAPACK, the block size can be set by [nb2d](#nb2d); a minimal example is given below.

- **Default**: 0
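
For example, a minimal `INPUT` sketch that switches the subspace diagonalization of `dav_subspace` to GenELPA (the values are illustrative only; the parameters are those documented above):

```
INPUT_PARAMETERS
basis_type      pw
ks_solver       dav_subspace
pw_diag_ndim    4    # dimension of the Davidson workspace
diag_subspace   1    # 0: LAPACK, 1: GenELPA, 2: ScaLAPACK
nb2d            32   # 2D block size used by GenELPA and ScaLAPACK
```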

### erf_ecut

3 changes: 3 additions & 0 deletions python/pyabacus/src/hsolver/CMakeLists.txt
@@ -5,6 +5,9 @@ list(APPEND _diago
${HSOLVER_PATH}/diago_cg.cpp
${HSOLVER_PATH}/diag_const_nums.cpp
${HSOLVER_PATH}/diago_iter_assist.cpp
${HSOLVER_PATH}/diag_hs_para.cpp
${HSOLVER_PATH}/diago_pxxxgvx.cpp


${HSOLVER_PATH}/kernels/dngvd_op.cpp
${HSOLVER_PATH}/kernels/math_kernel_op.cpp
4 changes: 3 additions & 1 deletion python/pyabacus/src/hsolver/py_diago_dav_subspace.hpp
@@ -138,7 +138,9 @@ class PyDiagoDavSubspace
tol,
max_iter,
need_subspace,
comm_info
comm_info,
PARAM.inp.diag_subspace,
PARAM.inp.nb2d
);

return obj->diag(hpsi_func, psi, nbasis, eigenvalue, diag_ethr.data(), scf_type);
2 changes: 2 additions & 0 deletions source/Makefile.Objects
@@ -335,6 +335,8 @@ OBJS_HSOLVER=diago_cg.o\
math_kernel_op.o\
dngvd_op.o\
diag_const_nums.o\
diag_hs_para.o\
diago_pxxxgvx.o\

OBJS_HSOLVER_LCAO=hsolver_lcao.o\
diago_scalapack.o\
2 changes: 1 addition & 1 deletion source/module_base/blacs_connector.h
@@ -39,7 +39,7 @@ extern "C"
// Informational and Miscellaneous
void Cblacs_gridinfo(int icontxt, int* nprow, int *npcol, int *myprow, int *mypcol);
void Cblacs_gridinit(int* icontxt, char* layout, int nprow, int npcol);
void Cblacs_gridexit(int* icontxt);
void Cblacs_gridexit(int icontxt);
int Cblacs_pnum(int icontxt, int prow, int pcol);
void Cblacs_pcoord(int icontxt, int pnum, int *prow, int *pcol);
void Cblacs_exit(int icontxt);
14 changes: 14 additions & 0 deletions source/module_base/scalapack_connector.h
@@ -80,12 +80,26 @@ extern "C"
const double* vl, const double* vu, const int* il, const int* iu,
const double* abstol, int* m, int* nz, double* w, const double*orfac, double* Z, const int* iz, const int* jz, const int*descz,
double* work, int* lwork, int*iwork, int*liwork, int* ifail, int*iclustr, double*gap, int* info);

void pzhegvx_(const int* itype, const char* jobz, const char* range, const char* uplo,
const int* n, std::complex<double>* A, const int* ia, const int* ja, const int*desca, std::complex<double>* B, const int* ib, const int* jb, const int*descb,
const double* vl, const double* vu, const int* il, const int* iu,
const double* abstol, int* m, int* nz, double* w, const double*orfac, std::complex<double>* Z, const int* iz, const int* jz, const int*descz,
std::complex<double>* work, int* lwork, double* rwork, int* lrwork, int*iwork, int*liwork, int* ifail, int*iclustr, double*gap, int* info);

void pssygvx_(const int* itype, const char* jobz, const char* range, const char* uplo,
const int* n, float* A, const int* ia, const int* ja, const int*desca, float* B, const int* ib, const int* jb, const int*descb,
const float* vl, const float* vu, const int* il, const int* iu,
const float* abstol, int* m, int* nz, float* w, const float*orfac, float* Z, const int* iz, const int* jz, const int*descz,
float* work, int* lwork, int*iwork, int*liwork, int* ifail, int*iclustr, float*gap, int* info);
Collaborator (review comment): at least leave a blank line between two functions.

void pchegvx_(const int* itype, const char* jobz, const char* range, const char* uplo,
const int* n, std::complex<float>* A, const int* ia, const int* ja, const int*desca, std::complex<float>* B, const int* ib, const int* jb, const int*descb,
const float* vl, const float* vu, const int* il, const int* iu,
const float* abstol, int* m, int* nz, float* w, const float*orfac, std::complex<float>* Z, const int* iz, const int* jz, const int*descz,
std::complex<float>* work, int* lwork, float* rwork, int* lrwork, int*iwork, int*liwork, int* ifail, int*iclustr, float*gap, int* info);


void pzgetri_(
const int *n,
const std::complex<double> *A, const int *ia, const int *ja, const int *desca,
3 changes: 3 additions & 0 deletions source/module_hsolver/CMakeLists.txt
@@ -9,6 +9,9 @@ list(APPEND objects
hsolver_pw_sdft.cpp
diago_iter_assist.cpp
hsolver.cpp
diago_pxxxgvx.cpp
diag_hs_para.cpp

)

if(ENABLE_LCAO)
205 changes: 205 additions & 0 deletions source/module_hsolver/diag_hs_para.cpp
@@ -0,0 +1,205 @@
#include "module_hsolver/diag_hs_para.h"

#include "module_base/scalapack_connector.h"
#include "module_basis/module_ao/parallel_2d.h"
#include "module_hsolver/diago_pxxxgvx.h"

#ifdef __ELPA
#include "module_hsolver/genelpa/elpa_solver.h"
#endif

#include <iostream>

namespace hsolver
{

#ifdef __ELPA
void elpa_diag(MPI_Comm comm,
const int nband,
std::complex<double>* h_local,
std::complex<double>* s_local,
double* ekb,
std::complex<double>* wfc_2d,
Parallel_2D& para2d_local)
{
int DecomposedState = 0;
ELPA_Solver es(false, comm, nband, para2d_local.get_row_size(), para2d_local.get_col_size(), para2d_local.desc);
es.generalized_eigenvector(h_local, s_local, DecomposedState, ekb, wfc_2d);
es.exit();
}

void elpa_diag(MPI_Comm comm,
const int nband,
double* h_local,
double* s_local,
double* ekb,
double* wfc_2d,
Parallel_2D& para2d_local)
{
int DecomposedState = 0;
ELPA_Solver es(true, comm, nband, para2d_local.get_row_size(), para2d_local.get_col_size(), para2d_local.desc);
es.generalized_eigenvector(h_local, s_local, DecomposedState, ekb, wfc_2d);
es.exit();
}

void elpa_diag(MPI_Comm comm,
const int nband,
std::complex<float>* h_local,
std::complex<float>* s_local,
float* ekb,
std::complex<float>* wfc_2d,
Parallel_2D& para2d_local)
{
std::cout << "Error: ELPA do not support single precision. " << std::endl;
exit(1);
}

void elpa_diag(MPI_Comm comm,
const int nband,
float* h_local,
float* s_local,
float* ekb,
float* wfc_2d,
Parallel_2D& para2d_local)
{
std::cout << "Error: ELPA do not support single precision. " << std::endl;
exit(1);
}

#endif

#ifdef __MPI

template <typename T>
void Diago_HS_para(T* h,
T* s,
const int lda,
const int nband,
typename GetTypeReal<T>::type* const ekb,
T* const wfc,
const MPI_Comm& comm,
const int diag_subspace,
const int block_size)
{
int myrank = 0;
MPI_Comm_rank(comm, &myrank);
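// para2d_global describes the full lda x lda matrix held entirely on rank 0
// (block size = lda), while para2d_local is the 2D block-cyclic distribution
// actually used by the parallel solver.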
Parallel_2D para2d_global;
Parallel_2D para2d_local;
para2d_global.init(lda, lda, lda, comm);

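// Choose the 2D block size: use the user-specified value if given, otherwise
// default to 16 (or 32 when more than 500 bands are requested).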
int max_nb = block_size;
if (block_size == 0)
{
if (nband > 500)
{
max_nb = 32;
}
else
{
max_nb = 16;
}
}
else if (block_size < 0)
{
std::cout << "Error: block_size in diago_subspace should be a positive integer. " << std::endl;
exit(1);
}

// For GenELPA, if the block size is so large that some processes hold no data, it will cause an error, so cap the block size accordingly.
if (diag_subspace == 1)
{
if (max_nb * (std::max(para2d_global.dim0, para2d_global.dim1) - 1) >= lda)
{
max_nb = lda / std::max(para2d_global.dim0, para2d_global.dim1);
}
}

para2d_local.init(lda, lda, max_nb, comm);
std::vector<T> h_local(para2d_local.get_col_size() * para2d_local.get_row_size());
std::vector<T> s_local(para2d_local.get_col_size() * para2d_local.get_row_size());
std::vector<T> wfc_2d(para2d_local.get_col_size() * para2d_local.get_row_size());

// redistribute H and S from the rank-0 full-matrix layout (para2d_global) to the 2D block-cyclic layout (para2d_local)
Cpxgemr2d(lda, lda, h, 1, 1, para2d_global.desc, h_local.data(), 1, 1, para2d_local.desc, para2d_local.blacs_ctxt);
Cpxgemr2d(lda, lda, s, 1, 1, para2d_global.desc, s_local.data(), 1, 1, para2d_local.desc, para2d_local.blacs_ctxt);

if (diag_subspace == 1)
{
#ifdef __ELPA
elpa_diag(comm, nband, h_local.data(), s_local.data(), ekb, wfc_2d.data(), para2d_local);
#else
std::cout << "ERROR: try to use ELPA to solve the generalized eigenvalue problem, but ELPA is not compiled. "
<< std::endl;
exit(1);
#endif
}
else if (diag_subspace == 2)
{
hsolver::pxxxgvx_diag(para2d_local.desc,
para2d_local.get_row_size(),
para2d_local.get_col_size(),
nband,
h_local.data(),
s_local.data(),
ekb,
wfc_2d.data());
}
else
{
std::cout << "Error: parallel diagonalization method is not supported. " << "diag_subspace = " << diag_subspace
<< std::endl;
exit(1);
}

// gather the distributed eigenvectors back into the full matrix wfc on rank 0
Cpxgemr2d(lda, lda, wfc_2d.data(), 1, 1, para2d_local.desc, wfc, 1, 1, para2d_global.desc, para2d_local.blacs_ctxt);

// free the context
Cblacs_gridexit(para2d_local.blacs_ctxt);
Cblacs_gridexit(para2d_global.blacs_ctxt);
}

// template instantiation
template void Diago_HS_para<double>(double* h,
double* s,
const int lda,
const int nband,
typename GetTypeReal<double>::type* const ekb,
double* const wfc,
const MPI_Comm& comm,
const int diag_subspace,
const int block_size);

template void Diago_HS_para<std::complex<double>>(std::complex<double>* h,
std::complex<double>* s,
const int lda,
const int nband,
typename GetTypeReal<std::complex<double>>::type* const ekb,
std::complex<double>* const wfc,
const MPI_Comm& comm,
const int diag_subspace,
const int block_size);

template void Diago_HS_para<float>(float* h,
float* s,
const int lda,
const int nband,
typename GetTypeReal<float>::type* const ekb,
float* const wfc,
const MPI_Comm& comm,
const int diag_subspace,
const int block_size);

template void Diago_HS_para<std::complex<float>>(std::complex<float>* h,
std::complex<float>* s,
const int lda,
const int nband,
typename GetTypeReal<std::complex<float>>::type* const ekb,
std::complex<float>* const wfc,
const MPI_Comm& comm,
const int diag_subspace,
const int block_size);

#endif

} // namespace hsolver
46 changes: 46 additions & 0 deletions source/module_hsolver/diag_hs_para.h
@@ -0,0 +1,46 @@
#include "module_basis/module_ao/parallel_2d.h"
#include "module_base/macros.h"

#ifdef __MPI
#include <mpi.h>
#endif

namespace hsolver
{


#ifdef __MPI

/**
* @brief Solve the generalized eigenvalue problem H c = e S c in parallel
*
* @tparam T double, std::complex<double>, float, or std::complex<float>
* @param h the Hermitian matrix H.
* @param s the overlap matrix S.
* @param lda the leading dimension of H and S
* @param nband the number of bands to be calculated
* @param ekb to store the eigenvalues.
* @param wfc to store the eigenvectors
* @param comm the communicator
* @param diag_subspace the method used to solve the generalized eigenvalue problem
* @param block_size the block size of the 2D block-cyclic distribution when ELPA or ScaLAPACK is used.
*
* @note 1. h and s should be the full matrices on rank 0 of the communicator; the other ranks are not referenced.
* @note 2. wfc is complete on rank 0 and is not stored on the other ranks.
* @note 3. diag_subspace should be 1 (GenELPA) or 2 (ScaLAPACK).
* @note 4. block_size should be 0 or a positive integer. If it is 0, a default of 16 (or 32 when nband > 500) is used; for GenELPA it is further capped so that every process holds data.
*/
template <typename T>
void Diago_HS_para(T* h,
T* s,
const int lda,
const int nband,
typename GetTypeReal<T>::type* const ekb,
T* const wfc,
const MPI_Comm& comm,
const int diag_subspace,
const int block_size = 0);
#endif

} // namespace hsolver

#endif // DIAG_HS_PARA_H
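For completeness, a minimal usage sketch of the `Diago_HS_para` interface declared above (the caller function name is hypothetical; an MPI build with `__MPI` is assumed, and H and S are the full matrices prepared on rank 0 as the notes require):

```cpp
#include <complex>
#include <vector>

#include <mpi.h>

#include "module_hsolver/diag_hs_para.h"

// Solve H c = e S c for the lowest nband eigenpairs. The call is collective
// over comm; only rank 0 needs valid H and S, and only rank 0 receives
// meaningful ekb and wfc.
void subspace_diag_example(std::complex<double>* H,
                           std::complex<double>* S,
                           const int lda,
                           const int nband,
                           MPI_Comm comm)
{
    // Eigenvalues: sized lda to be safe; the first nband entries on rank 0
    // are the requested eigenvalues.
    std::vector<double> ekb(lda);
    // Eigenvectors: the routine gathers a full lda x lda array back to rank 0;
    // the first nband columns hold the requested eigenvectors.
    std::vector<std::complex<double>> wfc(static_cast<size_t>(lda) * lda);

    // diag_subspace = 2 selects ScaLAPACK; block_size = 0 lets the routine
    // pick its default 2D block size.
    hsolver::Diago_HS_para(H, S, lda, nband, ekb.data(), wfc.data(), comm,
                           /*diag_subspace=*/2, /*block_size=*/0);
}
```

With `diag_subspace = 1` the same call would go through GenELPA instead, provided ABACUS was built with ELPA support.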
