deepmodeling · pxlxingliang · Nov 21, 2024 · Nov 21, 2024 · Nov 21, 2024 · Nov 21, 2024
diff --git a/docs/advanced/input_files/input-main.md b/docs/advanced/input_files/input-main.md
@@ -39,6 +39,7 @@
     - [pw\_diag\_thr](#pw_diag_thr)
     - [pw\_diag\_nmax](#pw_diag_nmax)
     - [pw\_diag\_ndim](#pw_diag_ndim)
+    - [diag\_subspace\_method](#diag_subspace)
     - [erf\_ecut](#erf_ecut)
     - [fft\_mode](#fft_mode)
     - [erf\_height](#erf_height)
@@ -784,7 +785,18 @@ These variables are used to control the plane wave related parameters.
 
 - **Type**: Integer
 - **Description**: Only useful when you use `ks_solver = dav` or `ks_solver = dav_subspace`. It indicates dimension of workspace(number of wavefunction packets, at least 2 needed) for the Davidson method. A larger value may yield a smaller number of iterations in the algorithm but uses more memory and more CPU time in subspace diagonalization.
-- **Default**: 4
+- **Default**: 4 
+
+### diag_subspace
+
+- **Type**: Integer
+- **Description**: The method to diagonalize subspace in dav_subspace method. The available options are:
+  - 0: by LAPACK
+  - 1: by GenELPA
+  - 2: by ScaLAPACK
+  LAPACK only solve in one core, GenELPA and ScaLAPACK can solve in parallel. If the system is small (such as the band number is less than 100), LAPACK is recommended. If the system is large and MPI parallel is used, then GenELPA or ScaLAPACK is recommended, and GenELPA usually has better performance. For GenELPA and ScaLAPACK, the block size can be set by [nb2d](#nb2d).
+
+- **Default**: 0
 
 ### erf_ecut
 

diff --git a/python/pyabacus/src/hsolver/CMakeLists.txt b/python/pyabacus/src/hsolver/CMakeLists.txt
@@ -5,6 +5,9 @@ list(APPEND _diago
     ${HSOLVER_PATH}/diago_cg.cpp
     ${HSOLVER_PATH}/diag_const_nums.cpp
     ${HSOLVER_PATH}/diago_iter_assist.cpp
+    ${HSOLVER_PATH}/diag_hs_para.cpp
+    ${HSOLVER_PATH}/diago_pxxxgvx.cpp
+
 
     ${HSOLVER_PATH}/kernels/dngvd_op.cpp
     ${HSOLVER_PATH}/kernels/math_kernel_op.cpp

diff --git a/python/pyabacus/src/hsolver/py_diago_dav_subspace.hpp b/python/pyabacus/src/hsolver/py_diago_dav_subspace.hpp
@@ -138,7 +138,9 @@ class PyDiagoDavSubspace
             tol, 
             max_iter, 
             need_subspace, 
-            comm_info
+            comm_info,
+            PARAM.inp.diag_subspace,
+            PARAM.inp.nb2d
         );
 
         return obj->diag(hpsi_func, psi, nbasis, eigenvalue, diag_ethr.data(), scf_type);

diff --git a/source/Makefile.Objects b/source/Makefile.Objects
@@ -335,6 +335,8 @@ OBJS_HSOLVER=diago_cg.o\
     math_kernel_op.o\
     dngvd_op.o\
     diag_const_nums.o\
+    diag_hs_para.o\
+    diago_pxxxgvx.o\
 
 OBJS_HSOLVER_LCAO=hsolver_lcao.o\
       diago_scalapack.o\

diff --git a/source/module_base/blacs_connector.h b/source/module_base/blacs_connector.h
@@ -39,7 +39,7 @@ extern "C"
 		// Informational and Miscellaneous
 	void Cblacs_gridinfo(int icontxt, int* nprow, int *npcol, int *myprow, int *mypcol);
     void Cblacs_gridinit(int* icontxt, char* layout, int nprow, int npcol);
-    void Cblacs_gridexit(int* icontxt);
+    void Cblacs_gridexit(int icontxt);
     int Cblacs_pnum(int icontxt, int prow, int pcol);
     void Cblacs_pcoord(int icontxt, int pnum, int *prow, int *pcol);
 	void Cblacs_exit(int icontxt);

diff --git a/source/module_base/scalapack_connector.h b/source/module_base/scalapack_connector.h
@@ -80,12 +80,26 @@ extern "C"
 		const double* vl, const double* vu, const int* il, const int* iu,
 		const double* abstol, int* m, int* nz, double* w, const double*orfac, double* Z, const int* iz, const int* jz, const int*descz,
 		double* work, int* lwork, int*iwork, int*liwork, int* ifail, int*iclustr, double*gap, int* info);
+
 	void pzhegvx_(const int* itype, const char* jobz, const char* range, const char* uplo,
 		const int* n, std::complex<double>* A, const int* ia, const int* ja, const int*desca, std::complex<double>* B, const int* ib, const int* jb, const int*descb,
 		const double* vl, const double* vu, const int* il, const int* iu,
 		const double* abstol, int* m, int* nz, double* w, const double*orfac, std::complex<double>* Z, const int* iz, const int* jz, const int*descz,
 		std::complex<double>* work, int* lwork, double* rwork, int* lrwork, int*iwork, int*liwork, int* ifail, int*iclustr, double*gap, int* info);
 
+	void pssygvx_(const int* itype, const char* jobz, const char* range, const char* uplo,
+		const int* n, float* A, const int* ia, const int* ja, const int*desca, float* B, const int* ib, const int* jb, const int*descb,
+		const float* vl, const float* vu, const int* il, const int* iu,
+		const float* abstol, int* m, int* nz, float* w, const float*orfac, float* Z, const int* iz, const int* jz, const int*descz,
+		float* work, int* lwork, int*iwork, int*liwork, int* ifail, int*iclustr, float*gap, int* info);
+
+	void pchegvx_(const int* itype, const char* jobz, const char* range, const char* uplo,
+		const int* n, std::complex<float>* A, const int* ia, const int* ja, const int*desca, std::complex<float>* B, const int* ib, const int* jb, const int*descb,
+		const float* vl, const float* vu, const int* il, const int* iu,
+		const float* abstol, int* m, int* nz, float* w, const float*orfac, std::complex<float>* Z, const int* iz, const int* jz, const int*descz,
+		std::complex<float>* work, int* lwork, float* rwork, int* lrwork, int*iwork, int*liwork, int* ifail, int*iclustr, float*gap, int* info);
+
+
 	void pzgetri_(
 		const int *n, 
 		const std::complex<double> *A, const int *ia, const int *ja, const int *desca,

diff --git a/source/module_hsolver/CMakeLists.txt b/source/module_hsolver/CMakeLists.txt
@@ -9,6 +9,9 @@ list(APPEND objects
     hsolver_pw_sdft.cpp
     diago_iter_assist.cpp
     hsolver.cpp
+    diago_pxxxgvx.cpp
+    diag_hs_para.cpp
+
 )
 
 if(ENABLE_LCAO)

diff --git a/source/module_hsolver/diag_hs_para.cpp b/source/module_hsolver/diag_hs_para.cpp
@@ -0,0 +1,205 @@
+#include "module_hsolver/diag_hs_para.h"
+
+#include "module_base/scalapack_connector.h"
+#include "module_basis/module_ao/parallel_2d.h"
+#include "module_hsolver/diago_pxxxgvx.h"
+
+#ifdef __ELPA
+#include "module_hsolver/genelpa/elpa_solver.h"
+#endif
+
+#include <iostream>
+
+namespace hsolver
+{
+
+#ifdef __ELPA
+void elpa_diag(MPI_Comm comm,
+               const int nband,
+               std::complex<double>* h_local,
+               std::complex<double>* s_local,
+               double* ekb,
+               std::complex<double>* wfc_2d,
+               Parallel_2D& para2d_local)
+{
+    int DecomposedState = 0;
+    ELPA_Solver es(false, comm, nband, para2d_local.get_row_size(), para2d_local.get_col_size(), para2d_local.desc);
+    es.generalized_eigenvector(h_local, s_local, DecomposedState, ekb, wfc_2d);
+    es.exit();
+}
+
+void elpa_diag(MPI_Comm comm,
+               const int nband,
+               double* h_local,
+               double* s_local,
+               double* ekb,
+               double* wfc_2d,
+               Parallel_2D& para2d_local)
+{
+    int DecomposedState = 0;
+    ELPA_Solver es(true, comm, nband, para2d_local.get_row_size(), para2d_local.get_col_size(), para2d_local.desc);
+    es.generalized_eigenvector(h_local, s_local, DecomposedState, ekb, wfc_2d);
+    es.exit();
+}
+
+void elpa_diag(MPI_Comm comm,
+               const int nband,
+               std::complex<float>* h_local,
+               std::complex<float>* s_local,
+               float* ekb,
+               std::complex<float>* wfc_2d,
+               Parallel_2D& para2d_local)
+{
+    std::cout << "Error: ELPA do not support single precision. " << std::endl;
+    exit(1);
+}
+
+void elpa_diag(MPI_Comm comm,
+               const int nband,
+               float* h_local,
+               float* s_local,
+               float* ekb,
+               float* wfc_2d,
+               Parallel_2D& para2d_local)
+{
+    std::cout << "Error: ELPA do not support single precision. " << std::endl;
+    exit(1);
+}
+
+#endif
+
+#ifdef __MPI
+
+template <typename T>
+void Diago_HS_para(T* h,
+                   T* s,
+                   const int lda,
+                   const int nband,
+                   typename GetTypeReal<T>::type* const ekb,
+                   T* const wfc,
+                   const MPI_Comm& comm,
+                   const int diag_subspace,
+                   const int block_size)
+{
+    int myrank = 0;
+    MPI_Comm_rank(comm, &myrank);
+    Parallel_2D para2d_global;
+    Parallel_2D para2d_local;
+    para2d_global.init(lda, lda, lda, comm);
+
+    int max_nb = block_size;
+    if (block_size == 0)
+    {
+        if (nband > 500)
+        {
+            max_nb = 32;
+        }
+        else
+        {
+            max_nb = 16;
+        }
+    }
+    else if (block_size < 0)
+    {
+        std::cout << "Error: block_size in diago_subspace should be a positive integer. " << std::endl;
+        exit(1);
+    }
+
+    // for genelpa, if the block size is too large that some cores have no data, then it will cause error.
+    if (diag_subspace == 1)
+    {
+        if (max_nb * (std::max(para2d_global.dim0, para2d_global.dim1) - 1) >= lda)
+        {
+            max_nb = lda / std::max(para2d_global.dim0, para2d_global.dim1);
+        }
+    }
+
+    para2d_local.init(lda, lda, max_nb, comm);
+    std::vector<T> h_local(para2d_local.get_col_size() * para2d_local.get_row_size());
+    std::vector<T> s_local(para2d_local.get_col_size() * para2d_local.get_row_size());
+    std::vector<T> wfc_2d(para2d_local.get_col_size() * para2d_local.get_row_size());
+
+    // distribute h and s to 2D
+    Cpxgemr2d(lda, lda, h, 1, 1, para2d_global.desc, h_local.data(), 1, 1, para2d_local.desc, para2d_local.blacs_ctxt);
+    Cpxgemr2d(lda, lda, s, 1, 1, para2d_global.desc, s_local.data(), 1, 1, para2d_local.desc, para2d_local.blacs_ctxt);
+
+    if (diag_subspace == 1)
+    {
+#ifdef __ELPA
+        elpa_diag(comm, nband, h_local.data(), s_local.data(), ekb, wfc_2d.data(), para2d_local);
+#else
+        std::cout << "ERROR: try to use ELPA to solve the generalized eigenvalue problem, but ELPA is not compiled. "
+                  << std::endl;
+        exit(1);
+#endif
+    }
+    else if (diag_subspace == 2)
+    {
+        hsolver::pxxxgvx_diag(para2d_local.desc,
+                              para2d_local.get_row_size(),
+                              para2d_local.get_col_size(),
+                              nband,
+                              h_local.data(),
+                              s_local.data(),
+                              ekb,
+                              wfc_2d.data());
+    }
+    else
+    {
+        std::cout << "Error: parallel diagonalization method is not supported. " << "diag_subspace = " << diag_subspace
+                  << std::endl;
+        exit(1);
+    }
+
+    // gather wfc
+    Cpxgemr2d(lda, lda, wfc_2d.data(), 1, 1, para2d_local.desc, wfc, 1, 1, para2d_global.desc, para2d_local.blacs_ctxt);
+
+    // free the context
+    Cblacs_gridexit(para2d_local.blacs_ctxt);
+    Cblacs_gridexit(para2d_global.blacs_ctxt);
+}
+
+// template instantiation
+template void Diago_HS_para<double>(double* h,
+                                    double* s,
+                                    const int lda,
+                                    const int nband,
+                                    typename GetTypeReal<double>::type* const ekb,
+                                    double* const wfc,
+                                    const MPI_Comm& comm,
+                                    const int diag_subspace,
+                                    const int block_size);
+
+template void Diago_HS_para<std::complex<double>>(std::complex<double>* h,
+                                                  std::complex<double>* s,
+                                                  const int lda,
+                                                  const int nband,
+                                                  typename GetTypeReal<std::complex<double>>::type* const ekb,
+                                                  std::complex<double>* const wfc,
+                                                  const MPI_Comm& comm,
+                                                  const int diag_subspace,
+                                                  const int block_size);
+
+template void Diago_HS_para<float>(float* h,
+                                   float* s,
+                                   const int lda,
+                                   const int nband,
+                                   typename GetTypeReal<float>::type* const ekb,
+                                   float* const wfc,
+                                   const MPI_Comm& comm,
+                                   const int diag_subspace,
+                                   const int block_size);
+
+template void Diago_HS_para<std::complex<float>>(std::complex<float>* h,
+                                                 std::complex<float>* s,
+                                                 const int lda,
+                                                 const int nband,
+                                                 typename GetTypeReal<std::complex<float>>::type* const ekb,
+                                                 std::complex<float>* const wfc,
+                                                 const MPI_Comm& comm,
+                                                 const int diag_subspace,
+                                                 const int block_size);
+
+#endif
+
+} // namespace hsolver
diff --git a/source/module_hsolver/diag_hs_para.h b/source/module_hsolver/diag_hs_para.h
@@ -0,0 +1,46 @@
+#include "module_basis/module_ao/parallel_2d.h"
+#include "module_base/macros.h"
+
+#ifdef __MPI
+#include <mpi.h>
+#endif
+
+namespace hsolver
+{
+
+
+#ifdef __MPI
+
+/**
+ * @brief Parallel do the generalized eigenvalue problem
+ * 
+ * @tparam T double or complex<double> or float or complex<float>
+ * @param H the hermitian matrix H.
+ * @param S the overlap matrix S.
+ * @param lda the leading dimension of H and S
+ * @param nband the number of bands to be calculated
+ * @param ekb to store the eigenvalues.
+ * @param wfc to store the eigenvectors
+ * @param comm the communicator
+ * @param diag_subspace the method to solve the generalized eigenvalue problem
+ * @param block_size the block size in 2d block cyclic distribution if use elpa or scalapack. 
+ * 
+ * @note 1. h and s should be full matrix in rank 0 of the communicator, and the other ranks is not concerned.
+ * @note 2. wfc is complete in rank 0, and not store in other ranks.
+ * @note 3. diag_subspace should be 1: by elpa, 2: by scalapack
+ * @note 4. block_size should be 0 or a positive integer. If it is 0, then will use a value as large as possible that is allowed
+ */
+template <typename T>
+void Diago_HS_para(T* h,
+                   T* s,
+                   const int lda,
+                   const int nband,
+                   typename GetTypeReal<T>::type* const ekb,
+                   T* const wfc,
+                   const MPI_Comm& comm,
+                   const int diag_subspace,
+                   const int block_size = 0);
+#endif
+
+} // namespace hsolver                 
+