Merge branch 'develop' of https://github.com/deepmodeling/abacus-develop

into after_scf
YuLiu98 · Aug 7, 2024 · ef64ac0 · ef64ac0
2 parents 15944ef + 9f55e3a
commit ef64ac0
Show file tree

Hide file tree

Showing 131 changed files with 3,595 additions and 2,692 deletions.
diff --git a/docs/advanced/acceleration/cuda.md b/docs/advanced/acceleration/cuda.md
@@ -6,13 +6,13 @@ In ABACUS, we provide the option to use GPU devices to accelerate performance. T
 
 - **Electronic state data**: (e.g. electronic density) are moved from the GPU to the CPU(s) every scf step.
 
-- **Acclerated by the NVIDIA libraries**: `cuBLAS` for common linear algebra calculations, `cuSolver` for eigen values/vectors, and `cuFFT` for the conversions between the real and recip spaces.
+- **Accelerated by the NVIDIA libraries**: `cuBLAS` for common linear algebra calculations, `cuSolver` for eigen values/vectors, and `cuFFT` for the conversions between the real and recip spaces.
 
 - **Multi GPU supprted**: Using multiple MPI tasks will often give the best performance. Note each MPI task will be bind to a GPU device with automatically computing load balancing.
 
 - **Parallel strategy**: K point parallel.
 
-Unlike PW basis, only the grid integration module (module_gint) and the diagonalization of the Hamiltonian matrix (module_hsolver) have been implemented with GPU acceleration under LCAO basis, and the acceleration is limited to gamma only calculation. Additionally, LCAO basis does not support multi-GPU acceleration. Both the grid integration module and the Hamiltonian matrix solver only support acceleration on a single GPU.
+Unlike PW basis, only the grid integration module (module_gint) and the diagonalization of the Hamiltonian matrix (module_hsolver) have been implemented with GPU acceleration under LCAO basis.
 
 ## Required hardware/software
 
@@ -31,17 +31,14 @@ Check the [Advanced Installation Options](https://abacus-rtd.readthedocs.io/en/l
 
 ## Run with the GPU support by editing the INPUT script:
 
-In `INPUT` file we need to set the value keyword [device](../input_files/input-main.md#device) to be `gpu`.
+In `INPUT` file we need to set the input parameter [device](../input_files/input-main.md#device) to `gpu`. If this parameter is not set, ABACUS will try to determine if there are available GPUs.
+- Set `ks_solver`: For the PW basis, CG, BPCG and Davidson methods are supported on GPU; set the input parameter [ks_solver](../input_files/input-main.md#ks_solver) to `cg`, `bpcg` or `dav`. For the LCAO basis, `cusolver` is supported on GPU.
+- **multi-card**: ABACUS allows for multi-GPU acceleration. If you have multiple GPU cards, you can run ABACUS with several MPI processes, and each process will utilize one GPU card. For example, the command `mpirun -n 2 abacus` will by default launch two GPUs for computation. If you only have one card, this command will only start one GPU.
 
 ## Examples
 We provides [examples](https://github.com/deepmodeling/abacus-develop/tree/develop/examples/gpu) of gpu calculations.
 
 ## Known limitations
 PW basis:
-- CG, BPCG and Davidson methods are supported, so the input keyword `ks_solver` can take the values `cg`, `bpcg` or `dav`.
 - Only k point parallelization is supported, so the input keyword `kpar` will be set to match the number of MPI tasks automatically.
 - By default, CUDA architectures 60, 70, 75, 80, 86, and 89 are compiled (if supported). It can be overriden using the CMake variable [`CMAKE_CUDA_ARCHITECTURES`](https://cmake.org/cmake/help/latest/variable/CMAKE_CUDA_ARCHITECTURES.html) or the environmental variable [`CUDAARCHS`](https://cmake.org/cmake/help/latest/envvar/CUDAARCHS.html).
-
-LCAO basis:
-- Does not support multi-k calculation, so if the input keyword `device` is set to `gpu`, the input keyword `gamma_only` can only take the value `1`.
-- Does not support multi-GPU acceleration.
diff --git a/docs/advanced/input_files/input-main.md b/docs/advanced/input_files/input-main.md
@@ -629,7 +629,7 @@ If only one value is set (such as `kspacing 0.5`), then kspacing values of a/b/c
   - cpu: for CPUs via Intel, AMD, or Other supported CPU devices
   - gpu: for GPUs via CUDA or ROCm.
 
-  Known limitations: If using the pw basis, the ks_solver must be cg/bpcg/dav to support `gpu` acceleration. If using the lcao basis, `gamma_only` must be set to `1`, as multi-k calculation is currently not supported for `gpu`. lcao_in_pw also does not support `gpu`.
+  Known limitations: `ks_solver` must also be set to the algorithms supported. lcao_in_pw currently does not support `gpu`.
 
 - **Default**: cpu
 

diff --git a/source/Makefile.Objects b/source/Makefile.Objects
@@ -302,6 +302,7 @@ OBJS_HAMILT=hamilt_pw.o\
     meta_pw.o\
     meta_op.o\
     velocity_pw.o\
+    radial_proj.o\
 
 OBJS_HAMILT_OF=kedf_tf.o\
     kedf_vw.o\

diff --git a/source/module_base/cubic_spline.cpp b/source/module_base/cubic_spline.cpp
@@ -56,6 +56,18 @@ CubicSpline::CubicSpline(
 }
 
 
+CubicSpline::CubicSpline(int n, const double* x)
+    : n_spline_(0), n_(n), xmin_(x[0]), xmax_(x[n - 1]), x_(x, x + n)
+{
+}
+
+
+CubicSpline::CubicSpline(int n, double x0, double dx)
+    : n_spline_(0), n_(n), xmin_(x0), xmax_(x0 + (n - 1) * dx), dx_(dx)
+{
+}
+
+
 void CubicSpline::add(
     const double* y,
     const BoundaryCondition& bc_start,

diff --git a/source/module_base/cubic_spline.h b/source/module_base/cubic_spline.h
@@ -81,6 +81,13 @@ namespace ModuleBase
  *      cubspl5.add(y4, {}, {CubicSpline::BoundaryType::second_deriv, 2.0});
  *      cubspl5.add(y5);
  *
+ *      // alternative, one may start with an empty object with knots only:
+ *      // CubicSpline cubspl5(n, x);
+ *      // cubspl5.reserve(5);
+ *      // cubspl5.add(y);
+ *      // cubspl5.add(y2);
+ *      // ...
+ *
  *      // evaluates the five interpolants simultaneously at a single place
  *      cubspl5.multi_eval(x_interp, y_interp)
  *
@@ -223,7 +230,35 @@ class CubicSpline
 
 
     /**
-     * @brief Add an interpolant that shares the same knots.
+     * @brief Builds an empty object with specified knots only.
+     *
+     * An object of this class can hold multiple interpolants with the same knots.
+     * This constructor allows the user to initialize the object with knots only,
+     * so that interpolants can be added later.
+     * 
+     * @param[in]   n               number of knots
+     * @param[in]   x               x coordinates of data points
+     *                              ("knots", must be strictly increasing)
+     *
+     */
+    CubicSpline(int n, const double* x);
+
+    /**
+     * @brief Builds an empty object with specified knots only.
+     *
+     * An object of this class can hold multiple interpolants with the same knots.
+     * This constructor allows the user to initialize the object with knots only,
+     * so that interpolants can be added later.
+     *
+     * @param[in]   n               number of knots
+     * @param[in]   x0              x coordinate of the first data point (first knot)
+     * @param[in]   dx              spacing between knots (must be positive)
+     *
+     */
+    CubicSpline(int n, double x0, double dx);
+
+    /**
+     * @brief Adds an interpolant that shares the same knots.
      *
      * An object of this class can hold multiple interpolants with the same knots.
      * Once constructed, more interpolants sharing the same knots can be added by
@@ -336,6 +371,9 @@ class CubicSpline
     /// last knot
     double xmax() const { return xmax_; }
 
+    /// number of interpolants held by this object
+    int n_spline() const { return n_spline_; }
+
 
 private:
 

diff --git a/source/module_base/formatter.h b/source/module_base/formatter.h
@@ -159,6 +159,10 @@ class FmtTable
     enum class Align{LEFT, RIGHT, CENTER};
 private:
     typedef FmtCore core;
+    struct Alignments{
+        Alignments(const Align& val = Align::RIGHT, const Align& title = Align::CENTER): val_(val), title_(title) {};
+        Align val_, title_; // value and title alignments
+    } aligns_;
     struct Frames{
         Frames(char up = '-', char mid = '-', char dw = '-', char l = ' ', char r = ' '): up_(up), mid_(mid), dw_(dw), l_(l), r_(r) {};
         char up_, mid_ , dw_, l_, r_; // up, middle, down, left, right frames. up: the frame above title, middle: the one between title and data
@@ -167,10 +171,6 @@ class FmtTable
         Delimiters(char h = '-', char v = ' '): h_(h), v_(v) {};
         char h_, v_; // horizontal and vertical delimiters
     } delimiters_;
-    struct Alignments{
-        Alignments(const Align& val = Align::RIGHT, const Align& title = Align::CENTER): val_(val), title_(title) {};
-        Align val_, title_; // value and title alignments
-    } aligns_;
 public:
     /**
      * @brief Construct a new Fmt Table object
@@ -186,7 +186,7 @@ class FmtTable
              const std::vector<std::string>& fmts,
              const Alignments& aligns = {},
              const Frames& frames = {},
-             const Delimiters& delimiters = {}): titles_(titles), fmts_(fmts), data_(nrows, titles.size()), aligns_(aligns), frames_(frames), delimiters_(delimiters)
+             const Delimiters& delimiters = {}): titles_(titles), data_(nrows, titles.size()), fmts_(fmts), aligns_(aligns), frames_(frames), delimiters_(delimiters)
     { assert(titles.size() == fmts.size()); };
     ~FmtTable() {};
     /**
@@ -381,8 +381,8 @@ class FmtTable
     size_t j_ = 0;
 
     std::vector<std::string> titles_;
-    std::vector<std::string> fmts_; // format strings for each column
     NDArray<std::string> data_; // data
+    std::vector<std::string> fmts_; // format strings for each column
 };
 
 #endif
diff --git a/source/module_base/lapack_connector.h b/source/module_base/lapack_connector.h
@@ -174,6 +174,10 @@ extern "C"
 
     // solve a tridiagonal linear system
     void dgtsv_(int* N, int* NRHS, double* DL, double* D, double* DU, double* B, int* LDB, int* INFO);
+
+    // solve Ax = b 
+    void dsysv_(const char* uplo, const int* n, const int* m, double * a, const int* lda,
+                 int *ipiv, double * b, const int* ldb, double *work, const int* lwork ,int *info);
 }
 
 #ifdef GATHER_INFO

diff --git a/source/module_base/module_device/device.cpp b/source/module_base/module_device/device.cpp
@@ -149,8 +149,7 @@ int set_device_by_rank(const MPI_Comm mpi_comm) {
 
 std::string get_device_flag(const std::string &device,
                             const std::string &ks_solver,
-                            const std::string &basis_type,
-                            const bool &gamma_only) {
+                            const std::string &basis_type) {
 if (device == "cpu") {
   return "cpu"; // no extra checks required
 }
@@ -181,10 +180,6 @@ if (basis_type == "lcao_in_pw") {
   error_message +=
       "The GPU currently does not support the basis type \"lcao_in_pw\"!";
 }
-if (basis_type == "lcao" && gamma_only == false) {
-  error_message += "The GPU currently does not support the basis type "
-                    "\"lcao\" with \"gamma_only\" set to \"0\"!";
-}
 if(error_message.empty())
 {
   return "gpu"; // possibly automatically set to GPU

diff --git a/source/module_base/module_device/device.h b/source/module_base/module_device/device.h
@@ -48,8 +48,7 @@ int get_device_kpar(const int& kpar);
  */
 std::string get_device_flag(const std::string& device,
                             const std::string& ks_solver,
-                            const std::string& basis_type,
-                            const bool& gamma_only);
+                            const std::string& basis_type);
 
 #if __MPI
 int get_node_rank();

diff --git a/source/module_base/module_mixing/broyden_mixing.cpp b/source/module_base/module_mixing/broyden_mixing.cpp
@@ -137,41 +137,24 @@ void Broyden_Mixing::tem_cal_coef(const Mixing_Data& mdata, std::function<double
                 }
             }
         }
-        double* work = new double[ndim_cal_dF];
-        int* iwork = new int[ndim_cal_dF];
+        double* work = new double[ndim_cal_dF];   // workspace
+        int* iwork = new int[ndim_cal_dF];   // ipiv
         char uu = 'U';
         int info;
-        dsytrf_(&uu, &ndim_cal_dF, beta_tmp.c, &ndim_cal_dF, iwork, work, &ndim_cal_dF, &info);
-        if (info != 0)
-            ModuleBase::WARNING_QUIT("Charge_Mixing", "Error when factorizing beta.");
-        dsytri_(&uu, &ndim_cal_dF, beta_tmp.c, &ndim_cal_dF, iwork, work, &info);
-        if (info != 0)
-            ModuleBase::WARNING_QUIT("Charge_Mixing", "Error when DSYTRI beta.");
-        for (int i = 0; i < ndim_cal_dF; ++i)
-        {
-            for (int j = i + 1; j < ndim_cal_dF; ++j)
-            {
-                beta_tmp(i, j) = beta_tmp(j, i);
-            }
-        }
+        int m = 1;
+        // gamma means the coeficients for mixing
+        // but now gamma store <dFi|Fm>, namely c
+        std::vector<double> gamma(ndim_cal_dF);
         for (int i = 0; i < ndim_cal_dF; ++i)
         {
             FPTYPE* dFi = FP_dF + i * length;
-            work[i] = inner_product(dFi, FP_F);
+            gamma[i] = inner_product(dFi, FP_F);
         }
-        // gamma[i] = \sum_j beta_tmp(i,j) * work[j]
-        std::vector<double> gamma(ndim_cal_dF);
-        container::BlasConnector::gemv('N',
-                                       ndim_cal_dF,
-                                       ndim_cal_dF,
-                                       1.0,
-                                       beta_tmp.c,
-                                       ndim_cal_dF,
-                                       work,
-                                       1,
-                                       0.0,
-                                       gamma.data(),
-                                       1);
+        // solve aG = c 
+        dsysv_(&uu, &ndim_cal_dF, &m, beta_tmp.c, &ndim_cal_dF, iwork, gamma.data(), &ndim_cal_dF, work, &ndim_cal_dF, &info);
+        if (info != 0)
+            ModuleBase::WARNING_QUIT("Charge_Mixing", "Error when DSYSV.");
+        // after solving, gamma store the coeficients for mixing
         coef[mdata.start] = 1 + gamma[dFindex_move(0)];
         for (int i = 1; i < ndim_cal_dF; ++i)
         {

diff --git a/source/module_base/test/cubic_spline_test.cpp b/source/module_base/test/cubic_spline_test.cpp
@@ -270,19 +270,19 @@ TEST_F(CubicSplineTest, MultiEval)
 
     std::for_each(x_, x_ + n, [&](double& x) { x = xmin + (&x - x_) * dx; });
 
-    std::transform(x_, x_ + n, y_, [this](double x) { return f_[0][0](x); });
-    CubicSpline cubspl(n, xmin, dx, y_,
-                       {BoundaryType::first_deriv, f_[0][1](xmin)},
-                       {BoundaryType::first_deriv, f_[0][1](xmax)});
-
+    // empty interpolant with specified knots
+    CubicSpline cubspl(n, xmin, dx);
     cubspl.reserve(f_.size());
-    for (size_t i = 1; i < f_.size(); ++i)
+
+    for (size_t i = 0; i < f_.size(); ++i)
     {
         std::transform(x_, x_ + n, y_, [this, i](double x) { return f_[i][0](x); });
         cubspl.add(y_, {BoundaryType::first_deriv, f_[i][1](xmin)},
                        {BoundaryType::first_deriv, f_[i][1](xmax)});
     }
 
+    EXPECT_EQ(cubspl.n_spline(), f_.size());
+
     std::vector<std::vector<double>> err_bound(f_.size(), std::vector<double>(3));
     for (size_t i = 0; i < f_.size(); ++i)
     {

diff --git a/source/module_base/timer.cpp b/source/module_base/timer.cpp
@@ -5,7 +5,7 @@
 //==========================================================
 #include "timer.h"
 
-#include <math.h>
+#include <cmath>
 
 #ifdef __MPI
 #include <mpi.h>
@@ -32,20 +32,21 @@ std::map<std::string,std::map<std::string,timer::Timer_One>> timer::timer_pool;
 void timer::finish(std::ofstream &ofs,const bool print_flag)
 {
 	timer::tick("","total");
-	if(print_flag)
+	if(print_flag) {
 		print_all( ofs );
 }
+}
 
 //----------------------------------------------------------
 //
 //----------------------------------------------------------
-void timer::start(void)
+void timer::start()
 {
 	// first init ,then we can use tick
 	timer::tick("","total");
 }
 
-double timer::cpu_time(void)
+double timer::cpu_time()
 {
 //----------------------------------------------------------
 // EXPLAIN : here static is important !!
@@ -72,8 +73,9 @@ void timer::tick(const std::string &class_name,const std::string &name)
 //----------------------------------------------------------
 // EXPLAIN : if timer is disabled , return
 //----------------------------------------------------------
-	if (disabled)
+	if (disabled) {
 		return;
+}
 
 #ifdef _OPENMP
 	if(!omp_get_thread_num())
@@ -130,7 +132,7 @@ void timer::tick(const std::string &class_name,const std::string &name)
 	} // end if(!omp_get_thread_num())
 }
 
-long double timer::print_until_now(void)
+long double timer::print_until_now()
 {
 	// stop the clock
 	timer::tick("","total");
@@ -146,12 +148,14 @@ void timer::write_to_json(std::string file_name)
 	// if mpi is not initialized, we do not run this function
 	int is_initialized = 0;
     MPI_Initialized(&is_initialized);
-	if (!is_initialized)
+	if (!is_initialized) {
 		return;	
+}
 	int my_rank = 0;
 	MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
-	if (my_rank != 0)
+	if (my_rank != 0) {
 		return;
+}
 #endif
 
     // check if a double is inf, if so, return "null", else return a string of the input double
@@ -268,7 +272,13 @@ void timer::print_all(std::ofstream &ofs)
 		times.push_back(timer_one.cpu_second);
 		calls.push_back(timer_one.calls);
 		avgs.push_back(timer_one.cpu_second/timer_one.calls);
-		pers.push_back(timer_one.cpu_second / timer_pool_order[0].second.cpu_second * 100);
+
+		// if the total time is too small, we do not calculate the percentage
+		if (timer_pool_order[0].second.cpu_second < 1e-9) {
+    		pers.push_back(0);
+		} else {
+    		pers.push_back(timer_one.cpu_second / timer_pool_order[0].second.cpu_second * 100);
+		}
 	}
 	assert(class_names.size() == names.size());
 	assert(class_names.size() == times.size());