From 9adb190d713c401a42604ad0d344da81ecdef9b8 Mon Sep 17 00:00:00 2001 From: Duo <50307526+iProzd@users.noreply.github.com> Date: Tue, 26 Nov 2024 10:05:59 +0800 Subject: [PATCH 01/13] chore(pt): update multitask example (#4419) ## Summary by CodeRabbit - **New Features** - Updated multi-task model configuration with a new descriptor for enhanced representation learning. - Introduced additional parameters for model initialization and attention mechanisms. - **Bug Fixes** - Replaced outdated descriptor references in model configurations to ensure compatibility with new settings. (cherry picked from commit e7ad8dca6ea9d0fcaf83fcee231a1f1358443169) --- .../pytorch_example/input_torch.json | 69 ++++++++++++++----- 1 file changed, 50 insertions(+), 19 deletions(-) diff --git a/examples/water_multi_task/pytorch_example/input_torch.json b/examples/water_multi_task/pytorch_example/input_torch.json index 04d848538d..f0d9a67ea5 100644 --- a/examples/water_multi_task/pytorch_example/input_torch.json +++ b/examples/water_multi_task/pytorch_example/input_torch.json @@ -6,23 +6,54 @@ "O", "H" ], - "sea_descriptor_1": { - "type": "se_e2_a", - "sel": [ - 46, - 92 - ], - "rcut_smth": 0.50, - "rcut": 6.00, - "neuron": [ - 25, - 50, - 100 - ], - "resnet_dt": false, - "axis_neuron": 16, - "type_one_side": true, - "seed": 1, + "dpa2_descriptor": { + "type": "dpa2", + "repinit": { + "tebd_dim": 8, + "rcut": 6.0, + "rcut_smth": 0.5, + "nsel": 120, + "neuron": [ + 25, + 50, + 100 + ], + "axis_neuron": 12, + "activation_function": "tanh", + "three_body_sel": 48, + "three_body_rcut": 4.0, + "three_body_rcut_smth": 3.5, + "use_three_body": true + }, + "repformer": { + "rcut": 4.0, + "rcut_smth": 3.5, + "nsel": 48, + "nlayers": 6, + "g1_dim": 128, + "g2_dim": 32, + "attn2_hidden": 32, + "attn2_nhead": 4, + "attn1_hidden": 128, + "attn1_nhead": 4, + "axis_neuron": 4, + "update_h2": false, + "update_g1_has_conv": true, + "update_g1_has_grrg": true, + "update_g1_has_drrd": true, + "update_g1_has_attn": false, + "update_g2_has_g1g1": false, + "update_g2_has_attn": true, + "update_style": "res_residual", + "update_residual": 0.01, + "update_residual_init": "norm", + "attn2_has_gate": true, + "use_sqrt_nnei": true, + "g1_out_conv": true, + "g1_out_mlp": true + }, + "precision": "float64", + "add_tebd_to_repinit_out": false, "_comment": " that's all" }, "_comment": "that's all" @@ -30,7 +61,7 @@ "model_dict": { "water_1": { "type_map": "type_map_all", - "descriptor": "sea_descriptor_1", + "descriptor": "dpa2_descriptor", "fitting_net": { "neuron": [ 240, @@ -44,7 +75,7 @@ }, "water_2": { "type_map": "type_map_all", - "descriptor": "sea_descriptor_1", + "descriptor": "dpa2_descriptor", "fitting_net": { "neuron": [ 240, From c9315ffbf40eba6dee7b8e2dce1648f3ed2b367a Mon Sep 17 00:00:00 2001 From: Anyang Peng <137014849+anyangml@users.noreply.github.com> Date: Wed, 27 Nov 2024 02:14:18 +0800 Subject: [PATCH 02/13] Fix: add model_def_script to ZBL (#4423) ## Summary by CodeRabbit - **New Features** - Enhanced model definition handling for improved encapsulation and consistency across different model types. - **Bug Fixes** - Ensured that model definition scripts are correctly set to a JSON string representation for all model instances. 
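The underlying pattern is construct-then-annotate: build the model object, attach the serialized configuration, then return it. A minimal sketch of the pattern, using a hypothetical `ToyModel` in place of the real `DPZBLModel`:

```python
import json


class ToyModel:
    """Hypothetical stand-in for DPZBLModel; only the attribute matters here."""

    model_def_script: str = ""


def get_toy_model(model_params: dict) -> ToyModel:
    model = ToyModel()
    # Keep an exact JSON copy of the parameters used to build the model,
    # so a frozen/serialized model can later reproduce its own definition.
    model.model_def_script = json.dumps(model_params)
    return model


assert json.loads(get_toy_model({"sw_rmin": 1.0}).model_def_script) == {"sw_rmin": 1.0}
```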
(cherry picked from commit f343a3b212edab5525502e0261f3068c0b6fb1f6) --- deepmd/pt/model/model/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/deepmd/pt/model/model/__init__.py b/deepmd/pt/model/model/__init__.py index a46909622b..f2e03fb99e 100644 --- a/deepmd/pt/model/model/__init__.py +++ b/deepmd/pt/model/model/__init__.py @@ -196,7 +196,7 @@ def get_zbl_model(model_params): rmax = model_params["sw_rmax"] atom_exclude_types = model_params.get("atom_exclude_types", []) pair_exclude_types = model_params.get("pair_exclude_types", []) - return DPZBLModel( + model = DPZBLModel( dp_model, pt_model, rmin, @@ -205,6 +205,8 @@ def get_zbl_model(model_params): atom_exclude_types=atom_exclude_types, pair_exclude_types=pair_exclude_types, ) + model.model_def_script = json.dumps(model_params) + return model def _can_be_converted_to_float(value) -> Optional[bool]: From ed80fc02961f00347e37c80b45e3006a47cdafda Mon Sep 17 00:00:00 2001 From: Anyang Peng <137014849+anyangml@users.noreply.github.com> Date: Wed, 27 Nov 2024 15:02:31 +0800 Subject: [PATCH 03/13] Feat: add pairtab compression (#4432) ## Summary by CodeRabbit - **New Features** - Introduced a new method `enable_compression` in the PairTabAtomicModel class, indicating that the model does not support compression settings. - **Documentation** - Added docstring for the `enable_compression` method to clarify its purpose. --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> (cherry picked from commit 3cdf407b0b811ae6ee2d6a1168113ba15de04b8d) --- deepmd/dpmodel/atomic_model/pairtab_atomic_model.py | 11 +++++++++++ deepmd/pt/model/atomic_model/pairtab_atomic_model.py | 11 +++++++++++ 2 files changed, 22 insertions(+) diff --git a/deepmd/dpmodel/atomic_model/pairtab_atomic_model.py b/deepmd/dpmodel/atomic_model/pairtab_atomic_model.py index aefdbf7f1c..a4bffe508d 100644 --- a/deepmd/dpmodel/atomic_model/pairtab_atomic_model.py +++ b/deepmd/dpmodel/atomic_model/pairtab_atomic_model.py @@ -425,3 +425,14 @@ def is_aparam_nall(self) -> bool: If False, the shape is (nframes, nloc, ndim). """ return False + + def enable_compression( + self, + min_nbor_dist: float, + table_extrapolate: float = 5, + table_stride_1: float = 0.01, + table_stride_2: float = 0.1, + check_frequency: int = -1, + ) -> None: + """Pairtab model does not support compression.""" + pass diff --git a/deepmd/pt/model/atomic_model/pairtab_atomic_model.py b/deepmd/pt/model/atomic_model/pairtab_atomic_model.py index c535808d83..0d3b2c0c41 100644 --- a/deepmd/pt/model/atomic_model/pairtab_atomic_model.py +++ b/deepmd/pt/model/atomic_model/pairtab_atomic_model.py @@ -484,3 +484,14 @@ def is_aparam_nall(self) -> bool: If False, the shape is (nframes, nloc, ndim). """ return False + + def enable_compression( + self, + min_nbor_dist: float, + table_extrapolate: float = 5, + table_stride_1: float = 0.01, + table_stride_2: float = 0.1, + check_frequency: int = -1, + ) -> None: + """Pairtab model does not support compression.""" + pass From db3d48e54e186b997fffdd540de3a8f80fdd6409 Mon Sep 17 00:00:00 2001 From: Chun Cai Date: Thu, 28 Nov 2024 02:00:30 +0800 Subject: [PATCH 04/13] Perf: print summary on rank 0 (#4434) ## Summary by CodeRabbit - **Bug Fixes** - Adjusted the summary printing functionality to ensure it only executes from the main process in distributed settings, preventing duplicate outputs. 
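The guard used here generalizes to any one-time console output in a distributed (DDP) run; `dist.get_rank()` is only valid once the process group is initialized, hence the fallback to rank 0. A minimal sketch (the `log_once` helper is illustrative, not part of DeePMD-kit):

```python
import torch.distributed as dist


def log_once(msg: str) -> None:
    # In a single-process run the process group is never initialized,
    # so fall back to treating the process as rank 0.
    rank = dist.get_rank() if dist.is_initialized() else 0
    if rank == 0:
        print(msg)


log_once("data summary printed exactly once")
```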
(cherry picked from commit 4b92b6d8a57a543f8d826e6c87dd4dac6b722566) --- deepmd/pt/utils/dataloader.py | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/deepmd/pt/utils/dataloader.py b/deepmd/pt/utils/dataloader.py index 9920622792..2fea6b72d2 100644 --- a/deepmd/pt/utils/dataloader.py +++ b/deepmd/pt/utils/dataloader.py @@ -185,19 +185,21 @@ def print_summary( name: str, prob: list[float], ) -> None: - print_summary( - name, - len(self.systems), - [ss.system for ss in self.systems], - [ss._natoms for ss in self.systems], - self.batch_sizes, - [ - ss._data_system.get_sys_numb_batch(self.batch_sizes[ii]) - for ii, ss in enumerate(self.systems) - ], - prob, - [ss._data_system.pbc for ss in self.systems], - ) + rank = dist.get_rank() if dist.is_initialized() else 0 + if rank == 0: + print_summary( + name, + len(self.systems), + [ss.system for ss in self.systems], + [ss._natoms for ss in self.systems], + self.batch_sizes, + [ + ss._data_system.get_sys_numb_batch(self.batch_sizes[ii]) + for ii, ss in enumerate(self.systems) + ], + prob, + [ss._data_system.pbc for ss in self.systems], + ) _sentinel = object() From 69a162827f93b092f475f1c96fc8345be9db6818 Mon Sep 17 00:00:00 2001 From: Chun Cai Date: Thu, 28 Nov 2024 06:08:42 +0800 Subject: [PATCH 05/13] perf: optimize training loop (#4426) Improvements to the training process: * [`deepmd/pt/train/training.py`](diffhunk://#diff-a90c90dc0e6a17fbe2e930f91182805b83260484c9dc1cfac3331378ffa34935R659): Added a check to skip setting the model to training mode if it already is. The profiling result shows it takes some time to recursively set it to all models. * [`deepmd/pt/train/training.py`](diffhunk://#diff-a90c90dc0e6a17fbe2e930f91182805b83260484c9dc1cfac3331378ffa34935L686-L690): Modified the gradient clipping function to include the `error_if_nonfinite` parameter, and removed the manual check for non-finite gradients and the associated exception raising. ## Summary by CodeRabbit - **New Features** - Improved training loop with enhanced error handling and control flow. - Updated gradient clipping logic for better error detection. - Refined logging functionality for training and validation results. - **Bug Fixes** - Prevented redundant training calls by adding conditional checks. - **Documentation** - Clarified method logic in the `Trainer` class without changing method signatures. 
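With `error_if_nonfinite=True`, `clip_grad_norm_` itself raises `RuntimeError` when the total gradient norm is NaN/Inf, which replaces the hand-rolled `torch.isfinite` check and its `FloatingPointError`. A self-contained sketch on a toy network (not the DeePMD-kit trainer):

```python
import torch

net = torch.nn.Linear(4, 1)
net(torch.randn(2, 4)).sum().backward()
net.weight.grad[0, 0] = float("nan")  # inject a non-finite gradient

try:
    torch.nn.utils.clip_grad_norm_(
        net.parameters(), max_norm=1.0, error_if_nonfinite=True
    )
except RuntimeError as err:
    print(f"non-finite gradients detected: {err}")
```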
---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
(cherry picked from commit 037cf3f3add2a24ae7c5ecc412c8f1c7669a21ea)
---
 deepmd/pt/train/training.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/deepmd/pt/train/training.py b/deepmd/pt/train/training.py
index f74c4769bf..af6e48191d 100644
--- a/deepmd/pt/train/training.py
+++ b/deepmd/pt/train/training.py
@@ -656,7 +656,6 @@ def step(_step_id, task_key="Default") -> None:
             # PyTorch Profiler
             if self.enable_profiler or self.profiling:
                 prof.step()
-            self.wrapper.train()
             if isinstance(self.lr_exp, dict):
                 _lr = self.lr_exp[task_key]
             else:
@@ -682,12 +681,11 @@ def step(_step_id, task_key="Default") -> None:
             )
             loss.backward()
             if self.gradient_max_norm > 0.0:
-                grad_norm = torch.nn.utils.clip_grad_norm_(
-                    self.wrapper.parameters(), self.gradient_max_norm
+                torch.nn.utils.clip_grad_norm_(
+                    self.wrapper.parameters(),
+                    self.gradient_max_norm,
+                    error_if_nonfinite=True,
                 )
-                if not torch.isfinite(grad_norm).all():
-                    # check local gradnorm single GPU case, trigger NanDetector
-                    raise FloatingPointError("gradients are Nan/Inf")
             with torch.device("cpu"):
                 self.optimizer.step()
                 self.scheduler.step()
@@ -766,7 +764,7 @@ def fake_model():
         if self.display_in_training and (
             display_step_id % self.disp_freq == 0 or display_step_id == 1
         ):
-            self.wrapper.eval()
+            self.wrapper.eval()  # Will set to train mode before finishing validation

             def log_loss_train(_loss, _more_loss, _task_key="Default"):
                 results = {}
@@ -872,6 +870,7 @@ def log_loss_valid(_task_key="Default"):
                     learning_rate=None,
                 )
             )
+            self.wrapper.train()

         current_time = time.time()
         train_time = current_time - self.t0
@@ -927,6 +926,7 @@ def log_loss_valid(_task_key="Default"):
                         f"{task_key}/{item}", more_loss[item], display_step_id
                     )

+        self.wrapper.train()
         self.t0 = time.time()
         self.total_train_time = 0.0
         for step_id in range(self.num_steps):

From db3d48e54e186b997fffdd540de3a8f80fdd6409 Mon Sep 17 00:00:00 2001
From: Chun Cai
Date: Sat, 30 Nov 2024 11:05:20 +0800
Subject: [PATCH 06/13] chore: refactor training loop (#4435)

## Summary by CodeRabbit

- **New Features**
  - Enhanced the training loop to support multi-task training, allowing for more flexible model selection.
- **Improvements**
  - Streamlined the `step` function to accept only the step ID, simplifying its usage.
  - Adjusted logging and model-saving mechanisms for consistency with the new training flow.
  - Improved random seed management for enhanced reproducibility in data processing.
  - Enhanced error handling in data retrieval to ensure seamless operation during data loading.
  - Added type hints for better clarity in data loader attributes.
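After the refactor, the task for each step is drawn inside `step` itself by weighted sampling over the model keys. A standalone sketch of that selection, using plain NumPy in place of DeePMD-kit's `dp_random` wrapper; the task names and probabilities are illustrative (borrowed from the multi-task example in the first patch):

```python
import numpy as np

rng = np.random.default_rng(seed=1)  # seeding keeps the task schedule reproducible
model_keys = ["water_1", "water_2"]
model_prob = np.array([0.7, 0.3])  # must sum to 1


def pick_task() -> str:
    model_index = rng.choice(len(model_keys), p=model_prob)
    return model_keys[model_index]


print([pick_task() for _ in range(5)])
```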
--------- Signed-off-by: Chun Cai Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> (cherry picked from commit 03c6e49f4718f2dcb1bc9eadccc4e3c3f3b54e09) --- deepmd/pt/train/training.py | 26 ++++++++------------------ deepmd/pt/utils/dataloader.py | 2 ++ 2 files changed, 10 insertions(+), 18 deletions(-) diff --git a/deepmd/pt/train/training.py b/deepmd/pt/train/training.py index af6e48191d..ccdefcdafb 100644 --- a/deepmd/pt/train/training.py +++ b/deepmd/pt/train/training.py @@ -653,6 +653,12 @@ def run(self) -> None: prof.start() def step(_step_id, task_key="Default") -> None: + if self.multi_task: + model_index = dp_random.choice( + np.arange(self.num_model, dtype=np.int_), + p=self.model_prob, + ) + task_key = self.model_keys[model_index] # PyTorch Profiler if self.enable_profiler or self.profiling: prof.step() @@ -929,24 +935,8 @@ def log_loss_valid(_task_key="Default"): self.wrapper.train() self.t0 = time.time() self.total_train_time = 0.0 - for step_id in range(self.num_steps): - if step_id < self.start_step: - continue - if self.multi_task: - chosen_index_list = dp_random.choice( - np.arange( - self.num_model, dtype=np.int32 - ), # int32 should be enough for # models... - p=np.array(self.model_prob), - size=self.world_size, - replace=True, - ) - assert chosen_index_list.size == self.world_size - model_index = chosen_index_list[self.rank] - model_key = self.model_keys[model_index] - else: - model_key = "Default" - step(step_id, model_key) + for step_id in range(self.start_step, self.num_steps): + step(step_id) if JIT: break diff --git a/deepmd/pt/utils/dataloader.py b/deepmd/pt/utils/dataloader.py index 2fea6b72d2..d33b17b035 100644 --- a/deepmd/pt/utils/dataloader.py +++ b/deepmd/pt/utils/dataloader.py @@ -28,6 +28,7 @@ ) from deepmd.pt.utils import ( + dp_random, env, ) from deepmd.pt.utils.dataset import ( @@ -50,6 +51,7 @@ def setup_seed(seed) -> None: torch.manual_seed(seed) torch.cuda.manual_seed_all(seed) torch.backends.cudnn.deterministic = True + dp_random.seed(seed) class DpLoaderSet(Dataset): From 5420de763b7411e1feace0a6d8d8064ab68a4983 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Sun, 1 Dec 2024 21:48:10 -0500 Subject: [PATCH 07/13] fix(tf): pass `type_one_side` & `exclude_types` to `DPTabulate` in `se_r` (#4446) Fix #4445. * Modify `DPTabulate` instance creation to include `self.type_one_side` and `self.exclude_types` ## Summary by CodeRabbit - **New Features** - Enhanced configurability for the `DescrptSeR` class, allowing users to customize compression behavior with new parameters. - Introduced optional parameters for improved management of atom types and interactions during the embedding process. - **Bug Fixes** - Added validation for excluded types to ensure proper handling within the compression logic. 
(cherry picked from commit 9b70351327b8657ca198053720fb6931611659df)
---
 deepmd/tf/descriptor/se_r.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/deepmd/tf/descriptor/se_r.py b/deepmd/tf/descriptor/se_r.py
index d74b6df9ee..4cc867cf0d 100644
--- a/deepmd/tf/descriptor/se_r.py
+++ b/deepmd/tf/descriptor/se_r.py
@@ -356,6 +356,8 @@ def enable_compression(
             self.filter_neuron,
             graph,
             graph_def,
+            type_one_side=self.type_one_side,
+            exclude_types=self.exclude_types,
             activation_fn=self.filter_activation_fn,
             suffix=suffix,
         )

From 24232a9b3500b9ec6e41891400055d3c1b1a285e Mon Sep 17 00:00:00 2001
From: Chun Cai
Date: Tue, 3 Dec 2024 03:39:56 +0800
Subject: [PATCH 08/13] Perf: remove redundant checks on data integrity (#4433)

Systems are aggregated here
https://github.com/deepmodeling/deepmd-kit/blob/f343a3b212edab5525502e0261f3068c0b6fb1f6/deepmd/utils/data_system.py#L802
and later initialized here
https://github.com/deepmodeling/deepmd-kit/blob/f343a3b212edab5525502e0261f3068c0b6fb1f6/deepmd/utils/data_system.py#L809-L810

This process instantiates the `DeepmdData` class, which performs data
integrity checks:
https://github.com/deepmodeling/deepmd-kit/blob/e695a91ca6f7a1c9c830ab1c58b7b7a05db3da23/deepmd/utils/data.py#L80-L82

Moreover, the checking process enumerates all items on all ranks, which is
unnecessary and quite slow, so this PR removes the redundant check.

## Summary by CodeRabbit

- **New Features**
  - Enhanced flexibility in defining test sizes by allowing percentage input for the `test_size` parameter.
  - Introduced a new method to automatically compute test sizes based on the specified percentage of total data.
  - Improved path handling to accept both string and Path inputs, enhancing usability.
- **Bug Fixes**
  - Improved error handling for invalid paths, ensuring users receive clear feedback when files are not found.
- **Deprecation Notice**
  - The `get_test` method is now deprecated, with new logic implemented for loading test data when necessary.
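The integrity check thus moves from a global pre-scan of every system into the constructor, so an invalid system still fails fast, but only when it is first opened. A condensed sketch of the relocated check, using plain `pathlib` instead of the `DPPath` abstraction:

```python
from pathlib import Path
from typing import Union


def open_system(sys_path: Union[str, Path]) -> Path:
    root = Path(sys_path)  # accepts both str and Path, as in the patched DPOSPath
    if not root.is_dir():
        raise FileNotFoundError(f"System {sys_path} is not found!")
    return root
```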
--------- Signed-off-by: Chun Cai Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Jinzhe Zeng Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> (cherry picked from commit 3917cf0be777f548b9bc4da63af97a78e8e9e952) --- deepmd/utils/data.py | 2 ++ deepmd/utils/data_system.py | 22 ++-------------------- deepmd/utils/path.py | 12 ++++++------ 3 files changed, 10 insertions(+), 26 deletions(-) diff --git a/deepmd/utils/data.py b/deepmd/utils/data.py index 493a9d8d54..39af73cab3 100644 --- a/deepmd/utils/data.py +++ b/deepmd/utils/data.py @@ -60,6 +60,8 @@ def __init__( ) -> None: """Constructor.""" root = DPPath(sys_path) + if not root.is_dir(): + raise FileNotFoundError(f"System {sys_path} is not found!") self.dirs = root.glob(set_prefix + ".*") if not len(self.dirs): raise FileNotFoundError(f"No {set_prefix}.* is found in {sys_path}") diff --git a/deepmd/utils/data_system.py b/deepmd/utils/data_system.py index 0e960d0ba1..445461b387 100644 --- a/deepmd/utils/data_system.py +++ b/deepmd/utils/data_system.py @@ -28,9 +28,6 @@ from deepmd.utils.out_stat import ( compute_stats_from_redu, ) -from deepmd.utils.path import ( - DPPath, -) log = logging.getLogger(__name__) @@ -103,6 +100,8 @@ def __init__( del rcut self.system_dirs = systems self.nsystems = len(self.system_dirs) + if self.nsystems <= 0: + raise ValueError("No systems provided") self.data_systems = [] for ii in self.system_dirs: self.data_systems.append( @@ -755,23 +754,6 @@ def process_systems(systems: Union[str, list[str]]) -> list[str]: systems = expand_sys_str(systems) elif isinstance(systems, list): systems = systems.copy() - help_msg = "Please check your setting for data systems" - # check length of systems - if len(systems) == 0: - msg = "cannot find valid a data system" - log.fatal(msg) - raise OSError(msg, help_msg) - # roughly check all items in systems are valid - for ii in systems: - ii = DPPath(ii) - if not ii.is_dir(): - msg = f"dir {ii} is not a valid dir" - log.fatal(msg) - raise OSError(msg, help_msg) - if not (ii / "type.raw").is_file(): - msg = f"dir {ii} is not a valid data system dir" - log.fatal(msg) - raise OSError(msg, help_msg) return systems diff --git a/deepmd/utils/path.py b/deepmd/utils/path.py index 6c52caac1d..c542ccf661 100644 --- a/deepmd/utils/path.py +++ b/deepmd/utils/path.py @@ -14,6 +14,7 @@ from typing import ( ClassVar, Optional, + Union, ) import h5py @@ -157,19 +158,16 @@ class DPOSPath(DPPath): Parameters ---------- - path : str + path : Union[str, Path] path mode : str, optional mode, by default "r" """ - def __init__(self, path: str, mode: str = "r") -> None: + def __init__(self, path: Union[str, Path], mode: str = "r") -> None: super().__init__() self.mode = mode - if isinstance(path, Path): - self.path = path - else: - self.path = Path(path) + self.path = Path(path) def load_numpy(self) -> np.ndarray: """Load NumPy array. @@ -300,6 +298,8 @@ def __init__(self, path: str, mode: str = "r") -> None: # so we do not support file names containing #... 
         s = path.split("#")
         self.root_path = s[0]
+        if not os.path.isfile(self.root_path):
+            raise FileNotFoundError(f"{self.root_path} not found")
         self.root = self._load_h5py(s[0], mode)
         # h5 path: default is the root path
         self._name = s[1] if len(s) > 1 else "/"

From f9d24fae96b28c930fd6c3e19791c8158577e1e3 Mon Sep 17 00:00:00 2001
From: Chun Cai
Date: Wed, 18 Dec 2024 06:37:03 +0800
Subject: [PATCH 09/13] Perf: use fused Adam optimizer (#4463)

This PR sets the Adam optimizer to use the `fused=True` parameter.

For the profiling result shown below, this modification brings a 2.75x
speedup of the optimizer update (22 ms vs. 8 ms) and a ~3% overall speedup
(922 ms vs. 892 ms). The benchmark case is training a DPA-2 Q3 release
model. Please note that the absolute time may differ between steps.
Before

![image](https://github.com/user-attachments/assets/d6b05a1d-6e6c-478d-921f-c497718bc551)

After

![image](https://github.com/user-attachments/assets/b216b919-094c-441f-96a7-146e1e3db483)

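Enabling the fused path is a one-line change wherever the optimizer is constructed; a minimal sketch (assuming the parameters live on CUDA, which the fused implementation requires):

```python
import torch

model = torch.nn.Linear(8, 8).cuda()
# fused=True collapses the elementwise Adam update into fused CUDA kernels,
# replacing the default per-tensor (or foreach) implementation.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, fused=True)
```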
[Ref](https://pytorch.org/docs/stable/generated/torch.optim.Adam.html): > The foreach and fused implementations are typically faster than the for-loop, single-tensor implementation, with **fused being theoretically fastest** with both vertical and horizontal fusion. As such, if the user has not specified either flag (i.e., when foreach = fused = None), we will attempt defaulting to the foreach implementation when the tensors are all on CUDA. Why not fused? Since the fused implementation is relatively new, we want to give it sufficient bake-in time. ## Summary by CodeRabbit - **Bug Fixes** - Improved optimizer performance during training by modifying the initialization of the Adam optimizer. - **Documentation** - Updated method signature for clarity in the `Trainer` class. (cherry picked from commit 104fc365ed8d6cef0c0583be755cc4c0e961cbe9) --- deepmd/pt/train/training.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepmd/pt/train/training.py b/deepmd/pt/train/training.py index ccdefcdafb..0feb7fbbd2 100644 --- a/deepmd/pt/train/training.py +++ b/deepmd/pt/train/training.py @@ -579,7 +579,7 @@ def warm_up_linear(step, warmup_steps): # author: iProzd if self.opt_type == "Adam": self.optimizer = torch.optim.Adam( - self.wrapper.parameters(), lr=self.lr_exp.start_lr + self.wrapper.parameters(), lr=self.lr_exp.start_lr, fused=True ) if optimizer_state_dict is not None and self.restart_training: self.optimizer.load_state_dict(optimizer_state_dict) From 865025525a1aa64afb1dbbd95f798e4ea241013b Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Sun, 22 Dec 2024 01:46:47 -0500 Subject: [PATCH 10/13] docs: update DPA-2 citation (#4483) ## Summary by CodeRabbit - **New Features** - Updated references in the bibliography for the DPA-2 model to include a new article entry for 2024. - Added a new reference for an attention-based descriptor. - **Bug Fixes** - Corrected reference links in documentation to point to updated DOI links instead of arXiv. - **Documentation** - Revised entries in the credits and model documentation to reflect the latest citations and details. - Enhanced clarity and detail in fine-tuning documentation for TensorFlow and PyTorch implementations. --------- Signed-off-by: Jinzhe Zeng (cherry picked from commit deaeec9c9b1f51c9b306724eb6e8d195755ac8dd) --- CITATIONS.bib | 32 +++++++++++++++--------------- deepmd/dpmodel/descriptor/dpa2.py | 7 ++++++- deepmd/pt/model/descriptor/dpa2.py | 7 ++++++- doc/credits.rst | 2 +- doc/model/dpa2.md | 2 +- doc/train/finetuning.md | 2 +- doc/train/multi-task-training.md | 2 +- 7 files changed, 32 insertions(+), 22 deletions(-) diff --git a/CITATIONS.bib b/CITATIONS.bib index d5524a14f6..52c8045bf3 100644 --- a/CITATIONS.bib +++ b/CITATIONS.bib @@ -128,26 +128,26 @@ @article{Zhang_NpjComputMater_2024_v10_p94 doi = {10.1038/s41524-024-01278-7}, } -@misc{Zhang_2023_DPA2, +@article{Zhang_npjComputMater_2024_v10_p293, annote = {DPA-2}, author = { Duo Zhang and Xinzijian Liu and Xiangyu Zhang and Chengqian Zhang and Chun - Cai and Hangrui Bi and Yiming Du and Xuejian Qin and Jiameng Huang and - Bowen Li and Yifan Shan and Jinzhe Zeng and Yuzhi Zhang and Siyuan Liu and - Yifan Li and Junhan Chang and Xinyan Wang and Shuo Zhou and Jianchuan Liu - and Xiaoshan Luo and Zhenyu Wang and Wanrun Jiang and Jing Wu and Yudi Yang - and Jiyuan Yang and Manyi Yang and Fu-Qiang Gong and Linshuang Zhang and - Mengchao Shi and Fu-Zhi Dai and Darrin M. 
York and Shi Liu and Tong Zhu and - Zhicheng Zhong and Jian Lv and Jun Cheng and Weile Jia and Mohan Chen and - Guolin Ke and Weinan E and Linfeng Zhang and Han Wang + Cai and Hangrui Bi and Yiming Du and Xuejian Qin and Anyang Peng and + Jiameng Huang and Bowen Li and Yifan Shan and Jinzhe Zeng and Yuzhi Zhang + and Siyuan Liu and Yifan Li and Junhan Chang and Xinyan Wang and Shuo Zhou + and Jianchuan Liu and Xiaoshan Luo and Zhenyu Wang and Wanrun Jiang and + Jing Wu and Yudi Yang and Jiyuan Yang and Manyi Yang and Fu-Qiang Gong and + Linshuang Zhang and Mengchao Shi and Fu-Zhi Dai and Darrin M. York and Shi + Liu and Tong Zhu and Zhicheng Zhong and Jian Lv and Jun Cheng and Weile Jia + and Mohan Chen and Guolin Ke and Weinan E and Linfeng Zhang and Han Wang }, - title = { - {DPA-2: Towards a universal large atomic model for molecular and material - simulation} - }, - publisher = {arXiv}, - year = 2023, - doi = {10.48550/arXiv.2312.15492}, + title = {{DPA-2: a large atomic model as a multi-task learner}}, + journal = {npj Comput. Mater}, + year = 2024, + volume = 10, + number = 1, + pages = 293, + doi = {10.1038/s41524-024-01493-2}, } @article{Zhang_PhysPlasmas_2020_v27_p122704, diff --git a/deepmd/dpmodel/descriptor/dpa2.py b/deepmd/dpmodel/descriptor/dpa2.py index e4cadb7b36..55ae331593 100644 --- a/deepmd/dpmodel/descriptor/dpa2.py +++ b/deepmd/dpmodel/descriptor/dpa2.py @@ -387,7 +387,7 @@ def __init__( use_tebd_bias: bool = False, type_map: Optional[list[str]] = None, ) -> None: - r"""The DPA-2 descriptor. see https://arxiv.org/abs/2312.15492. + r"""The DPA-2 descriptor[1]_. Parameters ---------- @@ -434,6 +434,11 @@ def __init__( sw: torch.Tensor The switch function for decaying inverse distance. + References + ---------- + .. [1] Zhang, D., Liu, X., Zhang, X. et al. DPA-2: a + large atomic model as a multi-task learner. npj + Comput Mater 10, 293 (2024). https://doi.org/10.1038/s41524-024-01493-2 """ def init_subclass_params(sub_data, sub_class): diff --git a/deepmd/pt/model/descriptor/dpa2.py b/deepmd/pt/model/descriptor/dpa2.py index ebad588e32..e28c3af46a 100644 --- a/deepmd/pt/model/descriptor/dpa2.py +++ b/deepmd/pt/model/descriptor/dpa2.py @@ -100,7 +100,7 @@ def __init__( use_tebd_bias: bool = False, type_map: Optional[list[str]] = None, ) -> None: - r"""The DPA-2 descriptor. see https://arxiv.org/abs/2312.15492. + r"""The DPA-2 descriptor[1]_. Parameters ---------- @@ -147,6 +147,11 @@ def __init__( sw: torch.Tensor The switch function for decaying inverse distance. + References + ---------- + .. [1] Zhang, D., Liu, X., Zhang, X. et al. DPA-2: a + large atomic model as a multi-task learner. npj + Comput Mater 10, 293 (2024). https://doi.org/10.1038/s41524-024-01493-2 """ super().__init__() diff --git a/doc/credits.rst b/doc/credits.rst index 1b39dc1e0e..059746ee0b 100644 --- a/doc/credits.rst +++ b/doc/credits.rst @@ -54,7 +54,7 @@ Cite DeePMD-kit and methods .. bibliography:: :filter: False - Zhang_2023_DPA2 + Zhang_npjComputMater_2024_v10_p293 - If frame-specific parameters (`fparam`, e.g. electronic temperature) is used, diff --git a/doc/model/dpa2.md b/doc/model/dpa2.md index eb641d6b01..300876bf05 100644 --- a/doc/model/dpa2.md +++ b/doc/model/dpa2.md @@ -4,7 +4,7 @@ **Supported backends**: PyTorch {{ pytorch_icon }}, JAX {{ jax_icon }}, DP {{ dpmodel_icon }} ::: -The DPA-2 model implementation. See https://arxiv.org/abs/2312.15492 for more details. +The DPA-2 model implementation. See https://doi.org/10.1038/s41524-024-01493-2 for more details. 
Training example: `examples/water/dpa2/input_torch_medium.json`, see [README](../../examples/water/dpa2/README.md) for inputs in different levels. diff --git a/doc/train/finetuning.md b/doc/train/finetuning.md index cf2f5fde4f..04d86cfc98 100644 --- a/doc/train/finetuning.md +++ b/doc/train/finetuning.md @@ -94,7 +94,7 @@ The model section will be overwritten (except the `type_map` subsection) by that #### Fine-tuning from a multi-task pre-trained model -Additionally, within the PyTorch implementation and leveraging the flexibility offered by the framework and the multi-task training process proposed in DPA2 [paper](https://arxiv.org/abs/2312.15492), +Additionally, within the PyTorch implementation and leveraging the flexibility offered by the framework and the multi-task training process proposed in DPA2 [paper](https://doi.org/10.1038/s41524-024-01493-2), we also support more general multitask pre-trained models, which includes multiple datasets for pre-training. These pre-training datasets share a common descriptor while maintaining their individual fitting nets, as detailed in the paper above. diff --git a/doc/train/multi-task-training.md b/doc/train/multi-task-training.md index 9d5b71592e..dbb43fc0e6 100644 --- a/doc/train/multi-task-training.md +++ b/doc/train/multi-task-training.md @@ -26,7 +26,7 @@ and the Adam optimizer is executed to minimize $L^{(t)}$ for one step to update In the case of multi-GPU parallel training, different GPUs will independently select their tasks. In the DPA-2 model, this multi-task training framework is adopted.[^1] -[^1]: Duo Zhang, Xinzijian Liu, Xiangyu Zhang, Chengqian Zhang, Chun Cai, Hangrui Bi, Yiming Du, Xuejian Qin, Jiameng Huang, Bowen Li, Yifan Shan, Jinzhe Zeng, Yuzhi Zhang, Siyuan Liu, Yifan Li, Junhan Chang, Xinyan Wang, Shuo Zhou, Jianchuan Liu, Xiaoshan Luo, Zhenyu Wang, Wanrun Jiang, Jing Wu, Yudi Yang, Jiyuan Yang, Manyi Yang, Fu-Qiang Gong, Linshuang Zhang, Mengchao Shi, Fu-Zhi Dai, Darrin M. York, Shi Liu, Tong Zhu, Zhicheng Zhong, Jian Lv, Jun Cheng, Weile Jia, Mohan Chen, Guolin Ke, Weinan E, Linfeng Zhang, Han Wang, [arXiv preprint arXiv:2312.15492 (2023)](https://arxiv.org/abs/2312.15492) licensed under a [Creative Commons Attribution (CC BY) license](http://creativecommons.org/licenses/by/4.0/). +[^1]: Duo Zhang, Xinzijian Liu, Xiangyu Zhang, Chengqian Zhang, Chun Cai, Hangrui Bi, Yiming Du, Xuejian Qin, Anyang Peng, Jiameng Huang, Bowen Li, Yifan Shan, Jinzhe Zeng, Yuzhi Zhang, Siyuan Liu, Yifan Li, Junhan Chang, Xinyan Wang, Shuo Zhou, Jianchuan Liu, Xiaoshan Luo, Zhenyu Wang, Wanrun Jiang, Jing Wu, Yudi Yang, Jiyuan Yang, Manyi Yang, Fu-Qiang Gong, Linshuang Zhang, Mengchao Shi, Fu-Zhi Dai, Darrin M. York, Shi Liu, Tong Zhu, Zhicheng Zhong, Jian Lv, Jun Cheng, Weile Jia, Mohan Chen, Guolin Ke, Weinan E, Linfeng Zhang, Han Wang, DPA-2: a large atomic model as a multi-task learner. npj Comput Mater 10, 293 (2024). [DOI: 10.1038/s41524-024-01493-2](https://doi.org/10.1038/s41524-024-01493-2) licensed under a [Creative Commons Attribution (CC BY) license](http://creativecommons.org/licenses/by/4.0/). Compared with the previous TensorFlow implementation, the new support in PyTorch is more flexible and efficient. 
In particular, it makes multi-GPU parallel training and even tasks beyond DFT possible, From 46e8dce54a3231ccbf68211cf87ca007c9bd22e0 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Sun, 22 Dec 2024 01:46:15 -0500 Subject: [PATCH 11/13] docs: update deepmd-gnn URL (#4482) ## Summary by CodeRabbit - **Documentation** - Updated guidelines for creating and integrating new models in the DeePMD-kit framework. - Added new sections on descriptors, fitting networks, and model requirements. - Enhanced unit testing section with instructions for regression tests. - Updated URL for the DeePMD-GNN plugin to reflect new repository location. Signed-off-by: Jinzhe Zeng (cherry picked from commit 250c9076795bedce0cf359d6aaaf049a4d450d4d) --- doc/development/create-a-model-pt.md | 2 +- doc/third-party/out-of-deepmd-kit.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/development/create-a-model-pt.md b/doc/development/create-a-model-pt.md index 08528cc5f6..7eb75b7026 100644 --- a/doc/development/create-a-model-pt.md +++ b/doc/development/create-a-model-pt.md @@ -180,7 +180,7 @@ The arguments here should be consistent with the class arguments of your new com ## Package new codes You may package new codes into a new Python package if you don't want to contribute it to the main DeePMD-kit repository. -A good example is [DeePMD-GNN](https://github.com/njzjz/deepmd-gnn). +A good example is [DeePMD-GNN](https://gitlab.com/RutgersLBSR/deepmd-gnn). It's crucial to add your new component to `project.entry-points."deepmd.pt"` in `pyproject.toml`: ```toml diff --git a/doc/third-party/out-of-deepmd-kit.md b/doc/third-party/out-of-deepmd-kit.md index 12ae5842c7..a04ba9741b 100644 --- a/doc/third-party/out-of-deepmd-kit.md +++ b/doc/third-party/out-of-deepmd-kit.md @@ -6,7 +6,7 @@ The codes of the following interfaces are not a part of the DeePMD-kit package a ### External GNN models (MACE/NequIP) -[DeePMD-GNN](https://github.com/njzjz/deepmd-gnn) is DeePMD-kit plugin for various graph neural network (GNN) models. +[DeePMD-GNN](https://gitlab.com/RutgersLBSR/deepmd-gnn) is DeePMD-kit plugin for various graph neural network (GNN) models. It has interfaced with [MACE](https://github.com/ACEsuit/mace) (PyTorch version) and [NequIP](https://github.com/mir-group/nequip) (PyTorch version). It is also the first example to the DeePMD-kit [plugin mechanism](../development/create-a-model-pt.md#package-new-codes). From 821ce2533016aff87284f4a93660d11b84b44557 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Sun, 22 Dec 2024 01:47:09 -0500 Subject: [PATCH 12/13] docs: fix a minor typo on the title of `install-from-c-library.md` (#4484) ## Summary by CodeRabbit - **Documentation** - Updated formatting of the installation guide for the pre-compiled C library. - Icons for TensorFlow and JAX are now displayed together in the header. - Retained all installation instructions and compatibility notes. 
Signed-off-by: Jinzhe Zeng (cherry picked from commit 2525ab2a4ea0097baec842055f713eceddcb01af) --- doc/install/install-from-c-library.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/install/install-from-c-library.md b/doc/install/install-from-c-library.md index d408fb1b67..806be51ca9 100644 --- a/doc/install/install-from-c-library.md +++ b/doc/install/install-from-c-library.md @@ -1,4 +1,4 @@ -# Install from pre-compiled C library {{ tensorflow_icon }}, JAX {{ jax_icon }} +# Install from pre-compiled C library {{ tensorflow_icon }} {{ jax_icon }} :::{note} **Supported backends**: TensorFlow {{ tensorflow_icon }}, JAX {{ jax_icon }} From 4698c567a3f2a1e39319f0da7d7736deef70fecd Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Sun, 22 Dec 2024 01:47:38 -0500 Subject: [PATCH 13/13] fix: print dlerror if dlopen fails (#4485) xref: https://github.com/njzjz/deepmd-gnn/issues/44 ## Summary by CodeRabbit - **New Features** - Enhanced error messages for library loading failures on non-Windows platforms. - Updated thread management environment variable checks for improved compatibility. - Added support for mixed types in tensor input handling, allowing for more flexible configurations. - **Bug Fixes** - Improved error reporting for dynamic library loading issues. --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> (cherry picked from commit cfe17a3e3e2fd198a42d9591d203bd2975c72824) --- source/api_cc/src/common.cc | 8 +++++++- source/lib/src/gpu/cudart/cudart_stub.cc | 4 ++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/source/api_cc/src/common.cc b/source/api_cc/src/common.cc index 5a4f05d75c..6d5ffcde48 100644 --- a/source/api_cc/src/common.cc +++ b/source/api_cc/src/common.cc @@ -389,7 +389,13 @@ static inline void _load_library_path(std::string dso_path) { if (!dso_handle) { throw deepmd::deepmd_exception( dso_path + - " is not found! You can add the library directory to LD_LIBRARY_PATH"); + " is not found or fails to load! You can add the library directory to " + "LD_LIBRARY_PATH." +#ifndef _WIN32 + " Error message: " + + std::string(dlerror()) +#endif + ); } } diff --git a/source/lib/src/gpu/cudart/cudart_stub.cc b/source/lib/src/gpu/cudart/cudart_stub.cc index 8083a0a89d..cfbabd6f5e 100644 --- a/source/lib/src/gpu/cudart/cudart_stub.cc +++ b/source/lib/src/gpu/cudart/cudart_stub.cc @@ -25,6 +25,10 @@ void *DP_cudart_dlopen(char *libname) { #endif if (!dso_handle) { std::cerr << "DeePMD-kit: Cannot find " << libname << std::endl; +#ifndef _WIN32 + std::cerr << "DeePMD-kit: Error message: " << std::string(dlerror()) + << std::endl; +#endif return nullptr; } std::cerr << "DeePMD-kit: Successfully load " << libname << std::endl;
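The same diagnostic is also visible from Python when debugging plugin loading, since `ctypes` propagates the `dlerror()` string inside the raised `OSError`; a small sketch with a hypothetical library name:

```python
import ctypes

try:
    ctypes.CDLL("libdeepmd_missing_plugin.so")  # hypothetical name; does not exist
except OSError as err:
    # On Linux the message carries the dlerror() output, e.g.
    # "libdeepmd_missing_plugin.so: cannot open shared object file: ..."
    print(f"failed to load: {err}")
```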