[r3.0] release v3.0.1 #4487

Merged: 13 commits, Dec 23, 2024
32 changes: 16 additions & 16 deletions CITATIONS.bib
@@ -128,26 +128,26 @@ @article{Zhang_NpjComputMater_2024_v10_p94
doi = {10.1038/s41524-024-01278-7},
}

- @misc{Zhang_2023_DPA2,
+ @article{Zhang_npjComputMater_2024_v10_p293,
annote = {DPA-2},
author = {
Duo Zhang and Xinzijian Liu and Xiangyu Zhang and Chengqian Zhang and Chun
- Cai and Hangrui Bi and Yiming Du and Xuejian Qin and Jiameng Huang and
- Bowen Li and Yifan Shan and Jinzhe Zeng and Yuzhi Zhang and Siyuan Liu and
- Yifan Li and Junhan Chang and Xinyan Wang and Shuo Zhou and Jianchuan Liu
- and Xiaoshan Luo and Zhenyu Wang and Wanrun Jiang and Jing Wu and Yudi Yang
- and Jiyuan Yang and Manyi Yang and Fu-Qiang Gong and Linshuang Zhang and
- Mengchao Shi and Fu-Zhi Dai and Darrin M. York and Shi Liu and Tong Zhu and
- Zhicheng Zhong and Jian Lv and Jun Cheng and Weile Jia and Mohan Chen and
- Guolin Ke and Weinan E and Linfeng Zhang and Han Wang
+ Cai and Hangrui Bi and Yiming Du and Xuejian Qin and Anyang Peng and
+ Jiameng Huang and Bowen Li and Yifan Shan and Jinzhe Zeng and Yuzhi Zhang
+ and Siyuan Liu and Yifan Li and Junhan Chang and Xinyan Wang and Shuo Zhou
+ and Jianchuan Liu and Xiaoshan Luo and Zhenyu Wang and Wanrun Jiang and
+ Jing Wu and Yudi Yang and Jiyuan Yang and Manyi Yang and Fu-Qiang Gong and
+ Linshuang Zhang and Mengchao Shi and Fu-Zhi Dai and Darrin M. York and Shi
+ Liu and Tong Zhu and Zhicheng Zhong and Jian Lv and Jun Cheng and Weile Jia
+ and Mohan Chen and Guolin Ke and Weinan E and Linfeng Zhang and Han Wang
},
- title = {
- {DPA-2: Towards a universal large atomic model for molecular and material
- simulation}
- },
- publisher = {arXiv},
- year = 2023,
- doi = {10.48550/arXiv.2312.15492},
+ title = {{DPA-2: a large atomic model as a multi-task learner}},
+ journal = {npj Comput. Mater},
+ year = 2024,
+ volume = 10,
+ number = 1,
+ pages = 293,
+ doi = {10.1038/s41524-024-01493-2},
}

@article{Zhang_PhysPlasmas_2020_v27_p122704,
11 changes: 11 additions & 0 deletions deepmd/dpmodel/atomic_model/pairtab_atomic_model.py
@@ -425,3 +425,14 @@
If False, the shape is (nframes, nloc, ndim).
"""
return False

def enable_compression(
self,
min_nbor_dist: float,
table_extrapolate: float = 5,
table_stride_1: float = 0.01,
table_stride_2: float = 0.1,
check_frequency: int = -1,
) -> None:
"""Pairtab model does not support compression."""
pass

Codecov / codecov/patch warning: added line deepmd/dpmodel/atomic_model/pairtab_atomic_model.py#L438 was not covered by tests.
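The `enable_compression` override added above (and its PyTorch counterpart below) makes the tabulated pair-style model a deliberate no-op under model compression, so callers can invoke the method uniformly on any atomic model. A minimal sketch of that calling pattern, with `compress_all` and `models` as illustrative names rather than code from this PR:

```python
def compress_all(models, min_nbor_dist: float) -> None:
    """Ask every sub-model to compress itself if it supports compression."""
    for model in models:
        # Pairtab models now expose enable_compression() as a harmless no-op,
        # so no isinstance() special-casing is needed at the call site.
        model.enable_compression(min_nbor_dist)
```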
7 changes: 6 additions & 1 deletion deepmd/dpmodel/descriptor/dpa2.py
@@ -387,7 +387,7 @@ def __init__(
use_tebd_bias: bool = False,
type_map: Optional[list[str]] = None,
) -> None:
r"""The DPA-2 descriptor. see https://arxiv.org/abs/2312.15492.
r"""The DPA-2 descriptor[1]_.
Parameters
----------
@@ -434,6 +434,11 @@ def __init__(
sw: torch.Tensor
The switch function for decaying inverse distance.
References
----------
.. [1] Zhang, D., Liu, X., Zhang, X. et al. DPA-2: a
large atomic model as a multi-task learner. npj
Comput Mater 10, 293 (2024). https://doi.org/10.1038/s41524-024-01493-2
"""

def init_subclass_params(sub_data, sub_class):
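The docstring now cites the published DPA-2 paper through a numpydoc-style reference rather than a bare arXiv link. A generic sketch of how the `[1]_` marker in the summary line pairs with the `References` section (a toy function, not the descriptor class itself):

```python
def example_descriptor() -> None:
    r"""Build a descriptor following the published method [1]_.

    References
    ----------
    .. [1] Zhang, D., Liu, X., Zhang, X. et al. DPA-2: a large atomic model
       as a multi-task learner. npj Comput Mater 10, 293 (2024).
       https://doi.org/10.1038/s41524-024-01493-2
    """
```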
11 changes: 11 additions & 0 deletions deepmd/pt/model/atomic_model/pairtab_atomic_model.py
@@ -484,3 +484,14 @@
If False, the shape is (nframes, nloc, ndim).
"""
return False

def enable_compression(
self,
min_nbor_dist: float,
table_extrapolate: float = 5,
table_stride_1: float = 0.01,
table_stride_2: float = 0.1,
check_frequency: int = -1,
) -> None:
"""Pairtab model does not support compression."""
pass

Codecov / codecov/patch warning: added line deepmd/pt/model/atomic_model/pairtab_atomic_model.py#L497 was not covered by tests.
7 changes: 6 additions & 1 deletion deepmd/pt/model/descriptor/dpa2.py
@@ -100,7 +100,7 @@ def __init__(
use_tebd_bias: bool = False,
type_map: Optional[list[str]] = None,
) -> None:
r"""The DPA-2 descriptor. see https://arxiv.org/abs/2312.15492.
r"""The DPA-2 descriptor[1]_.

Parameters
----------
@@ -147,6 +147,11 @@ def __init__(
sw: torch.Tensor
The switch function for decaying inverse distance.

References
----------
.. [1] Zhang, D., Liu, X., Zhang, X. et al. DPA-2: a
large atomic model as a multi-task learner. npj
Comput Mater 10, 293 (2024). https://doi.org/10.1038/s41524-024-01493-2
"""
super().__init__()

4 changes: 3 additions & 1 deletion deepmd/pt/model/model/__init__.py
@@ -196,7 +196,7 @@ def get_zbl_model(model_params):
rmax = model_params["sw_rmax"]
atom_exclude_types = model_params.get("atom_exclude_types", [])
pair_exclude_types = model_params.get("pair_exclude_types", [])
- return DPZBLModel(
+ model = DPZBLModel(
dp_model,
pt_model,
rmin,
@@ -205,6 +205,8 @@
atom_exclude_types=atom_exclude_types,
pair_exclude_types=pair_exclude_types,
)
model.model_def_script = json.dumps(model_params)
return model


def _can_be_converted_to_float(value) -> Optional[bool]:
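Storing `model_def_script` on the ZBL model keeps the JSON form of the input parameters attached to the model object, so it can be recovered later (for example when the model is frozen or re-loaded). A rough sketch of the idea, with `DummyModel` and `build_model` as stand-ins for the real classes:

```python
import json


class DummyModel:
    """Stand-in for a model object that remembers its defining JSON."""

    model_def_script: str = ""


def build_model(model_params: dict) -> DummyModel:
    model = DummyModel()
    # Keep the full parameter dict with the model so downstream tools can
    # reconstruct exactly how it was defined.
    model.model_def_script = json.dumps(model_params)
    return model


model = build_model({"type": "zbl", "sw_rmin": 0.5, "sw_rmax": 1.0})
print(json.loads(model.model_def_script)["sw_rmax"])  # 1.0
```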
42 changes: 16 additions & 26 deletions deepmd/pt/train/training.py
@@ -579,7 +579,7 @@ def warm_up_linear(step, warmup_steps):
# author: iProzd
if self.opt_type == "Adam":
self.optimizer = torch.optim.Adam(
- self.wrapper.parameters(), lr=self.lr_exp.start_lr
+ self.wrapper.parameters(), lr=self.lr_exp.start_lr, fused=True
)
if optimizer_state_dict is not None and self.restart_training:
self.optimizer.load_state_dict(optimizer_state_dict)
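`fused=True` requests PyTorch's fused Adam kernel, which updates all parameters in one kernel launch instead of looping tensor by tensor. A standalone sketch of the option (toy model; the fused implementation is generally available for floating-point parameters on CUDA, so the flag is gated on device here):

```python
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
model = torch.nn.Linear(4, 4).to(device)
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=1e-3,
    fused=(device == "cuda"),  # fall back to the default implementation on CPU
)
loss = model(torch.randn(2, 4, device=device)).sum()
loss.backward()
optimizer.step()
```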
@@ -653,10 +653,15 @@ def run(self) -> None:
prof.start()

def step(_step_id, task_key="Default") -> None:
+ if self.multi_task:
+ model_index = dp_random.choice(
+ np.arange(self.num_model, dtype=np.int_),
+ p=self.model_prob,
+ )
+ task_key = self.model_keys[model_index]
# PyTorch Profiler
if self.enable_profiler or self.profiling:
prof.step()
- self.wrapper.train()
if isinstance(self.lr_exp, dict):
_lr = self.lr_exp[task_key]
else:
@@ -682,12 +687,11 @@ def step(_step_id, task_key="Default") -> None:
)
loss.backward()
if self.gradient_max_norm > 0.0:
- grad_norm = torch.nn.utils.clip_grad_norm_(
- self.wrapper.parameters(), self.gradient_max_norm
+ torch.nn.utils.clip_grad_norm_(
+ self.wrapper.parameters(),
+ self.gradient_max_norm,
+ error_if_nonfinite=True,
)
- if not torch.isfinite(grad_norm).all():
- # check local gradnorm single GPU case, trigger NanDetector
- raise FloatingPointError("gradients are Nan/Inf")
with torch.device("cpu"):
self.optimizer.step()
self.scheduler.step()
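The manual `torch.isfinite` check on the returned norm is replaced by `error_if_nonfinite=True`, which makes `clip_grad_norm_` itself raise a `RuntimeError` when the total gradient norm is NaN or Inf. A small self-contained illustration of that behavior with a toy parameter:

```python
import torch

p = torch.nn.Parameter(torch.ones(3))
p.grad = torch.tensor([1.0, float("nan"), 2.0])  # deliberately non-finite

try:
    torch.nn.utils.clip_grad_norm_([p], max_norm=1.0, error_if_nonfinite=True)
except RuntimeError as err:
    print("caught non-finite gradients:", err)
```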
@@ -766,7 +770,7 @@ def fake_model():
if self.display_in_training and (
display_step_id % self.disp_freq == 0 or display_step_id == 1
):
- self.wrapper.eval()
+ self.wrapper.eval()  # Will set to train mode before finishing validation

def log_loss_train(_loss, _more_loss, _task_key="Default"):
results = {}
@@ -872,6 +876,7 @@ def log_loss_valid(_task_key="Default"):
learning_rate=None,
)
)
self.wrapper.train()

current_time = time.time()
train_time = current_time - self.t0
@@ -927,26 +932,11 @@
f"{task_key}/{item}", more_loss[item], display_step_id
)

- self.wrapper.train()
self.t0 = time.time()
self.total_train_time = 0.0
- for step_id in range(self.num_steps):
- if step_id < self.start_step:
- continue
- if self.multi_task:
- chosen_index_list = dp_random.choice(
- np.arange(
- self.num_model, dtype=np.int32
- ), # int32 should be enough for # models...
- p=np.array(self.model_prob),
- size=self.world_size,
- replace=True,
- )
- assert chosen_index_list.size == self.world_size
- model_index = chosen_index_list[self.rank]
- model_key = self.model_keys[model_index]
- else:
- model_key = "Default"
- step(step_id, model_key)
+ for step_id in range(self.start_step, self.num_steps):
+ step(step_id)
if JIT:
break
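Together with the change inside `step()` above, the per-step multi-task selection now lives in one place and the restart logic collapses into `range(self.start_step, self.num_steps)`. A simplified sketch of the resulting control flow, with `choose_task`, `model_keys`, and the step counts as illustrative values:

```python
import numpy as np

rng = np.random.default_rng(0)
model_keys = ["water", "metal"]
model_prob = [0.7, 0.3]


def choose_task() -> str:
    # One task is drawn per training step according to model_prob.
    return model_keys[rng.choice(len(model_keys), p=model_prob)]


def step(step_id: int) -> None:
    print(f"step {step_id}: training task {choose_task()!r}")


start_step, num_steps = 3, 6  # resuming from a checkpoint saved at step 3
for step_id in range(start_step, num_steps):
    step(step_id)
```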

30 changes: 17 additions & 13 deletions deepmd/pt/utils/dataloader.py
@@ -28,6 +28,7 @@
)

from deepmd.pt.utils import (
dp_random,
env,
)
from deepmd.pt.utils.dataset import (
@@ -50,6 +51,7 @@ def setup_seed(seed) -> None:
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
dp_random.seed(seed)
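`setup_seed` previously seeded only the PyTorch generators; seeding `dp_random` as well keeps DeePMD-kit's NumPy-backed sampling reproducible under the same seed. A generic sketch of seeding everything a run touches (plain `numpy.random.seed` stands in for `dp_random.seed`):

```python
import numpy as np
import torch


def setup_seed_sketch(seed: int) -> None:
    """Seed every RNG a training run may touch."""
    torch.manual_seed(seed)            # CPU and default CUDA generators
    torch.cuda.manual_seed_all(seed)   # all CUDA devices (no-op without CUDA)
    torch.backends.cudnn.deterministic = True
    np.random.seed(seed)               # stand-in for dp_random.seed(seed)


setup_seed_sketch(42)
```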


class DpLoaderSet(Dataset):
@@ -185,19 +187,21 @@ def print_summary(
name: str,
prob: list[float],
) -> None:
- print_summary(
- name,
- len(self.systems),
- [ss.system for ss in self.systems],
- [ss._natoms for ss in self.systems],
- self.batch_sizes,
- [
- ss._data_system.get_sys_numb_batch(self.batch_sizes[ii])
- for ii, ss in enumerate(self.systems)
- ],
- prob,
- [ss._data_system.pbc for ss in self.systems],
- )
+ rank = dist.get_rank() if dist.is_initialized() else 0
+ if rank == 0:
+ print_summary(
+ name,
+ len(self.systems),
+ [ss.system for ss in self.systems],
+ [ss._natoms for ss in self.systems],
+ self.batch_sizes,
+ [
+ ss._data_system.get_sys_numb_batch(self.batch_sizes[ii])
+ for ii, ss in enumerate(self.systems)
+ ],
+ prob,
+ [ss._data_system.pbc for ss in self.systems],
+ )


_sentinel = object()
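Guarding `print_summary` behind rank 0 keeps distributed runs from printing the data summary once per process. The pattern in isolation (a hedged sketch; `log_once` is an illustrative name):

```python
import torch.distributed as dist


def log_once(message: str) -> None:
    # dist.get_rank() is only valid after init_process_group(), hence the
    # is_initialized() check; single-process runs fall back to rank 0.
    rank = dist.get_rank() if dist.is_initialized() else 0
    if rank == 0:
        print(message)


log_once("summary printed exactly once across all ranks")
```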
2 changes: 2 additions & 0 deletions deepmd/tf/descriptor/se_r.py
@@ -356,6 +356,8 @@ def enable_compression(
self.filter_neuron,
graph,
graph_def,
type_one_side=self.type_one_side,
exclude_types=self.exclude_types,
activation_fn=self.filter_activation_fn,
suffix=suffix,
)
2 changes: 2 additions & 0 deletions deepmd/utils/data.py
@@ -60,6 +60,8 @@
) -> None:
"""Constructor."""
root = DPPath(sys_path)
if not root.is_dir():
raise FileNotFoundError(f"System {sys_path} is not found!")

Codecov / codecov/patch warning: added line deepmd/utils/data.py#L64 was not covered by tests.
self.dirs = root.glob(set_prefix + ".*")
if not len(self.dirs):
raise FileNotFoundError(f"No {set_prefix}.* is found in {sys_path}")
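With this check, constructing `DeepmdData` on a path that does not exist fails immediately with a clear message instead of failing later when the `set.*` directories are globbed. A sketch of the intended fail-fast behavior (the path is obviously hypothetical):

```python
from deepmd.utils.data import DeepmdData

try:
    DeepmdData("/no/such/system")  # hypothetical missing directory
except FileNotFoundError as err:
    print(err)  # a FileNotFoundError pointing at the missing system
```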
22 changes: 2 additions & 20 deletions deepmd/utils/data_system.py
@@ -28,9 +28,6 @@
from deepmd.utils.out_stat import (
compute_stats_from_redu,
)
- from deepmd.utils.path import (
- DPPath,
- )

log = logging.getLogger(__name__)

@@ -103,6 +100,8 @@
del rcut
self.system_dirs = systems
self.nsystems = len(self.system_dirs)
if self.nsystems <= 0:
raise ValueError("No systems provided")

Codecov / codecov/patch warning: added line deepmd/utils/data_system.py#L104 was not covered by tests.
self.data_systems = []
for ii in self.system_dirs:
self.data_systems.append(
@@ -755,23 +754,6 @@
systems = expand_sys_str(systems)
elif isinstance(systems, list):
systems = systems.copy()
- help_msg = "Please check your setting for data systems"
- # check length of systems
- if len(systems) == 0:
- msg = "cannot find valid a data system"
- log.fatal(msg)
- raise OSError(msg, help_msg)
- # roughly check all items in systems are valid
- for ii in systems:
- ii = DPPath(ii)
- if not ii.is_dir():
- msg = f"dir {ii} is not a valid dir"
- log.fatal(msg)
- raise OSError(msg, help_msg)
- if not (ii / "type.raw").is_file():
- msg = f"dir {ii} is not a valid data system dir"
- log.fatal(msg)
- raise OSError(msg, help_msg)
return systems


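The per-directory validation removed from the helper above is now redundant: `DeepmdData` itself raises `FileNotFoundError` for a missing system (see the `deepmd/utils/data.py` change), so the list-level code only needs to reject an empty selection. A toy sketch of the same division of responsibility, with illustrative names:

```python
from pathlib import Path


def load_system(path: str) -> Path:
    root = Path(path)
    if not root.is_dir():
        # per-system validation lives with the loader itself
        raise FileNotFoundError(f"System {path} is not found!")
    return root


def load_systems(paths: list[str]) -> list[Path]:
    if len(paths) <= 0:
        # list-level code only checks that something was provided
        raise ValueError("No systems provided")
    return [load_system(p) for p in paths]


print(load_systems(["."]))
```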
12 changes: 6 additions & 6 deletions deepmd/utils/path.py
@@ -14,6 +14,7 @@
from typing import (
ClassVar,
Optional,
Union,
)

import h5py
@@ -151,25 +152,22 @@
If true, no error will be raised if the target directory already exists.
"""


Code scanning / CodeQL warning: the class 'DPOSPath' does not override '__eq__', but adds the new attributes 'mode' and 'path'.
class DPOSPath(DPPath):
"""The OS path class to data system (DeepmdData) for real directories.

Parameters
----------
- path : str
+ path : Union[str, Path]
path
mode : str, optional
mode, by default "r"
"""

- def __init__(self, path: str, mode: str = "r") -> None:
+ def __init__(self, path: Union[str, Path], mode: str = "r") -> None:
super().__init__()
self.mode = mode
- if isinstance(path, Path):
- self.path = path
- else:
- self.path = Path(path)
+ self.path = Path(path)

def load_numpy(self) -> np.ndarray:
"""Load NumPy array.
@@ -300,6 +298,8 @@
# so we do not support file names containing #...
s = path.split("#")
self.root_path = s[0]
if not os.path.isfile(self.root_path):
raise FileNotFoundError(f"{self.root_path} not found")

Codecov / codecov/patch warning: added line deepmd/utils/path.py#L302 was not covered by tests.
self.root = self._load_h5py(s[0], mode)
# h5 path: default is the root path
self._name = s[1] if len(s) > 1 else "/"
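Two behaviors change in this file: `DPOSPath` now accepts either a string or a `pathlib.Path` (so the `isinstance` branch disappears, since `Path()` handles both), and `DPH5Path` fails fast when the `.h5` file before the `#` separator does not exist. A minimal sketch of the constructor change using plain `pathlib`, not the DeePMD classes:

```python
from pathlib import Path
from typing import Union


class OSPathSketch:
    """Toy version of the DPOSPath constructor after this change."""

    def __init__(self, path: Union[str, Path], mode: str = "r") -> None:
        self.mode = mode
        # Path() accepts both str and Path, so no isinstance() branch is needed.
        self.path = Path(path)


print(OSPathSketch("data/system1").path)
print(OSPathSketch(Path("data") / "system1").path)
```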
2 changes: 1 addition & 1 deletion doc/credits.rst
@@ -54,7 +54,7 @@ Cite DeePMD-kit and methods
.. bibliography::
:filter: False

- Zhang_2023_DPA2
+ Zhang_npjComputMater_2024_v10_p293

- If frame-specific parameters (`fparam`, e.g. electronic temperature) is used,

2 changes: 1 addition & 1 deletion doc/development/create-a-model-pt.md
@@ -180,7 +180,7 @@ The arguments here should be consistent with the class arguments of your new com
## Package new codes

You may package new codes into a new Python package if you don't want to contribute it to the main DeePMD-kit repository.
- A good example is [DeePMD-GNN](https://github.com/njzjz/deepmd-gnn).
+ A good example is [DeePMD-GNN](https://gitlab.com/RutgersLBSR/deepmd-gnn).
It's crucial to add your new component to `project.entry-points."deepmd.pt"` in `pyproject.toml`:

```toml
2 changes: 1 addition & 1 deletion doc/install/install-from-c-library.md
@@ -1,4 +1,4 @@
- # Install from pre-compiled C library {{ tensorflow_icon }}, JAX {{ jax_icon }}
+ # Install from pre-compiled C library {{ tensorflow_icon }} {{ jax_icon }}

:::{note}
**Supported backends**: TensorFlow {{ tensorflow_icon }}, JAX {{ jax_icon }}