From 9c508b7d444fee251a9cb7c6151c5defb3116114 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Sun, 3 Mar 2024 04:19:06 -0500 Subject: [PATCH] allow loading either nsel or natoms atomic tensor data (#3394) A new parameter, `output_natoms_for_type_sel` (default: false), is added for the data requirement. If `type_sel` is given, `output_natoms_for_type_sel` is true, and the data dimension is nsel, it will be converted to natoms. If `type_sel` is given, `output_natoms_for_type_sel` is false, and the data dimension is natoms, it will be converted to nsel. In other situations, it keeps the original shape. The user can give data in either nsel or natoms, if `type_sel` and `output_natoms_for_type_sel` are set. --------- Signed-off-by: Jinzhe Zeng --- deepmd/common.py | 4 ++ deepmd/pt/utils/dataset.py | 1 + deepmd/utils/data.py | 60 ++++++++++++++++++++-- deepmd/utils/data_system.py | 12 +++++ source/tests/tf/test_data_requirement.py | 1 + source/tests/tf/test_deepmd_data.py | 63 ++++++++++++++++++++++++ 6 files changed, 137 insertions(+), 4 deletions(-) diff --git a/deepmd/common.py b/deepmd/common.py index d7e485788b..29d32111a8 100644 --- a/deepmd/common.py +++ b/deepmd/common.py @@ -78,6 +78,7 @@ def add_data_requirement( repeat: int = 1, default: float = 0.0, dtype: Optional[np.dtype] = None, + output_natoms_for_type_sel: bool = False, ): """Specify data requirements for training. 
@@ -103,6 +104,8 @@ def add_data_requirement( default value of data dtype : np.dtype, optional the dtype of data, overwrites `high_prec` if provided + output_natoms_for_type_sel : bool, optional + if True and type_sel is True, the atomic dimension will be natoms instead of nsel """ data_requirement[key] = { "ndof": ndof, @@ -113,6 +116,7 @@ def add_data_requirement( "repeat": repeat, "default": default, "dtype": dtype, + "output_natoms_for_type_sel": output_natoms_for_type_sel, } diff --git a/deepmd/pt/utils/dataset.py b/deepmd/pt/utils/dataset.py index 67005b5ed3..77297d980c 100644 --- a/deepmd/pt/utils/dataset.py +++ b/deepmd/pt/utils/dataset.py @@ -61,4 +61,5 @@ def add_data_requirement(self, data_requirement: List[DataRequirementItem]): repeat=data_item["repeat"], default=data_item["default"], dtype=data_item["dtype"], + output_natoms_for_type_sel=data_item["output_natoms_for_type_sel"], ) diff --git a/deepmd/utils/data.py b/deepmd/utils/data.py index 03e39e1f21..194c6b1e24 100644 --- a/deepmd/utils/data.py +++ b/deepmd/utils/data.py @@ -147,6 +147,7 @@ def add( repeat: int = 1, default: float = 0.0, dtype: Optional[np.dtype] = None, + output_natoms_for_type_sel: bool = False, ): """Add a data item that to be loaded. 
@@ -173,6 +174,8 @@ def add( default value of data dtype : np.dtype, optional the dtype of data, overwrites `high_prec` if provided + output_natoms_for_type_sel : bool, optional + if True and type_sel is True, the atomic dimension will be natoms instead of nsel """ self.data_dict[key] = { "ndof": ndof, @@ -184,6 +187,7 @@ def add( "reduce": None, "default": default, "dtype": dtype, + "output_natoms_for_type_sel": output_natoms_for_type_sel, } return self @@ -523,6 +527,9 @@ def _load_set(self, set_name: DPPath): repeat=self.data_dict[kk]["repeat"], default=self.data_dict[kk]["default"], dtype=self.data_dict[kk]["dtype"], + output_natoms_for_type_sel=self.data_dict[kk][ + "output_natoms_for_type_sel" + ], ) for kk in self.data_dict.keys(): if self.data_dict[kk]["reduce"] is not None: @@ -589,19 +596,25 @@ def _load_data( type_sel=None, default: float = 0.0, dtype: Optional[np.dtype] = None, + output_natoms_for_type_sel: bool = False, ): if atomic: natoms = self.natoms idx_map = self.idx_map # if type_sel, then revise natoms and idx_map if type_sel is not None: - natoms = 0 + natoms_sel = 0 for jj in type_sel: - natoms += np.sum(self.atom_type == jj) - idx_map = self._idx_map_sel(self.atom_type, type_sel) + natoms_sel += np.sum(self.atom_type == jj) + idx_map_sel = self._idx_map_sel(self.atom_type, type_sel) + else: + natoms_sel = natoms + idx_map_sel = idx_map ndof = ndof_ * natoms else: ndof = ndof_ + natoms_sel = 0 + idx_map_sel = None if dtype is not None: pass elif high_prec: @@ -613,6 +626,38 @@ def _load_data( data = path.load_numpy().astype(dtype) try: # YWolfeee: deal with data shape error if atomic: + if type_sel is not None: + # check the data shape is nsel or natoms + if data.size == nframes * natoms_sel * ndof_: + if output_natoms_for_type_sel: + tmp = np.zeros( + [nframes, natoms, ndof_], dtype=data.dtype + ) + sel_mask = np.isin(self.atom_type, type_sel) + tmp[:, sel_mask] = data.reshape( + [nframes, natoms_sel, ndof_] + ) + data = tmp + else: + natoms 
= natoms_sel + idx_map = idx_map_sel + ndof = ndof_ * natoms + elif data.size == nframes * natoms * ndof_: + if output_natoms_for_type_sel: + pass + else: + sel_mask = np.isin(self.atom_type, type_sel) + data = data[:, sel_mask] + natoms = natoms_sel + idx_map = idx_map_sel + ndof = ndof_ * natoms + else: + raise ValueError( + f"The shape of the data {key} in {set_name}" + f"is {data.shape}, which doesn't match either" + f"({nframes}, {natoms_sel}, {ndof_}) or" + f"({nframes}, {natoms}, {ndof_})" + ) data = data.reshape([nframes, natoms, -1]) data = data[:, idx_map, :] data = data.reshape([nframes, -1]) @@ -621,13 +666,15 @@ def _load_data( explanation = "This error may occur when your label mismatch it's name, i.e. you might store global tensor in `atomic_tensor.npy` or atomic tensor in `tensor.npy`." log.error(str(err_message)) log.error(explanation) - raise ValueError(str(err_message) + ". " + explanation) + raise ValueError(str(err_message) + ". " + explanation) from err_message if repeat != 1: data = np.repeat(data, repeat).reshape([nframes, -1]) return np.float32(1.0), data elif must: raise RuntimeError("%s not found!" 
% path) else: + if type_sel is not None and not output_natoms_for_type_sel: + ndof = ndof_ * natoms_sel data = np.full([nframes, ndof], default, dtype=dtype) if repeat != 1: data = np.repeat(data, repeat).reshape([nframes, -1]) @@ -694,6 +741,8 @@ class DataRequirementItem: default value of data dtype : np.dtype, optional the dtype of data, overwrites `high_prec` if provided + output_natoms_for_type_sel : bool, optional + if True and type_sel is True, the atomic dimension will be natoms instead of nsel """ def __init__( @@ -707,6 +756,7 @@ def __init__( repeat: int = 1, default: float = 0.0, dtype: Optional[np.dtype] = None, + output_natoms_for_type_sel: bool = False, ) -> None: self.key = key self.ndof = ndof @@ -717,6 +767,7 @@ def __init__( self.repeat = repeat self.default = default self.dtype = dtype + self.output_natoms_for_type_sel = output_natoms_for_type_sel self.dict = self.to_dict() def to_dict(self) -> dict: @@ -730,6 +781,7 @@ def to_dict(self) -> dict: "repeat": self.repeat, "default": self.default, "dtype": self.dtype, + "output_natoms_for_type_sel": self.output_natoms_for_type_sel, } def __getitem__(self, key: str): diff --git a/deepmd/utils/data_system.py b/deepmd/utils/data_system.py index da1dd04026..0c74abfed1 100644 --- a/deepmd/utils/data_system.py +++ b/deepmd/utils/data_system.py @@ -293,6 +293,10 @@ def add_dict(self, adict: dict) -> None: type_sel=adict[kk]["type_sel"], repeat=adict[kk]["repeat"], default=adict[kk]["default"], + dtype=adict[kk].get("dtype"), + output_natoms_for_type_sel=adict[kk].get( + "output_natoms_for_type_sel", False + ), ) def add( @@ -305,6 +309,8 @@ def add( type_sel: Optional[List[int]] = None, repeat: int = 1, default: float = 0.0, + dtype: Optional[np.dtype] = None, + output_natoms_for_type_sel: bool = False, ): """Add a data item that to be loaded. @@ -329,6 +335,10 @@ def add( The data will be repeated `repeat` times. default, default=0. 
Default value of data + dtype + The dtype of data, overwrites `high_prec` if provided + output_natoms_for_type_sel : bool + If True and type_sel is True, the atomic dimension will be natoms instead of nsel """ for ii in self.data_systems: ii.add( @@ -340,6 +350,8 @@ def add( repeat=repeat, type_sel=type_sel, default=default, + dtype=dtype, + output_natoms_for_type_sel=output_natoms_for_type_sel, ) def reduce(self, key_out, key_in): diff --git a/source/tests/tf/test_data_requirement.py b/source/tests/tf/test_data_requirement.py index cabea15de1..e825bc3f92 100644 --- a/source/tests/tf/test_data_requirement.py +++ b/source/tests/tf/test_data_requirement.py @@ -16,3 +16,4 @@ def test_add(self): self.assertEqual(data_requirement["test"]["high_prec"], False) self.assertEqual(data_requirement["test"]["repeat"], 1) self.assertEqual(data_requirement["test"]["default"], 0.0) + self.assertEqual(data_requirement["test"]["output_natoms_for_type_sel"], False) diff --git a/source/tests/tf/test_deepmd_data.py b/source/tests/tf/test_deepmd_data.py index 3998e0f3e3..94e1f4c571 100644 --- a/source/tests/tf/test_deepmd_data.py +++ b/source/tests/tf/test_deepmd_data.py @@ -83,6 +83,7 @@ def setUp(self): os.makedirs(os.path.join(self.data_name, "set.foo"), exist_ok=True) os.makedirs(os.path.join(self.data_name, "set.bar"), exist_ok=True) os.makedirs(os.path.join(self.data_name, "set.tar"), exist_ok=True) + os.makedirs(os.path.join(self.data_name, "set.foo"), exist_ok=True) np.savetxt(os.path.join(self.data_name, "type.raw"), np.array([1, 0]), fmt="%d") np.savetxt( os.path.join(self.data_name, "type_map.raw"), @@ -141,6 +142,16 @@ def setUp(self): np.save(path, self.test_frame_bar) # t n self.test_null = np.zeros([self.nframes, 2 * self.natoms]) + # tensor shape + path = os.path.join(self.data_name, "set.foo", "tensor_natoms.npy") + self.tensor_natoms = np.random.default_rng().random( + [self.nframes, self.natoms, 6] + ) + self.tensor_natoms[:, 0, :] = 0 + np.save(path, 
self.tensor_natoms) + path = os.path.join(self.data_name, "set.foo", "tensor_nsel.npy") + self.tensor_nsel = self.tensor_natoms[:, 1, :] + np.save(path, self.tensor_nsel) def tearDown(self): shutil.rmtree(self.data_name) @@ -292,6 +303,58 @@ def test_get_nbatch(self): nb = dd.get_numb_batch(2, 0) self.assertEqual(nb, 2) + def test_get_tensor(self): + dd_natoms = ( + DeepmdData(self.data_name) + .add( + "tensor_nsel", + 6, + atomic=True, + must=True, + type_sel=[0], + output_natoms_for_type_sel=True, + ) + .add( + "tensor_natoms", + 6, + atomic=True, + must=True, + type_sel=[0], + output_natoms_for_type_sel=True, + ) + ) + data_natoms = dd_natoms._load_set(os.path.join(self.data_name, "set.foo")) + dd_nsel = ( + DeepmdData(self.data_name) + .add( + "tensor_nsel", + 6, + atomic=True, + must=True, + type_sel=[0], + output_natoms_for_type_sel=False, + ) + .add( + "tensor_natoms", + 6, + atomic=True, + must=True, + type_sel=[0], + output_natoms_for_type_sel=False, + ) + ) + data_nsel = dd_nsel._load_set(os.path.join(self.data_name, "set.foo")) + np.testing.assert_allclose( + data_natoms["tensor_natoms"], data_natoms["tensor_nsel"] + ) + np.testing.assert_allclose(data_nsel["tensor_natoms"], data_nsel["tensor_nsel"]) + np.testing.assert_allclose( + data_natoms["tensor_natoms"].reshape(self.nframes, self.natoms, -1)[ + :, 0, : + ], + data_nsel["tensor_natoms"], + ) + def _comp_np_mat2(self, first, second): np.testing.assert_almost_equal(first, second, places)