From 242408d584f47db6746000672c4241a02f671ae8 Mon Sep 17 00:00:00 2001 From: HydrogenSulfate <490868991@qq.com> Date: Tue, 24 Dec 2024 01:13:15 +0800 Subject: [PATCH 1/3] pd: fix oom error (#4493) Paddle uses `MemoryError` rather than the `RuntimeError` used in PyTorch; now I can test DPA-1 and DPA-2 on a 16G V100... ![image](https://github.com/user-attachments/assets/42ead773-bf26-4195-8f67-404b151371de) ## Summary by CodeRabbit - **Bug Fixes** - Improved detection of out-of-memory (OOM) errors to enhance application stability. - Ensured cached memory is cleared upon OOM errors, preventing potential memory leaks. --- deepmd/pd/utils/auto_batch_size.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/deepmd/pd/utils/auto_batch_size.py b/deepmd/pd/utils/auto_batch_size.py index 8cdb5ddea2..0eb5e46d5f 100644 --- a/deepmd/pd/utils/auto_batch_size.py +++ b/deepmd/pd/utils/auto_batch_size.py @@ -49,12 +49,8 @@ def is_oom_error(self, e: Exception) -> bool: # several sources think CUSOLVER_STATUS_INTERNAL_ERROR is another out-of-memory error, # such as https://github.com/JuliaGPU/CUDA.jl/issues/1924 # (the meaningless error message should be considered as a bug in cusolver) - if isinstance(e, RuntimeError) and ( - "CUDA out of memory." 
in e.args[0] - or "CUDA driver error: out of memory" in e.args[0] - or "cusolver error: CUSOLVER_STATUS_INTERNAL_ERROR" in e.args[0] - ): + if isinstance(e, MemoryError) and ("ResourceExhaustedError" in e.args[0]): # Release all unoccupied cached memory - # paddle.device.cuda.empty_cache() + paddle.device.cuda.empty_cache() return True return False From 47412da3de58fe92c7f7c12cbcb67220ed96cde2 Mon Sep 17 00:00:00 2001 From: HydrogenSulfate <490868991@qq.com> Date: Tue, 24 Dec 2024 01:50:09 +0800 Subject: [PATCH 2/3] pd: add missing `dp.eval()` in pd backend (#4488) Switch to eval mode when evaluating the model; otherwise `self.training` will be `True`, and a backward graph will be created, causing OOM ## Summary by CodeRabbit - **New Features** - Enhanced model evaluation state management to ensure correct behavior during evaluation. - **Bug Fixes** - Improved type consistency in the `normalize_coord` function for better computational accuracy. --- deepmd/pd/infer/deep_eval.py | 1 + deepmd/pd/utils/region.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/deepmd/pd/infer/deep_eval.py b/deepmd/pd/infer/deep_eval.py index a2f8510f28..c31170ad71 100644 --- a/deepmd/pd/infer/deep_eval.py +++ b/deepmd/pd/infer/deep_eval.py @@ -113,6 +113,7 @@ def __init__( else: # self.dp = paddle.jit.load(self.model_path.split(".json")[0]) raise ValueError(f"Unknown model file format: {self.model_path}!") + self.dp.eval() self.rcut = self.dp.model["Default"].get_rcut() self.type_map = self.dp.model["Default"].get_type_map() if isinstance(auto_batch_size, bool): diff --git a/deepmd/pd/utils/region.py b/deepmd/pd/utils/region.py index f3e3eaa52d..d2600ef16e 100644 --- a/deepmd/pd/utils/region.py +++ b/deepmd/pd/utils/region.py @@ -108,5 +108,5 @@ def normalize_coord( """ icoord = phys2inter(coord, cell) - icoord = paddle.remainder(icoord, paddle.full([], 1.0)) + icoord = paddle.remainder(icoord, paddle.full([], 1.0, dtype=icoord.dtype)) return inter2phys(icoord, cell) From 
30b1447aa900d393f8d5ffeceee156a3124cfec8 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 24 Dec 2024 12:20:34 +0800 Subject: [PATCH 3/3] [pre-commit.ci] pre-commit autoupdate (#4497) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.8.3 → v0.8.4](https://github.com/astral-sh/ruff-pre-commit/compare/v0.8.3...v0.8.4) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9058decc21..bd36fd6e63 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -29,7 +29,7 @@ repos: exclude: ^source/3rdparty - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. - rev: v0.8.3 + rev: v0.8.4 hooks: - id: ruff args: ["--fix"]