From 242408d584f47db6746000672c4241a02f671ae8 Mon Sep 17 00:00:00 2001
From: HydrogenSulfate <490868991@qq.com>
Date: Tue, 24 Dec 2024 01:13:15 +0800
Subject: [PATCH 1/3] pd: fix oom error (#4493)

Paddle use `MemoryError` rather than `RuntimeError` used in pytorch, now
I can test DPA-1 and DPA-2 in 16G V100...

![image](https://github.com/user-attachments/assets/42ead773-bf26-4195-8f67-404b151371de)

<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->

## Summary by CodeRabbit

- **Bug Fixes**
- Improved detection of out-of-memory (OOM) errors to enhance
application stability.
- Ensured cached memory is cleared upon OOM errors, preventing potential
memory leaks.

<!-- end of auto-generated comment: release notes by coderabbit.ai -->
---
 deepmd/pd/utils/auto_batch_size.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/deepmd/pd/utils/auto_batch_size.py b/deepmd/pd/utils/auto_batch_size.py
index 8cdb5ddea2..0eb5e46d5f 100644
--- a/deepmd/pd/utils/auto_batch_size.py
+++ b/deepmd/pd/utils/auto_batch_size.py
@@ -49,12 +49,8 @@ def is_oom_error(self, e: Exception) -> bool:
         # several sources think CUSOLVER_STATUS_INTERNAL_ERROR is another out-of-memory error,
         # such as https://github.com/JuliaGPU/CUDA.jl/issues/1924
         # (the meaningless error message should be considered as a bug in cusolver)
-        if isinstance(e, RuntimeError) and (
-            "CUDA out of memory." in e.args[0]
-            or "CUDA driver error: out of memory" in e.args[0]
-            or "cusolver error: CUSOLVER_STATUS_INTERNAL_ERROR" in e.args[0]
-        ):
+        if isinstance(e, MemoryError) and ("ResourceExhaustedError" in e.args[0]):
             # Release all unoccupied cached memory
-            # paddle.device.cuda.empty_cache()
+            paddle.device.cuda.empty_cache()
             return True
         return False

From 47412da3de58fe92c7f7c12cbcb67220ed96cde2 Mon Sep 17 00:00:00 2001
From: HydrogenSulfate <490868991@qq.com>
Date: Tue, 24 Dec 2024 01:50:09 +0800
Subject: [PATCH 2/3] pd: add missing `dp.eval()` in pd backend (#4488)

Switch to eval mode when evaluating model, otherwise `self.training`
will be `True`, backward graph will be created and cause OOM

<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

- **New Features**
- Enhanced model evaluation state management to ensure correct behavior
during evaluation.

- **Bug Fixes**
- Improved type consistency in the `normalize_coord` function for better
computational accuracy.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->
---
 deepmd/pd/infer/deep_eval.py | 1 +
 deepmd/pd/utils/region.py    | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/deepmd/pd/infer/deep_eval.py b/deepmd/pd/infer/deep_eval.py
index a2f8510f28..c31170ad71 100644
--- a/deepmd/pd/infer/deep_eval.py
+++ b/deepmd/pd/infer/deep_eval.py
@@ -113,6 +113,7 @@ def __init__(
         else:
             # self.dp = paddle.jit.load(self.model_path.split(".json")[0])
             raise ValueError(f"Unknown model file format: {self.model_path}!")
+        self.dp.eval()
         self.rcut = self.dp.model["Default"].get_rcut()
         self.type_map = self.dp.model["Default"].get_type_map()
         if isinstance(auto_batch_size, bool):
diff --git a/deepmd/pd/utils/region.py b/deepmd/pd/utils/region.py
index f3e3eaa52d..d2600ef16e 100644
--- a/deepmd/pd/utils/region.py
+++ b/deepmd/pd/utils/region.py
@@ -108,5 +108,5 @@ def normalize_coord(
 
     """
     icoord = phys2inter(coord, cell)
-    icoord = paddle.remainder(icoord, paddle.full([], 1.0))
+    icoord = paddle.remainder(icoord, paddle.full([], 1.0, dtype=icoord.dtype))
     return inter2phys(icoord, cell)

From 30b1447aa900d393f8d5ffeceee156a3124cfec8 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 24 Dec 2024 12:20:34 +0800
Subject: [PATCH 3/3] [pre-commit.ci] pre-commit autoupdate (#4497)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

<!--pre-commit.ci start-->
updates:
- [github.com/astral-sh/ruff-pre-commit: v0.8.3 →
v0.8.4](https://github.com/astral-sh/ruff-pre-commit/compare/v0.8.3...v0.8.4)
<!--pre-commit.ci end-->

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 .pre-commit-config.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 9058decc21..bd36fd6e63 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -29,7 +29,7 @@ repos:
         exclude: ^source/3rdparty
   - repo: https://github.com/astral-sh/ruff-pre-commit
     # Ruff version.
-    rev: v0.8.3
+    rev: v0.8.4
     hooks:
       - id: ruff
         args: ["--fix"]