From 4823dd37f761b1905ce9dfff10ffff771f5ada47 Mon Sep 17 00:00:00 2001
From: Chun Cai
Date: Tue, 26 Nov 2024 19:01:29 +0800
Subject: [PATCH 1/6] perf: optimize training loop

---
 deepmd/pt/train/training.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/deepmd/pt/train/training.py b/deepmd/pt/train/training.py
index f74c4769bf..a751be1d36 100644
--- a/deepmd/pt/train/training.py
+++ b/deepmd/pt/train/training.py
@@ -656,7 +656,8 @@ def step(_step_id, task_key="Default") -> None:
             # PyTorch Profiler
             if self.enable_profiler or self.profiling:
                 prof.step()
-            self.wrapper.train()
+            if not self.wrapper.training:
+                self.wrapper.train()
             if isinstance(self.lr_exp, dict):
                 _lr = self.lr_exp[task_key]
             else:

From 7b9fb5aa6fe0001f42e17cacaedc0c32ed750cc8 Mon Sep 17 00:00:00 2001
From: Chun Cai
Date: Tue, 26 Nov 2024 19:03:17 +0800
Subject: [PATCH 2/6] use faster norm check

---
 deepmd/pt/train/training.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/deepmd/pt/train/training.py b/deepmd/pt/train/training.py
index a751be1d36..4df12ed259 100644
--- a/deepmd/pt/train/training.py
+++ b/deepmd/pt/train/training.py
@@ -684,11 +684,8 @@ def step(_step_id, task_key="Default") -> None:
             loss.backward()
             if self.gradient_max_norm > 0.0:
                 grad_norm = torch.nn.utils.clip_grad_norm_(
-                    self.wrapper.parameters(), self.gradient_max_norm
+                    self.wrapper.parameters(), self.gradient_max_norm, error_if_nonfinite=True
                 )
-                if not torch.isfinite(grad_norm).all():
-                    # check local gradnorm single GPU case, trigger NanDetector
-                    raise FloatingPointError("gradients are Nan/Inf")
             with torch.device("cpu"):
                 self.optimizer.step()
                 self.scheduler.step()

From f48c18f8e67f5f6575a3d849c15f05fdcc7e0a86 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 26 Nov 2024 11:07:24 +0000
Subject: [PATCH 3/6] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 deepmd/pt/train/training.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/deepmd/pt/train/training.py b/deepmd/pt/train/training.py
index 4df12ed259..c03acc52f1 100644
--- a/deepmd/pt/train/training.py
+++ b/deepmd/pt/train/training.py
@@ -684,7 +684,9 @@ def step(_step_id, task_key="Default") -> None:
             loss.backward()
             if self.gradient_max_norm > 0.0:
                 grad_norm = torch.nn.utils.clip_grad_norm_(
-                    self.wrapper.parameters(), self.gradient_max_norm, error_if_nonfinite=True
+                    self.wrapper.parameters(),
+                    self.gradient_max_norm,
+                    error_if_nonfinite=True,
                 )
             with torch.device("cpu"):
                 self.optimizer.step()

From 53bbbdbb890e14af79fc8e8a8de778ad3cc06750 Mon Sep 17 00:00:00 2001
From: Chun Cai
Date: Tue, 26 Nov 2024 20:31:23 +0800
Subject: [PATCH 4/6] remove unused return value

---
 deepmd/pt/train/training.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deepmd/pt/train/training.py b/deepmd/pt/train/training.py
index c03acc52f1..9b0b0a930e 100644
--- a/deepmd/pt/train/training.py
+++ b/deepmd/pt/train/training.py
@@ -683,7 +683,7 @@ def step(_step_id, task_key="Default") -> None:
             )
             loss.backward()
             if self.gradient_max_norm > 0.0:
-                grad_norm = torch.nn.utils.clip_grad_norm_(
+                torch.nn.utils.clip_grad_norm_(
                     self.wrapper.parameters(),
                     self.gradient_max_norm,
                     error_if_nonfinite=True,

From 078e8485d483827624886d8f2927720965b8bf82 Mon Sep 17 00:00:00 2001
From: Chun Cai
Date: Wed, 27 Nov 2024 12:47:44 +0800
Subject: [PATCH 5/6] only set training mode on first entry and when exiting
 validation

---
 deepmd/pt/train/training.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/deepmd/pt/train/training.py b/deepmd/pt/train/training.py
index 9b0b0a930e..16c7810727 100644
--- a/deepmd/pt/train/training.py
+++ b/deepmd/pt/train/training.py
@@ -656,8 +656,6 @@ def step(_step_id, task_key="Default") -> None:
             # PyTorch Profiler
             if self.enable_profiler or self.profiling:
                 prof.step()
-            if not self.wrapper.training:
-                self.wrapper.train()
             if isinstance(self.lr_exp, dict):
                 _lr = self.lr_exp[task_key]
             else:
@@ -766,7 +764,7 @@ def fake_model():
             if self.display_in_training and (
                 display_step_id % self.disp_freq == 0 or display_step_id == 1
             ):
-                self.wrapper.eval()
+                self.wrapper.eval() # Will set to train mode before finishing validation

             def log_loss_train(_loss, _more_loss, _task_key="Default"):
                 results = {}
@@ -872,6 +870,7 @@ def log_loss_valid(_task_key="Default"):
                         learning_rate=None,
                     )
                 )
+                self.wrapper.train()

             current_time = time.time()
             train_time = current_time - self.t0
@@ -926,7 +925,7 @@ def log_loss_valid(_task_key="Default"):
                     writer.add_scalar(
                         f"{task_key}/{item}", more_loss[item], display_step_id
                     )
-
+        self.wrapper.train()
         self.t0 = time.time()
         self.total_train_time = 0.0
         for step_id in range(self.num_steps):

From 97c3fcf4af4cbc839de802c1017dfc211d622f08 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 27 Nov 2024 04:49:00 +0000
Subject: [PATCH 6/6] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 deepmd/pt/train/training.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/deepmd/pt/train/training.py b/deepmd/pt/train/training.py
index 16c7810727..af6e48191d 100644
--- a/deepmd/pt/train/training.py
+++ b/deepmd/pt/train/training.py
@@ -764,7 +764,7 @@ def fake_model():
             if self.display_in_training and (
                 display_step_id % self.disp_freq == 0 or display_step_id == 1
             ):
-                self.wrapper.eval() # Will set to train mode before finishing validation
+                self.wrapper.eval()  # Will set to train mode before finishing validation

             def log_loss_train(_loss, _more_loss, _task_key="Default"):
                 results = {}
@@ -925,6 +925,7 @@ def log_loss_valid(_task_key="Default"):
                     writer.add_scalar(
                         f"{task_key}/{item}", more_loss[item], display_step_id
                     )
+
         self.wrapper.train()
         self.t0 = time.time()
         self.total_train_time = 0.0
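
For reference, below is a minimal standalone sketch of the two patterns this series applies: failing fast with clip_grad_norm_(..., error_if_nonfinite=True) instead of a separate torch.isfinite() check on the returned norm, and switching train/eval mode only on first entry and around validation rather than on every step. It is an illustration under stated assumptions, not the DeePMD implementation; all names here (model, optimizer, loader, max_norm, disp_freq) are hypothetical placeholders, not taken from deepmd/pt/train/training.py.

    # Hedged sketch, not DeePMD code: illustrates the patterns only.
    import torch

    def train(model, optimizer, loader, max_norm=1.0, disp_freq=100):
        model.train()  # set training mode once, on first entry to the loop
        for step, (inputs, labels) in enumerate(loader):
            optimizer.zero_grad()
            loss = torch.nn.functional.mse_loss(model(inputs), labels)
            loss.backward()
            if max_norm > 0.0:
                # error_if_nonfinite=True makes clipping raise a RuntimeError on
                # NaN/Inf gradients, replacing a separate torch.isfinite() check;
                # the returned norm is unused, so it is not assigned.
                torch.nn.utils.clip_grad_norm_(
                    model.parameters(), max_norm, error_if_nonfinite=True
                )
            optimizer.step()
            if step % disp_freq == 0:
                model.eval()  # leave training mode only for validation
                with torch.no_grad():
                    pass  # run validation here
                model.train()  # restore training mode before the next step

Avoiding the redundant per-step mode toggles matters because Module.train() recurses over every submodule; doing it once per validation interval instead of once per step removes that overhead from the hot loop.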