fix bug in ema

marrlab · Jul 25, 2024 · a5f0f07 · a5f0f07
1 parent a78d84e
commit a5f0f07
Show file tree

Hide file tree

Showing 2 changed files with 18 additions and 14 deletions.
diff --git a/domainlab/algos/trainers/a_trainer.py b/domainlab/algos/trainers/a_trainer.py
@@ -86,7 +86,7 @@ def __init__(self, successor_node=None, extend=None):
         self.flag_initialized = False
         # moving average
         self.ma_weight_previous_model_params = None
-        self._ma_dict_para_persist = {}
+        self._dict_previous_para_persist = {}
         self._ma_iter = 0
 
     @property

diff --git a/domainlab/algos/trainers/train_ema.py b/domainlab/algos/trainers/train_ema.py
@@ -29,29 +29,33 @@ def move_average(self, dict_data, epoch):
         Salesforce Research, USA
         """
         self.ma_weight_previous_model_params = epoch / (epoch + 1)
-        # 1/2, 2/3, 3/4, 4/5, 
+        # 1/2, 2/3, 3/4, 4/5,
         # weight on previous model converges to 1 as training goes on
-        dict_ema_para_curr_iter = {}
+        dict_return_ema_para_curr_iter = {}
         for key, data in dict_data.items():
             # data = data.view(1, -1)  # make it rank 1 tensor (a.k.a. vector)
             if self._ma_iter == 0:
                 previous_data = torch.zeros_like(data)
+                local_data_convex = data
             else:
-                previous_data = self._ma_dict_para_persist[key]
-
-            local_data_convex = \
-                self.ma_weight_previous_model_params * previous_data + \
-                (1 - self.ma_weight_previous_model_params) * data
-            # correction by 1/(1 - self.rho)
+                previous_data = self._dict_previous_para_persist[key]
+                local_data_convex = \
+                    self.ma_weight_previous_model_params * previous_data + \
+                    (1 - self.ma_weight_previous_model_params) * data
+            # correction by 1/(1 - self.ma_weight_previous_model_params)
             # so that the gradients amplitude backpropagated in data is
-            # independent of self.rho
-            dict_ema_para_curr_iter[key] = \
-                local_data_convex / (1 - self.ma_weight_previous_model_params)
-            self._ma_dict_para_persist[key] = \
+            # independent of self.ma_weight_previous_model_params
+            # We do not apply this correction: 1 - rho approaches zero as
+            # the epochs go on, so dividing by it would blow up the neural
+            # network weights until they overflow.
+            # dict_return_ema_para_curr_iter[key] = \
+            #    local_data_convex / (1 - self.ma_weight_previous_model_params)
+            dict_return_ema_para_curr_iter[key] = local_data_convex
+            self._dict_previous_para_persist[key] = \
                 local_data_convex.clone().detach()  # used as previous data
 
         self._ma_iter += 1
-        return dict_ema_para_curr_iter
+        return dict_return_ema_para_curr_iter
 
     def after_epoch(self, epoch):
         torch_model = self.get_model()