diff --git a/README.md b/README.md
index 2c5b88c92..23ed163d6 100644
--- a/README.md
+++ b/README.md
@@ -37,12 +37,32 @@ then
 #### Guide for Helmholtz GPU cluster
 ```
+conda create --name domainlab_py39 python=3.9
+conda activate domainlab_py39
 conda install pytorch==1.12.1 torchvision==0.13.1 torchaudio==0.12.1 cudatoolkit=11.6 -c pytorch -c conda-forge
 conda install torchmetric==0.10.3
+git checkout fbopt
 pip install -r requirements_notorch.txt
 conda install tensorboard
 ```
+#### Download PACS
+
+Step 1:
+
+Use the following script to download PACS to your local machine, then upload it to your cluster:
+
+https://github.com/marrlab/DomainLab/blob/fbopt/data/script/download_pacs.py
+
+Step 2:
+Create a symbolic link, following the example script in https://github.com/marrlab/DomainLab/blob/master/sh_pacs.sh,
+
+where `mkdir -p data/pacs` is executed in the repository directory and
+
+`ln -s /dir/to/yourdata/pacs/raw ./data/pacs/PACS`
+creates the symbolic link inside the repository directory.
+
+
 #### Windows installation details
 
 To install DomainLab on Windows, please remove the `snakemake` dependency from the `requirements.txt` file.
diff --git a/ci_run_examples.sh b/ci_run_examples.sh
index 23249327f..3c00bef61 100644
--- a/ci_run_examples.sh
+++ b/ci_run_examples.sh
@@ -8,6 +8,8 @@ sed -n '/```shell/,/```/ p' docs/doc_examples.md | sed '/^```/ d' >> ./sh_temp_e
 bash -x -v -e sh_temp_example.sh
 echo "general examples done"
+rm -r zoutput
+
 
 echo "#!/bin/bash -x -v" > sh_temp_mnist.sh
 sed -n '/```shell/,/```/ p' docs/doc_MNIST_classification.md | sed '/^```/ d' >> ./sh_temp_mnist.sh
 bash -x -v -e sh_temp_mnist.sh
diff --git a/domainlab/algos/builder_diva.py b/domainlab/algos/builder_diva.py
index d13de99d8..e3e5256cd 100644
--- a/domainlab/algos/builder_diva.py
+++ b/domainlab/algos/builder_diva.py
@@ -35,7 +35,7 @@ def init_business(self, exp):
         request = RequestVAEBuilderCHW(
             task.isize.c, task.isize.h, task.isize.w, args)
         node = VAEChainNodeGetter(request)()
-        model = mk_diva()(node,
+        model = mk_diva(str_mu=args.str_mu)(node,
                           zd_dim=args.zd_dim,
                           zy_dim=args.zy_dim,
                           zx_dim=args.zx_dim,
diff --git a/domainlab/algos/msels/a_model_sel.py b/domainlab/algos/msels/a_model_sel.py
index 1f20ff912..f6ae57799 100644
--- a/domainlab/algos/msels/a_model_sel.py
+++ b/domainlab/algos/msels/a_model_sel.py
@@ -25,7 +25,7 @@ def accept(self, trainer, tr_obs):
         self.tr_obs = tr_obs
 
     @abc.abstractmethod
-    def update(self):
+    def update(self, clear_counter=False):
         """
         observer + visitor pattern to trainer
         if the best model should be updated
diff --git a/domainlab/algos/msels/c_msel_oracle.py b/domainlab/algos/msels/c_msel_oracle.py
index eb672f46e..299a9e48b 100644
--- a/domainlab/algos/msels/c_msel_oracle.py
+++ b/domainlab/algos/msels/c_msel_oracle.py
@@ -18,7 +18,7 @@ def __init__(self, msel=None):
         self.best_oracle_acc = 0
         self.msel = msel
 
-    def update(self):
+    def update(self, clear_counter=False):
         """
         if the best model should be updated
         """
@@ -35,7 +35,7 @@ def update(self):
                 logger.info("new oracle model saved")
             flag = True
         if self.msel is not None:
-            return self.msel.update()
+            return self.msel.update(clear_counter)
         return flag
 
     def if_stop(self):
diff --git a/domainlab/algos/msels/c_msel_tr_loss.py b/domainlab/algos/msels/c_msel_tr_loss.py
index c42f324b8..3b9d4581e 100644
--- a/domainlab/algos/msels/c_msel_tr_loss.py
+++ b/domainlab/algos/msels/c_msel_tr_loss.py
@@ -17,7 +17,7 @@ def __init__(self, max_es):
         self.max_es = max_es
         super().__init__()
 
-    def update(self):
+    def update(self, clear_counter=False):
""" if the best model should be updated """ @@ -34,6 +34,9 @@ def update(self): logger.info(f"early stop counter: {self.es_c}") logger.info(f"loss:{loss}, best loss: {self.best_loss}") flag = False # do not update best model + if clear_counter: + logger.info("clearing counter") + self.es_c = 0 return flag def if_stop(self): diff --git a/domainlab/algos/msels/c_msel_val.py b/domainlab/algos/msels/c_msel_val.py index 939cc47a0..01497f0c4 100644 --- a/domainlab/algos/msels/c_msel_val.py +++ b/domainlab/algos/msels/c_msel_val.py @@ -16,19 +16,19 @@ def __init__(self, max_es): self.best_te_metric = 0.0 super().__init__(max_es) # construct self.tr_obs (observer) - def update(self): + def update(self, clear_counter=False): """ if the best model should be updated """ flag = True if self.tr_obs.metric_val is None or self.tr_obs.str_msel == "loss_tr": - return super().update() + return super().update(clear_counter) metric = self.tr_obs.metric_val[self.tr_obs.str_metric4msel] if self.tr_obs.metric_te is not None: metric_te_current = self.tr_obs.metric_te[self.tr_obs.str_metric4msel] self.best_te_metric = max(self.best_te_metric, metric_te_current) - if metric > self.best_val_acc: # observer + if metric > self.best_val_acc: # update hat{model} # different from loss, accuracy should be improved: the bigger the better self.best_val_acc = metric self.es_c = 0 # restore counter @@ -45,5 +45,7 @@ def update(self): f"corresponding to test acc: \ {self.sel_model_te_acc} / {self.best_te_metric}") flag = False # do not update best model - + if clear_counter: + logger.info("clearing counter") + self.es_c = 0 return flag diff --git a/domainlab/algos/trainers/args_fbopt.py b/domainlab/algos/trainers/args_fbopt.py index a84969efd..c144f3d58 100644 --- a/domainlab/algos/trainers/args_fbopt.py +++ b/domainlab/algos/trainers/args_fbopt.py @@ -36,6 +36,9 @@ def add_args2parser_fbopt(parser): parser.add_argument('--no_setpoint_update', action='store_true', default=False, help='disable setpoint update') + parser.add_argument('--str_mu', type=str, default="default", help='which penalty to tune') + + # the following hyperparamters do not need to be tuned parser.add_argument('--beta_mu', type=float, default=1.1, help='how much to multiply mu each time') diff --git a/domainlab/algos/trainers/fbopt_alternate.py b/domainlab/algos/trainers/fbopt_alternate.py index a7d795626..67fecec23 100644 --- a/domainlab/algos/trainers/fbopt_alternate.py +++ b/domainlab/algos/trainers/fbopt_alternate.py @@ -98,7 +98,7 @@ def cal_delta4control(self, list1, list_setpoint): def cal_delta_integration(self, list_old, list_new, coeff): return [(1-coeff)*a + coeff*b for a, b in zip(list_old, list_new)] - def search_mu(self, epo_reg_loss, epo_task_loss, dict_theta=None, miter=None): + def search_mu(self, epo_reg_loss, epo_task_loss, epo_loss_tr, dict_theta=None, miter=None): """ start from parameter dictionary dict_theta: {"layer":tensor}, enlarge mu w.r.t. 
         its current value
@@ -137,9 +137,6 @@ def search_mu(self, epo_reg_loss, epo_task_loss, dict_theta=None, miter=None):
                 f'reg/setpoint{i}': reg_set,
             }, miter)
             self.writer.add_scalar(f'x-axis=task vs y-axis=reg/dyn{i}', reg_dyn, epo_task_loss)
-
-        epo_loss_tr = epo_task_loss + torch.inner(
-            torch.Tensor(list(self.mmu.values())), torch.Tensor(epo_reg_loss))
         self.writer.add_scalar('loss_penalized', epo_loss_tr, miter)
         self.writer.add_scalar('task', epo_task_loss, miter)
         acc_te = 0
diff --git a/domainlab/algos/trainers/train_mu_controller.py b/domainlab/algos/trainers/train_mu_controller.py
index b0c982e4a..c015930c4 100644
--- a/domainlab/algos/trainers/train_mu_controller.py
+++ b/domainlab/algos/trainers/train_mu_controller.py
@@ -45,24 +45,27 @@ def eval_r_loss(self):
         # mock the model hyper-parameter to be from dict4mu
         epo_reg_loss = []
         epo_task_loss = 0
+        epo_p_loss = 0
         counter = 0.0
         with torch.no_grad():
             for _, (tensor_x, vec_y, vec_d, *_) in enumerate(self.loader_tr_no_drop):
                 tensor_x, vec_y, vec_d = \
                     tensor_x.to(self.device), vec_y.to(self.device), vec_d.to(self.device)
                 tuple_reg_loss = self.model.cal_reg_loss(tensor_x, vec_y, vec_d)
+                p_loss, *_ = self.model.cal_loss(tensor_x, vec_y, vec_d)
                 # NOTE: first [0] extract the loss, second [0] get the list
                 list_b_reg_loss = tuple_reg_loss[0]
-                list_b_reg_loss_sumed = [ele.sum().item() for ele in list_b_reg_loss]
+                list_b_reg_loss_sumed = [ele.sum().detach().item() for ele in list_b_reg_loss]
                 if len(epo_reg_loss) == 0:
                     epo_reg_loss = list_b_reg_loss_sumed
                 else:
                     epo_reg_loss = list(map(add, epo_reg_loss, list_b_reg_loss_sumed))
-                b_task_loss = self.model.cal_task_loss(tensor_x, vec_y).sum()
+                b_task_loss = self.model.cal_task_loss(tensor_x, vec_y).sum().detach().item()
                 # sum will kill the dimension of the mini batch
                 epo_task_loss += b_task_loss
+                epo_p_loss += p_loss.sum().detach().item()
                 counter += 1.0
-        return list_divide(epo_reg_loss, counter), epo_task_loss/counter
+        return list_divide(epo_reg_loss, counter), epo_task_loss/counter, epo_p_loss / counter
 
     def before_batch(self, epoch, ind_batch):
         """
@@ -77,7 +80,7 @@ def before_batch(self, epoch, ind_batch):
     def before_tr(self):
         self.set_scheduler(scheduler=HyperSchedulerFeedbackAlternave)
         self.model.hyper_update(epoch=None, fun_scheduler=HyperSetter(self.hyper_scheduler.mmu))
-        self.epo_reg_loss_tr, self.epo_task_loss_tr = self.eval_r_loss()
+        self.epo_reg_loss_tr, self.epo_task_loss_tr, self.epo_loss_tr = self.eval_r_loss()
         self.hyper_scheduler.set_setpoint(
             [ele * self.aconf.ini_setpoint_ratio for ele in self.epo_reg_loss_tr],
             self.epo_task_loss_tr)
@@ -90,6 +93,7 @@ def tr_epoch(self, epoch):
         self.hyper_scheduler.search_mu(
             self.epo_reg_loss_tr,
             self.epo_task_loss_tr,
+            self.epo_loss_tr,
             dict(self.model.named_parameters()),
             miter=epoch)
         self.hyper_scheduler.update_setpoint(self.epo_reg_loss_tr, self.epo_task_loss_tr)
diff --git a/domainlab/models/a_model_classif.py b/domainlab/models/a_model_classif.py
index f5ec1034d..ae1814b95 100644
--- a/domainlab/models/a_model_classif.py
+++ b/domainlab/models/a_model_classif.py
@@ -197,4 +197,6 @@ def cal_reg_loss(self, tensor_x, tensor_y, tensor_d, others=None):
         """
         for ERM to adapt to the interface of other regularized learners
         """
-        return [torch.Tensor([0])], [0.0]
\ No newline at end of file
+        device = tensor_x.device
+        bsize = tensor_x.shape[0]
+        return [torch.zeros(bsize, 1).to(device)], [0.0]
\ No newline at end of file
diff --git a/domainlab/models/model_diva.py b/domainlab/models/model_diva.py
index 752d5acf2..305be1162 100644
--- a/domainlab/models/model_diva.py
+++ b/domainlab/models/model_diva.py
@@ -9,7 +9,7 @@ from domainlab.utils.utils_class import store_args
 
 
-def mk_diva(parent_class=VAEXYDClassif):
+def mk_diva(parent_class=VAEXYDClassif, str_mu="default"):
     """
     Instantiate a domain invariant variational autoencoder (DIVA) with arbitrary task loss.
 
@@ -89,8 +89,6 @@ def hyper_update(self, epoch, fun_scheduler):
             self.beta_d = dict_rst["beta_d"]
             self.beta_y = dict_rst["beta_y"]
             self.beta_x = dict_rst["beta_x"]
-            self.gamma_d = dict_rst["gamma_d"]
-            self.mu_recon = dict_rst["mu_recon"]
 
         def hyper_init(self, functor_scheduler, trainer=None):
             """
@@ -100,11 +98,9 @@ def hyper_init(self, functor_scheduler, trainer=None):
             """
             return functor_scheduler(
                 trainer=trainer,
-                mu_recon=self.mu_recon,
                 beta_d=self.beta_d,
                 beta_y=self.beta_y,
                 beta_x=self.beta_x,
-                gamma_d=self.gamma_d,
             )
 
         def get_list_str_y(self):
@@ -142,4 +138,71 @@ def cal_reg_loss(self, tensor_x, tensor_y, tensor_d, others=None):
             lc_d = F.cross_entropy(logit_d, d_target, reduction="none")
             return [loss_recon_x, zd_p_minus_zd_q, zx_p_minus_zx_q, zy_p_minus_zy_q, lc_d], \
                 [self.mu_recon, -self.beta_d, -self.beta_x, -self.beta_y, -self.gamma_d]
-    return ModelDIVA
+
+    class ModelDIVAGammadRecon(ModelDIVA):
+        def hyper_update(self, epoch, fun_scheduler):
+            """hyper_update.
+
+            :param epoch:
+            :param fun_scheduler:
+            """
+            dict_rst = fun_scheduler(epoch)
+            self.beta_d = dict_rst["beta_d"]
+            self.beta_y = dict_rst["beta_y"]
+            self.beta_x = dict_rst["beta_x"]
+            self.gamma_d = dict_rst["gamma_d"]
+            self.mu_recon = dict_rst["mu_recon"]
+
+        def hyper_init(self, functor_scheduler, trainer=None):
+            """
+            initiate a scheduler object via class name and things inside this model
+
+            :param functor_scheduler: the class name of the scheduler
+            """
+            return functor_scheduler(
+                trainer=trainer,
+                mu_recon=self.mu_recon,
+                beta_d=self.beta_d,
+                beta_y=self.beta_y,
+                beta_x=self.beta_x,
+                gamma_d=self.gamma_d,
+            )
+
+
+    class ModelDIVAGammad(ModelDIVA):
+        def hyper_update(self, epoch, fun_scheduler):
+            """hyper_update.
+
+            :param epoch:
+            :param fun_scheduler:
+            """
+            dict_rst = fun_scheduler(epoch)
+            self.beta_d = dict_rst["beta_d"]
+            self.beta_y = dict_rst["beta_y"]
+            self.beta_x = dict_rst["beta_x"]
+            self.gamma_d = dict_rst["gamma_d"]
+
+        def hyper_init(self, functor_scheduler, trainer=None):
+            """
+            initiate a scheduler object via class name and things inside this model
+
+            :param functor_scheduler: the class name of the scheduler
+            """
+            return functor_scheduler(
+                trainer=trainer,
+                beta_d=self.beta_d,
+                beta_y=self.beta_y,
+                beta_x=self.beta_x,
+                gamma_d=self.gamma_d,
+            )
+
+    class ModelDIVADefault(ModelDIVA):
+        """DIVA variant that keeps the default set of tunable penalty weights.
+        """
+    if str_mu == "gammad_recon":
+        return ModelDIVAGammadRecon
+    if str_mu == "gammad":
+        return ModelDIVAGammad
+    if str_mu == "default":
+        return ModelDIVADefault
+    raise RuntimeError("unsupported argument candidate for str_mu; allowed: default, gammad_recon, gammad")
diff --git a/domainlab/utils/generate_fbopt_phase_portrait.py b/domainlab/utils/generate_fbopt_phase_portrait.py
new file mode 100644
index 000000000..88b2bd97a
--- /dev/null
+++ b/domainlab/utils/generate_fbopt_phase_portrait.py
@@ -0,0 +1,63 @@
+import glob
+import os
+
+import matplotlib.pyplot as plt
+from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
+
+
+# FIXME: maybe adjust the output path where the png is saved
+output_dir = "../.."
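+# NOTE (assumption, not part of the original script): the relative default "../.." resolves
+# to the repository root only when the script is launched from its own folder domainlab/utils/,
+# which is also what the glob pattern "../../runs/*/events*" in __main__ below assumes.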
+
+def get_xy_from_event_file(event_file, tf_size_guidance=None):
+    if tf_size_guidance is None:
+        # settings for which/how much data is loaded from the tensorboard event files
+        tf_size_guidance = {
+            'compressedHistograms': 0,
+            'images': 0,
+            'scalars': 1e10,  # keep an effectively unlimited number of scalars
+            'histograms': 0
+        }
+    # load event file
+    event = EventAccumulator(event_file, tf_size_guidance)
+    event.Reload()
+    # extract the reg/dyn0 values
+    y_event = event.Scalars('x-axis=task vs y-axis=reg/dyn0')
+    y = [s.value for s in y_event]
+    x_int = [s.step for s in y_event]  # the .step data are saved as ints in tensorboard, so we will re-extract from 'task'
+    # extract the corresponding 'task' values
+    x_event = event.Scalars('task')
+    x = [s.value for s in x_event]
+    # sanity check:
+    for i in range(len(x)):
+        assert int(x[i]) == x_int[i]
+    return x, y
+
+def phase_portrait_combined(event_files, colors):
+    plt.figure()
+
+    for event_i in range(len(event_files)):
+        x, y = get_xy_from_event_file(event_files[event_i])
+
+        assert len(x) == len(y)
+        for i in range(len(x)-1):
+            plt.arrow(x[i], y[i], (x[i+1]-x[i]), (y[i+1]-y[i]),
+                      head_width=0.2, head_length=0.2, length_includes_head=True,
+                      fc=colors[event_i], ec=colors[event_i], alpha=0.4)
+
+        plt.plot(x[0], y[0], 'ko')
+        plt.scatter(x, y, s=1, c='black')
+
+    plt.xlabel("task")
+    plt.ylabel("reg/dyn0")
+    plt.title("x-axis=task vs y-axis=reg/dyn0")
+
+    plt.savefig(os.path.join(output_dir, 'phase_portrait_combined.png'), dpi=300)
+
+
+if __name__ == "__main__":
+    event_files = glob.glob("../../runs/*/events*")
+    print("Using the following tensorboard event files:\n{}".format("\n".join(event_files)))
+    cmap = plt.get_cmap('tab10')  # Choose a colormap
+    colors = [cmap(i) for i in range(len(event_files))]  # Different colors for the different runs
+    phase_portrait_combined(event_files, colors)
+
diff --git a/examples/benchmark/benchmark_fbopt_mnist_diva.yaml b/examples/benchmark/benchmark_fbopt_mnist_diva.yaml
index f705f4da1..12878acb0 100644
--- a/examples/benchmark/benchmark_fbopt_mnist_diva.yaml
+++ b/examples/benchmark/benchmark_fbopt_mnist_diva.yaml
@@ -4,7 +4,7 @@ output_dir: zoutput/benchmarks/benchmark_fbopt
 sampling_seed: 0
 
 startseed: 0
-endseed: 2
+endseed: 4
 
 test_domains:
   - 0
@@ -21,7 +21,6 @@ domainlab_args:
   zx_dim: 0
   zy_dim: 32
   zd_dim: 32
-  gamma_y: 1.0
   nname: conv_bn_pool_2
   nname_dom: conv_bn_pool_2
   nname_topic_distrib_img2topic: conv_bn_pool_2
@@ -48,23 +47,60 @@ Shared params:
     distribution: loguniform
 
   mu_init:
-    min: 0.01
-    max: 0.05
+    min: 0.000001
+    max: 0.00001
     num: 5
     distribution: loguniform
 
+  gamma_y:
+    min: 1.0
+    max: 1e6
+    num: 3
+    distribution: loguniform
+
+  gamma_d:
+    min: 1.0
+    max: 1e6
+    step: 100
+    num: 3
+    distribution: loguniform
 
 # Test fbopt with different hyperparameter configurations
-diva_fbopt:
+diva_fbopt_a:
   aname: diva
   trainer: fbopt
+  str_mu: gammad_recon
+  gamma_y: 1.0
+  ini_setpoint_ratio: 0.99
   shared:
-    - ini_setpoint_ratio
     - k_i_gain
     - mu_init
+
+diva_feedforward_a:
+  aname: diva
+  trainer: hyperscheduler
+  str_mu: gammad_recon
+  gamma_y: 1.0
+  shared:
+    - gamma_d
+
+diva_default:
+  aname: diva
+  trainer: hyperscheduler
+  str_mu: default
+  shared:
+    - gamma_d
+    - gamma_y
+
+diva_fixed_penalty:
+  aname: diva
+  trainer: basic
+  str_mu: default
+  shared:
+    - gamma_d
+    - gamma_y
+
 erm:
   aname: deepall
diff --git a/examples/benchmark/benchmark_fbopt_mnist_jigen.yaml b/examples/benchmark/benchmark_fbopt_mnist_jigen.yaml
index c0f3e25e5..c58868965 100644
--- a/examples/benchmark/benchmark_fbopt_mnist_jigen.yaml
+++ b/examples/benchmark/benchmark_fbopt_mnist_jigen.yaml
@@ -4,7 +4,7 @@ output_dir: zoutput/benchmarks/benchmark_fbopt
 sampling_seed: 0
 
 startseed: 0
-endseed: 2
+endseed: 4
 
 test_domains:
   - 0
@@ -24,52 +24,52 @@ domainlab_args:
   mu_clip: 10
   coeff_ma: 0.5
   no_tensorboard: False
+  pperm: 0.5
 
 
 Shared params:
-  ini_setpoint_ratio:
-    min: 0.95
-    max: 0.99
-    num: 2
-    distribution: uniform
-
   k_i_gain:
     min: 0.0001
     max: 0.01
     num: 2
-    step: 0.0001
     distribution: uniform
 
   mu_init:
-    min: 0.001
-    max: 0.1
+    min: 0.000001
+    max: 0.00001
     num: 3
     distribution: loguniform
 
-  pperm:
-    min: 0.1
-    max: 0.9
-    num: 3
-    distribution: uniform
+  gamma_reg:
+    min: 0.01
+    max: 10_000
+    num: 10
+    distribution: loguniform
+
 
 # Test fbopt with different hyperparameter configurations
 jigen_feedback:
   aname: jigen
   trainer: fbopt
-
+  ini_setpoint_ratio: 0.99
   shared:
-    - ini_setpoint_ratio
     - k_i_gain
     - mu_init
-    - pperm
 
 jigen_feedforward:
   aname: jigen
-  trainer: fbopt
+  trainer: hyperscheduler
   shared:
-    - pperm
-
+    - gamma_reg
+
+jigen_fixed_penalty:
+  aname: jigen
+  trainer: basic
+  shared:
+    - gamma_reg
+
 erm:
   aname: deepall
diff --git a/examples/benchmark/benchmark_fbopt_pacs_diva.yaml b/examples/benchmark/benchmark_fbopt_pacs_diva.yaml
index 68d9d55e2..81a6a4e9a 100644
--- a/examples/benchmark/benchmark_fbopt_pacs_diva.yaml
+++ b/examples/benchmark/benchmark_fbopt_pacs_diva.yaml
@@ -23,7 +23,7 @@ domainlab_args:
   npath_topic_distrib_img2topic: examples/nets/resnet50domainbed.py
   npath_encoder_sandwich_layer_img2h4zd: examples/nets/resnet50domainbed.py
   exp_shoulder_clip: 10
-  mu_clip: 10_000
+  mu_clip: 1_000_000
   coeff_ma: 0.5
   zx_dim: 0
   zy_dim: 64
@@ -41,13 +41,27 @@ Shared params:
 
   k_i_gain:
     min: 0.0001
-    max: 0.1
+    max: 0.01
     num: 3
     distribution: uniform
 
   mu_init:
-    min: 0.0001
-    max: 1.0
+    min: 0.000001
+    max: 0.00001
+    num: 3
+    distribution: loguniform
+
+  gamma_y:
+    min: 1.0
+    max: 1e6
+    step: 100
+    num: 3
+    distribution: loguniform
+
+  gamma_d:
+    min: 1.0
+    max: 1e6
+    step: 100
     num: 3
     distribution: loguniform
 
@@ -55,11 +69,39 @@
 
 # Test fbopt with different hyperparameter configurations
-diva_fbopt:
+diva_fbopt_full:
   aname: diva
   trainer: fbopt
+  str_mu: gammad_recon
   gamma_y: 1.0
+  ini_setpoint_ratio: 0.99
   shared:
-    - ini_setpoint_ratio
     - k_i_gain
     - mu_init
+
+diva_feedforward_full:
+  aname: diva
+  trainer: hyperscheduler
+  str_mu: gammad_recon
+  gamma_y: 1.0
+  shared:
+    - gamma_d
+
+diva_default:
+  aname: diva
+  trainer: hyperscheduler
+  str_mu: default
+  shared:
+    - gamma_d
+    - gamma_y
+
+diva_fixed_penalty:
+  aname: diva
+  trainer: basic
+  str_mu: default
+  shared:
+    - gamma_d
+    - gamma_y
+
+erm:
+  aname: deepall
diff --git a/examples/benchmark/benchmark_fbopt_pacs_jigen.yaml b/examples/benchmark/benchmark_fbopt_pacs_jigen.yaml
index 2ee334078..aa9d56696 100644
--- a/examples/benchmark/benchmark_fbopt_pacs_jigen.yaml
+++ b/examples/benchmark/benchmark_fbopt_pacs_jigen.yaml
@@ -5,7 +5,7 @@ output_dir: zoutput/benchmarks/benchmark_fbopt_pacs_full
 sampling_seed: 0
 
 startseed: 0
-endseed: 1
+endseed: 4
 
 test_domains:
   - sketch
@@ -28,6 +28,7 @@ domainlab_args:
   zx_dim: 0
   zy_dim: 64
   zd_dim: 64
+  pperm: 0.5
 
 
 Shared params:
@@ -38,9 +39,9 @@
     distribution: loguniform
 
   mu_init:
-    min: 0.00001
-    max: 1.0
-    num: 4
+    min: 0.000001
+    max: 0.00005
+    num: 3
     distribution: loguniform
 
   pperm:
@@ -49,13 +50,33 @@
     num: 3
     distribution: uniform
 
+  gamma_reg:
+    min: 0.01
+    max: 10_000
+    num: 10
+    distribution: loguniform
+
 
 # Test fbopt with different hyperparameter configurations
-jigen_fbopt:
+jigen_feedback:
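+  # JiGen with the feedback penalty controller (trainer: fbopt); compared against the
+  # feed-forward scheduler (jigen_feedforward) and fixed-penalty (jigen_fixed_penalty) nodes below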
   aname: jigen
-  init_setpoint_ratio: 0.99
   trainer: fbopt
+  ini_setpoint_ratio: 0.99
   shared:
     - k_i_gain
     - mu_init
-    - pperm
+
+jigen_feedforward:
+  aname: jigen
+  trainer: hyperscheduler
+  shared:
+    - gamma_reg
+
+jigen_fixed_penalty:
+  aname: jigen
+  trainer: basic
+  shared:
+    - gamma_reg
+
+erm:
+  aname: deepall
diff --git a/pyproject.toml b/pyproject.toml
index 30419abf9..c08701030 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -9,7 +9,7 @@ python = "^3.9"
 numpy = "^1.23.4"
 matplotlib = "^3.6.1"
 seaborn = "0.12.2"
-snakemake = "7.21.0"
+snakemake = "^7.32.4"
 torchmetrics = "^0.10.0"
 torch = "^1.12.0"
 torchvision = "^0.13.0"
diff --git a/requirements.txt b/requirements.txt
index ef70b5567..df7bf73a0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -61,7 +61,7 @@ setuptools==68.0.0 ; python_version >= "3.9" and python_version < "4.0"
 six==1.16.0 ; python_version >= "3.9" and python_version < "4.0"
 smart-open==6.3.0 ; python_version >= "3.9" and python_version < "4.0"
 smmap==5.0.0 ; python_version >= "3.9" and python_version < "4.0"
-snakemake==7.21.0 ; python_version >= "3.9" and python_version < "4.0"
+snakemake==7.32.4 ; python_version >= "3.9" and python_version < "4.0"
 soupsieve==2.4.1 ; python_version >= "3.9" and python_version < "4.0"
 stopit==1.1.2 ; python_version >= "3.9" and python_version < "4.0"
 tabulate==0.9.0 ; python_version >= "3.9" and python_version < "4.0"
diff --git a/run_erm.sh b/run_erm.sh
new file mode 100644
index 000000000..e5d44a151
--- /dev/null
+++ b/run_erm.sh
@@ -0,0 +1 @@
+python main_out.py --te_d=1 --tr_d 0 3 --task=mnistcolor10 --bs=16 --aname=deepall --nname=conv_bn_pool_2 --epos=10
diff --git a/run_fbopt.sh b/run_fbopt.sh
index ed7873414..33226f290 100644
--- a/run_fbopt.sh
+++ b/run_fbopt.sh
@@ -3,4 +3,4 @@
 # although garbage collector has been explicitly called, sometimes there is still CUDA out of memory error
 # so it is better not to use GPU to do the pytest to ensure every time there is no CUDA out of memory error occuring
 # pytest -s tests/test_fbopt.py
-python main_out.py --te_d=caltech --task=mini_vlcs --bs=16 --aname=jigen --trainer=fbopt --nname=alexnet --epos=200 --es=100 --init_mu=1.0
+python main_out.py --te_d=caltech --task=mini_vlcs --bs=16 --aname=jigen --trainer=fbopt --nname=alexnet --epos=200 --es=5 --mu_init=1.0
diff --git a/run_fbopt_mnist.sh b/run_fbopt_mnist.sh
index 212244e1d..55434f380 100644
--- a/run_fbopt_mnist.sh
+++ b/run_fbopt_mnist.sh
@@ -3,4 +3,4 @@
 # although garbage collector has been explicitly called, sometimes there is still CUDA out of memory error
 # so it is better not to use GPU to do the pytest to ensure every time there is no CUDA out of memory error occuring
 # pytest -s tests/test_fbopt.py
-python main_out.py --te_d=1 --tr_d 0 3 --task=mnistcolor10 --bs=16 --aname=jigen --trainer=fbopt --nname=conv_bn_pool_2 --epos=200 --es=100 --mu_init=0.00001
+python main_out.py --te_d=1 --tr_d 0 3 --task=mnistcolor10 --bs=16 --aname=jigen --trainer=fbopt --nname=conv_bn_pool_2 --epos=2000 --es=50 --mu_init=0.00001
diff --git a/tests/test_observer.py b/tests/test_observer.py
new file mode 100644
index 000000000..46e2cfd87
--- /dev/null
+++ b/tests/test_observer.py
@@ -0,0 +1,46 @@
+"""
+unit and end-to-end tests of the model-selection observer update with deepall
+"""
+import gc
+import torch
+from domainlab.compos.exp.exp_main import Exp
+from domainlab.arg_parser import mk_parser_main
+
+
+def test_deepall():
+    """
+    deepall with the default model selection
+    """
+    parser = mk_parser_main()
+    margs = parser.parse_args(["--te_d", "caltech",
+                               "--task", "mini_vlcs",
"--aname", "deepall", "--bs", "2", + "--nname", "conv_bn_pool_2" + ]) + exp = Exp(margs) + exp.trainer.before_tr() + exp.trainer.tr_epoch(0) + exp.trainer.observer.update(True) + del exp + torch.cuda.empty_cache() + gc.collect() + + +def test_deepall_trloss(): + """ + unit deep all + """ + parser = mk_parser_main() + margs = parser.parse_args(["--te_d", "caltech", + "--task", "mini_vlcs", + "--aname", "deepall", "--bs", "2", + "--nname", "conv_bn_pool_2", + "--msel", "loss_tr" + ]) + exp = Exp(margs) + exp.trainer.before_tr() + exp.trainer.tr_epoch(0) + exp.trainer.observer.update(True) + del exp + torch.cuda.empty_cache() + gc.collect()