From 85c21f1e2786dff78ab5349b994e28f763716339 Mon Sep 17 00:00:00 2001
From: zyliang2001 <zhongyuan_liang@berkeley.edu>
Date: Sat, 13 Jan 2024 08:11:58 -0800
Subject: [PATCH] Update hierarchy, build treeshap local,

---
 ...=> 01_run_importance_local_simulations.py} | 100 +++++++++++++++++-
 .../two_subgroups_linear_sims/dgp.py          |  33 ++++++
 .../two_subgroups_linear_sims/models.py       |  15 +++
 feature_importance/fi_config/test/dgp.py      |   4 +-
 .../scripts/competing_methods_local.py        |  39 +++++++
 .../scripts/simulations_util.py               |   4 +-
 6 files changed, 188 insertions(+), 7 deletions(-)
 rename feature_importance/{01_simulation_localMDI.py => 01_run_importance_local_simulations.py} (80%)
 create mode 100644 feature_importance/fi_config/mdi_local/two_subgroups_linear_sims/dgp.py
 create mode 100644 feature_importance/fi_config/mdi_local/two_subgroups_linear_sims/models.py
 create mode 100644 feature_importance/scripts/competing_methods_local.py

diff --git a/feature_importance/01_simulation_localMDI.py b/feature_importance/01_run_importance_local_simulations.py
similarity index 80%
rename from feature_importance/01_simulation_localMDI.py
rename to feature_importance/01_run_importance_local_simulations.py
index ef4b19b..c864c82 100644
--- a/feature_importance/01_simulation_localMDI.py
+++ b/feature_importance/01_run_importance_local_simulations.py
@@ -223,7 +223,7 @@ def run_comparison(path: str,
 
 
 def get_metrics():
-    return [('rocauc', auroc_score), ('prauc', auprc_score)]
+    return [('rocauc', auroc_score)]#, ('prauc', auprc_score)]
 
 
 def reformat_results(results):
@@ -413,4 +413,100 @@ def run_simulation(i, path, val_name, X_params_dict, X_dgp, y_params_dict, y_dgp
                                           metrics, args) for i in range(args.nreps)]
             assert all(results)
 
-    print('completed all experiments successfully!')
\ No newline at end of file
+    print('completed all experiments successfully!')
+
+# get model file names
+    model_comparison_files_all = []
+    for est in ests:
+        estimator_name = est[0].name.split(' - ')[0]
+        fi_estimators_all = [fi_estimator for fi_estimator in itertools.chain(*fi_ests) \
+                             if fi_estimator.model_type in est[0].model_type]
+        model_comparison_files = [f'{estimator_name}_{fi_estimator.name}_comparisons.pkl' for fi_estimator in
+                                  fi_estimators_all]
+        model_comparison_files_all += model_comparison_files
+
+    # aggregate results
+    results_list = []
+    if isinstance(vary_param_name, list):
+        for vary_param_dict in vary_param_dicts:
+            val_name = "_".join(vary_param_dict.values())
+
+            for i in range(args.nreps):
+                all_files = glob.glob(oj(path, val_name, 'rep' + str(i), '*'))
+                model_files = sorted([f for f in all_files if os.path.basename(f) in model_comparison_files_all])
+
+                if len(model_files) == 0:
+                    print('No files found at ', oj(path, val_name, 'rep' + str(i)))
+                    continue
+
+                results = pd.concat(
+                    [pkl.load(open(f, 'rb'))['df'] for f in model_files],
+                    axis=0
+                )
+
+                for param_name, param_val in vary_param_dict.items():
+                    val = vary_param_vals[param_name][param_val]
+                    if vary_type[param_name] == "dgp":
+                        if np.isscalar(val):
+                            results.insert(0, param_name, val)
+                        else:
+                            results.insert(0, param_name, [val for i in range(results.shape[0])])
+                        results.insert(1, param_name + "_name", param_val)
+                    elif vary_type[param_name] == "est" or vary_type[param_name] == "fi_est":
+                        results.insert(0, param_name + "_name", copy.deepcopy(results[param_name]))
+                results.insert(0, 'rep', i)
+                results_list.append(results)
+    else:
+        for val_name, val in vary_param_vals.items():
+            for i in range(args.nreps):
+                all_files = glob.glob(oj(path, val_name, 'rep' + str(i), '*'))
+                model_files = sorted([f for f in all_files if os.path.basename(f) in model_comparison_files_all])
+
+                if len(model_files) == 0:
+                    print('No files found at ', oj(path, val_name, 'rep' + str(i)))
+                    continue
+
+                results = pd.concat(
+                    [pkl.load(open(f, 'rb'))['df'] for f in model_files],
+                    axis=0
+                )
+                if vary_type == "dgp":
+                    if np.isscalar(val):
+                        results.insert(0, vary_param_name, val)
+                    else:
+                        results.insert(0, vary_param_name, [val for i in range(results.shape[0])])
+                    results.insert(1, vary_param_name + "_name", val_name)
+                    results.insert(2, 'rep', i)
+                elif vary_type == "est" or vary_type == "fi_est":
+                    results.insert(0, vary_param_name + "_name", copy.deepcopy(results[vary_param_name]))
+                    results.insert(1, 'rep', i)
+                results_list.append(results)
+    results_merged = pd.concat(results_list, axis=0)
+    pkl.dump(results_merged, open(oj(path, 'results.pkl'), 'wb'))
+    results_df = reformat_results(results_merged)
+    results_df.to_csv(oj(path, 'results.csv'), index=False)
+
+    print('merged and saved all experiment results successfully!')
+
+    # create R markdown summary of results
+    if args.create_rmd:
+        if args.show_vars is None:
+            show_vars = 'NULL'
+        else:
+            show_vars = args.show_vars
+
+        if isinstance(vary_param_name, list):
+            vary_param_name = "; ".join(vary_param_name)
+
+        sim_rmd = os.path.basename(results_dir) + '_simulation_results.Rmd'
+        os.system(
+            'cp {} \'{}\''.format(oj("rmd", "simulation_results.Rmd"), sim_rmd)
+        )
+        os.system(
+            'Rscript -e "rmarkdown::render(\'{}\', params = list(results_dir = \'{}\', vary_param_name = \'{}\', seed = {}, keep_vars = {}), output_file = \'{}\', quiet = TRUE)"'.format(
+                sim_rmd,
+                results_dir, vary_param_name, str(args.split_seed), str(show_vars),
+                oj(path, "simulation_results.html"))
+        )
+        os.system('rm \'{}\''.format(sim_rmd))
+        print("created rmd of simulation results successfully!")
\ No newline at end of file
diff --git a/feature_importance/fi_config/mdi_local/two_subgroups_linear_sims/dgp.py b/feature_importance/fi_config/mdi_local/two_subgroups_linear_sims/dgp.py
new file mode 100644
index 0000000..3091bd0
--- /dev/null
+++ b/feature_importance/fi_config/mdi_local/two_subgroups_linear_sims/dgp.py
@@ -0,0 +1,33 @@
+import sys
+sys.path.append("../..")
+from feature_importance.scripts.simulations_util import *
+
+
+X_DGP = sample_real_X
+X_PARAMS_DICT = {
+    "fpath": "data/X_splicing_cleaned.csv",
+    "sample_row_n": None,
+    "sample_col_n": None
+}
+### Update start for local MDI+
+Y_DGP = linear_model_two_groups
+Y_PARAMS_DICT = {
+    "beta": 1,
+    "sigma": None,
+    "heritability": 0.4,
+    "s": 5
+}
+### Update for local MDI+ done
+
+# # vary one parameter
+# VARY_PARAM_NAME = "sample_row_n"
+# VARY_PARAM_VALS = {"100": 100, "250": 250, "500": 500, "1000": 1000}
+
+# vary two parameters in a grid
+VARY_PARAM_NAME = ["heritability", "sample_row_n"]
+VARY_PARAM_VALS = {"heritability": {"0.1": 0.1, "0.2": 0.2, "0.4": 0.4, "0.8": 0.8},
+                   "sample_row_n": {"100": 100, "250": 250, "500": 500, "1000": 1000}}
+
+# # vary over n_estimators in RF model in models.py
+# VARY_PARAM_NAME = "n_estimators"
+# VARY_PARAM_VALS = {"placeholder": 0}
\ No newline at end of file
diff --git a/feature_importance/fi_config/mdi_local/two_subgroups_linear_sims/models.py b/feature_importance/fi_config/mdi_local/two_subgroups_linear_sims/models.py
new file mode 100644
index 0000000..2357a1d
--- /dev/null
+++ b/feature_importance/fi_config/mdi_local/two_subgroups_linear_sims/models.py
@@ -0,0 +1,15 @@
+from sklearn.ensemble import RandomForestRegressor
+from feature_importance.util import ModelConfig, FIModelConfig
+from feature_importance.scripts.competing_methods_local import tree_shap_local
+
+# N_ESTIMATORS=[50, 100, 500, 1000]
+ESTIMATORS = [
+    [ModelConfig('RF', RandomForestRegressor, model_type='tree',
+                 other_params={'n_estimators': 100, 'min_samples_leaf': 5, 'max_features': 0.33})],
+    # [ModelConfig('RF', RandomForestRegressor, model_type='tree', vary_param="n_estimators", vary_param_val=m,
+    #              other_params={'min_samples_leaf': 5, 'max_features': 0.33}) for m in N_ESTIMATORS]
+]
+
+FI_ESTIMATORS = [
+    [FIModelConfig('TreeSHAP', tree_shap_local, model_type='tree')]
+]
\ No newline at end of file
diff --git a/feature_importance/fi_config/test/dgp.py b/feature_importance/fi_config/test/dgp.py
index 7979215..6f9eb20 100644
--- a/feature_importance/fi_config/test/dgp.py
+++ b/feature_importance/fi_config/test/dgp.py
@@ -9,15 +9,13 @@
     "sample_row_n": None,
     "sample_col_n": None
 }
-### Update start for local MDI+
-Y_DGP = linear_model_two_groups #linear_model
+Y_DGP = linear_model
 Y_PARAMS_DICT = {
     "beta": 1,
     "sigma": None,
     "heritability": 0.4,
     "s": 5
 }
-### Update for local MDI+ done
 
 # # vary one parameter
 # VARY_PARAM_NAME = "sample_row_n"
diff --git a/feature_importance/scripts/competing_methods_local.py b/feature_importance/scripts/competing_methods_local.py
new file mode 100644
index 0000000..b92fb67
--- /dev/null
+++ b/feature_importance/scripts/competing_methods_local.py
@@ -0,0 +1,39 @@
+import os
+import sys
+import pandas as pd
+import numpy as np
+import sklearn.base
+from sklearn.base import RegressorMixin, ClassifierMixin
+from functools import reduce
+
+import shap
+
+
+def tree_shap_local(X, y, fit):
+    """
+    Compute average treeshap value across observations
+    :param X: design matrix
+    :param y: response
+    :param fit: fitted model of interest (tree-based)
+    :return: dataframe - [Var, Importance]
+                         Var: variable name
+                         Importance: average absolute shap value
+    """
+    explainer = shap.TreeExplainer(fit)
+    shap_values = explainer.shap_values(X, check_additivity=False)
+    if sklearn.base.is_classifier(fit):
+        def add_abs(a, b):
+            return abs(a) + abs(b)
+        results = reduce(add_abs, shap_values)
+    else:
+        results = abs(shap_values)
+    result_table = pd.DataFrame(results)
+    # results = results.mean(axis=0)
+    # results = pd.DataFrame(data=results, columns=['importance'])
+    # # Use column names from dataframe if possible
+    # if isinstance(X, pd.DataFrame):
+    #     results.index = X.columns
+    # results.index.name = 'var'
+    # results.reset_index(inplace=True)
+
+    return result_table
\ No newline at end of file
diff --git a/feature_importance/scripts/simulations_util.py b/feature_importance/scripts/simulations_util.py
index bf0a71e..f697831 100644
--- a/feature_importance/scripts/simulations_util.py
+++ b/feature_importance/scripts/simulations_util.py
@@ -253,8 +253,8 @@ def create_y(x, s, beta, group_index):
     beta_group1 = generate_coef(beta, s)
     beta_group2 = generate_coef(beta, s)
     # Generate two response vectors for each subgroup
-    y_train_group1 = np.array([create_y(X[i, :], s, beta_group1, 0) for i in range(n//2)])
-    y_train_group2 = np.array([create_y(X[i, :], s, beta_group2, 1) for i in range(n//2, n)])
+    y_train_group1 = np.array([create_y(X[i, :], s, beta_group1, group_index=0) for i in range(n//2)])
+    y_train_group2 = np.array([create_y(X[i, :], s, beta_group2, group_index=1) for i in range(n//2, n)])
     y_train = np.concatenate((y_train_group1, y_train_group2))
     ### Update for local MDI+ done