From 85c21f1e2786dff78ab5349b994e28f763716339 Mon Sep 17 00:00:00 2001 From: zyliang2001 Date: Sat, 13 Jan 2024 08:11:58 -0800 Subject: [PATCH] Update hierarchy, build treeshap local, --- ...=> 01_run_importance_local_simulations.py} | 100 +++++++++++++++++- .../two_subgroups_linear_sims/dgp.py | 33 ++++++ .../two_subgroups_linear_sims/models.py | 15 +++ feature_importance/fi_config/test/dgp.py | 4 +- .../scripts/competing_methods_local.py | 39 +++++++ .../scripts/simulations_util.py | 4 +- 6 files changed, 188 insertions(+), 7 deletions(-) rename feature_importance/{01_simulation_localMDI.py => 01_run_importance_local_simulations.py} (80%) create mode 100644 feature_importance/fi_config/mdi_local/two_subgroups_linear_sims/dgp.py create mode 100644 feature_importance/fi_config/mdi_local/two_subgroups_linear_sims/models.py create mode 100644 feature_importance/scripts/competing_methods_local.py diff --git a/feature_importance/01_simulation_localMDI.py b/feature_importance/01_run_importance_local_simulations.py similarity index 80% rename from feature_importance/01_simulation_localMDI.py rename to feature_importance/01_run_importance_local_simulations.py index ef4b19b..c864c82 100644 --- a/feature_importance/01_simulation_localMDI.py +++ b/feature_importance/01_run_importance_local_simulations.py @@ -223,7 +223,7 @@ def run_comparison(path: str, def get_metrics(): - return [('rocauc', auroc_score), ('prauc', auprc_score)] + return [('rocauc', auroc_score)]#, ('prauc', auprc_score)] def reformat_results(results): @@ -413,4 +413,100 @@ def run_simulation(i, path, val_name, X_params_dict, X_dgp, y_params_dict, y_dgp metrics, args) for i in range(args.nreps)] assert all(results) - print('completed all experiments successfully!') \ No newline at end of file + print('completed all experiments successfully!') + +# get model file names + model_comparison_files_all = [] + for est in ests: + estimator_name = est[0].name.split(' - ')[0] + fi_estimators_all = [fi_estimator for fi_estimator in itertools.chain(*fi_ests) \ + if fi_estimator.model_type in est[0].model_type] + model_comparison_files = [f'{estimator_name}_{fi_estimator.name}_comparisons.pkl' for fi_estimator in + fi_estimators_all] + model_comparison_files_all += model_comparison_files + + # aggregate results + results_list = [] + if isinstance(vary_param_name, list): + for vary_param_dict in vary_param_dicts: + val_name = "_".join(vary_param_dict.values()) + + for i in range(args.nreps): + all_files = glob.glob(oj(path, val_name, 'rep' + str(i), '*')) + model_files = sorted([f for f in all_files if os.path.basename(f) in model_comparison_files_all]) + + if len(model_files) == 0: + print('No files found at ', oj(path, val_name, 'rep' + str(i))) + continue + + results = pd.concat( + [pkl.load(open(f, 'rb'))['df'] for f in model_files], + axis=0 + ) + + for param_name, param_val in vary_param_dict.items(): + val = vary_param_vals[param_name][param_val] + if vary_type[param_name] == "dgp": + if np.isscalar(val): + results.insert(0, param_name, val) + else: + results.insert(0, param_name, [val for i in range(results.shape[0])]) + results.insert(1, param_name + "_name", param_val) + elif vary_type[param_name] == "est" or vary_type[param_name] == "fi_est": + results.insert(0, param_name + "_name", copy.deepcopy(results[param_name])) + results.insert(0, 'rep', i) + results_list.append(results) + else: + for val_name, val in vary_param_vals.items(): + for i in range(args.nreps): + all_files = glob.glob(oj(path, val_name, 'rep' + str(i), '*')) + model_files = sorted([f for f in all_files if os.path.basename(f) in model_comparison_files_all]) + + if len(model_files) == 0: + print('No files found at ', oj(path, val_name, 'rep' + str(i))) + continue + + results = pd.concat( + [pkl.load(open(f, 'rb'))['df'] for f in model_files], + axis=0 + ) + if vary_type == "dgp": + if np.isscalar(val): + results.insert(0, vary_param_name, val) + else: + results.insert(0, vary_param_name, [val for i in range(results.shape[0])]) + results.insert(1, vary_param_name + "_name", val_name) + results.insert(2, 'rep', i) + elif vary_type == "est" or vary_type == "fi_est": + results.insert(0, vary_param_name + "_name", copy.deepcopy(results[vary_param_name])) + results.insert(1, 'rep', i) + results_list.append(results) + results_merged = pd.concat(results_list, axis=0) + pkl.dump(results_merged, open(oj(path, 'results.pkl'), 'wb')) + results_df = reformat_results(results_merged) + results_df.to_csv(oj(path, 'results.csv'), index=False) + + print('merged and saved all experiment results successfully!') + + # create R markdown summary of results + if args.create_rmd: + if args.show_vars is None: + show_vars = 'NULL' + else: + show_vars = args.show_vars + + if isinstance(vary_param_name, list): + vary_param_name = "; ".join(vary_param_name) + + sim_rmd = os.path.basename(results_dir) + '_simulation_results.Rmd' + os.system( + 'cp {} \'{}\''.format(oj("rmd", "simulation_results.Rmd"), sim_rmd) + ) + os.system( + 'Rscript -e "rmarkdown::render(\'{}\', params = list(results_dir = \'{}\', vary_param_name = \'{}\', seed = {}, keep_vars = {}), output_file = \'{}\', quiet = TRUE)"'.format( + sim_rmd, + results_dir, vary_param_name, str(args.split_seed), str(show_vars), + oj(path, "simulation_results.html")) + ) + os.system('rm \'{}\''.format(sim_rmd)) + print("created rmd of simulation results successfully!") \ No newline at end of file diff --git a/feature_importance/fi_config/mdi_local/two_subgroups_linear_sims/dgp.py b/feature_importance/fi_config/mdi_local/two_subgroups_linear_sims/dgp.py new file mode 100644 index 0000000..3091bd0 --- /dev/null +++ b/feature_importance/fi_config/mdi_local/two_subgroups_linear_sims/dgp.py @@ -0,0 +1,33 @@ +import sys +sys.path.append("../..") +from feature_importance.scripts.simulations_util import * + + +X_DGP = sample_real_X +X_PARAMS_DICT = { + "fpath": "data/X_splicing_cleaned.csv", + "sample_row_n": None, + "sample_col_n": None +} +### Update start for local MDI+ +Y_DGP = linear_model_two_groups +Y_PARAMS_DICT = { + "beta": 1, + "sigma": None, + "heritability": 0.4, + "s": 5 +} +### Update for local MDI+ done + +# # vary one parameter +# VARY_PARAM_NAME = "sample_row_n" +# VARY_PARAM_VALS = {"100": 100, "250": 250, "500": 500, "1000": 1000} + +# vary two parameters in a grid +VARY_PARAM_NAME = ["heritability", "sample_row_n"] +VARY_PARAM_VALS = {"heritability": {"0.1": 0.1, "0.2": 0.2, "0.4": 0.4, "0.8": 0.8}, + "sample_row_n": {"100": 100, "250": 250, "500": 500, "1000": 1000}} + +# # vary over n_estimators in RF model in models.py +# VARY_PARAM_NAME = "n_estimators" +# VARY_PARAM_VALS = {"placeholder": 0} \ No newline at end of file diff --git a/feature_importance/fi_config/mdi_local/two_subgroups_linear_sims/models.py b/feature_importance/fi_config/mdi_local/two_subgroups_linear_sims/models.py new file mode 100644 index 0000000..2357a1d --- /dev/null +++ b/feature_importance/fi_config/mdi_local/two_subgroups_linear_sims/models.py @@ -0,0 +1,15 @@ +from sklearn.ensemble import RandomForestRegressor +from feature_importance.util import ModelConfig, FIModelConfig +from feature_importance.scripts.competing_methods_local import tree_shap_local + +# N_ESTIMATORS=[50, 100, 500, 1000] +ESTIMATORS = [ + [ModelConfig('RF', RandomForestRegressor, model_type='tree', + other_params={'n_estimators': 100, 'min_samples_leaf': 5, 'max_features': 0.33})], + # [ModelConfig('RF', RandomForestRegressor, model_type='tree', vary_param="n_estimators", vary_param_val=m, + # other_params={'min_samples_leaf': 5, 'max_features': 0.33}) for m in N_ESTIMATORS] +] + +FI_ESTIMATORS = [ + [FIModelConfig('TreeSHAP', tree_shap_local, model_type='tree')] +] \ No newline at end of file diff --git a/feature_importance/fi_config/test/dgp.py b/feature_importance/fi_config/test/dgp.py index 7979215..6f9eb20 100644 --- a/feature_importance/fi_config/test/dgp.py +++ b/feature_importance/fi_config/test/dgp.py @@ -9,15 +9,13 @@ "sample_row_n": None, "sample_col_n": None } -### Update start for local MDI+ -Y_DGP = linear_model_two_groups #linear_model +Y_DGP = linear_model Y_PARAMS_DICT = { "beta": 1, "sigma": None, "heritability": 0.4, "s": 5 } -### Update for local MDI+ done # # vary one parameter # VARY_PARAM_NAME = "sample_row_n" diff --git a/feature_importance/scripts/competing_methods_local.py b/feature_importance/scripts/competing_methods_local.py new file mode 100644 index 0000000..b92fb67 --- /dev/null +++ b/feature_importance/scripts/competing_methods_local.py @@ -0,0 +1,39 @@ +import os +import sys +import pandas as pd +import numpy as np +import sklearn.base +from sklearn.base import RegressorMixin, ClassifierMixin +from functools import reduce + +import shap + + +def tree_shap_local(X, y, fit): + """ + Compute average treeshap value across observations + :param X: design matrix + :param y: response + :param fit: fitted model of interest (tree-based) + :return: dataframe - [Var, Importance] + Var: variable name + Importance: average absolute shap value + """ + explainer = shap.TreeExplainer(fit) + shap_values = explainer.shap_values(X, check_additivity=False) + if sklearn.base.is_classifier(fit): + def add_abs(a, b): + return abs(a) + abs(b) + results = reduce(add_abs, shap_values) + else: + results = abs(shap_values) + result_table = pd.DataFrame(results) + # results = results.mean(axis=0) + # results = pd.DataFrame(data=results, columns=['importance']) + # # Use column names from dataframe if possible + # if isinstance(X, pd.DataFrame): + # results.index = X.columns + # results.index.name = 'var' + # results.reset_index(inplace=True) + + return result_table \ No newline at end of file diff --git a/feature_importance/scripts/simulations_util.py b/feature_importance/scripts/simulations_util.py index bf0a71e..f697831 100644 --- a/feature_importance/scripts/simulations_util.py +++ b/feature_importance/scripts/simulations_util.py @@ -253,8 +253,8 @@ def create_y(x, s, beta, group_index): beta_group1 = generate_coef(beta, s) beta_group2 = generate_coef(beta, s) # Generate two response vectors for each subgroup - y_train_group1 = np.array([create_y(X[i, :], s, beta_group1, 0) for i in range(n//2)]) - y_train_group2 = np.array([create_y(X[i, :], s, beta_group2, 1) for i in range(n//2, n)]) + y_train_group1 = np.array([create_y(X[i, :], s, beta_group1, group_index=0) for i in range(n//2)]) + y_train_group2 = np.array([create_y(X[i, :], s, beta_group2, group_index=1) for i in range(n//2, n)]) y_train = np.concatenate((y_train_group1, y_train_group2)) ### Update for local MDI+ done