Skip to content

Commit

Permalink
Update hierarchy, build treeshap local,
Browse files Browse the repository at this point in the history
  • Loading branch information
zyliang2001 committed Jan 13, 2024
1 parent 7e3390e commit 85c21f1
Show file tree
Hide file tree
Showing 6 changed files with 188 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,7 @@ def run_comparison(path: str,


def get_metrics():
return [('rocauc', auroc_score), ('prauc', auprc_score)]
return [('rocauc', auroc_score)]#, ('prauc', auprc_score)]


def reformat_results(results):
Expand Down Expand Up @@ -413,4 +413,100 @@ def run_simulation(i, path, val_name, X_params_dict, X_dgp, y_params_dict, y_dgp
metrics, args) for i in range(args.nreps)]
assert all(results)

print('completed all experiments successfully!')
print('completed all experiments successfully!')

# get model file names
model_comparison_files_all = []
for est in ests:
estimator_name = est[0].name.split(' - ')[0]
fi_estimators_all = [fi_estimator for fi_estimator in itertools.chain(*fi_ests) \
if fi_estimator.model_type in est[0].model_type]
model_comparison_files = [f'{estimator_name}_{fi_estimator.name}_comparisons.pkl' for fi_estimator in
fi_estimators_all]
model_comparison_files_all += model_comparison_files

# aggregate results
results_list = []
if isinstance(vary_param_name, list):
for vary_param_dict in vary_param_dicts:
val_name = "_".join(vary_param_dict.values())

for i in range(args.nreps):
all_files = glob.glob(oj(path, val_name, 'rep' + str(i), '*'))
model_files = sorted([f for f in all_files if os.path.basename(f) in model_comparison_files_all])

if len(model_files) == 0:
print('No files found at ', oj(path, val_name, 'rep' + str(i)))
continue

results = pd.concat(
[pkl.load(open(f, 'rb'))['df'] for f in model_files],
axis=0
)

for param_name, param_val in vary_param_dict.items():
val = vary_param_vals[param_name][param_val]
if vary_type[param_name] == "dgp":
if np.isscalar(val):
results.insert(0, param_name, val)
else:
results.insert(0, param_name, [val for i in range(results.shape[0])])
results.insert(1, param_name + "_name", param_val)
elif vary_type[param_name] == "est" or vary_type[param_name] == "fi_est":
results.insert(0, param_name + "_name", copy.deepcopy(results[param_name]))
results.insert(0, 'rep', i)
results_list.append(results)
else:
for val_name, val in vary_param_vals.items():
for i in range(args.nreps):
all_files = glob.glob(oj(path, val_name, 'rep' + str(i), '*'))
model_files = sorted([f for f in all_files if os.path.basename(f) in model_comparison_files_all])

if len(model_files) == 0:
print('No files found at ', oj(path, val_name, 'rep' + str(i)))
continue

results = pd.concat(
[pkl.load(open(f, 'rb'))['df'] for f in model_files],
axis=0
)
if vary_type == "dgp":
if np.isscalar(val):
results.insert(0, vary_param_name, val)
else:
results.insert(0, vary_param_name, [val for i in range(results.shape[0])])
results.insert(1, vary_param_name + "_name", val_name)
results.insert(2, 'rep', i)
elif vary_type == "est" or vary_type == "fi_est":
results.insert(0, vary_param_name + "_name", copy.deepcopy(results[vary_param_name]))
results.insert(1, 'rep', i)
results_list.append(results)
results_merged = pd.concat(results_list, axis=0)
pkl.dump(results_merged, open(oj(path, 'results.pkl'), 'wb'))
results_df = reformat_results(results_merged)
results_df.to_csv(oj(path, 'results.csv'), index=False)

print('merged and saved all experiment results successfully!')

# create R markdown summary of results
if args.create_rmd:
if args.show_vars is None:
show_vars = 'NULL'
else:
show_vars = args.show_vars

if isinstance(vary_param_name, list):
vary_param_name = "; ".join(vary_param_name)

sim_rmd = os.path.basename(results_dir) + '_simulation_results.Rmd'
os.system(
'cp {} \'{}\''.format(oj("rmd", "simulation_results.Rmd"), sim_rmd)
)
os.system(
'Rscript -e "rmarkdown::render(\'{}\', params = list(results_dir = \'{}\', vary_param_name = \'{}\', seed = {}, keep_vars = {}), output_file = \'{}\', quiet = TRUE)"'.format(
sim_rmd,
results_dir, vary_param_name, str(args.split_seed), str(show_vars),
oj(path, "simulation_results.html"))
)
os.system('rm \'{}\''.format(sim_rmd))
print("created rmd of simulation results successfully!")
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import sys
sys.path.append("../..")
from feature_importance.scripts.simulations_util import *


X_DGP = sample_real_X
X_PARAMS_DICT = {
"fpath": "data/X_splicing_cleaned.csv",
"sample_row_n": None,
"sample_col_n": None
}
### Update start for local MDI+
Y_DGP = linear_model_two_groups
Y_PARAMS_DICT = {
"beta": 1,
"sigma": None,
"heritability": 0.4,
"s": 5
}
### Update for local MDI+ done

# # vary one parameter
# VARY_PARAM_NAME = "sample_row_n"
# VARY_PARAM_VALS = {"100": 100, "250": 250, "500": 500, "1000": 1000}

# vary two parameters in a grid
VARY_PARAM_NAME = ["heritability", "sample_row_n"]
VARY_PARAM_VALS = {"heritability": {"0.1": 0.1, "0.2": 0.2, "0.4": 0.4, "0.8": 0.8},
"sample_row_n": {"100": 100, "250": 250, "500": 500, "1000": 1000}}

# # vary over n_estimators in RF model in models.py
# VARY_PARAM_NAME = "n_estimators"
# VARY_PARAM_VALS = {"placeholder": 0}
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from sklearn.ensemble import RandomForestRegressor
from feature_importance.util import ModelConfig, FIModelConfig
from feature_importance.scripts.competing_methods_local import tree_shap_local

# N_ESTIMATORS=[50, 100, 500, 1000]
ESTIMATORS = [
[ModelConfig('RF', RandomForestRegressor, model_type='tree',
other_params={'n_estimators': 100, 'min_samples_leaf': 5, 'max_features': 0.33})],
# [ModelConfig('RF', RandomForestRegressor, model_type='tree', vary_param="n_estimators", vary_param_val=m,
# other_params={'min_samples_leaf': 5, 'max_features': 0.33}) for m in N_ESTIMATORS]
]

FI_ESTIMATORS = [
[FIModelConfig('TreeSHAP', tree_shap_local, model_type='tree')]
]
4 changes: 1 addition & 3 deletions feature_importance/fi_config/test/dgp.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,13 @@
"sample_row_n": None,
"sample_col_n": None
}
### Update start for local MDI+
Y_DGP = linear_model_two_groups #linear_model
Y_DGP = linear_model
Y_PARAMS_DICT = {
"beta": 1,
"sigma": None,
"heritability": 0.4,
"s": 5
}
### Update for local MDI+ done

# # vary one parameter
# VARY_PARAM_NAME = "sample_row_n"
Expand Down
39 changes: 39 additions & 0 deletions feature_importance/scripts/competing_methods_local.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import os
import sys
import pandas as pd
import numpy as np
import sklearn.base
from sklearn.base import RegressorMixin, ClassifierMixin
from functools import reduce

import shap


def tree_shap_local(X, y, fit):
"""
Compute average treeshap value across observations
:param X: design matrix
:param y: response
:param fit: fitted model of interest (tree-based)
:return: dataframe - [Var, Importance]
Var: variable name
Importance: average absolute shap value
"""
explainer = shap.TreeExplainer(fit)
shap_values = explainer.shap_values(X, check_additivity=False)
if sklearn.base.is_classifier(fit):
def add_abs(a, b):
return abs(a) + abs(b)
results = reduce(add_abs, shap_values)
else:
results = abs(shap_values)
result_table = pd.DataFrame(results)
# results = results.mean(axis=0)
# results = pd.DataFrame(data=results, columns=['importance'])
# # Use column names from dataframe if possible
# if isinstance(X, pd.DataFrame):
# results.index = X.columns
# results.index.name = 'var'
# results.reset_index(inplace=True)

return result_table
4 changes: 2 additions & 2 deletions feature_importance/scripts/simulations_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,8 +253,8 @@ def create_y(x, s, beta, group_index):
beta_group1 = generate_coef(beta, s)
beta_group2 = generate_coef(beta, s)
# Generate two response vectors for each subgroup
y_train_group1 = np.array([create_y(X[i, :], s, beta_group1, 0) for i in range(n//2)])
y_train_group2 = np.array([create_y(X[i, :], s, beta_group2, 1) for i in range(n//2, n)])
y_train_group1 = np.array([create_y(X[i, :], s, beta_group1, group_index=0) for i in range(n//2)])
y_train_group2 = np.array([create_y(X[i, :], s, beta_group2, group_index=1) for i in range(n//2, n)])
y_train = np.concatenate((y_train_group1, y_train_group2))
### Update for local MDI+ done

Expand Down

0 comments on commit 85c21f1

Please sign in to comment.