From 794f47f914df25373b27b226a7fa3818d10947eb Mon Sep 17 00:00:00 2001 From: zyliang2001 Date: Wed, 20 Mar 2024 23:47:42 -0700 Subject: [PATCH] Update normalization --- .../01_ablation_classification_script.sh | 2 +- feature_importance/01_ablation_regression_script.sh | 2 +- feature_importance/01_run_ablation_classification.py | 11 +++++++++++ feature_importance/01_run_ablation_regression.py | 11 +++++++++++ .../mdi_local/real_data_classification/models.py | 1 - feature_importance/scripts/simulations_util.py | 2 +- 6 files changed, 25 insertions(+), 4 deletions(-) diff --git a/feature_importance/01_ablation_classification_script.sh b/feature_importance/01_ablation_classification_script.sh index dbfacea..2850494 100755 --- a/feature_importance/01_ablation_classification_script.sh +++ b/feature_importance/01_ablation_classification_script.sh @@ -4,7 +4,7 @@ #SBATCH --partition=yugroup source activate mdi -command="01_run_ablation_classification.py --nreps 1 --config mdi_local.real_data_classification --split_seed ${1} --ignore_cache --create_rmd --result_name Diabetes_classification_parallel" +command="01_run_ablation_classification.py --nreps 1 --config mdi_local.real_data_classification --split_seed ${1} --normalization train_test --ignore_cache --create_rmd --result_name Diabetes_classification_parallel" # Execute the command python $command \ No newline at end of file diff --git a/feature_importance/01_ablation_regression_script.sh b/feature_importance/01_ablation_regression_script.sh index 2cfba56..de6d7e0 100755 --- a/feature_importance/01_ablation_regression_script.sh +++ b/feature_importance/01_ablation_regression_script.sh @@ -4,7 +4,7 @@ #SBATCH --partition=yugroup source activate mdi -command="01_run_ablation_regression.py --nreps 1 --config mdi_local.real_data_regression --split_seed ${1} --ignore_cache --create_rmd --result_name diabetes_regression_parallel" +command="01_run_ablation_regression.py --nreps 1 --config mdi_local.real_data_regression --split_seed ${1} --normalization train_test --ignore_cache --create_rmd --result_name diabetes_regression_parallel" # Execute the command python $command \ No newline at end of file diff --git a/feature_importance/01_run_ablation_classification.py b/feature_importance/01_run_ablation_classification.py index fe5dc78..e94e6d9 100644 --- a/feature_importance/01_run_ablation_classification.py +++ b/feature_importance/01_run_ablation_classification.py @@ -17,6 +17,7 @@ from typing import Callable, List, Tuple import itertools from sklearn.metrics import roc_auc_score, f1_score, recall_score, precision_score, mean_squared_error +from sklearn import preprocessing sys.path.append(".") sys.path.append("..") @@ -100,6 +101,15 @@ def compare_estimators(estimators: List[ModelConfig], y_tune = y y_test = y + normalizer = preprocessing.Normalizer() + if args.normalization == "train_test": + X_train = normalizer.fit_transform(X_train) + X_test = normalizer.transform(X_test) + elif args.normalization == "all": + X = normalizer.fit_transform(X) + X_train = normalizer.transform(X_train) + X_test = normalizer.transform(X_test) + # fit model est.fit(X_train, y_train) test_all_auc = roc_auc_score(y_test, est.predict_proba(X_test)[:, 1]) @@ -329,6 +339,7 @@ def run_simulation(i, path, val_name, X_params_dict, X_dgp, y_params_dict, y_dgp parser.add_argument('--n_cores', type=int, default=None) parser.add_argument('--split_seed', type=int, default=0) parser.add_argument('--results_path', type=str, default=default_dir) + parser.add_argument('--normalization', type=str, default="none") # arguments for rmd output of results parser.add_argument('--create_rmd', action='store_true', default=False) diff --git a/feature_importance/01_run_ablation_regression.py b/feature_importance/01_run_ablation_regression.py index 5114b01..5649739 100644 --- a/feature_importance/01_run_ablation_regression.py +++ b/feature_importance/01_run_ablation_regression.py @@ -17,6 +17,7 @@ from typing import Callable, List, Tuple import itertools from sklearn.metrics import roc_auc_score, f1_score, recall_score, precision_score, mean_squared_error, r2_score +from sklearn import preprocessing sys.path.append(".") sys.path.append("..") @@ -100,6 +101,15 @@ def compare_estimators(estimators: List[ModelConfig], y_tune = y y_test = y + normalizer = preprocessing.Normalizer() + if args.normalization == "train_test": + X_train = normalizer.fit_transform(X_train) + X_test = normalizer.transform(X_test) + elif args.normalization == "all": + X = normalizer.fit_transform(X) + X_train = normalizer.transform(X_train) + X_test = normalizer.transform(X_test) + # fit model est.fit(X_train, y_train) test_all_mse = mean_squared_error(y_test, est.predict(X_test)) @@ -325,6 +335,7 @@ def run_simulation(i, path, val_name, X_params_dict, X_dgp, y_params_dict, y_dgp parser.add_argument('--n_cores', type=int, default=None) parser.add_argument('--split_seed', type=int, default=0) parser.add_argument('--results_path', type=str, default=default_dir) + parser.add_argument('--normalization', type=str, default="none") # arguments for rmd output of results parser.add_argument('--create_rmd', action='store_true', default=False) diff --git a/feature_importance/fi_config/mdi_local/real_data_classification/models.py b/feature_importance/fi_config/mdi_local/real_data_classification/models.py index e87d392..182148a 100644 --- a/feature_importance/fi_config/mdi_local/real_data_classification/models.py +++ b/feature_importance/fi_config/mdi_local/real_data_classification/models.py @@ -18,7 +18,6 @@ FI_ESTIMATORS = [ [FIModelConfig('LFI_with_raw_RF', LFI_test_evaluation_RF, model_type='tree', splitting_strategy = "train-test")], - [FIModelConfig('LFI_with_raw_CV_RF', LFI_test_evaluation_RF, model_type='tree', splitting_strategy = "train-test", other_params={"cv_ridge": 5, "calc_loo_coef":False})], [FIModelConfig('MDI_RF', LFI_test_evaluation_RF, model_type='tree', splitting_strategy = "train-test", other_params={"include_raw": False, "cv_ridge": 0, "calc_loo_coef":False, "sample_split":"inbag"})], [FIModelConfig('LFI_with_raw_OOB_RF', LFI_test_evaluation_RF, model_type='tree', splitting_strategy = "train-test", other_params={"sample_split":"oob", "fit_on":"test", "calc_loo_coef":False})], [FIModelConfig('TreeSHAP_RF', tree_shap_evaluation_RF, model_type='tree', splitting_strategy = "train-test")], diff --git a/feature_importance/scripts/simulations_util.py b/feature_importance/scripts/simulations_util.py index c81d3f2..4b1380d 100644 --- a/feature_importance/scripts/simulations_util.py +++ b/feature_importance/scripts/simulations_util.py @@ -6,7 +6,7 @@ import math -def sample_real_data(X_fpath=None, y_fpath=None, seed=4307, normalize=True, +def sample_real_data(X_fpath=None, y_fpath=None, seed=4307, normalize=False, sample_row_n=None, sample_col_n=None, return_data=None, return_support=True):