From 9190311c7a94cc6d3f2d8b57d6b484e54b27e4b2 Mon Sep 17 00:00:00 2001 From: Andrew Weisman Date: Mon, 6 May 2019 15:54:06 -0400 Subject: [PATCH 001/331] Allowed uno benchmark to use a default params file variable within the model --- .gitignore | 1 + Pilot1/Uno/uno_baseline_keras2.py | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index d646835b..3253a2d2 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ *.pyc __pycache__/ +Data diff --git a/Pilot1/Uno/uno_baseline_keras2.py b/Pilot1/Uno/uno_baseline_keras2.py index 040f81c6..27705acc 100644 --- a/Pilot1/Uno/uno_baseline_keras2.py +++ b/Pilot1/Uno/uno_baseline_keras2.py @@ -279,7 +279,8 @@ def build_model(loader, args, permanent_dropout=True, silent=False): def initialize_parameters(): # Build benchmark object - unoBmk = benchmark.BenchmarkUno(benchmark.file_path, 'uno_default_model.txt', 'keras', + #mymodel_common = candle.Benchmark(file_path,os.getenv("DEFAULT_PARAMS_FILE"),'keras',prog='myprog',desc='My model') + unoBmk = benchmark.BenchmarkUno(benchmark.file_path, os.getenv("DEFAULT_PARAMS_FILE"), 'keras', prog='uno_baseline', desc='Build neural network based models to predict tumor response to single and paired drugs.') # Initialize parameters From 3ac1f83aa7384ef86ad1dc62093b59061d2493a0 Mon Sep 17 00:00:00 2001 From: Hyunseung Yoo Date: Wed, 5 Jun 2019 12:52:23 -0500 Subject: [PATCH 002/331] wip milestone 13 --- Pilot1/Uno/topN_to_uno.py | 94 +++++++++++++++++++++++++++++++ Pilot1/Uno/uno_baseline_keras2.py | 2 +- Pilot1/Uno/uno_data.py | 12 ++-- 3 files changed, 103 insertions(+), 5 deletions(-) create mode 100644 Pilot1/Uno/topN_to_uno.py diff --git a/Pilot1/Uno/topN_to_uno.py b/Pilot1/Uno/topN_to_uno.py new file mode 100644 index 00000000..1f7c2b6a --- /dev/null +++ b/Pilot1/Uno/topN_to_uno.py @@ -0,0 +1,94 @@ +import argparse +import json +import pandas as pd +import numpy as np + + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument('--dataframe_from', type=str, default='top21_dataframe_8x8.csv', + help='Dataframe file name contains all data points') + parser.add_argument('--plan', type=str, default='plan.json', + help='Plan data file') + parser.add_argument('--node', type=str, default=None, + help='node number to execute') + + args, unparsed = parser.parse_known_args() + return args, unparsed + + +def read_plan(filename, node): + print("reading {} file for node {}".format(filename, node)) + with open(filename, 'r') as plan_file: + plan = json.load(plan_file) + if node in plan: + return plan[node] + else: + raise Exception('Node index {} was not found in plan file') + + +def build_masks(args, df): + if args.node is None: + raise Exception('Node id is not given') + + plan = read_plan(args.plan, args.node) + mask = {} + for partition in ['train', 'val']: + _mask = df['Sample'] == None + for i, element in enumerate(plan[partition]): + cl_filter = element['CELL'] + dr_filter = element['DRUG'] + __mask = df['Sample'].isin(cl_filter) & df['Drug1'].isin(dr_filter) + _mask = _mask | __mask + mask[partition] = _mask + + return mask['train'], mask['val'] + + +def training_mask(df): + return np.random.rand(len(df)) < 0.8 + + +def read_dataframe(args): + df = pd.read_csv(args.dataframe_from, low_memory=False, na_values='na').fillna(0) + df.rename(columns={'SAMPLE': 'Sample', 'DRUG': 'Drug1'}, inplace=True) + df_y = df[['AUC', 'Sample', 'Drug1']] + + cols = df.columns.to_list() + cl_columns = list(filter(lambda x: x.startswith('GE_'), cols)) + dd_columns = 
list(filter(lambda x: x.startswith('DD_'), cols)) + + df_cl = df.loc[:, cl_columns] + df_dd = df.loc[:, dd_columns] + + return df_y, df_cl, df_dd + + +def build_dataframe(args): + df_y, df_cl, df_dd = read_dataframe(args) + + # mask = training_mask(df_y) + train_mask, val_mask = build_masks(args, df_y) + + y_train = pd.DataFrame(data=df_y[train_mask].reset_index(drop=True)) + y_val = pd.DataFrame(data=df_y[val_mask].reset_index(drop=True)) + + x_train_0 = df_cl[train_mask].reset_index(drop=True) + x_train_1 = df_dd[train_mask].reset_index(drop=True) + + x_val_0 = df_cl[val_mask].reset_index(drop=True) + x_val_1 = df_dd[val_mask].reset_index(drop=True) + + # store + store = pd.HDFStore('topN.uno.h5', 'w') + store.put('y_train', y_train) + store.put('y_val', y_val) + store.put('x_train_0', x_train_0) + store.put('x_train_1', x_train_1) + store.put('x_val_0', x_val_0) + store.put('x_val_1', x_val_1) + + +if __name__ == '__main__': + parsed, unparsed = parse_arguments() + build_dataframe(parsed) diff --git a/Pilot1/Uno/uno_baseline_keras2.py b/Pilot1/Uno/uno_baseline_keras2.py index 722f9482..8de286a5 100644 --- a/Pilot1/Uno/uno_baseline_keras2.py +++ b/Pilot1/Uno/uno_baseline_keras2.py @@ -444,7 +444,7 @@ def warmup_scheduler(epoch): if args.tb: callbacks.append(tensorboard) if args.save_weights: - callbacks.append(SimpleWeightSaver(args.save_path + '/' + args.save_weights)) + callbacks.append(MultiGPUCheckpoint(args.save_weights)) if args.use_exported_data is not None: train_gen = DataFeeder(filename=args.use_exported_data, batch_size=args.batch_size, shuffle=args.shuffle, single=args.single, agg_dose=args.agg_dose) diff --git a/Pilot1/Uno/uno_data.py b/Pilot1/Uno/uno_data.py index 5ede815e..52450fb2 100644 --- a/Pilot1/Uno/uno_data.py +++ b/Pilot1/Uno/uno_data.py @@ -955,7 +955,7 @@ def __init__(self, partition='train', filename=None, batch_size=32, shuffle=Fals # 4 inputs for single drug model (cell, dose1, descriptor, fingerprint) # 7 inputs for drug pair model (cell, dose1, dose1, dr1.descriptor, dr1.fingerprint, dr2.descriptor, dr2.fingerprint) self.input_size = 4 if self.single else 7 - self.input_size = 3 if agg_dose else self.input_size + self.input_size = 2 if agg_dose else self.input_size self.store = pd.HDFStore(filename, mode='r') y = self.store.select('y_{}'.format(self.partition)) @@ -973,7 +973,7 @@ def __getitem__(self, idx): start = self.index_map[idx] * self.batch_size stop = (self.index_map[idx] + 1) * self.batch_size x = [self.store.select('x_{0}_{1}'.format(self.partition, i), start=start, stop=stop) for i in range(self.input_size)] - y = self.store.select('y_{}'.format(self.partition), start=start, stop=stop, columns=[self.target]) + y = self.store.select('y_{}'.format(self.partition), start=start, stop=stop)[self.target] return x, y def reset(self): @@ -982,8 +982,12 @@ def reset(self): pass def get_response(self, copy=False): - self.index = [item for step in range(self.steps) for item in range(self.index_map[step] * self.batch_size, (self.index_map[step] + 1) * self.batch_size)] - df = self.store.get('y_{}'.format(self.partition)).iloc[self.index,:] + if self.shuffle: + self.index = [item for step in range(self.steps) for item in range(self.index_map[step] * self.batch_size, (self.index_map[step] + 1) * self.batch_size)] + df = self.store.get('y_{}'.format(self.partition)).iloc[self.index,:] + else: + df = self.store.get('y_{}'.format(self.partition)) + if self.agg_dose is None: df['Dose1'] = self.store.get('x_{}_0'.format(self.partition)).iloc[self.index,:] if not 
self.single: From c892f7c20af256d5ec577f9f5087ca78dc559392 Mon Sep 17 00:00:00 2001 From: Cristina Garcia Cardona - 285490 Date: Mon, 10 Jun 2019 15:27:22 -0600 Subject: [PATCH 003/331] Added UQ functionality, both at common and Pilot1 levels. New folder: Pilot1/Uno_UQ --- Pilot1/Uno_UQ/calibration/calibration_HET.py | 115 +++ Pilot1/Uno_UQ/calibration/calibration_HOM.py | 98 +++ .../Uno_UQ/calibration/calibration_HOM_all.py | 98 +++ Pilot1/Uno_UQ/calibration/calibration_QTL.py | 117 +++ Pilot1/Uno_UQ/data_utils_/__init__.py | 1 + Pilot1/Uno_UQ/data_utils_/cellline_data.py | 97 +++ Pilot1/Uno_UQ/data_utils_/drug_data.py | 188 +++++ Pilot1/Uno_UQ/data_utils_/response_data.py | 175 ++++ Pilot1/Uno_UQ/data_utils_/uno.py | 353 +++++++++ .../uno_combined_data_generator.py | 257 ++++++ .../data_utils_/uno_combined_data_loader.py | 427 ++++++++++ Pilot1/Uno_UQ/model_utils_/__init__.py | 0 Pilot1/Uno_UQ/model_utils_/uno_model_utils.py | 307 +++++++ Pilot1/Uno_UQ/uno_gCSI_modelUQ.txt | 39 + Pilot1/Uno_UQ/uno_holdoutUQ_data.py | 109 +++ Pilot1/Uno_UQ/uno_inferUQ_keras2.py | 296 +++++++ Pilot1/Uno_UQ/uno_trainUQ_keras2.py | 404 ++++++++++ common/candle/__init__.py | 28 + common/candle_keras/__init__.py | 32 +- common/data_utils.py | 156 +++- common/keras_utils.py | 25 +- common/uq_utils.py | 749 +++++++++++++++++- common/viz_utils.py | 299 +++++++ 23 files changed, 4363 insertions(+), 7 deletions(-) create mode 100644 Pilot1/Uno_UQ/calibration/calibration_HET.py create mode 100644 Pilot1/Uno_UQ/calibration/calibration_HOM.py create mode 100644 Pilot1/Uno_UQ/calibration/calibration_HOM_all.py create mode 100644 Pilot1/Uno_UQ/calibration/calibration_QTL.py create mode 100644 Pilot1/Uno_UQ/data_utils_/__init__.py create mode 100644 Pilot1/Uno_UQ/data_utils_/cellline_data.py create mode 100644 Pilot1/Uno_UQ/data_utils_/drug_data.py create mode 100644 Pilot1/Uno_UQ/data_utils_/response_data.py create mode 100644 Pilot1/Uno_UQ/data_utils_/uno.py create mode 100644 Pilot1/Uno_UQ/data_utils_/uno_combined_data_generator.py create mode 100644 Pilot1/Uno_UQ/data_utils_/uno_combined_data_loader.py create mode 100644 Pilot1/Uno_UQ/model_utils_/__init__.py create mode 100644 Pilot1/Uno_UQ/model_utils_/uno_model_utils.py create mode 100644 Pilot1/Uno_UQ/uno_gCSI_modelUQ.txt create mode 100644 Pilot1/Uno_UQ/uno_holdoutUQ_data.py create mode 100644 Pilot1/Uno_UQ/uno_inferUQ_keras2.py create mode 100644 Pilot1/Uno_UQ/uno_trainUQ_keras2.py diff --git a/Pilot1/Uno_UQ/calibration/calibration_HET.py b/Pilot1/Uno_UQ/calibration/calibration_HET.py new file mode 100644 index 00000000..ab354d76 --- /dev/null +++ b/Pilot1/Uno_UQ/calibration/calibration_HET.py @@ -0,0 +1,115 @@ +#! 
/usr/bin/env python + +from __future__ import division, print_function + +import pandas as pd +import sys +import os +import pickle +import dill + +lib_path2 = os.path.abspath(os.path.join('..', '..', 'common')) +sys.path.append(lib_path2) + +import candle_keras as candle + +def read_file(path, filename): + + df_data = pd.read_csv(path + filename, sep='\t') + print('data read shape: ', df_data.shape) + + return df_data + +def main(): + + if ( len ( sys.argv ) < 3 ) : + sys.stderr.write ( "\nUsage: calibration_HET.py PATH FILENAME [PLOT_STEPS_FLAG]\n" ) + sys.stderr.write ( "FILENAME: usually .predicted_INFER_HET.tsv\n") + sys.exit ( 0 ) + + path = sys.argv [1] + filename = sys.argv [2] + + try: + steps = sys.argv [3] + except IndexError: + steps = False + + + folder_out = './outUQ/' + if folder_out and not os.path.exists(folder_out): + os.makedirs(folder_out) + + index_dp = filename.find('DR=') + if index_dp == -1: # DR is not in filename + print('Enter dropout rate ') + dp_perc = input() + else: + if filename[index_dp + 6] == '.': + dp = float(filename[index_dp+3:index_dp+3+3]) + else: + dp = float(filename[index_dp+3:index_dp+3+4]) + + print('Droput rate: ', dp) + dp_perc = dp * 100. + method = 'Dropout ' + str(dp_perc) + '%' + prefix = folder_out + 'heteroscedastic_DR=' + str(dp_perc) + + df_data = read_file(path, filename) + Ytest, Ypred_mean, yerror, sigma, Ypred_std, pred_name = candle.compute_statistics_heteroscedastic(df_data) + + # storing sigma + fname = prefix + '_sigma.pkl' + with open(fname, 'wb') as f: + pickle.dump(sigma, f, protocol=4) + print('Sigma stored in file: ', fname) + + #plots + candle.plot_density_observed_vs_predicted(Ytest, Ypred_mean, pred_name, prefix) + candle.plot_2d_density_sigma_vs_error(sigma, yerror, method, prefix) + candle.plot_histogram_error_per_sigma(sigma, yerror, method, prefix) + + # shuffle data for calibration + index_perm_total, pSigma_cal, pSigma_test, pMean_cal, pMean_test, true_cal, true_test = candle.split_data_for_empirical_calibration(Ytest, Ypred_mean, sigma) + + # Compute empirical calibration + bins = 31 + coverage_percentile = 95 + mean_sigma, min_sigma, max_sigma, error_thresholds, err_err, error_thresholds_smooth, sigma_start_index, sigma_end_index, s_interpolate = candle.compute_empirical_calibration(pSigma_cal, pMean_cal, true_cal, bins, coverage_percentile) + + candle.plot_calibration_and_errors(mean_sigma, sigma_start_index, sigma_end_index, + min_sigma, max_sigma, + error_thresholds, + error_thresholds_smooth, + err_err, + s_interpolate, + coverage_percentile, method, prefix, steps) + + + # Use empirical calibration and automatic determined monotonic interval + minL_sigma_auto = mean_sigma[sigma_start_index] + maxL_sigma_auto = mean_sigma[sigma_end_index] + index_sigma_range_test, xp_test, yp_test, eabs_red = candle.applying_calibration(pSigma_test, pMean_test, true_test, s_interpolate, minL_sigma_auto, maxL_sigma_auto) + # Check sigma overprediction + p_cov = coverage_percentile + num_cal = pSigma_cal.shape[0] + pYstd_perm_all = Ypred_std[index_perm_total] + pYstd_test = pYstd_perm_all[num_cal:] + pYstd_red = pYstd_test[index_sigma_range_test] + candle.overprediction_check(yp_test, eabs_red) + + # storing calibration + fname = prefix + '_calibration_spline.dkl' + with open(fname, 'wb') as f: +# pickle.dump(s_interpolate, f, protocol=pickle.HIGHEST_PROTOCOL) + dill.dump(s_interpolate, f) + print('Calibration spline stored in file: ', fname) + fname = prefix + '_calibration_limits.pkl' + with open(fname, 'wb') as f: + 
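+ # limits of the automatically determined monotonic sigma interval used for the empirical calibration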
pickle.dump([minL_sigma_auto, maxL_sigma_auto], f, protocol=4) + print('Calibration limits stored in file: ', fname) + +if __name__ == '__main__': + main() + + diff --git a/Pilot1/Uno_UQ/calibration/calibration_HOM.py b/Pilot1/Uno_UQ/calibration/calibration_HOM.py new file mode 100644 index 00000000..a9440fcb --- /dev/null +++ b/Pilot1/Uno_UQ/calibration/calibration_HOM.py @@ -0,0 +1,98 @@ +#! /usr/bin/env python + +from __future__ import division, print_function + +import pandas as pd +import sys +import os +import pickle +import dill + +lib_path2 = os.path.abspath(os.path.join('..', '..', 'common')) +sys.path.append(lib_path2) + +import candle_keras as candle + + +def read_file(path, filename): + + df_data = pd.read_csv(path + filename, sep='\t') + print('data read shape: ', df_data.shape) + + return df_data + +def main(): + + if ( len ( sys.argv ) < 3 ) : + sys.stderr.write ( "\nUsage: calibration_HOM.py PATH FILENAME [PLOT_STEPS_FLAG]\n" ) + sys.stderr.write ("FILENAME: usually _pred.tsv\n") + sys.exit ( 0 ) + + path = sys.argv [1] + filename = sys.argv [2] + + try: + steps = sys.argv [3] + except IndexError: + steps = False + + folder_out = './outUQ/' + if folder_out and not os.path.exists(folder_out): + os.makedirs(folder_out) + + method = 'Dropout' + prefix = folder_out + 'homoscedastic_DR' + + df_data = read_file(path, filename) + Ytest, Ypred_mean, yerror, sigma, Ypred_std, pred_name = candle.compute_statistics_homoscedastic(df_data) + + #plots + candle.plot_density_observed_vs_predicted(Ytest, Ypred_mean, pred_name, prefix) + candle.plot_2d_density_sigma_vs_error(sigma, yerror, method, prefix) + candle.plot_histogram_error_per_sigma(sigma, yerror, method, prefix) + + # shuffle data for calibration + index_perm_total, pSigma_cal, pSigma_test, pMean_cal, pMean_test, true_cal, true_test = candle.split_data_for_empirical_calibration(Ytest, Ypred_mean, sigma) + + # Compute empirical calibration + bins = 60 + coverage_percentile = 95 + mean_sigma, min_sigma, max_sigma, error_thresholds, err_err, error_thresholds_smooth, sigma_start_index, sigma_end_index, s_interpolate = candle.compute_empirical_calibration(pSigma_cal, pMean_cal, true_cal, bins, coverage_percentile) + + candle.plot_calibration_and_errors(mean_sigma, sigma_start_index, sigma_end_index, + min_sigma, max_sigma, + error_thresholds, + error_thresholds_smooth, + err_err, + s_interpolate, + coverage_percentile, method, prefix, steps) + + + # Use empirical calibration and automatic determined monotonic interval + minL_sigma_auto = mean_sigma[sigma_start_index] + maxL_sigma_auto = mean_sigma[sigma_end_index] + index_sigma_range_test, xp_test, yp_test, eabs_red = candle.applying_calibration(pSigma_test, pMean_test, true_test, s_interpolate, minL_sigma_auto, maxL_sigma_auto) + # Check sigma overprediction + p_cov = coverage_percentile + num_cal = pSigma_cal.shape[0] + pYstd_perm_all = Ypred_std[index_perm_total] + pYstd_test = pYstd_perm_all[num_cal:] + pYstd_red = pYstd_test[index_sigma_range_test] + candle.overprediction_check(yp_test, eabs_red) + + # storing calibration + fname = prefix + '_calibration_limits.pkl' + with open(fname, 'wb') as f: + pickle.dump([minL_sigma_auto, maxL_sigma_auto], f, protocol=4) + print('Calibration limits stored in file: ', fname) + fname = prefix + '_calibration_spline.dkl' + with open(fname, 'wb') as f: +# pickle.dump(s_interpolate, f, protocol=pickle.HIGHEST_PROTOCOL) + dill.dump(s_interpolate, f) + print('Calibration spline stored in file: ', fname) + + +if __name__ == '__main__': + main() + 
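Both calibration_HET.py and calibration_HOM.py persist two artifacts under ./outUQ/: a dill-serialized error-vs-sigma spline (suffix _calibration_spline.dkl) and a pickled pair of sigma limits (suffix _calibration_limits.pkl). A minimal sketch of how a downstream consumer might reload and apply them is given here; the prefix and the assumption that the stored spline object is callable on an array of sigmas (a scipy-style interpolator) are illustrative and not guaranteed by these scripts:

import pickle
import dill
import numpy as np

# prefix follows the naming convention used by calibration_HOM.py above; adjust for HET/QTL runs
prefix = './outUQ/homoscedastic_DR'

with open(prefix + '_calibration_spline.dkl', 'rb') as f:
    s_interpolate = dill.load(f)              # error-vs-sigma calibration spline
with open(prefix + '_calibration_limits.pkl', 'rb') as f:
    minL_sigma, maxL_sigma = pickle.load(f)   # sigma interval where the spline was fit

def calibrated_abs_error(sigma):
    # clip predicted sigmas to the fitted interval, then evaluate the calibration spline
    sigma = np.clip(np.asarray(sigma), minL_sigma, maxL_sigma)
    return s_interpolate(sigma)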
+ diff --git a/Pilot1/Uno_UQ/calibration/calibration_HOM_all.py b/Pilot1/Uno_UQ/calibration/calibration_HOM_all.py new file mode 100644 index 00000000..df7e064b --- /dev/null +++ b/Pilot1/Uno_UQ/calibration/calibration_HOM_all.py @@ -0,0 +1,98 @@ +#! /usr/bin/env python + +from __future__ import division, print_function + +import pandas as pd +import sys +import os +import pickle +import dill + +lib_path2 = os.path.abspath(os.path.join('..', '..', 'common')) +sys.path.append(lib_path2) + +import candle_keras as candle + + +def read_file(path, filename): + + df_data = pd.read_csv(path + filename, sep='\t') + print('data read shape: ', df_data.shape) + + return df_data + +def main(): + + if ( len ( sys.argv ) < 3 ) : + sys.stderr.write ( "\nUsage: calibration_HOM_all.py PATH FILENAME [PLOT_STEPS_FLAG]\n" ) + sys.stderr.write ("FILENAME: usually .predicted_INFER.tsv\n") + sys.exit ( 0 ) + + path = sys.argv [1] + filename = sys.argv [2] + + try: + steps = sys.argv [3] + except IndexError: + steps = False + + folder_out = './outUQ/' + if folder_out and not os.path.exists(folder_out): + os.makedirs(folder_out) + + method = 'Dropout' + prefix = folder_out + 'homoscedastic_DR' + + df_data = read_file(path, filename) + Ytest, Ypred_mean, yerror, sigma, Ypred_std, pred_name = candle.compute_statistics_homoscedastic_all(df_data) + + #plots + candle.plot_density_observed_vs_predicted(Ytest, Ypred_mean, pred_name, prefix) + candle.plot_2d_density_sigma_vs_error(sigma, yerror, method, prefix) + candle.plot_histogram_error_per_sigma(sigma, yerror, method, prefix) + + # shuffle data for calibration + index_perm_total, pSigma_cal, pSigma_test, pMean_cal, pMean_test, true_cal, true_test = candle.split_data_for_empirical_calibration(Ytest, Ypred_mean, sigma) + + # Compute empirical calibration + bins = 60 + coverage_percentile = 95 + mean_sigma, min_sigma, max_sigma, error_thresholds, err_err, error_thresholds_smooth, sigma_start_index, sigma_end_index, s_interpolate = candle.compute_empirical_calibration(pSigma_cal, pMean_cal, true_cal, bins, coverage_percentile) + + candle.plot_calibration_and_errors(mean_sigma, sigma_start_index, sigma_end_index, + min_sigma, max_sigma, + error_thresholds, + error_thresholds_smooth, + err_err, + s_interpolate, + coverage_percentile, method, prefix, steps) + + + # Use empirical calibration and automatic determined monotonic interval + minL_sigma_auto = mean_sigma[sigma_start_index] + maxL_sigma_auto = mean_sigma[sigma_end_index] + index_sigma_range_test, xp_test, yp_test, eabs_red = candle.applying_calibration(pSigma_test, pMean_test, true_test, s_interpolate, minL_sigma_auto, maxL_sigma_auto) + # Check sigma overprediction + p_cov = coverage_percentile + num_cal = pSigma_cal.shape[0] + pYstd_perm_all = Ypred_std[index_perm_total] + pYstd_test = pYstd_perm_all[num_cal:] + pYstd_red = pYstd_test[index_sigma_range_test] + candle.overprediction_check(yp_test, eabs_red) + + # storing calibration + fname = prefix + '_calibration_limits.pkl' + with open(fname, 'wb') as f: + pickle.dump([minL_sigma_auto, maxL_sigma_auto], f, protocol=4) + print('Calibration limits stored in file: ', fname) + fname = prefix + '_calibration_spline.dkl' + with open(fname, 'wb') as f: +# pickle.dump(s_interpolate, f, protocol=pickle.HIGHEST_PROTOCOL) + dill.dump(s_interpolate, f) + print('Calibration spline stored in file: ', fname) + + +if __name__ == '__main__': + main() + + diff --git a/Pilot1/Uno_UQ/calibration/calibration_QTL.py b/Pilot1/Uno_UQ/calibration/calibration_QTL.py new file mode 100644 
index 00000000..65f12710 --- /dev/null +++ b/Pilot1/Uno_UQ/calibration/calibration_QTL.py @@ -0,0 +1,117 @@ +#! /usr/bin/env python + +from __future__ import division, print_function + +import pandas as pd +import sys +import os +import pickle +import dill + +lib_path2 = os.path.abspath(os.path.join('..', '..', 'common')) +sys.path.append(lib_path2) + +import candle_keras as candle + +def read_file(path, filename): + + df_data = pd.read_csv(path + filename, sep='\t') + print('data read shape: ', df_data.shape) + + return df_data + +def main(): + + if ( len ( sys.argv ) < 3 ) : + sys.stderr.write ( "\nUsage: calibration_QTL.py PATH FILENAME [PLOT_STEPS_FLAG]\n" ) + sys.stderr.write ( "FILENAME: usually .predicted_INFER_QTL.tsv\n") + sys.exit ( 0 ) + + path = sys.argv [1] + filename = sys.argv [2] + + try: + steps = sys.argv [3] + except IndexError: + steps = False + + + folder_out = './outUQ/' + if folder_out and not os.path.exists(folder_out): + os.makedirs(folder_out) + + index_dp = filename.find('DR=') + if index_dp == -1: # DR is not in filename + print('Enter dropout rate ') + dp_perc = input() + else: + if filename[index_dp + 6] == '.': + dp = float(filename[index_dp+3:index_dp+3+3]) + else: + dp = float(filename[index_dp+3:index_dp+3+4]) + + print('Droput rate: ', dp) + dp_perc = dp * 100. + method = 'Dropout ' + str(dp_perc) + '%' + prefix = folder_out + 'quantile_DR=' + str(dp_perc) + + df_data = read_file(path, filename) + Ytest, Ypred_mean, yerror, sigma, Ypred_std, pred_name, Ypred_10p_mean, Ypred_90p_mean = candle.compute_statistics_quantile(df_data) + + # storing sigma + fname = prefix + '_sigma.pkl' + with open(fname, 'wb') as f: + pickle.dump(sigma, f, protocol=4) + print('Sigma stored in file: ', fname) + + #plots + percentile_list = ['50p', '10p', '90p'] + candle.plot_percentile_predictions(Ypred_mean, Ypred_10p_mean, Ypred_90p_mean, percentile_list, pred_name, prefix) + candle.plot_density_observed_vs_predicted(Ytest, Ypred_mean, pred_name, prefix) + candle.plot_2d_density_sigma_vs_error(sigma, yerror, method, prefix) + candle.plot_histogram_error_per_sigma(sigma, yerror, method, prefix) + + # shuffle data for calibration + index_perm_total, pSigma_cal, pSigma_test, pMean_cal, pMean_test, true_cal, true_test = candle.split_data_for_empirical_calibration(Ytest, Ypred_mean, sigma) + + # Compute empirical calibration + bins = 31 + coverage_percentile = 95 + mean_sigma, min_sigma, max_sigma, error_thresholds, err_err, error_thresholds_smooth, sigma_start_index, sigma_end_index, s_interpolate = candle.compute_empirical_calibration(pSigma_cal, pMean_cal, true_cal, bins, coverage_percentile) + + candle.plot_calibration_and_errors(mean_sigma, sigma_start_index, sigma_end_index, + min_sigma, max_sigma, + error_thresholds, + error_thresholds_smooth, + err_err, + s_interpolate, + coverage_percentile, method, prefix, steps) + + + # Use empirical calibration and automatic determined monotonic interval + minL_sigma_auto = mean_sigma[sigma_start_index] + maxL_sigma_auto = mean_sigma[sigma_end_index] + index_sigma_range_test, xp_test, yp_test, eabs_red = candle.applying_calibration(pSigma_test, pMean_test, true_test, s_interpolate, minL_sigma_auto, maxL_sigma_auto) + # Check sigma overprediction + p_cov = coverage_percentile + num_cal = pSigma_cal.shape[0] + pYstd_perm_all = Ypred_std[index_perm_total] + pYstd_test = pYstd_perm_all[num_cal:] + pYstd_red = pYstd_test[index_sigma_range_test] + candle.overprediction_check(yp_test, eabs_red) + + # storing calibration + fname = prefix + 
'_calibration_spline.dkl' + with open(fname, 'wb') as f: +# pickle.dump(s_interpolate, f, protocol=pickle.HIGHEST_PROTOCOL) + dill.dump(s_interpolate, f) + print('Calibration spline stored in file: ', fname) + fname = prefix + '_calibration_limits.pkl' + with open(fname, 'wb') as f: + pickle.dump([minL_sigma_auto, maxL_sigma_auto], f, protocol=4) + print('Calibration limits stored in file: ', fname) + +if __name__ == '__main__': + main() + + diff --git a/Pilot1/Uno_UQ/data_utils_/__init__.py b/Pilot1/Uno_UQ/data_utils_/__init__.py new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/Pilot1/Uno_UQ/data_utils_/__init__.py @@ -0,0 +1 @@ + diff --git a/Pilot1/Uno_UQ/data_utils_/cellline_data.py b/Pilot1/Uno_UQ/data_utils_/cellline_data.py new file mode 100644 index 00000000..af7e369a --- /dev/null +++ b/Pilot1/Uno_UQ/data_utils_/cellline_data.py @@ -0,0 +1,97 @@ + +import pandas as pd +import numpy as np + +import candle_keras as candle + +from uno import get_file_p1 as get_file +from uno import loggerUno as logger +from uno import DATA_URL + + +def load_cell_metadata(): + path = get_file(DATA_URL + 'cl_metadata') + df = pd.read_csv(path, sep='\t') + return df + + +def cell_name_to_ids(name, source=None): + path = get_file(DATA_URL + 'NCI60_CELLNAME_to_Combo.txt') + df1 = pd.read_csv(path, sep='\t') + hits1 = candle.lookup(df1, name, 'NCI60.ID', ['NCI60.ID', 'CELLNAME', 'Name'], match='contains') + path = get_file(DATA_URL + 'cl_mapping') + df2 = pd.read_csv(path, sep='\t', header=None) + hits2 = candle.lookup(df2, name, [0, 1], [0, 1], match='contains') + hits = hits1 + hits2 + if source: + hits = [x for x in hits if x.startswith(source.upper()+'.')] + return hits + + +def load_cell_rnaseq(ncols=None, scaling='std', imputing='mean', add_prefix=True, + use_landmark_genes=False, use_filtered_genes=False, + feature_subset=None, preprocess_rnaseq=None, + embed_feature_source=False, sample_set=None, index_by_sample=False): + + if use_landmark_genes: + filename = 'combined_rnaseq_data_lincs1000' + elif use_filtered_genes: + filename = 'combined_rnaseq_data_filtered' + else: + filename = 'combined_rnaseq_data' + + if preprocess_rnaseq and preprocess_rnaseq != 'none': + scaling = None + filename += ('_' + preprocess_rnaseq) # 'source_scale' or 'combat' + + path = get_file(DATA_URL + filename) + df_cols = pd.read_csv(path, engine='c', sep='\t', nrows=0) + total = df_cols.shape[1] - 1 # remove Sample column + if 'Cancer_type_id' in df_cols.columns: + total -= 1 + usecols = None + if ncols and ncols < total: + usecols = np.random.choice(total, size=ncols, replace=False) + usecols = np.append([0], np.add(sorted(usecols), 2)) + df_cols = df_cols.iloc[:, usecols] + if feature_subset: + with_prefix = lambda x: 'rnaseq.'+x if add_prefix else x + usecols = [0] + [i for i, c in enumerate(df_cols.columns) if with_prefix(c) in feature_subset] + df_cols = df_cols.iloc[:, usecols] + + dtype_dict = dict((x, np.float32) for x in df_cols.columns[1:]) + df = pd.read_csv(path, engine='c', sep='\t', usecols=usecols, dtype=dtype_dict) + if 'Cancer_type_id' in df.columns: + df.drop('Cancer_type_id', axis=1, inplace=True) + + prefixes = df['Sample'].str.extract('^([^.]*)', expand=False).rename('Source') + sources = prefixes.drop_duplicates().reset_index(drop=True) + df_source = pd.get_dummies(sources, prefix='rnaseq.source', prefix_sep='.') + df_source = pd.concat([sources, df_source], axis=1) + + df1 = df['Sample'] + if embed_feature_source: + df_sample_source = pd.concat([df1, prefixes], axis=1) + df1 = 
df_sample_source.merge(df_source, on='Source', how='left').drop('Source', axis=1) + logger.info('Embedding RNAseq data source into features: %d additional columns', df1.shape[1]-1) + + df2 = df.drop('Sample', 1) + if add_prefix: + df2 = df2.add_prefix('rnaseq.') + + df2 = candle.drop_impute_and_scale_dataframe(df2, scaling, imputing) + + df = pd.concat([df1, df2], axis=1) + + # scaling needs to be done before subsampling + if sample_set: + chosen = df['Sample'].str.startswith(sample_set) + df = df[chosen].reset_index(drop=True) + + if index_by_sample: + df = df.set_index('Sample') + + logger.info('Loaded combined RNAseq data: %s', df.shape) + + return df + diff --git a/Pilot1/Uno_UQ/data_utils_/drug_data.py b/Pilot1/Uno_UQ/data_utils_/drug_data.py new file mode 100644 index 00000000..cad8e326 --- /dev/null +++ b/Pilot1/Uno_UQ/data_utils_/drug_data.py @@ -0,0 +1,188 @@ + +import pandas as pd +import numpy as np + +import candle_keras as candle + +from uno import get_file_p1 as get_file +from uno import loggerUno as logger +from uno import DATA_URL + + +def load_drug_data(ncols=None, scaling='std', imputing='mean', dropna=None, add_prefix=True): + df_info = load_drug_info() + df_info['Drug'] = df_info['PUBCHEM'] + + df_desc = load_drug_set_descriptors(drug_set='Combined_PubChem', ncols=ncols) + df_fp = load_drug_set_fingerprints(drug_set='Combined_PubChem', ncols=ncols) + + df_desc = pd.merge(df_info[['ID', 'Drug']], df_desc, on='Drug').drop('Drug', 1).rename(columns={'ID': 'Drug'}) + df_fp = pd.merge(df_info[['ID', 'Drug']], df_fp, on='Drug').drop('Drug', 1).rename(columns={'ID': 'Drug'}) + + df_desc2 = load_drug_set_descriptors(drug_set='NCI60', usecols=df_desc.columns.tolist() if ncols else None) + df_fp2 = load_drug_set_fingerprints(drug_set='NCI60', usecols=df_fp.columns.tolist() if ncols else None) + + df_desc = pd.concat([df_desc, df_desc2]).reset_index(drop=True) + df1 = pd.DataFrame(df_desc.loc[:, 'Drug']) + df2 = df_desc.drop('Drug', 1) + df2 = candle.drop_impute_and_scale_dataframe(df2, scaling=scaling, imputing=imputing, dropna=dropna) + if add_prefix: + df2 = df2.add_prefix('dragon7.') + df_desc = pd.concat([df1, df2], axis=1) + + df_fp = pd.concat([df_fp, df_fp2]).reset_index(drop=True) + df1 = pd.DataFrame(df_fp.loc[:, 'Drug']) + df2 = df_fp.drop('Drug', 1) + df2 = candle.drop_impute_and_scale_dataframe(df2, scaling=None, imputing=imputing, dropna=dropna) + if add_prefix: + df2 = df2.add_prefix('dragon7.') + df_fp = pd.concat([df1, df2], axis=1) + + logger.info('Loaded combined dragon7 drug descriptors: %s', df_desc.shape) + logger.info('Loaded combined dragon7 drug fingerprints: %s', df_fp.shape) + + return df_desc, df_fp + + +def load_drug_descriptors(ncols=None, scaling='std', imputing='mean', dropna=None, add_prefix=True, feature_subset=None): + df_info = load_drug_info() + df_info['Drug'] = df_info['PUBCHEM'] + + df_desc = load_drug_set_descriptors(drug_set='Combined_PubChem', ncols=ncols) + df_desc = pd.merge(df_info[['ID', 'Drug']], df_desc, on='Drug').drop('Drug', 1).rename(columns={'ID': 'Drug'}) + + df_desc2 = load_drug_set_descriptors(drug_set='NCI60', usecols=df_desc.columns.tolist() if ncols else None) + + df_desc = pd.concat([df_desc, df_desc2]).reset_index(drop=True) + df1 = pd.DataFrame(df_desc.loc[:, 'Drug']) + df2 = df_desc.drop('Drug', 1) + if add_prefix: + df2 = df2.add_prefix('dragon7.') + if feature_subset: + df2 = df2[[x for x in df2.columns if x in feature_subset]] + df2 = candle.drop_impute_and_scale_dataframe(df2, scaling=scaling, imputing=imputing, 
dropna=dropna) + df_desc = pd.concat([df1, df2], axis=1) + + logger.info('Loaded combined dragon7 drug descriptors: %s', df_desc.shape) + + return df_desc + + +def load_drug_fingerprints(ncols=None, scaling='std', imputing='mean', dropna=None, add_prefix=True, feature_subset=None): + df_info = load_drug_info() + df_info['Drug'] = df_info['PUBCHEM'] + + df_fp = load_drug_set_fingerprints(drug_set='Combined_PubChem', ncols=ncols) + df_fp = pd.merge(df_info[['ID', 'Drug']], df_fp, on='Drug').drop('Drug', 1).rename(columns={'ID': 'Drug'}) + + df_fp2 = load_drug_set_fingerprints(drug_set='NCI60', usecols=df_fp.columns.tolist() if ncols else None) + + df_fp = pd.concat([df_fp, df_fp2]).reset_index(drop=True) + df1 = pd.DataFrame(df_fp.loc[:, 'Drug']) + df2 = df_fp.drop('Drug', 1) + if add_prefix: + df2 = df2.add_prefix('dragon7.') + if feature_subset: + df2 = df2[[x for x in df2.columns if x in feature_subset]] + df2 = candle.drop_impute_and_scale_dataframe(df2, scaling=None, imputing=imputing, dropna=dropna) + df_fp = pd.concat([df1, df2], axis=1) + + logger.info('Loaded combined dragon7 drug fingerprints: %s', df_fp.shape) + + return df_fp + + +def load_drug_info(): + path = get_file(DATA_URL + 'drug_info') + df = pd.read_csv(path, sep='\t', dtype=object) + df['PUBCHEM'] = 'PubChem.CID.' + df['PUBCHEM'] + return df + + +def drug_name_to_ids(name, source=None): + df1 = load_drug_info() + path = get_file(DATA_URL + 'NCI_IOA_AOA_drugs') + df2 = pd.read_csv(path, sep='\t', dtype=str) + df2['NSC'] = 'NSC.' + df2['NSC'] + hits1 = candle.lookup(df1, name, 'ID', ['ID', 'NAME', 'CLEAN_NAME', 'PUBCHEM']) + hits2 = candle.lookup(df2, name, 'NSC', ['NSC', 'Generic Name', 'Preffered Name']) + hits = hits1 + hits2 + if source: + hits = [x for x in hits if x.startswith(source.upper()+'.')] + return hits + + +def load_drug_set_descriptors(drug_set='Combined_PubChem', ncols=None, usecols=None, + scaling=None, imputing=None, add_prefix=False): + path = get_file(DATA_URL + '{}_dragon7_descriptors.tsv'.format(drug_set)) + + df_cols = pd.read_csv(path, engine='c', sep='\t', nrows=0) + total = df_cols.shape[1] - 1 + if usecols is not None: + usecols = [x for x in usecols if x in df_cols.columns] + if usecols[0] != 'NAME': + usecols = ['NAME'] + usecols + df_cols = df_cols.loc[:, usecols] + elif ncols and ncols < total: + usecols = np.random.choice(total, size=ncols, replace=False) + usecols = np.append([0], np.add(sorted(usecols), 1)) + df_cols = df_cols.iloc[:, usecols] + + dtype_dict = dict((x, np.float32) for x in df_cols.columns[1:]) + df = pd.read_csv(path, engine='c', sep='\t', usecols=usecols, dtype=dtype_dict, + na_values=['na', '-', '']) + + df1 = pd.DataFrame(df.loc[:, 'NAME']) + df1.rename(columns={'NAME': 'Drug'}, inplace=True) + + df2 = df.drop('NAME', 1) + if add_prefix: + df2 = df2.add_prefix('dragon7.') + + df2 = candle.drop_impute_and_scale_dataframe(df2, scaling, imputing, dropna=None) + + df = pd.concat([df1, df2], axis=1) + return df + + +def load_drug_set_fingerprints(drug_set='Combined_PubChem', ncols=None, usecols=None, + scaling=None, imputing=None, add_prefix=False): + fps = ['PFP', 'ECFP'] + usecols_all = usecols + df_merged = None + for fp in fps: + path = get_file(DATA_URL + '{}_dragon7_{}.tsv'.format(drug_set, fp)) + df_cols = pd.read_csv(path, engine='c', sep='\t', nrows=0, skiprows=1, header=None) + total = df_cols.shape[1] - 1 + if usecols_all is not None: + usecols = [x.replace(fp+'.', '') for x in usecols_all] + usecols = [int(x) for x in usecols if x.isdigit()] + usecols = [x for 
x in usecols if x in df_cols.columns] + if usecols[0] != 0: + usecols = [0] + usecols + df_cols = df_cols.loc[:, usecols] + elif ncols and ncols < total: + usecols = np.random.choice(total, size=ncols, replace=False) + usecols = np.append([0], np.add(sorted(usecols), 1)) + df_cols = df_cols.iloc[:, usecols] + + dtype_dict = dict((x, np.float32) for x in df_cols.columns[1:]) + df = pd.read_csv(path, engine='c', sep='\t', skiprows=1, header=None, + usecols=usecols, dtype=dtype_dict) + df.columns = ['{}.{}'.format(fp, x) for x in df.columns] + + col1 = '{}.0'.format(fp) + df1 = pd.DataFrame(df.loc[:, col1]) + df1.rename(columns={col1: 'Drug'}, inplace=True) + + df2 = df.drop(col1, 1) + if add_prefix: + df2 = df2.add_prefix('dragon7.') + + df2 = candle.drop_impute_and_scale_dataframe(df2, scaling, imputing, dropna=None) + + df = pd.concat([df1, df2], axis=1) + + df_merged = df if df_merged is None else df_merged.merge(df) + + return df_merged diff --git a/Pilot1/Uno_UQ/data_utils_/response_data.py b/Pilot1/Uno_UQ/data_utils_/response_data.py new file mode 100644 index 00000000..d4080da8 --- /dev/null +++ b/Pilot1/Uno_UQ/data_utils_/response_data.py @@ -0,0 +1,175 @@ + +import pandas as pd +import numpy as np + +from uno import get_file_p1 as get_file +from uno import loggerUno as logger +from uno import DATA_URL + +global_cache = {} + +def save_combined_dose_response(): + df1 = load_single_dose_response(combo_format=True, fraction=False) + df2 = load_combo_dose_response(fraction=False) + df = pd.concat([df1, df2]) + df.to_csv('combined_drug_growth', index=False, sep='\t') + + +def load_combined_dose_response(rename=True): + df1 = load_single_dose_response(combo_format=True) + logger.info('Loaded {} single drug dose response measurements'.format(df1.shape[0])) + + df2 = load_combo_dose_response() + logger.info('Loaded {} drug pair dose response measurements'.format(df2.shape[0])) + + df = pd.concat([df1, df2]) + logger.info('Combined dose response data contains sources: {}'.format(df['SOURCE'].unique())) + + if rename: + df = df.rename(columns={'SOURCE': 'Source', 'CELL': 'Sample', + 'DRUG1': 'Drug1', 'DRUG2': 'Drug2', + 'DOSE1': 'Dose1', 'DOSE2': 'Dose2', + 'GROWTH': 'Growth', 'STUDY': 'Study'}) + return df + + +def load_single_dose_response(combo_format=False, fraction=True): + # path = get_file(DATA_URL + 'combined_single_drug_growth') + path = get_file(DATA_URL + 'rescaled_combined_single_drug_growth') + + df = global_cache.get(path) + if df is None: + df = pd.read_csv(path, sep='\t', engine='c', + na_values=['na', '-', ''], + # nrows=10, + dtype={'SOURCE': str, 'DRUG_ID': str, + 'CELLNAME': str, 'CONCUNIT': str, + 'LOG_CONCENTRATION': np.float32, + 'EXPID': str, 'GROWTH': np.float32}) + global_cache[path] = df + + df['DOSE'] = -df['LOG_CONCENTRATION'] + + df = df.rename(columns={'CELLNAME': 'CELL', 'DRUG_ID': 'DRUG', 'EXPID': 'STUDY'}) + df = df[['SOURCE', 'CELL', 'DRUG', 'DOSE', 'GROWTH', 'STUDY']] + + if fraction: + df['GROWTH'] /= 100 + + if combo_format: + df = df.rename(columns={'DRUG': 'DRUG1', 'DOSE': 'DOSE1'}) + df['DRUG2'] = np.nan + df['DOSE2'] = np.nan + df['DRUG2'] = df['DRUG2'].astype(object) + df['DOSE2'] = df['DOSE2'].astype(np.float32) + df = df[['SOURCE', 'CELL', 'DRUG1', 'DOSE1', 'DRUG2', 'DOSE2', 'GROWTH', 'STUDY']] + + return df + + +def load_combo_dose_response(fraction=True): + path = get_file(DATA_URL + 'ComboDrugGrowth_Nov2017.csv') + df = global_cache.get(path) + if df is None: + df = pd.read_csv(path, sep=',', engine='c', + na_values=['na','-',''], + 
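+ # read only the columns required to assemble the combo dose-response table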
usecols=['CELLNAME', 'NSC1', 'CONC1', 'NSC2', 'CONC2', + 'PERCENTGROWTH', 'VALID', 'SCREENER', 'STUDY'], + # nrows=10000, + dtype={'CELLNAME': str, 'NSC1': str, 'NSC2': str, + 'CONC1': np.float32, 'CONC2': np.float32, + 'PERCENTGROWTH':np.float32, 'VALID': str, + 'SCREENER': str, 'STUDY': str}, + error_bad_lines=False, warn_bad_lines=True) + global_cache[path] = df + + df = df[df['VALID'] == 'Y'] + + df['SOURCE'] = 'ALMANAC.' + df['SCREENER'] + + cellmap_path = get_file(DATA_URL + 'NCI60_CELLNAME_to_Combo.txt') + df_cellmap = pd.read_csv(cellmap_path, sep='\t') + df_cellmap.set_index('Name', inplace=True) + cellmap = df_cellmap[['NCI60.ID']].to_dict()['NCI60.ID'] + + df['CELL'] = df['CELLNAME'].map(lambda x: cellmap[x]) + + df['DOSE1'] = -np.log10(df['CONC1']) + df['DOSE2'] = -np.log10(df['CONC2']) + + df['DRUG1'] = 'NSC.' + df['NSC1'] + df['DRUG2'] = 'NSC.' + df['NSC2'] + + if fraction: + df['GROWTH'] = df['PERCENTGROWTH'] / 100 + else: + df['GROWTH'] = df['PERCENTGROWTH'] + + df = df[['SOURCE', 'CELL', 'DRUG1', 'DOSE1', 'DRUG2', 'DOSE2', 'GROWTH', 'STUDY']] + + return df + + +def load_aggregated_single_response(target='AUC', min_r2_fit=0.3, max_ec50_se=3, combo_format=False, rename=True): + path = get_file(DATA_URL + 'combined_single_response_agg') + + df = global_cache.get(path) + if df is None: + df = pd.read_csv(path, engine='c', sep='\t', + dtype={'SOURCE': str, 'CELL': str, 'DRUG': str, 'STUDY': str, + 'AUC': np.float32, 'IC50': np.float32, + 'EC50': np.float32, 'EC50se': np.float32, + 'R2fit': np.float32, 'Einf': np.float32, + 'HS': np.float32, 'AAC1': np.float32, + 'AUC1': np.float32, 'DSS1': np.float32}) + global_cache[path] = df + + total = len(df) + + df = df[(df['R2fit'] >= min_r2_fit) & (df['EC50se'] <= max_ec50_se)] + df = df[['SOURCE', 'CELL', 'DRUG', target, 'STUDY']] + df = df[~df[target].isnull()] + + logger.info('Loaded %d dose independent response samples (filtered by EC50se <= %f & R2fit >=%f from a total of %d).', len(df), max_ec50_se, min_r2_fit, total) + + if combo_format: + df = df.rename(columns={'DRUG': 'DRUG1'}) + df['DRUG2'] = np.nan + df['DRUG2'] = df['DRUG2'].astype(object) + df = df[['SOURCE', 'CELL', 'DRUG1', 'DRUG2', target, 'STUDY']] + if rename: + df = df.rename(columns={'SOURCE': 'Source', 'CELL': 'Sample', + 'DRUG1': 'Drug1', 'DRUG2': 'Drug2', 'STUDY': 'Study'}) + else: + if rename: + df = df.rename(columns={'SOURCE': 'Source', 'CELL': 'Sample', + 'DRUG': 'Drug', 'STUDY': 'Study'}) + + return df + + + +def select_drugs_with_response_range(df_response, lower=0, upper=0, span=0, lower_median=None, upper_median=None): + df = df_response.groupby(['Drug1', 'Sample'])['Growth'].agg(['min', 'max', 'median']) + df['span'] = df['max'].clip(lower=-1, upper=1) - df['min'].clip(lower=-1, upper=1) + df = df.groupby('Drug1').mean().reset_index().rename(columns={'Drug1': 'Drug'}) + mask = (df['min'] <= lower) & (df['max'] >= upper) & (df['span'] >= span) + if lower_median: + mask &= (df['median'] >= lower_median) + if upper_median: + mask &= (df['median'] <= upper_median) + df_sub = df[mask] + return df_sub + + +def summarize_response_data(df, target=None): + target = target or 'Growth' + df_sum = df.groupby('Source').agg({target: 'count', 'Sample': 'nunique', + 'Drug1': 'nunique', 'Drug2': 'nunique'}) + if 'Dose1' in df_sum: + df_sum['MedianDose'] = df.groupby('Source').agg({'Dose1': 'median'}) + return df_sum + + + + diff --git a/Pilot1/Uno_UQ/data_utils_/uno.py b/Pilot1/Uno_UQ/data_utils_/uno.py new file mode 100644 index 00000000..4c1ddc56 --- /dev/null +++ 
b/Pilot1/Uno_UQ/data_utils_/uno.py @@ -0,0 +1,353 @@ +from __future__ import print_function + +import os +import sys +import logging +import argparse +try: + import configparser +except ImportError: + import ConfigParser as configparser + +from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error +from scipy.stats.stats import pearsonr + +#file_path = os.path.dirname(os.path.realpath(__file__)) +file_path = os.path.dirname(os.path.realpath(os.path.join(__file__, '..'))) +lib_path = os.path.abspath(os.path.join(file_path, '..')) +sys.path.append(lib_path) +lib_path = os.path.abspath(os.path.join(file_path, 'data_utils_')) +sys.path.append(lib_path) +lib_path = os.path.abspath(os.path.join(file_path, 'model_utils_')) +sys.path.append(lib_path) +lib_path2 = os.path.abspath(os.path.join(file_path, '..', '..', 'common')) +sys.path.append(lib_path2) + + +import candle + +P1B3_URL = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/P1B3/' +DATA_URL = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/combo/' + +loggerUno = logging.getLogger(__name__) + + +def set_up_logger(logfile, logger1, logger2, verbose): + candle.verify_path(logfile) + fh = logging.FileHandler(logfile) + fh.setFormatter(logging.Formatter("[%(asctime)s %(process)d] %(message)s", datefmt="%Y-%m-%d %H:%M:%S")) + fh.setLevel(logging.DEBUG) + + sh = logging.StreamHandler() + sh.setFormatter(logging.Formatter('')) + sh.setLevel(logging.DEBUG if verbose else logging.INFO) + + for log in [logger1, logger2]: + log.setLevel(logging.DEBUG) + log.addHandler(fh) + log.addHandler(sh) + + +def extension_from_parameters(args): + """Construct string for saving model with annotation of parameters""" + ext = '' + ext += '.A={}'.format(args.activation) + ext += '.B={}'.format(args.batch_size) + ext += '.E={}'.format(args.epochs) + ext += '.O={}'.format(args.optimizer) + ext += '.LS={}'.format(args.loss) + # ext += '.LEN={}'.format(args.maxlen) + ext += '.LR={}'.format(args.learning_rate) + ext += '.CF={}'.format(''.join([x[0] for x in sorted(args.cell_features)])) + ext += '.DF={}'.format(''.join([x[0] for x in sorted(args.drug_features)])) + if args.feature_subsample > 0: + ext += '.FS={}'.format(args.feature_subsample) + if args.drop > 0: + ext += '.DR={}'.format(args.drop) + if args.warmup_lr: + ext += '.wu_lr' + if args.reduce_lr: + ext += '.re_lr' + if args.residual: + ext += '.res' + if args.use_landmark_genes: + ext += '.L1000' + if args.no_gen: + ext += '.ng' + for i, n in enumerate(args.dense): + if n > 0: + ext += '.D{}={}'.format(i+1, n) + if args.dense_feature_layers != args.dense: + for i, n in enumerate(args.dense): + if n > 0: + ext += '.FD{}={}'.format(i+1, n) + + return ext + +def set_up_logger_data(verbose=False): + sh = logging.StreamHandler() + sh.setFormatter(logging.Formatter('')) + sh.setLevel(logging.DEBUG if verbose else logging.INFO) + + logger.setLevel(logging.DEBUG) + logger.addHandler(sh) + + +def log_evaluation(metric_outputs, logger, description='Comparing y_true and y_pred:'): + logger.info(description) + for metric, value in metric_outputs.items(): + logger.info(' {}: {:.4f}'.format(metric, value)) + + +def get_file_p1(url): + fname = os.path.basename(url) + return candle.get_file(fname, origin=url, cache_subdir='Pilot1') + + +def dict_compare(d1, d2, ignore=[], expand=False): + d1_keys = set(d1.keys()) - set(ignore) + d2_keys = set(d2.keys()) - set(ignore) + intersect_keys = d1_keys.intersection(d2_keys) + added = d1_keys - d2_keys + removed = d2_keys - d1_keys + modified = 
set({x : (d1[x], d2[x]) for x in intersect_keys if d1[x] != d2[x]}) + common = set(x for x in intersect_keys if d1[x] == d2[x]) + equal = not (added or removed or modified) + if expand: + return equal, added, removed, modified, common + else: + return equal, added | removed | modified + + +def evaluate_prediction(y_true, y_pred): + mse = mean_squared_error(y_true, y_pred) + mae = mean_absolute_error(y_true, y_pred) + r2 = r2_score(y_true, y_pred) + corr, _ = pearsonr(y_true, y_pred) + return {'mse': mse, 'mae': mae, 'r2': r2, 'corr': corr} + + +def read_IDs_file(fname): + + with open(fname, 'r') as f: + read_ids = f.read().splitlines() + + loggerUno.info('Read file: {}'.format(fname)) + loggerUno.info('Number of elements read: {}'.format(len(read_ids))) + + return read_ids + + +class BenchmarkUno(candle.Benchmark): + + def set_locals(self): + """Functionality to set variables specific for the benchmark + - required: set of required parameters for the benchmark. + - additional_definitions: list of dictionaries describing the additional parameters for the + benchmark. + """ + + if required is not None: + self.required = set(required) + if additional_definitions is not None: + self.additional_definitions = additional_definitions + + +additional_definitions = [ +# Feature selection + {'name':'agg_dose', + 'type': str, + 'default': None, + 'choices':['AUC', 'IC50', 'EC50', 'HS', 'AAC1', 'AUC1', 'DSS1'], + 'help':'use dose-independent response data with the specified aggregation metric'}, + {'name':'cell_features', + 'nargs':'+', + 'choices':['rnaseq', 'none'], + 'help':'use rnaseq cell line feature set or none at all'}, + {'name':'drug_features', + 'nargs':'+', + 'choices':['descriptors', 'fingerprints', 'none'], + 'help':'use dragon7 descriptors or fingerprint descriptors for drug features or none at all'}, + {'name': 'by_cell', + 'type':str, + 'default':None, + 'help':'sample ID for building a by-cell model'}, + {'name': 'by_drug', + 'type':str, + 'default':None, + 'help':'drug ID or name for building a by-drug model'}, +# Data set selection + {'name':'train_sources', + 'nargs':'+', + 'choices':['all', 'CCLE', 'CTRP', 'gCSI', 'GDSC', 'NCI60', 'SCL', 'SCLC', 'ALMANAC'], + 'help':'use one or more sources of drug response data for training'}, + {'name':'test_sources', + 'nargs':'+', + 'choices':['train', 'all', 'CCLE', 'CTRP', 'gCSI', 'GDSC', 'NCI60', 'SCL', 'SCLC', 'ALMANAC'], + 'help':'use one or more sources of drug response data for testing'}, +# Sample selection + {'name':'cell_types', + 'nargs':'+', + 'help':'limit training and test data to one or more tissue types'}, + {'name':'cell_subset_path', + 'type': str, + 'default': '', + 'help':'path for file with space delimited molecular sample IDs to keep'}, + {'name':'drug_subset_path', + 'type': str, + 'default': '', + 'help':'path for file with space delimited drug IDs to keep'}, + {'name':'drug_median_response_min', + 'type':float, + 'default':-1, + 'help':'keep drugs whose median response is greater than the threshold'}, + {'name':'drug_median_response_max', + 'type':float, + 'default':1, + 'help':'keep drugs whose median response is less than the threshold'}, +# Training + {'name':'no_feature_source', + 'type': candle.str2bool, + 'default': False, + 'help':'do not embed cell or drug feature source as part of input'}, + {'name':'no_response_source', + 'type': candle.str2bool, + 'default': False, + 'help':'do not encode response data source as an input feature'}, + {'name':'dense_feature_layers', + 'nargs':'+', + 'type':int, + 
'help':'number of neurons in intermediate dense layers in the feature encoding submodels'}, + {'name':'use_landmark_genes', + 'type': candle.str2bool, + 'default': False, + 'help':'use the 978 landmark genes from LINCS (L1000) as expression features'}, + {'name':'use_filtered_genes', + 'type': candle.str2bool, + 'default': False, + 'help':'use the variance filtered genes as expression features'}, + {'name':'feature_subset_path', + 'type': str, + 'default': '', + 'help':'path for file with space delimited features to keep'}, + {'name':'cell_feature_subset_path', + 'type': str, + 'default': '', + 'help':'path for file with space delimited molecular features to keep'}, + {'name':'drug_feature_subset_path', + 'type': str, + 'default': '', + 'help':'path for file with space delimited drug features to keep'}, + {'name':'preprocess_rnaseq', + 'choices':['source_scale', 'combat', 'none'], + 'default':'none', + 'help':'preprocessing method for RNAseq data; none for global normalization'}, + {'name':'residual', + 'type': candle.str2bool, + 'default': False, + 'help':'add skip connections to the layers'}, + {'name':'reduce_lr', + 'type': candle.str2bool, + 'default': False, + 'help':'reduce learning rate on plateau'}, + {'name':'warmup_lr', + 'type': candle.str2bool, + 'default': False, + 'help':'gradually increase learning rate on start'}, + {'name':'base_lr', + 'type':float, + 'default':None, + 'help':'base learning rate'}, + {'name':'cp', + 'type': candle.str2bool, + 'default': False, + 'help':'checkpoint models with best val_loss'}, + {'name':'tb', + 'type': candle.str2bool, + 'default': False, + 'help':'use tensorboard'}, + {'name': 'tb_prefix', + 'type': str, + 'default': 'tb', + 'help': 'prefix name for tb log'}, + {'name':'max_val_loss', + 'type':float, + 'default':argparse.SUPPRESS, + 'help':'retrain if val_loss is greater than the threshold'}, + {'name':'partition_by', + 'choices':['index', 'drug_pair', 'cell'], + 'default':None, + 'help':'cross validation paritioning scheme'}, + {'name':'cv', + 'type':int, + 'default':argparse.SUPPRESS, + 'help':'cross validation folds'}, + {'name':'no_gen', + 'type': candle.str2bool, + 'default': False, + 'help':'do not use generator for training and validation data'}, + {'name':'cache', + 'type': str, + 'default': None, + 'help':'prefix of data cache files to use'}, + {'name':'single', + 'type': candle.str2bool, + 'default': False, + 'help':'do not use drug pair representation'}, + {'name': 'export_csv', + 'type': str, + 'default': None, + 'help': 'output csv file name'}, + {'name':'export_data', + 'type': str, + 'default': None, + 'help':'output dataframe file name'}, + {'name': 'use_exported_data', + 'type': str, + 'default': None, + 'help': 'exported file name'}, + {'name':'growth_bins', + 'type': int, + 'default': 0, + 'help':'number of bins to use when discretizing growth response'}, + {'name' : 'initial_weights', + 'type' : str, + 'default': None, + 'help' : 'file name of initial weights'}, + {'name' : 'save_weights', + 'type': str, + 'default' : None, + 'help': 'name of file to save weights to' }, + {'name':'exclude_cells', 'nargs':'+', + 'default': [], + 'help':'cell line IDs to exclude'}, + {'name':'exclude_drugs', 'nargs':'+', + 'default': [], + 'help':'drug line IDs to exclude'}, + {'name':'sample_repetition', + 'type': candle.str2bool, + 'default': False, + 'help':'allow repetition of training data'} +] + + + +required = [ + 'activation', + 'batch_size', + 'dense', + 'dense_feature_layers', + 'drop', + 'epochs', + 'feature_subsample', + 
'learning_rate', + 'loss', + 'optimizer', + 'residual', + 'rng_seed', + 'save_path', + 'scaling', + 'val_split', + 'solr_root', + 'timeout' + ] diff --git a/Pilot1/Uno_UQ/data_utils_/uno_combined_data_generator.py b/Pilot1/Uno_UQ/data_utils_/uno_combined_data_generator.py new file mode 100644 index 00000000..649780c2 --- /dev/null +++ b/Pilot1/Uno_UQ/data_utils_/uno_combined_data_generator.py @@ -0,0 +1,257 @@ + +from itertools import cycle, islice + +import numpy as np +import pandas as pd + +from keras.utils import Sequence + +def values_or_dataframe(df, contiguous=False, dataframe=False): + if dataframe: + return df + mat = df.values + if contiguous: + mat = np.ascontiguousarray(mat) + return mat + + +class CombinedDataGenerator(Sequence):#object): + """Generate training, validation or testing batches from loaded data + """ +# def __init__(self, data, partition='train', fold=0, source=None, batch_size=32, shuffle=True): + def __init__(self, data, partition='train', fold=0, source=None, batch_size=32, shuffle=True, single=False, rank=0, total_ranks=1): + + self.data = data + self.partition = partition + self.batch_size = batch_size + self.single = single + + if partition == 'train': + index = data.train_indexes[fold] + elif partition == 'val': + index = data.val_indexes[fold] + else: + index = data.test_indexes[fold] + + if source: + df = data.df_response[['Source']].iloc[index, :] + index = df.index[df['Source'] == source] + + if shuffle: + index = np.random.permutation(index) + # index = index[:len(index)//10] + + # sharing by rank + samples_per_rank = len(index) // total_ranks + samples_per_rank = self.batch_size * (samples_per_rank // self.batch_size) + + self.index = index[rank * samples_per_rank:(rank + 1) * samples_per_rank] + self.index_cycle = cycle(self.index) + self.size = len(self.index) + self.steps = self.size // self.batch_size + print("partition:{0}, rank:{1}, sharded index size:{2}, batch_size:{3}, steps:{4}".format(partition, rank, self.size, self.batch_size, self.steps)) + + +# self.index = index +# self.index_cycle = cycle(index) +# self.size = len(index) +# self.steps = np.ceil(self.size / batch_size) +# # self.steps = np.ceil(self.size / batch_size / 100) + + def __len__(self): + return self.steps + + def __getitem__(self, idx): + shard = self.index[idx * self.batch_size:(idx + 1) * self.batch_size] + x_list, y = self.get_slice(self.batch_size, single=self.single, partial_index=shard) + return x_list, y + + def reset(self): + self.index_cycle = cycle(self.index) + + def get_response(self, copy=False): + df = self.data.df_response.iloc[self.index, :].drop(['Group'], axis=1) + return df.copy() if copy else df + +# def get_slice(self, size=None, contiguous=True, single=False, dataframe=False): + def get_slice(self, size=None, contiguous=True, single=False, dataframe=False, partial_index=None): + size = size or self.size + single = single or self.data.agg_dose + target = self.data.agg_dose or 'Growth' + +# index = list(islice(self.index_cycle, size)) + if partial_index is not None: + index = partial_index + else: + index = list(islice(self.index_cycle, size)) + df_orig = self.data.df_response.iloc[index, :] + df = df_orig.copy() + + if not single: + df['Swap'] = np.random.choice([True, False], df.shape[0]) + swap = df_orig['Drug2'].notnull() & df['Swap'] + df.loc[swap, 'Drug1'] = df_orig.loc[swap, 'Drug2'] + df.loc[swap, 'Drug2'] = df_orig.loc[swap, 'Drug1'] + if not self.data.agg_dose: + df['DoseSplit'] = np.random.uniform(0.001, 0.999, df.shape[0]) + df.loc[swap, 
'Dose1'] = df_orig.loc[swap, 'Dose2'] + df.loc[swap, 'Dose2'] = df_orig.loc[swap, 'Dose1'] + + split = df_orig['Drug2'].isnull() + if not single: + df.loc[split, 'Drug2'] = df_orig.loc[split, 'Drug1'] + if not self.data.agg_dose: + df.loc[split, 'Dose1'] = df_orig.loc[split, 'Dose1'] - np.log10(df.loc[split, 'DoseSplit']) + df.loc[split, 'Dose2'] = df_orig.loc[split, 'Dose1'] - np.log10(1 - df.loc[split, 'DoseSplit']) + + if dataframe: + cols = [target, 'Sample', 'Drug1', 'Drug2'] if not single else [target, 'Sample', 'Drug1'] + y = df[cols].reset_index(drop=True) + else: + y = values_or_dataframe(df[target], contiguous, dataframe) + + x_list = [] + + if not self.data.agg_dose: + doses = ['Dose1', 'Dose2'] if not single else ['Dose1'] + for dose in doses: + x = values_or_dataframe(df[[dose]].reset_index(drop=True), contiguous, dataframe) + x_list.append(x) + + if self.data.encode_response_source: + df_x = pd.merge(df[['Source']], self.data.df_source, on='Source', how='left') + df_x.drop(['Source'], axis=1, inplace=True) + x = values_or_dataframe(df_x, contiguous, dataframe) + x_list.append(x) + + for fea in self.data.cell_features: + df_cell = getattr(self.data, self.data.cell_df_dict[fea]) + df_x = pd.merge(df[['Sample']], df_cell, on='Sample', how='left') + df_x.drop(['Sample'], axis=1, inplace=True) + x = values_or_dataframe(df_x, contiguous, dataframe) + x_list.append(x) + + drugs = ['Drug1', 'Drug2'] if not single else ['Drug1'] + for drug in drugs: + for fea in self.data.drug_features: + df_drug = getattr(self.data, self.data.drug_df_dict[fea]) + df_x = pd.merge(df[[drug]], df_drug, left_on=drug, right_on='Drug', how='left') + df_x.drop([drug, 'Drug'], axis=1, inplace=True) + if dataframe and not single: + df_x = df_x.add_prefix(drug + '.') + x = values_or_dataframe(df_x, contiguous, dataframe) + x_list.append(x) + + # print(x_list, y) + return x_list, y + + def flow(self, single=False): + while 1: + x_list, y = self.get_slice(self.batch_size, single=single) + yield x_list, y + + +def test_generator(loader): + gen = CombinedDataGenerator(loader).flow() + x_list, y = next(gen) + print('x shapes:') + for x in x_list: + print(x.shape) + print('y shape:') + print(y.shape) + + +def find_columns_with_str(df, substr): + col_indices = [df.columns.get_loc(col) for col in df.columns if substr in col] + + return col_indices + +class FromFileDataGenerator(object): + """Generate testing batches from loaded data + """ + def __init__(self, df_data, indices, target_str, feature_names_list, num_features_list, batch_size=32, shuffle=True): + + self.batch_size = batch_size + + index = indices + + if shuffle: + index = np.random.permutation(index) + + self.index = index + self.index_cycle = cycle(index) + self.size = len(index) + self.steps = np.ceil(self.size / batch_size) + + self.num_features_list = num_features_list + + try : # Try to get the 'target_str' column + target = df_data.columns.get_loc(target_str) + except KeyError: # The 'target_str' column is not available in data file + # No ground truth available + y_fake = np.zeros(df_data.shape[0]) + df_data['fake_target'] = y_fake + self.target = df_data.columns.get_loc('fake_target') + else: # 'target_str' column is available --> use this column + self.target = target + + self.df_data = df_data + self.offset = self.compute_offset(feature_names_list) + + def compute_offset(self, feature_names): + offset = self.df_data.shape[1] + for name in feature_names: + col_indices = find_columns_with_str(self.df_data, name) + if len(col_indices) > 0: + 
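+ # the feature block begins at the earliest column whose name matches one of the model's expected feature names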
first_col = np.min(col_indices) + if first_col < offset: + offset = first_col + + if offset == self.df_data.shape[1]: + raise Exception('ERROR ! Feature names from model are not in file. ' \ + 'These are features in model: ' + str(sorted(feature_names)) + \ + '... Exiting') + + return offset + + def reset(self): + self.index_cycle = cycle(self.index) + + def get_response(self, copy=False): + df = self.df_data.iloc[self.index, :] + return df.copy() if copy else df + + def get_slice(self, size=None, contiguous=True): + + size = size or self.size + index = list(islice(self.index_cycle, size)) + df_orig = self.df_data.iloc[index, :] + df = df_orig.copy() + + #Features --> + x_list = [] + start = self.offset + # features need to be provided in the partitions expected by the model + for i,numf in enumerate(self.num_features_list): + end = start + numf + mat = df.iloc[:,start:end].values + if contiguous: + mat = np.ascontiguousarray(mat) + x_list.append(mat) + start = end + + # Target + mat = df.iloc[:,self.target].values + if contiguous: + mat = np.ascontiguousarray(mat) + y = mat + + # print(x_list, y) + return x_list, y + + + def flow(self, single=False): + while 1: + x_list, y = self.get_slice(self.batch_size) + yield x_list, y + diff --git a/Pilot1/Uno_UQ/data_utils_/uno_combined_data_loader.py b/Pilot1/Uno_UQ/data_utils_/uno_combined_data_loader.py new file mode 100644 index 00000000..be5a8483 --- /dev/null +++ b/Pilot1/Uno_UQ/data_utils_/uno_combined_data_loader.py @@ -0,0 +1,427 @@ +from __future__ import print_function + +import collections +import json +import logging +import os +import pickle + +import pandas as pd +import numpy as np + +from sklearn.model_selection import ShuffleSplit, KFold + +import cellline_data +import drug_data +import response_data + +from uno import loggerUno as logger +from uno import dict_compare + +SEED = 2019 + +def encode_sources(sources): + df = pd.get_dummies(sources, prefix='source', prefix_sep='.') + df['Source'] = sources + source_l1 = df['Source'].str.extract('^(\S+)\.', expand=False) + df1 = pd.get_dummies(source_l1, prefix='source.L1', prefix_sep='.') + df = pd.concat([df1, df], axis=1) + df = df.set_index('Source').reset_index() + return df + +def read_set_from_file(path): + if path: + with open(path, 'r') as f: + text = f.read().strip() + subset = text.split() + else: + subset = None + return subset + + +def assign_partition_groups(df, partition_by='drug_pair'): + if partition_by == 'cell': + group = df['Sample'] + elif partition_by == 'drug_pair': + df_info = drug_data.load_drug_info() + id_dict = df_info[['ID', 'PUBCHEM']].drop_duplicates(['ID']).set_index('ID').iloc[:, 0] + group = df['Drug1'].copy() + group[(df['Drug2'].notnull()) & (df['Drug1'] <= df['Drug2'])] = df['Drug1'] + ',' + df['Drug2'] + group[(df['Drug2'].notnull()) & (df['Drug1'] > df['Drug2'])] = df['Drug2'] + ',' + df['Drug1'] + group2 = group.map(id_dict) + mapped = group2.notnull() + group[mapped] = group2[mapped] + elif partition_by == 'index': + group = df.reset_index()['index'] + logger.info('Grouped response data by %s: %d groups', partition_by, group.nunique()) + return group + + +class CombinedDataLoader(object): + def __init__(self, seed=SEED): + self.seed = seed + self.test_indexes = [[]] + + def load_from_cache(self, cache, params): + param_fname = '{}.params.json'.format(cache) + if not os.path.isfile(param_fname): + logger.warning('Cache parameter file does not exist: %s', param_fname) + return False + with open(param_fname) as param_file: + try: + cached_params = 
json.load(param_file) + except json.JSONDecodeError as e: + logger.warning('Could not decode parameter file %s: %s', param_fname, e) + return False + ignore_keys = ['cache', 'partition_by', 'single'] + equal, diffs = dict_compare(params, cached_params, ignore_keys) + if not equal: + logger.warning('Cache parameter mismatch: %s\nSaved: %s\nAttempted to load: %s', diffs, cached_params, params) + logger.warning('\nRemove %s to rebuild data cache.\n', param_fname) + raise ValueError('Could not load from a cache with incompatible keys:', diffs) + else: + fname = '{}.pkl'.format(cache) + if not os.path.isfile(fname): + logger.warning('Cache file does not exist: %s', fname) + return False + with open(fname, 'rb') as f: + obj = pickle.load(f) + self.__dict__.update(obj.__dict__) + logger.info('Loaded data from cache: %s', fname) + return True + return False + + def save_to_cache(self, cache, params): + for k in ['self', 'cache', 'single']: + if k in params: + del params[k] + param_fname = '{}.params.json'.format(cache) + with open(param_fname, 'w') as param_file: + json.dump(params, param_file, sort_keys=True) + fname = '{}.pkl'.format(cache) + with open(fname, 'wb') as f: + pickle.dump(self, f, pickle.HIGHEST_PROTOCOL) + logger.info('Saved data to cache: %s', fname) + + def partition_data(self, partition_by=None, cv_folds=1, train_split=0.7, val_split=0.2, + cell_types=None, by_cell=None, by_drug=None, + cell_subset_path=None, drug_subset_path=None, + exclude_cells=[], exclude_drugs=[], exclude_indices=[]): + + seed = self.seed + train_sep_sources = self.train_sep_sources + test_sep_sources = self.test_sep_sources + df_response = self.df_response + + + if not partition_by: + if by_drug and by_cell: + partition_by = 'index' + elif by_drug: + partition_by = 'cell' + else: + partition_by = 'drug_pair' + + + # Exclude specified cells / drugs / indices + if exclude_cells != []: + df_response = df_response[~df_response['Sample'].isin(exclude_cells)] + if exclude_drugs != []: + if np.isin('Drug', df_response.columns.values): + df_response = df_response[~df_response['Drug1'].isin(exclude_drugs)] + else: + df_response = df_response[~df_response['Drug1'].isin(exclude_drugs) & ~df_response['Drug2'].isin(exclude_drugs)] + if exclude_indices != []: + df_response = df_response.drop(exclude_indices, axis=0) + logger.info('Excluding indices specified') + + if partition_by != self.partition_by: + df_response = df_response.assign(Group = assign_partition_groups(df_response, partition_by)) + + mask = df_response['Source'].isin(train_sep_sources) + test_mask = df_response['Source'].isin(test_sep_sources) + + if by_drug: + drug_ids = drug_data.drug_name_to_ids(by_drug) + logger.info('Mapped drug IDs for %s: %s', by_drug, drug_ids) + mask &= (df_response['Drug1'].isin(drug_ids)) & (df_response['Drug2'].isnull()) + test_mask &= (df_response['Drug1'].isin(drug_ids)) & (df_response['Drug2'].isnull()) + + if by_cell: + cell_ids = cellline_data.cell_name_to_ids(by_cell) + logger.info('Mapped sample IDs for %s: %s', by_cell, cell_ids) + mask &= (df_response['Sample'].isin(cell_ids)) + test_mask &= (df_response['Sample'].isin(cell_ids)) + + if cell_subset_path: + cell_subset = read_set_from_file(cell_subset_path) + mask &= (df_response['Sample'].isin(cell_subset)) + test_mask &= (df_response['Sample'].isin(cell_subset)) + + if drug_subset_path: + drug_subset = read_set_from_file(drug_subset_path) + mask &= (df_response['Drug1'].isin(drug_subset)) & ((df_response['Drug2'].isnull()) | (df_response['Drug2'].isin(drug_subset))) + 
test_mask &= (df_response['Drug1'].isin(drug_subset)) & ((df_response['Drug2'].isnull()) | (df_response['Drug2'].isin(drug_subset))) + + if cell_types: + df_type = cellline_data.load_cell_metadata() + cell_ids = set() + for cell_type in cell_types: + cells = df_type[~df_type['TUMOR_TYPE'].isnull() & df_type['TUMOR_TYPE'].str.contains(cell_type, case=False)] + cell_ids |= set(cells['ANL_ID'].tolist()) + logger.info('Mapped sample tissue types for %s: %s', cell_type, set(cells['TUMOR_TYPE'].tolist())) + mask &= (df_response['Sample'].isin(cell_ids)) + test_mask &= (df_response['Sample'].isin(cell_ids)) + + + df_group = df_response[mask]['Group'].drop_duplicates().reset_index(drop=True) + + if cv_folds > 1: + selector = KFold(n_splits=cv_folds, shuffle=True, random_state=seed) + else: + selector = ShuffleSplit(n_splits=1, train_size=train_split, test_size=val_split, random_state=seed) + + splits = selector.split(df_group) + + train_indexes = [] + val_indexes = [] + test_indexes = [] + + for index, (train_group_index, val_group_index) in enumerate(splits): + train_groups = set(df_group.values[train_group_index]) + val_groups = set(df_group.values[val_group_index]) + train_index = df_response.index[df_response['Group'].isin(train_groups) & mask] + val_index = df_response.index[df_response['Group'].isin(val_groups) & mask] + test_index = df_response.index[~df_response['Group'].isin(train_groups) & ~df_response['Group'].isin(val_groups) & test_mask] + + train_indexes.append(train_index) + val_indexes.append(val_index) + test_indexes.append(test_index) + if logger.isEnabledFor(logging.DEBUG): + logger.debug('CV fold %d: train data = %s, val data = %s, test data = %s', index, train_index.shape[0], val_index.shape[0], test_index.shape[0]) + logger.debug(' train groups (%d): %s', df_response.loc[train_index]['Group'].nunique(), df_response.loc[train_index]['Group'].unique()) + logger.debug(' val groups ({%d}): %s', df_response.loc[val_index]['Group'].nunique(), df_response.loc[val_index]['Group'].unique()) + logger.debug(' test groups ({%d}): %s', df_response.loc[test_index]['Group'].nunique(), df_response.loc[test_index]['Group'].unique()) + + + self.partition_by = partition_by + self.cv_folds = cv_folds + self.train_indexes = train_indexes + self.val_indexes = val_indexes + self.test_indexes = test_indexes + + def build_feature_list(self, single=False): + input_features = collections.OrderedDict() + feature_shapes = collections.OrderedDict() + + if not self.agg_dose: + doses = ['dose1', 'dose2'] if not single else ['dose1'] + for dose in doses: + input_features[dose] = 'dose' + feature_shapes['dose'] = (1,) + + if self.encode_response_source: + input_features['response.source'] = 'response.source' + feature_shapes['response.source'] = (self.df_source.shape[1] - 1,) + + for fea in self.cell_features: + feature_type = 'cell.' + fea + feature_name = 'cell.' + fea + df_cell = getattr(self, self.cell_df_dict[fea]) + input_features[feature_name] = feature_type + feature_shapes[feature_type] = (df_cell.shape[1] - 1,) + + drugs = ['drug1', 'drug2'] if not single else ['drug1'] + for drug in drugs: + for fea in self.drug_features: + feature_type = 'drug.' + fea + feature_name = drug + '.' 
+ fea + df_drug = getattr(self, self.drug_df_dict[fea]) + input_features[feature_name] = feature_type + feature_shapes[feature_type] = (df_drug.shape[1] - 1,) + + input_dim = sum([np.prod(feature_shapes[x]) for x in input_features.values()]) + + self.input_features = input_features + self.feature_shapes = feature_shapes + self.input_dim = input_dim + + logger.info('Input features shapes:') + for k, v in self.input_features.items(): + logger.info(' {}: {}'.format(k, self.feature_shapes[v])) + logger.info('Total input dimensions: {}'.format(self.input_dim)) + + + def load(self, cache=None, ncols=None, scaling='std', dropna=None, + agg_dose=None, embed_feature_source=True, encode_response_source=True, + cell_features=['rnaseq'], drug_features=['descriptors', 'fingerprints'], + cell_feature_subset_path=None, drug_feature_subset_path=None, + drug_lower_response=1, drug_upper_response=-1, drug_response_span=0, + drug_median_response_min=-1, drug_median_response_max=1, + use_landmark_genes=False, use_filtered_genes=False, + preprocess_rnaseq=None, single=False, + # train_sources=['GDSC', 'CTRP', 'ALMANAC', 'NCI60'], + train_sources=['GDSC', 'CTRP', 'ALMANAC'], + # val_sources='train', + # test_sources=['CCLE', 'gCSI'], + test_sources=['train'], + partition_by='drug_pair'): + + params = locals().copy() + del params['self'] + + if not cell_features or 'none' in [x.lower() for x in cell_features]: + cell_features = [] + + if not drug_features or 'none' in [x.lower() for x in drug_features]: + drug_features = [] + + if cache and self.load_from_cache(cache, params): + self.build_feature_list(single=single) + return + + logger.info('Loading data from scratch ...') + + if agg_dose: + df_response = response_data.load_aggregated_single_response(target=agg_dose, combo_format=True) + else: + df_response = response_data.load_combined_dose_response() + + if logger.isEnabledFor(logging.INFO): + logger.info('Summary of combined dose response by source:') + logger.info(response_data.summarize_response_data(df_response, target=agg_dose)) + + all_sources = df_response['Source'].unique() + df_source = encode_sources(all_sources) + + if 'all' in train_sources: + train_sources = all_sources + if 'all' in test_sources: + test_sources = all_sources + elif 'train' in test_sources: + test_sources = train_sources + + train_sep_sources = [x for x in all_sources for y in train_sources if x.startswith(y)] + test_sep_sources = [x for x in all_sources for y in test_sources if x.startswith(y)] + + ids1 = df_response[['Drug1']].drop_duplicates().rename(columns={'Drug1':'Drug'}) + ids2 = df_response[['Drug2']].drop_duplicates().rename(columns={'Drug2':'Drug'}) + df_drugs_with_response = pd.concat([ids1, ids2]).drop_duplicates().dropna().reset_index(drop=True) + df_cells_with_response = df_response[['Sample']].drop_duplicates().reset_index(drop=True) + logger.info('Combined raw dose response data has %d unique samples and %d unique drugs', df_cells_with_response.shape[0], df_drugs_with_response.shape[0]) + + if agg_dose: + df_selected_drugs = None + else: + logger.info('Limiting drugs to those with response min <= %g, max >= %g, span >= %g, median_min <= %g, median_max >= %g ...', drug_lower_response, drug_upper_response, drug_response_span, drug_median_response_min, drug_median_response_max) + df_selected_drugs = response_data.select_drugs_with_response_range(df_response, span=drug_response_span, lower=drug_lower_response, upper=drug_upper_response, lower_median=drug_median_response_min, upper_median=drug_median_response_max) + 
logger.info('Selected %d drugs from %d', df_selected_drugs.shape[0], df_response['Drug1'].nunique()) + + + cell_feature_subset = read_set_from_file(cell_feature_subset_path) + drug_feature_subset = read_set_from_file(drug_feature_subset_path) + + for fea in cell_features: + fea = fea.lower() + if fea == 'rnaseq' or fea == 'expression': + df_cell_rnaseq = cellline_data.load_cell_rnaseq(ncols=ncols, scaling=scaling, use_landmark_genes=use_landmark_genes, use_filtered_genes=use_filtered_genes, feature_subset=cell_feature_subset, preprocess_rnaseq=preprocess_rnaseq, embed_feature_source=embed_feature_source) + + for fea in drug_features: + fea = fea.lower() + if fea == 'descriptors': + df_drug_desc = drug_data.load_drug_descriptors(ncols=ncols, scaling=scaling, dropna=dropna, feature_subset=drug_feature_subset) + elif fea == 'fingerprints': + df_drug_fp = drug_data.load_drug_fingerprints(ncols=ncols, scaling=scaling, dropna=dropna, feature_subset=drug_feature_subset) + + # df_drug_desc, df_drug_fp = drug_data.load_drug_data(ncols=ncols, scaling=scaling, dropna=dropna) + + cell_df_dict = {'rnaseq': 'df_cell_rnaseq'} + + drug_df_dict = {'descriptors': 'df_drug_desc', + 'fingerprints': 'df_drug_fp'} + + # df_cell_ids = df_cell_rnaseq[['Sample']].drop_duplicates() + # df_drug_ids = pd.concat([df_drug_desc[['Drug']], df_drug_fp[['Drug']]]).drop_duplicates() + + logger.info('Filtering drug response data...') + + df_cell_ids = df_cells_with_response + for fea in cell_features: + df_cell = locals()[cell_df_dict[fea]] + df_cell_ids = df_cell_ids.merge(df_cell[['Sample']]).drop_duplicates() + logger.info(' %d molecular samples with feature and response data', df_cell_ids.shape[0]) + + df_drug_ids = df_drugs_with_response + for fea in drug_features: + df_drug = locals()[drug_df_dict[fea]] + df_drug_ids = df_drug_ids.merge(df_drug[['Drug']]).drop_duplicates() + + if df_selected_drugs is not None: + df_drug_ids = df_drug_ids.merge(df_selected_drugs).drop_duplicates() + logger.info(' %d selected drugs with feature and response data', df_drug_ids.shape[0]) + + df_response = df_response[df_response['Sample'].isin(df_cell_ids['Sample']) & + df_response['Drug1'].isin(df_drug_ids['Drug']) & + (df_response['Drug2'].isin(df_drug_ids['Drug']) | df_response['Drug2'].isnull())] + + df_response = df_response[df_response['Source'].isin(train_sep_sources + test_sep_sources)] + + df_response.reset_index(drop=True, inplace=True) + + if logger.isEnabledFor(logging.INFO): + logger.info('Summary of filtered dose response by source:') + logger.info(response_data.summarize_response_data(df_response, target=agg_dose)) + + df_response = df_response.assign(Group = assign_partition_groups(df_response, partition_by)) + + self.agg_dose = agg_dose + self.cell_features = cell_features + self.drug_features = drug_features + self.cell_df_dict = cell_df_dict + self.drug_df_dict = drug_df_dict + self.df_source = df_source + self.df_response = df_response + self.embed_feature_source = embed_feature_source + self.encode_response_source = encode_response_source + self.all_sources = all_sources + self.train_sources = train_sources + self.test_sources = test_sources + self.train_sep_sources = train_sep_sources + self.test_sep_sources = test_sep_sources + self.partition_by = partition_by + + for var in (list(drug_df_dict.values()) + list(cell_df_dict.values())): + value = locals().get(var) + if value is not None: + setattr(self, var, value) + + self.build_feature_list(single=single) + + if cache: + self.save_to_cache(cache, params) + + + def 
get_cells_in_val(self): + + val_cell_ids = list(set(self.df_response.loc[self.val_indexes[0]]['Sample'].values)) + + return val_cell_ids + + + def get_drugs_in_val(self): + + if np.isin('Drug', self.df_response.columns.values): + val_drug_ids = list(set(self.df_response.loc[self.val_indexes[0]]['Drug'].values)) + else: + val_drug_ids = list(set(self.df_response.loc[self.val_indexes[0]]['Drug1'].values)) + + return val_drug_ids + + + def get_index_in_val(self): + + val_indices = list(set(self.val_indexes[0])) + + return val_indices + + diff --git a/Pilot1/Uno_UQ/model_utils_/__init__.py b/Pilot1/Uno_UQ/model_utils_/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/Pilot1/Uno_UQ/model_utils_/uno_model_utils.py b/Pilot1/Uno_UQ/model_utils_/uno_model_utils.py new file mode 100644 index 00000000..244c1ba8 --- /dev/null +++ b/Pilot1/Uno_UQ/model_utils_/uno_model_utils.py @@ -0,0 +1,307 @@ +#! /usr/bin/env python + + +import numpy as np + +import keras +from keras import backend as K +from keras.models import Model +from keras.layers import Input, Dense, Dropout +from keras.callbacks import Callback +from keras import regularizers +from keras.metrics import mean_squared_error, mean_absolute_error + +import candle + + +def r2_heteroscedastic(y_true, y_pred): + y_out = K.reshape(y_pred[:,:-1], K.shape(y_true)) + SS_res = K.sum(K.square(y_true - y_out)) + SS_tot = K.sum(K.square(y_true - K.mean(y_true))) + return (1 - SS_res/(SS_tot + K.epsilon())) + + +def mae_heteroscedastic(y_true, y_pred): + y_out = K.reshape(y_pred[:,:-1], K.shape(y_true)) + return mean_absolute_error(y_true, y_out) + +def mse_heteroscedastic(y_true, y_pred): + y_out = K.reshape(y_pred[:,:-1], K.shape(y_true)) + return mean_squared_error(y_true, y_out) + +def meanS_heteroscesdastic(y_true, y_pred): + log_sig2 = y_pred[:,1] + return K.mean(log_sig2) + +def quantile_loss(quantile, y_true, y_pred): + error = (y_true - y_pred) + return K.mean(K.maximum(quantile*error, (quantile-1)*error), axis=-1) + +def quantile50(y_true, y_pred): + y_out0 = K.reshape(y_pred[:,0], K.shape(y_true)) + error = (y_true-y_out0) + quantile = 0.5 + return quantile_loss(quantile, y_true, y_out0) + + +def quantile10(y_true, y_pred): + y_out1 = K.reshape(y_pred[:,1], K.shape(y_true)) + error = (y_true-y_out1) + quantile = 0.1 + return quantile_loss(quantile, y_true, y_out1) + + +def quantile90(y_true, y_pred): + y_out2 = K.reshape(y_pred[:,2], K.shape(y_true)) + error = (y_true-y_out2) + quantile = 0.9 + return quantile_loss(quantile, y_true, y_out2) + + +class ModelRecorder(Callback): + def __init__(self, save_all_models=False): + Callback.__init__(self) + self.save_all_models = save_all_models + candle.register_permanent_dropout() + + def on_train_begin(self, logs={}): + self.val_losses = [] + self.best_val_loss = np.Inf + self.best_model = None + + def on_epoch_end(self, epoch, logs={}): + val_loss = logs.get('val_loss') + self.val_losses.append(val_loss) + if val_loss < self.best_val_loss: + self.best_model = keras.models.clone_model(self.model) + self.best_val_loss = val_loss + + +class SimpleWeightSaver(Callback): + + def __init__(self, fname): + self.fname = fname + + def set_model(self, model): + if isinstance(model.layers[-2], Model): + self.model = model.layers[-2] + else: + self.model = model + + def on_train_end(self, logs={}): + self.model.save_weights(self.fname) + + +def build_model(loader, args, logger=None, permanent_dropout=True, silent=False): + if args.loss == 'heteroscedastic': + model = 
build_heteroscedastic_model(loader, args, logger, permanent_dropout, silent) + elif args.loss == 'quantile': + model = build_quantile_model(loader, args, logger, permanent_dropout, silent) + else: + model = build_homoscedastic_model(loader, args, logger, permanent_dropout, silent) + + return model + +def build_feature_model(input_shape, name='', dense_layers=[1000, 1000], + activation='relu', residual=False, + dropout_rate=0, permanent_dropout=True, + reg_l2=0): + x_input = Input(shape=input_shape) + h = x_input + for i, layer in enumerate(dense_layers): + x = h + if reg_l2 > 0: + h = Dense(layer, activation=activation, kernel_regularizer=regularizers.l2(reg_l2))(h) + else: + h = Dense(layer, activation=activation)(h) + if dropout_rate > 0: + if permanent_dropout: + h = candle.PermanentDropout(dropout_rate)(h) + else: + h = Dropout(dropout_rate)(h) + if residual: + try: + h = keras.layers.add([h, x]) + except ValueError: + pass + model = Model(x_input, h, name=name) + return model + + +def build_homoscedastic_model(loader, args, logger=None, permanent_dropout=True, silent=False): + input_models = {} + dropout_rate = args.drop + reg_l2 = args.reg_l2 + for fea_type, shape in loader.feature_shapes.items(): + base_type = fea_type.split('.')[0] + if base_type in ['cell', 'drug']: + box = build_feature_model(input_shape=shape, name=fea_type, + dense_layers=args.dense_feature_layers, + dropout_rate=dropout_rate, permanent_dropout=permanent_dropout, + reg_l2=reg_l2) + if not silent: + logger.debug('Feature encoding submodel for %s:', fea_type) + box.summary(print_fn=logger.debug) + input_models[fea_type] = box + + inputs = [] + encoded_inputs = [] + for fea_name, fea_type in loader.input_features.items(): + shape = loader.feature_shapes[fea_type] + fea_input = Input(shape, name='input.'+fea_name) + inputs.append(fea_input) + if fea_type in input_models: + input_model = input_models[fea_type] + encoded = input_model(fea_input) + else: + encoded = fea_input + encoded_inputs.append(encoded) + + merged = keras.layers.concatenate(encoded_inputs) + + h = merged + for i, layer in enumerate(args.dense): + x = h + if reg_l2 > 0: + h = Dense(layer, activation=args.activation, kernel_regularizer=regularizers.l2(reg_l2))(h) + else: + h = Dense(layer, activation=args.activation)(h) + if dropout_rate > 0: + if permanent_dropout: + h = candle.PermanentDropout(dropout_rate)(h) + else: + h = Dropout(dropout_rate)(h) + if args.residual: + try: + h = keras.layers.add([h, x]) + except ValueError: + pass + output = Dense(1)(h) + + return Model(inputs, output) + + +def build_heteroscedastic_model(loader, args, logger=None, permanent_dropout=True, silent=False): + input_models = {} + dropout_rate = args.drop + reg_l2 = args.reg_l2 + for fea_type, shape in loader.feature_shapes.items(): + base_type = fea_type.split('.')[0] + if base_type in ['cell', 'drug']: + box = build_feature_model(input_shape=shape, name=fea_type, + dense_layers=args.dense_feature_layers, + dropout_rate=dropout_rate, permanent_dropout=permanent_dropout, + reg_l2=reg_l2) + if not silent: + logger.debug('Feature encoding submodel for %s:', fea_type) + box.summary(print_fn=logger.debug) + input_models[fea_type] = box + + inputs = [] + encoded_inputs = [] + for fea_name, fea_type in loader.input_features.items(): + shape = loader.feature_shapes[fea_type] + fea_input = Input(shape, name='input.'+fea_name) + inputs.append(fea_input) + if fea_type in input_models: + input_model = input_models[fea_type] + encoded = input_model(fea_input) + else: + encoded = 
fea_input + encoded_inputs.append(encoded) + + merged = keras.layers.concatenate(encoded_inputs) + + h = merged + for i, layer in enumerate(args.dense): + x = h + if reg_l2 > 0: + h = Dense(layer, activation=args.activation, kernel_regularizer=regularizers.l2(reg_l2))(h) + else: + h = Dense(layer, activation=args.activation)(h) + if dropout_rate > 0: + if permanent_dropout: + h = candle.PermanentDropout(dropout_rate)(h) + else: + h = Dropout(dropout_rate)(h) + if args.residual: + try: + h = keras.layers.add([h, x]) + except ValueError: + pass + output = Dense(2, bias_initializer='ones')(h) + + return Model(inputs, output) + +def build_quantile_model(loader, args, logger=None, permanent_dropout=True, silent=False): + input_models = {} + dropout_rate = args.drop + reg_l2 = args.reg_l2 + for fea_type, shape in loader.feature_shapes.items(): + base_type = fea_type.split('.')[0] + if base_type in ['cell', 'drug']: + box = build_feature_model(input_shape=shape, name=fea_type, + dense_layers=args.dense_feature_layers, + dropout_rate=dropout_rate, + permanent_dropout=permanent_dropout, + reg_l2=reg_l2) + if not silent: + logger.debug('Feature encoding submodel for %s:', fea_type) + box.summary(print_fn=logger.debug) + input_models[fea_type] = box + + inputs = [] + encoded_inputs = [] + for fea_name, fea_type in loader.input_features.items(): + shape = loader.feature_shapes[fea_type] + fea_input = Input(shape, name='input.'+fea_name) + inputs.append(fea_input) + if fea_type in input_models: + input_model = input_models[fea_type] + encoded = input_model(fea_input) + else: + encoded = fea_input + encoded_inputs.append(encoded) + + merged = keras.layers.concatenate(encoded_inputs) + + h = merged + for i, layer in enumerate(args.dense): + x = h + h = Dense(layer, activation=args.activation, kernel_regularizer=regularizers.l2(args.reg_l2))(h) + if dropout_rate > 0: + if permanent_dropout: + h = candle.PermanentDropout(dropout_rate)(h) + else: + h = Dropout(dropout_rate)(h) + if args.residual: + try: + h = keras.layers.add([h, x]) + except ValueError: + pass + output = Dense(3, bias_initializer='ones')(h) + + return Model(inputs, output) + + +def heteroscedastic_loss(y_true, y_pred): + y_shape = K.shape(y_true) + y_out = K.reshape(y_pred[:,0], y_shape) + diff_sq = K.square(y_out - y_true) + log_sig2 = y_pred[:,1] + + return K.mean(K.exp(-log_sig2) * diff_sq + log_sig2) + + +def tilted_loss(quantile, y_true, f): + error = (y_true-f) + return K.mean(K.maximum(quantile*error, (quantile-1)*error), axis=-1) + + +def triple_quantile_loss(y_true, y_pred): + y_shape = K.shape(y_true) + y_out0 = K.reshape(y_pred[:,0], y_shape) + y_out1 = K.reshape(y_pred[:,1], y_shape) + y_out2 = K.reshape(y_pred[:,2], y_shape) + + return tilted_loss(0.1, y_true, y_out1) + tilted_loss(0.9, y_true, y_out2) + 2. 
* tilted_loss(0.5, y_true, y_out0) diff --git a/Pilot1/Uno_UQ/uno_gCSI_modelUQ.txt b/Pilot1/Uno_UQ/uno_gCSI_modelUQ.txt new file mode 100644 index 00000000..71fec820 --- /dev/null +++ b/Pilot1/Uno_UQ/uno_gCSI_modelUQ.txt @@ -0,0 +1,39 @@ +[Global_Params] +train_sources=['gCSI'] +test_sources=['train'] +cell_types=None +cell_features=['rnaseq'] +drug_features=['descriptors', 'fingerprints'] +dense=[1000, 1000, 1000] +dense_feature_layers=[1000, 1000, 1000] +activation='relu' +loss='mse' +optimizer='adam' +scaling='std' +drop=0 +epochs=10 +batch_size=32 +val_split=0.2 +cv=1 +max_val_loss=1.0 +learning_rate=None +base_lr=None +residual=False +reduce_lr=False +warmup_lr=False +batch_normalization=False +feature_subsample=0 +rng_seed=2018 +save_path='save_gCSI/' +no_gen=False +verbose = False +single=True +agg_dose='AUC' +no_feature_source=True +no_response_source=True +use_landmark_genes=True +partition_by='cell' + +[Monitor_Params] +solr_root='' +timeout=3600 diff --git a/Pilot1/Uno_UQ/uno_holdoutUQ_data.py b/Pilot1/Uno_UQ/uno_holdoutUQ_data.py new file mode 100644 index 00000000..165f940f --- /dev/null +++ b/Pilot1/Uno_UQ/uno_holdoutUQ_data.py @@ -0,0 +1,109 @@ +#! /usr/bin/env python + +from __future__ import division, print_function + +import logging +import os + +from keras import backend as K + +import data_utils_.uno as uno +import candle + +import data_utils_.uno_combined_data_loader as uno_combined_data_loader + + +logger = logging.getLogger(__name__) + +os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' + + +def initialize_parameters(): + + # Build benchmark object + unoBmk = uno.BenchmarkUno(uno.file_path, 'uno_default_model.txt', 'keras', + prog='uno_holdoutUQ_data', desc='Build data split for UQ analysis in the problem of prediction of tumor response to drug pairs.') + + # Initialize parameters + gParameters = candle.initialize_parameters(unoBmk) + #benchmark.logger.info('Params: {}'.format(gParameters)) + + return gParameters + + +def run(params): + args = candle.ArgumentStruct(**params) + candle.set_seed(args.rng_seed) + ext = uno.extension_from_parameters(args) + candle.verify_path(args.save_path) + prefix = args.save_path + ext + logfile = args.logfile if args.logfile else prefix+'.log' + uno.set_up_logger(logfile, logger, uno.loggerUno, args.verbose) + logger.info('Params: {}'.format(params)) + + loader = uno_combined_data_loader.CombinedDataLoader(args.rng_seed) + loader.load(cache=args.cache, + ncols=args.feature_subsample, + agg_dose=args.agg_dose, + cell_features=args.cell_features, + drug_features=args.drug_features, + drug_median_response_min=args.drug_median_response_min, + drug_median_response_max=args.drug_median_response_max, + use_landmark_genes=args.use_landmark_genes, + use_filtered_genes=args.use_filtered_genes, + cell_feature_subset_path=args.cell_feature_subset_path or args.feature_subset_path, + drug_feature_subset_path=args.drug_feature_subset_path or args.feature_subset_path, + preprocess_rnaseq=args.preprocess_rnaseq, + single=args.single, + train_sources=args.train_sources, + test_sources=args.test_sources, + embed_feature_source=not args.no_feature_source, + encode_response_source=not args.no_response_source, + partition_by=args.partition_by + ) + + target = args.agg_dose or 'Growth' + val_split = args.val_split + train_split = 1 - val_split + + loader.partition_data(partition_by=args.partition_by, + cv_folds=args.cv, train_split=train_split, + val_split=val_split, cell_types=args.cell_types, + by_cell=args.by_cell, by_drug=args.by_drug, + 
cell_subset_path=args.cell_subset_path, + drug_subset_path=args.drug_subset_path + ) + + print('partition_by: ', args.partition_by) + if args.partition_by == 'drug_pair': + fname_drugs = 'infer_drug_ids' + pds = loader.get_drugs_in_val() + with open(fname_drugs, 'w') as f: + for item in pds: + f.write('%s\n' % item) + logger.info('Drug IDs in holdout set written in file: {}'.format(fname_drugs)) + elif args.partition_by == 'cell': + fname_cells = 'infer_cell_ids' + pcs = loader.get_cells_in_val() + with open(fname_cells, 'w') as f: + for item in pcs: + f.write('%s\n' % item) + logger.info('Cell IDs in holdout set written in file: {}'.format(fname_cells)) + else : # + fname_index = 'infer_index_ids' + pins = loader.get_index_in_val() + with open(fname_index, 'w') as f: + for item in pins: + f.write('%s\n' % item) + logger.info('Indices in holdout set written in file: {}'.format(fname_index)) + + +def main(): + params = initialize_parameters() + run(params) + + +if __name__ == '__main__': + main() + if K.backend() == 'tensorflow': + K.clear_session() diff --git a/Pilot1/Uno_UQ/uno_inferUQ_keras2.py b/Pilot1/Uno_UQ/uno_inferUQ_keras2.py new file mode 100644 index 00000000..af1c7934 --- /dev/null +++ b/Pilot1/Uno_UQ/uno_inferUQ_keras2.py @@ -0,0 +1,296 @@ +#! /usr/bin/env python + +from __future__ import division, print_function + +import argparse +import logging +import os + +import numpy as np +import pandas as pd + +from itertools import cycle + +from keras import backend as K + +import keras +from keras.utils import get_custom_objects + +import data_utils_.uno as uno +import candle + +import data_utils_.uno_combined_data_loader as uno_combined_data_loader +import data_utils_.uno_combined_data_generator as uno_combined_data_generator +import model_utils_.uno_model_utils as uno_model_utils + +logger = logging.getLogger(__name__) + +os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' + + +additional_definitions_local = [ +{'name':'uq_infer_file', + 'default':argparse.SUPPRESS, + 'action':'store', + 'help':'File to do inference'}, +{'name':'uq_infer_given_drugs', + 'type': candle.str2bool, + 'default': False, + 'help':'Use given inference file to obtain drug ids to do inference'}, +{'name':'uq_infer_given_cells', + 'type': candle.str2bool, + 'default': False, + 'help':'Use given inference file to obtain cell ids to do inference'}, +{'name':'uq_infer_given_indices', + 'type': candle.str2bool, + 'default': False, + 'help':'Use given inference file to obtain indices to do inference'}, +{'name':'weights_file', + 'default':'saved.weights.h5', + 'help':'trained weights file (loading model file alone sometimes does not work in keras)'}, +{'name':'n_pred', + 'type':int, + 'default':1, + 'help':'the number of predictions to make for each sample-drug combination for uncertainty quantification'} +] + +required_local = ( 'model_file', 'weights_file', 'uq_infer_file', + 'agg_dose', 'batch_size') + + +def initialize_parameters(): + + # Build benchmark object + unoBmk = uno.BenchmarkUno(uno.file_path, 'uno_default_model.txt', 'keras', + prog='uno_inferUQ', desc='Read models to predict tumor response to single and paired drugs.') + + unoBmk.additional_definitions += additional_definitions_local + unoBmk.required = unoBmk.required.union(required_local) + + # Initialize parameters + gParameters = candle.initialize_parameters(unoBmk) + #benchmark.logger.info('Params: {}'.format(gParameters)) + + return gParameters + + +def from_file(args, model): + + df_data = pd.read_csv(args.uq_infer_file, sep='\t') + logger.info('data 
shape: {}'.format(df_data.shape)) + logger.info('Size of data to infer: {}'.format(df_data.shape)) + + test_indices = range(df_data.shape[0]) + target_str = args.agg_dose or 'Growth' + + # Extract size of input layers to get number of features + num_features_list = [] + feature_names_list = [] + for layer in model.layers: # All layers in model + dict = layer.get_config() # getting layer config info + name = dict['name'] # getting layer name + if name.find('input') > -1: # if layer is an input layer + feature_names_list.append(name.split('.')[-1]) + size_ = dict['batch_input_shape'] # get layer size + num_features_list.append(size_[1]) + + feature_names_list.append('dragon7') + + test_gen = uno_combined_data_generator.FromFileDataGenerator(df_data, test_indices, + target_str, feature_names_list, num_features_list, + batch_size=args.batch_size, shuffle=False) + + return test_gen + + +def given_drugs(args, loader): + + test_gen = uno_combined_data_generator.CombinedDataGenerator(loader, partition='test', batch_size=args.batch_size) + + # Include specified drugs + include_drugs = uno.read_IDs_file(args.uq_infer_file) + df_response = test_gen.data.df_response + if np.isin('Drug', df_response.columns.values): + df = df_response[['Drug']] + index = df.index[df['Drug'].isin(include_drugs)] + else: + df = df_response[['Drug1', 'Drug2']] + index = df.index[df['Drug1'].isin(include_drugs) | + df['Drug2'].isin(include_drugs)] + + # Update object + test_gen.index = index + test_gen.index_cycle = cycle(index) + test_gen.size = len(index) + test_gen.steps = np.ceil(test_gen.size / args.batch_size) + + return test_gen + + +def given_cells(args, loader): + + test_gen = uno_combined_data_generator.CombinedDataGenerator(loader, partition='test', batch_size=args.batch_size) + + # Include specified cells + include_cells = uno.read_IDs_file(args.uq_infer_file) + df = test_gen.data.df_response[['Sample']] + index = df.index[df['Sample'].isin(include_cells)] + + # Update object + test_gen.index = index + test_gen.index_cycle = cycle(index) + test_gen.size = len(index) + test_gen.steps = np.ceil(test_gen.size / args.batch_size) + + return test_gen + + +def given_indices(args, loader): + + test_gen = uno_combined_data_generator.CombinedDataGenerator(loader, partition='test', batch_size=args.batch_size) + + # Include specified indices + index = uno.read_IDs_file(args.uq_infer_file) + + # Update object + test_gen.index = index + test_gen.index_cycle = cycle(index) + test_gen.size = len(index) + test_gen.steps = np.ceil(test_gen.size / args.batch_size) + + return test_gen + + +def run(params): + args = candle.ArgumentStruct(**params) + candle.set_seed(args.rng_seed) + logfile_def = 'uno_infer_from_' + args.uq_infer_file + '.log' + logfile = args.logfile if args.logfile else logfile_def + uno.set_up_logger(logfile, logger, uno.loggerUno, args.verbose) + logger.info('Params: {}'.format(params)) + + ext = uno.extension_from_parameters(args) + candle.verify_path(args.save_path) + prefix = args.save_path + 'uno' + ext + + # Load trained model + candle.register_permanent_dropout() + model = keras.models.load_model(args.model_file, compile=False) + model.load_weights(args.weights_file) + logger.info('Loaded model:') + model.summary(print_fn=logger.info) + + # Determine output to infer + target = args.agg_dose or 'Growth' + + if (args.uq_infer_given_drugs or args.uq_infer_given_cells or args.uq_infer_given_indices): + loader = uno_combined_data_loader.CombinedDataLoader(args.rng_seed) + loader.load(cache=args.cache, + 
ncols=args.feature_subsample, + agg_dose=args.agg_dose, + cell_features=args.cell_features, + drug_features=args.drug_features, + drug_median_response_min=args.drug_median_response_min, + drug_median_response_max=args.drug_median_response_max, + use_landmark_genes=args.use_landmark_genes, + use_filtered_genes=args.use_filtered_genes, + cell_feature_subset_path=args.cell_feature_subset_path or args.feature_subset_path, + drug_feature_subset_path=args.drug_feature_subset_path or args.feature_subset_path, + preprocess_rnaseq=args.preprocess_rnaseq, + single=args.single, + train_sources=args.train_sources, + test_sources=args.test_sources, + embed_feature_source=not args.no_feature_source, + encode_response_source=not args.no_response_source, + ) + + if args.uq_infer_given_drugs: + test_gen = given_drugs(args, loader) + elif args.uq_infer_given_cells: + test_gen = given_cells(args, loader) + else: + test_gen = given_indices(args, loader) + + else: + test_gen = from_file(args, model) + + + df_test = test_gen.get_response(copy=True) + y_test = df_test[target].values + + for i in range(args.n_pred): + + if args.no_gen: + x_test_list, y_test = test_gen.get_slice(size=test_gen.size, single=args.single) + y_test_pred = model.predict(x_test_list, batch_size=args.batch_size) + else: + test_gen.reset() + y_test_pred = model.predict_generator(test_gen.flow(single=args.single), test_gen.steps) + y_test_pred = y_test_pred[:test_gen.size] + + if args.loss == 'heteroscedastic': + y_test_pred_ = y_test_pred[:,0] + s_test_pred = y_test_pred[:,1] + + y_test_pred = y_test_pred_.flatten() + + df_test['Predicted_'+target+'_'+str(i+1)] = y_test_pred + df_test['Pred_S_'+target+'_'+str(i+1)] = s_test_pred + + pred_fname = prefix + '.predicted_INFER_HET.tsv' + + elif args.loss == 'quantile': + + y_test_pred_50q = y_test_pred[:,0] + y_test_pred_10q = y_test_pred[:,1] + y_test_pred_90q = y_test_pred[:,2] + + y_test_pred = y_test_pred_50q.flatten() # 50th quantile prediction + + df_test['Predicted_50q_'+target+'_'+str(i+1)] = y_test_pred + df_test['Predicted_10q_'+target+'_'+str(i+1)] = y_test_pred_10q.flatten() + df_test['Predicted_90q_'+target+'_'+str(i+1)] = y_test_pred_90q.flatten() + + pred_fname = prefix + '.predicted_INFER_QTL.tsv' + + else: + y_test_pred = y_test_pred.flatten() + df_test['Predicted_'+target+'_'+str(i+1)] = y_test_pred + pred_fname = prefix + '.predicted_INFER.tsv' + + if args.n_pred < 21: + scores = uno.evaluate_prediction(y_test, y_test_pred) + uno.log_evaluation(scores, logger) + + df_pred = df_test + if args.agg_dose: + if args.single: + df_pred.sort_values(['Sample', 'Drug1', target], inplace=True) + else: + df_pred.sort_values(['Sample', 'Drug1', 'Drug2', target], inplace=True) + else: + if args.single: + df_pred.sort_values(['Sample', 'Drug1', 'Dose1', 'Growth'], inplace=True) + else: + df_pred.sort_values(['Sample', 'Drug1', 'Drug2', 'Dose1', 'Dose2', 'Growth'], inplace=True) + + df_pred.to_csv(pred_fname, sep='\t', index=False, float_format='%.4g') + logger.info('Predictions stored in file: {}'.format(pred_fname)) + + + if K.backend() == 'tensorflow': + K.clear_session() + + logger.handlers = [] + + +def main(): + params = initialize_parameters() + run(params) + + +if __name__ == '__main__': + main() + if K.backend() == 'tensorflow': + K.clear_session() + diff --git a/Pilot1/Uno_UQ/uno_trainUQ_keras2.py b/Pilot1/Uno_UQ/uno_trainUQ_keras2.py new file mode 100644 index 00000000..8a06da16 --- /dev/null +++ b/Pilot1/Uno_UQ/uno_trainUQ_keras2.py @@ -0,0 +1,404 @@ +#! 
/usr/bin/env python + +from __future__ import division, print_function + +import argparse +import logging +import os + +import numpy as np +import pandas as pd + + +from keras import backend as K +from keras import optimizers +from keras.models import Model +from keras.callbacks import Callback, ModelCheckpoint, ReduceLROnPlateau, LearningRateScheduler, TensorBoard +from keras.utils.vis_utils import plot_model + +import data_utils_.uno as uno +import candle + +import data_utils_.uno_combined_data_loader as uno_combined_data_loader +import data_utils_.uno_combined_data_generator as uno_combined_data_generator +import model_utils_.uno_model_utils as uno_model_utils + +from model_utils_.uno_model_utils import heteroscedastic_loss, triple_quantile_loss + +logger = logging.getLogger(__name__) + +os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' + +additional_definitions = [ +{'name':'uq_exclude_drugs_file', + 'default':argparse.SUPPRESS, + 'action':'store', + 'help':'File with drug ids to exclude from training'}, +{'name':'uq_exclude_cells_file', + 'default':argparse.SUPPRESS, + 'action':'store', + 'help':'File with cell ids to exclude from training'}, +{'name':'uq_exclude_indices_file', + 'default':argparse.SUPPRESS, + 'action':'store', + 'help':'File with indices to exclude from training'}, +{'name':'exclude_indices', 'nargs':'+', + 'default': [], + 'help':'indices to exclude'}, +{'name':'reg_l2', + 'type': float, + 'default': 0., + 'help':'weight of regularization for l2 norm of nn weights'} +] + +required = ['exclude_drugs', 'exclude_cells', 'exclude_indices'] + +class UQUno(candle.Benchmark): + def set_locals(self): + """Functionality to set variables specific for the benchmark + - required: set of required parameters for the benchmark. + - additional_definitions: list of dictionaries describing the additional parameters for the + benchmark. 
+ """ + + if required is not None: + self.required = set(uno.required) + self.required.update(required) + if additional_definitions is not None: + self.additional_definitions = additional_definitions + uno.additional_definitions + + + +def initialize_parameters(): + + # Build benchmark object + unoUQBmk = UQUno(uno.file_path, 'uno_defaultUQ_model.txt', 'keras', + prog='uno_trainUQ', desc='Build neural network based models to predict tumor response to single and paired drugs, including UQ analysis.') + + # Initialize parameters + gParameters = candle.initialize_parameters(unoUQBmk) + #benchmark.logger.info('Params: {}'.format(gParameters)) + + return gParameters + + +def run(params): + args = candle.ArgumentStruct(**params) + candle.set_seed(args.rng_seed) + ext = uno.extension_from_parameters(args) + candle.verify_path(args.save_path) + prefix = args.save_path + 'uno' + ext + logfile = args.logfile if args.logfile else prefix+'.log' + uno.set_up_logger(logfile, logger, uno.loggerUno, args.verbose) + logger.info('Params: {}'.format(params)) + + # Exclude drugs / cells for UQ + if 'uq_exclude_drugs_file' in params.keys(): + args.exclude_drugs = uno.read_IDs_file(args.uq_exclude_drugs_file) + logger.info('Drugs to exclude: {}'.format(args.exclude_drugs)) + else: + args.exclude_drugs = [] + if 'uq_exclude_cells_file' in params.keys(): + args.exclude_cells = uno.read_IDs_file(args.uq_exclude_cells_file) + logger.info('Cells to exclude: {}'.format(args.exclude_cells)) + else: + args.exclude_cells = [] + + if 'uq_exclude_indices_file' in params.keys(): + exclude_indices_ = uno.read_IDs_file(args.uq_exclude_indices_file) + args.exclude_indices = [int(x) for x in exclude_indices_] + logger.info('Indices to exclude: {}'.format(args.exclude_indices)) + else: + args.exclude_indices = [] + + + if (len(args.gpus) > 0): + import tensorflow as tf + config = tf.ConfigProto() + config.gpu_options.allow_growth = True + config.gpu_options.visible_device_list = ",".join(map(str, args.gpus)) + K.set_session(tf.Session(config=config)) + + loader = uno_combined_data_loader.CombinedDataLoader(seed=args.rng_seed) + loader.load(cache=args.cache, + ncols=args.feature_subsample, + agg_dose=args.agg_dose, + cell_features=args.cell_features, + drug_features=args.drug_features, + drug_median_response_min=args.drug_median_response_min, + drug_median_response_max=args.drug_median_response_max, + use_landmark_genes=args.use_landmark_genes, + use_filtered_genes=args.use_filtered_genes, + cell_feature_subset_path=args.cell_feature_subset_path or args.feature_subset_path, + drug_feature_subset_path=args.drug_feature_subset_path or args.feature_subset_path, + preprocess_rnaseq=args.preprocess_rnaseq, + single=args.single, + train_sources=args.train_sources, + test_sources=args.test_sources, + embed_feature_source=not args.no_feature_source, + encode_response_source=not args.no_response_source, + ) + + target = args.agg_dose or 'Growth' + val_split = args.val_split + train_split = 1 - val_split + + loader.partition_data(partition_by=args.partition_by, + cv_folds=args.cv, train_split=train_split, val_split=val_split, + cell_types=args.cell_types, by_cell=args.by_cell, by_drug=args.by_drug, + cell_subset_path=args.cell_subset_path, + drug_subset_path=args.drug_subset_path, + exclude_cells=args.exclude_cells, + exclude_drugs=args.exclude_drugs, + exclude_indices=args.exclude_indices + ) + + model = uno_model_utils.build_model(loader, args, logger) + logger.info('Combined model:') + model.summary(print_fn=logger.info) + # 
plot_model(model, to_file=prefix+'.model.png', show_shapes=True) + + if args.cp: + model_json = model.to_json() + with open(prefix+'.model.json', 'w') as f: + print(model_json, file=f) + + def warmup_scheduler(epoch): + lr = args.learning_rate or base_lr * args.batch_size/100 + if epoch <= 5: + K.set_value(model.optimizer.lr, (base_lr * (5-epoch) + lr * epoch) / 5) + logger.debug('Epoch {}: lr={:.5g}'.format(epoch, K.get_value(model.optimizer.lr))) + return K.get_value(model.optimizer.lr) + + df_pred_list = [] + + cv_ext = '' + cv = args.cv if args.cv > 1 else 1 + + for fold in range(cv): + if args.cv > 1: + logger.info('Cross validation fold {}/{}:'.format(fold+1, cv)) + cv_ext = '.cv{}'.format(fold+1) + +# model = uno_model_utils.build_model(loader, args, logger, silent=True) + + template_model = uno_model_utils.build_model(loader, args, logger, silent=True) + if args.initial_weights: + logger.info("Loading weights from {}".format(args.initial_weights)) + template_model.load_weights(args.initial_weights) + + if len(args.gpus) > 1: + from keras.utils import multi_gpu_model + gpu_count = len(args.gpus) + logger.info("Multi GPU with {} gpus".format(gpu_count)) + model = multi_gpu_model(template_model, cpu_merge=False, gpus=gpu_count) + else: + model = template_model + + + optimizer = optimizers.deserialize({'class_name': args.optimizer, 'config': {}}) + base_lr = args.base_lr or K.get_value(optimizer.lr) + if args.learning_rate: + K.set_value(optimizer.lr, args.learning_rate) + + if args.loss == 'heteroscedastic': + logger.info('Training heteroscedastic model:') + model.compile(loss=heteroscedastic_loss, optimizer=optimizer, metrics=[uno_model_utils.mae_heteroscedastic, uno_model_utils.r2_heteroscedastic, uno_model_utils.meanS_heteroscesdastic]) + elif args.loss == 'quantile': + logger.info('Training quantile model:') + model.compile(loss=triple_quantile_loss, optimizer=optimizer, metrics=[uno_model_utils.quantile50, uno_model_utils.quantile10, uno_model_utils.quantile90]) + else: + logger.info('Training homoscedastic model:') + model.compile(loss=args.loss, optimizer=optimizer, metrics=[candle.mae, candle.r2]) + + # calculate trainable and non-trainable params + params.update(candle.compute_trainable_params(model)) + + candle_monitor = candle.CandleRemoteMonitor(params=params) + timeout_monitor = candle.TerminateOnTimeOut(params['timeout']) + + reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=0.00001) + warmup_lr = LearningRateScheduler(warmup_scheduler) + #checkpointer = ModelCheckpoint(prefix+cv_ext+'.weights.h5', save_best_only=True, save_weights_only=True) + checkpointer = candle.MultiGPUCheckpoint(prefix + cv_ext + '.model.h5', save_best_only=True) + tensorboard = TensorBoard(log_dir="tb/{}{}{}".format(args.tb_prefix, ext, cv_ext)) + history_logger = candle.LoggingCallback(logger.debug) +# model_recorder = uno_model_utils.ModelRecorder() + + # callbacks = [history_logger, model_recorder] + callbacks = [candle_monitor, timeout_monitor, history_logger]#, model_recorder] + if args.reduce_lr: + callbacks.append(reduce_lr) + if args.warmup_lr: + callbacks.append(warmup_lr) + if args.cp: + callbacks.append(checkpointer) + if args.tb: + callbacks.append(tensorboard) + if args.save_weights: + callbacks.append(uno_model_utils.SimpleWeightSaver(args.save_path + '/' + args.save_weights)) + + + train_gen = uno_combined_data_generator.CombinedDataGenerator(loader, fold=fold, batch_size=args.batch_size, shuffle=args.shuffle) + val_gen = 
uno_combined_data_generator.CombinedDataGenerator(loader, partition='val', fold=fold, batch_size=args.batch_size, shuffle=args.shuffle) + + df_val = val_gen.get_response(copy=True) + y_val = df_val[target].values + y_shuf = np.random.permutation(y_val) + uno.log_evaluation(uno.evaluate_prediction(y_val, y_shuf), logger, + description='Between random pairs in y_val:') + + if args.no_gen: + x_train_list, y_train = train_gen.get_slice(size=train_gen.size, single=args.single) + x_val_list, y_val = val_gen.get_slice(size=val_gen.size, single=args.single) + history = model.fit(x_train_list, y_train, + batch_size=args.batch_size, + epochs=args.epochs, + callbacks=callbacks, + validation_data=(x_val_list, y_val)) + else: + logger.info('Data points per epoch: train = %d, val = %d',train_gen.size, val_gen.size) + logger.info('Steps per epoch: train = %d, val = %d',train_gen.steps, val_gen.steps) + history = model.fit_generator(train_gen, train_gen.steps, + epochs=args.epochs, + callbacks=callbacks, + validation_data=val_gen, + validation_steps=val_gen.steps) + +# if args.cp: +# model.load_weights(prefix+cv_ext+'.weights.h5') + # model = model_recorder.best_model + + if args.no_gen: + y_val_pred = model.predict(x_val_list, batch_size=args.batch_size) + else: + val_gen.reset() + y_val_pred = model.predict_generator(val_gen, val_gen.steps + 1) + y_val_pred = y_val_pred[:val_gen.size] + + if args.loss == 'heteroscedastic': + y_val_pred_ = y_val_pred[:,0] + s_val_pred = y_val_pred[:,1] + + y_val_pred = y_val_pred_.flatten() + + df_val['Predicted_'+target] = y_val_pred + df_val[target+'_Error'] = y_val_pred-y_val + df_val['Pred_S_'+target] = s_val_pred + + elif args.loss == 'quantile': + y_val_pred_50q = y_val_pred[:,0] + y_val_pred_10q = y_val_pred[:,1] + y_val_pred_90q = y_val_pred[:,2] + + y_val_pred = y_val_pred_50q.flatten() # 50th quantile prediction + + df_val['Predicted_50q_'+target] = y_val_pred + df_val[target+'_Error_50q'] = y_val_pred-y_val + df_val['Predicted_10q_'+target] = y_val_pred_10q.flatten() + df_val['Predicted_90q_'+target] = y_val_pred_90q.flatten() + + else: + y_val_pred = y_val_pred.flatten() + + # df_val = df_val.assign(PredictedGrowth=y_val_pred, GrowthError=y_val_pred-y_val) + df_val['Predicted'+target] = y_val_pred + df_val[target+'Error'] = y_val_pred-y_val + + scores = uno.evaluate_prediction(y_val, y_val_pred) + uno.log_evaluation(scores, logger) + + df_pred_list.append(df_val) + +# if args.cp: +# model_recorder.best_model.save(prefix+'.model.h5') + + if hasattr(history, 'loss'): + candle.plot_history(prefix, history, 'loss') + if args.loss == 'heteroscedastic': + if hasattr(history, 'r2_heteroscedastic'): + candle.plot_history(prefix, history, 'r2_heteroscedastic') + if hasattr(history, 'meanS_heteroscedastic'): + candle.plot_history(prefix, history, 'meanS_heteroscesdastic') + elif args.loss == 'quantile': + if hasattr(history, 'quantile50'): + candle.plot_history(prefix, history, 'quantile50') + if hasattr(history, 'quantile10'): + candle.plot_history(prefix, history, 'quantile10') + if hasattr(history, 'quantile90'): + candle.plot_history(prefix, history, 'quantile90') + else: + if hasattr(history, 'r2'): + candle.plot_history(prefix, history, 'r2') + + pred_fname = prefix + '.predicted.tsv' + df_pred = pd.concat(df_pred_list) + if args.agg_dose: + if args.single: +# df_pred.sort_values(['Source', 'Sample', 'Drug1', target], inplace=True) + df_pred.sort_values(['Sample', 'Drug1', target], inplace=True) + else: + df_pred.sort_values(['Source', 'Sample', 'Drug1', 'Drug2', 
target], inplace=True) + else: + if args.single: +# df_pred.sort_values(['Source', 'Sample', 'Drug1', 'Dose1', 'Growth'], inplace=True) + df_pred.sort_values(['Sample', 'Drug1', 'Dose1', 'Growth'], inplace=True) + else: +# df_pred.sort_values(['Source', 'Sample', 'Drug1', 'Drug2', 'Dose1', 'Dose2', 'Growth'], inplace=True) + df_pred.sort_values(['Sample', 'Drug1', 'Drug2', 'Dose1', 'Dose2', 'Growth'], inplace=True) + df_pred.to_csv(pred_fname, sep='\t', index=False, float_format='%.4g') + logger.info('Testing predictions stored in file: {}'.format(pred_fname)) + + if args.cp: + logger.info('Model stored in file: {}'.format(prefix+'.model.h5')) +# logger.info('Model weights stored in file: {}'.format(prefix+cv_ext+'.weights.h5')) + logger.info('Model weights stored in file: {}'.format(args.save_path + '/' + args.save_weights)) + + if args.cv > 1: + scores = uno.evaluate_prediction(df_pred[target], df_pred['Predicted'+target]) + uno.log_evaluation(scores, logger, description='Combining cross validation folds:') + + for test_source in loader.test_sep_sources: + test_gen = uno_combined_data_generator.CombinedDataGenerator(loader, partition='test', batch_size=args.batch_size, source=test_source) + df_test = test_gen.get_response(copy=True) + y_test = df_test[target].values + n_test = len(y_test) + if n_test == 0: + continue + if args.no_gen: + x_test_list, y_test = test_gen.get_slice(size=test_gen.size, single=args.single) + y_test_pred = model.predict(x_test_list, batch_size=args.batch_size) + if args.loss == 'heteroscedastic': + y_test_pred = y_test_pred[:,0] + elif args.loss == 'quantile': + y_test_pred = y_test_pred[:,0] # 50th quantile prediction + else: + y_test_pred = model.predict_generator(test_gen.flow(single=args.single), test_gen.steps) + if args.loss == 'heteroscedastic': + y_test_pred = y_test_pred[:test_gen.size,0] + elif args.loss == 'quantile': + y_test_pred = y_test_pred[:test_gen.size,0] # 50th quantile prediction + else: + y_test_pred = y_test_pred[:test_gen.size] + + y_test_pred = y_test_pred.flatten() + scores = uno.evaluate_prediction(y_test, y_test_pred) + uno.log_evaluation(scores, logger, description='Testing on data from {} ({})'.format(test_source, n_test)) + + if K.backend() == 'tensorflow': + K.clear_session() + + logger.handlers = [] + + return history + + +def main(): + params = initialize_parameters() + run(params) + + +if __name__ == '__main__': + main() + if K.backend() == 'tensorflow': + K.clear_session() diff --git a/common/candle/__init__.py b/common/candle/__init__.py index b8bf19c9..486ef1ef 100644 --- a/common/candle/__init__.py +++ b/common/candle/__init__.py @@ -6,6 +6,10 @@ from data_utils import load_csv_data from data_utils import load_Xy_one_hot_data2 from data_utils import load_Xy_data_noheader +from data_utils import drop_impute_and_scale_dataframe +from data_utils import discretize_dataframe +from data_utils import discretize_array +from data_utils import lookup #import from file_utils from file_utils import get_file @@ -25,6 +29,25 @@ # import from viz_utils from viz_utils import plot_history from viz_utils import plot_scatter +from viz_utils import plot_density_observed_vs_predicted +from viz_utils import plot_2d_density_sigma_vs_error +from viz_utils import plot_histogram_error_per_sigma +from viz_utils import plot_calibration_and_errors +from viz_utils import plot_percentile_predictions + + +# import from uq_utils +from uq_utils import compute_statistics_homoscedastic +from uq_utils import compute_statistics_homoscedastic_all +from uq_utils 
import compute_statistics_heteroscedastic +from uq_utils import compute_statistics_quantile +from uq_utils import split_data_for_empirical_calibration +from uq_utils import compute_empirical_calibration +from uq_utils import bining_for_calibration +from uq_utils import computation_of_valid_calibration_interval +from uq_utils import applying_calibration +from uq_utils import overprediction_check + # import benchmark-dependent utils import sys @@ -41,6 +64,11 @@ from keras_utils import PermanentDropout from keras_utils import register_permanent_dropout from keras_utils import LoggingCallback + from keras_utils import MultiGPUCheckpoint + from keras_utils import r2 + from keras_utils import mae + from keras_utils import mse + from solr_keras import CandleRemoteMonitor from solr_keras import compute_trainable_params diff --git a/common/candle_keras/__init__.py b/common/candle_keras/__init__.py index c5eccf06..bcf15874 100644 --- a/common/candle_keras/__init__.py +++ b/common/candle_keras/__init__.py @@ -6,6 +6,10 @@ from data_utils import load_csv_data from data_utils import load_Xy_one_hot_data2 from data_utils import load_Xy_data_noheader +from data_utils import drop_impute_and_scale_dataframe +from data_utils import discretize_dataframe +from data_utils import discretize_array +from data_utils import lookup #import from file_utils from file_utils import get_file @@ -20,6 +24,30 @@ from default_utils import keras_default_config from default_utils import set_up_logger +from generic_utils import Progbar + +# import from viz_utils +from viz_utils import plot_history +from viz_utils import plot_scatter +from viz_utils import plot_density_observed_vs_predicted +from viz_utils import plot_2d_density_sigma_vs_error +from viz_utils import plot_histogram_error_per_sigma +from viz_utils import plot_calibration_and_errors +from viz_utils import plot_percentile_predictions + +# import from uq_utils +from uq_utils import compute_statistics_homoscedastic +from uq_utils import compute_statistics_homoscedastic_all +from uq_utils import compute_statistics_heteroscedastic +from uq_utils import compute_statistics_quantile +from uq_utils import split_data_for_empirical_calibration +from uq_utils import compute_empirical_calibration +from uq_utils import bining_for_calibration +from uq_utils import computation_of_valid_calibration_interval +from uq_utils import applying_calibration +from uq_utils import overprediction_check + + #import from keras_utils #from keras_utils import dense #from keras_utils import add_dense @@ -30,8 +58,10 @@ from keras_utils import PermanentDropout from keras_utils import register_permanent_dropout from keras_utils import LoggingCallback +from keras_utils import r2 +from keras_utils import mae +from keras_utils import mse -from generic_utils import Progbar from solr_keras import CandleRemoteMonitor from solr_keras import compute_trainable_params diff --git a/common/data_utils.py b/common/data_utils.py index c17a3b42..856c63a0 100644 --- a/common/data_utils.py +++ b/common/data_utils.py @@ -3,7 +3,8 @@ import numpy as np import pandas as pd -from sklearn.preprocessing import Imputer +#from sklearn.preprocessing import Imputer +from sklearn.impute import SimpleImputer from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler from default_utils import DEFAULT_SEED @@ -125,13 +126,162 @@ def impute_and_scale_array(mat, scaling=None): it returns the imputed numpy array. 
""" - imputer = Imputer(strategy='mean', axis=0, copy=False) +# imputer = Imputer(strategy='mean', axis=0, copy=False) + imputer = SimpleImputer(strategy='mean', copy=False) imputer.fit_transform(mat) - #mat = imputer.fit_transform(mat) return scale_array(mat, scaling) +def drop_impute_and_scale_dataframe(df, scaling='std', imputing='mean', dropna='all'): + """Impute missing values with mean and scale data included in pandas dataframe. + + Parameters + ---------- + df : pandas dataframe + dataframe to process + scaling : string + String describing type of scaling to apply. + 'maxabs' [-1,1], 'minmax' [0,1], 'std', or None, optional + (Default 'std') + imputing : string + String describing type of imputation to apply. + 'mean' replace missing values with mean value along the column, + 'median' replace missing values with median value along the column, + 'most_frequent' replace missing values with most frequent value along column + (Default: 'mean'). + dropna : string + String describing strategy for handling missing values. + 'all' if all values are NA, drop that column. + 'any' if any NA values are present, dropt that column. + (Default: 'all'). + + Return + ---------- + Returns the data frame after handling missing values and scaling. + + """ + + if dropna: + df = df.dropna(axis=1, how=dropna) + else: + empty_cols = df.columns[df.notnull().sum() == 0] + df[empty_cols] = 0 + + if imputing is None or imputing.lower() == 'none': + mat = df.values + else: +# imputer = Imputer(strategy=imputing, axis=0) + imputer = SimpleImputer(strategy=imputing) + mat = imputer.fit_transform(df.values) + + if scaling is None or scaling.lower() == 'none': + return pd.DataFrame(mat, columns=df.columns) + + if scaling == 'maxabs': + scaler = MaxAbsScaler() + elif scaling == 'minmax': + scaler = MinMaxScaler() + else: + scaler = StandardScaler() + + mat = scaler.fit_transform(mat) + df = pd.DataFrame(mat, columns=df.columns) + + return df + + +def discretize_dataframe(df, col, bins=2, cutoffs=None): + """Discretize values of given column in pandas dataframe. + + Parameters + ---------- + df : pandas dataframe + dataframe to process. + col : int + Index of column to bin. + bins : int + Number of bins for distributing column values. + cutoffs : list + List of bin limits. + If None, the limits are computed as percentiles. + (Default: None). + + Return + ---------- + Returns the data frame with the values of the specified column binned, i.e. the values + are replaced by the associated bin number. + + """ + + y = df[col] + thresholds = cutoffs + if thresholds is None: + percentiles = [100 / bins * (i + 1) for i in range(bins - 1)] + thresholds = [np.percentile(y, x) for x in percentiles] + classes = np.digitize(y, thresholds) + df[col] = classes + + return df + + +def discretize_array(y, bins=5): + """Discretize values of given array. + + Parameters + ---------- + y : numpy array + array to discretize. + bins : int + Number of bins for distributing column values. + + Return + ---------- + Returns an array with the bin number associated to the values in the + original array. + + """ + + percentiles = [100 / bins * (i + 1) for i in range(bins - 1)] + thresholds = [np.percentile(y, x) for x in percentiles] + classes = np.digitize(y, thresholds) + return classes + + + +def lookup(df, query, ret, keys, match='match'): + """Dataframe lookup. + + Parameters + ---------- + df : pandas dataframe + dataframe for retrieving values. + query : string + String for searching. 
+ ret : int/string or list + Names or indices of columns to be returned. + keys : list + List of strings or integers specifying the names or + indices of columns to look into. + match : string + String describing strategy for matching keys to query. + + Return + ---------- + Returns a list of the values in the dataframe whose columns match + the specified query and have been selected to be returned. + + """ + + mask = pd.Series(False, index=range(df.shape[0])) + for key in keys: + if match == 'contains': + mask |= df[key].str.contains(query.upper(), case=False) + else: + mask |= (df[key].str.upper() == query.upper()) + + return list(set(df[mask][ret].values.flatten().tolist())) + def load_X_data(train_file, test_file, drop_cols=None, n_cols=None, shuffle=False, scaling=None, diff --git a/common/keras_utils.py b/common/keras_utils.py index 2d35b3ac..06119051 100644 --- a/common/keras_utils.py +++ b/common/keras_utils.py @@ -6,9 +6,10 @@ from keras import initializers from keras.layers import Dropout -from keras.callbacks import Callback +from keras.callbacks import Callback, ModelCheckpoint from keras.utils import get_custom_objects -from keras.metrics import binary_crossentropy, mean_squared_error +from keras.metrics import binary_crossentropy, mean_squared_error, mean_absolute_error +from keras.models import Model from scipy.stats.stats import pearsonr @@ -196,6 +197,16 @@ def xent(y_true, y_pred): return binary_crossentropy(y_true, y_pred) +def r2(y_true, y_pred): + SS_res = K.sum(K.square(y_true - y_pred)) + SS_tot = K.sum(K.square(y_true - K.mean(y_true))) + return (1 - SS_res/(SS_tot + K.epsilon())) + + +def mae(y_true, y_pred): + return mean_absolute_error(y_true, y_pred) + + def mse(y_true, y_pred): return mean_squared_error(y_true, y_pred) @@ -243,3 +254,13 @@ def __init__(self, print_fcn=print): def on_epoch_end(self, epoch, logs={}): msg = "[Epoch: %i] %s" % (epoch, ", ".join("%s: %f" % (k, v) for k, v in sorted(logs.items()))) self.print_fcn(msg) + + +class MultiGPUCheckpoint(ModelCheckpoint): + + def set_model(self, model): + if isinstance(model.layers[-2], Model): + self.model = model.layers[-2] + else: + self.model = model + diff --git a/common/uq_utils.py b/common/uq_utils.py index 650da687..d0ab46c3 100644 --- a/common/uq_utils.py +++ b/common/uq_utils.py @@ -1,7 +1,9 @@ from __future__ import absolute_import import numpy as np - +from scipy.stats import pearsonr, spearmanr +from scipy import signal +from scipy.interpolate import InterpolatedUnivariateSpline def generate_index_distribution(numTrain, numTest, numValidation, params): """ Generates a vector of indices to partition the data for training. @@ -331,6 +333,751 @@ def fill_array(blocklist, maxsize, numdata, numblocks, blocksize): return indexArray[:offset] +###### UTILS for COMPUTATION OF EMPIRICAL CALIBRATION + +def compute_statistics_homoscedastic(df_data, + col_true=0, + col_pred=6, + col_std_pred=7, + ): + """ Extracts ground truth, mean predition, error and + standard deviation of prediction from inference + data frame. The latter includes the statistics + over all the inference realizations. + + Parameters + ---------- + df_data : pandas data frame + Data frame generated by current CANDLE inference + experiments. Indices are hard coded to agree with + current CANDLE version. (The inference file usually + has the name: _pred.tsv). + col_true : integer + Index of the column in the data frame where the true + value is stored (Default: 0, index in current CANDLE format). 
+ col_pred : integer + Index of the column in the data frame where the predicted + value is stored (Default: 6, index in current CANDLE format). + col_std_pred : integer + Index of the column in the data frame where the standard + deviation of the predicted values is stored (Default: 7, + index in current CANDLE format). + + Return + ---------- + Ytrue : numpy array + Array with true (observed) values + Ypred : numpy array + Array with predicted values. + yerror : numpy array + Array with errors computed (observed - predicted). + sigma : numpy array + Array with standard deviations learned with deep learning + model. For homoscedastic inference this corresponds to the + std value computed from prediction (and is equal to the + following returned variable). + Ypred_std : numpy array + Array with standard deviations computed from regular + (homoscedastic) inference. + pred_name : string + Name of data colum or quantity predicted (as extracted + from the data frame using the col_true index). + """ + + Ytrue = df_data.iloc[:,col_true].values + print('Ytrue shape: ', Ytrue.shape) + pred_name = df_data.columns[col_true] + Ypred = df_data.iloc[:,col_pred].values + print('Ypred shape: ', Ypred.shape) + Ypred_std = df_data.iloc[:,col_std_pred].values + print('Ypred_std shape: ', Ypred_std.shape) + yerror = Ytrue - Ypred + print('yerror shape: ', yerror.shape) + sigma = Ypred_std # std + MSE = np.mean((Ytrue - Ypred)**2) + print('MSE: ', MSE) + MSE_STD = np.std((Ytrue - Ypred)**2) + print('MSE_STD: ', MSE_STD) + # p-value 'not entirely reliable, reasonable for datasets > 500' + spearman_cc, pval = spearmanr(Ytrue, Ypred) + print('Spearman CC: %f, p-value: %e' % (spearman_cc, pval)) + + return Ytrue, Ypred, yerror, sigma, Ypred_std, pred_name + + +def compute_statistics_homoscedastic_all(df_data, + col_true=4, + col_pred_start=6 + ): + """ Extracts ground truth, mean predition, error and + standard deviation of prediction from inference + data frame. The latter includes all the individual + inference realizations. + + Parameters + ---------- + df_data : pandas data frame + Data frame generated by current CANDLE inference + experiments. Indices are hard coded to agree with + current CANDLE version. (The inference file usually + has the name: .predicted_INFER.tsv). + col_true : integer + Index of the column in the data frame where the true + value is stored (Default: 4, index in current HOM format). + col_pred_start : integer + Index of the column in the data frame where the first predicted + value is stored. All the predicted values during inference + are stored (Default: 6 index, in current HOM format). + + Return + ---------- + Ytrue : numpy array + Array with true (observed) values + Ypred : numpy array + Array with predicted values. + yerror : numpy array + Array with errors computed (observed - predicted). + sigma : numpy array + Array with standard deviations learned with deep learning + model. For homoscedastic inference this corresponds to the + std value computed from prediction (and is equal to the + following returned variable). + Ypred_std : numpy array + Array with standard deviations computed from regular + (homoscedastic) inference. + pred_name : string + Name of data colum or quantity predicted (as extracted + from the data frame using the col_true index). 
+ """ + + Ytrue = df_data.iloc[:,col_true].values + print('Ytrue shape: ', Ytrue.shape) + pred_name = df_data.columns[col_true] + Ypred_mean_ = np.mean(df_data.iloc[:,col_pred_start:], axis=1) + Ypred_mean = Ypred_mean_.values + print('Ypred_mean shape: ', Ypred_mean.shape) + Ypred_std_ = np.std(df_data.iloc[:,col_pred_start:], axis=1) + Ypred_std = Ypred_std_.values + print('Ypred_std shape: ', Ypred_std.shape) + yerror = Ytrue - Ypred_mean + print('yerror shape: ', yerror.shape) + sigma = Ypred_std # std + MSE = np.mean((Ytrue - Ypred_mean)**2) + print('MSE: ', MSE) + MSE_STD = np.std((Ytrue - Ypred_mean)**2) + print('MSE_STD: ', MSE_STD) + # p-value 'not entirely reliable, reasonable for datasets > 500' + spearman_cc, pval = spearmanr(Ytrue, Ypred_mean) + print('Spearman CC: %f, p-value: %e' % (spearman_cc, pval)) + + return Ytrue, Ypred_mean, yerror, sigma, Ypred_std, pred_name + + +def compute_statistics_heteroscedastic(df_data, + col_true=4, + col_pred_start=6, + col_std_pred_start=7, + ): + """ Extracts ground truth, mean predition, error, standard + deviation of prediction and predicted (learned) standard + deviation from inference data frame. The latter includes + all the individual inference realizations. + + Parameters + ---------- + df_data : pandas data frame + Data frame generated by current heteroscedastic inference + experiments. Indices are hard coded to agree with + current version. (The inference file usually + has the name: .predicted_INFER_HET.tsv). + col_true : integer + Index of the column in the data frame where the true + value is stored (Default: 4, index in current HET format). + col_pred_start : integer + Index of the column in the data frame where the first predicted + value is stored. All the predicted values during inference + are stored and are interspaced with standard deviation + predictions (Default: 6 index, step 2, in current HET format). + col_std_pred_start : integer + Index of the column in the data frame where the first predicted + standard deviation value is stored. All the predicted values + during inference are stored and are interspaced with predictions + (Default: 7 index, step 2, in current HET format). + + Return + ---------- + Ytrue : numpy array + Array with true (observed) values + Ypred : numpy array + Array with predicted values. + yerror : numpy array + Array with errors computed (observed - predicted). + sigma : numpy array + Array with standard deviations learned with deep learning + model. For homoscedastic inference this corresponds to the + std value computed from prediction (and is equal to the + following returned variable). + Ypred_std : numpy array + Array with standard deviations computed from regular + (homoscedastic) inference. + pred_name : string + Name of data colum or quantity predicted (as extracted + from the data frame using the col_true index). 
+ """ + + Ytrue = df_data.iloc[:,col_true].values + print('Ytrue shape: ', Ytrue.shape) + pred_name = df_data.columns[col_true] + Ypred_mean_ = np.mean(df_data.iloc[:,col_pred_start::2], axis=1) + Ypred_mean = Ypred_mean_.values + print('Ypred shape: ', Ypred_mean.shape) + Ypred_std_ = np.std(df_data.iloc[:,col_pred_start::2], axis=1) + Ypred_std = Ypred_std_.values + print('Ypred_std shape: ', Ypred_std.shape) + yerror = Ytrue - Ypred_mean + print('yerror shape: ', yerror.shape) + s_ = df_data.iloc[:,col_std_pred_start::2] + s_mean = np.mean(s_, axis=1) + var = np.exp(s_mean.values) # variance + sigma = np.sqrt(var) # std + print('sigma shape: ', sigma.shape) + MSE = np.mean((Ytrue - Ypred_mean)**2) + print('MSE: ', MSE) + MSE_STD = np.std((Ytrue - Ypred_mean)**2) + print('MSE_STD: ', MSE_STD) + # p-value 'not entirely reliable, reasonable for datasets > 500' + spearman_cc, pval = spearmanr(Ytrue, Ypred_mean) + print('Spearman CC: %f, p-value: %e' % (spearman_cc, pval)) + + return Ytrue, Ypred_mean, yerror, sigma, Ypred_std, pred_name + + +def compute_statistics_quantile(df_data, + sigma_divisor=2.56, + col_true=4, + col_pred_start=6 + ): + """ Extracts ground truth, 50th percentile mean predition, + low percentile and high percentile mean prediction + (usually 10th percentile and 90th percentile respectively), + error (using 50th percentile), standard deviation of + prediction (using 50th percentile) and predicted (learned) + standard deviation from interdecile range in inference data frame. + The latter includes all the individual inference realizations. + + Parameters + ---------- + df_data : pandas data frame + Data frame generated by current quantile inference + experiments. Indices are hard coded to agree with + current version. (The inference file usually + has the name: .predicted_INFER_QTL.tsv). + sigma_divisor : float + Divisor to convert from the intercedile range to the corresponding + standard deviation for a Gaussian distribution. + (Default: 2.56, consisten with an interdecile range computed from + the difference between the 90th and 10th percentiles). + col_true : integer + Index of the column in the data frame where the true + value is stored (Default: 4, index in current QTL format). + col_pred_start : integer + Index of the column in the data frame where the first predicted + value is stored. All the predicted values during inference + are stored and are interspaced with other percentile + predictions (Default: 6 index, step 3, in current QTL format). + + Return + ---------- + Ytrue : numpy array + Array with true (observed) values + Ypred : numpy array + Array with predicted values (based on the 50th percentile). + yerror : numpy array + Array with errors computed (observed - predicted). + sigma : numpy array + Array with standard deviations learned with deep learning + model. This corresponds to the interdecile range divided + by the sigma divisor. + Ypred_std : numpy array + Array with standard deviations computed from regular + (homoscedastic) inference. + pred_name : string + Name of data colum or quantity predicted (as extracted + from the data frame using the col_true index). + Ypred_Lp_mean : numpy array + Array with predicted values of the lower percentile + (usually the 10th percentile). + Ypred_Hp_mean : numpy array + Array with predicted values of the higher percentile + (usually the 90th percentile). 
+ """ + + Ytrue = df_data.iloc[:,col_true].values + print('Ytrue shape: ', Ytrue.shape) + pred_name = df_data.columns[col_true] + Ypred_50q_mean = np.mean(df_data.iloc[:,col_pred_start::3], axis=1) + Ypred_mean = Ypred_50q_mean.values + print('Ypred shape: ', Ypred_mean.shape) + Ypred_Lp_mean_ = np.mean(df_data.iloc[:,col_pred_start+1::3], axis=1) + Ypred_Hp_mean_ = np.mean(df_data.iloc[:,col_pred_start+2::3], axis=1) + Ypred_Lp_mean = Ypred_Lp_mean_.values + Ypred_Hp_mean = Ypred_Hp_mean_.values + interdecile_range = Ypred_Hp_mean - Ypred_Lp_mean + sigma = interdecile_range / sigma_divisor + print('sigma shape: ', sigma.shape) + yerror = Ytrue - Ypred_mean + print('yerror shape: ', yerror.shape) + Ypred_std_ = np.std(df_data.iloc[:,col_pred_start::3], axis=1) + Ypred_std = Ypred_std_.values + print('Ypred_std shape: ', Ypred_std.shape) + MSE = np.mean((Ytrue - Ypred_mean)**2) + print('MSE: ', MSE) + MSE_STD = np.std((Ytrue - Ypred_mean)**2) + print('MSE_STD: ', MSE_STD) + # p-value 'not entirely reliable, reasonable for datasets > 500' + spearman_cc, pval = spearmanr(Ytrue, Ypred_mean) + print('Spearman CC: %f, p-value: %e' % (spearman_cc, pval)) + + return Ytrue, Ypred_mean, yerror, sigma, Ypred_std, pred_name, Ypred_Lp_mean, Ypred_Hp_mean + + +def split_data_for_empirical_calibration(Ytrue, Ypred, sigma, cal_split=0.8): + """ Extracts a portion of the arrays provided for the computation + of the calibration and reserves the remainder portion + for testing. + + Parameters + ---------- + Ytrue : numpy array + Array with true (observed) values + Ypred : numpy array + Array with predicted values. + sigma : numpy array + Array with standard deviations learned with deep learning + model (or std value computed from prediction if homoscedastic + inference). + cal_split : float + Split of data to use for estimating the calibration relationship. + It is assumet that it will be a value in (0, 1). + (Default: use 80% of predictions to generate empirical + calibration). + + Return + ---------- + index_perm_total : numpy array + Random permutation of the array indices. The first 'num_cal' + of the indices correspond to the samples that are used for + calibration, while the remainder are the samples reserved + for calibration testing. + pSigma_cal : numpy array + Part of the input sigma array to use for calibration. + pSigma_test : numpy array + Part of the input sigma array to reserve for testing. + pPred_cal : numpy array + Part of the input Ypred array to use for calibration. + pPred_test : numpy array + Part of the input Ypred array to reserve for testing. + true_cal : numpy array + Part of the input Ytrue array to use for calibration. + true_test : numpy array + Part of the input Ytrue array to reserve for testing. 
+ """ + + # shuffle data for calibration + num_pred_total = sigma.shape[0] + num_cal = np.int(num_pred_total * cal_split) + index_perm_total = np.random.permutation(range(num_pred_total)) + + # Permute data + pSigma_perm_all = sigma[index_perm_total] + pPred_perm_all = Ypred[index_perm_total] + true_perm_all = Ytrue[index_perm_total] + + # Split in calibration and testing + pSigma_cal = pSigma_perm_all[:num_cal] + pSigma_test = pSigma_perm_all[num_cal:] + pPred_cal = pPred_perm_all[:num_cal] + pPred_test = pPred_perm_all[num_cal:] + true_cal = true_perm_all[:num_cal] + true_test = true_perm_all[num_cal:] + + print('Size of calibration set: ', true_cal.shape) + print('Size of test set: ', true_test.shape) + + return index_perm_total, pSigma_cal, pSigma_test, pPred_cal, pPred_test, true_cal, true_test + + +def compute_empirical_calibration(pSigma_cal, pPred_cal, true_cal, bins, coverage_percentile): + """ Use the arrays provided to estimate an empirical mapping + between standard deviation and absolute value of error, + both of which have been observed during inference. Since + most of the times the raw statistics per bin are very noisy, + a smoothing step (based on scipy's savgol filter) is performed. + + Parameters + ---------- + pSigma_cal : numpy array + Part of the standard deviations array to use for calibration. + pPred_cal : numpy array + Part of the predictions array to use for calibration. + true_cal : numpy array + Part of the true (observed) values array to use for calibration. + bins : int + Number of bins to split the range of standard deviations + included in pSigma_cal array. + coverage_percentile : float + Value to use for estimating coverage when evaluating the percentiles + of the observed absolute value of errors. + + Return + ---------- + mean_sigma : numpy array + Array with the mean standard deviations computed per bin. + min_sigma : numpy array + Array with the minimum standard deviations computed per bin. + max_sigma : numpy array + Array with the maximum standard deviations computed per bin. + error_thresholds : numpy array + Thresholds of the errors computed to attain a certain + error coverage per bin. + err_err : numpy array + Error bars in errors (one standard deviation for a binomial + distribution estimated by bin vs. the other bins) for the + calibration error. + error_thresholds_smooth : numpy array + Thresholds of the errors computed to attain a certain + error coverage per bin after a smoothed operation is applied + to the frequently noisy bin-based estimations. + sigma_start_index : non-negative integer + Index in the mean_sigma array that defines the start of + the valid empirical calibration interval (i.e. index to + the smallest std for which a meaningful error mapping + is obtained). + sigma_end_index : non-negative integer + Index in the mean_sigma array that defines the end of + the valid empirical calibration interval (i.e. index to + the largest std for which a meaningful error mappping + is obtained). + s_interpolate : scipy.interpolate python object + A python object from scipy.interpolate that computes a + univariate spline (InterpolatedUnivariateSpline) constructed + to express the mapping from standard deviation to error. This + spline is generated during the computational empirical + calibration procedure. 
+ """ + + index_sigma_cal = np.argsort(pSigma_cal) + pSigma_cal_ordered_ = pSigma_cal[index_sigma_cal] + Er_vect_cal_ = np.abs(true_cal - pPred_cal) + Er_vect_cal_orderedSigma_ = Er_vect_cal_[index_sigma_cal] + + minL_sigma = np.min(pSigma_cal_ordered_) + maxL_sigma = np.max(pSigma_cal_ordered_) + print('Complete Sigma range --> Min: %f, Max: %f' % (minL_sigma, maxL_sigma)) + + # Bin statistics for error and sigma + mean_sigma, min_sigma, max_sigma, error_thresholds, err_err = bining_for_calibration(pSigma_cal_ordered_, + minL_sigma, + maxL_sigma, + Er_vect_cal_orderedSigma_, + bins, + coverage_percentile) + + # smooth error function + #scipy.signal.savgol_filter(x, window_length, polyorder, + #deriv=0, delta=1.0, axis=-1, mode='interp', cval=0.0) + #error_thresholds_smooth = signal.savgol_filter(error_thresholds, 5, 1) + error_thresholds_smooth = signal.savgol_filter(error_thresholds, 5, 1, mode='nearest') + + # Build Interpolant over smooth plot (this will become the calibration function) + s_interpolate = InterpolatedUnivariateSpline(mean_sigma, error_thresholds_smooth) + # Determine limits of calibration (i.e. monotonicity range) + sigma_start_index, sigma_end_index = computation_of_valid_calibration_interval(error_thresholds, error_thresholds_smooth, err_err) + + print('Range of valid sigma: %.6f --> %.6f' % (mean_sigma[sigma_start_index], mean_sigma[sigma_end_index])) + + return mean_sigma, min_sigma, max_sigma, error_thresholds, err_err, error_thresholds_smooth, sigma_start_index, sigma_end_index, s_interpolate + + + +def bining_for_calibration(pSigma_cal_ordered_, minL_sigma, + maxL_sigma, Er_vect_cal_orderedSigma_, + bins, coverage_percentile): + """ Bin the values of the standard deviations observed during + inference and estimate a specified coverage percentile + in the absolute error (observed during inference as well). + Bins that have less than 50 samples are merged until they + surpass this threshold. + + Parameters + ---------- + pSigma_cal_ordered_ : numpy array + Array of standard deviations ordered in ascending way. + minL_sigma : float + Minimum value of standard deviations included in + pSigma_cal_ordered_ array. + maxL_sigma : numpy array + Maximum value of standard deviations included in + pSigma_cal_ordered_ array. + Er_vect_cal_orderedSigma_ : numpy array + Array ob absolute value of errors corresponding with + the array of ordered standard deviations. + bins : int + Number of bins to split the range of standard deviations + included in pSigma_cal_ordered_ array. + coverage_percentile : float + Value to use for estimating coverage when evaluating the percentiles + of the observed absolute value of errors. + + Return + ---------- + mean_sigma : numpy array + Array with the mean standard deviations computed per bin. + min_sigma : numpy array + Array with the minimum standard deviations computed per bin. + max_sigma : numpy array + Array with the maximum standard deviations computed per bin. + error_thresholds : numpy array + Thresholds of the errors computed to attain a certain + error coverage per bin. + err_err : numpy array + Error bars in errors (one standard deviation for a binomial + distribution estimated by bin vs. the other bins) for the + calibration error. 
+ """ + + #thresholds = np.logspace(np.log10(minL_sigma), np.log10(maxL_sigma), num=bins) + thresholds = np.linspace(minL_sigma, maxL_sigma, num=bins) + classes = np.digitize(pSigma_cal_ordered_, thresholds) + Nbin = np.zeros(bins+1) + for i in range(bins+1): + indices = (classes == i) + Nbin[i] = indices.sum() + + # Repair bins + new_thresholds_l = [] + new_nbins_l = [] + sumN = 0 + for i in range(Nbin.shape[0]): + sumN += Nbin[i] + if sumN > 50: + if i > (thresholds.shape[0] - 1): + new_thresholds_l.append(thresholds[-1]) + else: + new_thresholds_l.append(thresholds[i]) + new_nbins_l.append(sumN) + sumN = 0 + new_thresholds = np.array(new_thresholds_l) + new_nbins = np.array(new_nbins_l) + new_thresholds[-1] = thresholds[-1] + new_nbins[-1] += sumN + + # + classes = np.digitize(pSigma_cal_ordered_, new_thresholds[:-1]) + error_thresholds = -1. * np.ones(new_nbins.shape[0]) + mean_sigma = -1. * np.ones(new_nbins.shape[0]) + min_sigma = -1. * np.ones(new_nbins.shape[0]) + max_sigma = -1. * np.ones(new_nbins.shape[0]) + err_err = -1. * np.ones(new_nbins.shape[0]) + Ncal = pSigma_cal_ordered_.shape[0] + for i in range(error_thresholds.shape[0]): + indices = (classes == i) + n_aux = indices.sum() + assert n_aux == new_nbins[i] + print('Points in bin %d: %d' % (i, n_aux)) + mean_sigma[i] = np.mean(pSigma_cal_ordered_[indices]) + min_sigma[i] = np.min(pSigma_cal_ordered_[indices]) + max_sigma[i] = np.max(pSigma_cal_ordered_[indices]) + error_thresholds[i] = np.percentile(Er_vect_cal_orderedSigma_[indices], coverage_percentile) + err_err[i] = np.sqrt(new_nbins[i] * (Ncal - new_nbins[i])) / Ncal * error_thresholds[i] + + return mean_sigma, min_sigma, max_sigma, error_thresholds, err_err + + +def computation_of_valid_calibration_interval(error_thresholds, error_thresholds_smooth, err_err): + """ Function that estimates the empirical range in which a + monotonic relation is observed between standard deviation + and coverage of absolute value of error. Since the + statistics computed per bin are relatively noisy, the + application of a greedy criterion (e.g. guarantee a + monotonically increasing relationship) does not yield + good results. Therefore, a softer version is constructed + based on the satisfaction of certain criteria depending + on: the values of the error coverage computed per bin, + a smoothed version of them and the assocatiate error + estimated (based on one standard deviation for a binomial + distribution estimated by bin vs. the other bins). + A minimal validation requiring the end idex to be + largest than the starting index is performed before + the function return. + + Current criteria: + - the smoothed errors are inside the error bars AND + they are almost increasing (a small tolerance is + allowed, so a small wobbliness in the smoother + values is permitted). + OR + - both the raw values for the bins (with a small tolerance) + are increasing, AND the smoothed value is greater than the + raw value. + OR + - the current smoothed value is greater than the previous AND + the smoothed values for the next been are inside the error + bars. + + Parameters + ---------- + error_thresholds : numpy array + Thresholds of the errors computed to attain a certain + error coverage per bin. + error_thresholds_smooth : numpy array + Thresholds of the errors computed to attain a certain + error coverage per bin after a smoothed operation is applied + to the frequently noisy bin-based estimations. 
+ err_err : numpy array + Error bars in errors (one standard deviation for a binomial + distribution estimated by bin vs. the other bins) for the + calibration error. + + Return + ---------- + sigma_start_index : non-negative integer + Index estimated in the mean_sigma array corresponing to + the value that defines the start of the valid empirical + calibration interval (i.e. index to the smallest std for + which a meaningful error mapping is obtained, according + to the criteria explained before). + sigma_end_index : non-negative integer + Index estimated in the mean_sigma array corresponing to + the value that defines the end of the valid empirical + calibration interval (i.e. index to the largest std for + which a meaningful error mapping is obtained, according + to the criteria explained before). + """ + + # Computation of the calibration interval + limitH = error_thresholds + err_err + limitL = error_thresholds - err_err + + # search for starting point + for i in range(err_err.shape[0]): + if ((error_thresholds_smooth[i] >= limitL[i]) and + (error_thresholds_smooth[i] <= limitH[i])): # Ask if the current is in the interval + sigma_start_index = i + break + sigma_end_index = sigma_start_index - 1 + + restart = max(1, sigma_start_index) + for i in range(restart, err_err.shape[0]-1): + if (((error_thresholds_smooth[i] >= limitL[i]) and + (error_thresholds_smooth[i] <= limitH[i]) and + ((error_thresholds_smooth[i] * 1.005 > error_thresholds_smooth[i-1]) or + ((error_thresholds[i] * 1.01 > error_thresholds[i-1]) and + (error_thresholds_smooth[i] > error_thresholds[i])))) # Ask if the current is in the interval with slightly increasing trend + or # Ask if the current is greater than the previous and the next is in the interval + ((error_thresholds_smooth[i] > error_thresholds_smooth[i-1]) and + ((error_thresholds_smooth[i+1] >= limitL[i+1]) and + (error_thresholds_smooth[i+1] <= limitH[i+1])))): + + sigma_end_index = i + else: # Finalize search for monotonic range + if (sigma_end_index - sigma_start_index) > 4: + break + else: # Reset indices + sigma_start_index = i + 1 + sigma_end_index = i + + print('Range of valid sigma indices (inclusive): %d --> %d' % (sigma_start_index, sigma_end_index)) + + assert (sigma_end_index > sigma_start_index) + + return sigma_start_index, sigma_end_index + + +def applying_calibration(pSigma_test, pPred_test, true_test, s_interpolate, minL_sigma_auto, maxL_sigma_auto): + """ Use the empirical mapping between standard deviation and + absolute value of error estimated during calibration (i.e. + apply the univariate spline computed) to estimate the error + for the part of the standard deviation array that was reserved + for testing the empirical calibration. The resulting error array + (yp_test) should overestimate the true observed error (eabs_red). + All the computations are restricted to the valid calibration + interval: [minL_sigma_auto, maxL_sigma_auto]. + + Parameters + ---------- + pSigma_test : numpy array + Part of the standard deviations array to use for calibration testing. + pPred_test : numpy array + Part of the predictions array to use for calibration testing. + true_test : numpy array + Part of the true (observed) values array to use for calibration testing. + s_interpolate : scipy.interpolate python object + A python object from scipy.interpolate that computes a + univariate spline (InterpolatedUnivariateSpline) expressing + the mapping from standard deviation to error. 
This + spline is generated during the computational empirical + calibration procedure. + minL_sigma_auto : float + Starting value of the valid empirical calibration interval + (i.e. smallest std for which a meaningful error mapping + is obtained). + maxL_sigma_auto : float + Ending value of the valid empirical calibration interval + (i.e. largest std for which a meaningful error mappping + is obtained). + + Return + ---------- + index_sigma_range_test : numpy array + Indices of the pSigma_test array that are included in the + valid calibration interval, given by: + [minL_sigma_auto, maxL_sigma_auto]. + xp_test : numpy array + Array with the mean standard deviations in the calibration + testing array. + yp_test : numpy array + Mapping of the given standard deviation to error computed + from the interpolation spline constructed by empirical + calibration. + eabs_red : numpy array + Array with the observed abolute errors in the part of the testing + array for which the observed standard deviations are in the + valid interval of calibration. + """ + + # Filter to appropriate range + index_sigma_range_test = (pSigma_test >= minL_sigma_auto) & (pSigma_test < maxL_sigma_auto) + xp_test = pSigma_test[index_sigma_range_test] + yp_test = s_interpolate(xp_test) + Er_vect_ = true_test - pPred_test + eabs_ = np.abs(Er_vect_) + eabs_red = eabs_[index_sigma_range_test] + + return index_sigma_range_test, xp_test, yp_test, eabs_red + + +def overprediction_check(yp_test, eabs_red): + """ Compute the percentage of overestimated absoulte error + predictions for the arrays reserved for calibration testing + and whose corresponding standard deviations are included + in the valid calibration interval. + + Parameters + ---------- + yp_test : numpy array + Mapping of the standard deviation to error computed + from the interpolation spline constructed by empirical + calibration. + eabs_red : numpy array + Array with the observed abolute errors in the part of the testing + array for which the observed standard deviations are in the + valid interval of calibration. + """ + + over_pred_error_index = (yp_test >= eabs_red) + percentage_over_predicted = (over_pred_error_index.sum() / yp_test.shape[0]) + print("percentage over predicted: ", percentage_over_predicted) diff --git a/common/viz_utils.py b/common/viz_utils.py index eb570e37..2ca87eae 100644 --- a/common/viz_utils.py +++ b/common/viz_utils.py @@ -2,6 +2,8 @@ mpl.use('Agg') import matplotlib.pyplot as plt +import numpy as np + def plot_history(out, history, metric='loss', title=None, width=8, height=6): title = title or 'model {}'.format(metric) val_metric = 'val_{}'.format(metric) @@ -60,3 +62,300 @@ def plot_error(y_true, y_pred, batch, file_ext, file_pre='output_dir', subsample plt.savefig(file_pre+'.diff'+file_ext+'.b'+str(batch)+'.png') plt.close() +###### UTILS for UQ / CALIBRATION VISUALIZATION + +from matplotlib.colors import LogNorm + +def plot_density_observed_vs_predicted(Ytest, Ypred, pred_name=None, figprefix=None): + """Functionality to plot a 2D histogram of the distribution of observed (ground truth) + values vs. predicted values. The plot generated is stored in a png file. + + Parameters + ---------- + Ytest : numpy array + Array with (true) observed values + Ypred : numpy array + Array with predicted values. + pred_name : string + Name of data colum or quantity predicted (e.g. growth, AUC, etc.) + figprefix : string + String to prefix the filename to store the figure generated. 
+ A '_density_predictions.png' string will be appended to the + figprefix given. + """ + + xbins = 51 + + fig = plt.figure(figsize=(24,18)) # (30,16) + ax = plt.gca() + plt.rc('xtick', labelsize=16) # fontsize of the tick labels + ax.plot([Ytest.min(), Ytest.max()], [Ytest.min(), Ytest.max()], 'r--', lw=4.) + plt.hist2d(Ytest, Ypred, bins=xbins, norm=LogNorm()) + cb = plt.colorbar() + ax.set_xlabel('Observed ' + pred_name, fontsize=38, labelpad=15.) + ax.set_ylabel('Mean ' + pred_name + ' Predicted', fontsize=38, labelpad=15.) + ax.axis([Ytest.min()*0.98, Ytest.max()*1.02, Ytest.min()*0.98, Ytest.max()*1.02]) + plt.setp(ax.get_xticklabels(), fontsize=32) + plt.setp(ax.get_yticklabels(), fontsize=32) + cb.ax.set_yticklabels(cb.ax.get_yticklabels(), fontsize=28) + plt.grid(True) + plt.savefig(figprefix + '_density_predictions.png') + plt.close() + print('Generated plot: ', figprefix + '_density_predictions.png') + + +def plot_2d_density_sigma_vs_error(sigma, yerror, method=None, figprefix=None): + """Functionality to plot a 2D histogram of the distribution of + the standard deviations computed for the predictions vs. the + computed errors (i.e. values of observed - predicted). + The plot generated is stored in a png file. + + Parameters + ---------- + sigma : numpy array + Array with standard deviations computed. + yerror : numpy array + Array with errors computed (observed - predicted). + method : string + Method used to comput the standard deviations (i.e. dropout, + heteroscedastic, etc.). + figprefix : string + String to prefix the filename to store the figure generated. + A '_density_sigma_error.png' string will be appended to the + figprefix given. + """ + + xbins = 51 + ybins = 31 + + fig = plt.figure(figsize=(24,12)) # (30,16) + ax = plt.gca() + plt.rc('xtick', labelsize=16) # fontsize of the tick labels + plt.hist2d(sigma, yerror, bins=[xbins,ybins], norm=LogNorm()) + cb = plt.colorbar() + ax.set_xlabel('Sigma (' + method + ')', fontsize=38, labelpad=15.) + ax.set_ylabel('Observed - Mean Predicted', fontsize=38, labelpad=15.) + ax.axis([sigma.min()*0.98, sigma.max()*1.02, -yerror.max(), yerror.max()]) + plt.setp(ax.get_xticklabels(), fontsize=28) + plt.setp(ax.get_yticklabels(), fontsize=28) + cb.ax.set_yticklabels(cb.ax.get_yticklabels(), fontsize=22) + plt.grid(True) + plt.savefig(figprefix + '_density_sigma_error.png') + plt.close() + print('Generated plot: ', figprefix + '_density_sigma_error.png') + + +def plot_histogram_error_per_sigma(sigma, yerror, method=None, figprefix=None): + """Functionality to plot a 1D histogram of the distribution of + computed errors (i.e. values of observed - predicted) observed + for specific values of standard deviations computed. The range of + standard deviations computed is split in xbins values and the + 1D histograms of error distributions for the smallest six + standard deviations are plotted. + The plot generated is stored in a png file. + + Parameters + ---------- + sigma : numpy array + Array with standard deviations computed. + yerror : numpy array + Array with errors computed (observed - predicted). + method : string + Method used to comput the standard deviations (i.e. dropout, + heteroscedastic, etc.). + figprefix : string + String to prefix the filename to store the figure generated. + A '_histogram_error_per_sigma.png' string will be appended to + the figprefix given. 
+ """ + + xbins = 21 + ybins = 31 + + H, xedges, yedges, img = plt.hist2d(sigma, yerror,# normed=True, + bins=[xbins,ybins]) + + fig = plt.figure(figsize=(14,16)) + legend = [] + for ii in range(6):#(H.shape[0]): + if ii is not 1: + plt.plot(yedges[0:H.shape[1]], H[ii,:]/np.sum(H[ii,:]), marker='o', + markersize=12, lw=6.) + legend.append(str((xedges[ii] + xedges[ii+1])/2)) + plt.legend(legend, fontsize=16) + ax = plt.gca() + plt.title('Error Dist. per Sigma for ' + method, fontsize=40) + ax.set_xlabel('Observed - Mean Predicted', fontsize=38, labelpad=15.) + ax.set_ylabel('Density', fontsize=38, labelpad=15.) + plt.setp(ax.get_xticklabels(), fontsize=28) + plt.setp(ax.get_yticklabels(), fontsize=28) + plt.grid(True) + plt.savefig(figprefix + '_histogram_error_per_sigma.png') + plt.close() + print('Generated plot: ', figprefix + '_histogram_error_per_sigma.png') + + +def plot_calibration_and_errors(mean_sigma, sigma_start_index, sigma_end_index, + min_sigma, max_sigma, + error_thresholds, + error_thresholds_smooth, + err_err, + s_interpolate, + coverage_percentile, + method=None, figprefix=None, + steps=False): + """Functionality to plot empirical calibration curves + estimated by binning the statistics of computed + standard deviations and errors. + + Parameters + ---------- + mean_sigma : numpy array + Array with the mean standard deviations computed per bin. + sigma_start_index : non-negative integer + Index of the mean_sigma array that defines the start of + the valid empirical calibration interval (i.e. index to + the smallest std for which a meaningful error is obtained). + sigma_end_index : non-negative integer + Index of the mean_sigma array that defines the end of + the valid empirical calibration interval (i.e. index to + the largest std for which a meaningful error is obtained). + min_sigma : numpy array + Array with the minimum standard deviations computed per bin. + max_sigma : numpy array + Array with the maximum standard deviations computed per bin. + error_thresholds : numpy array + Thresholds of the errors computed to attain a certain + error coverage per bin. + error_thresholds_smooth : numpy array + Thresholds of the errors computed to attain a certain + error coverage per bin after a smoothed operation is applied + to the frequently noisy bin-based estimations. + err_err : numpy array + Vertical error bars (usually one standard deviation for a binomial + distribution estimated by bin) for the error calibration + computed empirically. + s_interpolate : scipy.interpolate python object + A python object from scipy.interpolate that computes a + univariate spline (InterpolatedUnivariateSpline) constructed + to express the mapping from standard deviation to error. This + spline is generated during the computational empirical + calibration procedure. + coverage_percentile : float + Value used for the coverage in the percentile estimation + of the observed error. + method : string + Method used to comput the standard deviations (i.e. dropout, + heteroscedastic, etc.). + figprefix : string + String to prefix the filename to store the figure generated. + A '_empirical_calibration.png' string will be appended to + the figprefix given. + steps : boolean + Besides the complete empirical calibration (including raw + statistics, error bars and smoothing), also generates partial + plots with only the raw bin statistics (step1) and with only + the raw bin statistics and the smoothing interpolation (step2). 
+ """ + + xp23 = np.linspace(mean_sigma[sigma_start_index], mean_sigma[sigma_end_index], 200) + yp23 = s_interpolate(xp23) + + p_cov = coverage_percentile + if steps: + # Plot raw bin statistics + fig = plt.figure(figsize=(18,12)) + ax = plt.gca() + ax.errorbar(mean_sigma, error_thresholds, + yerr=err_err, + xerr=[mean_sigma-min_sigma, max_sigma-mean_sigma], + fmt='o', ecolor='k', capthick=2, ms=8) + plt.xlabel('Sigma Predicted (' + method + ')', fontsize=24.) + plt.ylabel(str(p_cov) + '% Coverage for ABS Observed - Mean Predicted', fontsize=24.) + plt.title('Calibration', fontsize=28) + ax.axis([0, np.max(max_sigma)*1.1, np.min(error_thresholds)*0.9, np.max(yp23)*1.2]) + plt.grid() + plt.setp(ax.get_xticklabels(), fontsize=22) + plt.setp(ax.get_yticklabels(), fontsize=22) + plt.savefig(figprefix + '_empirical_calibration_step1.png') + plt.close() + print('Generated plot: ', figprefix + '_empirical_calibration_step1.png') + # Plot raw bin statistics and smoothing + fig = plt.figure(figsize=(18,12)) + ax = plt.gca() + ax.plot(mean_sigma, error_thresholds_smooth, 'g^', ms=12) + ax.errorbar(mean_sigma, error_thresholds, + yerr=err_err, + xerr=[mean_sigma-min_sigma, max_sigma-mean_sigma], + fmt='o', ecolor='k', capthick=2, ms=8) + plt.xlabel('Sigma Predicted (' + method + ')', fontsize=24.) + plt.ylabel(str(p_cov) + '% Coverage for ABS Observed - Mean Predicted', fontsize=24.) + plt.title('Calibration', fontsize=28) + ax.axis([0, np.max(max_sigma)*1.1, np.min(error_thresholds)*0.9, np.max(yp23)*1.2]) + plt.grid() + plt.setp(ax.get_xticklabels(), fontsize=22) + plt.setp(ax.get_yticklabels(), fontsize=22) + plt.savefig(figprefix + '_empirical_calibration_step2.png') + plt.close() + print('Generated plot: ', figprefix + '_empirical_calibration_step2.png') + + # Plot raw bin statistics, smoothing and empirical calibration + fig = plt.figure(figsize=(18,12)) + ax = plt.gca() + ax.plot(xp23, yp23, 'rx', ms=20) + ax.plot(mean_sigma, error_thresholds_smooth, 'g^', ms=12) + ax.errorbar(mean_sigma, error_thresholds, + yerr=err_err, + xerr=[mean_sigma-min_sigma, max_sigma-mean_sigma], + fmt='o', ecolor='k', capthick=2, ms=8) + plt.xlabel('Sigma Predicted (' + method + ')', fontsize=24.) + plt.ylabel(str(p_cov) + '% Coverage for ABS Observed - Mean Predicted', fontsize=24.) + plt.title('Calibration', fontsize=28) + ax.axis([0, np.max(max_sigma)*1.1, np.min(error_thresholds)*0.9, np.max(yp23)*1.2]) + plt.grid() + plt.setp(ax.get_xticklabels(), fontsize=22) + plt.setp(ax.get_yticklabels(), fontsize=22) + plt.savefig(figprefix + '_empirical_calibration.png') + plt.close() + print('Generated plot: ', figprefix + '_empirical_calibration.png') + + +def plot_percentile_predictions(Ypred, Ypred_Lp, Ypred_Hp, percentile_list, pred_name=None, figprefix=None): + """Functionality to plot the mean of the percentiles predicted. + The plot generated is stored in a png file. + + Parameters + ---------- + Ypred : numpy array + Array with mid percentile predicted values. + Ypred_Lp : numpy array + Array with low percentile predicted values. + Ypred_Hp : numpy array + Array with high percentile predicted values. + percentile_list : string list + List of percentiles predicted (e.g. '10p', '90p', etc.) + pred_name : string + Name of data colum or quantity predicted (e.g. growth, AUC, etc.) + figprefix : string + String to prefix the filename to store the figure generated. + A '_density_predictions.png' string will be appended to the + figprefix given. 
+ """ + + index_ = np.argsort(Ypred) + fig = plt.figure(figsize=(24,18)) + plt.scatter(range(index_.shape[0]), Ypred[index_]) + plt.scatter(range(index_.shape[0]), Ypred_Lp[index_]) + plt.scatter(range(index_.shape[0]), Ypred_Hp[index_]) + plt.legend(percentile_list, fontsize=20) + plt.xlabel('Index', fontsize=18.) + plt.ylabel(pred_name, fontsize=18.) + plt.title('Predicted ' + pred_name + ' Percentiles', fontsize=28) + plt.grid() + ax = plt.gca() + plt.setp(ax.get_xticklabels(), fontsize=16) + plt.setp(ax.get_yticklabels(), fontsize=16) + plt.savefig(figprefix + '_percentile_predictions.png') + plt.close() + print('Generated plot: ', figprefix + '_percentile_predictions.png') + From 972a1e101608708f5644bb95cfba5ea394c53679 Mon Sep 17 00:00:00 2001 From: Cristina Date: Mon, 10 Jun 2019 15:33:07 -0600 Subject: [PATCH 004/331] Added UQ functionality, both at common and Pilot1 levels. New folder: Pilot1/Uno_UQ --- Pilot1/Uno_UQ/calibration/calibration_HET.py | 115 +++ Pilot1/Uno_UQ/calibration/calibration_HOM.py | 98 +++ .../Uno_UQ/calibration/calibration_HOM_all.py | 98 +++ Pilot1/Uno_UQ/calibration/calibration_QTL.py | 117 +++ Pilot1/Uno_UQ/data_utils_/__init__.py | 1 + Pilot1/Uno_UQ/data_utils_/cellline_data.py | 97 +++ Pilot1/Uno_UQ/data_utils_/drug_data.py | 188 +++++ Pilot1/Uno_UQ/data_utils_/response_data.py | 175 ++++ Pilot1/Uno_UQ/data_utils_/uno.py | 353 +++++++++ .../uno_combined_data_generator.py | 257 ++++++ .../data_utils_/uno_combined_data_loader.py | 427 ++++++++++ Pilot1/Uno_UQ/model_utils_/__init__.py | 0 Pilot1/Uno_UQ/model_utils_/uno_model_utils.py | 307 +++++++ Pilot1/Uno_UQ/uno_gCSI_modelUQ.txt | 39 + Pilot1/Uno_UQ/uno_holdoutUQ_data.py | 109 +++ Pilot1/Uno_UQ/uno_inferUQ_keras2.py | 296 +++++++ Pilot1/Uno_UQ/uno_trainUQ_keras2.py | 404 ++++++++++ common/candle/__init__.py | 28 + common/candle_keras/__init__.py | 32 +- common/data_utils.py | 156 +++- common/keras_utils.py | 25 +- common/uq_utils.py | 749 +++++++++++++++++- common/viz_utils.py | 299 +++++++ 23 files changed, 4363 insertions(+), 7 deletions(-) create mode 100644 Pilot1/Uno_UQ/calibration/calibration_HET.py create mode 100644 Pilot1/Uno_UQ/calibration/calibration_HOM.py create mode 100644 Pilot1/Uno_UQ/calibration/calibration_HOM_all.py create mode 100644 Pilot1/Uno_UQ/calibration/calibration_QTL.py create mode 100644 Pilot1/Uno_UQ/data_utils_/__init__.py create mode 100644 Pilot1/Uno_UQ/data_utils_/cellline_data.py create mode 100644 Pilot1/Uno_UQ/data_utils_/drug_data.py create mode 100644 Pilot1/Uno_UQ/data_utils_/response_data.py create mode 100644 Pilot1/Uno_UQ/data_utils_/uno.py create mode 100644 Pilot1/Uno_UQ/data_utils_/uno_combined_data_generator.py create mode 100644 Pilot1/Uno_UQ/data_utils_/uno_combined_data_loader.py create mode 100644 Pilot1/Uno_UQ/model_utils_/__init__.py create mode 100644 Pilot1/Uno_UQ/model_utils_/uno_model_utils.py create mode 100644 Pilot1/Uno_UQ/uno_gCSI_modelUQ.txt create mode 100644 Pilot1/Uno_UQ/uno_holdoutUQ_data.py create mode 100644 Pilot1/Uno_UQ/uno_inferUQ_keras2.py create mode 100644 Pilot1/Uno_UQ/uno_trainUQ_keras2.py diff --git a/Pilot1/Uno_UQ/calibration/calibration_HET.py b/Pilot1/Uno_UQ/calibration/calibration_HET.py new file mode 100644 index 00000000..ab354d76 --- /dev/null +++ b/Pilot1/Uno_UQ/calibration/calibration_HET.py @@ -0,0 +1,115 @@ +#! 
/usr/bin/env python + +from __future__ import division, print_function + +import pandas as pd +import sys +import os +import pickle +import dill + +lib_path2 = os.path.abspath(os.path.join('..', '..', 'common')) +sys.path.append(lib_path2) + +import candle_keras as candle + +def read_file(path, filename): + + df_data = pd.read_csv(path + filename, sep='\t') + print('data read shape: ', df_data.shape) + + return df_data + +def main(): + + if ( len ( sys.argv ) < 3 ) : + sys.stderr.write ( "\nUsage: calibration_HET.py PATH FILENAME [PLOT_STEPS_FLAG]\n" ) + sys.stderr.write ( "FILENAME: usually .predicted_INFER_HET.tsv\n") + sys.exit ( 0 ) + + path = sys.argv [1] + filename = sys.argv [2] + + try: + steps = sys.argv [3] + except IndexError: + steps = False + + + folder_out = './outUQ/' + if folder_out and not os.path.exists(folder_out): + os.makedirs(folder_out) + + index_dp = filename.find('DR=') + if index_dp == -1: # DR is not in filename + print('Enter dropout rate ') + dp_perc = input() + else: + if filename[index_dp + 6] == '.': + dp = float(filename[index_dp+3:index_dp+3+3]) + else: + dp = float(filename[index_dp+3:index_dp+3+4]) + + print('Droput rate: ', dp) + dp_perc = dp * 100. + method = 'Dropout ' + str(dp_perc) + '%' + prefix = folder_out + 'heteroscedastic_DR=' + str(dp_perc) + + df_data = read_file(path, filename) + Ytest, Ypred_mean, yerror, sigma, Ypred_std, pred_name = candle.compute_statistics_heteroscedastic(df_data) + + # storing sigma + fname = prefix + '_sigma.pkl' + with open(fname, 'wb') as f: + pickle.dump(sigma, f, protocol=4) + print('Sigma stored in file: ', fname) + + #plots + candle.plot_density_observed_vs_predicted(Ytest, Ypred_mean, pred_name, prefix) + candle.plot_2d_density_sigma_vs_error(sigma, yerror, method, prefix) + candle.plot_histogram_error_per_sigma(sigma, yerror, method, prefix) + + # shuffle data for calibration + index_perm_total, pSigma_cal, pSigma_test, pMean_cal, pMean_test, true_cal, true_test = candle.split_data_for_empirical_calibration(Ytest, Ypred_mean, sigma) + + # Compute empirical calibration + bins = 31 + coverage_percentile = 95 + mean_sigma, min_sigma, max_sigma, error_thresholds, err_err, error_thresholds_smooth, sigma_start_index, sigma_end_index, s_interpolate = candle.compute_empirical_calibration(pSigma_cal, pMean_cal, true_cal, bins, coverage_percentile) + + candle.plot_calibration_and_errors(mean_sigma, sigma_start_index, sigma_end_index, + min_sigma, max_sigma, + error_thresholds, + error_thresholds_smooth, + err_err, + s_interpolate, + coverage_percentile, method, prefix, steps) + + + # Use empirical calibration and automatic determined monotonic interval + minL_sigma_auto = mean_sigma[sigma_start_index] + maxL_sigma_auto = mean_sigma[sigma_end_index] + index_sigma_range_test, xp_test, yp_test, eabs_red = candle.applying_calibration(pSigma_test, pMean_test, true_test, s_interpolate, minL_sigma_auto, maxL_sigma_auto) + # Check sigma overprediction + p_cov = coverage_percentile + num_cal = pSigma_cal.shape[0] + pYstd_perm_all = Ypred_std[index_perm_total] + pYstd_test = pYstd_perm_all[num_cal:] + pYstd_red = pYstd_test[index_sigma_range_test] + candle.overprediction_check(yp_test, eabs_red) + + # storing calibration + fname = prefix + '_calibration_spline.dkl' + with open(fname, 'wb') as f: +# pickle.dump(s_interpolate, f, protocol=pickle.HIGHEST_PROTOCOL) + dill.dump(s_interpolate, f) + print('Calibration spline stored in file: ', fname) + fname = prefix + '_calibration_limits.pkl' + with open(fname, 'wb') as f: + 
pickle.dump([minL_sigma_auto, maxL_sigma_auto], f, protocol=4) + print('Calibration limits stored in file: ', fname) + +if __name__ == '__main__': + main() + + diff --git a/Pilot1/Uno_UQ/calibration/calibration_HOM.py b/Pilot1/Uno_UQ/calibration/calibration_HOM.py new file mode 100644 index 00000000..a9440fcb --- /dev/null +++ b/Pilot1/Uno_UQ/calibration/calibration_HOM.py @@ -0,0 +1,98 @@ +#! /usr/bin/env python + +from __future__ import division, print_function + +import pandas as pd +import sys +import os +import pickle +import dill + +lib_path2 = os.path.abspath(os.path.join('..', '..', 'common')) +sys.path.append(lib_path2) + +import candle_keras as candle + + +def read_file(path, filename): + + df_data = pd.read_csv(path + filename, sep='\t') + print('data read shape: ', df_data.shape) + + return df_data + +def main(): + + if ( len ( sys.argv ) < 3 ) : + sys.stderr.write ( "\nUsage: calibration_HOM.py PATH FILENAME [PLOT_STEPS_FLAG]\n" ) + sys.stderr.write ("FILENAME: usually _pred.tsv\n") + sys.exit ( 0 ) + + path = sys.argv [1] + filename = sys.argv [2] + + try: + steps = sys.argv [3] + except IndexError: + steps = False + + folder_out = './outUQ/' + if folder_out and not os.path.exists(folder_out): + os.makedirs(folder_out) + + method = 'Dropout' + prefix = folder_out + 'homoscedastic_DR' + + df_data = read_file(path, filename) + Ytest, Ypred_mean, yerror, sigma, Ypred_std, pred_name = candle.compute_statistics_homoscedastic(df_data) + + #plots + candle.plot_density_observed_vs_predicted(Ytest, Ypred_mean, pred_name, prefix) + candle.plot_2d_density_sigma_vs_error(sigma, yerror, method, prefix) + candle.plot_histogram_error_per_sigma(sigma, yerror, method, prefix) + + # shuffle data for calibration + index_perm_total, pSigma_cal, pSigma_test, pMean_cal, pMean_test, true_cal, true_test = candle.split_data_for_empirical_calibration(Ytest, Ypred_mean, sigma) + + # Compute empirical calibration + bins = 60 + coverage_percentile = 95 + mean_sigma, min_sigma, max_sigma, error_thresholds, err_err, error_thresholds_smooth, sigma_start_index, sigma_end_index, s_interpolate = candle.compute_empirical_calibration(pSigma_cal, pMean_cal, true_cal, bins, coverage_percentile) + + candle.plot_calibration_and_errors(mean_sigma, sigma_start_index, sigma_end_index, + min_sigma, max_sigma, + error_thresholds, + error_thresholds_smooth, + err_err, + s_interpolate, + coverage_percentile, method, prefix, steps) + + + # Use empirical calibration and automatic determined monotonic interval + minL_sigma_auto = mean_sigma[sigma_start_index] + maxL_sigma_auto = mean_sigma[sigma_end_index] + index_sigma_range_test, xp_test, yp_test, eabs_red = candle.applying_calibration(pSigma_test, pMean_test, true_test, s_interpolate, minL_sigma_auto, maxL_sigma_auto) + # Check sigma overprediction + p_cov = coverage_percentile + num_cal = pSigma_cal.shape[0] + pYstd_perm_all = Ypred_std[index_perm_total] + pYstd_test = pYstd_perm_all[num_cal:] + pYstd_red = pYstd_test[index_sigma_range_test] + candle.overprediction_check(yp_test, eabs_red) + + # storing calibration + fname = prefix + '_calibration_limits.pkl' + with open(fname, 'wb') as f: + pickle.dump([minL_sigma_auto, maxL_sigma_auto], f, protocol=4) + print('Calibration limits stored in file: ', fname) + fname = prefix + '_calibration_spline.dkl' + with open(fname, 'wb') as f: +# pickle.dump(s_interpolate, f, protocol=pickle.HIGHEST_PROTOCOL) + dill.dump(s_interpolate, f) + print('Calibration spline stored in file: ', fname) + + +if __name__ == '__main__': + main() + 
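The calibration scripts in this patch (heteroscedastic, homoscedastic, homoscedastic-all and quantile) all finish by serializing two artifacts: the empirical-calibration spline returned by candle.compute_empirical_calibration is written with dill, and the usable sigma interval [minL_sigma_auto, maxL_sigma_auto] is written with pickle. Below is a minimal sketch of how a downstream consumer might reload and apply these two artifacts, assuming the default './outUQ/' output folder and the 'homoscedastic_DR' prefix used above; the new_sigma values are hypothetical placeholders:

    import pickle
    import dill
    import numpy as np

    # Reload the calibration spline (sigma -> calibrated error estimate) and the
    # sigma interval it was fitted on.
    with open('./outUQ/homoscedastic_DR_calibration_spline.dkl', 'rb') as f:
        s_interpolate = dill.load(f)
    with open('./outUQ/homoscedastic_DR_calibration_limits.pkl', 'rb') as f:
        minL_sigma, maxL_sigma = pickle.load(f)

    # Hypothetical predicted standard deviations from a new inference run.
    new_sigma = np.array([0.05, 0.10, 0.20])

    # Only evaluate the spline inside the fitted monotonic interval.
    in_range = (new_sigma >= minL_sigma) & (new_sigma <= maxL_sigma)
    calibrated_error = s_interpolate(new_sigma[in_range])
    print(calibrated_error)

Dill rather than plain pickle is used for the spline (the pickle.dump call is left commented out in each script), presumably because the interpolator object does not survive standard pickling.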
+ diff --git a/Pilot1/Uno_UQ/calibration/calibration_HOM_all.py b/Pilot1/Uno_UQ/calibration/calibration_HOM_all.py new file mode 100644 index 00000000..df7e064b --- /dev/null +++ b/Pilot1/Uno_UQ/calibration/calibration_HOM_all.py @@ -0,0 +1,98 @@ +#! /usr/bin/env python + +from __future__ import division, print_function + +import pandas as pd +import sys +import os +import pickle +import dill + +lib_path2 = os.path.abspath(os.path.join('..', '..', 'common')) +sys.path.append(lib_path2) + +import candle_keras as candle + + +def read_file(path, filename): + + df_data = pd.read_csv(path + filename, sep='\t') + print('data read shape: ', df_data.shape) + + return df_data + +def main(): + + if ( len ( sys.argv ) < 3 ) : + sys.stderr.write ( "\nUsage: calibration_HOM_all.py PATH FILENAME [PLOT_STEPS_FLAG]\n" ) + sys.stderr.write ("FILENAME: usually .predicted_INFER.tsv\n") + sys.exit ( 0 ) + + path = sys.argv [1] + filename = sys.argv [2] + + try: + steps = sys.argv [3] + except IndexError: + steps = False + + folder_out = './outUQ/' + if folder_out and not os.path.exists(folder_out): + os.makedirs(folder_out) + + method = 'Dropout' + prefix = folder_out + 'homoscedastic_DR' + + df_data = read_file(path, filename) + Ytest, Ypred_mean, yerror, sigma, Ypred_std, pred_name = candle.compute_statistics_homoscedastic_all(df_data) + + #plots + candle.plot_density_observed_vs_predicted(Ytest, Ypred_mean, pred_name, prefix) + candle.plot_2d_density_sigma_vs_error(sigma, yerror, method, prefix) + candle.plot_histogram_error_per_sigma(sigma, yerror, method, prefix) + + # shuffle data for calibration + index_perm_total, pSigma_cal, pSigma_test, pMean_cal, pMean_test, true_cal, true_test = candle.split_data_for_empirical_calibration(Ytest, Ypred_mean, sigma) + + # Compute empirical calibration + bins = 60 + coverage_percentile = 95 + mean_sigma, min_sigma, max_sigma, error_thresholds, err_err, error_thresholds_smooth, sigma_start_index, sigma_end_index, s_interpolate = candle.compute_empirical_calibration(pSigma_cal, pMean_cal, true_cal, bins, coverage_percentile) + + candle.plot_calibration_and_errors(mean_sigma, sigma_start_index, sigma_end_index, + min_sigma, max_sigma, + error_thresholds, + error_thresholds_smooth, + err_err, + s_interpolate, + coverage_percentile, method, prefix, steps) + + + # Use empirical calibration and automatic determined monotonic interval + minL_sigma_auto = mean_sigma[sigma_start_index] + maxL_sigma_auto = mean_sigma[sigma_end_index] + index_sigma_range_test, xp_test, yp_test, eabs_red = candle.applying_calibration(pSigma_test, pMean_test, true_test, s_interpolate, minL_sigma_auto, maxL_sigma_auto) + # Check sigma overprediction + p_cov = coverage_percentile + num_cal = pSigma_cal.shape[0] + pYstd_perm_all = Ypred_std[index_perm_total] + pYstd_test = pYstd_perm_all[num_cal:] + pYstd_red = pYstd_test[index_sigma_range_test] + candle.overprediction_check(yp_test, eabs_red) + + # storing calibration + fname = prefix + '_calibration_limits.pkl' + with open(fname, 'wb') as f: + pickle.dump([minL_sigma_auto, maxL_sigma_auto], f, protocol=4) + print('Calibration limits stored in file: ', fname) + fname = prefix + '_calibration_spline.dkl' + with open(fname, 'wb') as f: +# pickle.dump(s_interpolate, f, protocol=pickle.HIGHEST_PROTOCOL) + dill.dump(s_interpolate, f) + print('Calibration spline stored in file: ', fname) + + +if __name__ == '__main__': + main() + + diff --git a/Pilot1/Uno_UQ/calibration/calibration_QTL.py b/Pilot1/Uno_UQ/calibration/calibration_QTL.py new file mode 100644 
index 00000000..65f12710 --- /dev/null +++ b/Pilot1/Uno_UQ/calibration/calibration_QTL.py @@ -0,0 +1,117 @@ +#! /usr/bin/env python + +from __future__ import division, print_function + +import pandas as pd +import sys +import os +import pickle +import dill + +lib_path2 = os.path.abspath(os.path.join('..', '..', 'common')) +sys.path.append(lib_path2) + +import candle_keras as candle + +def read_file(path, filename): + + df_data = pd.read_csv(path + filename, sep='\t') + print('data read shape: ', df_data.shape) + + return df_data + +def main(): + + if ( len ( sys.argv ) < 3 ) : + sys.stderr.write ( "\nUsage: calibration_QTL.py PATH FILENAME [PLOT_STEPS_FLAG]\n" ) + sys.stderr.write ( "FILENAME: usually .predicted_INFER_QTL.tsv\n") + sys.exit ( 0 ) + + path = sys.argv [1] + filename = sys.argv [2] + + try: + steps = sys.argv [3] + except IndexError: + steps = False + + + folder_out = './outUQ/' + if folder_out and not os.path.exists(folder_out): + os.makedirs(folder_out) + + index_dp = filename.find('DR=') + if index_dp == -1: # DR is not in filename + print('Enter dropout rate ') + dp_perc = input() + else: + if filename[index_dp + 6] == '.': + dp = float(filename[index_dp+3:index_dp+3+3]) + else: + dp = float(filename[index_dp+3:index_dp+3+4]) + + print('Droput rate: ', dp) + dp_perc = dp * 100. + method = 'Dropout ' + str(dp_perc) + '%' + prefix = folder_out + 'quantile_DR=' + str(dp_perc) + + df_data = read_file(path, filename) + Ytest, Ypred_mean, yerror, sigma, Ypred_std, pred_name, Ypred_10p_mean, Ypred_90p_mean = candle.compute_statistics_quantile(df_data) + + # storing sigma + fname = prefix + '_sigma.pkl' + with open(fname, 'wb') as f: + pickle.dump(sigma, f, protocol=4) + print('Sigma stored in file: ', fname) + + #plots + percentile_list = ['50p', '10p', '90p'] + candle.plot_percentile_predictions(Ypred_mean, Ypred_10p_mean, Ypred_90p_mean, percentile_list, pred_name, prefix) + candle.plot_density_observed_vs_predicted(Ytest, Ypred_mean, pred_name, prefix) + candle.plot_2d_density_sigma_vs_error(sigma, yerror, method, prefix) + candle.plot_histogram_error_per_sigma(sigma, yerror, method, prefix) + + # shuffle data for calibration + index_perm_total, pSigma_cal, pSigma_test, pMean_cal, pMean_test, true_cal, true_test = candle.split_data_for_empirical_calibration(Ytest, Ypred_mean, sigma) + + # Compute empirical calibration + bins = 31 + coverage_percentile = 95 + mean_sigma, min_sigma, max_sigma, error_thresholds, err_err, error_thresholds_smooth, sigma_start_index, sigma_end_index, s_interpolate = candle.compute_empirical_calibration(pSigma_cal, pMean_cal, true_cal, bins, coverage_percentile) + + candle.plot_calibration_and_errors(mean_sigma, sigma_start_index, sigma_end_index, + min_sigma, max_sigma, + error_thresholds, + error_thresholds_smooth, + err_err, + s_interpolate, + coverage_percentile, method, prefix, steps) + + + # Use empirical calibration and automatic determined monotonic interval + minL_sigma_auto = mean_sigma[sigma_start_index] + maxL_sigma_auto = mean_sigma[sigma_end_index] + index_sigma_range_test, xp_test, yp_test, eabs_red = candle.applying_calibration(pSigma_test, pMean_test, true_test, s_interpolate, minL_sigma_auto, maxL_sigma_auto) + # Check sigma overprediction + p_cov = coverage_percentile + num_cal = pSigma_cal.shape[0] + pYstd_perm_all = Ypred_std[index_perm_total] + pYstd_test = pYstd_perm_all[num_cal:] + pYstd_red = pYstd_test[index_sigma_range_test] + candle.overprediction_check(yp_test, eabs_red) + + # storing calibration + fname = prefix + 
'_calibration_spline.dkl' + with open(fname, 'wb') as f: +# pickle.dump(s_interpolate, f, protocol=pickle.HIGHEST_PROTOCOL) + dill.dump(s_interpolate, f) + print('Calibration spline stored in file: ', fname) + fname = prefix + '_calibration_limits.pkl' + with open(fname, 'wb') as f: + pickle.dump([minL_sigma_auto, maxL_sigma_auto], f, protocol=4) + print('Calibration limits stored in file: ', fname) + +if __name__ == '__main__': + main() + + diff --git a/Pilot1/Uno_UQ/data_utils_/__init__.py b/Pilot1/Uno_UQ/data_utils_/__init__.py new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/Pilot1/Uno_UQ/data_utils_/__init__.py @@ -0,0 +1 @@ + diff --git a/Pilot1/Uno_UQ/data_utils_/cellline_data.py b/Pilot1/Uno_UQ/data_utils_/cellline_data.py new file mode 100644 index 00000000..af7e369a --- /dev/null +++ b/Pilot1/Uno_UQ/data_utils_/cellline_data.py @@ -0,0 +1,97 @@ + +import pandas as pd +import numpy as np + +import candle_keras as candle + +from uno import get_file_p1 as get_file +from uno import loggerUno as logger +from uno import DATA_URL + + +def load_cell_metadata(): + path = get_file(DATA_URL + 'cl_metadata') + df = pd.read_csv(path, sep='\t') + return df + + +def cell_name_to_ids(name, source=None): + path = get_file(DATA_URL + 'NCI60_CELLNAME_to_Combo.txt') + df1 = pd.read_csv(path, sep='\t') + hits1 = candle.lookup(df1, name, 'NCI60.ID', ['NCI60.ID', 'CELLNAME', 'Name'], match='contains') + path = get_file(DATA_URL + 'cl_mapping') + df2 = pd.read_csv(path, sep='\t', header=None) + hits2 = candle.lookup(df2, name, [0, 1], [0, 1], match='contains') + hits = hits1 + hits2 + if source: + hits = [x for x in hits if x.startswith(source.upper()+'.')] + return hits + + +def load_cell_rnaseq(ncols=None, scaling='std', imputing='mean', add_prefix=True, + use_landmark_genes=False, use_filtered_genes=False, + feature_subset=None, preprocess_rnaseq=None, + embed_feature_source=False, sample_set=None, index_by_sample=False): + + if use_landmark_genes: + filename = 'combined_rnaseq_data_lincs1000' + elif use_filtered_genes: + filename = 'combined_rnaseq_data_filtered' + else: + filename = 'combined_rnaseq_data' + + if preprocess_rnaseq and preprocess_rnaseq != 'none': + scaling = None + filename += ('_' + preprocess_rnaseq) # 'source_scale' or 'combat' + + path = get_file(DATA_URL + filename) + df_cols = pd.read_csv(path, engine='c', sep='\t', nrows=0) + total = df_cols.shape[1] - 1 # remove Sample column + if 'Cancer_type_id' in df_cols.columns: + total -= 1 + usecols = None + if ncols and ncols < total: + usecols = np.random.choice(total, size=ncols, replace=False) + usecols = np.append([0], np.add(sorted(usecols), 2)) + df_cols = df_cols.iloc[:, usecols] + if feature_subset: + with_prefix = lambda x: 'rnaseq.'+x if add_prefix else x + usecols = [0] + [i for i, c in enumerate(df_cols.columns) if with_prefix(c) in feature_subset] + df_cols = df_cols.iloc[:, usecols] + + dtype_dict = dict((x, np.float32) for x in df_cols.columns[1:]) + df = pd.read_csv(path, engine='c', sep='\t', usecols=usecols, dtype=dtype_dict) + if 'Cancer_type_id' in df.columns: + df.drop('Cancer_type_id', axis=1, inplace=True) + + prefixes = df['Sample'].str.extract('^([^.]*)', expand=False).rename('Source') + sources = prefixes.drop_duplicates().reset_index(drop=True) + df_source = pd.get_dummies(sources, prefix='rnaseq.source', prefix_sep='.') + df_source = pd.concat([sources, df_source], axis=1) + + df1 = df['Sample'] + if embed_feature_source: + df_sample_source = pd.concat([df1, prefixes], axis=1) + df1 = 
df_sample_source.merge(df_source, on='Source', how='left').drop('Source', axis=1) + logger.info('Embedding RNAseq data source into features: %d additional columns', df1.shape[1]-1) + + df2 = df.drop('Sample', 1) + if add_prefix: + df2 = df2.add_prefix('rnaseq.') + + df2 = candle.drop_impute_and_scale_dataframe(df2, scaling, imputing) + + df = pd.concat([df1, df2], axis=1) + + # scaling needs to be done before subsampling + if sample_set: + chosen = df['Sample'].str.startswith(sample_set) + df = df[chosen].reset_index(drop=True) + + if index_by_sample: + df = df.set_index('Sample') + + logger.info('Loaded combined RNAseq data: %s', df.shape) + + return df + diff --git a/Pilot1/Uno_UQ/data_utils_/drug_data.py b/Pilot1/Uno_UQ/data_utils_/drug_data.py new file mode 100644 index 00000000..cad8e326 --- /dev/null +++ b/Pilot1/Uno_UQ/data_utils_/drug_data.py @@ -0,0 +1,188 @@ + +import pandas as pd +import numpy as np + +import candle_keras as candle + +from uno import get_file_p1 as get_file +from uno import loggerUno as logger +from uno import DATA_URL + + +def load_drug_data(ncols=None, scaling='std', imputing='mean', dropna=None, add_prefix=True): + df_info = load_drug_info() + df_info['Drug'] = df_info['PUBCHEM'] + + df_desc = load_drug_set_descriptors(drug_set='Combined_PubChem', ncols=ncols) + df_fp = load_drug_set_fingerprints(drug_set='Combined_PubChem', ncols=ncols) + + df_desc = pd.merge(df_info[['ID', 'Drug']], df_desc, on='Drug').drop('Drug', 1).rename(columns={'ID': 'Drug'}) + df_fp = pd.merge(df_info[['ID', 'Drug']], df_fp, on='Drug').drop('Drug', 1).rename(columns={'ID': 'Drug'}) + + df_desc2 = load_drug_set_descriptors(drug_set='NCI60', usecols=df_desc.columns.tolist() if ncols else None) + df_fp2 = load_drug_set_fingerprints(drug_set='NCI60', usecols=df_fp.columns.tolist() if ncols else None) + + df_desc = pd.concat([df_desc, df_desc2]).reset_index(drop=True) + df1 = pd.DataFrame(df_desc.loc[:, 'Drug']) + df2 = df_desc.drop('Drug', 1) + df2 = candle.drop_impute_and_scale_dataframe(df2, scaling=scaling, imputing=imputing, dropna=dropna) + if add_prefix: + df2 = df2.add_prefix('dragon7.') + df_desc = pd.concat([df1, df2], axis=1) + + df_fp = pd.concat([df_fp, df_fp2]).reset_index(drop=True) + df1 = pd.DataFrame(df_fp.loc[:, 'Drug']) + df2 = df_fp.drop('Drug', 1) + df2 = candle.drop_impute_and_scale_dataframe(df2, scaling=None, imputing=imputing, dropna=dropna) + if add_prefix: + df2 = df2.add_prefix('dragon7.') + df_fp = pd.concat([df1, df2], axis=1) + + logger.info('Loaded combined dragon7 drug descriptors: %s', df_desc.shape) + logger.info('Loaded combined dragon7 drug fingerprints: %s', df_fp.shape) + + return df_desc, df_fp + + +def load_drug_descriptors(ncols=None, scaling='std', imputing='mean', dropna=None, add_prefix=True, feature_subset=None): + df_info = load_drug_info() + df_info['Drug'] = df_info['PUBCHEM'] + + df_desc = load_drug_set_descriptors(drug_set='Combined_PubChem', ncols=ncols) + df_desc = pd.merge(df_info[['ID', 'Drug']], df_desc, on='Drug').drop('Drug', 1).rename(columns={'ID': 'Drug'}) + + df_desc2 = load_drug_set_descriptors(drug_set='NCI60', usecols=df_desc.columns.tolist() if ncols else None) + + df_desc = pd.concat([df_desc, df_desc2]).reset_index(drop=True) + df1 = pd.DataFrame(df_desc.loc[:, 'Drug']) + df2 = df_desc.drop('Drug', 1) + if add_prefix: + df2 = df2.add_prefix('dragon7.') + if feature_subset: + df2 = df2[[x for x in df2.columns if x in feature_subset]] + df2 = candle.drop_impute_and_scale_dataframe(df2, scaling=scaling, imputing=imputing, 
dropna=dropna) + df_desc = pd.concat([df1, df2], axis=1) + + logger.info('Loaded combined dragon7 drug descriptors: %s', df_desc.shape) + + return df_desc + + +def load_drug_fingerprints(ncols=None, scaling='std', imputing='mean', dropna=None, add_prefix=True, feature_subset=None): + df_info = load_drug_info() + df_info['Drug'] = df_info['PUBCHEM'] + + df_fp = load_drug_set_fingerprints(drug_set='Combined_PubChem', ncols=ncols) + df_fp = pd.merge(df_info[['ID', 'Drug']], df_fp, on='Drug').drop('Drug', 1).rename(columns={'ID': 'Drug'}) + + df_fp2 = load_drug_set_fingerprints(drug_set='NCI60', usecols=df_fp.columns.tolist() if ncols else None) + + df_fp = pd.concat([df_fp, df_fp2]).reset_index(drop=True) + df1 = pd.DataFrame(df_fp.loc[:, 'Drug']) + df2 = df_fp.drop('Drug', 1) + if add_prefix: + df2 = df2.add_prefix('dragon7.') + if feature_subset: + df2 = df2[[x for x in df2.columns if x in feature_subset]] + df2 = candle.drop_impute_and_scale_dataframe(df2, scaling=None, imputing=imputing, dropna=dropna) + df_fp = pd.concat([df1, df2], axis=1) + + logger.info('Loaded combined dragon7 drug fingerprints: %s', df_fp.shape) + + return df_fp + + +def load_drug_info(): + path = get_file(DATA_URL + 'drug_info') + df = pd.read_csv(path, sep='\t', dtype=object) + df['PUBCHEM'] = 'PubChem.CID.' + df['PUBCHEM'] + return df + + +def drug_name_to_ids(name, source=None): + df1 = load_drug_info() + path = get_file(DATA_URL + 'NCI_IOA_AOA_drugs') + df2 = pd.read_csv(path, sep='\t', dtype=str) + df2['NSC'] = 'NSC.' + df2['NSC'] + hits1 = candle.lookup(df1, name, 'ID', ['ID', 'NAME', 'CLEAN_NAME', 'PUBCHEM']) + hits2 = candle.lookup(df2, name, 'NSC', ['NSC', 'Generic Name', 'Preffered Name']) + hits = hits1 + hits2 + if source: + hits = [x for x in hits if x.startswith(source.upper()+'.')] + return hits + + +def load_drug_set_descriptors(drug_set='Combined_PubChem', ncols=None, usecols=None, + scaling=None, imputing=None, add_prefix=False): + path = get_file(DATA_URL + '{}_dragon7_descriptors.tsv'.format(drug_set)) + + df_cols = pd.read_csv(path, engine='c', sep='\t', nrows=0) + total = df_cols.shape[1] - 1 + if usecols is not None: + usecols = [x for x in usecols if x in df_cols.columns] + if usecols[0] != 'NAME': + usecols = ['NAME'] + usecols + df_cols = df_cols.loc[:, usecols] + elif ncols and ncols < total: + usecols = np.random.choice(total, size=ncols, replace=False) + usecols = np.append([0], np.add(sorted(usecols), 1)) + df_cols = df_cols.iloc[:, usecols] + + dtype_dict = dict((x, np.float32) for x in df_cols.columns[1:]) + df = pd.read_csv(path, engine='c', sep='\t', usecols=usecols, dtype=dtype_dict, + na_values=['na', '-', '']) + + df1 = pd.DataFrame(df.loc[:, 'NAME']) + df1.rename(columns={'NAME': 'Drug'}, inplace=True) + + df2 = df.drop('NAME', 1) + if add_prefix: + df2 = df2.add_prefix('dragon7.') + + df2 = candle.drop_impute_and_scale_dataframe(df2, scaling, imputing, dropna=None) + + df = pd.concat([df1, df2], axis=1) + return df + + +def load_drug_set_fingerprints(drug_set='Combined_PubChem', ncols=None, usecols=None, + scaling=None, imputing=None, add_prefix=False): + fps = ['PFP', 'ECFP'] + usecols_all = usecols + df_merged = None + for fp in fps: + path = get_file(DATA_URL + '{}_dragon7_{}.tsv'.format(drug_set, fp)) + df_cols = pd.read_csv(path, engine='c', sep='\t', nrows=0, skiprows=1, header=None) + total = df_cols.shape[1] - 1 + if usecols_all is not None: + usecols = [x.replace(fp+'.', '') for x in usecols_all] + usecols = [int(x) for x in usecols if x.isdigit()] + usecols = [x for 
x in usecols if x in df_cols.columns] + if usecols[0] != 0: + usecols = [0] + usecols + df_cols = df_cols.loc[:, usecols] + elif ncols and ncols < total: + usecols = np.random.choice(total, size=ncols, replace=False) + usecols = np.append([0], np.add(sorted(usecols), 1)) + df_cols = df_cols.iloc[:, usecols] + + dtype_dict = dict((x, np.float32) for x in df_cols.columns[1:]) + df = pd.read_csv(path, engine='c', sep='\t', skiprows=1, header=None, + usecols=usecols, dtype=dtype_dict) + df.columns = ['{}.{}'.format(fp, x) for x in df.columns] + + col1 = '{}.0'.format(fp) + df1 = pd.DataFrame(df.loc[:, col1]) + df1.rename(columns={col1: 'Drug'}, inplace=True) + + df2 = df.drop(col1, 1) + if add_prefix: + df2 = df2.add_prefix('dragon7.') + + df2 = candle.drop_impute_and_scale_dataframe(df2, scaling, imputing, dropna=None) + + df = pd.concat([df1, df2], axis=1) + + df_merged = df if df_merged is None else df_merged.merge(df) + + return df_merged diff --git a/Pilot1/Uno_UQ/data_utils_/response_data.py b/Pilot1/Uno_UQ/data_utils_/response_data.py new file mode 100644 index 00000000..d4080da8 --- /dev/null +++ b/Pilot1/Uno_UQ/data_utils_/response_data.py @@ -0,0 +1,175 @@ + +import pandas as pd +import numpy as np + +from uno import get_file_p1 as get_file +from uno import loggerUno as logger +from uno import DATA_URL + +global_cache = {} + +def save_combined_dose_response(): + df1 = load_single_dose_response(combo_format=True, fraction=False) + df2 = load_combo_dose_response(fraction=False) + df = pd.concat([df1, df2]) + df.to_csv('combined_drug_growth', index=False, sep='\t') + + +def load_combined_dose_response(rename=True): + df1 = load_single_dose_response(combo_format=True) + logger.info('Loaded {} single drug dose response measurements'.format(df1.shape[0])) + + df2 = load_combo_dose_response() + logger.info('Loaded {} drug pair dose response measurements'.format(df2.shape[0])) + + df = pd.concat([df1, df2]) + logger.info('Combined dose response data contains sources: {}'.format(df['SOURCE'].unique())) + + if rename: + df = df.rename(columns={'SOURCE': 'Source', 'CELL': 'Sample', + 'DRUG1': 'Drug1', 'DRUG2': 'Drug2', + 'DOSE1': 'Dose1', 'DOSE2': 'Dose2', + 'GROWTH': 'Growth', 'STUDY': 'Study'}) + return df + + +def load_single_dose_response(combo_format=False, fraction=True): + # path = get_file(DATA_URL + 'combined_single_drug_growth') + path = get_file(DATA_URL + 'rescaled_combined_single_drug_growth') + + df = global_cache.get(path) + if df is None: + df = pd.read_csv(path, sep='\t', engine='c', + na_values=['na', '-', ''], + # nrows=10, + dtype={'SOURCE': str, 'DRUG_ID': str, + 'CELLNAME': str, 'CONCUNIT': str, + 'LOG_CONCENTRATION': np.float32, + 'EXPID': str, 'GROWTH': np.float32}) + global_cache[path] = df + + df['DOSE'] = -df['LOG_CONCENTRATION'] + + df = df.rename(columns={'CELLNAME': 'CELL', 'DRUG_ID': 'DRUG', 'EXPID': 'STUDY'}) + df = df[['SOURCE', 'CELL', 'DRUG', 'DOSE', 'GROWTH', 'STUDY']] + + if fraction: + df['GROWTH'] /= 100 + + if combo_format: + df = df.rename(columns={'DRUG': 'DRUG1', 'DOSE': 'DOSE1'}) + df['DRUG2'] = np.nan + df['DOSE2'] = np.nan + df['DRUG2'] = df['DRUG2'].astype(object) + df['DOSE2'] = df['DOSE2'].astype(np.float32) + df = df[['SOURCE', 'CELL', 'DRUG1', 'DOSE1', 'DRUG2', 'DOSE2', 'GROWTH', 'STUDY']] + + return df + + +def load_combo_dose_response(fraction=True): + path = get_file(DATA_URL + 'ComboDrugGrowth_Nov2017.csv') + df = global_cache.get(path) + if df is None: + df = pd.read_csv(path, sep=',', engine='c', + na_values=['na','-',''], + 
usecols=['CELLNAME', 'NSC1', 'CONC1', 'NSC2', 'CONC2', + 'PERCENTGROWTH', 'VALID', 'SCREENER', 'STUDY'], + # nrows=10000, + dtype={'CELLNAME': str, 'NSC1': str, 'NSC2': str, + 'CONC1': np.float32, 'CONC2': np.float32, + 'PERCENTGROWTH':np.float32, 'VALID': str, + 'SCREENER': str, 'STUDY': str}, + error_bad_lines=False, warn_bad_lines=True) + global_cache[path] = df + + df = df[df['VALID'] == 'Y'] + + df['SOURCE'] = 'ALMANAC.' + df['SCREENER'] + + cellmap_path = get_file(DATA_URL + 'NCI60_CELLNAME_to_Combo.txt') + df_cellmap = pd.read_csv(cellmap_path, sep='\t') + df_cellmap.set_index('Name', inplace=True) + cellmap = df_cellmap[['NCI60.ID']].to_dict()['NCI60.ID'] + + df['CELL'] = df['CELLNAME'].map(lambda x: cellmap[x]) + + df['DOSE1'] = -np.log10(df['CONC1']) + df['DOSE2'] = -np.log10(df['CONC2']) + + df['DRUG1'] = 'NSC.' + df['NSC1'] + df['DRUG2'] = 'NSC.' + df['NSC2'] + + if fraction: + df['GROWTH'] = df['PERCENTGROWTH'] / 100 + else: + df['GROWTH'] = df['PERCENTGROWTH'] + + df = df[['SOURCE', 'CELL', 'DRUG1', 'DOSE1', 'DRUG2', 'DOSE2', 'GROWTH', 'STUDY']] + + return df + + +def load_aggregated_single_response(target='AUC', min_r2_fit=0.3, max_ec50_se=3, combo_format=False, rename=True): + path = get_file(DATA_URL + 'combined_single_response_agg') + + df = global_cache.get(path) + if df is None: + df = pd.read_csv(path, engine='c', sep='\t', + dtype={'SOURCE': str, 'CELL': str, 'DRUG': str, 'STUDY': str, + 'AUC': np.float32, 'IC50': np.float32, + 'EC50': np.float32, 'EC50se': np.float32, + 'R2fit': np.float32, 'Einf': np.float32, + 'HS': np.float32, 'AAC1': np.float32, + 'AUC1': np.float32, 'DSS1': np.float32}) + global_cache[path] = df + + total = len(df) + + df = df[(df['R2fit'] >= min_r2_fit) & (df['EC50se'] <= max_ec50_se)] + df = df[['SOURCE', 'CELL', 'DRUG', target, 'STUDY']] + df = df[~df[target].isnull()] + + logger.info('Loaded %d dose independent response samples (filtered by EC50se <= %f & R2fit >=%f from a total of %d).', len(df), max_ec50_se, min_r2_fit, total) + + if combo_format: + df = df.rename(columns={'DRUG': 'DRUG1'}) + df['DRUG2'] = np.nan + df['DRUG2'] = df['DRUG2'].astype(object) + df = df[['SOURCE', 'CELL', 'DRUG1', 'DRUG2', target, 'STUDY']] + if rename: + df = df.rename(columns={'SOURCE': 'Source', 'CELL': 'Sample', + 'DRUG1': 'Drug1', 'DRUG2': 'Drug2', 'STUDY': 'Study'}) + else: + if rename: + df = df.rename(columns={'SOURCE': 'Source', 'CELL': 'Sample', + 'DRUG': 'Drug', 'STUDY': 'Study'}) + + return df + + + +def select_drugs_with_response_range(df_response, lower=0, upper=0, span=0, lower_median=None, upper_median=None): + df = df_response.groupby(['Drug1', 'Sample'])['Growth'].agg(['min', 'max', 'median']) + df['span'] = df['max'].clip(lower=-1, upper=1) - df['min'].clip(lower=-1, upper=1) + df = df.groupby('Drug1').mean().reset_index().rename(columns={'Drug1': 'Drug'}) + mask = (df['min'] <= lower) & (df['max'] >= upper) & (df['span'] >= span) + if lower_median: + mask &= (df['median'] >= lower_median) + if upper_median: + mask &= (df['median'] <= upper_median) + df_sub = df[mask] + return df_sub + + +def summarize_response_data(df, target=None): + target = target or 'Growth' + df_sum = df.groupby('Source').agg({target: 'count', 'Sample': 'nunique', + 'Drug1': 'nunique', 'Drug2': 'nunique'}) + if 'Dose1' in df_sum: + df_sum['MedianDose'] = df.groupby('Source').agg({'Dose1': 'median'}) + return df_sum + + + + diff --git a/Pilot1/Uno_UQ/data_utils_/uno.py b/Pilot1/Uno_UQ/data_utils_/uno.py new file mode 100644 index 00000000..4c1ddc56 --- /dev/null +++ 
b/Pilot1/Uno_UQ/data_utils_/uno.py @@ -0,0 +1,353 @@ +from __future__ import print_function + +import os +import sys +import logging +import argparse +try: + import configparser +except ImportError: + import ConfigParser as configparser + +from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error +from scipy.stats.stats import pearsonr + +#file_path = os.path.dirname(os.path.realpath(__file__)) +file_path = os.path.dirname(os.path.realpath(os.path.join(__file__, '..'))) +lib_path = os.path.abspath(os.path.join(file_path, '..')) +sys.path.append(lib_path) +lib_path = os.path.abspath(os.path.join(file_path, 'data_utils_')) +sys.path.append(lib_path) +lib_path = os.path.abspath(os.path.join(file_path, 'model_utils_')) +sys.path.append(lib_path) +lib_path2 = os.path.abspath(os.path.join(file_path, '..', '..', 'common')) +sys.path.append(lib_path2) + + +import candle + +P1B3_URL = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/P1B3/' +DATA_URL = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/combo/' + +loggerUno = logging.getLogger(__name__) + + +def set_up_logger(logfile, logger1, logger2, verbose): + candle.verify_path(logfile) + fh = logging.FileHandler(logfile) + fh.setFormatter(logging.Formatter("[%(asctime)s %(process)d] %(message)s", datefmt="%Y-%m-%d %H:%M:%S")) + fh.setLevel(logging.DEBUG) + + sh = logging.StreamHandler() + sh.setFormatter(logging.Formatter('')) + sh.setLevel(logging.DEBUG if verbose else logging.INFO) + + for log in [logger1, logger2]: + log.setLevel(logging.DEBUG) + log.addHandler(fh) + log.addHandler(sh) + + +def extension_from_parameters(args): + """Construct string for saving model with annotation of parameters""" + ext = '' + ext += '.A={}'.format(args.activation) + ext += '.B={}'.format(args.batch_size) + ext += '.E={}'.format(args.epochs) + ext += '.O={}'.format(args.optimizer) + ext += '.LS={}'.format(args.loss) + # ext += '.LEN={}'.format(args.maxlen) + ext += '.LR={}'.format(args.learning_rate) + ext += '.CF={}'.format(''.join([x[0] for x in sorted(args.cell_features)])) + ext += '.DF={}'.format(''.join([x[0] for x in sorted(args.drug_features)])) + if args.feature_subsample > 0: + ext += '.FS={}'.format(args.feature_subsample) + if args.drop > 0: + ext += '.DR={}'.format(args.drop) + if args.warmup_lr: + ext += '.wu_lr' + if args.reduce_lr: + ext += '.re_lr' + if args.residual: + ext += '.res' + if args.use_landmark_genes: + ext += '.L1000' + if args.no_gen: + ext += '.ng' + for i, n in enumerate(args.dense): + if n > 0: + ext += '.D{}={}'.format(i+1, n) + if args.dense_feature_layers != args.dense: + for i, n in enumerate(args.dense): + if n > 0: + ext += '.FD{}={}'.format(i+1, n) + + return ext + +def set_up_logger_data(verbose=False): + sh = logging.StreamHandler() + sh.setFormatter(logging.Formatter('')) + sh.setLevel(logging.DEBUG if verbose else logging.INFO) + + logger.setLevel(logging.DEBUG) + logger.addHandler(sh) + + +def log_evaluation(metric_outputs, logger, description='Comparing y_true and y_pred:'): + logger.info(description) + for metric, value in metric_outputs.items(): + logger.info(' {}: {:.4f}'.format(metric, value)) + + +def get_file_p1(url): + fname = os.path.basename(url) + return candle.get_file(fname, origin=url, cache_subdir='Pilot1') + + +def dict_compare(d1, d2, ignore=[], expand=False): + d1_keys = set(d1.keys()) - set(ignore) + d2_keys = set(d2.keys()) - set(ignore) + intersect_keys = d1_keys.intersection(d2_keys) + added = d1_keys - d2_keys + removed = d2_keys - d1_keys + modified = 
set({x : (d1[x], d2[x]) for x in intersect_keys if d1[x] != d2[x]}) + common = set(x for x in intersect_keys if d1[x] == d2[x]) + equal = not (added or removed or modified) + if expand: + return equal, added, removed, modified, common + else: + return equal, added | removed | modified + + +def evaluate_prediction(y_true, y_pred): + mse = mean_squared_error(y_true, y_pred) + mae = mean_absolute_error(y_true, y_pred) + r2 = r2_score(y_true, y_pred) + corr, _ = pearsonr(y_true, y_pred) + return {'mse': mse, 'mae': mae, 'r2': r2, 'corr': corr} + + +def read_IDs_file(fname): + + with open(fname, 'r') as f: + read_ids = f.read().splitlines() + + loggerUno.info('Read file: {}'.format(fname)) + loggerUno.info('Number of elements read: {}'.format(len(read_ids))) + + return read_ids + + +class BenchmarkUno(candle.Benchmark): + + def set_locals(self): + """Functionality to set variables specific for the benchmark + - required: set of required parameters for the benchmark. + - additional_definitions: list of dictionaries describing the additional parameters for the + benchmark. + """ + + if required is not None: + self.required = set(required) + if additional_definitions is not None: + self.additional_definitions = additional_definitions + + +additional_definitions = [ +# Feature selection + {'name':'agg_dose', + 'type': str, + 'default': None, + 'choices':['AUC', 'IC50', 'EC50', 'HS', 'AAC1', 'AUC1', 'DSS1'], + 'help':'use dose-independent response data with the specified aggregation metric'}, + {'name':'cell_features', + 'nargs':'+', + 'choices':['rnaseq', 'none'], + 'help':'use rnaseq cell line feature set or none at all'}, + {'name':'drug_features', + 'nargs':'+', + 'choices':['descriptors', 'fingerprints', 'none'], + 'help':'use dragon7 descriptors or fingerprint descriptors for drug features or none at all'}, + {'name': 'by_cell', + 'type':str, + 'default':None, + 'help':'sample ID for building a by-cell model'}, + {'name': 'by_drug', + 'type':str, + 'default':None, + 'help':'drug ID or name for building a by-drug model'}, +# Data set selection + {'name':'train_sources', + 'nargs':'+', + 'choices':['all', 'CCLE', 'CTRP', 'gCSI', 'GDSC', 'NCI60', 'SCL', 'SCLC', 'ALMANAC'], + 'help':'use one or more sources of drug response data for training'}, + {'name':'test_sources', + 'nargs':'+', + 'choices':['train', 'all', 'CCLE', 'CTRP', 'gCSI', 'GDSC', 'NCI60', 'SCL', 'SCLC', 'ALMANAC'], + 'help':'use one or more sources of drug response data for testing'}, +# Sample selection + {'name':'cell_types', + 'nargs':'+', + 'help':'limit training and test data to one or more tissue types'}, + {'name':'cell_subset_path', + 'type': str, + 'default': '', + 'help':'path for file with space delimited molecular sample IDs to keep'}, + {'name':'drug_subset_path', + 'type': str, + 'default': '', + 'help':'path for file with space delimited drug IDs to keep'}, + {'name':'drug_median_response_min', + 'type':float, + 'default':-1, + 'help':'keep drugs whose median response is greater than the threshold'}, + {'name':'drug_median_response_max', + 'type':float, + 'default':1, + 'help':'keep drugs whose median response is less than the threshold'}, +# Training + {'name':'no_feature_source', + 'type': candle.str2bool, + 'default': False, + 'help':'do not embed cell or drug feature source as part of input'}, + {'name':'no_response_source', + 'type': candle.str2bool, + 'default': False, + 'help':'do not encode response data source as an input feature'}, + {'name':'dense_feature_layers', + 'nargs':'+', + 'type':int, + 
'help':'number of neurons in intermediate dense layers in the feature encoding submodels'}, + {'name':'use_landmark_genes', + 'type': candle.str2bool, + 'default': False, + 'help':'use the 978 landmark genes from LINCS (L1000) as expression features'}, + {'name':'use_filtered_genes', + 'type': candle.str2bool, + 'default': False, + 'help':'use the variance filtered genes as expression features'}, + {'name':'feature_subset_path', + 'type': str, + 'default': '', + 'help':'path for file with space delimited features to keep'}, + {'name':'cell_feature_subset_path', + 'type': str, + 'default': '', + 'help':'path for file with space delimited molecular features to keep'}, + {'name':'drug_feature_subset_path', + 'type': str, + 'default': '', + 'help':'path for file with space delimited drug features to keep'}, + {'name':'preprocess_rnaseq', + 'choices':['source_scale', 'combat', 'none'], + 'default':'none', + 'help':'preprocessing method for RNAseq data; none for global normalization'}, + {'name':'residual', + 'type': candle.str2bool, + 'default': False, + 'help':'add skip connections to the layers'}, + {'name':'reduce_lr', + 'type': candle.str2bool, + 'default': False, + 'help':'reduce learning rate on plateau'}, + {'name':'warmup_lr', + 'type': candle.str2bool, + 'default': False, + 'help':'gradually increase learning rate on start'}, + {'name':'base_lr', + 'type':float, + 'default':None, + 'help':'base learning rate'}, + {'name':'cp', + 'type': candle.str2bool, + 'default': False, + 'help':'checkpoint models with best val_loss'}, + {'name':'tb', + 'type': candle.str2bool, + 'default': False, + 'help':'use tensorboard'}, + {'name': 'tb_prefix', + 'type': str, + 'default': 'tb', + 'help': 'prefix name for tb log'}, + {'name':'max_val_loss', + 'type':float, + 'default':argparse.SUPPRESS, + 'help':'retrain if val_loss is greater than the threshold'}, + {'name':'partition_by', + 'choices':['index', 'drug_pair', 'cell'], + 'default':None, + 'help':'cross validation paritioning scheme'}, + {'name':'cv', + 'type':int, + 'default':argparse.SUPPRESS, + 'help':'cross validation folds'}, + {'name':'no_gen', + 'type': candle.str2bool, + 'default': False, + 'help':'do not use generator for training and validation data'}, + {'name':'cache', + 'type': str, + 'default': None, + 'help':'prefix of data cache files to use'}, + {'name':'single', + 'type': candle.str2bool, + 'default': False, + 'help':'do not use drug pair representation'}, + {'name': 'export_csv', + 'type': str, + 'default': None, + 'help': 'output csv file name'}, + {'name':'export_data', + 'type': str, + 'default': None, + 'help':'output dataframe file name'}, + {'name': 'use_exported_data', + 'type': str, + 'default': None, + 'help': 'exported file name'}, + {'name':'growth_bins', + 'type': int, + 'default': 0, + 'help':'number of bins to use when discretizing growth response'}, + {'name' : 'initial_weights', + 'type' : str, + 'default': None, + 'help' : 'file name of initial weights'}, + {'name' : 'save_weights', + 'type': str, + 'default' : None, + 'help': 'name of file to save weights to' }, + {'name':'exclude_cells', 'nargs':'+', + 'default': [], + 'help':'cell line IDs to exclude'}, + {'name':'exclude_drugs', 'nargs':'+', + 'default': [], + 'help':'drug line IDs to exclude'}, + {'name':'sample_repetition', + 'type': candle.str2bool, + 'default': False, + 'help':'allow repetition of training data'} +] + + + +required = [ + 'activation', + 'batch_size', + 'dense', + 'dense_feature_layers', + 'drop', + 'epochs', + 'feature_subsample', + 
'learning_rate', + 'loss', + 'optimizer', + 'residual', + 'rng_seed', + 'save_path', + 'scaling', + 'val_split', + 'solr_root', + 'timeout' + ] diff --git a/Pilot1/Uno_UQ/data_utils_/uno_combined_data_generator.py b/Pilot1/Uno_UQ/data_utils_/uno_combined_data_generator.py new file mode 100644 index 00000000..649780c2 --- /dev/null +++ b/Pilot1/Uno_UQ/data_utils_/uno_combined_data_generator.py @@ -0,0 +1,257 @@ + +from itertools import cycle, islice + +import numpy as np +import pandas as pd + +from keras.utils import Sequence + +def values_or_dataframe(df, contiguous=False, dataframe=False): + if dataframe: + return df + mat = df.values + if contiguous: + mat = np.ascontiguousarray(mat) + return mat + + +class CombinedDataGenerator(Sequence):#object): + """Generate training, validation or testing batches from loaded data + """ +# def __init__(self, data, partition='train', fold=0, source=None, batch_size=32, shuffle=True): + def __init__(self, data, partition='train', fold=0, source=None, batch_size=32, shuffle=True, single=False, rank=0, total_ranks=1): + + self.data = data + self.partition = partition + self.batch_size = batch_size + self.single = single + + if partition == 'train': + index = data.train_indexes[fold] + elif partition == 'val': + index = data.val_indexes[fold] + else: + index = data.test_indexes[fold] + + if source: + df = data.df_response[['Source']].iloc[index, :] + index = df.index[df['Source'] == source] + + if shuffle: + index = np.random.permutation(index) + # index = index[:len(index)//10] + + # sharing by rank + samples_per_rank = len(index) // total_ranks + samples_per_rank = self.batch_size * (samples_per_rank // self.batch_size) + + self.index = index[rank * samples_per_rank:(rank + 1) * samples_per_rank] + self.index_cycle = cycle(self.index) + self.size = len(self.index) + self.steps = self.size // self.batch_size + print("partition:{0}, rank:{1}, sharded index size:{2}, batch_size:{3}, steps:{4}".format(partition, rank, self.size, self.batch_size, self.steps)) + + +# self.index = index +# self.index_cycle = cycle(index) +# self.size = len(index) +# self.steps = np.ceil(self.size / batch_size) +# # self.steps = np.ceil(self.size / batch_size / 100) + + def __len__(self): + return self.steps + + def __getitem__(self, idx): + shard = self.index[idx * self.batch_size:(idx + 1) * self.batch_size] + x_list, y = self.get_slice(self.batch_size, single=self.single, partial_index=shard) + return x_list, y + + def reset(self): + self.index_cycle = cycle(self.index) + + def get_response(self, copy=False): + df = self.data.df_response.iloc[self.index, :].drop(['Group'], axis=1) + return df.copy() if copy else df + +# def get_slice(self, size=None, contiguous=True, single=False, dataframe=False): + def get_slice(self, size=None, contiguous=True, single=False, dataframe=False, partial_index=None): + size = size or self.size + single = single or self.data.agg_dose + target = self.data.agg_dose or 'Growth' + +# index = list(islice(self.index_cycle, size)) + if partial_index is not None: + index = partial_index + else: + index = list(islice(self.index_cycle, size)) + df_orig = self.data.df_response.iloc[index, :] + df = df_orig.copy() + + if not single: + df['Swap'] = np.random.choice([True, False], df.shape[0]) + swap = df_orig['Drug2'].notnull() & df['Swap'] + df.loc[swap, 'Drug1'] = df_orig.loc[swap, 'Drug2'] + df.loc[swap, 'Drug2'] = df_orig.loc[swap, 'Drug1'] + if not self.data.agg_dose: + df['DoseSplit'] = np.random.uniform(0.001, 0.999, df.shape[0]) + df.loc[swap, 
'Dose1'] = df_orig.loc[swap, 'Dose2'] + df.loc[swap, 'Dose2'] = df_orig.loc[swap, 'Dose1'] + + split = df_orig['Drug2'].isnull() + if not single: + df.loc[split, 'Drug2'] = df_orig.loc[split, 'Drug1'] + if not self.data.agg_dose: + df.loc[split, 'Dose1'] = df_orig.loc[split, 'Dose1'] - np.log10(df.loc[split, 'DoseSplit']) + df.loc[split, 'Dose2'] = df_orig.loc[split, 'Dose1'] - np.log10(1 - df.loc[split, 'DoseSplit']) + + if dataframe: + cols = [target, 'Sample', 'Drug1', 'Drug2'] if not single else [target, 'Sample', 'Drug1'] + y = df[cols].reset_index(drop=True) + else: + y = values_or_dataframe(df[target], contiguous, dataframe) + + x_list = [] + + if not self.data.agg_dose: + doses = ['Dose1', 'Dose2'] if not single else ['Dose1'] + for dose in doses: + x = values_or_dataframe(df[[dose]].reset_index(drop=True), contiguous, dataframe) + x_list.append(x) + + if self.data.encode_response_source: + df_x = pd.merge(df[['Source']], self.data.df_source, on='Source', how='left') + df_x.drop(['Source'], axis=1, inplace=True) + x = values_or_dataframe(df_x, contiguous, dataframe) + x_list.append(x) + + for fea in self.data.cell_features: + df_cell = getattr(self.data, self.data.cell_df_dict[fea]) + df_x = pd.merge(df[['Sample']], df_cell, on='Sample', how='left') + df_x.drop(['Sample'], axis=1, inplace=True) + x = values_or_dataframe(df_x, contiguous, dataframe) + x_list.append(x) + + drugs = ['Drug1', 'Drug2'] if not single else ['Drug1'] + for drug in drugs: + for fea in self.data.drug_features: + df_drug = getattr(self.data, self.data.drug_df_dict[fea]) + df_x = pd.merge(df[[drug]], df_drug, left_on=drug, right_on='Drug', how='left') + df_x.drop([drug, 'Drug'], axis=1, inplace=True) + if dataframe and not single: + df_x = df_x.add_prefix(drug + '.') + x = values_or_dataframe(df_x, contiguous, dataframe) + x_list.append(x) + + # print(x_list, y) + return x_list, y + + def flow(self, single=False): + while 1: + x_list, y = self.get_slice(self.batch_size, single=single) + yield x_list, y + + +def test_generator(loader): + gen = CombinedDataGenerator(loader).flow() + x_list, y = next(gen) + print('x shapes:') + for x in x_list: + print(x.shape) + print('y shape:') + print(y.shape) + + +def find_columns_with_str(df, substr): + col_indices = [df.columns.get_loc(col) for col in df.columns if substr in col] + + return col_indices + +class FromFileDataGenerator(object): + """Generate testing batches from loaded data + """ + def __init__(self, df_data, indices, target_str, feature_names_list, num_features_list, batch_size=32, shuffle=True): + + self.batch_size = batch_size + + index = indices + + if shuffle: + index = np.random.permutation(index) + + self.index = index + self.index_cycle = cycle(index) + self.size = len(index) + self.steps = np.ceil(self.size / batch_size) + + self.num_features_list = num_features_list + + try : # Try to get the 'target_str' column + target = df_data.columns.get_loc(target_str) + except KeyError: # The 'target_str' column is not available in data file + # No ground truth available + y_fake = np.zeros(df_data.shape[0]) + df_data['fake_target'] = y_fake + self.target = df_data.columns.get_loc('fake_target') + else: # 'target_str' column is available --> use this column + self.target = target + + self.df_data = df_data + self.offset = self.compute_offset(feature_names_list) + + def compute_offset(self, feature_names): + offset = self.df_data.shape[1] + for name in feature_names: + col_indices = find_columns_with_str(self.df_data, name) + if len(col_indices) > 0: + 
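                # Keep the smallest column index that matches any expected feature-name
                # prefix; that index becomes 'offset', the column at which the feature
                # block starts in the exported dataframe. If no prefix matches, offset
                # stays at df_data.shape[1] and the exception below is raised.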
first_col = np.min(col_indices) + if first_col < offset: + offset = first_col + + if offset == self.df_data.shape[1]: + raise Exception('ERROR ! Feature names from model are not in file. ' \ + 'These are features in model: ' + str(sorted(feature_names)) + \ + '... Exiting') + + return offset + + def reset(self): + self.index_cycle = cycle(self.index) + + def get_response(self, copy=False): + df = self.df_data.iloc[self.index, :] + return df.copy() if copy else df + + def get_slice(self, size=None, contiguous=True): + + size = size or self.size + index = list(islice(self.index_cycle, size)) + df_orig = self.df_data.iloc[index, :] + df = df_orig.copy() + + #Features --> + x_list = [] + start = self.offset + # features need to be provided in the partitions expected by the model + for i,numf in enumerate(self.num_features_list): + end = start + numf + mat = df.iloc[:,start:end].values + if contiguous: + mat = np.ascontiguousarray(mat) + x_list.append(mat) + start = end + + # Target + mat = df.iloc[:,self.target].values + if contiguous: + mat = np.ascontiguousarray(mat) + y = mat + + # print(x_list, y) + return x_list, y + + + def flow(self, single=False): + while 1: + x_list, y = self.get_slice(self.batch_size) + yield x_list, y + diff --git a/Pilot1/Uno_UQ/data_utils_/uno_combined_data_loader.py b/Pilot1/Uno_UQ/data_utils_/uno_combined_data_loader.py new file mode 100644 index 00000000..be5a8483 --- /dev/null +++ b/Pilot1/Uno_UQ/data_utils_/uno_combined_data_loader.py @@ -0,0 +1,427 @@ +from __future__ import print_function + +import collections +import json +import logging +import os +import pickle + +import pandas as pd +import numpy as np + +from sklearn.model_selection import ShuffleSplit, KFold + +import cellline_data +import drug_data +import response_data + +from uno import loggerUno as logger +from uno import dict_compare + +SEED = 2019 + +def encode_sources(sources): + df = pd.get_dummies(sources, prefix='source', prefix_sep='.') + df['Source'] = sources + source_l1 = df['Source'].str.extract('^(\S+)\.', expand=False) + df1 = pd.get_dummies(source_l1, prefix='source.L1', prefix_sep='.') + df = pd.concat([df1, df], axis=1) + df = df.set_index('Source').reset_index() + return df + +def read_set_from_file(path): + if path: + with open(path, 'r') as f: + text = f.read().strip() + subset = text.split() + else: + subset = None + return subset + + +def assign_partition_groups(df, partition_by='drug_pair'): + if partition_by == 'cell': + group = df['Sample'] + elif partition_by == 'drug_pair': + df_info = drug_data.load_drug_info() + id_dict = df_info[['ID', 'PUBCHEM']].drop_duplicates(['ID']).set_index('ID').iloc[:, 0] + group = df['Drug1'].copy() + group[(df['Drug2'].notnull()) & (df['Drug1'] <= df['Drug2'])] = df['Drug1'] + ',' + df['Drug2'] + group[(df['Drug2'].notnull()) & (df['Drug1'] > df['Drug2'])] = df['Drug2'] + ',' + df['Drug1'] + group2 = group.map(id_dict) + mapped = group2.notnull() + group[mapped] = group2[mapped] + elif partition_by == 'index': + group = df.reset_index()['index'] + logger.info('Grouped response data by %s: %d groups', partition_by, group.nunique()) + return group + + +class CombinedDataLoader(object): + def __init__(self, seed=SEED): + self.seed = seed + self.test_indexes = [[]] + + def load_from_cache(self, cache, params): + param_fname = '{}.params.json'.format(cache) + if not os.path.isfile(param_fname): + logger.warning('Cache parameter file does not exist: %s', param_fname) + return False + with open(param_fname) as param_file: + try: + cached_params = 
json.load(param_file) + except json.JSONDecodeError as e: + logger.warning('Could not decode parameter file %s: %s', param_fname, e) + return False + ignore_keys = ['cache', 'partition_by', 'single'] + equal, diffs = dict_compare(params, cached_params, ignore_keys) + if not equal: + logger.warning('Cache parameter mismatch: %s\nSaved: %s\nAttemptd to load: %s', diffs, cached_params, params) + logger.warning('\nRemove %s to rebuild data cache.\n', param_fname) + raise ValueError('Could not load from a cache with incompatible keys:', diffs) + else: + fname = '{}.pkl'.format(cache) + if not os.path.isfile(fname): + logger.warning('Cache file does not exist: %s', fname) + return False + with open(fname, 'rb') as f: + obj = pickle.load(f) + self.__dict__.update(obj.__dict__) + logger.info('Loaded data from cache: %s', fname) + return True + return False + + def save_to_cache(self, cache, params): + for k in ['self', 'cache', 'single']: + if k in params: + del params[k] + param_fname = '{}.params.json'.format(cache) + with open(param_fname, 'w') as param_file: + json.dump(params, param_file, sort_keys=True) + fname = '{}.pkl'.format(cache) + with open(fname, 'wb') as f: + pickle.dump(self, f, pickle.HIGHEST_PROTOCOL) + logger.info('Saved data to cache: %s', fname) + + def partition_data(self, partition_by=None, cv_folds=1, train_split=0.7, val_split=0.2, + cell_types=None, by_cell=None, by_drug=None, + cell_subset_path=None, drug_subset_path=None, + exclude_cells=[], exclude_drugs=[], exclude_indices=[]): + + seed = self.seed + train_sep_sources = self.train_sep_sources + test_sep_sources = self.test_sep_sources + df_response = self.df_response + + + if not partition_by: + if by_drug and by_cell: + partition_by = 'index' + elif by_drug: + partition_by = 'cell' + else: + partition_by = 'drug_pair' + + + # Exclude specified cells / drugs / indices + if exclude_cells != []: + df_response = df_response[~df_response['Sample'].isin(exclude_cells)] + if exclude_drugs != []: + if np.isin('Drug', df_response.columns.values): + df_response = df_response[~df_response['Drug1'].isin(exclude_drugs)] + else: + df_response = df_response[~df_response['Drug1'].isin(exclude_drugs) & ~df_response['Drug2'].isin(exclude_drugs)] + if exclude_indices != []: + df_response = df_response.drop(exclude_indices, axis=0) + logger.info('Excluding indices specified') + + if partition_by != self.partition_by: + df_response = df_response.assign(Group = assign_partition_groups(df_response, partition_by)) + + mask = df_response['Source'].isin(train_sep_sources) + test_mask = df_response['Source'].isin(test_sep_sources) + + if by_drug: + drug_ids = drug_data.drug_name_to_ids(by_drug) + logger.info('Mapped drug IDs for %s: %s', by_drug, drug_ids) + mask &= (df_response['Drug1'].isin(drug_ids)) & (df_response['Drug2'].isnull()) + test_mask &= (df_response['Drug1'].isin(drug_ids)) & (df_response['Drug2'].isnull()) + + if by_cell: + cell_ids = cellline_data.cell_name_to_ids(by_cell) + logger.info('Mapped sample IDs for %s: %s', by_cell, cell_ids) + mask &= (df_response['Sample'].isin(cell_ids)) + test_mask &= (df_response['Sample'].isin(cell_ids)) + + if cell_subset_path: + cell_subset = read_set_from_file(cell_subset_path) + mask &= (df_response['Sample'].isin(cell_subset)) + test_mask &= (df_response['Sample'].isin(cell_subset)) + + if drug_subset_path: + drug_subset = read_set_from_file(drug_subset_path) + mask &= (df_response['Drug1'].isin(drug_subset)) & ((df_response['Drug2'].isnull()) | (df_response['Drug2'].isin(drug_subset))) + 
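            # The drug-subset restriction applied to 'mask' just above is repeated for
            # 'test_mask' below, so training/validation and test candidates are
            # filtered identically: Drug1 must be in the subset and Drug2 must be
            # either null or also in the subset.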
test_mask &= (df_response['Drug1'].isin(drug_subset)) & ((df_response['Drug2'].isnull()) | (df_response['Drug2'].isin(drug_subset))) + + if cell_types: + df_type = cellline_data.load_cell_metadata() + cell_ids = set() + for cell_type in cell_types: + cells = df_type[~df_type['TUMOR_TYPE'].isnull() & df_type['TUMOR_TYPE'].str.contains(cell_type, case=False)] + cell_ids |= set(cells['ANL_ID'].tolist()) + logger.info('Mapped sample tissue types for %s: %s', cell_type, set(cells['TUMOR_TYPE'].tolist())) + mask &= (df_response['Sample'].isin(cell_ids)) + test_mask &= (df_response['Sample'].isin(cell_ids)) + + + df_group = df_response[mask]['Group'].drop_duplicates().reset_index(drop=True) + + if cv_folds > 1: + selector = KFold(n_splits=cv_folds, shuffle=True, random_state=seed) + else: + selector = ShuffleSplit(n_splits=1, train_size=train_split, test_size=val_split, random_state=seed) + + splits = selector.split(df_group) + + train_indexes = [] + val_indexes = [] + test_indexes = [] + + for index, (train_group_index, val_group_index) in enumerate(splits): + train_groups = set(df_group.values[train_group_index]) + val_groups = set(df_group.values[val_group_index]) + train_index = df_response.index[df_response['Group'].isin(train_groups) & mask] + val_index = df_response.index[df_response['Group'].isin(val_groups) & mask] + test_index = df_response.index[~df_response['Group'].isin(train_groups) & ~df_response['Group'].isin(val_groups) & test_mask] + + train_indexes.append(train_index) + val_indexes.append(val_index) + test_indexes.append(test_index) + if logger.isEnabledFor(logging.DEBUG): + logger.debug('CV fold %d: train data = %s, val data = %s, test data = %s', index, train_index.shape[0], val_index.shape[0], test_index.shape[0]) + logger.debug(' train groups (%d): %s', df_response.loc[train_index]['Group'].nunique(), df_response.loc[train_index]['Group'].unique()) + logger.debug(' val groups ({%d}): %s', df_response.loc[val_index]['Group'].nunique(), df_response.loc[val_index]['Group'].unique()) + logger.debug(' test groups ({%d}): %s', df_response.loc[test_index]['Group'].nunique(), df_response.loc[test_index]['Group'].unique()) + + + self.partition_by = partition_by + self.cv_folds = cv_folds + self.train_indexes = train_indexes + self.val_indexes = val_indexes + self.test_indexes = test_indexes + + def build_feature_list(self, single=False): + input_features = collections.OrderedDict() + feature_shapes = collections.OrderedDict() + + if not self.agg_dose: + doses = ['dose1', 'dose2'] if not single else ['dose1'] + for dose in doses: + input_features[dose] = 'dose' + feature_shapes['dose'] = (1,) + + if self.encode_response_source: + input_features['response.source'] = 'response.source' + feature_shapes['response.source'] = (self.df_source.shape[1] - 1,) + + for fea in self.cell_features: + feature_type = 'cell.' + fea + feature_name = 'cell.' + fea + df_cell = getattr(self, self.cell_df_dict[fea]) + input_features[feature_name] = feature_type + feature_shapes[feature_type] = (df_cell.shape[1] - 1,) + + drugs = ['drug1', 'drug2'] if not single else ['drug1'] + for drug in drugs: + for fea in self.drug_features: + feature_type = 'drug.' + fea + feature_name = drug + '.' 
+ fea + df_drug = getattr(self, self.drug_df_dict[fea]) + input_features[feature_name] = feature_type + feature_shapes[feature_type] = (df_drug.shape[1] - 1,) + + input_dim = sum([np.prod(feature_shapes[x]) for x in input_features.values()]) + + self.input_features = input_features + self.feature_shapes = feature_shapes + self.input_dim = input_dim + + logger.info('Input features shapes:') + for k, v in self.input_features.items(): + logger.info(' {}: {}'.format(k, self.feature_shapes[v])) + logger.info('Total input dimensions: {}'.format(self.input_dim)) + + + def load(self, cache=None, ncols=None, scaling='std', dropna=None, + agg_dose=None, embed_feature_source=True, encode_response_source=True, + cell_features=['rnaseq'], drug_features=['descriptors', 'fingerprints'], + cell_feature_subset_path=None, drug_feature_subset_path=None, + drug_lower_response=1, drug_upper_response=-1, drug_response_span=0, + drug_median_response_min=-1, drug_median_response_max=1, + use_landmark_genes=False, use_filtered_genes=False, + preprocess_rnaseq=None, single=False, + # train_sources=['GDSC', 'CTRP', 'ALMANAC', 'NCI60'], + train_sources=['GDSC', 'CTRP', 'ALMANAC'], + # val_sources='train', + # test_sources=['CCLE', 'gCSI'], + test_sources=['train'], + partition_by='drug_pair'): + + params = locals().copy() + del params['self'] + + if not cell_features or 'none' in [x.lower() for x in cell_features]: + cell_features = [] + + if not drug_features or 'none' in [x.lower() for x in drug_features]: + drug_features = [] + + if cache and self.load_from_cache(cache, params): + self.build_feature_list(single=single) + return + + logger.info('Loading data from scratch ...') + + if agg_dose: + df_response = response_data.load_aggregated_single_response(target=agg_dose, combo_format=True) + else: + df_response = response_data.load_combined_dose_response() + + if logger.isEnabledFor(logging.INFO): + logger.info('Summary of combined dose response by source:') + logger.info(response_data.summarize_response_data(df_response, target=agg_dose)) + + all_sources = df_response['Source'].unique() + df_source = encode_sources(all_sources) + + if 'all' in train_sources: + train_sources = all_sources + if 'all' in test_sources: + test_sources = all_sources + elif 'train' in test_sources: + test_sources = train_sources + + train_sep_sources = [x for x in all_sources for y in train_sources if x.startswith(y)] + test_sep_sources = [x for x in all_sources for y in test_sources if x.startswith(y)] + + ids1 = df_response[['Drug1']].drop_duplicates().rename(columns={'Drug1':'Drug'}) + ids2 = df_response[['Drug2']].drop_duplicates().rename(columns={'Drug2':'Drug'}) + df_drugs_with_response = pd.concat([ids1, ids2]).drop_duplicates().dropna().reset_index(drop=True) + df_cells_with_response = df_response[['Sample']].drop_duplicates().reset_index(drop=True) + logger.info('Combined raw dose response data has %d unique samples and %d unique drugs', df_cells_with_response.shape[0], df_drugs_with_response.shape[0]) + + if agg_dose: + df_selected_drugs = None + else: + logger.info('Limiting drugs to those with response min <= %g, max >= %g, span >= %g, median_min <= %g, median_max >= %g ...', drug_lower_response, drug_upper_response, drug_response_span, drug_median_response_min, drug_median_response_max) + df_selected_drugs = response_data.select_drugs_with_response_range(df_response, span=drug_response_span, lower=drug_lower_response, upper=drug_upper_response, lower_median=drug_median_response_min, upper_median=drug_median_response_max) + 
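            # df_selected_drugs is a per-drug summary of the observed response range;
            # it is merged into df_drug_ids further below, so drugs falling outside the
            # requested response/median thresholds are dropped before the response
            # table is filtered and partitioned.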
logger.info('Selected %d drugs from %d', df_selected_drugs.shape[0], df_response['Drug1'].nunique()) + + + cell_feature_subset = read_set_from_file(cell_feature_subset_path) + drug_feature_subset = read_set_from_file(drug_feature_subset_path) + + for fea in cell_features: + fea = fea.lower() + if fea == 'rnaseq' or fea == 'expression': + df_cell_rnaseq = cellline_data.load_cell_rnaseq(ncols=ncols, scaling=scaling, use_landmark_genes=use_landmark_genes, use_filtered_genes=use_filtered_genes, feature_subset=cell_feature_subset, preprocess_rnaseq=preprocess_rnaseq, embed_feature_source=embed_feature_source) + + for fea in drug_features: + fea = fea.lower() + if fea == 'descriptors': + df_drug_desc = drug_data.load_drug_descriptors(ncols=ncols, scaling=scaling, dropna=dropna, feature_subset=drug_feature_subset) + elif fea == 'fingerprints': + df_drug_fp = drug_data.load_drug_fingerprints(ncols=ncols, scaling=scaling, dropna=dropna, feature_subset=drug_feature_subset) + + # df_drug_desc, df_drug_fp = drug_data.load_drug_data(ncols=ncols, scaling=scaling, dropna=dropna) + + cell_df_dict = {'rnaseq': 'df_cell_rnaseq'} + + drug_df_dict = {'descriptors': 'df_drug_desc', + 'fingerprints': 'df_drug_fp'} + + # df_cell_ids = df_cell_rnaseq[['Sample']].drop_duplicates() + # df_drug_ids = pd.concat([df_drug_desc[['Drug']], df_drug_fp[['Drug']]]).drop_duplicates() + + logger.info('Filtering drug response data...') + + df_cell_ids = df_cells_with_response + for fea in cell_features: + df_cell = locals()[cell_df_dict[fea]] + df_cell_ids = df_cell_ids.merge(df_cell[['Sample']]).drop_duplicates() + logger.info(' %d molecular samples with feature and response data', df_cell_ids.shape[0]) + + df_drug_ids = df_drugs_with_response + for fea in drug_features: + df_drug = locals()[drug_df_dict[fea]] + df_drug_ids = df_drug_ids.merge(df_drug[['Drug']]).drop_duplicates() + + if df_selected_drugs is not None: + df_drug_ids = df_drug_ids.merge(df_selected_drugs).drop_duplicates() + logger.info(' %d selected drugs with feature and response data', df_drug_ids.shape[0]) + + df_response = df_response[df_response['Sample'].isin(df_cell_ids['Sample']) & + df_response['Drug1'].isin(df_drug_ids['Drug']) & + (df_response['Drug2'].isin(df_drug_ids['Drug']) | df_response['Drug2'].isnull())] + + df_response = df_response[df_response['Source'].isin(train_sep_sources + test_sep_sources)] + + df_response.reset_index(drop=True, inplace=True) + + if logger.isEnabledFor(logging.INFO): + logger.info('Summary of filtered dose response by source:') + logger.info(response_data.summarize_response_data(df_response, target=agg_dose)) + + df_response = df_response.assign(Group = assign_partition_groups(df_response, partition_by)) + + self.agg_dose = agg_dose + self.cell_features = cell_features + self.drug_features = drug_features + self.cell_df_dict = cell_df_dict + self.drug_df_dict = drug_df_dict + self.df_source = df_source + self.df_response = df_response + self.embed_feature_source = embed_feature_source + self.encode_response_source = encode_response_source + self.all_sources = all_sources + self.train_sources = train_sources + self.test_sources = test_sources + self.train_sep_sources = train_sep_sources + self.test_sep_sources = test_sep_sources + self.partition_by = partition_by + + for var in (list(drug_df_dict.values()) + list(cell_df_dict.values())): + value = locals().get(var) + if value is not None: + setattr(self, var, value) + + self.build_feature_list(single=single) + + if cache: + self.save_to_cache(cache, params) + + + def 
get_cells_in_val(self): + + val_cell_ids = list(set(self.df_response.loc[self.val_indexes[0]]['Sample'].values)) + + return val_cell_ids + + + def get_drugs_in_val(self): + + if np.isin('Drug', self.df_response.columns.values): + val_drug_ids = list(set(self.df_response.loc[self.val_indexes[0]]['Drug'].values)) + else: + val_drug_ids = list(set(self.df_response.loc[self.val_indexes[0]]['Drug1'].values)) + + return val_drug_ids + + + def get_index_in_val(self): + + val_indices = list(set(self.val_indexes[0])) + + return val_indices + + diff --git a/Pilot1/Uno_UQ/model_utils_/__init__.py b/Pilot1/Uno_UQ/model_utils_/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/Pilot1/Uno_UQ/model_utils_/uno_model_utils.py b/Pilot1/Uno_UQ/model_utils_/uno_model_utils.py new file mode 100644 index 00000000..244c1ba8 --- /dev/null +++ b/Pilot1/Uno_UQ/model_utils_/uno_model_utils.py @@ -0,0 +1,307 @@ +#! /usr/bin/env python + + +import numpy as np + +import keras +from keras import backend as K +from keras.models import Model +from keras.layers import Input, Dense, Dropout +from keras.callbacks import Callback +from keras import regularizers +from keras.metrics import mean_squared_error, mean_absolute_error + +import candle + + +def r2_heteroscedastic(y_true, y_pred): + y_out = K.reshape(y_pred[:,:-1], K.shape(y_true)) + SS_res = K.sum(K.square(y_true - y_out)) + SS_tot = K.sum(K.square(y_true - K.mean(y_true))) + return (1 - SS_res/(SS_tot + K.epsilon())) + + +def mae_heteroscedastic(y_true, y_pred): + y_out = K.reshape(y_pred[:,:-1], K.shape(y_true)) + return mean_absolute_error(y_true, y_out) + +def mse_heteroscedastic(y_true, y_pred): + y_out = K.reshape(y_pred[:,:-1], K.shape(y_true)) + return mean_squared_error(y_true, y_out) + +def meanS_heteroscesdastic(y_true, y_pred): + log_sig2 = y_pred[:,1] + return K.mean(log_sig2) + +def quantile_loss(quantile, y_true, y_pred): + error = (y_true - y_pred) + return K.mean(K.maximum(quantile*error, (quantile-1)*error), axis=-1) + +def quantile50(y_true, y_pred): + y_out0 = K.reshape(y_pred[:,0], K.shape(y_true)) + error = (y_true-y_out0) + quantile = 0.5 + return quantile_loss(quantile, y_true, y_out0) + + +def quantile10(y_true, y_pred): + y_out1 = K.reshape(y_pred[:,1], K.shape(y_true)) + error = (y_true-y_out1) + quantile = 0.1 + return quantile_loss(quantile, y_true, y_out1) + + +def quantile90(y_true, y_pred): + y_out2 = K.reshape(y_pred[:,2], K.shape(y_true)) + error = (y_true-y_out2) + quantile = 0.9 + return quantile_loss(quantile, y_true, y_out2) + + +class ModelRecorder(Callback): + def __init__(self, save_all_models=False): + Callback.__init__(self) + self.save_all_models = save_all_models + candle.register_permanent_dropout() + + def on_train_begin(self, logs={}): + self.val_losses = [] + self.best_val_loss = np.Inf + self.best_model = None + + def on_epoch_end(self, epoch, logs={}): + val_loss = logs.get('val_loss') + self.val_losses.append(val_loss) + if val_loss < self.best_val_loss: + self.best_model = keras.models.clone_model(self.model) + self.best_val_loss = val_loss + + +class SimpleWeightSaver(Callback): + + def __init__(self, fname): + self.fname = fname + + def set_model(self, model): + if isinstance(model.layers[-2], Model): + self.model = model.layers[-2] + else: + self.model = model + + def on_train_end(self, logs={}): + self.model.save_weights(self.fname) + + +def build_model(loader, args, logger=None, permanent_dropout=True, silent=False): + if args.loss == 'heteroscedastic': + model = 
build_heteroscedastic_model(loader, args, logger, permanent_dropout, silent) + elif args.loss == 'quantile': + model = build_quantile_model(loader, args, logger, permanent_dropout, silent) + else: + model = build_homoscedastic_model(loader, args, logger, permanent_dropout, silent) + + return model + +def build_feature_model(input_shape, name='', dense_layers=[1000, 1000], + activation='relu', residual=False, + dropout_rate=0, permanent_dropout=True, + reg_l2=0): + x_input = Input(shape=input_shape) + h = x_input + for i, layer in enumerate(dense_layers): + x = h + if reg_l2 > 0: + h = Dense(layer, activation=activation, kernel_regularizer=regularizers.l2(reg_l2))(h) + else: + h = Dense(layer, activation=activation)(h) + if dropout_rate > 0: + if permanent_dropout: + h = candle.PermanentDropout(dropout_rate)(h) + else: + h = Dropout(dropout_rate)(h) + if residual: + try: + h = keras.layers.add([h, x]) + except ValueError: + pass + model = Model(x_input, h, name=name) + return model + + +def build_homoscedastic_model(loader, args, logger=None, permanent_dropout=True, silent=False): + input_models = {} + dropout_rate = args.drop + reg_l2 = args.reg_l2 + for fea_type, shape in loader.feature_shapes.items(): + base_type = fea_type.split('.')[0] + if base_type in ['cell', 'drug']: + box = build_feature_model(input_shape=shape, name=fea_type, + dense_layers=args.dense_feature_layers, + dropout_rate=dropout_rate, permanent_dropout=permanent_dropout, + reg_l2=reg_l2) + if not silent: + logger.debug('Feature encoding submodel for %s:', fea_type) + box.summary(print_fn=logger.debug) + input_models[fea_type] = box + + inputs = [] + encoded_inputs = [] + for fea_name, fea_type in loader.input_features.items(): + shape = loader.feature_shapes[fea_type] + fea_input = Input(shape, name='input.'+fea_name) + inputs.append(fea_input) + if fea_type in input_models: + input_model = input_models[fea_type] + encoded = input_model(fea_input) + else: + encoded = fea_input + encoded_inputs.append(encoded) + + merged = keras.layers.concatenate(encoded_inputs) + + h = merged + for i, layer in enumerate(args.dense): + x = h + if reg_l2 > 0: + h = Dense(layer, activation=args.activation, kernel_regularizer=regularizers.l2(reg_l2))(h) + else: + h = Dense(layer, activation=args.activation)(h) + if dropout_rate > 0: + if permanent_dropout: + h = candle.PermanentDropout(dropout_rate)(h) + else: + h = Dropout(dropout_rate)(h) + if args.residual: + try: + h = keras.layers.add([h, x]) + except ValueError: + pass + output = Dense(1)(h) + + return Model(inputs, output) + + +def build_heteroscedastic_model(loader, args, logger=None, permanent_dropout=True, silent=False): + input_models = {} + dropout_rate = args.drop + reg_l2 = args.reg_l2 + for fea_type, shape in loader.feature_shapes.items(): + base_type = fea_type.split('.')[0] + if base_type in ['cell', 'drug']: + box = build_feature_model(input_shape=shape, name=fea_type, + dense_layers=args.dense_feature_layers, + dropout_rate=dropout_rate, permanent_dropout=permanent_dropout, + reg_l2=reg_l2) + if not silent: + logger.debug('Feature encoding submodel for %s:', fea_type) + box.summary(print_fn=logger.debug) + input_models[fea_type] = box + + inputs = [] + encoded_inputs = [] + for fea_name, fea_type in loader.input_features.items(): + shape = loader.feature_shapes[fea_type] + fea_input = Input(shape, name='input.'+fea_name) + inputs.append(fea_input) + if fea_type in input_models: + input_model = input_models[fea_type] + encoded = input_model(fea_input) + else: + encoded = 
fea_input + encoded_inputs.append(encoded) + + merged = keras.layers.concatenate(encoded_inputs) + + h = merged + for i, layer in enumerate(args.dense): + x = h + if reg_l2 > 0: + h = Dense(layer, activation=args.activation, kernel_regularizer=regularizers.l2(reg_l2))(h) + else: + h = Dense(layer, activation=args.activation)(h) + if dropout_rate > 0: + if permanent_dropout: + h = candle.PermanentDropout(dropout_rate)(h) + else: + h = Dropout(dropout_rate)(h) + if args.residual: + try: + h = keras.layers.add([h, x]) + except ValueError: + pass + output = Dense(2, bias_initializer='ones')(h) + + return Model(inputs, output) + +def build_quantile_model(loader, args, logger=None, permanent_dropout=True, silent=False): + input_models = {} + dropout_rate = args.drop + reg_l2 = args.reg_l2 + for fea_type, shape in loader.feature_shapes.items(): + base_type = fea_type.split('.')[0] + if base_type in ['cell', 'drug']: + box = build_feature_model(input_shape=shape, name=fea_type, + dense_layers=args.dense_feature_layers, + dropout_rate=dropout_rate, + permanent_dropout=permanent_dropout, + reg_l2=reg_l2) + if not silent: + logger.debug('Feature encoding submodel for %s:', fea_type) + box.summary(print_fn=logger.debug) + input_models[fea_type] = box + + inputs = [] + encoded_inputs = [] + for fea_name, fea_type in loader.input_features.items(): + shape = loader.feature_shapes[fea_type] + fea_input = Input(shape, name='input.'+fea_name) + inputs.append(fea_input) + if fea_type in input_models: + input_model = input_models[fea_type] + encoded = input_model(fea_input) + else: + encoded = fea_input + encoded_inputs.append(encoded) + + merged = keras.layers.concatenate(encoded_inputs) + + h = merged + for i, layer in enumerate(args.dense): + x = h + h = Dense(layer, activation=args.activation, kernel_regularizer=regularizers.l2(args.reg_l2))(h) + if dropout_rate > 0: + if permanent_dropout: + h = candle.PermanentDropout(dropout_rate)(h) + else: + h = Dropout(dropout_rate)(h) + if args.residual: + try: + h = keras.layers.add([h, x]) + except ValueError: + pass + output = Dense(3, bias_initializer='ones')(h) + + return Model(inputs, output) + + +def heteroscedastic_loss(y_true, y_pred): + y_shape = K.shape(y_true) + y_out = K.reshape(y_pred[:,0], y_shape) + diff_sq = K.square(y_out - y_true) + log_sig2 = y_pred[:,1] + + return K.mean(K.exp(-log_sig2) * diff_sq + log_sig2) + + +def tilted_loss(quantile, y_true, f): + error = (y_true-f) + return K.mean(K.maximum(quantile*error, (quantile-1)*error), axis=-1) + + +def triple_quantile_loss(y_true, y_pred): + y_shape = K.shape(y_true) + y_out0 = K.reshape(y_pred[:,0], y_shape) + y_out1 = K.reshape(y_pred[:,1], y_shape) + y_out2 = K.reshape(y_pred[:,2], y_shape) + + return tilted_loss(0.1, y_true, y_out1) + tilted_loss(0.9, y_true, y_out2) + 2. 
* tilted_loss(0.5, y_true, y_out0) diff --git a/Pilot1/Uno_UQ/uno_gCSI_modelUQ.txt b/Pilot1/Uno_UQ/uno_gCSI_modelUQ.txt new file mode 100644 index 00000000..71fec820 --- /dev/null +++ b/Pilot1/Uno_UQ/uno_gCSI_modelUQ.txt @@ -0,0 +1,39 @@ +[Global_Params] +train_sources=['gCSI'] +test_sources=['train'] +cell_types=None +cell_features=['rnaseq'] +drug_features=['descriptors', 'fingerprints'] +dense=[1000, 1000, 1000] +dense_feature_layers=[1000, 1000, 1000] +activation='relu' +loss='mse' +optimizer='adam' +scaling='std' +drop=0 +epochs=10 +batch_size=32 +val_split=0.2 +cv=1 +max_val_loss=1.0 +learning_rate=None +base_lr=None +residual=False +reduce_lr=False +warmup_lr=False +batch_normalization=False +feature_subsample=0 +rng_seed=2018 +save_path='save_gCSI/' +no_gen=False +verbose = False +single=True +agg_dose='AUC' +no_feature_source=True +no_response_source=True +use_landmark_genes=True +partition_by='cell' + +[Monitor_Params] +solr_root='' +timeout=3600 diff --git a/Pilot1/Uno_UQ/uno_holdoutUQ_data.py b/Pilot1/Uno_UQ/uno_holdoutUQ_data.py new file mode 100644 index 00000000..165f940f --- /dev/null +++ b/Pilot1/Uno_UQ/uno_holdoutUQ_data.py @@ -0,0 +1,109 @@ +#! /usr/bin/env python + +from __future__ import division, print_function + +import logging +import os + +from keras import backend as K + +import data_utils_.uno as uno +import candle + +import data_utils_.uno_combined_data_loader as uno_combined_data_loader + + +logger = logging.getLogger(__name__) + +os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' + + +def initialize_parameters(): + + # Build benchmark object + unoBmk = uno.BenchmarkUno(uno.file_path, 'uno_default_model.txt', 'keras', + prog='uno_holdoutUQ_data', desc='Build data split for UQ analysis in the problem of prediction of tumor response to drug pairs.') + + # Initialize parameters + gParameters = candle.initialize_parameters(unoBmk) + #benchmark.logger.info('Params: {}'.format(gParameters)) + + return gParameters + + +def run(params): + args = candle.ArgumentStruct(**params) + candle.set_seed(args.rng_seed) + ext = uno.extension_from_parameters(args) + candle.verify_path(args.save_path) + prefix = args.save_path + ext + logfile = args.logfile if args.logfile else prefix+'.log' + uno.set_up_logger(logfile, logger, uno.loggerUno, args.verbose) + logger.info('Params: {}'.format(params)) + + loader = uno_combined_data_loader.CombinedDataLoader(args.rng_seed) + loader.load(cache=args.cache, + ncols=args.feature_subsample, + agg_dose=args.agg_dose, + cell_features=args.cell_features, + drug_features=args.drug_features, + drug_median_response_min=args.drug_median_response_min, + drug_median_response_max=args.drug_median_response_max, + use_landmark_genes=args.use_landmark_genes, + use_filtered_genes=args.use_filtered_genes, + cell_feature_subset_path=args.cell_feature_subset_path or args.feature_subset_path, + drug_feature_subset_path=args.drug_feature_subset_path or args.feature_subset_path, + preprocess_rnaseq=args.preprocess_rnaseq, + single=args.single, + train_sources=args.train_sources, + test_sources=args.test_sources, + embed_feature_source=not args.no_feature_source, + encode_response_source=not args.no_response_source, + partition_by=args.partition_by + ) + + target = args.agg_dose or 'Growth' + val_split = args.val_split + train_split = 1 - val_split + + loader.partition_data(partition_by=args.partition_by, + cv_folds=args.cv, train_split=train_split, + val_split=val_split, cell_types=args.cell_types, + by_cell=args.by_cell, by_drug=args.by_drug, + 
cell_subset_path=args.cell_subset_path, + drug_subset_path=args.drug_subset_path + ) + + print('partition_by: ', args.partition_by) + if args.partition_by == 'drug_pair': + fname_drugs = 'infer_drug_ids' + pds = loader.get_drugs_in_val() + with open(fname_drugs, 'w') as f: + for item in pds: + f.write('%s\n' % item) + logger.info('Drug IDs in holdout set written in file: {}'.format(fname_drugs)) + elif args.partition_by == 'cell': + fname_cells = 'infer_cell_ids' + pcs = loader.get_cells_in_val() + with open(fname_cells, 'w') as f: + for item in pcs: + f.write('%s\n' % item) + logger.info('Cell IDs in holdout set written in file: {}'.format(fname_cells)) + else : # + fname_index = 'infer_index_ids' + pins = loader.get_index_in_val() + with open(fname_index, 'w') as f: + for item in pins: + f.write('%s\n' % item) + logger.info('Indices in holdout set written in file: {}'.format(fname_index)) + + +def main(): + params = initialize_parameters() + run(params) + + +if __name__ == '__main__': + main() + if K.backend() == 'tensorflow': + K.clear_session() diff --git a/Pilot1/Uno_UQ/uno_inferUQ_keras2.py b/Pilot1/Uno_UQ/uno_inferUQ_keras2.py new file mode 100644 index 00000000..af1c7934 --- /dev/null +++ b/Pilot1/Uno_UQ/uno_inferUQ_keras2.py @@ -0,0 +1,296 @@ +#! /usr/bin/env python + +from __future__ import division, print_function + +import argparse +import logging +import os + +import numpy as np +import pandas as pd + +from itertools import cycle + +from keras import backend as K + +import keras +from keras.utils import get_custom_objects + +import data_utils_.uno as uno +import candle + +import data_utils_.uno_combined_data_loader as uno_combined_data_loader +import data_utils_.uno_combined_data_generator as uno_combined_data_generator +import model_utils_.uno_model_utils as uno_model_utils + +logger = logging.getLogger(__name__) + +os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' + + +additional_definitions_local = [ +{'name':'uq_infer_file', + 'default':argparse.SUPPRESS, + 'action':'store', + 'help':'File to do inference'}, +{'name':'uq_infer_given_drugs', + 'type': candle.str2bool, + 'default': False, + 'help':'Use given inference file to obtain drug ids to do inference'}, +{'name':'uq_infer_given_cells', + 'type': candle.str2bool, + 'default': False, + 'help':'Use given inference file to obtain cell ids to do inference'}, +{'name':'uq_infer_given_indices', + 'type': candle.str2bool, + 'default': False, + 'help':'Use given inference file to obtain indices to do inference'}, +{'name':'weights_file', + 'default':'saved.weights.h5', + 'help':'trained weights file (loading model file alone sometimes does not work in keras)'}, +{'name':'n_pred', + 'type':int, + 'default':1, + 'help':'the number of predictions to make for each sample-drug combination for uncertainty quantification'} +] + +required_local = ( 'model_file', 'weights_file', 'uq_infer_file', + 'agg_dose', 'batch_size') + + +def initialize_parameters(): + + # Build benchmark object + unoBmk = uno.BenchmarkUno(uno.file_path, 'uno_default_model.txt', 'keras', + prog='uno_inferUQ', desc='Read models to predict tumor response to single and paired drugs.') + + unoBmk.additional_definitions += additional_definitions_local + unoBmk.required = unoBmk.required.union(required_local) + + # Initialize parameters + gParameters = candle.initialize_parameters(unoBmk) + #benchmark.logger.info('Params: {}'.format(gParameters)) + + return gParameters + + +def from_file(args, model): + + df_data = pd.read_csv(args.uq_infer_file, sep='\t') + logger.info('data 
shape: {}'.format(df_data.shape)) + logger.info('Size of data to infer: {}'.format(df_data.shape)) + + test_indices = range(df_data.shape[0]) + target_str = args.agg_dose or 'Growth' + + # Extract size of input layers to get number of features + num_features_list = [] + feature_names_list = [] + for layer in model.layers: # All layers in model + dict = layer.get_config() # getting layer config info + name = dict['name'] # getting layer name + if name.find('input') > -1: # if layer is an input layer + feature_names_list.append(name.split('.')[-1]) + size_ = dict['batch_input_shape'] # get layer size + num_features_list.append(size_[1]) + + feature_names_list.append('dragon7') + + test_gen = uno_combined_data_generator.FromFileDataGenerator(df_data, test_indices, + target_str, feature_names_list, num_features_list, + batch_size=args.batch_size, shuffle=False) + + return test_gen + + +def given_drugs(args, loader): + + test_gen = uno_combined_data_generator.CombinedDataGenerator(loader, partition='test', batch_size=args.batch_size) + + # Include specified drugs + include_drugs = uno.read_IDs_file(args.uq_infer_file) + df_response = test_gen.data.df_response + if np.isin('Drug', df_response.columns.values): + df = df_response[['Drug']] + index = df.index[df['Drug'].isin(include_drugs)] + else: + df = df_response[['Drug1', 'Drug2']] + index = df.index[df['Drug1'].isin(include_drugs) | + df['Drug2'].isin(include_drugs)] + + # Update object + test_gen.index = index + test_gen.index_cycle = cycle(index) + test_gen.size = len(index) + test_gen.steps = np.ceil(test_gen.size / args.batch_size) + + return test_gen + + +def given_cells(args, loader): + + test_gen = uno_combined_data_generator.CombinedDataGenerator(loader, partition='test', batch_size=args.batch_size) + + # Include specified cells + include_cells = uno.read_IDs_file(args.uq_infer_file) + df = test_gen.data.df_response[['Sample']] + index = df.index[df['Sample'].isin(include_cells)] + + # Update object + test_gen.index = index + test_gen.index_cycle = cycle(index) + test_gen.size = len(index) + test_gen.steps = np.ceil(test_gen.size / args.batch_size) + + return test_gen + + +def given_indices(args, loader): + + test_gen = uno_combined_data_generator.CombinedDataGenerator(loader, partition='test', batch_size=args.batch_size) + + # Include specified indices + index = uno.read_IDs_file(args.uq_infer_file) + + # Update object + test_gen.index = index + test_gen.index_cycle = cycle(index) + test_gen.size = len(index) + test_gen.steps = np.ceil(test_gen.size / args.batch_size) + + return test_gen + + +def run(params): + args = candle.ArgumentStruct(**params) + candle.set_seed(args.rng_seed) + logfile_def = 'uno_infer_from_' + args.uq_infer_file + '.log' + logfile = args.logfile if args.logfile else logfile_def + uno.set_up_logger(logfile, logger, uno.loggerUno, args.verbose) + logger.info('Params: {}'.format(params)) + + ext = uno.extension_from_parameters(args) + candle.verify_path(args.save_path) + prefix = args.save_path + 'uno' + ext + + # Load trained model + candle.register_permanent_dropout() + model = keras.models.load_model(args.model_file, compile=False) + model.load_weights(args.weights_file) + logger.info('Loaded model:') + model.summary(print_fn=logger.info) + + # Determine output to infer + target = args.agg_dose or 'Growth' + + if (args.uq_infer_given_drugs or args.uq_infer_given_cells or args.uq_infer_given_indices): + loader = uno_combined_data_loader.CombinedDataLoader(args.rng_seed) + loader.load(cache=args.cache, + 
ncols=args.feature_subsample, + agg_dose=args.agg_dose, + cell_features=args.cell_features, + drug_features=args.drug_features, + drug_median_response_min=args.drug_median_response_min, + drug_median_response_max=args.drug_median_response_max, + use_landmark_genes=args.use_landmark_genes, + use_filtered_genes=args.use_filtered_genes, + cell_feature_subset_path=args.cell_feature_subset_path or args.feature_subset_path, + drug_feature_subset_path=args.drug_feature_subset_path or args.feature_subset_path, + preprocess_rnaseq=args.preprocess_rnaseq, + single=args.single, + train_sources=args.train_sources, + test_sources=args.test_sources, + embed_feature_source=not args.no_feature_source, + encode_response_source=not args.no_response_source, + ) + + if args.uq_infer_given_drugs: + test_gen = given_drugs(args, loader) + elif args.uq_infer_given_cells: + test_gen = given_cells(args, loader) + else: + test_gen = given_indices(args, loader) + + else: + test_gen = from_file(args, model) + + + df_test = test_gen.get_response(copy=True) + y_test = df_test[target].values + + for i in range(args.n_pred): + + if args.no_gen: + x_test_list, y_test = test_gen.get_slice(size=test_gen.size, single=args.single) + y_test_pred = model.predict(x_test_list, batch_size=args.batch_size) + else: + test_gen.reset() + y_test_pred = model.predict_generator(test_gen.flow(single=args.single), test_gen.steps) + y_test_pred = y_test_pred[:test_gen.size] + + if args.loss == 'heteroscedastic': + y_test_pred_ = y_test_pred[:,0] + s_test_pred = y_test_pred[:,1] + + y_test_pred = y_test_pred_.flatten() + + df_test['Predicted_'+target+'_'+str(i+1)] = y_test_pred + df_test['Pred_S_'+target+'_'+str(i+1)] = s_test_pred + + pred_fname = prefix + '.predicted_INFER_HET.tsv' + + elif args.loss == 'quantile': + + y_test_pred_50q = y_test_pred[:,0] + y_test_pred_10q = y_test_pred[:,1] + y_test_pred_90q = y_test_pred[:,2] + + y_test_pred = y_test_pred_50q.flatten() # 50th quantile prediction + + df_test['Predicted_50q_'+target+'_'+str(i+1)] = y_test_pred + df_test['Predicted_10q_'+target+'_'+str(i+1)] = y_test_pred_10q.flatten() + df_test['Predicted_90q_'+target+'_'+str(i+1)] = y_test_pred_90q.flatten() + + pred_fname = prefix + '.predicted_INFER_QTL.tsv' + + else: + y_test_pred = y_test_pred.flatten() + df_test['Predicted_'+target+'_'+str(i+1)] = y_test_pred + pred_fname = prefix + '.predicted_INFER.tsv' + + if args.n_pred < 21: + scores = uno.evaluate_prediction(y_test, y_test_pred) + uno.log_evaluation(scores, logger) + + df_pred = df_test + if args.agg_dose: + if args.single: + df_pred.sort_values(['Sample', 'Drug1', target], inplace=True) + else: + df_pred.sort_values(['Sample', 'Drug1', 'Drug2', target], inplace=True) + else: + if args.single: + df_pred.sort_values(['Sample', 'Drug1', 'Dose1', 'Growth'], inplace=True) + else: + df_pred.sort_values(['Sample', 'Drug1', 'Drug2', 'Dose1', 'Dose2', 'Growth'], inplace=True) + + df_pred.to_csv(pred_fname, sep='\t', index=False, float_format='%.4g') + logger.info('Predictions stored in file: {}'.format(pred_fname)) + + + if K.backend() == 'tensorflow': + K.clear_session() + + logger.handlers = [] + + +def main(): + params = initialize_parameters() + run(params) + + +if __name__ == '__main__': + main() + if K.backend() == 'tensorflow': + K.clear_session() + diff --git a/Pilot1/Uno_UQ/uno_trainUQ_keras2.py b/Pilot1/Uno_UQ/uno_trainUQ_keras2.py new file mode 100644 index 00000000..8a06da16 --- /dev/null +++ b/Pilot1/Uno_UQ/uno_trainUQ_keras2.py @@ -0,0 +1,404 @@ +#! 
/usr/bin/env python + +from __future__ import division, print_function + +import argparse +import logging +import os + +import numpy as np +import pandas as pd + + +from keras import backend as K +from keras import optimizers +from keras.models import Model +from keras.callbacks import Callback, ModelCheckpoint, ReduceLROnPlateau, LearningRateScheduler, TensorBoard +from keras.utils.vis_utils import plot_model + +import data_utils_.uno as uno +import candle + +import data_utils_.uno_combined_data_loader as uno_combined_data_loader +import data_utils_.uno_combined_data_generator as uno_combined_data_generator +import model_utils_.uno_model_utils as uno_model_utils + +from model_utils_.uno_model_utils import heteroscedastic_loss, triple_quantile_loss + +logger = logging.getLogger(__name__) + +os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' + +additional_definitions = [ +{'name':'uq_exclude_drugs_file', + 'default':argparse.SUPPRESS, + 'action':'store', + 'help':'File with drug ids to exclude from training'}, +{'name':'uq_exclude_cells_file', + 'default':argparse.SUPPRESS, + 'action':'store', + 'help':'File with cell ids to exclude from training'}, +{'name':'uq_exclude_indices_file', + 'default':argparse.SUPPRESS, + 'action':'store', + 'help':'File with indices to exclude from training'}, +{'name':'exclude_indices', 'nargs':'+', + 'default': [], + 'help':'indices to exclude'}, +{'name':'reg_l2', + 'type': float, + 'default': 0., + 'help':'weight of regularization for l2 norm of nn weights'} +] + +required = ['exclude_drugs', 'exclude_cells', 'exclude_indices'] + +class UQUno(candle.Benchmark): + def set_locals(self): + """Functionality to set variables specific for the benchmark + - required: set of required parameters for the benchmark. + - additional_definitions: list of dictionaries describing the additional parameters for the + benchmark. 
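+        Example (sketch): an element of additional_definitions is a plain
+        dict mirroring the entries defined above in this module, e.g.
+        {'name': 'reg_l2', 'type': float, 'default': 0.,
+         'help': 'weight of regularization for l2 norm of nn weights'},
+        and required is a set of parameter names such as 'exclude_drugs'.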
+ """ + + if required is not None: + self.required = set(uno.required) + self.required.update(required) + if additional_definitions is not None: + self.additional_definitions = additional_definitions + uno.additional_definitions + + + +def initialize_parameters(): + + # Build benchmark object + unoUQBmk = UQUno(uno.file_path, 'uno_defaultUQ_model.txt', 'keras', + prog='uno_trainUQ', desc='Build neural network based models to predict tumor response to single and paired drugs, including UQ analysis.') + + # Initialize parameters + gParameters = candle.initialize_parameters(unoUQBmk) + #benchmark.logger.info('Params: {}'.format(gParameters)) + + return gParameters + + +def run(params): + args = candle.ArgumentStruct(**params) + candle.set_seed(args.rng_seed) + ext = uno.extension_from_parameters(args) + candle.verify_path(args.save_path) + prefix = args.save_path + 'uno' + ext + logfile = args.logfile if args.logfile else prefix+'.log' + uno.set_up_logger(logfile, logger, uno.loggerUno, args.verbose) + logger.info('Params: {}'.format(params)) + + # Exclude drugs / cells for UQ + if 'uq_exclude_drugs_file' in params.keys(): + args.exclude_drugs = uno.read_IDs_file(args.uq_exclude_drugs_file) + logger.info('Drugs to exclude: {}'.format(args.exclude_drugs)) + else: + args.exclude_drugs = [] + if 'uq_exclude_cells_file' in params.keys(): + args.exclude_cells = uno.read_IDs_file(args.uq_exclude_cells_file) + logger.info('Cells to exclude: {}'.format(args.exclude_cells)) + else: + args.exclude_cells = [] + + if 'uq_exclude_indices_file' in params.keys(): + exclude_indices_ = uno.read_IDs_file(args.uq_exclude_indices_file) + args.exclude_indices = [int(x) for x in exclude_indices_] + logger.info('Indices to exclude: {}'.format(args.exclude_indices)) + else: + args.exclude_indices = [] + + + if (len(args.gpus) > 0): + import tensorflow as tf + config = tf.ConfigProto() + config.gpu_options.allow_growth = True + config.gpu_options.visible_device_list = ",".join(map(str, args.gpus)) + K.set_session(tf.Session(config=config)) + + loader = uno_combined_data_loader.CombinedDataLoader(seed=args.rng_seed) + loader.load(cache=args.cache, + ncols=args.feature_subsample, + agg_dose=args.agg_dose, + cell_features=args.cell_features, + drug_features=args.drug_features, + drug_median_response_min=args.drug_median_response_min, + drug_median_response_max=args.drug_median_response_max, + use_landmark_genes=args.use_landmark_genes, + use_filtered_genes=args.use_filtered_genes, + cell_feature_subset_path=args.cell_feature_subset_path or args.feature_subset_path, + drug_feature_subset_path=args.drug_feature_subset_path or args.feature_subset_path, + preprocess_rnaseq=args.preprocess_rnaseq, + single=args.single, + train_sources=args.train_sources, + test_sources=args.test_sources, + embed_feature_source=not args.no_feature_source, + encode_response_source=not args.no_response_source, + ) + + target = args.agg_dose or 'Growth' + val_split = args.val_split + train_split = 1 - val_split + + loader.partition_data(partition_by=args.partition_by, + cv_folds=args.cv, train_split=train_split, val_split=val_split, + cell_types=args.cell_types, by_cell=args.by_cell, by_drug=args.by_drug, + cell_subset_path=args.cell_subset_path, + drug_subset_path=args.drug_subset_path, + exclude_cells=args.exclude_cells, + exclude_drugs=args.exclude_drugs, + exclude_indices=args.exclude_indices + ) + + model = uno_model_utils.build_model(loader, args, logger) + logger.info('Combined model:') + model.summary(print_fn=logger.info) + # 
plot_model(model, to_file=prefix+'.model.png', show_shapes=True) + + if args.cp: + model_json = model.to_json() + with open(prefix+'.model.json', 'w') as f: + print(model_json, file=f) + + def warmup_scheduler(epoch): + lr = args.learning_rate or base_lr * args.batch_size/100 + if epoch <= 5: + K.set_value(model.optimizer.lr, (base_lr * (5-epoch) + lr * epoch) / 5) + logger.debug('Epoch {}: lr={:.5g}'.format(epoch, K.get_value(model.optimizer.lr))) + return K.get_value(model.optimizer.lr) + + df_pred_list = [] + + cv_ext = '' + cv = args.cv if args.cv > 1 else 1 + + for fold in range(cv): + if args.cv > 1: + logger.info('Cross validation fold {}/{}:'.format(fold+1, cv)) + cv_ext = '.cv{}'.format(fold+1) + +# model = uno_model_utils.build_model(loader, args, logger, silent=True) + + template_model = uno_model_utils.build_model(loader, args, logger, silent=True) + if args.initial_weights: + logger.info("Loading weights from {}".format(args.initial_weights)) + template_model.load_weights(args.initial_weights) + + if len(args.gpus) > 1: + from keras.utils import multi_gpu_model + gpu_count = len(args.gpus) + logger.info("Multi GPU with {} gpus".format(gpu_count)) + model = multi_gpu_model(template_model, cpu_merge=False, gpus=gpu_count) + else: + model = template_model + + + optimizer = optimizers.deserialize({'class_name': args.optimizer, 'config': {}}) + base_lr = args.base_lr or K.get_value(optimizer.lr) + if args.learning_rate: + K.set_value(optimizer.lr, args.learning_rate) + + if args.loss == 'heteroscedastic': + logger.info('Training heteroscedastic model:') + model.compile(loss=heteroscedastic_loss, optimizer=optimizer, metrics=[uno_model_utils.mae_heteroscedastic, uno_model_utils.r2_heteroscedastic, uno_model_utils.meanS_heteroscesdastic]) + elif args.loss == 'quantile': + logger.info('Training quantile model:') + model.compile(loss=triple_quantile_loss, optimizer=optimizer, metrics=[uno_model_utils.quantile50, uno_model_utils.quantile10, uno_model_utils.quantile90]) + else: + logger.info('Training homoscedastic model:') + model.compile(loss=args.loss, optimizer=optimizer, metrics=[candle.mae, candle.r2]) + + # calculate trainable and non-trainable params + params.update(candle.compute_trainable_params(model)) + + candle_monitor = candle.CandleRemoteMonitor(params=params) + timeout_monitor = candle.TerminateOnTimeOut(params['timeout']) + + reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=0.00001) + warmup_lr = LearningRateScheduler(warmup_scheduler) + #checkpointer = ModelCheckpoint(prefix+cv_ext+'.weights.h5', save_best_only=True, save_weights_only=True) + checkpointer = candle.MultiGPUCheckpoint(prefix + cv_ext + '.model.h5', save_best_only=True) + tensorboard = TensorBoard(log_dir="tb/{}{}{}".format(args.tb_prefix, ext, cv_ext)) + history_logger = candle.LoggingCallback(logger.debug) +# model_recorder = uno_model_utils.ModelRecorder() + + # callbacks = [history_logger, model_recorder] + callbacks = [candle_monitor, timeout_monitor, history_logger]#, model_recorder] + if args.reduce_lr: + callbacks.append(reduce_lr) + if args.warmup_lr: + callbacks.append(warmup_lr) + if args.cp: + callbacks.append(checkpointer) + if args.tb: + callbacks.append(tensorboard) + if args.save_weights: + callbacks.append(uno_model_utils.SimpleWeightSaver(args.save_path + '/' + args.save_weights)) + + + train_gen = uno_combined_data_generator.CombinedDataGenerator(loader, fold=fold, batch_size=args.batch_size, shuffle=args.shuffle) + val_gen = 
uno_combined_data_generator.CombinedDataGenerator(loader, partition='val', fold=fold, batch_size=args.batch_size, shuffle=args.shuffle) + + df_val = val_gen.get_response(copy=True) + y_val = df_val[target].values + y_shuf = np.random.permutation(y_val) + uno.log_evaluation(uno.evaluate_prediction(y_val, y_shuf), logger, + description='Between random pairs in y_val:') + + if args.no_gen: + x_train_list, y_train = train_gen.get_slice(size=train_gen.size, single=args.single) + x_val_list, y_val = val_gen.get_slice(size=val_gen.size, single=args.single) + history = model.fit(x_train_list, y_train, + batch_size=args.batch_size, + epochs=args.epochs, + callbacks=callbacks, + validation_data=(x_val_list, y_val)) + else: + logger.info('Data points per epoch: train = %d, val = %d',train_gen.size, val_gen.size) + logger.info('Steps per epoch: train = %d, val = %d',train_gen.steps, val_gen.steps) + history = model.fit_generator(train_gen, train_gen.steps, + epochs=args.epochs, + callbacks=callbacks, + validation_data=val_gen, + validation_steps=val_gen.steps) + +# if args.cp: +# model.load_weights(prefix+cv_ext+'.weights.h5') + # model = model_recorder.best_model + + if args.no_gen: + y_val_pred = model.predict(x_val_list, batch_size=args.batch_size) + else: + val_gen.reset() + y_val_pred = model.predict_generator(val_gen, val_gen.steps + 1) + y_val_pred = y_val_pred[:val_gen.size] + + if args.loss == 'heteroscedastic': + y_val_pred_ = y_val_pred[:,0] + s_val_pred = y_val_pred[:,1] + + y_val_pred = y_val_pred_.flatten() + + df_val['Predicted_'+target] = y_val_pred + df_val[target+'_Error'] = y_val_pred-y_val + df_val['Pred_S_'+target] = s_val_pred + + elif args.loss == 'quantile': + y_val_pred_50q = y_val_pred[:,0] + y_val_pred_10q = y_val_pred[:,1] + y_val_pred_90q = y_val_pred[:,2] + + y_val_pred = y_val_pred_50q.flatten() # 50th quantile prediction + + df_val['Predicted_50q_'+target] = y_val_pred + df_val[target+'_Error_50q'] = y_val_pred-y_val + df_val['Predicted_10q_'+target] = y_val_pred_10q.flatten() + df_val['Predicted_90q_'+target] = y_val_pred_90q.flatten() + + else: + y_val_pred = y_val_pred.flatten() + + # df_val = df_val.assign(PredictedGrowth=y_val_pred, GrowthError=y_val_pred-y_val) + df_val['Predicted'+target] = y_val_pred + df_val[target+'Error'] = y_val_pred-y_val + + scores = uno.evaluate_prediction(y_val, y_val_pred) + uno.log_evaluation(scores, logger) + + df_pred_list.append(df_val) + +# if args.cp: +# model_recorder.best_model.save(prefix+'.model.h5') + + if hasattr(history, 'loss'): + candle.plot_history(prefix, history, 'loss') + if args.loss == 'heteroscedastic': + if hasattr(history, 'r2_heteroscedastic'): + candle.plot_history(prefix, history, 'r2_heteroscedastic') + if hasattr(history, 'meanS_heteroscedastic'): + candle.plot_history(prefix, history, 'meanS_heteroscesdastic') + elif args.loss == 'quantile': + if hasattr(history, 'quantile50'): + candle.plot_history(prefix, history, 'quantile50') + if hasattr(history, 'quantile10'): + candle.plot_history(prefix, history, 'quantile10') + if hasattr(history, 'quantile90'): + candle.plot_history(prefix, history, 'quantile90') + else: + if hasattr(history, 'r2'): + candle.plot_history(prefix, history, 'r2') + + pred_fname = prefix + '.predicted.tsv' + df_pred = pd.concat(df_pred_list) + if args.agg_dose: + if args.single: +# df_pred.sort_values(['Source', 'Sample', 'Drug1', target], inplace=True) + df_pred.sort_values(['Sample', 'Drug1', target], inplace=True) + else: + df_pred.sort_values(['Source', 'Sample', 'Drug1', 'Drug2', 
target], inplace=True) + else: + if args.single: +# df_pred.sort_values(['Source', 'Sample', 'Drug1', 'Dose1', 'Growth'], inplace=True) + df_pred.sort_values(['Sample', 'Drug1', 'Dose1', 'Growth'], inplace=True) + else: +# df_pred.sort_values(['Source', 'Sample', 'Drug1', 'Drug2', 'Dose1', 'Dose2', 'Growth'], inplace=True) + df_pred.sort_values(['Sample', 'Drug1', 'Drug2', 'Dose1', 'Dose2', 'Growth'], inplace=True) + df_pred.to_csv(pred_fname, sep='\t', index=False, float_format='%.4g') + logger.info('Testing predictions stored in file: {}'.format(pred_fname)) + + if args.cp: + logger.info('Model stored in file: {}'.format(prefix+'.model.h5')) +# logger.info('Model weights stored in file: {}'.format(prefix+cv_ext+'.weights.h5')) + logger.info('Model weights stored in file: {}'.format(args.save_path + '/' + args.save_weights)) + + if args.cv > 1: + scores = uno.evaluate_prediction(df_pred[target], df_pred['Predicted'+target]) + uno.log_evaluation(scores, logger, description='Combining cross validation folds:') + + for test_source in loader.test_sep_sources: + test_gen = uno_combined_data_generator.CombinedDataGenerator(loader, partition='test', batch_size=args.batch_size, source=test_source) + df_test = test_gen.get_response(copy=True) + y_test = df_test[target].values + n_test = len(y_test) + if n_test == 0: + continue + if args.no_gen: + x_test_list, y_test = test_gen.get_slice(size=test_gen.size, single=args.single) + y_test_pred = model.predict(x_test_list, batch_size=args.batch_size) + if args.loss == 'heteroscedastic': + y_test_pred = y_test_pred[:,0] + elif args.loss == 'quantile': + y_test_pred = y_test_pred[:,0] # 50th quantile prediction + else: + y_test_pred = model.predict_generator(test_gen.flow(single=args.single), test_gen.steps) + if args.loss == 'heteroscedastic': + y_test_pred = y_test_pred[:test_gen.size,0] + elif args.loss == 'quantile': + y_test_pred = y_test_pred[:test_gen.size,0] # 50th quantile prediction + else: + y_test_pred = y_test_pred[:test_gen.size] + + y_test_pred = y_test_pred.flatten() + scores = uno.evaluate_prediction(y_test, y_test_pred) + uno.log_evaluation(scores, logger, description='Testing on data from {} ({})'.format(test_source, n_test)) + + if K.backend() == 'tensorflow': + K.clear_session() + + logger.handlers = [] + + return history + + +def main(): + params = initialize_parameters() + run(params) + + +if __name__ == '__main__': + main() + if K.backend() == 'tensorflow': + K.clear_session() diff --git a/common/candle/__init__.py b/common/candle/__init__.py index b8bf19c9..486ef1ef 100644 --- a/common/candle/__init__.py +++ b/common/candle/__init__.py @@ -6,6 +6,10 @@ from data_utils import load_csv_data from data_utils import load_Xy_one_hot_data2 from data_utils import load_Xy_data_noheader +from data_utils import drop_impute_and_scale_dataframe +from data_utils import discretize_dataframe +from data_utils import discretize_array +from data_utils import lookup #import from file_utils from file_utils import get_file @@ -25,6 +29,25 @@ # import from viz_utils from viz_utils import plot_history from viz_utils import plot_scatter +from viz_utils import plot_density_observed_vs_predicted +from viz_utils import plot_2d_density_sigma_vs_error +from viz_utils import plot_histogram_error_per_sigma +from viz_utils import plot_calibration_and_errors +from viz_utils import plot_percentile_predictions + + +# import from uq_utils +from uq_utils import compute_statistics_homoscedastic +from uq_utils import compute_statistics_homoscedastic_all +from uq_utils 
import compute_statistics_heteroscedastic +from uq_utils import compute_statistics_quantile +from uq_utils import split_data_for_empirical_calibration +from uq_utils import compute_empirical_calibration +from uq_utils import bining_for_calibration +from uq_utils import computation_of_valid_calibration_interval +from uq_utils import applying_calibration +from uq_utils import overprediction_check + # import benchmark-dependent utils import sys @@ -41,6 +64,11 @@ from keras_utils import PermanentDropout from keras_utils import register_permanent_dropout from keras_utils import LoggingCallback + from keras_utils import MultiGPUCheckpoint + from keras_utils import r2 + from keras_utils import mae + from keras_utils import mse + from solr_keras import CandleRemoteMonitor from solr_keras import compute_trainable_params diff --git a/common/candle_keras/__init__.py b/common/candle_keras/__init__.py index c5eccf06..bcf15874 100644 --- a/common/candle_keras/__init__.py +++ b/common/candle_keras/__init__.py @@ -6,6 +6,10 @@ from data_utils import load_csv_data from data_utils import load_Xy_one_hot_data2 from data_utils import load_Xy_data_noheader +from data_utils import drop_impute_and_scale_dataframe +from data_utils import discretize_dataframe +from data_utils import discretize_array +from data_utils import lookup #import from file_utils from file_utils import get_file @@ -20,6 +24,30 @@ from default_utils import keras_default_config from default_utils import set_up_logger +from generic_utils import Progbar + +# import from viz_utils +from viz_utils import plot_history +from viz_utils import plot_scatter +from viz_utils import plot_density_observed_vs_predicted +from viz_utils import plot_2d_density_sigma_vs_error +from viz_utils import plot_histogram_error_per_sigma +from viz_utils import plot_calibration_and_errors +from viz_utils import plot_percentile_predictions + +# import from uq_utils +from uq_utils import compute_statistics_homoscedastic +from uq_utils import compute_statistics_homoscedastic_all +from uq_utils import compute_statistics_heteroscedastic +from uq_utils import compute_statistics_quantile +from uq_utils import split_data_for_empirical_calibration +from uq_utils import compute_empirical_calibration +from uq_utils import bining_for_calibration +from uq_utils import computation_of_valid_calibration_interval +from uq_utils import applying_calibration +from uq_utils import overprediction_check + + #import from keras_utils #from keras_utils import dense #from keras_utils import add_dense @@ -30,8 +58,10 @@ from keras_utils import PermanentDropout from keras_utils import register_permanent_dropout from keras_utils import LoggingCallback +from keras_utils import r2 +from keras_utils import mae +from keras_utils import mse -from generic_utils import Progbar from solr_keras import CandleRemoteMonitor from solr_keras import compute_trainable_params diff --git a/common/data_utils.py b/common/data_utils.py index c17a3b42..856c63a0 100644 --- a/common/data_utils.py +++ b/common/data_utils.py @@ -3,7 +3,8 @@ import numpy as np import pandas as pd -from sklearn.preprocessing import Imputer +#from sklearn.preprocessing import Imputer +from sklearn.impute import SimpleImputer from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler from default_utils import DEFAULT_SEED @@ -125,13 +126,162 @@ def impute_and_scale_array(mat, scaling=None): it returns the imputed numpy array. 
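+    A minimal sketch of the scikit-learn SimpleImputer call used below
+    (assumes scikit-learn >= 0.20, where sklearn.impute.SimpleImputer
+    replaces the deprecated sklearn.preprocessing.Imputer); column means
+    fill the NaNs in place:
+
+        import numpy as np
+        from sklearn.impute import SimpleImputer
+        mat = np.array([[1.0, np.nan], [3.0, 4.0]])
+        SimpleImputer(strategy='mean', copy=False).fit_transform(mat)
+        # -> array([[1., 4.], [3., 4.]])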
""" - imputer = Imputer(strategy='mean', axis=0, copy=False) +# imputer = Imputer(strategy='mean', axis=0, copy=False) + imputer = SimpleImputer(strategy='mean', copy=False) imputer.fit_transform(mat) - #mat = imputer.fit_transform(mat) return scale_array(mat, scaling) +def drop_impute_and_scale_dataframe(df, scaling='std', imputing='mean', dropna='all'): + """Impute missing values with mean and scale data included in pandas dataframe. + + Parameters + ---------- + df : pandas dataframe + dataframe to process + scaling : string + String describing type of scaling to apply. + 'maxabs' [-1,1], 'minmax' [0,1], 'std', or None, optional + (Default 'std') + imputing : string + String describing type of imputation to apply. + 'mean' replace missing values with mean value along the column, + 'median' replace missing values with median value along the column, + 'most_frequent' replace missing values with most frequent value along column + (Default: 'mean'). + dropna : string + String describing strategy for handling missing values. + 'all' if all values are NA, drop that column. + 'any' if any NA values are present, dropt that column. + (Default: 'all'). + + Return + ---------- + Returns the data frame after handling missing values and scaling. + + """ + + if dropna: + df = df.dropna(axis=1, how=dropna) + else: + empty_cols = df.columns[df.notnull().sum() == 0] + df[empty_cols] = 0 + + if imputing is None or imputing.lower() == 'none': + mat = df.values + else: +# imputer = Imputer(strategy=imputing, axis=0) + imputer = SimpleImputer(strategy=imputing) + mat = imputer.fit_transform(df.values) + + if scaling is None or scaling.lower() == 'none': + return pd.DataFrame(mat, columns=df.columns) + + if scaling == 'maxabs': + scaler = MaxAbsScaler() + elif scaling == 'minmax': + scaler = MinMaxScaler() + else: + scaler = StandardScaler() + + mat = scaler.fit_transform(mat) + df = pd.DataFrame(mat, columns=df.columns) + + return df + + +def discretize_dataframe(df, col, bins=2, cutoffs=None): + """Discretize values of given column in pandas dataframe. + + Parameters + ---------- + df : pandas dataframe + dataframe to process. + col : int + Index of column to bin. + bins : int + Number of bins for distributing column values. + cutoffs : list + List of bin limits. + If None, the limits are computed as percentiles. + (Default: None). + + Return + ---------- + Returns the data frame with the values of the specified column binned, i.e. the values + are replaced by the associated bin number. + + """ + + y = df[col] + thresholds = cutoffs + if thresholds is None: + percentiles = [100 / bins * (i + 1) for i in range(bins - 1)] + thresholds = [np.percentile(y, x) for x in percentiles] + classes = np.digitize(y, thresholds) + df[col] = classes + + return df + + +def discretize_array(y, bins=5): + """Discretize values of given array. + + Parameters + ---------- + y : numpy array + array to discretize. + bins : int + Number of bins for distributing column values. + + Return + ---------- + Returns an array with the bin number associated to the values in the + original array. + + """ + + percentiles = [100 / bins * (i + 1) for i in range(bins - 1)] + thresholds = [np.percentile(y, x) for x in percentiles] + classes = np.digitize(y, thresholds) + return classes + + + +def lookup(df, query, ret, keys, match='match'): + """Dataframe lookup. + + Parameters + ---------- + df : pandas dataframe + dataframe for retrieving values. + query : string + String for searching. 
+ ret : int/string or list + Names or indices of columns to be returned. + keys : list + List of strings or integers specifying the names or + indices of columns to look into. + match : string + String describing strategy for matching keys to query. + + Return + ---------- + Returns a list of the values in the dataframe whose columns match + the specified query and have been selected to be returned. + + """ + + mask = pd.Series(False, index=range(df.shape[0])) + for key in keys: + if match == 'contains': + mask |= df[key].str.contains(query.upper(), case=False) + else: + mask |= (df[key].str.upper() == query.upper()) + + return list(set(df[mask][ret].values.flatten().tolist())) + def load_X_data(train_file, test_file, drop_cols=None, n_cols=None, shuffle=False, scaling=None, diff --git a/common/keras_utils.py b/common/keras_utils.py index 2d35b3ac..06119051 100644 --- a/common/keras_utils.py +++ b/common/keras_utils.py @@ -6,9 +6,10 @@ from keras import initializers from keras.layers import Dropout -from keras.callbacks import Callback +from keras.callbacks import Callback, ModelCheckpoint from keras.utils import get_custom_objects -from keras.metrics import binary_crossentropy, mean_squared_error +from keras.metrics import binary_crossentropy, mean_squared_error, mean_absolute_error +from keras.models import Model from scipy.stats.stats import pearsonr @@ -196,6 +197,16 @@ def xent(y_true, y_pred): return binary_crossentropy(y_true, y_pred) +def r2(y_true, y_pred): + SS_res = K.sum(K.square(y_true - y_pred)) + SS_tot = K.sum(K.square(y_true - K.mean(y_true))) + return (1 - SS_res/(SS_tot + K.epsilon())) + + +def mae(y_true, y_pred): + return mean_absolute_error(y_true, y_pred) + + def mse(y_true, y_pred): return mean_squared_error(y_true, y_pred) @@ -243,3 +254,13 @@ def __init__(self, print_fcn=print): def on_epoch_end(self, epoch, logs={}): msg = "[Epoch: %i] %s" % (epoch, ", ".join("%s: %f" % (k, v) for k, v in sorted(logs.items()))) self.print_fcn(msg) + + +class MultiGPUCheckpoint(ModelCheckpoint): + + def set_model(self, model): + if isinstance(model.layers[-2], Model): + self.model = model.layers[-2] + else: + self.model = model + diff --git a/common/uq_utils.py b/common/uq_utils.py index 650da687..d0ab46c3 100644 --- a/common/uq_utils.py +++ b/common/uq_utils.py @@ -1,7 +1,9 @@ from __future__ import absolute_import import numpy as np - +from scipy.stats import pearsonr, spearmanr +from scipy import signal +from scipy.interpolate import InterpolatedUnivariateSpline def generate_index_distribution(numTrain, numTest, numValidation, params): """ Generates a vector of indices to partition the data for training. @@ -331,6 +333,751 @@ def fill_array(blocklist, maxsize, numdata, numblocks, blocksize): return indexArray[:offset] +###### UTILS for COMPUTATION OF EMPIRICAL CALIBRATION + +def compute_statistics_homoscedastic(df_data, + col_true=0, + col_pred=6, + col_std_pred=7, + ): + """ Extracts ground truth, mean predition, error and + standard deviation of prediction from inference + data frame. The latter includes the statistics + over all the inference realizations. + + Parameters + ---------- + df_data : pandas data frame + Data frame generated by current CANDLE inference + experiments. Indices are hard coded to agree with + current CANDLE version. (The inference file usually + has the name: _pred.tsv). + col_true : integer + Index of the column in the data frame where the true + value is stored (Default: 0, index in current CANDLE format). 
+ col_pred : integer + Index of the column in the data frame where the predicted + value is stored (Default: 6, index in current CANDLE format). + col_std_pred : integer + Index of the column in the data frame where the standard + deviation of the predicted values is stored (Default: 7, + index in current CANDLE format). + + Return + ---------- + Ytrue : numpy array + Array with true (observed) values + Ypred : numpy array + Array with predicted values. + yerror : numpy array + Array with errors computed (observed - predicted). + sigma : numpy array + Array with standard deviations learned with deep learning + model. For homoscedastic inference this corresponds to the + std value computed from prediction (and is equal to the + following returned variable). + Ypred_std : numpy array + Array with standard deviations computed from regular + (homoscedastic) inference. + pred_name : string + Name of data colum or quantity predicted (as extracted + from the data frame using the col_true index). + """ + + Ytrue = df_data.iloc[:,col_true].values + print('Ytrue shape: ', Ytrue.shape) + pred_name = df_data.columns[col_true] + Ypred = df_data.iloc[:,col_pred].values + print('Ypred shape: ', Ypred.shape) + Ypred_std = df_data.iloc[:,col_std_pred].values + print('Ypred_std shape: ', Ypred_std.shape) + yerror = Ytrue - Ypred + print('yerror shape: ', yerror.shape) + sigma = Ypred_std # std + MSE = np.mean((Ytrue - Ypred)**2) + print('MSE: ', MSE) + MSE_STD = np.std((Ytrue - Ypred)**2) + print('MSE_STD: ', MSE_STD) + # p-value 'not entirely reliable, reasonable for datasets > 500' + spearman_cc, pval = spearmanr(Ytrue, Ypred) + print('Spearman CC: %f, p-value: %e' % (spearman_cc, pval)) + + return Ytrue, Ypred, yerror, sigma, Ypred_std, pred_name + + +def compute_statistics_homoscedastic_all(df_data, + col_true=4, + col_pred_start=6 + ): + """ Extracts ground truth, mean predition, error and + standard deviation of prediction from inference + data frame. The latter includes all the individual + inference realizations. + + Parameters + ---------- + df_data : pandas data frame + Data frame generated by current CANDLE inference + experiments. Indices are hard coded to agree with + current CANDLE version. (The inference file usually + has the name: .predicted_INFER.tsv). + col_true : integer + Index of the column in the data frame where the true + value is stored (Default: 4, index in current HOM format). + col_pred_start : integer + Index of the column in the data frame where the first predicted + value is stored. All the predicted values during inference + are stored (Default: 6 index, in current HOM format). + + Return + ---------- + Ytrue : numpy array + Array with true (observed) values + Ypred : numpy array + Array with predicted values. + yerror : numpy array + Array with errors computed (observed - predicted). + sigma : numpy array + Array with standard deviations learned with deep learning + model. For homoscedastic inference this corresponds to the + std value computed from prediction (and is equal to the + following returned variable). + Ypred_std : numpy array + Array with standard deviations computed from regular + (homoscedastic) inference. + pred_name : string + Name of data colum or quantity predicted (as extracted + from the data frame using the col_true index). 
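+    Example
+    ----------
+    A minimal usage sketch; the file name is hypothetical, pandas/numpy are
+    assumed imported as pd/np, and the column layout (0 = observed,
+    6 = predicted, 7 = std of prediction) is the default described above:
+
+        df = pd.read_csv('uno_pred.tsv', sep='\t')
+        Ytrue, Ypred, yerror, sigma, Ypred_std, pred_name = \
+            compute_statistics_homoscedastic(df)
+        print('MSE recomputed from returns: ', np.mean(yerror ** 2))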
+ """ + + Ytrue = df_data.iloc[:,col_true].values + print('Ytrue shape: ', Ytrue.shape) + pred_name = df_data.columns[col_true] + Ypred_mean_ = np.mean(df_data.iloc[:,col_pred_start:], axis=1) + Ypred_mean = Ypred_mean_.values + print('Ypred_mean shape: ', Ypred_mean.shape) + Ypred_std_ = np.std(df_data.iloc[:,col_pred_start:], axis=1) + Ypred_std = Ypred_std_.values + print('Ypred_std shape: ', Ypred_std.shape) + yerror = Ytrue - Ypred_mean + print('yerror shape: ', yerror.shape) + sigma = Ypred_std # std + MSE = np.mean((Ytrue - Ypred_mean)**2) + print('MSE: ', MSE) + MSE_STD = np.std((Ytrue - Ypred_mean)**2) + print('MSE_STD: ', MSE_STD) + # p-value 'not entirely reliable, reasonable for datasets > 500' + spearman_cc, pval = spearmanr(Ytrue, Ypred_mean) + print('Spearman CC: %f, p-value: %e' % (spearman_cc, pval)) + + return Ytrue, Ypred_mean, yerror, sigma, Ypred_std, pred_name + + +def compute_statistics_heteroscedastic(df_data, + col_true=4, + col_pred_start=6, + col_std_pred_start=7, + ): + """ Extracts ground truth, mean predition, error, standard + deviation of prediction and predicted (learned) standard + deviation from inference data frame. The latter includes + all the individual inference realizations. + + Parameters + ---------- + df_data : pandas data frame + Data frame generated by current heteroscedastic inference + experiments. Indices are hard coded to agree with + current version. (The inference file usually + has the name: .predicted_INFER_HET.tsv). + col_true : integer + Index of the column in the data frame where the true + value is stored (Default: 4, index in current HET format). + col_pred_start : integer + Index of the column in the data frame where the first predicted + value is stored. All the predicted values during inference + are stored and are interspaced with standard deviation + predictions (Default: 6 index, step 2, in current HET format). + col_std_pred_start : integer + Index of the column in the data frame where the first predicted + standard deviation value is stored. All the predicted values + during inference are stored and are interspaced with predictions + (Default: 7 index, step 2, in current HET format). + + Return + ---------- + Ytrue : numpy array + Array with true (observed) values + Ypred : numpy array + Array with predicted values. + yerror : numpy array + Array with errors computed (observed - predicted). + sigma : numpy array + Array with standard deviations learned with deep learning + model. For homoscedastic inference this corresponds to the + std value computed from prediction (and is equal to the + following returned variable). + Ypred_std : numpy array + Array with standard deviations computed from regular + (homoscedastic) inference. + pred_name : string + Name of data colum or quantity predicted (as extracted + from the data frame using the col_true index). 
+ """ + + Ytrue = df_data.iloc[:,col_true].values + print('Ytrue shape: ', Ytrue.shape) + pred_name = df_data.columns[col_true] + Ypred_mean_ = np.mean(df_data.iloc[:,col_pred_start::2], axis=1) + Ypred_mean = Ypred_mean_.values + print('Ypred shape: ', Ypred_mean.shape) + Ypred_std_ = np.std(df_data.iloc[:,col_pred_start::2], axis=1) + Ypred_std = Ypred_std_.values + print('Ypred_std shape: ', Ypred_std.shape) + yerror = Ytrue - Ypred_mean + print('yerror shape: ', yerror.shape) + s_ = df_data.iloc[:,col_std_pred_start::2] + s_mean = np.mean(s_, axis=1) + var = np.exp(s_mean.values) # variance + sigma = np.sqrt(var) # std + print('sigma shape: ', sigma.shape) + MSE = np.mean((Ytrue - Ypred_mean)**2) + print('MSE: ', MSE) + MSE_STD = np.std((Ytrue - Ypred_mean)**2) + print('MSE_STD: ', MSE_STD) + # p-value 'not entirely reliable, reasonable for datasets > 500' + spearman_cc, pval = spearmanr(Ytrue, Ypred_mean) + print('Spearman CC: %f, p-value: %e' % (spearman_cc, pval)) + + return Ytrue, Ypred_mean, yerror, sigma, Ypred_std, pred_name + + +def compute_statistics_quantile(df_data, + sigma_divisor=2.56, + col_true=4, + col_pred_start=6 + ): + """ Extracts ground truth, 50th percentile mean predition, + low percentile and high percentile mean prediction + (usually 10th percentile and 90th percentile respectively), + error (using 50th percentile), standard deviation of + prediction (using 50th percentile) and predicted (learned) + standard deviation from interdecile range in inference data frame. + The latter includes all the individual inference realizations. + + Parameters + ---------- + df_data : pandas data frame + Data frame generated by current quantile inference + experiments. Indices are hard coded to agree with + current version. (The inference file usually + has the name: .predicted_INFER_QTL.tsv). + sigma_divisor : float + Divisor to convert from the intercedile range to the corresponding + standard deviation for a Gaussian distribution. + (Default: 2.56, consisten with an interdecile range computed from + the difference between the 90th and 10th percentiles). + col_true : integer + Index of the column in the data frame where the true + value is stored (Default: 4, index in current QTL format). + col_pred_start : integer + Index of the column in the data frame where the first predicted + value is stored. All the predicted values during inference + are stored and are interspaced with other percentile + predictions (Default: 6 index, step 3, in current QTL format). + + Return + ---------- + Ytrue : numpy array + Array with true (observed) values + Ypred : numpy array + Array with predicted values (based on the 50th percentile). + yerror : numpy array + Array with errors computed (observed - predicted). + sigma : numpy array + Array with standard deviations learned with deep learning + model. This corresponds to the interdecile range divided + by the sigma divisor. + Ypred_std : numpy array + Array with standard deviations computed from regular + (homoscedastic) inference. + pred_name : string + Name of data colum or quantity predicted (as extracted + from the data frame using the col_true index). + Ypred_Lp_mean : numpy array + Array with predicted values of the lower percentile + (usually the 10th percentile). + Ypred_Hp_mean : numpy array + Array with predicted values of the higher percentile + (usually the 90th percentile). 
+ """ + + Ytrue = df_data.iloc[:,col_true].values + print('Ytrue shape: ', Ytrue.shape) + pred_name = df_data.columns[col_true] + Ypred_50q_mean = np.mean(df_data.iloc[:,col_pred_start::3], axis=1) + Ypred_mean = Ypred_50q_mean.values + print('Ypred shape: ', Ypred_mean.shape) + Ypred_Lp_mean_ = np.mean(df_data.iloc[:,col_pred_start+1::3], axis=1) + Ypred_Hp_mean_ = np.mean(df_data.iloc[:,col_pred_start+2::3], axis=1) + Ypred_Lp_mean = Ypred_Lp_mean_.values + Ypred_Hp_mean = Ypred_Hp_mean_.values + interdecile_range = Ypred_Hp_mean - Ypred_Lp_mean + sigma = interdecile_range / sigma_divisor + print('sigma shape: ', sigma.shape) + yerror = Ytrue - Ypred_mean + print('yerror shape: ', yerror.shape) + Ypred_std_ = np.std(df_data.iloc[:,col_pred_start::3], axis=1) + Ypred_std = Ypred_std_.values + print('Ypred_std shape: ', Ypred_std.shape) + MSE = np.mean((Ytrue - Ypred_mean)**2) + print('MSE: ', MSE) + MSE_STD = np.std((Ytrue - Ypred_mean)**2) + print('MSE_STD: ', MSE_STD) + # p-value 'not entirely reliable, reasonable for datasets > 500' + spearman_cc, pval = spearmanr(Ytrue, Ypred_mean) + print('Spearman CC: %f, p-value: %e' % (spearman_cc, pval)) + + return Ytrue, Ypred_mean, yerror, sigma, Ypred_std, pred_name, Ypred_Lp_mean, Ypred_Hp_mean + + +def split_data_for_empirical_calibration(Ytrue, Ypred, sigma, cal_split=0.8): + """ Extracts a portion of the arrays provided for the computation + of the calibration and reserves the remainder portion + for testing. + + Parameters + ---------- + Ytrue : numpy array + Array with true (observed) values + Ypred : numpy array + Array with predicted values. + sigma : numpy array + Array with standard deviations learned with deep learning + model (or std value computed from prediction if homoscedastic + inference). + cal_split : float + Split of data to use for estimating the calibration relationship. + It is assumet that it will be a value in (0, 1). + (Default: use 80% of predictions to generate empirical + calibration). + + Return + ---------- + index_perm_total : numpy array + Random permutation of the array indices. The first 'num_cal' + of the indices correspond to the samples that are used for + calibration, while the remainder are the samples reserved + for calibration testing. + pSigma_cal : numpy array + Part of the input sigma array to use for calibration. + pSigma_test : numpy array + Part of the input sigma array to reserve for testing. + pPred_cal : numpy array + Part of the input Ypred array to use for calibration. + pPred_test : numpy array + Part of the input Ypred array to reserve for testing. + true_cal : numpy array + Part of the input Ytrue array to use for calibration. + true_test : numpy array + Part of the input Ytrue array to reserve for testing. 
+ """ + + # shuffle data for calibration + num_pred_total = sigma.shape[0] + num_cal = np.int(num_pred_total * cal_split) + index_perm_total = np.random.permutation(range(num_pred_total)) + + # Permute data + pSigma_perm_all = sigma[index_perm_total] + pPred_perm_all = Ypred[index_perm_total] + true_perm_all = Ytrue[index_perm_total] + + # Split in calibration and testing + pSigma_cal = pSigma_perm_all[:num_cal] + pSigma_test = pSigma_perm_all[num_cal:] + pPred_cal = pPred_perm_all[:num_cal] + pPred_test = pPred_perm_all[num_cal:] + true_cal = true_perm_all[:num_cal] + true_test = true_perm_all[num_cal:] + + print('Size of calibration set: ', true_cal.shape) + print('Size of test set: ', true_test.shape) + + return index_perm_total, pSigma_cal, pSigma_test, pPred_cal, pPred_test, true_cal, true_test + + +def compute_empirical_calibration(pSigma_cal, pPred_cal, true_cal, bins, coverage_percentile): + """ Use the arrays provided to estimate an empirical mapping + between standard deviation and absolute value of error, + both of which have been observed during inference. Since + most of the times the raw statistics per bin are very noisy, + a smoothing step (based on scipy's savgol filter) is performed. + + Parameters + ---------- + pSigma_cal : numpy array + Part of the standard deviations array to use for calibration. + pPred_cal : numpy array + Part of the predictions array to use for calibration. + true_cal : numpy array + Part of the true (observed) values array to use for calibration. + bins : int + Number of bins to split the range of standard deviations + included in pSigma_cal array. + coverage_percentile : float + Value to use for estimating coverage when evaluating the percentiles + of the observed absolute value of errors. + + Return + ---------- + mean_sigma : numpy array + Array with the mean standard deviations computed per bin. + min_sigma : numpy array + Array with the minimum standard deviations computed per bin. + max_sigma : numpy array + Array with the maximum standard deviations computed per bin. + error_thresholds : numpy array + Thresholds of the errors computed to attain a certain + error coverage per bin. + err_err : numpy array + Error bars in errors (one standard deviation for a binomial + distribution estimated by bin vs. the other bins) for the + calibration error. + error_thresholds_smooth : numpy array + Thresholds of the errors computed to attain a certain + error coverage per bin after a smoothed operation is applied + to the frequently noisy bin-based estimations. + sigma_start_index : non-negative integer + Index in the mean_sigma array that defines the start of + the valid empirical calibration interval (i.e. index to + the smallest std for which a meaningful error mapping + is obtained). + sigma_end_index : non-negative integer + Index in the mean_sigma array that defines the end of + the valid empirical calibration interval (i.e. index to + the largest std for which a meaningful error mappping + is obtained). + s_interpolate : scipy.interpolate python object + A python object from scipy.interpolate that computes a + univariate spline (InterpolatedUnivariateSpline) constructed + to express the mapping from standard deviation to error. This + spline is generated during the computational empirical + calibration procedure. 
+ """ + + index_sigma_cal = np.argsort(pSigma_cal) + pSigma_cal_ordered_ = pSigma_cal[index_sigma_cal] + Er_vect_cal_ = np.abs(true_cal - pPred_cal) + Er_vect_cal_orderedSigma_ = Er_vect_cal_[index_sigma_cal] + + minL_sigma = np.min(pSigma_cal_ordered_) + maxL_sigma = np.max(pSigma_cal_ordered_) + print('Complete Sigma range --> Min: %f, Max: %f' % (minL_sigma, maxL_sigma)) + + # Bin statistics for error and sigma + mean_sigma, min_sigma, max_sigma, error_thresholds, err_err = bining_for_calibration(pSigma_cal_ordered_, + minL_sigma, + maxL_sigma, + Er_vect_cal_orderedSigma_, + bins, + coverage_percentile) + + # smooth error function + #scipy.signal.savgol_filter(x, window_length, polyorder, + #deriv=0, delta=1.0, axis=-1, mode='interp', cval=0.0) + #error_thresholds_smooth = signal.savgol_filter(error_thresholds, 5, 1) + error_thresholds_smooth = signal.savgol_filter(error_thresholds, 5, 1, mode='nearest') + + # Build Interpolant over smooth plot (this will become the calibration function) + s_interpolate = InterpolatedUnivariateSpline(mean_sigma, error_thresholds_smooth) + # Determine limits of calibration (i.e. monotonicity range) + sigma_start_index, sigma_end_index = computation_of_valid_calibration_interval(error_thresholds, error_thresholds_smooth, err_err) + + print('Range of valid sigma: %.6f --> %.6f' % (mean_sigma[sigma_start_index], mean_sigma[sigma_end_index])) + + return mean_sigma, min_sigma, max_sigma, error_thresholds, err_err, error_thresholds_smooth, sigma_start_index, sigma_end_index, s_interpolate + + + +def bining_for_calibration(pSigma_cal_ordered_, minL_sigma, + maxL_sigma, Er_vect_cal_orderedSigma_, + bins, coverage_percentile): + """ Bin the values of the standard deviations observed during + inference and estimate a specified coverage percentile + in the absolute error (observed during inference as well). + Bins that have less than 50 samples are merged until they + surpass this threshold. + + Parameters + ---------- + pSigma_cal_ordered_ : numpy array + Array of standard deviations ordered in ascending way. + minL_sigma : float + Minimum value of standard deviations included in + pSigma_cal_ordered_ array. + maxL_sigma : numpy array + Maximum value of standard deviations included in + pSigma_cal_ordered_ array. + Er_vect_cal_orderedSigma_ : numpy array + Array ob absolute value of errors corresponding with + the array of ordered standard deviations. + bins : int + Number of bins to split the range of standard deviations + included in pSigma_cal_ordered_ array. + coverage_percentile : float + Value to use for estimating coverage when evaluating the percentiles + of the observed absolute value of errors. + + Return + ---------- + mean_sigma : numpy array + Array with the mean standard deviations computed per bin. + min_sigma : numpy array + Array with the minimum standard deviations computed per bin. + max_sigma : numpy array + Array with the maximum standard deviations computed per bin. + error_thresholds : numpy array + Thresholds of the errors computed to attain a certain + error coverage per bin. + err_err : numpy array + Error bars in errors (one standard deviation for a binomial + distribution estimated by bin vs. the other bins) for the + calibration error. 
+    """
+
+    #thresholds = np.logspace(np.log10(minL_sigma), np.log10(maxL_sigma), num=bins)
+    thresholds = np.linspace(minL_sigma, maxL_sigma, num=bins)
+    classes = np.digitize(pSigma_cal_ordered_, thresholds)
+    Nbin = np.zeros(bins+1)
+    for i in range(bins+1):
+        indices = (classes == i)
+        Nbin[i] = indices.sum()
+
+    # Repair bins
+    new_thresholds_l = []
+    new_nbins_l = []
+    sumN = 0
+    for i in range(Nbin.shape[0]):
+        sumN += Nbin[i]
+        if sumN > 50:
+            if i > (thresholds.shape[0] - 1):
+                new_thresholds_l.append(thresholds[-1])
+            else:
+                new_thresholds_l.append(thresholds[i])
+            new_nbins_l.append(sumN)
+            sumN = 0
+    new_thresholds = np.array(new_thresholds_l)
+    new_nbins = np.array(new_nbins_l)
+    new_thresholds[-1] = thresholds[-1]
+    new_nbins[-1] += sumN
+
+    #
+    classes = np.digitize(pSigma_cal_ordered_, new_thresholds[:-1])
+    error_thresholds = -1. * np.ones(new_nbins.shape[0])
+    mean_sigma = -1. * np.ones(new_nbins.shape[0])
+    min_sigma = -1. * np.ones(new_nbins.shape[0])
+    max_sigma = -1. * np.ones(new_nbins.shape[0])
+    err_err = -1. * np.ones(new_nbins.shape[0])
+    Ncal = pSigma_cal_ordered_.shape[0]
+    for i in range(error_thresholds.shape[0]):
+        indices = (classes == i)
+        n_aux = indices.sum()
+        assert n_aux == new_nbins[i]
+        print('Points in bin %d: %d' % (i, n_aux))
+        mean_sigma[i] = np.mean(pSigma_cal_ordered_[indices])
+        min_sigma[i] = np.min(pSigma_cal_ordered_[indices])
+        max_sigma[i] = np.max(pSigma_cal_ordered_[indices])
+        error_thresholds[i] = np.percentile(Er_vect_cal_orderedSigma_[indices], coverage_percentile)
+        err_err[i] = np.sqrt(new_nbins[i] * (Ncal - new_nbins[i])) / Ncal * error_thresholds[i]
+
+    return mean_sigma, min_sigma, max_sigma, error_thresholds, err_err
+
+
+def computation_of_valid_calibration_interval(error_thresholds, error_thresholds_smooth, err_err):
+    """ Function that estimates the empirical range in which a
+        monotonic relation is observed between standard deviation
+        and coverage of absolute value of error. Since the
+        statistics computed per bin are relatively noisy, the
+        application of a greedy criterion (e.g. guarantee a
+        monotonically increasing relationship) does not yield
+        good results. Therefore, a softer version is constructed
+        based on the satisfaction of certain criteria depending
+        on: the values of the error coverage computed per bin,
+        a smoothed version of them and the associated error
+        estimated (based on one standard deviation for a binomial
+        distribution estimated by bin vs. the other bins).
+        A minimal validation requiring the end index to be
+        larger than the starting index is performed before
+        the function returns.
+
+        Current criteria:
+        - the smoothed errors are inside the error bars AND
+          they are almost increasing (a small tolerance is
+          allowed, so a small wobbliness in the smoother
+          values is permitted).
+        OR
+        - both the raw values for the bins (with a small tolerance)
+          are increasing, AND the smoothed value is greater than the
+          raw value.
+        OR
+        - the current smoothed value is greater than the previous AND
+          the smoothed values for the next bin are inside the error
+          bars.
+
+        Parameters
+        ----------
+        error_thresholds : numpy array
+            Thresholds of the errors computed to attain a certain
+            error coverage per bin.
+        error_thresholds_smooth : numpy array
+            Thresholds of the errors computed to attain a certain
+            error coverage per bin after a smoothing operation is applied
+            to the frequently noisy bin-based estimations.
+ err_err : numpy array + Error bars in errors (one standard deviation for a binomial + distribution estimated by bin vs. the other bins) for the + calibration error. + + Return + ---------- + sigma_start_index : non-negative integer + Index estimated in the mean_sigma array corresponing to + the value that defines the start of the valid empirical + calibration interval (i.e. index to the smallest std for + which a meaningful error mapping is obtained, according + to the criteria explained before). + sigma_end_index : non-negative integer + Index estimated in the mean_sigma array corresponing to + the value that defines the end of the valid empirical + calibration interval (i.e. index to the largest std for + which a meaningful error mapping is obtained, according + to the criteria explained before). + """ + + # Computation of the calibration interval + limitH = error_thresholds + err_err + limitL = error_thresholds - err_err + + # search for starting point + for i in range(err_err.shape[0]): + if ((error_thresholds_smooth[i] >= limitL[i]) and + (error_thresholds_smooth[i] <= limitH[i])): # Ask if the current is in the interval + sigma_start_index = i + break + sigma_end_index = sigma_start_index - 1 + + restart = max(1, sigma_start_index) + for i in range(restart, err_err.shape[0]-1): + if (((error_thresholds_smooth[i] >= limitL[i]) and + (error_thresholds_smooth[i] <= limitH[i]) and + ((error_thresholds_smooth[i] * 1.005 > error_thresholds_smooth[i-1]) or + ((error_thresholds[i] * 1.01 > error_thresholds[i-1]) and + (error_thresholds_smooth[i] > error_thresholds[i])))) # Ask if the current is in the interval with slightly increasing trend + or # Ask if the current is greater than the previous and the next is in the interval + ((error_thresholds_smooth[i] > error_thresholds_smooth[i-1]) and + ((error_thresholds_smooth[i+1] >= limitL[i+1]) and + (error_thresholds_smooth[i+1] <= limitH[i+1])))): + + sigma_end_index = i + else: # Finalize search for monotonic range + if (sigma_end_index - sigma_start_index) > 4: + break + else: # Reset indices + sigma_start_index = i + 1 + sigma_end_index = i + + print('Range of valid sigma indices (inclusive): %d --> %d' % (sigma_start_index, sigma_end_index)) + + assert (sigma_end_index > sigma_start_index) + + return sigma_start_index, sigma_end_index + + +def applying_calibration(pSigma_test, pPred_test, true_test, s_interpolate, minL_sigma_auto, maxL_sigma_auto): + """ Use the empirical mapping between standard deviation and + absolute value of error estimated during calibration (i.e. + apply the univariate spline computed) to estimate the error + for the part of the standard deviation array that was reserved + for testing the empirical calibration. The resulting error array + (yp_test) should overestimate the true observed error (eabs_red). + All the computations are restricted to the valid calibration + interval: [minL_sigma_auto, maxL_sigma_auto]. + + Parameters + ---------- + pSigma_test : numpy array + Part of the standard deviations array to use for calibration testing. + pPred_test : numpy array + Part of the predictions array to use for calibration testing. + true_test : numpy array + Part of the true (observed) values array to use for calibration testing. + s_interpolate : scipy.interpolate python object + A python object from scipy.interpolate that computes a + univariate spline (InterpolatedUnivariateSpline) expressing + the mapping from standard deviation to error. 
This + spline is generated during the computational empirical + calibration procedure. + minL_sigma_auto : float + Starting value of the valid empirical calibration interval + (i.e. smallest std for which a meaningful error mapping + is obtained). + maxL_sigma_auto : float + Ending value of the valid empirical calibration interval + (i.e. largest std for which a meaningful error mappping + is obtained). + + Return + ---------- + index_sigma_range_test : numpy array + Indices of the pSigma_test array that are included in the + valid calibration interval, given by: + [minL_sigma_auto, maxL_sigma_auto]. + xp_test : numpy array + Array with the mean standard deviations in the calibration + testing array. + yp_test : numpy array + Mapping of the given standard deviation to error computed + from the interpolation spline constructed by empirical + calibration. + eabs_red : numpy array + Array with the observed abolute errors in the part of the testing + array for which the observed standard deviations are in the + valid interval of calibration. + """ + + # Filter to appropriate range + index_sigma_range_test = (pSigma_test >= minL_sigma_auto) & (pSigma_test < maxL_sigma_auto) + xp_test = pSigma_test[index_sigma_range_test] + yp_test = s_interpolate(xp_test) + Er_vect_ = true_test - pPred_test + eabs_ = np.abs(Er_vect_) + eabs_red = eabs_[index_sigma_range_test] + + return index_sigma_range_test, xp_test, yp_test, eabs_red + + +def overprediction_check(yp_test, eabs_red): + """ Compute the percentage of overestimated absoulte error + predictions for the arrays reserved for calibration testing + and whose corresponding standard deviations are included + in the valid calibration interval. + + Parameters + ---------- + yp_test : numpy array + Mapping of the standard deviation to error computed + from the interpolation spline constructed by empirical + calibration. + eabs_red : numpy array + Array with the observed abolute errors in the part of the testing + array for which the observed standard deviations are in the + valid interval of calibration. + """ + + over_pred_error_index = (yp_test >= eabs_red) + percentage_over_predicted = (over_pred_error_index.sum() / yp_test.shape[0]) + print("percentage over predicted: ", percentage_over_predicted) diff --git a/common/viz_utils.py b/common/viz_utils.py index eb570e37..2ca87eae 100644 --- a/common/viz_utils.py +++ b/common/viz_utils.py @@ -2,6 +2,8 @@ mpl.use('Agg') import matplotlib.pyplot as plt +import numpy as np + def plot_history(out, history, metric='loss', title=None, width=8, height=6): title = title or 'model {}'.format(metric) val_metric = 'val_{}'.format(metric) @@ -60,3 +62,300 @@ def plot_error(y_true, y_pred, batch, file_ext, file_pre='output_dir', subsample plt.savefig(file_pre+'.diff'+file_ext+'.b'+str(batch)+'.png') plt.close() +###### UTILS for UQ / CALIBRATION VISUALIZATION + +from matplotlib.colors import LogNorm + +def plot_density_observed_vs_predicted(Ytest, Ypred, pred_name=None, figprefix=None): + """Functionality to plot a 2D histogram of the distribution of observed (ground truth) + values vs. predicted values. The plot generated is stored in a png file. + + Parameters + ---------- + Ytest : numpy array + Array with (true) observed values + Ypred : numpy array + Array with predicted values. + pred_name : string + Name of data colum or quantity predicted (e.g. growth, AUC, etc.) + figprefix : string + String to prefix the filename to store the figure generated. 
+ A '_density_predictions.png' string will be appended to the + figprefix given. + """ + + xbins = 51 + + fig = plt.figure(figsize=(24,18)) # (30,16) + ax = plt.gca() + plt.rc('xtick', labelsize=16) # fontsize of the tick labels + ax.plot([Ytest.min(), Ytest.max()], [Ytest.min(), Ytest.max()], 'r--', lw=4.) + plt.hist2d(Ytest, Ypred, bins=xbins, norm=LogNorm()) + cb = plt.colorbar() + ax.set_xlabel('Observed ' + pred_name, fontsize=38, labelpad=15.) + ax.set_ylabel('Mean ' + pred_name + ' Predicted', fontsize=38, labelpad=15.) + ax.axis([Ytest.min()*0.98, Ytest.max()*1.02, Ytest.min()*0.98, Ytest.max()*1.02]) + plt.setp(ax.get_xticklabels(), fontsize=32) + plt.setp(ax.get_yticklabels(), fontsize=32) + cb.ax.set_yticklabels(cb.ax.get_yticklabels(), fontsize=28) + plt.grid(True) + plt.savefig(figprefix + '_density_predictions.png') + plt.close() + print('Generated plot: ', figprefix + '_density_predictions.png') + + +def plot_2d_density_sigma_vs_error(sigma, yerror, method=None, figprefix=None): + """Functionality to plot a 2D histogram of the distribution of + the standard deviations computed for the predictions vs. the + computed errors (i.e. values of observed - predicted). + The plot generated is stored in a png file. + + Parameters + ---------- + sigma : numpy array + Array with standard deviations computed. + yerror : numpy array + Array with errors computed (observed - predicted). + method : string + Method used to comput the standard deviations (i.e. dropout, + heteroscedastic, etc.). + figprefix : string + String to prefix the filename to store the figure generated. + A '_density_sigma_error.png' string will be appended to the + figprefix given. + """ + + xbins = 51 + ybins = 31 + + fig = plt.figure(figsize=(24,12)) # (30,16) + ax = plt.gca() + plt.rc('xtick', labelsize=16) # fontsize of the tick labels + plt.hist2d(sigma, yerror, bins=[xbins,ybins], norm=LogNorm()) + cb = plt.colorbar() + ax.set_xlabel('Sigma (' + method + ')', fontsize=38, labelpad=15.) + ax.set_ylabel('Observed - Mean Predicted', fontsize=38, labelpad=15.) + ax.axis([sigma.min()*0.98, sigma.max()*1.02, -yerror.max(), yerror.max()]) + plt.setp(ax.get_xticklabels(), fontsize=28) + plt.setp(ax.get_yticklabels(), fontsize=28) + cb.ax.set_yticklabels(cb.ax.get_yticklabels(), fontsize=22) + plt.grid(True) + plt.savefig(figprefix + '_density_sigma_error.png') + plt.close() + print('Generated plot: ', figprefix + '_density_sigma_error.png') + + +def plot_histogram_error_per_sigma(sigma, yerror, method=None, figprefix=None): + """Functionality to plot a 1D histogram of the distribution of + computed errors (i.e. values of observed - predicted) observed + for specific values of standard deviations computed. The range of + standard deviations computed is split in xbins values and the + 1D histograms of error distributions for the smallest six + standard deviations are plotted. + The plot generated is stored in a png file. + + Parameters + ---------- + sigma : numpy array + Array with standard deviations computed. + yerror : numpy array + Array with errors computed (observed - predicted). + method : string + Method used to comput the standard deviations (i.e. dropout, + heteroscedastic, etc.). + figprefix : string + String to prefix the filename to store the figure generated. + A '_histogram_error_per_sigma.png' string will be appended to + the figprefix given. 
+    """
+
+    xbins = 21
+    ybins = 31
+
+    H, xedges, yedges, img = plt.hist2d(sigma, yerror,# normed=True,
+                                        bins=[xbins,ybins])
+
+    fig = plt.figure(figsize=(14,16))
+    legend = []
+    for ii in range(6):#(H.shape[0]):
+        if ii != 1:
+            plt.plot(yedges[0:H.shape[1]], H[ii,:]/np.sum(H[ii,:]), marker='o',
+                markersize=12, lw=6.)
+            legend.append(str((xedges[ii] + xedges[ii+1])/2))
+    plt.legend(legend, fontsize=16)
+    ax = plt.gca()
+    plt.title('Error Dist. per Sigma for ' + method, fontsize=40)
+    ax.set_xlabel('Observed - Mean Predicted', fontsize=38, labelpad=15.)
+    ax.set_ylabel('Density', fontsize=38, labelpad=15.)
+    plt.setp(ax.get_xticklabels(), fontsize=28)
+    plt.setp(ax.get_yticklabels(), fontsize=28)
+    plt.grid(True)
+    plt.savefig(figprefix + '_histogram_error_per_sigma.png')
+    plt.close()
+    print('Generated plot: ', figprefix + '_histogram_error_per_sigma.png')
+
+
+def plot_calibration_and_errors(mean_sigma, sigma_start_index, sigma_end_index,
+                                min_sigma, max_sigma,
+                                error_thresholds,
+                                error_thresholds_smooth,
+                                err_err,
+                                s_interpolate,
+                                coverage_percentile,
+                                method=None, figprefix=None,
+                                steps=False):
+    """Functionality to plot empirical calibration curves
+       estimated by binning the statistics of computed
+       standard deviations and errors.
+
+       Parameters
+       ----------
+       mean_sigma : numpy array
+           Array with the mean standard deviations computed per bin.
+       sigma_start_index : non-negative integer
+           Index of the mean_sigma array that defines the start of
+           the valid empirical calibration interval (i.e. index to
+           the smallest std for which a meaningful error is obtained).
+       sigma_end_index : non-negative integer
+           Index of the mean_sigma array that defines the end of
+           the valid empirical calibration interval (i.e. index to
+           the largest std for which a meaningful error is obtained).
+       min_sigma : numpy array
+           Array with the minimum standard deviations computed per bin.
+       max_sigma : numpy array
+           Array with the maximum standard deviations computed per bin.
+       error_thresholds : numpy array
+           Thresholds of the errors computed to attain a certain
+           error coverage per bin.
+       error_thresholds_smooth : numpy array
+           Thresholds of the errors computed to attain a certain
+           error coverage per bin after a smoothing operation is applied
+           to the frequently noisy bin-based estimations.
+       err_err : numpy array
+           Vertical error bars (usually one standard deviation for a binomial
+           distribution estimated by bin) for the error calibration
+           computed empirically.
+       s_interpolate : scipy.interpolate python object
+           A python object from scipy.interpolate that computes a
+           univariate spline (InterpolatedUnivariateSpline) constructed
+           to express the mapping from standard deviation to error. This
+           spline is generated during the computational empirical
+           calibration procedure.
+       coverage_percentile : float
+           Value used for the coverage in the percentile estimation
+           of the observed error.
+       method : string
+           Method used to compute the standard deviations (i.e. dropout,
+           heteroscedastic, etc.).
+       figprefix : string
+           String to prefix the filename to store the figure generated.
+           A '_empirical_calibration.png' string will be appended to
+           the figprefix given.
+       steps : boolean
+           Besides the complete empirical calibration (including raw
+           statistics, error bars and smoothing), also generates partial
+           plots with only the raw bin statistics (step1) and with only
+           the raw bin statistics and the smoothing interpolation (step2).
+ """ + + xp23 = np.linspace(mean_sigma[sigma_start_index], mean_sigma[sigma_end_index], 200) + yp23 = s_interpolate(xp23) + + p_cov = coverage_percentile + if steps: + # Plot raw bin statistics + fig = plt.figure(figsize=(18,12)) + ax = plt.gca() + ax.errorbar(mean_sigma, error_thresholds, + yerr=err_err, + xerr=[mean_sigma-min_sigma, max_sigma-mean_sigma], + fmt='o', ecolor='k', capthick=2, ms=8) + plt.xlabel('Sigma Predicted (' + method + ')', fontsize=24.) + plt.ylabel(str(p_cov) + '% Coverage for ABS Observed - Mean Predicted', fontsize=24.) + plt.title('Calibration', fontsize=28) + ax.axis([0, np.max(max_sigma)*1.1, np.min(error_thresholds)*0.9, np.max(yp23)*1.2]) + plt.grid() + plt.setp(ax.get_xticklabels(), fontsize=22) + plt.setp(ax.get_yticklabels(), fontsize=22) + plt.savefig(figprefix + '_empirical_calibration_step1.png') + plt.close() + print('Generated plot: ', figprefix + '_empirical_calibration_step1.png') + # Plot raw bin statistics and smoothing + fig = plt.figure(figsize=(18,12)) + ax = plt.gca() + ax.plot(mean_sigma, error_thresholds_smooth, 'g^', ms=12) + ax.errorbar(mean_sigma, error_thresholds, + yerr=err_err, + xerr=[mean_sigma-min_sigma, max_sigma-mean_sigma], + fmt='o', ecolor='k', capthick=2, ms=8) + plt.xlabel('Sigma Predicted (' + method + ')', fontsize=24.) + plt.ylabel(str(p_cov) + '% Coverage for ABS Observed - Mean Predicted', fontsize=24.) + plt.title('Calibration', fontsize=28) + ax.axis([0, np.max(max_sigma)*1.1, np.min(error_thresholds)*0.9, np.max(yp23)*1.2]) + plt.grid() + plt.setp(ax.get_xticklabels(), fontsize=22) + plt.setp(ax.get_yticklabels(), fontsize=22) + plt.savefig(figprefix + '_empirical_calibration_step2.png') + plt.close() + print('Generated plot: ', figprefix + '_empirical_calibration_step2.png') + + # Plot raw bin statistics, smoothing and empirical calibration + fig = plt.figure(figsize=(18,12)) + ax = plt.gca() + ax.plot(xp23, yp23, 'rx', ms=20) + ax.plot(mean_sigma, error_thresholds_smooth, 'g^', ms=12) + ax.errorbar(mean_sigma, error_thresholds, + yerr=err_err, + xerr=[mean_sigma-min_sigma, max_sigma-mean_sigma], + fmt='o', ecolor='k', capthick=2, ms=8) + plt.xlabel('Sigma Predicted (' + method + ')', fontsize=24.) + plt.ylabel(str(p_cov) + '% Coverage for ABS Observed - Mean Predicted', fontsize=24.) + plt.title('Calibration', fontsize=28) + ax.axis([0, np.max(max_sigma)*1.1, np.min(error_thresholds)*0.9, np.max(yp23)*1.2]) + plt.grid() + plt.setp(ax.get_xticklabels(), fontsize=22) + plt.setp(ax.get_yticklabels(), fontsize=22) + plt.savefig(figprefix + '_empirical_calibration.png') + plt.close() + print('Generated plot: ', figprefix + '_empirical_calibration.png') + + +def plot_percentile_predictions(Ypred, Ypred_Lp, Ypred_Hp, percentile_list, pred_name=None, figprefix=None): + """Functionality to plot the mean of the percentiles predicted. + The plot generated is stored in a png file. + + Parameters + ---------- + Ypred : numpy array + Array with mid percentile predicted values. + Ypred_Lp : numpy array + Array with low percentile predicted values. + Ypred_Hp : numpy array + Array with high percentile predicted values. + percentile_list : string list + List of percentiles predicted (e.g. '10p', '90p', etc.) + pred_name : string + Name of data colum or quantity predicted (e.g. growth, AUC, etc.) + figprefix : string + String to prefix the filename to store the figure generated. + A '_density_predictions.png' string will be appended to the + figprefix given. 
+ """ + + index_ = np.argsort(Ypred) + fig = plt.figure(figsize=(24,18)) + plt.scatter(range(index_.shape[0]), Ypred[index_]) + plt.scatter(range(index_.shape[0]), Ypred_Lp[index_]) + plt.scatter(range(index_.shape[0]), Ypred_Hp[index_]) + plt.legend(percentile_list, fontsize=20) + plt.xlabel('Index', fontsize=18.) + plt.ylabel(pred_name, fontsize=18.) + plt.title('Predicted ' + pred_name + ' Percentiles', fontsize=28) + plt.grid() + ax = plt.gca() + plt.setp(ax.get_xticklabels(), fontsize=16) + plt.setp(ax.get_yticklabels(), fontsize=16) + plt.savefig(figprefix + '_percentile_predictions.png') + plt.close() + print('Generated plot: ', figprefix + '_percentile_predictions.png') + From df74ce32ede27f196b853202c737b6fe3410ca70 Mon Sep 17 00:00:00 2001 From: Cristina Date: Mon, 10 Jun 2019 16:22:16 -0600 Subject: [PATCH 005/331] Added conditional import of scikit imputer in data_utils to handle different scikit versions --- common/data_utils.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/common/data_utils.py b/common/data_utils.py index 856c63a0..99323350 100644 --- a/common/data_utils.py +++ b/common/data_utils.py @@ -3,8 +3,15 @@ import numpy as np import pandas as pd +## Adding conditional import for compatibility between +## sklearn versions +## The second commented line corresponds to a more recent version #from sklearn.preprocessing import Imputer -from sklearn.impute import SimpleImputer +#from sklearn.impute import SimpleImputer +try: + from sklearn.impute import SimpleImputer as Imputer +except ImportError: + from sklearn.preprocessing import Imputer from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler from default_utils import DEFAULT_SEED @@ -127,7 +134,10 @@ def impute_and_scale_array(mat, scaling=None): """ # imputer = Imputer(strategy='mean', axis=0, copy=False) - imputer = SimpleImputer(strategy='mean', copy=False) +# imputer = SimpleImputer(strategy='mean', copy=False) + # Next line is from conditional import. axis=0 is default + # in old version so it is not necessary. + imputer = Imputer(strategy='mean', copy=False) imputer.fit_transform(mat) return scale_array(mat, scaling) @@ -172,7 +182,10 @@ def drop_impute_and_scale_dataframe(df, scaling='std', imputing='mean', dropna=' mat = df.values else: # imputer = Imputer(strategy=imputing, axis=0) - imputer = SimpleImputer(strategy=imputing) +# imputer = SimpleImputer(strategy=imputing) + # Next line is from conditional import. axis=0 is default + # in old version so it is not necessary. + imputer = Imputer(strategy='mean', copy=False) mat = imputer.fit_transform(df.values) if scaling is None or scaling.lower() == 'none': From 8ece3283d69b0e1064e51cb05e12a8e9ab8a722e Mon Sep 17 00:00:00 2001 From: Cristina Date: Mon, 10 Jun 2019 16:36:45 -0600 Subject: [PATCH 006/331] Removed copy from imputer call (not used before). --- common/data_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/data_utils.py b/common/data_utils.py index 8e36e21c..b1a3e613 100644 --- a/common/data_utils.py +++ b/common/data_utils.py @@ -185,7 +185,7 @@ def drop_impute_and_scale_dataframe(df, scaling='std', imputing='mean', dropna=' # imputer = SimpleImputer(strategy=imputing) # Next line is from conditional import. axis=0 is default # in old version so it is not necessary. 
- imputer = Imputer(strategy=imputing, copy=False) + imputer = Imputer(strategy=imputing) mat = imputer.fit_transform(df.values) if scaling is None or scaling.lower() == 'none': From bf5211ad1bf13121588d8ba95c6d2ff77712f61e Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 19 Jun 2019 09:38:09 -0500 Subject: [PATCH 007/331] Small fix to Exception --- Pilot1/Uno/topN_to_uno.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Pilot1/Uno/topN_to_uno.py b/Pilot1/Uno/topN_to_uno.py index 1f7c2b6a..ffc153d1 100644 --- a/Pilot1/Uno/topN_to_uno.py +++ b/Pilot1/Uno/topN_to_uno.py @@ -24,7 +24,7 @@ def read_plan(filename, node): if node in plan: return plan[node] else: - raise Exception('Node index {} was not found in plan file') + raise Exception('Node index "{}" was not found in plan file'.format(node)) def build_masks(args, df): From 897806f41b919ec79537acfc2b39d9c58e9d9c74 Mon Sep 17 00:00:00 2001 From: Hyunseung Yoo Date: Thu, 27 Jun 2019 22:10:19 -0500 Subject: [PATCH 008/331] read hdf format master dataframe --- Pilot1/Uno/topN_to_uno.py | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/Pilot1/Uno/topN_to_uno.py b/Pilot1/Uno/topN_to_uno.py index ffc153d1..87c03a9e 100644 --- a/Pilot1/Uno/topN_to_uno.py +++ b/Pilot1/Uno/topN_to_uno.py @@ -1,4 +1,5 @@ import argparse +import os import json import pandas as pd import numpy as np @@ -36,8 +37,8 @@ def build_masks(args, df): for partition in ['train', 'val']: _mask = df['Sample'] == None for i, element in enumerate(plan[partition]): - cl_filter = element['CELL'] - dr_filter = element['DRUG'] + cl_filter = element['cell'] + dr_filter = element['drug'] __mask = df['Sample'].isin(cl_filter) & df['Drug1'].isin(dr_filter) _mask = _mask | __mask mask[partition] = _mask @@ -49,7 +50,7 @@ def training_mask(df): return np.random.rand(len(df)) < 0.8 -def read_dataframe(args): +def read_dataframe_from_csv(args): df = pd.read_csv(args.dataframe_from, low_memory=False, na_values='na').fillna(0) df.rename(columns={'SAMPLE': 'Sample', 'DRUG': 'Drug1'}, inplace=True) df_y = df[['AUC', 'Sample', 'Drug1']] @@ -64,8 +65,28 @@ def read_dataframe(args): return df_y, df_cl, df_dd +def read_dataframe_from_hdf(args): + store = pd.HDFStore(args.dataframe_from, 'r') + df = store.get('df') + df.rename(columns={'CELL': 'Sample', 'DRUG': 'Drug1'}, inplace=True) + df_y = df[['AUC', 'Sample', 'Drug1']] + + cols = df.columns.to_list() + cl_columns = list(filter(lambda x: x.startswith('GE_'), cols)) + dd_columns = list(filter(lambda x: x.startswith('DD_'), cols)) + + df_cl = df.loc[:, cl_columns] + df_dd = df.loc[:, dd_columns] + + return df_y, df_cl, df_dd + + def build_dataframe(args): - df_y, df_cl, df_dd = read_dataframe(args) + _, ext = os.path.splitext(args.dataframe_from) + if ext == '.h5' or ext == '.hdf5': + df_y, df_cl, df_dd = read_dataframe_from_hdf(args) + else: + df_y, df_cl, df_dd = read_dataframe_from_csv(args) # mask = training_mask(df_y) train_mask, val_mask = build_masks(args, df_y) From a8636ae3ace5e633f4cf594cce05883780a9d6de Mon Sep 17 00:00:00 2001 From: Hyunseung Yoo Date: Thu, 27 Jun 2019 22:32:34 -0500 Subject: [PATCH 009/331] add dose_aggregated AUC prediction model --- Pilot1/Uno/uno_auc_model.txt | 39 ++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 Pilot1/Uno/uno_auc_model.txt diff --git a/Pilot1/Uno/uno_auc_model.txt b/Pilot1/Uno/uno_auc_model.txt new file mode 100644 index 00000000..00d2224e --- /dev/null +++ b/Pilot1/Uno/uno_auc_model.txt 
@@ -0,0 +1,39 @@ +[Global_Params] +train_sources=['CCLE'] +test_sources=['train'] +cell_types=None +cell_features=['rnaseq'] +drug_features=['descriptors'] +dense=[1000, 1000, 1000] +dense_feature_layers=[1000, 1000, 1000] +activation='relu' +loss='mse' +optimizer='adam' +scaling='std' +drop=0 +epochs=50 +batch_size=512 +validation_split=0.2 +cv=1 +max_val_loss=1.0 +learning_rate=None +base_lr=None +residual=False +reduce_lr=False +warmup_lr=False +batch_normalization=False +feature_subsample=0 +rng_seed=2018 +save_path='save/uno' +no_gen=False +verbose=False +no_response_source=True +no_feature_source=True +use_landmark_genes=True +agg_dose='AUC' +preprocess_rnaseq='source_scale' +single=True + +[Monitor_Params] +solr_root='' +timeout=3600 From a2aa2718c6a0923f8b4772e7f282dcb64e843d16 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 16 Jul 2019 13:05:20 -0500 Subject: [PATCH 010/331] Create cache directory if it does not exist --- Pilot1/Uno/uno_data.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Pilot1/Uno/uno_data.py b/Pilot1/Uno/uno_data.py index 52450fb2..c15e217e 100644 --- a/Pilot1/Uno/uno_data.py +++ b/Pilot1/Uno/uno_data.py @@ -654,6 +654,10 @@ def save_to_cache(self, cache, params): for k in ['self', 'cache', 'single']: if k in params: del params[k] + dirname = os.path.dirname(cache) + if not os.path.exists(dirname): + logger.debug('Creating directory for cache: %s', dirname) + os.mkdir(dirname) param_fname = '{}.params.json'.format(cache) with open(param_fname, 'w') as param_file: json.dump(params, param_file, sort_keys=True) From 0d0e657249ddc1f87354dc1064789dda0fd0fcd7 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 16 Jul 2019 14:09:40 -0500 Subject: [PATCH 011/331] Fix typos --- common/default_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/common/default_utils.py b/common/default_utils.py index 143e227e..5ea5bfaa 100644 --- a/common/default_utils.py +++ b/common/default_utils.py @@ -319,19 +319,19 @@ def set_seed(seed): def initialize_parameters(bmk): - """Utility to parse parameters in common as well as parmeters + """Utility to parse parameters in common as well as parameters particular to each benchmark. Parameters ---------- bmk : benchmark object Object that has benchmark filepaths and specifications - + Return ---------- gParameters : python dictionary Dictionary with all the parameters necessary to run the benchmark. - Command line overwrites config file especifications + Command line overwrites config file specifications """ # Parse common parameters From dd2b2e7b6ae9a28d526a1078158e58f884e5c846 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 16 Jul 2019 14:38:13 -0500 Subject: [PATCH 012/331] Post questions regarding CombinedDataLoader.load_from_cache() --- Pilot1/Uno/uno_data.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Pilot1/Uno/uno_data.py b/Pilot1/Uno/uno_data.py index c15e217e..1488b6a8 100644 --- a/Pilot1/Uno/uno_data.py +++ b/Pilot1/Uno/uno_data.py @@ -622,6 +622,7 @@ def __init__(self, seed=SEED): self.seed = seed def load_from_cache(self, cache, params): + """ NOTE: How does this function return an error? (False?) 
-Wozniak """ param_fname = '{}.params.json'.format(cache) if not os.path.isfile(param_fname): logger.warning('Cache parameter file does not exist: %s', param_fname) @@ -648,6 +649,7 @@ def load_from_cache(self, cache, params): self.__dict__.update(obj.__dict__) logger.info('Loaded data from cache: %s', fname) return True + # NOTE: This is unreachable -Wozniak return False def save_to_cache(self, cache, params): From 0aca0c9093cc24f23a3a30b24958f6c2fc69525d Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 16 Jul 2019 14:38:24 -0500 Subject: [PATCH 013/331] Fix typo --- Pilot1/Uno/uno_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Pilot1/Uno/uno_data.py b/Pilot1/Uno/uno_data.py index 1488b6a8..1406a8de 100644 --- a/Pilot1/Uno/uno_data.py +++ b/Pilot1/Uno/uno_data.py @@ -636,7 +636,7 @@ def load_from_cache(self, cache, params): ignore_keys = ['cache', 'partition_by', 'single'] equal, diffs = dict_compare(params, cached_params, ignore_keys) if not equal: - logger.warning('Cache parameter mismatch: %s\nSaved: %s\nAttemptd to load: %s', diffs, cached_params, params) + logger.warning('Cache parameter mismatch: %s\nSaved: %s\nAttempted to load: %s', diffs, cached_params, params) logger.warning('\nRemove %s to rebuild data cache.\n', param_fname) raise ValueError('Could not load from a cache with incompatible keys:', diffs) else: From 6e531c36fea0fde8086d57507ff009e059cc8ade Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Thu, 18 Jul 2019 09:41:21 -0500 Subject: [PATCH 014/331] Improve log messages --- Pilot1/Uno/uno_baseline_keras2.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Pilot1/Uno/uno_baseline_keras2.py b/Pilot1/Uno/uno_baseline_keras2.py index 8de286a5..0ec9201b 100644 --- a/Pilot1/Uno/uno_baseline_keras2.py +++ b/Pilot1/Uno/uno_baseline_keras2.py @@ -403,7 +403,7 @@ def warmup_scheduler(epoch): template_model = build_model(loader, args, silent=True) if args.initial_weights: - logger.info("Loading weights from {}".format(args.initial_weights)) + logger.info("Loading initial weights from {}".format(args.initial_weights)) template_model.load_weights(args.initial_weights) if len(args.gpus) > 1: @@ -444,6 +444,7 @@ def warmup_scheduler(epoch): if args.tb: callbacks.append(tensorboard) if args.save_weights: + logger.info("Will save weights to: " + args.save_weights) callbacks.append(MultiGPUCheckpoint(args.save_weights)) if args.use_exported_data is not None: From 2e09f37e1938531ac442f1188718d87825cc9827 Mon Sep 17 00:00:00 2001 From: Hyunseung Yoo Date: Thu, 1 Aug 2019 11:37:39 -0500 Subject: [PATCH 015/331] loocv data util --- Pilot1/Uno/loocv_data_util.py | 91 +++++++++++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) create mode 100644 Pilot1/Uno/loocv_data_util.py diff --git a/Pilot1/Uno/loocv_data_util.py b/Pilot1/Uno/loocv_data_util.py new file mode 100644 index 00000000..d42a41fb --- /dev/null +++ b/Pilot1/Uno/loocv_data_util.py @@ -0,0 +1,91 @@ +import argparse +import json +import pandas as pd +import numpy as np + + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument('--dataframe_from', type=str, default='GDSC.h5', + help='Dataframe file name contains all data points') + parser.add_argument('--plan', type=str, default='plan.json', + help='Plan data file') + parser.add_argument('--node', type=str, default=None, + help='node number to execute') + + args, unparsed = parser.parse_known_args() + return args, unparsed + + +def read_plan(filename, node): + print("reading {} file for node 
{}".format(filename, node)) + with open(filename, 'r') as plan_file: + plan = json.load(plan_file) + if node in plan: + return plan[node] + else: + raise Exception('Node index "{}" was not found in plan file'.format(node)) + + +def build_masks(args, df): + if args.node is None: + raise Exception('Node id is not given') + + plan = read_plan(args.plan, args.node) + mask = {} + for partition in ['train', 'val']: + _mask = df['Sample'] is None + for i, element in enumerate(plan[partition]): + cl_filter = element['cell'] + dr_filter = element['drug'] + __mask = df['Sample'].isin(cl_filter) & df['Drug1'].isin(dr_filter) + _mask = _mask | __mask + mask[partition] = _mask + + return mask['train'], mask['val'] + + +def training_mask(df): + return np.random.rand(len(df)) < 0.8 + + +def build_dataframe(args): + store = pd.HDFStore(args.dataframe_from, 'r') + df_y = store.get('y_train') + df_ds = store.get('x_train_0') + df_cl = store.get('x_train_1') + df_dd = store.get('x_train_2') + df_fp = store.get('x_train_3') + + train_mask, val_mask = build_masks(args, df_y) + + y_train = pd.DataFrame(data=df_y[train_mask].reset_index(drop=True)) + y_val = pd.DataFrame(data=df_y[val_mask].reset_index(drop=True)) + + x_train_0 = df_ds[train_mask].reset_index(drop=True) + x_train_1 = df_cl[train_mask].reset_index(drop=True) + x_train_2 = df_dd[train_mask].reset_index(drop=True) + x_train_3 = df_fp[train_mask].reset_index(drop=True) + + x_val_0 = df_ds[val_mask].reset_index(drop=True) + x_val_1 = df_cl[val_mask].reset_index(drop=True) + x_val_2 = df_dd[val_mask].reset_index(drop=True) + x_val_3 = df_fp[val_mask].reset_index(drop=True) + + # store + store = pd.HDFStore('topN.uno.h5', 'w') + store.put('y_train', y_train) + store.put('y_val', y_val) + store.put('x_train_0', x_train_0) + store.put('x_train_1', x_train_1) + store.put('x_train_2', x_train_2) + store.put('x_train_3', x_train_3) + store.put('x_val_0', x_val_0) + store.put('x_val_1', x_val_1) + store.put('x_val_2', x_val_2) + store.put('x_val_3', x_val_3) + + +if __name__ == '__main__': + parsed, unparsed = parse_arguments() + build_dataframe(parsed) From 23586c20ea16d6d5be1542ed9ab39ee789190ad3 Mon Sep 17 00:00:00 2001 From: Hyunseung Yoo Date: Thu, 1 Aug 2019 23:40:38 -0500 Subject: [PATCH 016/331] use table format --- Pilot1/Uno/loocv_data_util.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/Pilot1/Uno/loocv_data_util.py b/Pilot1/Uno/loocv_data_util.py index d42a41fb..412dba5b 100644 --- a/Pilot1/Uno/loocv_data_util.py +++ b/Pilot1/Uno/loocv_data_util.py @@ -74,16 +74,16 @@ def build_dataframe(args): # store store = pd.HDFStore('topN.uno.h5', 'w') - store.put('y_train', y_train) - store.put('y_val', y_val) - store.put('x_train_0', x_train_0) - store.put('x_train_1', x_train_1) - store.put('x_train_2', x_train_2) - store.put('x_train_3', x_train_3) - store.put('x_val_0', x_val_0) - store.put('x_val_1', x_val_1) - store.put('x_val_2', x_val_2) - store.put('x_val_3', x_val_3) + store.put('y_train', y_train, format='t') + store.put('y_val', y_val, format='t') + store.put('x_train_0', x_train_0, format='t') + store.put('x_train_1', x_train_1, format='t') + store.put('x_train_2', x_train_2, format='t') + store.put('x_train_3', x_train_3, format='t') + store.put('x_val_0', x_val_0, format='t') + store.put('x_val_1', x_val_1, format='t') + store.put('x_val_2', x_val_2, format='t') + store.put('x_val_3', x_val_3, format='t') if __name__ == '__main__': From 8cf5d6d4abb7c5affdbbce7adbe42fb36ecdbbda Mon Sep 17 00:00:00 
2001 From: Hyunseung Yoo Date: Sun, 4 Aug 2019 10:22:55 -0500 Subject: [PATCH 017/331] add fom default model --- Pilot1/Uno/uno_fom_model.txt | 38 ++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 Pilot1/Uno/uno_fom_model.txt diff --git a/Pilot1/Uno/uno_fom_model.txt b/Pilot1/Uno/uno_fom_model.txt new file mode 100644 index 00000000..cf66baae --- /dev/null +++ b/Pilot1/Uno/uno_fom_model.txt @@ -0,0 +1,38 @@ +[Global_Params] +train_sources=['GDSC'] +test_sources=['train'] +cell_types=None +cell_features=['rnaseq'] +drug_features=['descriptors', 'fingerprints'] +dense=[1000, 1000, 1000] +dense_feature_layers=[1000, 1000, 1000] +activation='relu' +loss='mse' +optimizer='adam' +scaling='std' +drop=0 +epochs=50 +batch_size=512 +validation_split=0.2 +cv=1 +max_val_loss=1.0 +learning_rate=None +base_lr=None +residual=False +reduce_lr=False +warmup_lr=False +batch_normalization=False +feature_subsample=0 +rng_seed=2018 +save_path='save/uno' +no_gen=False +verbose = False +use_landmark_genes=True +preprocess_rnaseq='source_scale' +no_feature_source=True +no_response_source=True +single=True + +[Monitor_Params] +solr_root='' +timeout=-1 From 4a9fcea5678388f225096988f6fd5c61cfb0441e Mon Sep 17 00:00:00 2001 From: brettin Date: Fri, 9 Aug 2019 11:32:18 -0500 Subject: [PATCH 018/331] tested py 37 --- README.setup.linux | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/README.setup.linux b/README.setup.linux index 504a1e87..b12446d5 100644 --- a/README.setup.linux +++ b/README.setup.linux @@ -2,27 +2,30 @@ # ------------------------------ # Download the Anaconda installer -# curl -o Anaconda2-4.3.1-Linux-x86_64.sh https://repo.continuum.io/archive/Anaconda2-4.3.1-Linux-x86_64.sh -curl -o Anaconda3-5.1.0-Linux-x86_64.sh https://repo.anaconda.com/archive/Anaconda3-5.1.0-Linux-x86_64.sh +curl -o Anaconda3-2018.12-Linux-x86_64.sh https://repo.continuum.io/archive/Anaconda3-2018.12-Linux-x86_64.sh # Make the installer executable -chmod u+x ./Anaconda3-5.1.0-Linux-x86_64.sh +chmod u+x ./Anaconda3-2018.12-Linux-x86_64.sh # Run the installer, accepting the defaults. -./Anaconda3-5.1.0-Linux-x86_64.sh +./Anaconda3-2018.12-Linux-x86_64.sh # Add anaconda2/bin to your path (assumes default install location) export PATH=$HOME/anaconda3/bin:$PATH +# Create a new conda environment +conda create --name py37_candle +source activate py37_candle + # Install additonal modules not shipped with Anaconda -conda install -y -c conda-forge tensorflow -conda install -y -c anaconda hdf5=1.8.17 -conda install -y -c anaconda theano -conda install -y -c conda-forge keras=2 -conda install -y -c anaconda pandas -conda install -y -c anaconda scikit-learn -conda install -y -c anaconda matplotlib -conda install -y -c conda-forge pygpu +conda install -c conda-forge tensorflow +conda install -c anaconda hdf5=1.8.17 +conda install -c anaconda theano +conda install -c conda-forge keras=2 +conda install -c anaconda pandas +conda install -c anaconda scikit-learn +conda install -c anaconda matplotlib +---conda install -c conda-forge pygpu # Install additional modules for Pilot2 benchmarks conda install -c conda-forge opencv From a555a20be040370f0295bd5acbbd59e61a29bb32 Mon Sep 17 00:00:00 2001 From: Jamal Date: Fri, 9 Aug 2019 11:39:03 -0600 Subject: [PATCH 019/331] Added first draft of simple profiling hooks to enable nvprof to profile benchmarks. 
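
The hooks are thin wrappers around numba.cuda.profile_start()/profile_stop(),
gated by the new --profiling (-p) flag added to the common parser. A minimal
usage sketch follows (illustrative only: initialize_parameters(), run() and the
'profiling' key are placeholders for a benchmark's own wiring, not part of this
patch):

    import candle

    def main():
        params = initialize_parameters()             # benchmark-specific setup
        candle.start_profiling(params['profiling'])  # no-op unless profiling is enabled
        run(params)                                  # training/inference region to capture
        candle.stop_profiling(params['profiling'])

    if __name__ == '__main__':
        main()

Run under, e.g., nvprof --profile-from-start off, data collection is then
limited to the region between the two hooks.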
---
 common/candle/__init__.py | 3 +++
 common/default_utils.py | 4 ++++
 common/profiling_utils.py | 11 +++++++++++
 3 files changed, 18 insertions(+)
 create mode 100644 common/profiling_utils.py

diff --git a/common/candle/__init__.py b/common/candle/__init__.py
index 486ef1ef..95a2eac6 100644
--- a/common/candle/__init__.py
+++ b/common/candle/__init__.py
@@ -48,6 +48,9 @@
 from uq_utils import applying_calibration
 from uq_utils import overprediction_check
 
+#profiling
+from profiling_utils import start_profiling
+from profiling_utils import stop_profiling
 
 # import benchmark-dependent utils
 import sys

diff --git a/common/default_utils.py b/common/default_utils.py
index 143e227e..33915ae3 100644
--- a/common/default_utils.py
+++ b/common/default_utils.py
@@ -570,6 +570,10 @@ def get_common_parser(parser):
                         default=[], type=int,
                         help="set IDs of GPUs to use")
 
+    # profiling flags
+    parser.add_argument("-p", "--profiling", type=str2bool,
+                        default = 'false',
+                        help="Turn profiling on or off")
 
 
     return parser

diff --git a/common/profiling_utils.py b/common/profiling_utils.py
new file mode 100644
index 00000000..81d793b4
--- /dev/null
+++ b/common/profiling_utils.py
@@ -0,0 +1,11 @@
+import numba.cuda
+
+def start_profiling(do_prof):
+    if (do_prof):
+        numba.cuda.profile_start()
+
+
+def stop_profiling(do_prof):
+    if (do_prof):
+        numba.cuda.profile_stop()
+

From 5b386feab77992d917525abf51847bdbfec8bd41 Mon Sep 17 00:00:00 2001
From: Hyunseung Yoo
Date: Tue, 13 Aug 2019 15:22:54 -0500
Subject: [PATCH 020/331] generate random split when node is not given; fix hdfstore issues

---
 Pilot1/Uno/topN_to_uno.py | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/Pilot1/Uno/topN_to_uno.py b/Pilot1/Uno/topN_to_uno.py
index 87c03a9e..5ab5f3ff 100644
--- a/Pilot1/Uno/topN_to_uno.py
+++ b/Pilot1/Uno/topN_to_uno.py
@@ -30,7 +30,9 @@ def read_plan(filename, node):
 
 def build_masks(args, df):
     if args.node is None:
-        raise Exception('Node id is not given')
+        print('node is None. 
Generate Random split')
+        mask = np.random.rand(len(df)) < 0.8
+        return mask, ~mask
 
     plan = read_plan(args.plan, args.node)
     mask = {}
     for partition in ['train', 'val']:
@@ -96,18 +98,20 @@ def build_dataframe(args):
 
     x_train_0 = df_cl[train_mask].reset_index(drop=True)
     x_train_1 = df_dd[train_mask].reset_index(drop=True)
+    x_train_1.columns = [''] * len(x_train_1.columns)
 
     x_val_0 = df_cl[val_mask].reset_index(drop=True)
     x_val_1 = df_dd[val_mask].reset_index(drop=True)
+    x_val_1.columns = [''] * len(x_val_1.columns)
 
     # store
-    store = pd.HDFStore('topN.uno.h5', 'w')
-    store.put('y_train', y_train)
-    store.put('y_val', y_val)
-    store.put('x_train_0', x_train_0)
-    store.put('x_train_1', x_train_1)
-    store.put('x_val_0', x_val_0)
-    store.put('x_val_1', x_val_1)
+    store = pd.HDFStore('topN.uno.h5', 'w', complevel=9, complib='blosc:snappy')
+    store.put('y_train', y_train, format='table')
+    store.put('y_val', y_val, format='table')
+    store.put('x_train_0', x_train_0, format='table')
+    store.put('x_train_1', x_train_1, format='table')
+    store.put('x_val_0', x_val_0, format='table')
+    store.put('x_val_1', x_val_1, format='table')
 
 
 if __name__ == '__main__':

From 59718b69f1f95831a6b87b514d52473314673e06 Mon Sep 17 00:00:00 2001
From: Hyunseung Yoo
Date: Thu, 15 Aug 2019 09:33:09 -0500
Subject: [PATCH 021/331] set timeout unlimited

---
 Pilot1/Uno/uno_auc_model.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Pilot1/Uno/uno_auc_model.txt b/Pilot1/Uno/uno_auc_model.txt
index 00d2224e..23f93ba8 100644
--- a/Pilot1/Uno/uno_auc_model.txt
+++ b/Pilot1/Uno/uno_auc_model.txt
@@ -36,4 +36,4 @@ single=True
 
 [Monitor_Params]
 solr_root=''
-timeout=3600
+timeout=-1

From 2a5033c554b97a7aa281c72330900eae1b63972a Mon Sep 17 00:00:00 2001
From: Hyunseung Yoo
Date: Wed, 21 Aug 2019 09:32:09 -0500
Subject: [PATCH 022/331] set hyper-params for auc training

---
 Pilot1/Uno/uno_auc_model.txt | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Pilot1/Uno/uno_auc_model.txt b/Pilot1/Uno/uno_auc_model.txt
index 23f93ba8..4a803b43 100644
--- a/Pilot1/Uno/uno_auc_model.txt
+++ b/Pilot1/Uno/uno_auc_model.txt
@@ -8,15 +8,15 @@ dense=[1000, 1000, 1000]
 dense_feature_layers=[1000, 1000, 1000]
 activation='relu'
 loss='mse'
-optimizer='adam'
+optimizer='sgd'
 scaling='std'
 drop=0
 epochs=50
-batch_size=512
+batch_size=32
 validation_split=0.2
 cv=1
 max_val_loss=1.0
-learning_rate=None
+learning_rate=0.0001
 base_lr=None
 residual=False
 reduce_lr=False

From 0ad72d38a9a70b31e8ab2aee5a209e8f7fd43c6a Mon Sep 17 00:00:00 2001
From: Hyunseung Yoo
Date: Thu, 22 Aug 2019 15:55:34 -0500
Subject: [PATCH 023/331] code cleanup

---
 Pilot1/Uno/topN_to_uno.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Pilot1/Uno/topN_to_uno.py b/Pilot1/Uno/topN_to_uno.py
index 5ab5f3ff..dd81d9f3 100644
--- a/Pilot1/Uno/topN_to_uno.py
+++ b/Pilot1/Uno/topN_to_uno.py
@@ -31,13 +31,13 @@ def read_plan(filename, node):
 
 def build_masks(args, df):
     if args.node is None:
         print('node is None. 
Generate Random split') - mask = np.random.rand(len(df)) < 0.8 + mask = training_mask(df) return mask, ~mask plan = read_plan(args.plan, args.node) mask = {} for partition in ['train', 'val']: - _mask = df['Sample'] == None + _mask = df['Sample'] is None for i, element in enumerate(plan[partition]): cl_filter = element['cell'] dr_filter = element['drug'] From 3ba592562653f54c232b71183c7b097428dcc645 Mon Sep 17 00:00:00 2001 From: Hyunseung Yoo Date: Fri, 23 Aug 2019 00:43:13 -0500 Subject: [PATCH 024/331] use plangen api --- Pilot1/Uno/plangen.py | 1489 +++++++++++++++++++++++++++++++++++++ Pilot1/Uno/topN_to_uno.py | 87 ++- 2 files changed, 1560 insertions(+), 16 deletions(-) create mode 100644 Pilot1/Uno/plangen.py diff --git a/Pilot1/Uno/plangen.py b/Pilot1/Uno/plangen.py new file mode 100644 index 00000000..5eccdcca --- /dev/null +++ b/Pilot1/Uno/plangen.py @@ -0,0 +1,1489 @@ + +from collections import deque +from collections import namedtuple +from enum import Enum +import glob +import itertools as it +import json +import numpy as np +import os +import sys +import sqlite3 +from sqlite3 import Error as db_Error + +# import planargs + +from abc import ABC, abstractmethod # abstract class support +from collections import OrderedDict +from scipy.special import comb +from pprint import pprint as pp +from datetime import datetime + +ISO_TIMESTAMP = "seconds" # timestamp to ISO string +ISO_TIMESTAMP_ENCODE = "%Y-%m-%dT%H:%M:%S" # ISO string to timestamp +DEBUG_SQL = False + +def isempty(path): + """Determine whether the given directory is empty.""" + flist = glob.glob(os.path.join(path,'*')) + return flist == [] + + +def validate_args(args): + """Validate the execution arguments as defined in planargs.py. + + This function validates input arguments defined in the 'args' namespace. + The inputs are lists series of feature-set names (fs_names), files + (fs_paths) and partitioning attributes (fs_parts). fs_names and fs_files + must designate the same number of parameters. For example: + + --fs_names CELL DRUG --fs_paths cells.txt drugs.txt + + The CELL name is paired with the cells.txt file, DRUG with drugs.txt, etc. + Currently, this one for one correspondence also applies to the fs_part arg, + which specifies the number of partitions the feature-set list is broken + into at every level of the plan generation recursion. A complete example + might look like this: + + --fsnames CELL DRUG --fs_paths cells.txt drugs.txt --fs_parts 2 2 + + An output directory for the plan in any of its formats is given by out_dir. + An input directory may be specified via in_dir to simplify the coding of + fs_paths. Otherwise, feature-set files must be fully specified. Each of the + files is read and returned. + + Returns + Upon success, a tuple is returned. It contains: + + t[0] - the generator class implementing the appropriate partition() + function. + + t[1] - a list of feature-set entry lists is returned. All entries + are stripped of white-space, all white-space lines have been removed. + For example: + + [[CELL1 ... CELLn] [DRUG1 ... DRUGn]] + + Additionally, an args.lines list is created where each entry contains + the entry count of the corresponding fs_paths file argument. 
+ """ + params = {} + verbose = args.verbose + + fs_names_len = len(args.fs_names) + fs_paths_len = len(args.fs_paths) + fs_parts_len = len(args.fs_parts) + + nbr_feature_sets = fs_names_len + test_lengths = [fs_names_len, fs_paths_len, fs_parts_len] + reqd_lengths = [nbr_feature_sets] * 3 + + if test_lengths != reqd_lengths: + sys.exit("Error: The lengths of all feature set definition args (fs_<>) must be identical") + + if nbr_feature_sets <= 1: + sys.exit("Error: Partitioning requires multiple feature sets") + + for nparts in args.fs_parts: + if nparts < 1 or nparts >= 8: + sys.exit("Error: Invalid partitioning value %d" % nparts) + + # validate input and output directories + if args.in_dir and not os.path.isdir(args.in_dir): + sys.exit("Error: --in_dir must designate a directory, '%s' is not valid" % args.in_dir) + + if not os.path.isdir(args.out_dir): + sys.exit("Error: --out_dir must designate a directory, '%s' is not valid" % args.out_dir) + + if not args.overwrite and not isempty(args.out_dir): + sys.exit("Error: --out_dir '%s' is not empty, --overwrite not specified" % args.out_dir) + + if verbose: + print("Writing plan information to %s" % os.path.abspath(args.out_dir)) + + # expand, validate and load input feature-set content lists + fs_content = [] + args.fs_lines = [] + file_error = False + if args.in_dir == None: + args.in_dir = '' # prepare for use in os.path.join() + + for i, path in enumerate(args.fs_paths): + fullpath = os.path.join(args.in_dir, path) + if not os.path.exists(fullpath): + file_error = True + print("Error: %s file not found" % fullpath) + else: + with open(fullpath, 'r') as f: # read text and sanitize + raw_lines = f.readlines() + + text = [line.strip() for line in raw_lines] + text = [l for l in text if l != ''] + fs_content.append(text) + args.fs_lines.append(len(text)) + + if verbose: + print("Loading '%s' feature set definition from %s - %d lines" + % (args.fs_names[i], fullpath, len(text))) + + if file_error: + sys.exit("Terminating due to error") + + # construct a partitioning object exporting a partion() function + if args.partition_strategy == 'leaveout': + generator = LeaveoutSubsetGenerator() + + # return feature-set contents lists + return generator, fs_content + + +class SubsetGenerator(ABC): + """Abstract class implementing a data partitioning method. + + The SubsetGenerator class provides a template for subclasses that implement + mechanisms for dividing sets of lists into sublists for the purpose of + defining unique ML training and validation sets. + + Subclasses must implement those methods defined as @abstractmethod. + The validate() function provided here does a sanity test for all anticipated + partitioning schemes. Subclasses should implement their specializations. + """ + + def __init__(self, name=''): + self.name = name + self.term_msg = "Terminating due to error" + + @abstractmethod + def partition( + self, + base, + size=None, + count=None, + name='-unspecified-' + ): + """Partition a feature-set array. + + Partition the 'base', a list of elements, using the abstract arguments + 'size' and 'count' to tailor the implementation's algorithm. 'name' is + used in error reporting and is optional. + """ + validate(self, base, size, count, name) + return [] + + def get_plan_label(self, plan_dict, root_name): + root = plan_dict[root_name] + return root['label'] + + def _validation_error(self, base_len, size, count, name='-unspecified-'): + """Provide a common error reporting function. 
""" + print("Base list length: %d requested %d sublists of length %d" % + (base_len, count, size)) + + def validate(self, base, size=None, count=None, name='-unspecified-'): + """Provide basic request validation, specific generators may impose + additional requirements. + """ + berror = False + base_len = len(base) + + if size == None or size <= 0 or size > base_len: + berror = True + else: + unique_combos = comb(base_len, size) # implements N take K + if count > unique_combos: + berror = True + if berror: + SubsetGenerator._validation_error(self, base_len, size, count, name) + + return not berror + +# +# UNDER EVALUATION ????????????????????????????????????????????????????? +# + +class IterativeSubsetGenerator(SubsetGenerator): + """ Tom Brettin method... subset generation via iteration over base""" + def __init__(self): + SubsetGenerator.__init__(self, 'IterativeSubsetGenerator') + + def partition(self, base, size=None, count=0, name=None): + """ """ + + if size is None: + print("Error: Unspecified list partitioning size") + sys.exit(3) + + """ + base_len = len(base) + if count == 0: # a simplification useful in the iterative approach + count = base_len + """ + + is_valid = SubsetGenerator.validate(self, base, size, count, name) + if not is_valid: + print(self.term_msg) + sys.exit(1) + + if count > base_len: + SubsetGenerator._validation_error(self, base_len, size, count, name) + print(self.term_msg) + sys.exit(2) + + np_base = np.array(base) + selected_sublists = [] + omit_size = base_len - size + increment = min(size, omit_size) + + # omit consecutive blocks of feature-name entries + for i in range(count): + org = i * increment + if org >= base_len: + org = org % base_len + if org == 0 and i > 0: + print("Warning: %d sublists of %s completed short of the requested %d" + % (i, name, count)) + break + + end = org + size + sublist = np_base.take(range(org, end), mode='wrap') + print(sublist) + selected_sublists.append(sublist) + + return selected_sublists + + +class LeaveoutSubsetGenerator(SubsetGenerator): + """CANDLE milestone 13 style feature set partitioning. + + All SubsetGenerator subclasses are required to implement partition(), + plan_init() and plan_term() functions. + """ + + def __init__(self): + SubsetGenerator.__init__(self, 'LeaveoutSubsetGenerator') + self.strategy = "leaveout" + + def plan_init(self, fs_names, fs_paths, fs_lines, fs_parts, maxdepth, root_name='1'): + """Initialize - collect plan metadata """ + currtime = datetime.now() + details = {'fs_names': fs_names, 'fs_filepaths':fs_paths, 'fs_parts': fs_parts} + details['create_date'] = currtime.isoformat(timespec=ISO_TIMESTAMP) + details['strategy'] = self.strategy + + label = '' + for i in range(len(fs_names)): + if i != 0: + label += '_' + s = '{}{}-p{}'.format(fs_names[i], fs_lines[i], fs_parts[i]) + label += s + + if maxdepth > 0: + label += '-maxdepth{}'.format(maxdepth) + + details['label'] = label + plan_dict = OrderedDict() + plan_dict[root_name] = details + return root_name, plan_dict + + def plan_term(self, plan_dict, root_name, nbr_subplans): + """Completion - post plan summary metadata """ + meta = plan_dict[root_name] + meta['nbr_subplans'] = nbr_subplans + + + def partition(self, base, size='n/a', count=None, name=None): + """Partition a feature-set list into lists of equal sized elements. + + This partitioner accepts a list of feature-set names and returns + 'count' lists, the elements evenly divided between these lists. 
+ The last sublist will contain more or fewer elements if the base + list cannot be evenly divided. + + Args + base: A list of feature-set names. + size: Ignored, not used in this implementation. + count: The number of equal sized partitions requested, required. + name: A tag used for debug/error tracing. Not used in this + implementation. + + These arguments are common to all partition functions defined in + SubsetGenerator subclasses. + + Returns + When the input 'base' list contains a number of entries equal to or + greater than 'count', a list of 'count' sublists is returned. For + example: + + [[CELL1, ..., CELL4], [CELL5, ..., CELL7]] + + Otherwise the base list is returned as a list of lists, each list + containing one feature from the input list. This implementation + maintains compatibility with the "standard" return format discussed + above. + """ + + base_len = len(base) + if base_len < count: # can partition any further? + return [[feature] for feature in base] + + size = base_len // count + sublists = [] + + for i in range(count): + org = i * size + end = org + size + if i != count - 1: + part = base[org:end] + else: + part = base[org:] + sublists.append(part) + + return sublists + +#------------------------------------------------------------------------------ +# Database support, table and column definitions, DDL and DML +# Refer to the plan_prep() function for a discussion of the "planstat" and +# "runhist" tables defined below. +#------------------------------------------------------------------------------ + +class RunType(Enum): + RUN_ALL = 0 + RESTART = 1 + +class RunStat(Enum): # subplan execution status + SCHEDULED = 'scheduled' + COMPLETE = 'complete' + +# planstat table, rows are returned via the PlanstatRow namedtuple + +_planstat_ddl = """ + CREATE TABLE IF NOT EXISTS planstat ( + plan_name TEXT NOT NULL PRIMARY KEY, + create_date TEXT NOT NULL, + feature_sets TEXT NOT NULL, + partitions TEXT NOT NULL, + nbr_subplans INTEGER + ); """ + +PlanstatRow = namedtuple('PlanstatRow', + [ + 'rowid', + 'plan_name', + 'create_date', + 'feature_sets', + 'partitions', + 'nbr_subplans' + ] +) + +_select_row_from_planstat = """ + SELECT rowid, + plan_name, create_date, feature_sets, partitions, nbr_subplans + FROM planstat + WHERE plan_name='{}' + """ + +_insert_planstat_plan = """ + INSERT INTO planstat ( + plan_name, create_date, feature_sets, partitions, nbr_subplans) + VALUES ('{}', '{}', '{}', '{}', {}) + """ + +_delete_planstat_plan = """ + DELETE FROM planstat where rowid = {} + """ + +# runhist table, rows are returned via the RunhistRow namedtuple + +_runhist_ddl = """ + CREATE TABLE IF NOT EXISTS runhist ( + plan_id INTEGER NOT NULL, + subplan_id TEXT NOT NULL, + status TEXT NOT NULL, + start_time TEXT NOT NULL, + stop_time TEXT, + run_mins INT, + mae REAL, + mse REAL, + r_square REAL, + other_info TEXT, + weights_fn TEXT, + PRIMARY KEY (plan_id, subplan_id) + ); """ + +RunhistRow = namedtuple('RunhistRow', + [ + 'plan_id', + 'subplan_id', + 'status', + 'start_time', + 'stop_time', + 'run_mins', + 'mae', + 'mse', + 'r_square', + 'other_info', + 'weights_fn' + ] +) + +_select_row_from_runhist = """ + SELECT plan_id, subplan_id, status, + start_time, stop_time, run_mins, + mae, mse, r_square, other_info, weights_fn + FROM runhist + WHERE plan_id = {} and subplan_id = '{}' + """ + +_insupd_scheduled_runhist = """ + REPLACE INTO runhist(plan_id, subplan_id, status, start_time, + stop_time, run_mins, mae, mse, r_square, other_info, weights_fn) + VALUES({}, '{}', '{}', '{}', 
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL) + """ + +_insupd_completed_runhist = """ + UPDATE runhist SET + status = '{}', + stop_time = '{}', + run_mins = {}, + mae = {}, + mse = {}, + r_square = {}, + other_info = '{}', + weights_fn = '{}' + WHERE + plan_id = {} AND subplan_id='{}' + """ + +_delete_from_runhistory = """ + DELETE FROM runhist where plan_id = {} + """ + +#------------------------------------------------------------------------------ +# "Plan management" Database functions +# +# db_connect - establish database connection returning conn handle +# execute_sql_stmt - execute a SQL statement with optional error trap +# plan_prep - prepare for the execution of a multi-step "plan" +# start_subplan - start a subplan, (ex. '1.4.8'), write RunhistRow +# stop_subplan - stop a subplan, update RunhistRow +# get_subplan_runhist - return a RunhistRow for a given subplan +# plan_remove - remove all database records for the named plan +#------------------------------------------------------------------------------ + +def execute_sql_stmt(conn, stmt, cursor=None, trap_exception=False): + """Execute a SQL statement. + + This is a convenience function that wraps the execution of a given SQL + statement with exception handling and cleanup logic. + + Args + conn: An open database connection handle + stmt: A fully instantiated SQL statement + + cursor: Optionally, a cursor managed by the caller. If + local cursor is used. Provide a cursor if you must + operate on it after completion, fetchall() for example. + + trap_exception: By default exceptions raised by the database must be + handled by the caller. If True, errors are reflected + by the boolean return value and the cursor and/or + connection handle provided by the caller are closed.. + + Returns + False indicates that an exception occurred, else True. + """ + + if cursor: + lclcsr = cursor + else: + lclcsr = conn.cursor() + try: + if DEBUG_SQL: + with open("plangen_db.log", "a") as fp: + fp.write("STMT: " + stmt + "\n") + + db_exception = False + lclcsr.execute(stmt) + + except db_Error as e: + db_exception = True + print('execute_sql_stmt:', stmt) + print('execute_sql_stmt:', e) + if not trap_exception: + raise + finally: + if not cursor: + lclcsr.close() + + if db_exception: + if cursor: + cursor.close() + conn.close() + + return not db_exception + + +def db_connect(db_path): + """Connect to the plan management database. + + Establish a connection to the sqlite3 database contained in the named file. + A plan management database is created and populated at db_path if the file + does not exist. + + Args + db_path: A relative or absolute path or ":memory:" + + Returns + A connection handle is returned upon success, else None + """ + + if db_path == ':memory:' or not os.path.exists(db_path): + prev_allocated = False + else: + prev_allocated = True + + try: + conn = sqlite3.connect(db_path) + except db_Error as error: + print('db_connect', error) + raise + + # create plan management tables on initial database allocation + if conn and not prev_allocated: + complete = execute_sql_stmt(conn, _planstat_ddl) + complete &= execute_sql_stmt(conn, _runhist_ddl) + + if complete: + conn.commit() + else: + conn.close() + conn = None + return conn + + +def plan_remove(db_path, plan_path): + """Delete the named plan from the plan managment database. + + The relative plan name is extracted from the plan_path by removing the + leading directories and the trailing filetype suffix from the given + plan_path. 
The planstat row is retrieved and the associated rowid is + the plan_id identifying the target runhist table rows. + + Returns + Zero indicates deletion complete, -1 if the plan name is not matched. + """ + + status = 0 + conn = db_connect(db_path) + plan_key = _get_planstat_key(plan_path) + stmt = _select_row_from_planstat.format(plan_key) + csr = conn.cursor() + execute_sql_stmt(conn, stmt, cursor=csr) + nrow = csr.rowcount + row = csr.fetchone() + + print("%d run history rows deleted" % nrow) + + if not row: + print("Error: CLEANUP request failed - %s has not been run" % plan_key) + status = -1 + else: + plan_rec = PlanstatRow._make(row) # column-name addressable + rowid = plan_rec.rowid # the unique rowid is the plan uniquifier + _delete_runhistory(conn, rowid) + stmt = _delete_planstat_plan.format(rowid) + status = execute_sql_stmt(conn, stmt) + + csr.close() + conn.close() + return status + + +def plan_prep(db_path, plan_path, run_type=RunType.RUN_ALL): + """Prepare to run a plan, a hierarchy of interdependent subplans. + + Plan names and related information are stored in the planstat (PLAN STATUS) + table. There is one row for each plan submitted. A positive, unique integer + called the 'rowid' is assigned to table rows by the database manager. The + rowid of a planstat table row is defined here as the "plan_id". The plan_id + together with a textual "subplan_id" (example: '1.2.4') form a composite + key that is the primary key of the runhist (RUN HISTORY) table. The purpose + of this function is to register the plan and return the associated plan_id. + + RunTypes + When a new plan is presented it is registered in the planstat table and + during its execution a large number of runhist (RUN HISTORY) table + entries are created and then updated. To prevent unintended loss of + data one of the following "RunTypes" is specified on the initial + plan_prep() call and again on subsequent start_subplan() calls. + + Specify RUN_ALL on the first attempt to run a plan. If the plan name + is already registered, the request fails and neither the planstat or + runstat tables are changed. + + Specify RESTART if a prior attempt to run a plan did not complete. The + presence of a corresponding planstat record is verified. start_subplan() + returns a SKIP status if the associated runhist row (if any) is marked + COMPLETE. + + Args + db_path: plan management database path (relative or absolute) + plan_path: JSON plan file (relative or absolute) + run_type: RunType.RUN_ALL, the default, or RunType.RESTART + + Returns + A negative value indicates a fatal error. + + Otherwise the integer returned is the plan_id used together with a + subplan_id string used in subsequent start_subplan(), stop_subplan() + and get_subplan_hist() calls. 
+ """ + + # load the plan and retrieve identity info + plan_dict = load_plan(plan_path) + create_date = get_plan_create_date(plan_dict) + feature_sets = get_plan_fs_names(plan_dict) + partitions = get_plan_fs_parts(plan_dict) + nbr_subplans = get_plan_nbr_subplans(plan_dict) + + # de termine if a plan of the given name has already been registered + conn = db_connect(db_path) + plan_key = _get_planstat_key(plan_path) + stmt = _select_row_from_planstat.format(plan_key) + csr = conn.cursor() + execute_sql_stmt(conn, stmt, cursor=csr) + row = csr.fetchone() + + if not row: + rowid = -1 + else: + plan_rec = PlanstatRow._make(row) # column-name addressable + rowid = plan_rec.rowid # the unique rowid will be the uniquifier returned + + # compare run_type to initial expectations + error = False + + if run_type == RunType.RUN_ALL and rowid > 0: + print("Error: RUN_ALL specified but plan: %s has already been defined" % plan_key) + error = True + + elif run_type == RunType.RESTART and rowid < 0: + print("Warning: RESTART specified but plan: %s has not been previously run" % plan_key) + + elif rowid > 0 and create_date != create_date: # DEBUG ???????????????????????????????????? plan_rec.create_date: + print("Error: RESTART specified but the signature of the previously defined plan: %s does not match" % plan_key) + error = True + + # register new plans acquiring the uniquifying plan_id used to compose runhistory table keys + if not error and rowid < 0: + feature_sets = str(feature_sets) + feature_sets = feature_sets.replace("'", "") # create string literal from list of str + partitions = str(partitions) # create string literal from list of int + + stmt = _insert_planstat_plan.format( + plan_key, + create_date, + feature_sets, + partitions, + nbr_subplans + ) + + status = execute_sql_stmt(conn, stmt, cursor=csr) + rowid = csr.lastrowid + + # cleanup resources and return uniquifier or error indicator + csr.close() + conn.commit() + + if error: + return -1 + else: + return rowid + + +def start_subplan(db_path, plan_path, plan_id=None, subplan_id=None, run_type=None): + """Schedule the execution of a subplan. + + This function writes a RunhistRow record to the runhist table indicating that + the named plan/subplan has been SCHEDULED. The row includes the "start time". + If the given run_type is RESTART, it is possible that the subplan has already + run, as indicated by the status returned. + + Args + db_path: plan management database path (relative or absolute) + plan_path: JSON plan file (relative or absolute) + plan_id: the plan identifier returned by plan_prep() + subplan_id the subplan identifier ex. '1 4.8' + run_type: RunType.RUN_ALL or RunType.RESTART + + Returns + Zero indicates that a RunhistRow record has been created to represent + the subplan. -1 is returned from a RESTART call if the a RunhistRow + already exists for the plan/subplan and is marked COMPLETE. 
+ """ + + conn = db_connect(db_path) + csr = conn.cursor() + skip = False + + # skip previously completed work if RESTART + if run_type == RunType.RESTART: + stmt = _select_row_from_runhist.format(plan_id, subplan_id) + execute_sql_stmt(conn, stmt, cursor=csr) + row = csr.fetchone() + + if row: + runhist_rec = RunhistRow._make(row) + if runhist_rec.status == RunStat.COMPLETE.name: + skip = True + + # construct/reinit a new runhist record + if not skip: + currtime = datetime.now() + start_time = currtime.isoformat(timespec=ISO_TIMESTAMP) + + stmt = _insupd_scheduled_runhist.format( + plan_id, + subplan_id, + RunStat.SCHEDULED.name, + start_time + ) + + execute_sql_stmt(conn, stmt, cursor=csr) + + csr.close() + conn.commit() + conn.close() + + if skip: + return -1 + else: + return 0 + + +def stop_subplan(db_path, plan_id=None, subplan_id=None, comp_info_dict={}): + """Complete the execution of a subplan. + + This function updates the RunhistRow record created by start_subplan() + updating the status to COMPLETE, the completion timestamp, and "user + fields" (such as MAE, MSE, R2) returned by the model. + + A comp_dict dictionary is populated with the names and default values + for columns implemented in the RunhistRow table. Values matching those + names are extracted from the comp_info_dict are written to the table. + + Args + db_path: plan management database path (relative or absolute) + plan_path: JSON plan file (relative or absolute) + plan_id: the plan identifier returned by plan_prep() + comp_info_dict: supplemental completion data dictionar + """ + + conn = db_connect(db_path) + csr = conn.cursor() + curr_time = datetime.now() + stop_time = curr_time.isoformat(timespec=ISO_TIMESTAMP) + + comp_dict = dict(mae=0.0, mse=0.0, r_square=0.0, weights_fn='N/A', unprocessed='') + remainder = _acquire_actuals(comp_dict, comp_info_dict) + + if len(remainder) == 0: + other_info = '' + else: + other_info = json.dumps(remainder) + + # fetch row to retrieve schedule info + stmt = _select_row_from_runhist.format(plan_id, subplan_id) + execute_sql_stmt(conn, stmt, csr) + row = csr.fetchone() + + if row: # expected, caller error if already marked COMPLETED + runhist_rec = RunhistRow._make(row) + if runhist_rec.status != RunStat.COMPLETE.name: + start_time = datetime.strptime(runhist_rec.start_time, ISO_TIMESTAMP_ENCODE) + duration = curr_time - start_time + run_mins = int((duration.total_seconds() + 59) / 60) + + # update runhist record + stmt = _insupd_completed_runhist.format( + # column values + RunStat.COMPLETE.name, + stop_time, + run_mins, + comp_dict['mae'], + comp_dict['mse'], + comp_dict['r_square'], + other_info, + comp_dict['weights_fn'], + # key spec + plan_id, + subplan_id + ) + + execute_sql_stmt(conn, stmt) + + # cleanup + csr.close() + conn.commit() + conn.close() + + +def get_subplan_runhist(db_path, plan_id=None, subplan_id=None): + """Return the RunhistRow record for a given plan/subplan. + + Args + db_path: plan management database path (relative or absolute) + plan_id: the plan identifier returned by plan_prep() + subplan_id the subplan identifier ex. '1 4.8' + + Returns + The RunhistRow associated with the given plan/subplan is returned if + found. 
+ """ + conn = db_connect(db_path) + stmt = _select_row_from_runhist.format(plan_id, subplan_id) + csr = conn.cursor() + execute_sql_stmt(conn, stmt, csr) + row = csr.fetchone() + + if not row: + plan_rec = None + else: + plan_rec = RunhistRow._make(row) + + return plan_rec + +def _acquire_actuals(dft_dict, actuals_dict): + """Extract values from dictionary overlaying defaults.""" + actuals = actuals_dict.copy() + for key, value in dft_dict.items(): + if key in actuals: + dft_dict[key] = actuals[key] + actuals.pop(key) + + return actuals # possibly empty + + +def _get_planstat_key(plan_path): + """Extract the name portion of a plan from a filepath.""" + basename = os.path.basename(plan_path) + basepfx = basename.split(sep='.') + return basepfx[0] + + +def _delete_runhistory(conn, plan_id): + """Delete RunhistRows containing the given plan_id.""" + csr = conn.cursor() + stmt = _delete_from_runhistory.format(plan_id) + execute_sql_stmt(conn, stmt, cursor=csr, trap_exception=True) + rowcount = csr.rowcount + print("CLEANUP processing removed %d run history records" % rowcount) + csr.close() + return rowcount + + +#------------------------------------------------------------------------------ +# Plan navigation, content retrieval +#------------------------------------------------------------------------------ + +def load_plan(filepath): + """Load a JSON transfer learning plan. + + The named JSON tranfer learning plan file is loaded in a manner that preserves + the entry order imposed when the plan was created. This allows the root entry + to be easily located regardless of the plan entry naming scheme in use. + + Args + filepath: A relative or absolute path to the JSON file. + + Returns + An entry-ordered plan in OrderedDict format is returned. + """ + + with open(filepath, 'r') as f: + ordered_plan_dict = json.load(f, object_pairs_hook=OrderedDict) + return ordered_plan_dict + +def get_plan_create_date(plan_dict): + _, value = _get_first_entry(plan_dict) + return value['create_date'] + +def get_plan_fs_names(plan_dict): + _, value = _get_first_entry(plan_dict) + return value['fs_names'] + +def get_plan_fs_parts(plan_dict): + _, value = _get_first_entry(plan_dict) + return value['fs_parts'] + +def get_plan_nbr_subplans(plan_dict): + _, value = _get_first_entry(plan_dict) + return value['nbr_subplans'] + +def _get_first_entry(ordered_dict): + key, value = next(iter(ordered_dict.items())) + return key, value + +def get_subplan(plan_dict, subplan_id=None): + """Retrieve the content of a named subplan or the root plan. + + Args + plan_dict: The plan dictionary as returned by load_plan(). + subplan_id: The name of the desired subplan. Omit this arg to acquire + the content and name of the plan tree root. + + Returns + A (content, subplan_id) pair is returned. The returned name is useful when + using default arguments to retrieve the root plan. + """ + + if subplan_id is None: + subplan_id, content = _get_first_entry(plan_dict) + else: + content = plan_dict.get(subplan_id) + return content, subplan_id + + +def get_predecessor(plan_dict, subplan_id): + """Acquire the name of the predecessor (parent) of a given subplan. + + The plan tree is a true tree. All subplans have exactly one + predecessor/parent. Use this function to walk 'up' the tree. + + Args + plan_dict: The plan dictionary as returned by load_plan(). + subplan_id: The name of the target subplan. + + Returns + The name of the parent subplan is returned. If the root plan name + is specified None is returned. 
+ """ + + segments = subplan_id.split(sep='.') + if len(segments) <= 1: + subplan_id = None + else: + segments.pop() + subplan_id = '.'.join(segments) + return subplan_id + + +def get_successors(plan_dict, subplan_id): + """Acquire the names of the successors (children) of a given subplan. + + All subplans other than 'leaf' subplans have at least one successor. Use + this function to walk 'down' one or more plan subtrees. + + Args + plan_dict: The plan dictionary as returned by load_plan(). + subplan_id: The name of the target subplan. + + Returns + A list of the names of all successors (children) of the given subplan + is returned. The list may be empty. + """ + successor_names = [] + for i in it.count(start=1): + new_name = subplan_id + '.' + str(i) + value = plan_dict.get(new_name) + if not value: + break + successor_names.append(new_name) + + return successor_names + + +def _get_named_set(plan_dict, subplan_id, section_tag, fs_name, collector, parent_features=None): + """ """ + + while True: + content, _ = get_subplan(plan_dict, subplan_id) + assert(content) + + section = content[section_tag] + for i, section_features in enumerate(section): + feature_list = section_features[fs_name] + collector.insert(i, feature_list) + + if not parent_features: + break + + # visit parent node, root has no feature information and ends upward traversal + subplan_id = get_predecessor(plan_dict, subplan_id) + grand_parent_id = get_predecessor(plan_dict, subplan_id) + + if not grand_parent_id: + break + + +def get_subplan_features(plan_dict, subplan_id, parent_features=False): + """Return train and validation features associated with a named subplan. + + Args + plan_dict: The plan dictionary as returned by load_plan()x. + subplan_id: The name of the target subplan + parent_features: True or False + + Returns + The result is four-tuple (t0, t1, t2, t30) constructed as follows. + Some applications may choose to discard some of the returns, t0 and + t1, for example. + + t0 - the result dictionary which is disassmbled as follows + t1 - a list of feature names found in the train/validate sets + t2 - training feature set dictionary as described below + t3 - validation feature set dictionary as described below + + t2 and t3 are dictionaries that represent one or more training sets + and one or more validation sets, respectively. The key of each entry + is a feature-set name as returned in the t1 list, ['cell', 'drug'] for + example. The value of each is a list of lists. + + Consider a training feature set dictionary returned as follows: + + { + 'cell': [[C1, C2, C3, C4], [C5, C6, C7, C8]], + 'drug': [[ [D1, D2] , [D3, D4]] + } + + The feature sets defined here are the combination of (cell[0], drug[0]) + and (cell[1], drug[1]). The lenghts, i.e. number of sublists of each + dictionary entry are always equal. 
+ """ + + # acquire feature_set names populated in the plan + content, _ = get_subplan(plan_dict, subplan_id) + if not content: + return None, None + + # peek inside the training set to capture active feature-set names + train_set = content['train'][0] + fs_names = [name for name in train_set.keys()] + + # categorize the results + result = {} + result[0] = fs_names + result['train'] = {} + result['val'] = {} + + for set_name, pf in [('train', True), ('val', False)]: + if pf == True: + pf = parent_features + + for fs_name in fs_names: + collector = [] + _get_named_set( + plan_dict, + subplan_id, + set_name, + fs_name, + collector, + parent_features=pf + ) + + result[set_name][fs_name] = collector + + return result, result[0], result['train'], result['val'] + +#------------------------------------------------------------------------------ +# Plan construction +#------------------------------------------------------------------------------ + +def build_dictionary_from_lists(seq_list, names): + """Create a dictionary with 'names' as labels and 'seq_list' values.""" + dict = {} + for seq, tag in zip(seq_list, names): + dict[tag] = list(seq) + return dict + + +def build_plan_tree(args, feature_set_content, parent_plan_id='', depth=0, data_pfx='', plan_pfx=''): + """Generate a plan supporting training, transfer-learning, resume-training. + + ADD GENERAL DOC + + This function is recursive. + + Arguments: + args: A namespace capturing the values of command line arguments + and parameter values derived from those arguments. Refer to + validate_args(). + + feature_set_content: This is a list of sublists, where each sublist + contains the names of the nth group of feature-set elements. + + parent_plan_id: This is the name of the parent's plan. The name + is extended with '.nn' at each level of the recursion to + ensure that parentage/liniage is fully conveyed in each + (subplan) plan_id. + + depth: Specify 0 on the root call. This arg can be used to + determine/set the current level of the recursion. + + data_pfx: Reserved for constructing feature-set name files. + plan_pfx: Reserved for constructing plan control files. + + Returns + args.plan_dict contains a dictionary representing the plan. This may be + JSONized. + + The number of planning steps (nbr of subplans in the plan tree) is explicitly + returned. 
+ """ + curr_depth = depth + 1 + if args.maxdepth > 0 and curr_depth >= args.maxdepth: + return 0 + + all_parts = [] + + #flat_partitions = [] # preserve, used for file-based approach + #files = [] # preserve, used for file-based approach + #sequence = 0 # preserve, used for file-based approach + xxx = False + + for i in range(len(args.fs_names)): + group = feature_set_content[i] + count = args.fs_parts[i] + feature_set_name = args.fs_names[i] + partitions = args.generator.partition(feature_set_content[i], count=count) + all_parts.append(partitions) + + # acquire a cross-product of all feature-set partitions + parts_xprod = np.array(list(it.product(*all_parts))) + steps = len(parts_xprod) + + if steps > 1: + substeps = 0 + for step in range(steps): + train = [] + val = [] + + # split into validation and training components + for i, plan in enumerate(parts_xprod): + section = build_dictionary_from_lists(plan, args.fs_names) + if i == step: + val.append(section) + else: + train.append(section) + + # generate next depth/level (successor) plans + curr_plan_id = '{}.{}'.format(parent_plan_id, step + 1) + args.plan_dict[curr_plan_id] = {'val': val, 'train': train} + data_name = '{}.{}'.format(data_pfx, step + 1) + plan_name = '{}.{}'.format(plan_pfx, step + 1) + + # depth-first, shorthand representation of tree showing first feature names + if args.debug: + indent = ' ' * (depth * 4) + print(indent, curr_plan_id) + indent += ' ' * 4 + fs = parts_xprod[step] + for i in range(len(fs)): + print(indent, args.fs_names[i], 'count:', len(fs[i]), 'first:', fs[i][0]) + + substeps += build_plan_tree( + args, + parts_xprod[step], + parent_plan_id=curr_plan_id, + depth=curr_depth, + data_pfx=data_name, + plan_pfx=plan_name + ) + + steps += substeps + return steps + + """ + # THIS IS A WORK-IN-PROGRESS ... 
GENERATING FILES FOR DATA AND PLAN + + files.append([]) + files_ndx = len(files) - 1 + + for j in range(len(partitions)): + part = partitions[j] + flat_partitions.append(part) + if len(part) == 0: + sys.exit("big trouble ?????????????") + + sequence += 1 + file_name = '{}.{}.{}'.format(data_pfx, sequence, feature_set_name) + print("writing file %s with %d entries" % (file_name, len(part))) # write out 'part' + #write_file(file_name, part) + pair = (feature_set_name, file_name) + files[files_ndx].append(pair) + + file_xprod = np.array(list(it.product(*files))) + nbr_plans = len(file_xprod) + + for seq in range(nbr_plans): + plan_string = '' + + for ndx, curr in enumerate(file_xprod): + if ndx == seq: + plan_string += '--val (' + else: + plan_string += '--inc (' + for (tag, fname) in curr: + plan_string += '{}-{} '.format(tag, fname) + plan_string += ')' + + file_name = '{}.{}'.format(plan_pfx, seq + 1) + print(file_name) + plan_lines = list(plan_string) + #write_file(file_name, plan_lines) + + # construct list of omitted feature entries + + for seq in range(nbr_plans): + omitted_feature_content = [] + org = 0 + + for i in partition_spec: + omitted_feature_content.append(flat_partitions[org]) + org = i + + data_name = '{}.{}'.format(data_pfx, seq + 1) + plan_name = '{}.{}'.format(plan_pfx, seq + 1) + + steps = build_plan_tree( + args, + omitted_feature_content, + parent_plan_id=curr_plan_id, + depth=curr_depth, + data_pfx=data_name, + plan_pfx=plan_name + ) + return + """ + +def write_file(fname, title, string_list): + """Write text expressed as an array of lines to file.""" + with open(fname, 'w') as f: + for line in string_list: + f.write(line) + +def write_dict_to_json(dictionary, fname): + """Write dictionary to a json file.""" + with open(fname, 'w') as f: + json.dump(dictionary, f) + +#---------------------------------------------------------------------------------- +# various hard-coded lists, test cases - the synthetic feature-sets remain useful +#---------------------------------------------------------------------------------- + +""" +synthetic_cell_names = ['cell_' + '%04d' % (x) for x in range(1000)] +synthetic_drug_names = ['drug_' + '%04d' % (x) for x in range(1000)] +""" + +#---------------------------------------------------------------------------------- +# mainline +#---------------------------------------------------------------------------------- + +def main(): + # Acquire and validate arguments + args = planargs.parse_arguments() + args.json = True # the only available option thus far + + generator, feature_set_content = validate_args(args) + args.generator = generator + + root_name, args.plan_dict = generator.plan_init( + fs_names = args.fs_names, # validated cmdline arg + fs_paths = args.fs_paths, # validated cmdline arg + fs_lines = args.fs_lines, # created by validate_args + fs_parts = args.fs_parts, # validated cmdline arg + maxdepth = args.maxdepth + ) + + # feature_set_content = [cell_names, drug_names] + # feature_set_content = [synthetic_cell_names, synthetic_drug_names] + + # remove by-1 dimensions, they do not need to be represented in the plan explicitly + while True: + try: + ndx = args.fs_parts.index(1) + args.fs_names.pop(ndx) + args.fs_paths.pop(ndx) + args.fs_lines.pop(ndx) + args.fs_parts.pop(ndx) + except ValueError: + break + + # Plan generation + data_fname_pfx = os.path.join(args.out_dir, 'DATA.1') + plan_fname_pfx = os.path.join(args.out_dir, 'PLAN.1') + + steps = build_plan_tree( + args, # command line argument namespace + feature_set_content, # 
for example [[cell1 ... celln] [drug1 ... drugn]] + parent_plan_id=root_name, # name of root plan, subplan names created from this stem + data_pfx=data_fname_pfx, # DATA file prefix, building block for feature name files + plan_pfx=plan_fname_pfx # PLAN file prefix, building block for plan name files + ) + + generator.plan_term(args.plan_dict, root_name, steps) + print("Plan generation complete, total steps: %d" % steps) + + if args.json: + label = args.generator.get_plan_label(args.plan_dict, root_name) + qualified_name = 'plangen_' + label + '.json' + json_file_name = os.path.join(args.out_dir, qualified_name) + json_abspath = os.path.abspath(json_file_name) + write_dict_to_json(args.plan_dict, json_abspath) + print("%s JSON file written" % json_abspath) + + if args.print_tree: + print("Plan dictionary generated") + pp(args.plan_dict, width=160) # DEBUG comment this out for large plans + + if args.test: + test1(json_abspath, "test1_sql.db") + # test2(json_abspath, "test2_sql.db") + +#---------------------------------------------------------------------------------- +# test plan navigation and subplan entry retrieval +#---------------------------------------------------------------------------------- + +def test2(plan_path, db_path): + run_type = RunType.RESTART + #run_type = RunType.RUN_ALL + + plan_name = os.path.basename(plan_path) + plan_id = plan_prep(db_path, plan_name, run_type) + + plan_dict = load_plan(plan_path) + metadata, root_name = get_subplan(plan_dict) + + queue = deque() + queue.append(root_name) + + print("Test2 start") + for iloop in it.count(start = 0): + if len(queue) == 0: + print("Test2 complete - proc loop count: %d" % iloop) + break + + curr_subplan = queue.popleft() + successor_names = get_successors(plan_dict, curr_subplan) + for successor in successor_names: + queue.append(successor) + + if len(curr_subplan) == 1: + continue + + status = start_subplan( + db_path, + plan_path, + plan_id=plan_id, + subplan_id=curr_subplan, + run_type=run_type + ) + + if status < 0: + continue + + completion_status = dict(mse=1.1, mae=2.2, r_square=.555) + + stop_subplan( + db_path, + plan_id=plan_id, + subplan_id=curr_subplan, + comp_info_dict=completion_status + ) + print("Completing subplan %6d" % iloop) + +#---------------------------------------------------------------------------------- +# +def test1(plan_path, db_path): + run_type = RunType.RESTART + #run_type = RunType.RUN_ALL + + plan_name = os.path.basename(plan_path) + plan_id = plan_prep(db_path, plan_name, run_type) + + if (plan_id < 0): + sys.exit("Terminating due to database detected error") + + print("\nBegin plan navigation and subplan retrieval test") + plan_dict = load_plan(plan_path) + + # plan root name value returned when subplan_id= is omitted + metadata, root_name = get_subplan(plan_dict) + + # the root has no parent / predecessor + parent_name = get_predecessor(plan_dict, root_name) + print("Demonstrate that root \'%s\' predecessor is not defined: %s" % (root_name, parent_name)) + + # the root contains metadata, it is not a run specification + successor_names = get_successors(plan_dict, root_name) + print("\nThe first runable configurations are defined in %s\n" % successor_names) + + # the root is the predecessor of these first level runables + for sname in successor_names: + parent_name = get_predecessor(plan_dict, sname) + print("The parent of %s is %s" % (sname, parent_name)) + + # run the right subtree + print("\nRun the rightmost subtree \n") + for i in it.count(start = 1): + listlen = 
len(successor_names) + if listlen == 0: + break + + for name in successor_names: + status = start_subplan( + db_path, + plan_path, + plan_id=plan_id, + subplan_id=name, + run_type=run_type + ) + + if status < 0: + print("subplan: %s skipped, previously processed" % name) + + select_one = successor_names[listlen - 1] + parent_name = get_predecessor(plan_dict, select_one) + print("%-16s is a successor of %-16s - all successors: %s" % (select_one, parent_name, successor_names)) + +# ??????????????????????????????????????????????????????????? + value,_ = get_subplan(plan_dict, select_one) + + if i < 3: + for pf in [False, True]: + _, fs_name_list, train_list, val_list = get_subplan_features(plan_dict, select_one, parent_features=pf) + print("\nsubplan original:", select_one, "parent features:", pf) + pp(plan_dict[select_one]) + print("\nflattened TRAIN") + pp(train_list) + print("\nflattened VAL") + pp(val_list) + +# ??????????????????????????????????????????????????????????? + + # test retrieval api + row = get_subplan_runhist(db_path, plan_id=plan_id, subplan_id=select_one) + #print(row) + + # post subplan termination + completion_status = dict(mse=1.1, mae=2.2, r_square=.555, misc='no such column', data=123) + + stop_subplan( + db_path, + plan_id=plan_id, + subplan_id=select_one, + comp_info_dict=completion_status + ) + + successor_names = get_successors(plan_dict, select_one) + + print("\nEnd of branch reached") +# plan_remove(db_path, "plangen_cell8-p2_drug8-p2.json") + +#---------------------------------------------------------------------------------- + +if __name__ == "__main__": + main() diff --git a/Pilot1/Uno/topN_to_uno.py b/Pilot1/Uno/topN_to_uno.py index dd81d9f3..d9a2d6d0 100644 --- a/Pilot1/Uno/topN_to_uno.py +++ b/Pilot1/Uno/topN_to_uno.py @@ -13,6 +13,10 @@ def parse_arguments(): help='Plan data file') parser.add_argument('--node', type=str, default=None, help='node number to execute') + parser.add_argument('--incremental', action='store_true', + help='True for building dataset incrementally') + parser.add_argument('--fold', type=str, default=None, + help='pre-calculated indexes for cross fold validation') args, unparsed = parser.parse_known_args() return args, unparsed @@ -22,29 +26,64 @@ def read_plan(filename, node): print("reading {} file for node {}".format(filename, node)) with open(filename, 'r') as plan_file: plan = json.load(plan_file) + if node is None: + return plan + if node in plan: return plan[node] else: raise Exception('Node index "{}" was not found in plan file'.format(node)) +# def build_masks(args, df): +# if args.node is None: +# print('node is None. Generate Random split') +# mask = training_mask(df) +# return mask, ~mask +# +# plan = read_plan(args.plan, args.node) +# mask = {} +# for partition in ['train', 'val']: +# _mask = df['Sample'] is None +# for i, element in enumerate(plan[partition]): +# cl_filter = element['cell'] +# dr_filter = element['drug'] +# __mask = df['Sample'].isin(cl_filter) & df['Drug1'].isin(dr_filter) +# _mask = _mask | __mask +# mask[partition] = _mask +# +# return mask['train'], mask['val'] + + def build_masks(args, df): if args.node is None: print('node is None. 
Generate Random split') mask = training_mask(df) return mask, ~mask - plan = read_plan(args.plan, args.node) + print('from new build_mask: {} {} {}'.format(args.plan, args.node, args.incremental)) + import plangen + plan = read_plan(args.plan, None) + ids = {} mask = {} + _, _, ids['train'], ids['val'] = plangen.get_subplan_features(plan, args.node, args.incremental) + for partition in ['train', 'val']: _mask = df['Sample'] is None - for i, element in enumerate(plan[partition]): - cl_filter = element['cell'] - dr_filter = element['drug'] - __mask = df['Sample'].isin(cl_filter) & df['Drug1'].isin(dr_filter) + for i in range(len(ids[partition]['cell'])): + if 'cell' in ids[partition] and 'drug' in ids[partition]: + cl_filter = ids[partition]['cell'][i] + dr_filter = ids[partition]['drug'][i] + __mask = df['Sample'].isin(cl_filter) & df['Drug1'].isin(dr_filter) + elif 'cell' in ids[partition]: + cl_filter = ids[partition]['cell'][i] + __mask = df['Sample'].isin(cl_filter) + elif 'drug' in ids[partition]: + dr_filter = ids[partition]['drug'][i] + __mask = df['Drug1'].isin(dr_filter) + _mask = _mask | __mask mask[partition] = _mask - return mask['train'], mask['val'] @@ -90,19 +129,35 @@ def build_dataframe(args): else: df_y, df_cl, df_dd = read_dataframe_from_csv(args) - # mask = training_mask(df_y) - train_mask, val_mask = build_masks(args, df_y) + if args.fold is not None: + tr_id = pd.read_csv('{}_tr_id.csv'.format(args.fold)) + vl_id = pd.read_csv('{}_vl_id.csv'.format(args.fold)) + tr_idx = tr_id.iloc[:, 0].dropna().values.astype(int).tolist() + vl_idx = vl_id.iloc[:, 0].dropna().values.astype(int).tolist() + + y_train = df_y.iloc[tr_idx, :] + y_val = df_y.iloc[vl_idx, :] + + x_train_0 = df_cl.iloc[tr_idx, :] + x_train_1 = df_dd.iloc[tr_idx, :] + x_train_1.columns = [''] * len(x_train_1.columns) + + x_val_0 = df_cl.iloc[vl_idx, :] + x_val_1 = df_dd.iloc[vl_idx, :] + x_val_1.columns = [''] * len(x_val_1.columns) + else: + train_mask, val_mask = build_masks(args, df_y) - y_train = pd.DataFrame(data=df_y[train_mask].reset_index(drop=True)) - y_val = pd.DataFrame(data=df_y[val_mask].reset_index(drop=True)) + y_train = pd.DataFrame(data=df_y[train_mask].reset_index(drop=True)) + y_val = pd.DataFrame(data=df_y[val_mask].reset_index(drop=True)) - x_train_0 = df_cl[train_mask].reset_index(drop=True) - x_train_1 = df_dd[train_mask].reset_index(drop=True) - x_train_1.columns = [''] * len(x_train_1.columns) + x_train_0 = df_cl[train_mask].reset_index(drop=True) + x_train_1 = df_dd[train_mask].reset_index(drop=True) + x_train_1.columns = [''] * len(x_train_1.columns) - x_val_0 = df_cl[val_mask].reset_index(drop=True) - x_val_1 = df_dd[val_mask].reset_index(drop=True) - x_val_1.columns = [''] * len(x_val_1.columns) + x_val_0 = df_cl[val_mask].reset_index(drop=True) + x_val_1 = df_dd[val_mask].reset_index(drop=True) + x_val_1.columns = [''] * len(x_val_1.columns) # store store = pd.HDFStore('topN.uno.h5', 'w', complevel=9, complib='blosc:snappy') From 307cde847ff778bb084ae3e699822da0c178f06f Mon Sep 17 00:00:00 2001 From: Hyunseung Yoo Date: Sun, 25 Aug 2019 06:12:02 -0500 Subject: [PATCH 025/331] add support for feather format --- Pilot1/Uno/topN_to_uno.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/Pilot1/Uno/topN_to_uno.py b/Pilot1/Uno/topN_to_uno.py index d9a2d6d0..e0957b55 100644 --- a/Pilot1/Uno/topN_to_uno.py +++ b/Pilot1/Uno/topN_to_uno.py @@ -93,7 +93,22 @@ def training_mask(df): def read_dataframe_from_csv(args): df = pd.read_csv(args.dataframe_from, 
low_memory=False, na_values='na').fillna(0) - df.rename(columns={'SAMPLE': 'Sample', 'DRUG': 'Drug1'}, inplace=True) + df.rename(columns={'CELL': 'Sample', 'DRUG': 'Drug1'}, inplace=True) + df_y = df[['AUC', 'Sample', 'Drug1']] + + cols = df.columns.to_list() + cl_columns = list(filter(lambda x: x.startswith('GE_'), cols)) + dd_columns = list(filter(lambda x: x.startswith('DD_'), cols)) + + df_cl = df.loc[:, cl_columns] + df_dd = df.loc[:, dd_columns] + + return df_y, df_cl, df_dd + + +def read_dataframe_from_feather(args): + df = pd.read_feather(args.dataframe_from).fillna(0) + df.rename(columns={'CELL': 'Sample', 'DRUG': 'Drug1'}, inplace=True) df_y = df[['AUC', 'Sample', 'Drug1']] cols = df.columns.to_list() @@ -126,6 +141,8 @@ def build_dataframe(args): _, ext = os.path.splitext(args.dataframe_from) if ext == '.h5' or ext == '.hdf5': df_y, df_cl, df_dd = read_dataframe_from_hdf(args) + elif ext == '.feather': + df_y, df_cl, df_dd = read_dataframe_from_feather(args) else: df_y, df_cl, df_dd = read_dataframe_from_csv(args) From bbf50d6a0d1bda8b18a68a05b376393e6a76be0c Mon Sep 17 00:00:00 2001 From: brettin Date: Tue, 27 Aug 2019 07:41:10 -0400 Subject: [PATCH 026/331] using information from milestone12 HPO --- Pilot1/Uno/uno_auc_model.txt | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/Pilot1/Uno/uno_auc_model.txt b/Pilot1/Uno/uno_auc_model.txt index 4a803b43..7789f732 100644 --- a/Pilot1/Uno/uno_auc_model.txt +++ b/Pilot1/Uno/uno_auc_model.txt @@ -4,13 +4,13 @@ test_sources=['train'] cell_types=None cell_features=['rnaseq'] drug_features=['descriptors'] -dense=[1000, 1000, 1000] +dense=[1000, 1000, 1000, 1000, 1000] dense_feature_layers=[1000, 1000, 1000] activation='relu' loss='mse' -optimizer='sgd' +optimizer='adamax' scaling='std' -drop=0 +drop=.1 epochs=50 batch_size=32 validation_split=0.2 @@ -18,22 +18,28 @@ cv=1 max_val_loss=1.0 learning_rate=0.0001 base_lr=None +agg_dose='AUC' residual=False -reduce_lr=False -warmup_lr=False +reduce_lr=True +warmup_lr=True batch_normalization=False feature_subsample=0 rng_seed=2018 -save_path='save/uno' no_gen=False verbose=False -no_response_source=True -no_feature_source=True -use_landmark_genes=True -agg_dose='AUC' + + preprocess_rnaseq='source_scale' +gpus=1 +use_landmark_genes=True +no_feature_source=True +no_response_source=True +cp=True +save_path='/ccs/home/brettin/project_work/brettin/milestone13/save/uno' + single=True +timeout=-1 [Monitor_Params] solr_root='' -timeout=-1 + From 388559a66770472d600cee2ce70e45f99bf2951c Mon Sep 17 00:00:00 2001 From: Hyunseung Yoo Date: Wed, 28 Aug 2019 14:13:59 -0500 Subject: [PATCH 027/331] add dataframe index. 
--- Pilot1/Uno/topN_to_uno.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/Pilot1/Uno/topN_to_uno.py b/Pilot1/Uno/topN_to_uno.py index e0957b55..a4bafcc4 100644 --- a/Pilot1/Uno/topN_to_uno.py +++ b/Pilot1/Uno/topN_to_uno.py @@ -69,7 +69,7 @@ def build_masks(args, df): _, _, ids['train'], ids['val'] = plangen.get_subplan_features(plan, args.node, args.incremental) for partition in ['train', 'val']: - _mask = df['Sample'] is None + _mask = df['Sample'] == None for i in range(len(ids[partition]['cell'])): if 'cell' in ids[partition] and 'drug' in ids[partition]: cl_filter = ids[partition]['cell'][i] @@ -81,7 +81,6 @@ def build_masks(args, df): elif 'drug' in ids[partition]: dr_filter = ids[partition]['drug'][i] __mask = df['Drug1'].isin(dr_filter) - _mask = _mask | __mask mask[partition] = _mask return mask['train'], mask['val'] @@ -152,15 +151,15 @@ def build_dataframe(args): tr_idx = tr_id.iloc[:, 0].dropna().values.astype(int).tolist() vl_idx = vl_id.iloc[:, 0].dropna().values.astype(int).tolist() - y_train = df_y.iloc[tr_idx, :] - y_val = df_y.iloc[vl_idx, :] + y_train = df_y.iloc[tr_idx, :].reset_index(drop=True) + y_val = df_y.iloc[vl_idx, :].reset_index(drop=True) - x_train_0 = df_cl.iloc[tr_idx, :] - x_train_1 = df_dd.iloc[tr_idx, :] + x_train_0 = df_cl.iloc[tr_idx, :].reset_index(drop=True) + x_train_1 = df_dd.iloc[tr_idx, :].reset_index(drop=True) x_train_1.columns = [''] * len(x_train_1.columns) - x_val_0 = df_cl.iloc[vl_idx, :] - x_val_1 = df_dd.iloc[vl_idx, :] + x_val_0 = df_cl.iloc[vl_idx, :].reset_index(drop=True) + x_val_1 = df_dd.iloc[vl_idx, :].reset_index(drop=True) x_val_1.columns = [''] * len(x_val_1.columns) else: train_mask, val_mask = build_masks(args, df_y) From c15edc7f1ab950343d5370c4a5e10e34a7e507a4 Mon Sep 17 00:00:00 2001 From: Harry Yoo Date: Wed, 28 Aug 2019 16:34:42 -0400 Subject: [PATCH 028/331] remove hard-coded path --- Pilot1/Uno/uno_auc_model.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Pilot1/Uno/uno_auc_model.txt b/Pilot1/Uno/uno_auc_model.txt index 7789f732..2d9158a0 100644 --- a/Pilot1/Uno/uno_auc_model.txt +++ b/Pilot1/Uno/uno_auc_model.txt @@ -35,7 +35,7 @@ use_landmark_genes=True no_feature_source=True no_response_source=True cp=True -save_path='/ccs/home/brettin/project_work/brettin/milestone13/save/uno' +save_path='save/uno' single=True timeout=-1 From 04979c97835888e18c65582c8519a69e9549afe9 Mon Sep 17 00:00:00 2001 From: Hyunseung Yoo Date: Wed, 4 Sep 2019 21:29:21 -0500 Subject: [PATCH 029/331] close filepointer --- Pilot1/Uno/topN_to_uno.py | 1 + 1 file changed, 1 insertion(+) diff --git a/Pilot1/Uno/topN_to_uno.py b/Pilot1/Uno/topN_to_uno.py index a4bafcc4..dc85ec0a 100644 --- a/Pilot1/Uno/topN_to_uno.py +++ b/Pilot1/Uno/topN_to_uno.py @@ -183,6 +183,7 @@ def build_dataframe(args): store.put('x_train_1', x_train_1, format='table') store.put('x_val_0', x_val_0, format='table') store.put('x_val_1', x_val_1, format='table') + store.close() if __name__ == '__main__': From 1727d72a72eeabb62fad0a22cb44d0f53c25a9b4 Mon Sep 17 00:00:00 2001 From: Hyunseung Yoo Date: Fri, 6 Sep 2019 13:19:26 -0500 Subject: [PATCH 030/331] output file name can be changed by --ouput --- Pilot1/Uno/topN_to_uno.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Pilot1/Uno/topN_to_uno.py b/Pilot1/Uno/topN_to_uno.py index dc85ec0a..42ef4c12 100644 --- a/Pilot1/Uno/topN_to_uno.py +++ b/Pilot1/Uno/topN_to_uno.py @@ -17,6 +17,8 @@ def parse_arguments(): help='True for building dataset 
incrementally') parser.add_argument('--fold', type=str, default=None, help='pre-calculated indexes for cross fold validation') + parser.add_argument('--output', type=str, default='topN.uno.h5', + help='output filename') args, unparsed = parser.parse_known_args() return args, unparsed @@ -176,7 +178,7 @@ def build_dataframe(args): x_val_1.columns = [''] * len(x_val_1.columns) # store - store = pd.HDFStore('topN.uno.h5', 'w', complevel=9, complib='blosc:snappy') + store = pd.HDFStore(args.output, 'w', complevel=9, complib='blosc:snappy') store.put('y_train', y_train, format='table') store.put('y_val', y_val, format='table') store.put('x_train_0', x_train_0, format='table') From 83f996090eb8191d6c338bd424e7840c192ce5e1 Mon Sep 17 00:00:00 2001 From: Hyunseung Yoo Date: Sun, 8 Sep 2019 23:08:43 -0500 Subject: [PATCH 031/331] handle edge case when validation partition is smaller than batch size --- Pilot1/Uno/uno_data.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Pilot1/Uno/uno_data.py b/Pilot1/Uno/uno_data.py index 1406a8de..b25a3748 100644 --- a/Pilot1/Uno/uno_data.py +++ b/Pilot1/Uno/uno_data.py @@ -967,7 +967,11 @@ def __init__(self, partition='train', filename=None, batch_size=32, shuffle=Fals y = self.store.select('y_{}'.format(self.partition)) self.index = y.index self.size = len(self.index) - self.steps = self.size // self.batch_size + if self.size >= self.batch_size: + self.steps = self.size // self.batch_size + else: + self.steps = 1 + self.batch_size = self.size self.index_map = np.arange(self.steps) if self.shuffle: np.random.shuffle(self.index_map) From de274395dde091620742445b0225bbc2beac55da Mon Sep 17 00:00:00 2001 From: Hyunseung Yoo Date: Thu, 19 Sep 2019 11:16:53 -0500 Subject: [PATCH 032/331] add AUC training example --- Pilot1/Uno/README.AUC.md | 137 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 137 insertions(+) create mode 100644 Pilot1/Uno/README.AUC.md diff --git a/Pilot1/Uno/README.AUC.md b/Pilot1/Uno/README.AUC.md new file mode 100644 index 00000000..ae4bce2a --- /dev/null +++ b/Pilot1/Uno/README.AUC.md @@ -0,0 +1,137 @@ +# Training with static datafile +Use static datafile prebuilt and shared at `/vol/ml/hsyoo/shared/top_21_auc_1fold.uno.h5` + +``` +python uno_baseline_keras2.py --config_file uno_auc_model.txt --cache cache/top6_auc --use_exported_data /vol/ml/hsyoo/shared/top_21_auc_1fold.uno.h5 +``` + +The log will look like below, + +``` +Using TensorFlow backend. 
+Importing candle utils for keras +Configuration file: /ssd1/homes/hsyoo/projects/CANDLE/Benchmarks/Pilot1/Uno/uno_auc_model.txt +{'activation': 'relu', + 'agg_dose': 'AUC', + 'base_lr': None, + 'batch_normalization': False, + 'batch_size': 32, + 'cell_features': ['rnaseq'], + 'cell_types': None, + 'cp': True, + 'cv': 1, + 'dense': [1000, 1000, 1000, 1000, 1000], + 'dense_feature_layers': [1000, 1000, 1000], + 'drop': 0.1, + 'drug_features': ['descriptors'], + 'epochs': 50, + 'feature_subsample': 0, + 'gpus': 1, + 'learning_rate': 0.0001, + 'loss': 'mse', + 'max_val_loss': 1.0, + 'no_feature_source': True, + 'no_gen': False, + 'no_response_source': True, + 'optimizer': 'adamax', + 'preprocess_rnaseq': 'source_scale', + 'reduce_lr': True, + 'residual': False, + 'rng_seed': 2018, + 'save_path': 'save/uno', + 'scaling': 'std', + 'single': True, + 'solr_root': '', + 'test_sources': ['train'], + 'timeout': -1, + 'train_sources': ['CCLE'], + 'use_landmark_genes': True, + 'validation_split': 0.2, + 'verbose': False, + 'warmup_lr': True} +Params: +{'activation': 'relu', + 'agg_dose': 'AUC', + 'base_lr': None, + 'batch_normalization': False, + 'batch_size': 32, + 'by_cell': None, + 'by_drug': None, + 'cache': 'cache/top6_auc', + 'cell_feature_subset_path': '', + 'cell_features': ['rnaseq'], + 'cell_subset_path': '', + 'cell_types': None, + 'config_file': 'uno_auc_model.txt', + 'cp': True, + 'cv': 1, + 'datatype': , + 'dense': [1000, 1000, 1000, 1000, 1000], + 'dense_feature_layers': [1000, 1000, 1000], + 'drop': 0.1, + 'drug_feature_subset_path': '', + 'drug_features': ['descriptors'], + 'drug_median_response_max': 1, + 'drug_median_response_min': -1, + 'drug_subset_path': '', + 'epochs': 50, + 'experiment_id': 'EXP000', + 'export_csv': None, + 'export_data': None, + 'feature_subsample': 0, + 'feature_subset_path': '', + 'gpus': [], + 'growth_bins': 0, + 'initial_weights': None, + 'learning_rate': 0.0001, + 'logfile': None, + 'loss': 'mse', + 'max_val_loss': 1.0, + 'no_feature_source': True, + 'no_gen': False, + 'no_response_source': True, + 'optimizer': 'adamax', + 'output_dir': '/ssd1/homes/hsyoo/projects/CANDLE/Benchmarks/Pilot1/Uno/Output/EXP000/RUN000', + 'partition_by': None, + 'preprocess_rnaseq': 'source_scale', + 'reduce_lr': True, + 'residual': False, + 'rng_seed': 2018, + 'run_id': 'RUN000', + 'save_path': 'save/uno', + 'save_weights': None, + 'scaling': 'std', + 'shuffle': False, + 'single': True, + 'solr_root': '', + 'tb': False, + 'tb_prefix': 'tb', + 'test_sources': ['train'], + 'timeout': -1, + 'train_bool': True, + 'train_sources': ['CCLE'], + 'use_exported_data': '/vol/ml/hsyoo/shared/top_21_auc_1fold.uno.h5', + 'use_filtered_genes': False, + 'use_landmark_genes': True, + 'validation_split': 0.2, + 'verbose': None, + 'warmup_lr': True} + + ... +Total params: 16,224,001 +Trainable params: 16,224,001 +Non-trainable params: 0 +... +Between random pairs in y_val: + mse: 0.0474 + mae: 0.1619 + r2: -1.0103 + corr: -0.0051 +Data points per epoch: train = 423952, val = 52994 +Steps per epoch: train = 13248, val = 1656 +Epoch 1/50 +13248/13248 [==============================] - 198s 15ms/step - loss: 0.0235 - mae: 0.1048 - r2: -0.1311 - val_loss: 0.0145 - val_mae: 0.0903 - val_r2: 0.3393 +Current time ....198.278 +Epoch 2/50 +... 
+``` From 3449bf52e6b942f6cde3d00c0c523613e71c07f1 Mon Sep 17 00:00:00 2001 From: yngtodd Date: Thu, 19 Sep 2019 12:32:01 -0400 Subject: [PATCH 033/331] Remove spacing --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index 0009e403..4d49afd1 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,6 @@ ECP-CANDLE Benchmarks - This repository contains the CANDLE benchmark codes. These codes implement deep learning architectures that are relevant to problems in cancer. These architectures address problems at different biological scales, specifically problems at the molecular, cellular and population scales. The naming conventions adopted reflect the different biological scales. From 9ed7affce93830c605d36820580cdedd6591c60e Mon Sep 17 00:00:00 2001 From: Hyunseung Yoo Date: Thu, 19 Sep 2019 11:48:51 -0500 Subject: [PATCH 034/331] update data file location --- Pilot1/Uno/README.AUC.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Pilot1/Uno/README.AUC.md b/Pilot1/Uno/README.AUC.md index ae4bce2a..b80fee7c 100644 --- a/Pilot1/Uno/README.AUC.md +++ b/Pilot1/Uno/README.AUC.md @@ -1,8 +1,8 @@ # Training with static datafile -Use static datafile prebuilt and shared at `/vol/ml/hsyoo/shared/top_21_auc_1fold.uno.h5` +Use static datafile prebuilt and shared at `http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/top_21_auc_1fold.uno.h5` ``` -python uno_baseline_keras2.py --config_file uno_auc_model.txt --cache cache/top6_auc --use_exported_data /vol/ml/hsyoo/shared/top_21_auc_1fold.uno.h5 +python uno_baseline_keras2.py --config_file uno_auc_model.txt --cache cache/top6_auc --use_exported_data top_21_auc_1fold.uno.h5 ``` The log will look like below, @@ -110,7 +110,7 @@ Params: 'timeout': -1, 'train_bool': True, 'train_sources': ['CCLE'], - 'use_exported_data': '/vol/ml/hsyoo/shared/top_21_auc_1fold.uno.h5', + 'use_exported_data': 'top_21_auc_1fold.uno.h5', 'use_filtered_genes': False, 'use_landmark_genes': True, 'validation_split': 0.2, From ffb8633c7ba347567ee653f5cb94c974b82b6bd5 Mon Sep 17 00:00:00 2001 From: yngtodd Date: Thu, 19 Sep 2019 12:57:08 -0400 Subject: [PATCH 035/331] Add README --- Pilot3/P3B5/README.rst | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 Pilot3/P3B5/README.rst diff --git a/Pilot3/P3B5/README.rst b/Pilot3/P3B5/README.rst new file mode 100644 index 00000000..14d182b0 --- /dev/null +++ b/Pilot3/P3B5/README.rst @@ -0,0 +1,5 @@ +======================================= +P3B5 Differentiable Architecture Search +======================================= + +Differentiable architecture search (DARTS) benchmark using clinical pathology reports. 
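Before the DARTS patches below, one note on the AUC training workflow documented in README.AUC.md above: the exported datafile is opened as a pandas HDFStore by the DataFeeder, so it can be sanity-checked before committing to a long run. The sketch below is illustrative only; it assumes the file has already been downloaded from the FTP location given above and that it uses the same table-format key layout (`y_train`, `y_val`, `x_train_0`, `x_train_1`, ...) that `topN_to_uno.py` writes.

```
import pandas as pd

# Open the exported datafile read-only and list the keys the DataFeeder will read.
with pd.HDFStore('top_21_auc_1fold.uno.h5', mode='r') as store:
    for key in store.keys():
        print(key, store.get_storer(key).nrows)

    # Peek at the first few response rows (AUC target plus Sample/Drug1 identifiers).
    print(store.select('y_train', stop=5))
```

The per-partition row counts printed here should match the "Data points per epoch" line in the training log above (e.g. train = 423952, val = 52994 for this datafile).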
From 72d976ea121ce0a623c4e96bd7014fea396597e4 Mon Sep 17 00:00:00 2001 From: yngtodd Date: Thu, 19 Sep 2019 13:16:45 -0400 Subject: [PATCH 036/331] Transplanting DARTS from external repository --- Pilot3/P3B5/darts/__init__.py | 3 + Pilot3/P3B5/darts/api/__init__.py | 1 + Pilot3/P3B5/darts/api/config.py | 88 ++++++ Pilot3/P3B5/darts/api/info.py | 72 +++++ Pilot3/P3B5/darts/api/model.py | 35 +++ Pilot3/P3B5/darts/architecture.py | 198 +++++++++++++ Pilot3/P3B5/darts/darts.py | 0 Pilot3/P3B5/darts/functional.py | 46 ++++ Pilot3/P3B5/darts/genotypes.py | 116 ++++++++ Pilot3/P3B5/darts/meters/__init__.py | 0 Pilot3/P3B5/darts/meters/accuracy.py | 30 ++ Pilot3/P3B5/darts/meters/average.py | 23 ++ Pilot3/P3B5/darts/modules/__init__.py | 0 Pilot3/P3B5/darts/modules/cell.py | 80 ++++++ Pilot3/P3B5/darts/modules/classifier.py | 26 ++ Pilot3/P3B5/darts/modules/mixed_layer.py | 41 +++ Pilot3/P3B5/darts/modules/network.py | 260 ++++++++++++++++++ .../P3B5/darts/modules/operations/__init__.py | 0 .../P3B5/darts/modules/operations/original.py | 167 +++++++++++ Pilot3/P3B5/darts/normal | 24 ++ Pilot3/P3B5/darts/reduction | 24 ++ Pilot3/P3B5/darts/utils/__init__.py | 0 Pilot3/P3B5/darts/utils/logging.py | 25 ++ Pilot3/P3B5/darts/visualize.py | 70 +++++ Pilot3/P3B5/p3b3.py | 213 ++++++++++++++ 25 files changed, 1542 insertions(+) create mode 100644 Pilot3/P3B5/darts/__init__.py create mode 100644 Pilot3/P3B5/darts/api/__init__.py create mode 100644 Pilot3/P3B5/darts/api/config.py create mode 100644 Pilot3/P3B5/darts/api/info.py create mode 100644 Pilot3/P3B5/darts/api/model.py create mode 100644 Pilot3/P3B5/darts/architecture.py create mode 100644 Pilot3/P3B5/darts/darts.py create mode 100644 Pilot3/P3B5/darts/functional.py create mode 100644 Pilot3/P3B5/darts/genotypes.py create mode 100644 Pilot3/P3B5/darts/meters/__init__.py create mode 100644 Pilot3/P3B5/darts/meters/accuracy.py create mode 100644 Pilot3/P3B5/darts/meters/average.py create mode 100644 Pilot3/P3B5/darts/modules/__init__.py create mode 100644 Pilot3/P3B5/darts/modules/cell.py create mode 100644 Pilot3/P3B5/darts/modules/classifier.py create mode 100644 Pilot3/P3B5/darts/modules/mixed_layer.py create mode 100644 Pilot3/P3B5/darts/modules/network.py create mode 100644 Pilot3/P3B5/darts/modules/operations/__init__.py create mode 100644 Pilot3/P3B5/darts/modules/operations/original.py create mode 100644 Pilot3/P3B5/darts/normal create mode 100644 Pilot3/P3B5/darts/reduction create mode 100644 Pilot3/P3B5/darts/utils/__init__.py create mode 100644 Pilot3/P3B5/darts/utils/logging.py create mode 100644 Pilot3/P3B5/darts/visualize.py create mode 100644 Pilot3/P3B5/p3b3.py diff --git a/Pilot3/P3B5/darts/__init__.py b/Pilot3/P3B5/darts/__init__.py new file mode 100644 index 00000000..974de4cc --- /dev/null +++ b/Pilot3/P3B5/darts/__init__.py @@ -0,0 +1,3 @@ +__author__ = 'Todd Young' +__email__ = 'youngmt1@ornl.gov' +__version__ = '0.1.0' diff --git a/Pilot3/P3B5/darts/api/__init__.py b/Pilot3/P3B5/darts/api/__init__.py new file mode 100644 index 00000000..3b4d86eb --- /dev/null +++ b/Pilot3/P3B5/darts/api/__init__.py @@ -0,0 +1 @@ +from .model import Model diff --git a/Pilot3/P3B5/darts/api/config.py b/Pilot3/P3B5/darts/api/config.py new file mode 100644 index 00000000..8d60579c --- /dev/null +++ b/Pilot3/P3B5/darts/api/config.py @@ -0,0 +1,88 @@ +import os +import datetime as dtm +from collections import namedtuple + +import torch + + +def banner(device): + """ Print a banner of the system config + + Parameters + ---------- + device : 
torch.device + """ + print("=" * 80) + info = get_torch_info() + torch_msg = ( + f"Pytorch version: {info.torch_version} ", + f"cuda version {info.cuda_version} ", + f"cudnn version {info.cudnn_version}" + ) + print(''.join(torch_msg)) + + if device.type == 'cuda': + device_idx = get_device_idx(device) + usage = memory_usage(device) + print(f"CUDA Device name {torch.cuda.get_device_name(device_idx)}") + print(f"CUDA memory - total: {usage.total} current usage: {usage.used}") + else: + print(f'Using CPU') + + print(dtm.datetime.now().strftime("%Y/%m/%d - %H:%M:%S")) + print("=" * 80) + + +def get_torch_info(): + """ Get Pytorch system info """ + VersionInfo = namedtuple( + "PytorchVersionInfo", + "torch_version cuda_version cudnn_version" + ) + return VersionInfo(torch.__version__, torch.version.cuda, torch.backends.cudnn.version()) + + +def get_device_idx(device): + """ Get the CUDA device from torch + + Parameters + ---------- + device : torch.device + + Returns + ------- + index of the CUDA device + """ + return 0 if device.index is None else device.index + + +def memory_usage(device): + """ Get GPU memory total and usage + + Parameters + ---------- + device : torch.device + + Returns + ------- + usage : namedtuple(torch.device, int, int) + Total memory of the GPU and its current usage + """ + if device.type == "cpu": + raise ValueError(f'Can only query GPU memory usage, but device is {device}') + + Usage = namedtuple("MemoryUsage", "device total used") + + if device.type == "cuda": + device_idx = get_device_idx(device) + + try: + total, used = os.popen( + 'nvidia-smi --query-gpu=memory.total,memory.used --format=csv,nounits,noheader' + ).read().split('\n')[device_idx].split(',') + except: + raise ValueError( + f'Attempted to query CUDA device {device_idx}, does this system have that many GPUs?' 
+ ) + + return Usage(device, int(total), int(used)) \ No newline at end of file diff --git a/Pilot3/P3B5/darts/api/info.py b/Pilot3/P3B5/darts/api/info.py new file mode 100644 index 00000000..2a394c1f --- /dev/null +++ b/Pilot3/P3B5/darts/api/info.py @@ -0,0 +1,72 @@ +import typing +from collections.abc import abc + +import torch +import numpy as np +import pandas as pd + + +class TrainingHistory: + + def __init__(self): + self.data = [] + + def add(self, epoch_result): + """ Add a datapoint to the history """ + self.data.append(epoch_result) + + def frame(self): + return pd.DataFrame(self.data).set_index('epoch_index') + + +class TrainingInfo(abc.MutableMapping): + """ Information that needs to persist through training """ + + def __init__(self, start_epoch_index=0, run_name: typing.Optional[str]=None, metrics=None, callbacks=None): + self.data_dict = {} # optional information + + self.run_name = run_name + self.history = TrainingHistory() + self.start_epoch_index = start_epoch_index + self.metrics = metrics if metrics is not None else [] + self.callbacks = callbacks if callbacks is not None else [] + + def initialize(self): + for callback in self.callbacks: + callback.on_initialization(self) + + def on_train_begin(self): + """ Start the training process - always used, even in restarts """ + for callback in self.callbacks: + callback.on_train_begin(self) + + def on_train_end(self): + """ Finalize training process """ + for callback in self.callbacks: + callback.on_train_end(self) + + def __getitem__(self, key): + return self.data[key] + + def __setitem__(self, key, value): + self.data[key] = value + + def __delitem__(self, key): + del self.data[key] + + def __iter__(self): + return iter(self.data) + + def __len__(self): + return len(self.data) + + def __contains__(self, key): + return key in self.data + + +class EpochResultAccumulator(abc.MutableMapping): + """ Result of a single epoch of training """ + + def __init__(self, global_epoch_idx, metrics): + self.metrics = metrics + self.global_epoch_idx = global_epoch_idx \ No newline at end of file diff --git a/Pilot3/P3B5/darts/api/model.py b/Pilot3/P3B5/darts/api/model.py new file mode 100644 index 00000000..4911f6eb --- /dev/null +++ b/Pilot3/P3B5/darts/api/model.py @@ -0,0 +1,35 @@ +import hashlib + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class Model(nn.Module): + """ Class representing sampleable neural network model """ + + def num_params(self): + """ Get the number of model parameters. 
""" + return sum(p.numel() for p in self.parameters()) + + def summary(self, hashsummary=False): + print(self) + print("-" * 80) + n_params = self.num_params() + print(f"Number of model parameters: {n_params}") + print("-" * 80) + + if hashsummary: + print('Hash Summary:') + for idx, hashvalue in enumerate(self.hashsummary()): + print(f"{idx}: {hashvalue}") + + def hashsummary(self): + """ Print a model summary - checksums of each layer parameters """ + children = list(self.children()) + + result = [] + for child in children: + result.extend(hashlib.sha256(x.detach().cpu().numpy().tobytes()).hexdigest() for x in child.parameters()) + + return result diff --git a/Pilot3/P3B5/darts/architecture.py b/Pilot3/P3B5/darts/architecture.py new file mode 100644 index 00000000..7c303ffc --- /dev/null +++ b/Pilot3/P3B5/darts/architecture.py @@ -0,0 +1,198 @@ +import torch +from torch import optim, autograd + +import darts.functional as F + + +class Hyperparameters: + alpha_lr = 3e-4 + alpha_wd = 1e-3 + + +class Architecture: + + def __init__(self, model, args, hyperparams=Hyperparameters(), device='cpu'): + self.momentum = args.momentum # momentum for optimizer of theta + self.wd = args.wd # weight decay for optimizer of model's theta + self.model = model # main model with respect to theta and alpha + self.device = device + + # this is the optimizer to optimize alpha parameter + self.optimizer = optim.Adam( + self.model.arch_parameters(), + lr=hyperparams.alpha_lr, + betas=(0.5, 0.999), + weight_decay=hyperparams.alpha_wd + ) + + def comp_unrolled_model(self, data, target, eta, optimizer): + """ Loss on train set and then update w_pi, not-in-place + + Parameters + ---------- + data : torch.tensor + + target : torch.tensor + + eta : float + + optimizer : torch.optim.optimizer + optimizer of theta, not optimizer of alpha + + Returns + ------- + model_unrolled + """ + # forward to get loss + loss = self.model.loss(data, target) + # flatten current weights + theta = F.flatten(self.model.parameters()).detach() + # theta: torch.Size([1930618]) + # print('theta:', theta.shape) + try: + # fetch momentum data from theta optimizer + moment = F.flatten(optimizer.state[v]['momentum_buffer'] for v in self.model.parameters()) + moment.mul_(self.momentum) + except: + moment = torch.zeros_like(theta) + + # flatten all gradients + dtheta = F.flatten(autograd.grad(loss, self.model.parameters())).data + # indeed, here we implement a simple SGD with momentum and weight decay + # theta = theta - eta * (moment + weight decay + dtheta) + theta = theta.sub(eta, moment + dtheta + self.wd * theta) + # construct a new model + unrolled_model = self.construct_model_from_theta(theta) + + return unrolled_model.to(self.device) + + def step(self, x_train, target_train, x_valid, target_valid, eta, optimizer, unrolled): + """ + update alpha parameter by manually computing the gradients + :param x_train: + :param target_train: + :param x_valid: + :param target_valid: + :param eta: + :param optimizer: theta optimizer + :param unrolled: + :return: + """ + # alpha optimizer + self.optimizer.zero_grad() + + # compute the gradient and write it into tensor.grad + # instead of generated by loss.backward() + if unrolled: + self.backward_step_unrolled(x_train, target_train, x_valid, target_valid, eta, optimizer) + else: + # directly optimize alpha on w, instead of w_pi + self.backward_step(x_valid, target_valid) + + self.optimizer.step() + + def backward_step(self, x_valid, target_valid): + """ + simply train on validate set and backward + 
:param x_valid: + :param target_valid: + :return: + """ + loss = self.model.loss(x_valid, target_valid) + # both alpha and theta require grad but only alpha optimizer will + # step in current phase. + loss.backward() + + def backward_step_unrolled(self, x_train, target_train, x_valid, target_valid, eta, optimizer): + """ + train on validate set based on update w_pi + :param x_train: + :param target_train: + :param x_valid: + :param target_valid: + :param eta: 0.01, according to author's comments + :param optimizer: theta optimizer + :return: + """ + # theta_pi = theta - lr * grad + unrolled_model = self.comp_unrolled_model(x_train, target_train, eta, optimizer) + # calculate loss on theta_pi + unrolled_loss = unrolled_model.loss(x_valid, target_valid) + + # this will update theta_pi model, but NOT theta model + unrolled_loss.backward() + # grad(L(w', a), a), part of Eq. 6 + dalpha = [v.grad for v in unrolled_model.arch_parameters()] + vector = [v.grad.data for v in unrolled_model.parameters()] + implicit_grads = self.hessian_vector_product(vector, x_train, target_train) + + for g, ig in zip(dalpha, implicit_grads): + # g = g - eta * ig, from Eq. 6 + g.data.sub_(eta, ig.data) + + # write updated alpha into original model + for v, g in zip(self.model.arch_parameters(), dalpha): + if v.grad is None: + v.grad = g.data + else: + v.grad.data.copy_(g.data) + + def construct_model_from_theta(self, theta): + """ + construct a new model with initialized weight from theta + it use .state_dict() and load_state_dict() instead of + .parameters() + fill_() + :param theta: flatten weights, need to reshape to original shape + :return: + """ + model = self.model.new() + state_dict = self.model.state_dict() + + params, offset = {}, 0 + for k, v in self.model.named_parameters(): + v_length = v.numel() + # restore theta[] value to original shape + params[k] = theta[offset: offset + v_length].view(v.size()) + offset += v_length + + assert offset == len(theta) + state_dict.update(params) + model.load_state_dict(state_dict) + model.to(self.device) + return model + + def hessian_vector_product(self, vector, data, target, r=1e-2): + """ + slightly touch vector value to estimate the gradient with respect to alpha + refer to Eq. 7 for more details. + :param vector: gradient.data of parameters theta + :param x: + :param target: + :param r: + :return: + """ + R = r / F.flatten(vector).norm() + + for p, v in zip(self.model.parameters(), vector): + # w+ = w + R * v + p.data.add_(R, v) + + loss = self.model.loss(data, target) + # gradient with respect to alpha + grads_p = autograd.grad(loss, self.model.arch_parameters()) + + for p, v in zip(self.model.parameters(), vector): + # w- = (w+R*v) - 2R*v + p.data.sub_(2 * R, v) + + loss = self.model.loss(data, target) + grads_n = autograd.grad(loss, self.model.arch_parameters()) + + for p, v in zip(self.model.parameters(), vector): + # w = (w+R*v) - 2R*v + R*v + p.data.add_(R, v) + + h= [(x - y).div_(2 * R) for x, y in zip(grads_p, grads_n)] + # h len: 2 h0 torch.Size([14, 8]) + # print('h len:', len(h), 'h0', h[0].shape) + return h diff --git a/Pilot3/P3B5/darts/darts.py b/Pilot3/P3B5/darts/darts.py new file mode 100644 index 00000000..e69de29b diff --git a/Pilot3/P3B5/darts/functional.py b/Pilot3/P3B5/darts/functional.py new file mode 100644 index 00000000..44b0c064 --- /dev/null +++ b/Pilot3/P3B5/darts/functional.py @@ -0,0 +1,46 @@ +import torch + + +def flatten(tensor): + """ Flatten a tensor. 
+ + Parameters + ---------- + tensor : torch.tensor + + Returns + ------- + Flattened tensor + + Example + ------- + >>> x = torch.tensor([[0,1],[2,3]]) + >>> x_flattened = flatten(x) + >>> print(x) + >>> tensor([[0, 1], + [2, 3]]) + >>> print(x_flattened) + >>> tensor([0, 1, 2, 3]) + """ + return torch.cat([x.view(-1) for x in tensor]) + + +def multitask_loss(logits, target, criterion, reduce='mean'): + """ Compute multitask loss """ + losses = {} + for task, label in target.items(): + losses[task] = criterion(logits[task], label) + + if reduce: + total = 0 + for _, value in losses.items(): + total += value + + if reduce == "mean": + losses = total / len(losses) + elif reduce == "sum": + losses = total + else: + raise ValueError('Reduced loss must use either `mean` or `sum`!') + + return losses \ No newline at end of file diff --git a/Pilot3/P3B5/darts/genotypes.py b/Pilot3/P3B5/darts/genotypes.py new file mode 100644 index 00000000..caf874e7 --- /dev/null +++ b/Pilot3/P3B5/darts/genotypes.py @@ -0,0 +1,116 @@ +from collections import namedtuple + + +Genotype = namedtuple('Genotype', 'normal normal_concat reduce reduce_concat') + + +PRIMITIVES = [ + 'none', + 'max_pool_3', + 'avg_pool_3', + 'skip_connect', + 'sep_conv_3', + 'sep_conv_5', + 'dil_conv_3', + 'dil_conv_5', +] + + +AmoebaNet = Genotype( + normal=[ + ('avg_pool_3', 0), + ('max_pool_3', 1), + ('sep_conv_3', 0), + ('sep_conv_5', 2), + ('sep_conv_3', 0), + ('avg_pool_3', 3), + ('sep_conv_3', 1), + ('skip_connect', 1), + ('skip_connect', 0), + ('avg_pool_3', 1), + ], + normal_concat=[4, 5, 6], + reduce=[ + ('avg_pool_3', 0), + ('sep_conv_3', 1), + ('max_pool_3', 0), + ('sep_conv_7', 2), + ('sep_conv_7', 0), + ('avg_pool_3', 1), + ('max_pool_3', 0), + ('max_pool_3', 1), + ('conv_7x1_1', 0), + ('sep_conv_3', 5), + ], + reduce_concat=[3, 4, 6] +) + + +GradeNet36 = Genotype( + normal=[ + ('sep_conv_5', 1), + ('dil_conv_3', 0), + ('sep_conv_5', 2), + ('max_pool_3', 1), + ('max_pool_3', 2), + ('max_pool_3', 1), + ('skip_connect', 4), + ('max_pool_3', 1), + ], + normal_concat = [4, 5, 6], + reduce=[ + ('sep_conv_5', 0), + ('sep_conv_5', 1), + ('max_pool_3', 2), + ('sep_conv_3', 1), + ('dil_conv_5', 3), + ('sep_conv_5', 2), + ('sep_conv_5', 3), + ('dil_conv_5', 4) + ], + reduce_concat = [4, 5, 6] +) + + +Multitask = Genotype( + normal=[ + ('avg_pool_3', 1), + ('sep_conv_3', 0), + ('avg_pool_3', 1), + ('sep_conv_5', 2), + ('max_pool_3', 2), + ('max_pool_3', 1), + ('skip_connect', 4), + ('avg_pool_3', 1) + ], + normal_concat = [4, 5, 6], + reduce=[ + ('sep_conv_5', 1), + ('sep_conv_5', 0), + ('sep_conv_5', 2), + ('sep_conv_3', 0), + ('sep_conv_5', 3), + ('sep_conv_5', 2), + ('sep_conv_5', 4), + ('sep_conv_5', 3) + ], + reduce_concat = [4, 5, 6] +) + + +MultitaskN2C3 = Genotype( + normal=[ + ('max_pool_3', 0), + ('sep_conv_5', 1), + ('sep_conv_5', 1), + ('sep_conv_5', 0) + ], + normal_concat = [2, 3, 4], + reduce=[ + ('sep_conv_5', 1), + ('sep_conv_5', 0), + ('sep_conv_5', 1), + ('sep_conv_3', 2) + ], + reduce_concat=[2, 3, 4] +) \ No newline at end of file diff --git a/Pilot3/P3B5/darts/meters/__init__.py b/Pilot3/P3B5/darts/meters/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/Pilot3/P3B5/darts/meters/accuracy.py b/Pilot3/P3B5/darts/meters/accuracy.py new file mode 100644 index 00000000..66b4e63d --- /dev/null +++ b/Pilot3/P3B5/darts/meters/accuracy.py @@ -0,0 +1,30 @@ +from darts.meters.average import AverageMeter + + +class MultitaskAccuracyMeter: + + def __init__(self, tasks): + self.tasks = tasks + self.reset() + 
+ def reset(self): + self.meters = self.create_meters() + + def create_meters(self): + """ Create an average meter for each task """ + meters = {} + for task, _ in self.tasks.items(): + meters[task] = AverageMeter('Acc@1', ':6.2f') + return meters + + def get_avg_accuracy(self, task): + return self.meters[task].avg + + def get_accuracy(self, task): + return self.meters[task].val + + def update(self, accuracies, batch_size): + for task, acc in accuracies.items(): + self.meters[task].update(acc[0].item(), batch_size) + + diff --git a/Pilot3/P3B5/darts/meters/average.py b/Pilot3/P3B5/darts/meters/average.py new file mode 100644 index 00000000..e82af8ec --- /dev/null +++ b/Pilot3/P3B5/darts/meters/average.py @@ -0,0 +1,23 @@ +class AverageMeter: + """Computes and stores the average and current value""" + + def __init__(self, name, fmt=':f'): + self.name = name + self.fmt = fmt + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + + def __str__(self): + fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})' + return fmtstr.format(**self.__dict__) diff --git a/Pilot3/P3B5/darts/modules/__init__.py b/Pilot3/P3B5/darts/modules/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/Pilot3/P3B5/darts/modules/cell.py b/Pilot3/P3B5/darts/modules/cell.py new file mode 100644 index 00000000..a25424cf --- /dev/null +++ b/Pilot3/P3B5/darts/modules/cell.py @@ -0,0 +1,80 @@ +import torch +import torch.nn as nn + +from darts.api import Model +from darts.modules.mixed_layer import MixedLayer +from darts.modules.operations.original import ConvBlock, FactorizedReduce + + +class Cell(Model): + + def __init__(self, num_nodes, multiplier, cpp, cp, c, reduction, reduction_prev): + """ + :param steps: 4, number of layers inside a cell + :param multiplier: 4 + :param cpp: 48 + :param cp: 48 + :param c: 16 + :param reduction: indicates whether to reduce the output maps width + :param reduction_prev: when previous cell reduced width, s1_d = s0_d//2 + in order to keep same shape between s1 and s0, we adopt prep0 layer to + reduce the s0 width by half. 
+ """ + super(Cell, self).__init__() + + # indicating current cell is reduction or not + self.reduction = reduction + self.reduction_prev = reduction_prev + + # preprocess0 deal with output from prev_prev cell + if reduction_prev: + # if prev cell has reduced channel/double width, + # it will reduce width by half + self.preprocess0 = FactorizedReduce(cpp, c, affine=False) + else: + self.preprocess0 = ConvBlock(cpp, c, 1, 1, 0, affine=False) + # preprocess1 deal with output from prev cell + self.preprocess1 = ConvBlock(cp, c, 1, 1, 0, affine=False) + + # steps inside a cell + self.num_nodes = num_nodes # 4 + self.multiplier = multiplier # 4 + + self.layers = nn.ModuleList() + + for i in range(self.num_nodes): + # for each i inside cell, it connects with all previous output + # plus previous two cells' output + for j in range(2 + i): + # for reduction cell, it will reduce the heading 2 inputs only + stride = 2 if reduction and j < 2 else 1 + layer = MixedLayer(c, stride) + self.layers.append(layer) + + def forward(self, s0, s1, weights): + """ + :param s0: + :param s1: + :param weights: [14, 8] + :return: + """ + #print('s0:', s0.shape,end='=>') + s0 = self.preprocess0(s0) # [40, 48, 32, 32], [40, 16, 32, 32] + #print(s0.shape, self.reduction_prev) + #print('s1:', s1.shape,end='=>') + s1 = self.preprocess1(s1) # [40, 48, 32, 32], [40, 16, 32, 32] + #print(s1.shape) + + states = [s0, s1] + offset = 0 + # for each node, receive input from all previous intermediate nodes and s0, s1 + for i in range(self.num_nodes): # 4 + # [40, 16, 32, 32] + s = sum(self.layers[offset + j](h, weights[offset + j]) for j, h in enumerate(states)) + offset += len(states) + # append one state since s is the elem-wise addition of all output + states.append(s) + #print('node:',i, s.shape, self.reduction) + + # concat along dim=channel + return torch.cat(states[-self.multiplier:], dim=1) # 6 of [40, 16, 32, 32] \ No newline at end of file diff --git a/Pilot3/P3B5/darts/modules/classifier.py b/Pilot3/P3B5/darts/modules/classifier.py new file mode 100644 index 00000000..18910616 --- /dev/null +++ b/Pilot3/P3B5/darts/modules/classifier.py @@ -0,0 +1,26 @@ +from typing import Dict +import torch.nn as nn + + +class MultitaskClassifier(nn.Module): + + def __init__(self, input_dim: int, tasks: Dict[str, int]): + super(MultitaskClassifier, self).__init__() + self.tasks = tasks + + for task, num_classes in tasks.items(): + self.add_module( + task, + nn.Linear(input_dim, num_classes) + ) + + def num_classes(self, task): + """ Get number of classes for a task. """ + return self.tasks[task] + + def forward(self, x): + logits = {} + for task, _ in self.tasks.items(): + logits[task] = self._modules[task](x) + + return logits diff --git a/Pilot3/P3B5/darts/modules/mixed_layer.py b/Pilot3/P3B5/darts/modules/mixed_layer.py new file mode 100644 index 00000000..4ecd47d4 --- /dev/null +++ b/Pilot3/P3B5/darts/modules/mixed_layer.py @@ -0,0 +1,41 @@ +import torch +import torch.nn as nn + +from darts.api import Model +from darts.genotypes import PRIMITIVES +from darts.modules.operations.original import OPS + + +class MixedLayer(Model): + """ A mixture of 8 unit types + + We use weights to aggregate these outputs while training. + and softmax to select the strongest edges while inference. 
+ """ + def __init__(self, c, stride): + super(MixedLayer, self).__init__() + self.reset(c, stride) + + def reset(self, c, stride): + self.layers = nn.ModuleList() + + for primitive in PRIMITIVES: + layer = OPS[primitive](c, stride, False) + + if 'pool' in primitive: + layer = nn.Sequential(layer, nn.BatchNorm1d(c, affine=False)) + + self.layers.append(layer) + + def forward(self, x, weights): + """ + Parameters + ---------- + x : torch.tensor + Data + + Weights : torch.tensor + alpha, [op_num:8], the output = sum of alpha * op(x) + """ + x = [w * layer(x) for w, layer in zip(weights, self.layers)] + return sum(x) diff --git a/Pilot3/P3B5/darts/modules/network.py b/Pilot3/P3B5/darts/modules/network.py new file mode 100644 index 00000000..8b0ea92c --- /dev/null +++ b/Pilot3/P3B5/darts/modules/network.py @@ -0,0 +1,260 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +from darts.api import Model +from darts.modules.cell import Cell +from darts.modules.classifier import MultitaskClassifier +from darts.genotypes import PRIMITIVES, Genotype + + +class Hyperparameters: + c = 8 + num_nodes = 2 + num_cells = 3 + channel_multiplier = 2 + stem_channel_multiplier = 2 + num_embeddings = 35095 # vocab size + embedding_dim = 1500 + + +class Network(Model): + """ Collection of cells """ + + def __init__(self, tasks, criterion, device='cpu', hyperparams=Hyperparameters()): + super(Network, self).__init__() + self.tasks = tasks + self.criterion = criterion + self.device = device + self.c = hyperparams.c + self.num_cells = hyperparams.num_cells + self.num_nodes = hyperparams.num_nodes + self.channel_multiplier = hyperparams.channel_multiplier + + # stem_multiplier is for stem network, + # and multiplier is for general cell + c_curr = hyperparams.stem_channel_multiplier * self.c # 3*16 + # stem network, convert 3 channel to c_curr + self.stem = nn.Sequential( + nn.Embedding( + num_embeddings=hyperparams.num_embeddings, + embedding_dim=hyperparams.embedding_dim + ), + nn.Conv1d(hyperparams.embedding_dim, c_curr, 3, padding=1, bias=False), + nn.BatchNorm1d(c_curr) + ).to(self.device) + + # c_curr means a factor of the output channels of current cell + # output channels = multiplier * c_curr + cpp, cp, c_curr = c_curr, c_curr, self.c + self.cells = nn.ModuleList() + reduction_prev = False + for i in range(hyperparams.num_cells): + + # for layer in the middle [1/3, 2/3], reduce via stride=2 + if i in [hyperparams.num_cells // 3, 2 * hyperparams.num_cells // 3]: + c_curr *= 2 + reduction = True + else: + reduction = False + + # [cp, h, h] => [multiplier*c_curr, h/h//2, h/h//2] + # the output channels = multiplier * c_curr + cell = Cell( + hyperparams.num_nodes, + hyperparams.channel_multiplier, + cpp, + cp, + c_curr, + reduction, + reduction_prev + ).to(self.device) + # update reduction_prev + reduction_prev = reduction + self.cells += [cell] + cpp, cp = cp, hyperparams.channel_multiplier * c_curr + + # adaptive pooling output size to 1x1 + self.global_pooling = nn.AdaptiveAvgPool1d(1) + # since cp records last cell's output channels + # it indicates the input channel number + # self.classifier = self.fc_layers(cp, tasks) + self.classifier = MultitaskClassifier(cp, tasks) + + # k is the total number of edges inside single cell, 14 + k = sum(1 for i in range(self.num_nodes) for j in range(2 + i)) + num_ops = len(PRIMITIVES) # 8 + + # TODO + # this kind of implementation will add alpha into self.parameters() + # it has num k of alpha parameters, and each alpha shape: [num_ops] + # it 
requires grad and can be converted to cpu/gpu automatically + self.alpha_normal = nn.Parameter(torch.randn(k, num_ops)) + self.alpha_reduce = nn.Parameter(torch.randn(k, num_ops)) + + with torch.no_grad(): + # initialize to smaller value + self.alpha_normal.mul_(1e-3) + self.alpha_reduce.mul_(1e-3) + + self._arch_parameters = [ + self.alpha_normal, + self.alpha_reduce, + ] + + def fc_layers(self, cp, tasks): + """ Create fully connnected layers for each task """ + fc_layers = {} + for task, dim in tasks.items(): + fc_layers[task] = nn.Linear(cp, dim).to(self.device) + return fc_layers + + def new(self): + """ Create a new model initialzed with current alpha parameters. + + Weights are left untouched. + + Returns + ------- + model : Network + New model initialized with current alpha. + """ + model = Network( + self.tasks, + self.criterion + ).to(self.device) + + for x, y in zip(model.arch_parameters(), self.arch_parameters()): + x.data.copy_(y.data) + + return model + + def forward(self, x): + """ + in: torch.Size([3, 3, 32, 32]) + stem: torch.Size([3, 48, 32, 32]) + cell: 0 torch.Size([3, 64, 32, 32]) False + cell: 1 torch.Size([3, 64, 32, 32]) False + cell: 2 torch.Size([3, 128, 16, 16]) True + cell: 3 torch.Size([3, 128, 16, 16]) False + cell: 4 torch.Size([3, 128, 16, 16]) False + cell: 5 torch.Size([3, 256, 8, 8]) True + cell: 6 torch.Size([3, 256, 8, 8]) False + cell: 7 torch.Size([3, 256, 8, 8]) False + pool: torch.Size([16, 256, 1, 1]) + linear: [b, 10] + :param x: + :return: + """ + #print('network in:', x.shape) + # s0 & s1 means the last cells' output + s0 = s1 = self.stem(x) # [b, 3, 32, 32] => [b, 48, 32, 32] + #print('network stem:', s0.shape) + #print('network stem1:', s1.shape) + + for i, cell in enumerate(self.cells): + # weights are shared across all reduction cell or normal cell + # according to current cell's type, it choose which architecture parameters + # to use + if cell.reduction: # if current cell is reduction cell + weights = F.softmax(self.alpha_reduce, dim=-1) + else: + weights = F.softmax(self.alpha_normal, dim=-1) # [14, 8] + # execute cell() firstly and then assign s0=s1, s1=result + s0, s1 = s1, cell(s0, s1, weights) # [40, 64, 32, 32] + #print('cell:',i, s1.shape, cell.reduction, cell.reduction_prev) + #print('\n') + + # s1 is the last cell's output + out = self.global_pooling(s1) + # logits = {} + # for task, fc in self.classifier.items(): + # logits[task] = fc(out.view(out.size(0), -1)) + logits = self.classifier(out.view(out.size(0), -1)) + + return logits + + def loss(self, data, target, reduce='mean'): + """ Calculate a value of loss function """ + logits = self(data) + + for task, logit in logits.items(): + logits[task] = logit.to(self.device) + + losses = {} + for task, label in target.items(): + label = label.to(self.device) + losses[task] = self.criterion(logits[task], label) + + if reduce: + total = 0 + for _, value in losses.items(): + total += value + + if reduce == "mean": + losses = total / len(losses) + elif reduce == "sum": + losses = total + else: + raise ValueError('Reduced loss must use either `mean` or `sum`!') + + return losses + + def arch_parameters(self): + return self._arch_parameters + + def genotype(self): + """ + :return: + """ + def _parse(weights): + """ + :param weights: [14, 8] + :return: + """ + gene = [] + n = 2 + start = 0 + for i in range(self.num_nodes): # for each node + end = start + n + W = weights[start:end].copy() # [2, 8], [3, 8], ... 
+ edges = sorted(range(i + 2), # i+2 is the number of connection for node i + key=lambda x: -max(W[x][k] # by descending order + for k in range(len(W[x])) # get strongest ops + if k != PRIMITIVES.index('none')) + )[:2] # only has two inputs + for j in edges: # for every input nodes j of current node i + k_best = None + for k in range(len(W[j])): # get strongest ops for current input j->i + if k != PRIMITIVES.index('none'): + if k_best is None or W[j][k] > W[j][k_best]: + k_best = k + gene.append((PRIMITIVES[k_best], j)) # save ops and input node + start = end + n += 1 + return gene + + gene_normal = _parse(F.softmax(self.alpha_normal, dim=-1).data.cpu().numpy()) + gene_reduce = _parse(F.softmax(self.alpha_reduce, dim=-1).data.cpu().numpy()) + + concat = range(2 + self.num_nodes - self.channel_multiplier, self.num_nodes + 2) + genotype = Genotype( + normal=gene_normal, normal_concat=concat, + reduce=gene_reduce, reduce_concat=concat + ) + + return genotype + + +def new(c, num_classes, num_layers, criterion, device, steps=4, multiplier=4, stem_multiplier=3): + """ + create a new model and initialize it with current alpha parameters. + However, its weights are left untouched. + :return: + """ + model = Network(c, num_classes, num_layers, criterion, steps, multiplier, stem_multiplier).to(device) + + for x, y in zip(model_new.arch_parameters(), self.arch_parameters()): + x.data.copy_(y.data) + + return model diff --git a/Pilot3/P3B5/darts/modules/operations/__init__.py b/Pilot3/P3B5/darts/modules/operations/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/Pilot3/P3B5/darts/modules/operations/original.py b/Pilot3/P3B5/darts/modules/operations/original.py new file mode 100644 index 00000000..9bc5e14b --- /dev/null +++ b/Pilot3/P3B5/darts/modules/operations/original.py @@ -0,0 +1,167 @@ +""" +CNN NLP operations closely modeled after the original paper's vision task. 
+""" + +import torch +import torch.nn as nn + +from darts.api import Model + + +OPS = { + 'none' : lambda c, stride, affine: Zero(stride), + 'avg_pool_3' : lambda c, stride, affine: nn.AvgPool1d(3, stride=stride, padding=1, count_include_pad=False), + 'max_pool_3' : lambda c, stride, affine: nn.MaxPool1d(3, stride=stride, padding=1), + 'skip_connect': lambda c, stride, affine: Identity() if stride == 1 else FactorizedReduce(c, c, affine=affine), + 'sep_conv_3' : lambda c, stride, affine: SepConv(c, c, 3, stride, 1, affine=affine), + 'sep_conv_5' : lambda c, stride, affine: SepConv(c, c, 5, stride, 2, affine=affine), + 'sep_conv_7' : lambda c, stride, affine: SepConv(c, c, 7, stride, 3, affine=affine), + 'dil_conv_3' : lambda c, stride, affine: DilConv(c, c, 3, stride, 2, 2, affine=affine), + 'dil_conv_5' : lambda c, stride, affine: DilConv(c, c, 5, stride, 4, 2, affine=affine), + 'convblock_7' : lambda c, stride, affine: ConvBlock(c, c, 7, stride, 3, affine=affine), +} + + +class ConvBlock(Model): + """ ReLu -> Conv1d -> BatchNorm """ + + def __init__(self, c_in, c_out, kernel_size, stride, padding, affine=True): + super(ConvBlock, self).__init__() + + self.op = nn.Sequential( + nn.ReLU(inplace=False), + nn.Conv1d(c_in, c_out, kernel_size, stride=stride, padding=padding, bias=False), + nn.BatchNorm1d(c_out, affine=affine) + ) + + def forward(self, x): + return self.op(x) + + +class DilConv(Model): + """ ReLU Dilated Convolution """ + + def __init__(self, c_in, c_out, kernel_size, stride, padding, dilation, affine=True): + super(DilConv, self).__init__() + + self.op = nn.Sequential( + nn.ReLU(inplace=False), + + nn.Conv1d( + c_in, + c_in, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=c_in, + bias=False + ), + + nn.Conv1d( + c_in, + c_out, + kernel_size=1, + padding=0, + bias=False + ), + + nn.BatchNorm1d(c_out, affine=affine), + ) + + def forward(self, x): + return self.op(x) + + +class FactorizedReduce(Model): + """ Reduce the feature maps by half, maintaining number of channels + + Example + ------- + x: torch.Size([2, 10, 12]) + out: [batch_size, c_out, d//2] + out: torch.Size([2, 10, 6]) + """ + + def __init__(self, c_in, c_out, affine=True): + super(FactorizedReduce, self).__init__() + assert c_out % 2 == 0 + + self.conv_1 = nn.Conv1d(c_in, c_out // 2, 1, stride=2, padding=0, bias=False) + self.conv_2 = nn.Conv1d(c_in, c_out // 2, 1, stride=2, padding=0, bias=False) + self.bn = nn.BatchNorm1d(c_out, affine=affine) + + def forward(self, x): + x = torch.relu(x) + out = torch.cat([self.conv_1(x), self.conv_2(x[:, :, 1:])], dim=1) + out = self.bn(out) + return out + + +class Identity(Model): + + def __init__(self): + super(Identity, self).__init__() + + def forward(self, x): + return x + + +class SepConv(Model): + """ Separable Convolution Block """ + def __init__(self, c_in, c_out, kernel_size, stride, padding, affine=True): + super(SepConv, self).__init__() + + self.op = nn.Sequential( + nn.ReLU(inplace=False), + + nn.Conv1d( + c_in, + c_in, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=c_in, + bias=False + ), + + nn.Conv1d( + c_in, + c_in, + kernel_size=1, + padding=0, + bias=False + ), + + nn.BatchNorm1d(c_in, affine=affine), + nn.ReLU(inplace=False), + + nn.Conv1d( + c_in, + c_in, + kernel_size=kernel_size, + stride=1, + padding=padding, + groups=c_in, + bias=False + ), + + nn.Conv1d(c_in, c_out, kernel_size=1, padding=0, bias=False), + nn.BatchNorm1d(c_out, affine=affine), + ) + + def forward(self, x): + return 
self.op(x) + + +class Zero(nn.Module): + """ Zero tensor by stride """ + + def __init__(self, stride): + super(Zero, self).__init__() + self.stride = stride + + def forward(self, x): + if self.stride == 1: + return x.mul(0.) + return x[:, :, ::self.stride].mul(0.) diff --git a/Pilot3/P3B5/darts/normal b/Pilot3/P3B5/darts/normal new file mode 100644 index 00000000..64ac240d --- /dev/null +++ b/Pilot3/P3B5/darts/normal @@ -0,0 +1,24 @@ +digraph { + node [align=center fontname=times fontsize=20 height=0.5 penwidth=2 shape=rect style=filled width=0.5] + edge [fontname=times fontsize=20] +rankdir=LR + "c_{k-2}" [fillcolor=darkseagreen2] + "c_{k-1}" [fillcolor=darkseagreen2] + 0 [fillcolor=lightblue] + 1 [fillcolor=lightblue] + 2 [fillcolor=lightblue] + 3 [fillcolor=lightblue] + "c_{k-1}" -> 0 [label=avg_pool_3 fillcolor=gray] + "c_{k-2}" -> 0 [label=sep_conv_3 fillcolor=gray] + "c_{k-1}" -> 1 [label=avg_pool_3 fillcolor=gray] + 0 -> 1 [label=sep_conv_5 fillcolor=gray] + 0 -> 2 [label=max_pool_3 fillcolor=gray] + "c_{k-1}" -> 2 [label=max_pool_3 fillcolor=gray] + 2 -> 3 [label=skip_connect fillcolor=gray] + "c_{k-1}" -> 3 [label=avg_pool_3 fillcolor=gray] + "c_{k}" [fillcolor=palegoldenrod] + 0 -> "c_{k}" [fillcolor=gray] + 1 -> "c_{k}" [fillcolor=gray] + 2 -> "c_{k}" [fillcolor=gray] + 3 -> "c_{k}" [fillcolor=gray] +} diff --git a/Pilot3/P3B5/darts/reduction b/Pilot3/P3B5/darts/reduction new file mode 100644 index 00000000..8dca275e --- /dev/null +++ b/Pilot3/P3B5/darts/reduction @@ -0,0 +1,24 @@ +digraph { + node [align=center fontname=times fontsize=20 height=0.5 penwidth=2 shape=rect style=filled width=0.5] + edge [fontname=times fontsize=20] +rankdir=LR + "c_{k-2}" [fillcolor=darkseagreen2] + "c_{k-1}" [fillcolor=darkseagreen2] + 0 [fillcolor=lightblue] + 1 [fillcolor=lightblue] + 2 [fillcolor=lightblue] + 3 [fillcolor=lightblue] + "c_{k-1}" -> 0 [label=sep_conv_5 fillcolor=gray] + "c_{k-2}" -> 0 [label=sep_conv_5 fillcolor=gray] + 0 -> 1 [label=sep_conv_5 fillcolor=gray] + "c_{k-2}" -> 1 [label=sep_conv_3 fillcolor=gray] + 1 -> 2 [label=sep_conv_5 fillcolor=gray] + 0 -> 2 [label=sep_conv_5 fillcolor=gray] + 2 -> 3 [label=sep_conv_5 fillcolor=gray] + 1 -> 3 [label=sep_conv_5 fillcolor=gray] + "c_{k}" [fillcolor=palegoldenrod] + 0 -> "c_{k}" [fillcolor=gray] + 1 -> "c_{k}" [fillcolor=gray] + 2 -> "c_{k}" [fillcolor=gray] + 3 -> "c_{k}" [fillcolor=gray] +} diff --git a/Pilot3/P3B5/darts/utils/__init__.py b/Pilot3/P3B5/darts/utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/Pilot3/P3B5/darts/utils/logging.py b/Pilot3/P3B5/darts/utils/logging.py new file mode 100644 index 00000000..c846e69f --- /dev/null +++ b/Pilot3/P3B5/darts/utils/logging.py @@ -0,0 +1,25 @@ +from loguru import logger + + +logger.add("darts_p3b3.log", format="{time:YYYY-MM-DD at HH:mm:ss} | {level} | {message}", level="INFO") + + +def log_accuracy(accuracy, split: str='train'): + """ Log the average accuracy + + Parameters + ---------- + accuracy: darts.MultitaskAccuracyMeter + Current accuracy meter state + + split: str + Either training of testing + """ + acc_info = ( + f">>> {split.upper()} Accuracy - Subsite: {accuracy.get_avg_accuracy('subsite'):.4f}, " + f"Laterality: {accuracy.get_avg_accuracy('laterality'):.4f}, " + f"Behavior: {accuracy.get_avg_accuracy('behavior'):.4f}, " + f"Grade: {accuracy.get_avg_accuracy('grade'):.4f}" + ) + + logger.info(acc_info) \ No newline at end of file diff --git a/Pilot3/P3B5/darts/visualize.py b/Pilot3/P3B5/darts/visualize.py new file mode 100644 index 
00000000..bf048ca6 --- /dev/null +++ b/Pilot3/P3B5/darts/visualize.py @@ -0,0 +1,70 @@ +import sys +from graphviz import Digraph +import genotypes + + +def plot(genotype, filename): + """ Plot the graph of a given genotype """ + g = Digraph( + format='pdf', + edge_attr = dict(fontsize='20', fontname="times"), + node_attr = dict( + style='filled', + shape='rect', + align='center', + fontsize='20', + height='0.5', + width='0.5', + penwidth='2', + fontname="times" + ), + engine='dot' + ) + + g.body.extend(['rankdir=LR']) + + g.node("c_{k-2}", fillcolor='darkseagreen2') + g.node("c_{k-1}", fillcolor='darkseagreen2') + + assert len(genotype) % 2 == 0 + steps = len(genotype) // 2 + + for i in range(steps): + g.node(str(i), fillcolor='lightblue') + + for i in range(steps): + for k in [2 * i, 2 * i + 1]: + op, j = genotype[k] + if j == 0: + u = "c_{k-2}" + elif j == 1: + u = "c_{k-1}" + else: + u = str(j - 2) + v = str(i) + g.edge(u, v, label=op, fillcolor="gray") + + g.node("c_{k}", fillcolor='palegoldenrod') + + for i in range(steps): + g.edge(str(i), "c_{k}", fillcolor="gray") + + g.render(filename, view=True) + + +if __name__ == '__main__': + if len(sys.argv) != 2: + print("usage:\n python {} ARCH_NAME".format(sys.argv[0])) + sys.exit(1) + + genotype_name = sys.argv[1] + + try: + genotype = eval('genotypes.{}'.format(genotype_name)) + except AttributeError: + print("{} is not specified in genotypes.py".format(genotype_name)) + sys.exit(1) + + plot(genotype.normal, "normal") + plot(genotype.reduce, "reduction") + diff --git a/Pilot3/P3B5/p3b3.py b/Pilot3/P3B5/p3b3.py new file mode 100644 index 00000000..2f083d1c --- /dev/null +++ b/Pilot3/P3B5/p3b3.py @@ -0,0 +1,213 @@ +import os +import sys +import argparse +from loguru import logger + +import torch +import torch.nn as nn +from torch import optim +import torch.nn.functional as F +from torch.utils.data import DataLoader + +from datastore.data import P3B3 +from hammer.metrics import multitask_accuracy_topk +from hammer.meters.average import AverageMeter + +from darts.api.config import banner +from darts.modules.network import Network +from darts.architecture import Architecture +from darts.functional import multitask_loss +from darts.meters.accuracy import MultitaskAccuracyMeter +from darts.utils.logging import log_accuracy + + +def parse_args(): + parser = argparse.ArgumentParser(description='P3B3 Darts Example') + parser.add_argument('--batch_size', type=int, default=100, metavar='N', + help='input batch size for training (default: 128)') + parser.add_argument('--lr', type=float, default=0.025, + help='init learning rate') + parser.add_argument('--lr_min', type=float, default=0.001, + help='min learning rate') + parser.add_argument('--momentum', type=float, default=0.9, + help='momentum') + parser.add_argument('--wd', type=float, default=3e-4, + help='weight decay') + parser.add_argument('--grad_clip', type=float, default=5, + help='gradient clipping range') + parser.add_argument('--epochs', type=int, default=1, metavar='N', + help='number of epochs to train (default: 10)') + parser.add_argument('--no_cuda', action='store_true', default=False, + help='enables CUDA training') + parser.add_argument('--gpu_id', type=int, default=0, + help='cuda device id for torch.device') + parser.add_argument('--seed', type=int, default=1, metavar='S', + help='random seed (default: 1)') + parser.add_argument('--log_interval', type=int, default=10, metavar='N', + help='how many batches to wait before logging training status') + parser.add_argument('--datapath', 
type=str, default='/Users/yngtodd/data', + help='path to the dataset') + parser.add_argument('--unrolled', action='store_true', default=False, + help='use one-step unrolled validation loss') + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + args.cuda = not args.no_cuda and torch.cuda.is_available() + + device = torch.device(f'cuda:{args.gpu_id}' if args.cuda else "cpu") + banner(device=device) + + train_data = P3B3(args.datapath, 'train', download=True) + valid_data = P3B3(args.datapath, 'test') + + trainloader = DataLoader(train_data, batch_size=args.batch_size) + validloader = DataLoader(valid_data, batch_size=args.batch_size) + + criterion = nn.CrossEntropyLoss().to(device) + + tasks = { + 'subsite': 6, + 'laterality': 2, + 'behavior': 2, + 'grade': 3 + } + + model = Network(tasks=tasks, criterion=criterion, device=device).to(device) + architecture = Architecture(model, args, device=device) + + optimizer = optim.SGD( + model.parameters(), + args.lr, + momentum=args.momentum, + weight_decay=args.wd + ) + + scheduler = optim.lr_scheduler.CosineAnnealingLR( + optimizer, + float(args.epochs), + eta_min=args.lr_min + ) + + for epoch in range(args.epochs): + + scheduler.step() + lr = scheduler.get_lr()[0] + logger.info(f'\nEpoch: {epoch} lr: {lr}') + + genotype = model.genotype() + logger.info(f'Genotype: {genotype}') + + #logger.debug(F.softmax(model.alphas_normal, dim=-1)) + #logger.debug(F.softmax(model.alphas_reduce, dim=-1)) + + # training + train_acc, train_obj = train( + trainloader, + validloader, + model, + architecture, + criterion, + optimizer, + lr, + args, + tasks, + device + ) + + # validation + valid_acc, valid_obj = infer(validloader, model, criterion, args, tasks, device) + + logger.info(f'\nEpoch {epoch} stats:') + log_accuracy(train_acc, 'train') + log_accuracy(valid_acc, 'valid') + + #utils.save(model, os.path.join(args.exp_path, 'search.pt')) + + +def train(trainloader, validloader, model, architecture, criterion, optimizer, lr, args, tasks, device): + losses = AverageMeter('LossMeter') + top1 = MultitaskAccuracyMeter(tasks) + + valid_iter = iter(trainloader) + + for step, (data, target) in enumerate(trainloader): + + batch_size = data.size(0) + model.train() + + data = data.to(device) + + for task, label in target.items(): + target[task] = target[task].to(device) + + x_search, target_search = next(valid_iter) + x_search = x_search.to(device) + + for task, label in target_search.items(): + target_search[task] = target_search[task].to(device) + + # 1. update alpha + architecture.step( + data, + target, + x_search, + target_search, + lr, + optimizer, + unrolled=args.unrolled + ) + + logits = model(data) + loss = multitask_loss(logits, target, criterion, reduce='mean') + + # 2. 
update weight + optimizer.zero_grad() + loss.backward() + nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip) + optimizer.step() + + prec1 = multitask_accuracy_topk(logits, target, topk=(1,)) + losses.update(loss.item(), batch_size) + top1.update(prec1, batch_size) + + if step % args.log_interval == 0: + logger.info(f'Step: {step} loss: {losses.avg:.4}') + log_accuracy(top1) + + return top1, losses.avg + + +def infer(validloader, model, criterion, args, tasks, device): + losses = AverageMeter('LossMeter') + top1 = MultitaskAccuracyMeter(tasks) + + model.eval() + + with torch.no_grad(): + for step, (data, target) in enumerate(validloader): + + data = data.to(device) + for task, label in target.items(): + target[task] = target[task].to(device) + + batch_size = data.size(0) + + logits = model(data) + loss = multitask_loss(logits, target, criterion, reduce='mean') + + prec1 = multitask_accuracy_topk(logits, target, topk=(1,)) + losses.update(loss.item(), batch_size) + top1.update(prec1, batch_size) + + if step % args.log_interval == 0: + logger.info(f'>> Validation: {step} loss: {losses.avg:.4}') + log_accuracy(top1, 'valid') + + return top1, losses.avg + + +if __name__=='__main__': + main() From 0f3bcfbed6b05540a6e3648cfab0d3b87137d11b Mon Sep 17 00:00:00 2001 From: yngtodd Date: Thu, 19 Sep 2019 13:19:54 -0400 Subject: [PATCH 037/331] Remove DAG diagrams --- Pilot3/P3B5/darts/normal | 24 ------------------------ Pilot3/P3B5/darts/reduction | 24 ------------------------ 2 files changed, 48 deletions(-) delete mode 100644 Pilot3/P3B5/darts/normal delete mode 100644 Pilot3/P3B5/darts/reduction diff --git a/Pilot3/P3B5/darts/normal b/Pilot3/P3B5/darts/normal deleted file mode 100644 index 64ac240d..00000000 --- a/Pilot3/P3B5/darts/normal +++ /dev/null @@ -1,24 +0,0 @@ -digraph { - node [align=center fontname=times fontsize=20 height=0.5 penwidth=2 shape=rect style=filled width=0.5] - edge [fontname=times fontsize=20] -rankdir=LR - "c_{k-2}" [fillcolor=darkseagreen2] - "c_{k-1}" [fillcolor=darkseagreen2] - 0 [fillcolor=lightblue] - 1 [fillcolor=lightblue] - 2 [fillcolor=lightblue] - 3 [fillcolor=lightblue] - "c_{k-1}" -> 0 [label=avg_pool_3 fillcolor=gray] - "c_{k-2}" -> 0 [label=sep_conv_3 fillcolor=gray] - "c_{k-1}" -> 1 [label=avg_pool_3 fillcolor=gray] - 0 -> 1 [label=sep_conv_5 fillcolor=gray] - 0 -> 2 [label=max_pool_3 fillcolor=gray] - "c_{k-1}" -> 2 [label=max_pool_3 fillcolor=gray] - 2 -> 3 [label=skip_connect fillcolor=gray] - "c_{k-1}" -> 3 [label=avg_pool_3 fillcolor=gray] - "c_{k}" [fillcolor=palegoldenrod] - 0 -> "c_{k}" [fillcolor=gray] - 1 -> "c_{k}" [fillcolor=gray] - 2 -> "c_{k}" [fillcolor=gray] - 3 -> "c_{k}" [fillcolor=gray] -} diff --git a/Pilot3/P3B5/darts/reduction b/Pilot3/P3B5/darts/reduction deleted file mode 100644 index 8dca275e..00000000 --- a/Pilot3/P3B5/darts/reduction +++ /dev/null @@ -1,24 +0,0 @@ -digraph { - node [align=center fontname=times fontsize=20 height=0.5 penwidth=2 shape=rect style=filled width=0.5] - edge [fontname=times fontsize=20] -rankdir=LR - "c_{k-2}" [fillcolor=darkseagreen2] - "c_{k-1}" [fillcolor=darkseagreen2] - 0 [fillcolor=lightblue] - 1 [fillcolor=lightblue] - 2 [fillcolor=lightblue] - 3 [fillcolor=lightblue] - "c_{k-1}" -> 0 [label=sep_conv_5 fillcolor=gray] - "c_{k-2}" -> 0 [label=sep_conv_5 fillcolor=gray] - 0 -> 1 [label=sep_conv_5 fillcolor=gray] - "c_{k-2}" -> 1 [label=sep_conv_3 fillcolor=gray] - 1 -> 2 [label=sep_conv_5 fillcolor=gray] - 0 -> 2 [label=sep_conv_5 fillcolor=gray] - 2 -> 3 [label=sep_conv_5 fillcolor=gray] 
- 1 -> 3 [label=sep_conv_5 fillcolor=gray] - "c_{k}" [fillcolor=palegoldenrod] - 0 -> "c_{k}" [fillcolor=gray] - 1 -> "c_{k}" [fillcolor=gray] - 2 -> "c_{k}" [fillcolor=gray] - 3 -> "c_{k}" [fillcolor=gray] -} From fc099706e8256201981df76435a06422bb952747 Mon Sep 17 00:00:00 2001 From: yngtodd Date: Thu, 19 Sep 2019 13:20:39 -0400 Subject: [PATCH 038/331] Clean up docstring --- Pilot3/P3B5/darts/api/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Pilot3/P3B5/darts/api/model.py b/Pilot3/P3B5/darts/api/model.py index 4911f6eb..b1b701c1 100644 --- a/Pilot3/P3B5/darts/api/model.py +++ b/Pilot3/P3B5/darts/api/model.py @@ -6,7 +6,7 @@ class Model(nn.Module): - """ Class representing sampleable neural network model """ + """ Abstract class for Pytorch models """ def num_params(self): """ Get the number of model parameters. """ From 90e5bead9aa353cdf2b483200ab5fc6e862eadb0 Mon Sep 17 00:00:00 2001 From: yngtodd Date: Thu, 19 Sep 2019 13:44:23 -0400 Subject: [PATCH 039/331] Remove darts entrypoint to make way for candle --- Pilot3/P3B5/darts/darts.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 Pilot3/P3B5/darts/darts.py diff --git a/Pilot3/P3B5/darts/darts.py b/Pilot3/P3B5/darts/darts.py deleted file mode 100644 index e69de29b..00000000 From 355bf130b7ca0b0ee9f83080eb47f8c1d450929f Mon Sep 17 00:00:00 2001 From: "Young, Todd" Date: Wed, 2 Oct 2019 10:52:10 -0400 Subject: [PATCH 040/331] Add model running files --- Pilot3/P3B5/p3b5.py | 23 +++++++++++++++ Pilot3/P3B5/p3b5_baseline_pytorch.py | 33 ++++++++++++++++++++++ Pilot3/P3B5/{p3b3.py => p3b5_darts.py} | 39 ++++++++++++++++++++++++++ Pilot3/P3B5/p3b5_default_model.txt | 5 ++++ 4 files changed, 100 insertions(+) create mode 100644 Pilot3/P3B5/p3b5.py create mode 100644 Pilot3/P3B5/p3b5_baseline_pytorch.py rename Pilot3/P3B5/{p3b3.py => p3b5_darts.py} (85%) create mode 100644 Pilot3/P3B5/p3b5_default_model.txt diff --git a/Pilot3/P3B5/p3b5.py b/Pilot3/P3B5/p3b5.py new file mode 100644 index 00000000..1a81b9f4 --- /dev/null +++ b/Pilot3/P3B5/p3b5.py @@ -0,0 +1,23 @@ +import candle +import p3b5 as bmk + + +required = [ + 'learning_rate', 'batch_size', 'epochs', 'dropout', \ + 'optimizer', 'wv_len', \ + 'filter_sizes', 'filter_sets', 'num_filters', 'emb_l2', 'w_l2'] + + +class BenchmarkP3B3(candle.Benchmark): + + def set_locals(self): + """Functionality to set variables specific for the benchmark + - required: set of required parameters for the benchmark. + - additional_definitions: list of dictionaries describing the additional parameters for the + benchmark. 
+ """ + + if required is not None: + self.required = set(required) + # if additional_definitions is not None: + # self.additional_definitions = additional_definitions diff --git a/Pilot3/P3B5/p3b5_baseline_pytorch.py b/Pilot3/P3B5/p3b5_baseline_pytorch.py new file mode 100644 index 00000000..4c7d5a6a --- /dev/null +++ b/Pilot3/P3B5/p3b5_baseline_pytorch.py @@ -0,0 +1,33 @@ +import candle +import p3b5_darts as bmk + + +def initialize_parameters(): + """ Initialize the parameters for the P3B5 benchmark """ + + p3b5_bench = bmk.BenchmarkP3B3( + bmk.file_path, + 'p3b5_default_model.txt', + 'pytorch', + prog='p3b5_baseline', + desc='Differentiable Architecture Search - Pilot 3 Benchmark 5', + ) + + # Initialize parameters + gParameters = candle.initialize_parameters(p3b5_bench) + #bmk.logger.info('Params: {}'.format(gParameters)) + return gParameters + + +def fetch_data(gParameters): + """ Download and untar data + + Args: + gParameters: parameters from candle + + Returns: + path to where the data is located + """ + path = gParameters['data_url'] + fpath = candle.fetch_file(path + gParameters['train_data'], 'Pilot3', untar=True) + return fpath diff --git a/Pilot3/P3B5/p3b3.py b/Pilot3/P3B5/p3b5_darts.py similarity index 85% rename from Pilot3/P3B5/p3b3.py rename to Pilot3/P3B5/p3b5_darts.py index 2f083d1c..5d810260 100644 --- a/Pilot3/P3B5/p3b3.py +++ b/Pilot3/P3B5/p3b5_darts.py @@ -3,6 +3,8 @@ import argparse from loguru import logger +import candle + import torch import torch.nn as nn from torch import optim @@ -21,6 +23,43 @@ from darts.utils.logging import log_accuracy +file_path = os.path.dirname(os.path.realpath(__file__)) +lib_path = os.path.abspath(os.path.join(file_path, '..')) +sys.path.append(lib_path) +lib_path2 = os.path.abspath(os.path.join(file_path, '..', '..', 'common')) +sys.path.append(lib_path2) + + +def initialize_parameters(): + """ Initialize the parameters for the P3B5 benchmark """ + + p3b5_bench = bmk.BenchmarkP3B3( + bmk.file_path, + 'p3b5_default_model.txt', + 'pytorch', + prog='p3b5_baseline', + desc='Differentiable Architecture Search - Pilot 3 Benchmark 5', + ) + + # Initialize parameters + gParameters = candle.initialize_parameters(p3b5_bench) + #bmk.logger.info('Params: {}'.format(gParameters)) + + return gParameters + + +def fetch_data(gParameters): + """ Downloads and decompresses the data if not locally available. 
+ Since the training data depends on the model definition it is not loaded, + instead the local path where the raw data resides is returned + """ + + path = gParameters['data_url'] + fpath = candle.fetch_file(path + gParameters['train_data'], 'Pilot3', untar=True) + + return fpath + + def parse_args(): parser = argparse.ArgumentParser(description='P3B3 Darts Example') parser.add_argument('--batch_size', type=int, default=100, metavar='N', diff --git a/Pilot3/P3B5/p3b5_default_model.txt b/Pilot3/P3B5/p3b5_default_model.txt new file mode 100644 index 00000000..aa78332e --- /dev/null +++ b/Pilot3/P3B5/p3b5_default_model.txt @@ -0,0 +1,5 @@ +[Global_Params] +model_name = 'p3b5' +learning_rate = 0.01 +batch_size = 100 +epochs = 10 From a36b75a80d8f3579d1b3dd1c03812a524d46b7d8 Mon Sep 17 00:00:00 2001 From: "Young, Todd" Date: Wed, 2 Oct 2019 11:17:14 -0400 Subject: [PATCH 041/331] Set hyperparameters --- Pilot3/P3B5/p3b5.py | 39 ++++++++++++++++++------------ Pilot3/P3B5/p3b5_darts.py | 30 ----------------------- Pilot3/P3B5/p3b5_default_model.txt | 8 ++++++ 3 files changed, 32 insertions(+), 45 deletions(-) diff --git a/Pilot3/P3B5/p3b5.py b/Pilot3/P3B5/p3b5.py index 1a81b9f4..3892d88a 100644 --- a/Pilot3/P3B5/p3b5.py +++ b/Pilot3/P3B5/p3b5.py @@ -1,23 +1,32 @@ import candle -import p3b5 as bmk -required = [ - 'learning_rate', 'batch_size', 'epochs', 'dropout', \ - 'optimizer', 'wv_len', \ - 'filter_sizes', 'filter_sets', 'num_filters', 'emb_l2', 'w_l2'] +file_path = os.path.dirname(os.path.realpath(__file__)) +lib_path2 = os.path.abspath(os.path.join(file_path, '..', '..', 'common')) +sys.path.append(lib_path2) -class BenchmarkP3B3(candle.Benchmark): +REQUIRED = [ + 'learning_rate', + 'learning_rate_min', + 'momentum', + 'weight_decay', + 'grad_clip', + 'seed', + 'unrolled', + 'batch_size', + 'epochs', +] + + +class BenchmarkP3B5(candle.Benchmark): + """ Benchmark for P3B5 """ def set_locals(self): - """Functionality to set variables specific for the benchmark - - required: set of required parameters for the benchmark. - - additional_definitions: list of dictionaries describing the additional parameters for the - benchmark. - """ + """ Set parameters for the benchmark. - if required is not None: - self.required = set(required) - # if additional_definitions is not None: - # self.additional_definitions = additional_definitions + Args: + required: set of required parameters for the benchmark. + """ + if REQUIRED is not None: + self.required = set(REQUIRED) diff --git a/Pilot3/P3B5/p3b5_darts.py b/Pilot3/P3B5/p3b5_darts.py index 5d810260..8898f31c 100644 --- a/Pilot3/P3B5/p3b5_darts.py +++ b/Pilot3/P3B5/p3b5_darts.py @@ -30,36 +30,6 @@ sys.path.append(lib_path2) -def initialize_parameters(): - """ Initialize the parameters for the P3B5 benchmark """ - - p3b5_bench = bmk.BenchmarkP3B3( - bmk.file_path, - 'p3b5_default_model.txt', - 'pytorch', - prog='p3b5_baseline', - desc='Differentiable Architecture Search - Pilot 3 Benchmark 5', - ) - - # Initialize parameters - gParameters = candle.initialize_parameters(p3b5_bench) - #bmk.logger.info('Params: {}'.format(gParameters)) - - return gParameters - - -def fetch_data(gParameters): - """ Downloads and decompresses the data if not locally available. 
- Since the training data depends on the model definition it is not loaded, - instead the local path where the raw data resides is returned - """ - - path = gParameters['data_url'] - fpath = candle.fetch_file(path + gParameters['train_data'], 'Pilot3', untar=True) - - return fpath - - def parse_args(): parser = argparse.ArgumentParser(description='P3B3 Darts Example') parser.add_argument('--batch_size', type=int, default=100, metavar='N', diff --git a/Pilot3/P3B5/p3b5_default_model.txt b/Pilot3/P3B5/p3b5_default_model.txt index aa78332e..e3cc631c 100644 --- a/Pilot3/P3B5/p3b5_default_model.txt +++ b/Pilot3/P3B5/p3b5_default_model.txt @@ -1,5 +1,13 @@ [Global_Params] model_name = 'p3b5' +unrolled = True +data_url = 'ftp://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot3/' +train_data = 'P3B3_data.tar.gz' learning_rate = 0.01 +learning_rate_min = 0.001 +momentum = 0.9 +weight_decay = 3e-4 +grad_clip = 5 batch_size = 100 epochs = 10 +seed = 13 \ No newline at end of file From 89fb9fddfaab6c7156ad63417bf8e16dc3e7716f Mon Sep 17 00:00:00 2001 From: "Young, Todd" Date: Wed, 2 Oct 2019 11:37:44 -0400 Subject: [PATCH 042/331] Separate train and infer from benchmark module --- Pilot3/P3B5/p3b5_baseline_pytorch.py | 93 ++++++++++++++++++++++++++++ Pilot3/P3B5/p3b5_darts.py | 33 +--------- 2 files changed, 94 insertions(+), 32 deletions(-) diff --git a/Pilot3/P3B5/p3b5_baseline_pytorch.py b/Pilot3/P3B5/p3b5_baseline_pytorch.py index 4c7d5a6a..c8bcf47c 100644 --- a/Pilot3/P3B5/p3b5_baseline_pytorch.py +++ b/Pilot3/P3B5/p3b5_baseline_pytorch.py @@ -1,6 +1,21 @@ import candle import p3b5_darts as bmk +import torch +import torch.nn as nn +from torch import optim +import torch.nn.functional as F +from torch.utils.data import DataLoader + +from darts.api.config import banner +from darts.modules.network import Network +from darts.architecture import Architecture +from darts.functional import multitask_loss +from darts.meters.accuracy import MultitaskAccuracyMeter +from darts.utils.logging import log_accurac + +from p3b5_darts import train, infer + def initialize_parameters(): """ Initialize the parameters for the P3B5 benchmark """ @@ -31,3 +46,81 @@ def fetch_data(gParameters): path = gParameters['data_url'] fpath = candle.fetch_file(path + gParameters['train_data'], 'Pilot3', untar=True) return fpath + + +def run(params): + args = candle.ArgumentStruct(**params) + args.cuda = not args.no_cuda and torch.cuda.is_available() + + device = torch.device(f'cuda' if args.cuda else "cpu") + banner(device=device) + + train_data = P3B3(args.datapath, 'train', download=True) + valid_data = P3B3(args.datapath, 'test') + + trainloader = DataLoader(train_data, batch_size=args.batch_size) + validloader = DataLoader(valid_data, batch_size=args.batch_size) + + criterion = nn.CrossEntropyLoss().to(device) + + tasks = { + 'subsite': 6, + 'laterality': 2, + 'behavior': 2, + 'grade': 3 + } + + model = Network(tasks=tasks, criterion=criterion, device=device).to(device) + architecture = Architecture(model, args, device=device) + + optimizer = optim.SGD( + model.parameters(), + args.learning_rate, + momentum=args.momentum, + weight_decay=args.weight_decay, + ) + + scheduler = optim.lr_scheduler.CosineAnnealingLR( + optimizer, + float(args.epochs), + eta_min=args.learning_rate_min, + ) + + for epoch in range(args.epochs): + + scheduler.step() + lr = scheduler.get_lr()[0] + logger.info(f'\nEpoch: {epoch} lr: {lr}') + + genotype = model.genotype() + logger.info(f'Genotype: {genotype}') + + # training + train_acc, train_obj = 
train( + trainloader, + validloader, + model, + architecture, + criterion, + optimizer, + lr, + args, + tasks, + device + ) + + # validation + valid_acc, valid_obj = infer(validloader, model, criterion, args, tasks, device) + + logger.info(f'\nEpoch {epoch} stats:') + log_accuracy(train_acc, 'train') + log_accuracy(valid_acc, 'valid') + + +def main(): + params = initialize_parameters() + run(params) + + +if __name__=='__main__': + main() \ No newline at end of file diff --git a/Pilot3/P3B5/p3b5_darts.py b/Pilot3/P3B5/p3b5_darts.py index 8898f31c..6022ceb8 100644 --- a/Pilot3/P3B5/p3b5_darts.py +++ b/Pilot3/P3B5/p3b5_darts.py @@ -4,6 +4,7 @@ from loguru import logger import candle +import p3b5 as bmk import torch import torch.nn as nn @@ -30,38 +31,6 @@ sys.path.append(lib_path2) -def parse_args(): - parser = argparse.ArgumentParser(description='P3B3 Darts Example') - parser.add_argument('--batch_size', type=int, default=100, metavar='N', - help='input batch size for training (default: 128)') - parser.add_argument('--lr', type=float, default=0.025, - help='init learning rate') - parser.add_argument('--lr_min', type=float, default=0.001, - help='min learning rate') - parser.add_argument('--momentum', type=float, default=0.9, - help='momentum') - parser.add_argument('--wd', type=float, default=3e-4, - help='weight decay') - parser.add_argument('--grad_clip', type=float, default=5, - help='gradient clipping range') - parser.add_argument('--epochs', type=int, default=1, metavar='N', - help='number of epochs to train (default: 10)') - parser.add_argument('--no_cuda', action='store_true', default=False, - help='enables CUDA training') - parser.add_argument('--gpu_id', type=int, default=0, - help='cuda device id for torch.device') - parser.add_argument('--seed', type=int, default=1, metavar='S', - help='random seed (default: 1)') - parser.add_argument('--log_interval', type=int, default=10, metavar='N', - help='how many batches to wait before logging training status') - parser.add_argument('--datapath', type=str, default='/Users/yngtodd/data', - help='path to the dataset') - parser.add_argument('--unrolled', action='store_true', default=False, - help='use one-step unrolled validation loss') - args = parser.parse_args() - return args - - def main(): args = parse_args() args.cuda = not args.no_cuda and torch.cuda.is_available() From 2bf86101c0146bd2e975b693facfaa60989634c1 Mon Sep 17 00:00:00 2001 From: "Young, Todd" Date: Wed, 2 Oct 2019 14:39:55 -0400 Subject: [PATCH 043/331] Add new Dataset class for Candle --- Pilot3/P3B5/darts/data/__init__.py | 0 Pilot3/P3B5/darts/data/p3b3.py | 102 +++++++++++++++++++++++++++++ 2 files changed, 102 insertions(+) create mode 100644 Pilot3/P3B5/darts/data/__init__.py create mode 100644 Pilot3/P3B5/darts/data/p3b3.py diff --git a/Pilot3/P3B5/darts/data/__init__.py b/Pilot3/P3B5/darts/data/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/Pilot3/P3B5/darts/data/p3b3.py b/Pilot3/P3B5/darts/data/p3b3.py new file mode 100644 index 00000000..13cf3c5d --- /dev/null +++ b/Pilot3/P3B5/darts/data/p3b3.py @@ -0,0 +1,102 @@ +import os +import numpy as np +from torch.utils.data import Dataset + + +class P3B3(Dataset): + """P3B3 Synthetic Dataset. + + Args: + root: str + Root directory of dataset where CANDLE loads P3B3 data. + + partition: str + dataset partition to be loaded. + Either 'train', 'validation', or 'test'. 
+ """ + training_data_file = 'train_X.npy' + training_label_file = 'train_Y.npy' + test_data_file = 'test_X.npy' + test_label_file = 'test_Y.npy' + + def __init__(self, root, partition, subsite=True, + laterality=True, behavior=True, grade=True, + transform=None, target_transform=None): + self.root = root + self.partition = partition + self.transform = transform + self.target_transform = target_transform + self.subsite = subsite + self.laterality = laterality + self.behavior = behavior + self.grade = grade + + if self.partition == 'train': + data_file = self.training_data_file + label_file = self.training_label_file + elif self.partition == 'test': + data_file = self.test_data_file + label_file = self.test_label_file + else: + raise ValueError("Partition must either be 'train' or 'test'.") + + self.data = np.load(os.path.join(self.root, data_file)) + self.targets = self.get_targets(label_file) + + def __len__(self): + return len(self.data) + + def load_data(self): + return self.data, self.targets + + def get_targets(self, label_file): + """Get dictionary of targets specified by user.""" + targets = np.load(os.path.join(self.root, label_file)) + + tasks = {} + if self.subsite: + tasks['subsite'] = targets[:, 0] + if self.laterality: + tasks['laterality'] = targets[:, 1] + if self.behavior: + tasks['behavior'] = targets[:, 2] + if self.grade: + tasks['grade'] = targets[:, 3] + + return tasks + + def __getitem__(self, idx): + """ + Parameters + ---------- + index : int + Index of the data to be loaded. + + Returns + ------- + (document, target) : tuple + where target is index of the target class. + """ + document = self.data[idx] + + if self.transform is not None: + document = self.transform(document) + + targets = {} + for key, value in self.targets.items(): + subset = value[idx] + + if self.target_transform is not None: + subset = self.target_transform(subset) + + targets[key] = subset + + return document, targets + + def __repr__(self): + fmt_str = 'Dataset ' + self.__class__.__name__ + '\n' + fmt_str += ' Number of datapoints: {}\n'.format(self.__len__()) + tmp = self.partition + fmt_str += ' Split: {}\n'.format(tmp) + fmt_str += ' Root Location: {}\n'.format(self.root) + return fmt_str \ No newline at end of file From b05d7ae15c31df8b1d8b161dd570f9521f7abda8 Mon Sep 17 00:00:00 2001 From: "Young, Todd" Date: Wed, 2 Oct 2019 14:41:48 -0400 Subject: [PATCH 044/331] Reorder methods --- Pilot3/P3B5/darts/data/p3b3.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/Pilot3/P3B5/darts/data/p3b3.py b/Pilot3/P3B5/darts/data/p3b3.py index 13cf3c5d..1285a158 100644 --- a/Pilot3/P3B5/darts/data/p3b3.py +++ b/Pilot3/P3B5/darts/data/p3b3.py @@ -12,7 +12,7 @@ class P3B3(Dataset): partition: str dataset partition to be loaded. - Either 'train', 'validation', or 'test'. + Must be either 'train' or 'test'. 
""" training_data_file = 'train_X.npy' training_label_file = 'train_Y.npy' @@ -43,6 +43,14 @@ def __init__(self, root, partition, subsite=True, self.data = np.load(os.path.join(self.root, data_file)) self.targets = self.get_targets(label_file) + def __repr__(self): + fmt_str = 'Dataset ' + self.__class__.__name__ + '\n' + fmt_str += ' Number of datapoints: {}\n'.format(self.__len__()) + tmp = self.partition + fmt_str += ' Split: {}\n'.format(tmp) + fmt_str += ' Root Location: {}\n'.format(self.root) + return fmt_str + def __len__(self): return len(self.data) @@ -91,12 +99,4 @@ def __getitem__(self, idx): targets[key] = subset - return document, targets - - def __repr__(self): - fmt_str = 'Dataset ' + self.__class__.__name__ + '\n' - fmt_str += ' Number of datapoints: {}\n'.format(self.__len__()) - tmp = self.partition - fmt_str += ' Split: {}\n'.format(tmp) - fmt_str += ' Root Location: {}\n'.format(self.root) - return fmt_str \ No newline at end of file + return document, targets \ No newline at end of file From b68a981176a4be0572eaa599d5128da8c4f76c8f Mon Sep 17 00:00:00 2001 From: "Young, Todd" Date: Thu, 3 Oct 2019 10:15:30 -0400 Subject: [PATCH 045/331] Adds Dataset from darts This removes an external dependence on DataStore, and allows us to use the data fetching parts of Candle. --- Pilot3/P3B5/p3b5_baseline_pytorch.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Pilot3/P3B5/p3b5_baseline_pytorch.py b/Pilot3/P3B5/p3b5_baseline_pytorch.py index c8bcf47c..977f905e 100644 --- a/Pilot3/P3B5/p3b5_baseline_pytorch.py +++ b/Pilot3/P3B5/p3b5_baseline_pytorch.py @@ -8,6 +8,7 @@ from torch.utils.data import DataLoader from darts.api.config import banner +from darts.data.p3b3 import P3B3 from darts.modules.network import Network from darts.architecture import Architecture from darts.functional import multitask_loss @@ -55,8 +56,9 @@ def run(params): device = torch.device(f'cuda' if args.cuda else "cpu") banner(device=device) - train_data = P3B3(args.datapath, 'train', download=True) - valid_data = P3B3(args.datapath, 'test') + datapath = fetch_data(params) + train_data = P3B3(datapath, 'train') + valid_data = P3B3(datapath, 'test') trainloader = DataLoader(train_data, batch_size=args.batch_size) validloader = DataLoader(valid_data, batch_size=args.batch_size) From c492b45fe40b9057214bb154c7fc17079abd2976 Mon Sep 17 00:00:00 2001 From: "Young, Todd" Date: Fri, 4 Oct 2019 10:58:22 -0400 Subject: [PATCH 046/331] Add GenotypeStorage and logging to track run Saving the best genotype in this way keeps us from having to track down the gene in all of the logs. 
--- Pilot3/P3B5/darts/storage/__init__.py | 0 Pilot3/P3B5/darts/storage/genotype.py | 74 +++++++++++++++++++++++++++ Pilot3/P3B5/darts/utils/logging.py | 6 ++- Pilot3/P3B5/p3b5_baseline_pytorch.py | 14 +++-- 4 files changed, 89 insertions(+), 5 deletions(-) create mode 100644 Pilot3/P3B5/darts/storage/__init__.py create mode 100644 Pilot3/P3B5/darts/storage/genotype.py diff --git a/Pilot3/P3B5/darts/storage/__init__.py b/Pilot3/P3B5/darts/storage/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/Pilot3/P3B5/darts/storage/genotype.py b/Pilot3/P3B5/darts/storage/genotype.py new file mode 100644 index 00000000..1d80b855 --- /dev/null +++ b/Pilot3/P3B5/darts/storage/genotype.py @@ -0,0 +1,74 @@ +import os +import json +from typing import List + +from darts.genotypes import Genotype + + +class GenotypeStorage: + """ Disk storage for Genotypes + + Args: + root: rooth path to save genotype + """ + + def __init__(self, root: str): + self.root = root + + def save_genotype(self, genotype: Genotype, filename='genotype.json') -> None: + """ Save a genotype to disk + + Args: + genotype: genotype to be saved + filename: name of the save file + """ + path = os.path.join(self.root, filename) + with open(path, 'w') as outfile: + json.dump(genotype, outfile) + + def load_genotype(self, filename='genotype.json') -> Genotype: + """ Load a genotype from disk + + Args: + filename: name of the save file + + Returns: + the genotype + """ + path = os.path.join(self.root, filename) + with open(path, 'r') as infile: + saved = json.load(infile) + + genotype = self._convert_serialized(saved) + return genotype + + def _convert_serialized(self, save: list) -> Genotype: + """ Convert json serialized form to Genotype + + Args: + save: serialized form of the the genotype + + Returns: + the genotype + """ + # Serialized genotypes have a consistent structure + normal = self._convert_to_tuple(save[0]) + normal_concat = save[1] + reduce = self._convert_to_tuple(save[2]) + reduce_concat = save[3] + return Genotype(normal, normal_concat, reduce, reduce_concat) + + def _convert_to_tuple(self, block: list) -> List[tuple]: + """ Convert list to list of tuples + + Used when converting part of a serialized form of + the genotype + + Args: + block: part of the serialized genotype + + Returns: + list of tuples that constitute that block + """ + return [tuple(x) for x in block] + diff --git a/Pilot3/P3B5/darts/utils/logging.py b/Pilot3/P3B5/darts/utils/logging.py index c846e69f..dcd272fe 100644 --- a/Pilot3/P3B5/darts/utils/logging.py +++ b/Pilot3/P3B5/darts/utils/logging.py @@ -1,7 +1,9 @@ -from loguru import logger +import logger -logger.add("darts_p3b3.log", format="{time:YYYY-MM-DD at HH:mm:ss} | {level} | {message}", level="INFO") +logger = logging.getLogger('DARTS') +fh = logging.FileHandler('darts_accuracy.log') +logger.addHandler(fh) def log_accuracy(accuracy, split: str='train'): diff --git a/Pilot3/P3B5/p3b5_baseline_pytorch.py b/Pilot3/P3B5/p3b5_baseline_pytorch.py index 977f905e..ee0b398d 100644 --- a/Pilot3/P3B5/p3b5_baseline_pytorch.py +++ b/Pilot3/P3B5/p3b5_baseline_pytorch.py @@ -11,9 +11,10 @@ from darts.data.p3b3 import P3B3 from darts.modules.network import Network from darts.architecture import Architecture +from darts.storage.genotype import GenotypeStorage from darts.functional import multitask_loss from darts.meters.accuracy import MultitaskAccuracyMeter -from darts.utils.logging import log_accurac +from darts.utils.logging import log_accuracy from p3b5_darts import train, infer @@ -88,6 +89,9 
@@ def run(params): eta_min=args.learning_rate_min, ) + genotype_store = GenotypeStorage(root=args.savepath) + + min_loss = 9999 for epoch in range(args.epochs): scheduler.step() @@ -98,7 +102,7 @@ def run(params): logger.info(f'Genotype: {genotype}') # training - train_acc, train_obj = train( + train_acc, train_loss = train( trainloader, validloader, model, @@ -112,7 +116,11 @@ def run(params): ) # validation - valid_acc, valid_obj = infer(validloader, model, criterion, args, tasks, device) + valid_acc, valid_loss = infer(validloader, model, criterion, args, tasks, device) + + if valid_loss < min_loss: + genotype_store.save_genotype(genotype) + min_loss = valid_loss logger.info(f'\nEpoch {epoch} stats:') log_accuracy(train_acc, 'train') From 33bdad049e6f2f379ff9a1e702f491e231217132 Mon Sep 17 00:00:00 2001 From: "Young, Todd" Date: Fri, 4 Oct 2019 12:49:53 -0400 Subject: [PATCH 047/331] Add multitask loss and multitask accuracy funcs This removes external dependencies from `Hammer`, simplifying things for users across Candle. --- Pilot3/P3B5/darts/functional.py | 52 +++++++++++++++++- Pilot3/P3B5/p3b5_darts.py | 95 +++------------------------------ 2 files changed, 58 insertions(+), 89 deletions(-) diff --git a/Pilot3/P3B5/darts/functional.py b/Pilot3/P3B5/darts/functional.py index 44b0c064..902ec79c 100644 --- a/Pilot3/P3B5/darts/functional.py +++ b/Pilot3/P3B5/darts/functional.py @@ -25,7 +25,7 @@ def flatten(tensor): return torch.cat([x.view(-1) for x in tensor]) -def multitask_loss(logits, target, criterion, reduce='mean'): +def multitask_loss(target, logits, criterion, reduce='mean'): """ Compute multitask loss """ losses = {} for task, label in target.items(): @@ -43,4 +43,52 @@ def multitask_loss(logits, target, criterion, reduce='mean'): else: raise ValueError('Reduced loss must use either `mean` or `sum`!') - return losses \ No newline at end of file + return losses + + +def accuracy(target: torch.tensor, output: torch.tensor,): + """ Computes accuracy + + Args: + output: logits of the model + target: true labels + + Returns: + accuracy of the predictions + """ + return output.argmax(1).eq(target).double().mean().item() + + +def multitask_accuracy(target, output): + """ Compute the accuracy for multitask problems """ + accuracies = {} + for key, value in target.items(): + accuracies[key] = accuracy(target[key], output[key]) + + return accuracies + + +def accuracy_topk(target, output, topk=(1,)): + """Computes the accuracy over the k top predictions for the specified values of k""" + with torch.no_grad(): + maxk = max(topk) + batch_size = target.size(0) + + _, pred = output.topk(maxk, 1, True, True) + pred = pred.t() + correct = pred.eq(target.view(1, -1).expand_as(pred)) + + res = [] + for k in topk: + correct_k = correct[:k].view(-1).float().sum(0, keepdim=True) + res.append(correct_k.mul_(100.0 / batch_size)) + return res + + +def multitask_accuracy_topk(target, output, topk=(1,)): + """Compute the topk accuracy for multitask problems""" + topk_accuracies = {} + for key, value in target.items(): + topk_accuracies[key] = accuracy_topk(output[key], target[key], topk) + + return topk_accuracies \ No newline at end of file diff --git a/Pilot3/P3B5/p3b5_darts.py b/Pilot3/P3B5/p3b5_darts.py index 6022ceb8..dfbd3cf1 100644 --- a/Pilot3/P3B5/p3b5_darts.py +++ b/Pilot3/P3B5/p3b5_darts.py @@ -1,8 +1,6 @@ import os import sys import argparse -from loguru import logger - import candle import p3b5 as bmk @@ -12,14 +10,11 @@ import torch.nn.functional as F from torch.utils.data import 
DataLoader -from datastore.data import P3B3 -from hammer.metrics import multitask_accuracy_topk -from hammer.meters.average import AverageMeter - from darts.api.config import banner from darts.modules.network import Network from darts.architecture import Architecture -from darts.functional import multitask_loss +from darts.meteters.average import AverageMeter +from darts.functional import multitask_loss, multitask_accuracy from darts.meters.accuracy import MultitaskAccuracyMeter from darts.utils.logging import log_accuracy @@ -31,80 +26,6 @@ sys.path.append(lib_path2) -def main(): - args = parse_args() - args.cuda = not args.no_cuda and torch.cuda.is_available() - - device = torch.device(f'cuda:{args.gpu_id}' if args.cuda else "cpu") - banner(device=device) - - train_data = P3B3(args.datapath, 'train', download=True) - valid_data = P3B3(args.datapath, 'test') - - trainloader = DataLoader(train_data, batch_size=args.batch_size) - validloader = DataLoader(valid_data, batch_size=args.batch_size) - - criterion = nn.CrossEntropyLoss().to(device) - - tasks = { - 'subsite': 6, - 'laterality': 2, - 'behavior': 2, - 'grade': 3 - } - - model = Network(tasks=tasks, criterion=criterion, device=device).to(device) - architecture = Architecture(model, args, device=device) - - optimizer = optim.SGD( - model.parameters(), - args.lr, - momentum=args.momentum, - weight_decay=args.wd - ) - - scheduler = optim.lr_scheduler.CosineAnnealingLR( - optimizer, - float(args.epochs), - eta_min=args.lr_min - ) - - for epoch in range(args.epochs): - - scheduler.step() - lr = scheduler.get_lr()[0] - logger.info(f'\nEpoch: {epoch} lr: {lr}') - - genotype = model.genotype() - logger.info(f'Genotype: {genotype}') - - #logger.debug(F.softmax(model.alphas_normal, dim=-1)) - #logger.debug(F.softmax(model.alphas_reduce, dim=-1)) - - # training - train_acc, train_obj = train( - trainloader, - validloader, - model, - architecture, - criterion, - optimizer, - lr, - args, - tasks, - device - ) - - # validation - valid_acc, valid_obj = infer(validloader, model, criterion, args, tasks, device) - - logger.info(f'\nEpoch {epoch} stats:') - log_accuracy(train_acc, 'train') - log_accuracy(valid_acc, 'valid') - - #utils.save(model, os.path.join(args.exp_path, 'search.pt')) - - def train(trainloader, validloader, model, architecture, criterion, optimizer, lr, args, tasks, device): losses = AverageMeter('LossMeter') top1 = MultitaskAccuracyMeter(tasks) @@ -139,7 +60,7 @@ def train(trainloader, validloader, model, architecture, criterion, optimizer, l ) logits = model(data) - loss = multitask_loss(logits, target, criterion, reduce='mean') + loss = multitask_loss(target, logits, criterion, reduce='mean') # 2. 
update weight optimizer.zero_grad() @@ -147,12 +68,12 @@ def train(trainloader, validloader, model, architecture, criterion, optimizer, l nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip) optimizer.step() - prec1 = multitask_accuracy_topk(logits, target, topk=(1,)) + prec1 = multitask_accuracy(target, logits) losses.update(loss.item(), batch_size) top1.update(prec1, batch_size) if step % args.log_interval == 0: - logger.info(f'Step: {step} loss: {losses.avg:.4}') + print(f'Step: {step} loss: {losses.avg:.4}') log_accuracy(top1) return top1, losses.avg @@ -174,14 +95,14 @@ def infer(validloader, model, criterion, args, tasks, device): batch_size = data.size(0) logits = model(data) - loss = multitask_loss(logits, target, criterion, reduce='mean') + loss = multitask_loss(target, logits, criterion, reduce='mean') - prec1 = multitask_accuracy_topk(logits, target, topk=(1,)) + prec1 = multitask_accuracy(target, logits) losses.update(loss.item(), batch_size) top1.update(prec1, batch_size) if step % args.log_interval == 0: - logger.info(f'>> Validation: {step} loss: {losses.avg:.4}') + print(f'>> Validation: {step} loss: {losses.avg:.4}') log_accuracy(top1, 'valid') return top1, losses.avg From 2c0ba063b79259e17336f6a6f6776bb8b7c16e87 Mon Sep 17 00:00:00 2001 From: yngtodd Date: Fri, 4 Oct 2019 15:51:14 -0400 Subject: [PATCH 048/331] Fix missing import --- Pilot3/P3B5/p3b5.py | 1 + 1 file changed, 1 insertion(+) diff --git a/Pilot3/P3B5/p3b5.py b/Pilot3/P3B5/p3b5.py index 3892d88a..8bff5fec 100644 --- a/Pilot3/P3B5/p3b5.py +++ b/Pilot3/P3B5/p3b5.py @@ -1,3 +1,4 @@ +import os import candle From 2a724872cba5c48ddbd336f06460aa2ad851c6d0 Mon Sep 17 00:00:00 2001 From: yngtodd Date: Fri, 4 Oct 2019 15:52:53 -0400 Subject: [PATCH 049/331] Fix missing import for sys --- Pilot3/P3B5/p3b5.py | 1 + 1 file changed, 1 insertion(+) diff --git a/Pilot3/P3B5/p3b5.py b/Pilot3/P3B5/p3b5.py index 8bff5fec..aba56cf9 100644 --- a/Pilot3/P3B5/p3b5.py +++ b/Pilot3/P3B5/p3b5.py @@ -1,4 +1,5 @@ import os +import sys import candle From e39503de38825f2d6857035994172884515a0093 Mon Sep 17 00:00:00 2001 From: yngtodd Date: Fri, 4 Oct 2019 15:55:04 -0400 Subject: [PATCH 050/331] Fix typo in import --- Pilot3/P3B5/p3b5_darts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Pilot3/P3B5/p3b5_darts.py b/Pilot3/P3B5/p3b5_darts.py index dfbd3cf1..88207a73 100644 --- a/Pilot3/P3B5/p3b5_darts.py +++ b/Pilot3/P3B5/p3b5_darts.py @@ -13,7 +13,7 @@ from darts.api.config import banner from darts.modules.network import Network from darts.architecture import Architecture -from darts.meteters.average import AverageMeter +from darts.meters.average import AverageMeter from darts.functional import multitask_loss, multitask_accuracy from darts.meters.accuracy import MultitaskAccuracyMeter from darts.utils.logging import log_accuracy From cc9b1793dc8d6f0f4465679a964ad1540534bec0 Mon Sep 17 00:00:00 2001 From: yngtodd Date: Fri, 4 Oct 2019 15:56:51 -0400 Subject: [PATCH 051/331] Fix import for default logging --- Pilot3/P3B5/darts/utils/logging.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Pilot3/P3B5/darts/utils/logging.py b/Pilot3/P3B5/darts/utils/logging.py index dcd272fe..81561286 100644 --- a/Pilot3/P3B5/darts/utils/logging.py +++ b/Pilot3/P3B5/darts/utils/logging.py @@ -1,4 +1,4 @@ -import logger +import logging logger = logging.getLogger('DARTS') @@ -24,4 +24,4 @@ def log_accuracy(accuracy, split: str='train'): f"Grade: {accuracy.get_avg_accuracy('grade'):.4f}" ) - 
logger.info(acc_info) \ No newline at end of file + logger.info(acc_info) From 0460517649d301e47254844cb3a4a972f0684260 Mon Sep 17 00:00:00 2001 From: yngtodd Date: Fri, 4 Oct 2019 16:35:31 -0400 Subject: [PATCH 052/331] Update arguments to match the default specification Some of the arguments in darts were too abbreviated. It is more clear to use weight_decay than wd. --- Pilot3/P3B5/darts/architecture.py | 2 +- Pilot3/P3B5/p3b5_baseline_pytorch.py | 8 ++++---- Pilot3/P3B5/p3b5_default_model.txt | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Pilot3/P3B5/darts/architecture.py b/Pilot3/P3B5/darts/architecture.py index 7c303ffc..57c89c1d 100644 --- a/Pilot3/P3B5/darts/architecture.py +++ b/Pilot3/P3B5/darts/architecture.py @@ -13,7 +13,7 @@ class Architecture: def __init__(self, model, args, hyperparams=Hyperparameters(), device='cpu'): self.momentum = args.momentum # momentum for optimizer of theta - self.wd = args.wd # weight decay for optimizer of model's theta + self.wd = args.weight_decay # weight decay for optimizer of model's theta self.model = model # main model with respect to theta and alpha self.device = device diff --git a/Pilot3/P3B5/p3b5_baseline_pytorch.py b/Pilot3/P3B5/p3b5_baseline_pytorch.py index ee0b398d..01d87fee 100644 --- a/Pilot3/P3B5/p3b5_baseline_pytorch.py +++ b/Pilot3/P3B5/p3b5_baseline_pytorch.py @@ -1,5 +1,5 @@ import candle -import p3b5_darts as bmk +import p3b5 as bmk import torch import torch.nn as nn @@ -22,7 +22,7 @@ def initialize_parameters(): """ Initialize the parameters for the P3B5 benchmark """ - p3b5_bench = bmk.BenchmarkP3B3( + p3b5_bench = bmk.BenchmarkP3B5( bmk.file_path, 'p3b5_default_model.txt', 'pytorch', @@ -52,7 +52,7 @@ def fetch_data(gParameters): def run(params): args = candle.ArgumentStruct(**params) - args.cuda = not args.no_cuda and torch.cuda.is_available() + args.cuda = torch.cuda.is_available() device = torch.device(f'cuda' if args.cuda else "cpu") banner(device=device) @@ -133,4 +133,4 @@ def main(): if __name__=='__main__': - main() \ No newline at end of file + main() diff --git a/Pilot3/P3B5/p3b5_default_model.txt b/Pilot3/P3B5/p3b5_default_model.txt index e3cc631c..d34eb599 100644 --- a/Pilot3/P3B5/p3b5_default_model.txt +++ b/Pilot3/P3B5/p3b5_default_model.txt @@ -10,4 +10,4 @@ weight_decay = 3e-4 grad_clip = 5 batch_size = 100 epochs = 10 -seed = 13 \ No newline at end of file +seed = 13 From 431030b2a5f8c2fd312a7537aae71cf7247a0ad3 Mon Sep 17 00:00:00 2001 From: yngtodd Date: Fri, 4 Oct 2019 16:42:38 -0400 Subject: [PATCH 053/331] Remove calls to logger from within `run` We are already capturing print statements within Summit's log. No need to clutter up the rest of the logging. 
--- Pilot3/P3B5/p3b5_baseline_pytorch.py | 6 +++--- Pilot3/P3B5/p3b5_default_model.txt | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/Pilot3/P3B5/p3b5_baseline_pytorch.py b/Pilot3/P3B5/p3b5_baseline_pytorch.py index 01d87fee..860e8df3 100644 --- a/Pilot3/P3B5/p3b5_baseline_pytorch.py +++ b/Pilot3/P3B5/p3b5_baseline_pytorch.py @@ -96,10 +96,10 @@ def run(params): scheduler.step() lr = scheduler.get_lr()[0] - logger.info(f'\nEpoch: {epoch} lr: {lr}') + print(f'\nEpoch: {epoch} lr: {lr}') genotype = model.genotype() - logger.info(f'Genotype: {genotype}') + print(f'Genotype: {genotype}') # training train_acc, train_loss = train( @@ -122,7 +122,7 @@ def run(params): genotype_store.save_genotype(genotype) min_loss = valid_loss - logger.info(f'\nEpoch {epoch} stats:') + print(f'\nEpoch {epoch} stats:') log_accuracy(train_acc, 'train') log_accuracy(valid_acc, 'valid') diff --git a/Pilot3/P3B5/p3b5_default_model.txt b/Pilot3/P3B5/p3b5_default_model.txt index d34eb599..8fdd0904 100644 --- a/Pilot3/P3B5/p3b5_default_model.txt +++ b/Pilot3/P3B5/p3b5_default_model.txt @@ -2,6 +2,7 @@ model_name = 'p3b5' unrolled = True data_url = 'ftp://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot3/' +savepath = '/gpfs/alpine/proj-shared/med107/yngtodd/src/checkout/Benchmarks/Pilot3/P3B5' train_data = 'P3B3_data.tar.gz' learning_rate = 0.01 learning_rate_min = 0.001 From d9d4a638c85b469ffc42de9539f41a72d90346c0 Mon Sep 17 00:00:00 2001 From: yngtodd Date: Sat, 5 Oct 2019 00:34:13 -0400 Subject: [PATCH 054/331] Fix serialization for genotypes Python's `range` is not json serializable. This is fixed by first converting the `range`s in the Genotype to be lists, before saving to the filesystem. --- Pilot3/P3B5/darts/meters/accuracy.py | 3 +-- Pilot3/P3B5/darts/storage/genotype.py | 17 +++++++++++++++++ Pilot3/P3B5/p3b5_baseline_pytorch.py | 8 ++++---- Pilot3/P3B5/p3b5_default_model.txt | 1 + 4 files changed, 23 insertions(+), 6 deletions(-) diff --git a/Pilot3/P3B5/darts/meters/accuracy.py b/Pilot3/P3B5/darts/meters/accuracy.py index 66b4e63d..d3089454 100644 --- a/Pilot3/P3B5/darts/meters/accuracy.py +++ b/Pilot3/P3B5/darts/meters/accuracy.py @@ -25,6 +25,5 @@ def get_accuracy(self, task): def update(self, accuracies, batch_size): for task, acc in accuracies.items(): - self.meters[task].update(acc[0].item(), batch_size) - + self.meters[task].update(acc, batch_size) diff --git a/Pilot3/P3B5/darts/storage/genotype.py b/Pilot3/P3B5/darts/storage/genotype.py index 1d80b855..87596b35 100644 --- a/Pilot3/P3B5/darts/storage/genotype.py +++ b/Pilot3/P3B5/darts/storage/genotype.py @@ -22,6 +22,7 @@ def save_genotype(self, genotype: Genotype, filename='genotype.json') -> None: genotype: genotype to be saved filename: name of the save file """ + genotype = self._replace_range(genotype) path = os.path.join(self.root, filename) with open(path, 'w') as outfile: json.dump(genotype, outfile) @@ -42,6 +43,22 @@ def load_genotype(self, filename='genotype.json') -> Genotype: genotype = self._convert_serialized(saved) return genotype + def _replace_range(self, genotype: Genotype) -> Genotype: + """ Replace the range values with lists + + Python's `range` is not serializable as json objects. + We convert the genotype's ranges to lists first. + + Args: + genotype: the genotype to be serialized + + Returns + genotype: with proper lists. 
+ """ + genotype = genotype._replace(normal_concat=list(genotype.normal_concat)) + genotype = genotype._replace(reduce_concat=list(genotype.reduce_concat)) + return genotype + def _convert_serialized(self, save: list) -> Genotype: """ Convert json serialized form to Genotype diff --git a/Pilot3/P3B5/p3b5_baseline_pytorch.py b/Pilot3/P3B5/p3b5_baseline_pytorch.py index 860e8df3..77415250 100644 --- a/Pilot3/P3B5/p3b5_baseline_pytorch.py +++ b/Pilot3/P3B5/p3b5_baseline_pytorch.py @@ -67,10 +67,10 @@ def run(params): criterion = nn.CrossEntropyLoss().to(device) tasks = { - 'subsite': 6, - 'laterality': 2, - 'behavior': 2, - 'grade': 3 + 'subsite': 15, + 'laterality': 3, + 'behavior': 3, + 'grade': 3, } model = Network(tasks=tasks, criterion=criterion, device=device).to(device) diff --git a/Pilot3/P3B5/p3b5_default_model.txt b/Pilot3/P3B5/p3b5_default_model.txt index 8fdd0904..6ad42c5c 100644 --- a/Pilot3/P3B5/p3b5_default_model.txt +++ b/Pilot3/P3B5/p3b5_default_model.txt @@ -3,6 +3,7 @@ model_name = 'p3b5' unrolled = True data_url = 'ftp://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot3/' savepath = '/gpfs/alpine/proj-shared/med107/yngtodd/src/checkout/Benchmarks/Pilot3/P3B5' +log_interval = 10 train_data = 'P3B3_data.tar.gz' learning_rate = 0.01 learning_rate_min = 0.001 From d6378a5b8538aac2d2542274e5320988e6bcfe71 Mon Sep 17 00:00:00 2001 From: "Young, Todd" Date: Sat, 5 Oct 2019 08:13:28 -0400 Subject: [PATCH 055/331] Add run command to README --- Pilot3/P3B5/README.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Pilot3/P3B5/README.rst b/Pilot3/P3B5/README.rst index 14d182b0..aa09ec54 100644 --- a/Pilot3/P3B5/README.rst +++ b/Pilot3/P3B5/README.rst @@ -3,3 +3,11 @@ P3B5 Differentiable Architecture Search ======================================= Differentiable architecture search (DARTS) benchmark using clinical pathology reports. + + +To test your environment, use the UPF method of running the benchmark. A UPF test script +is available at `Supervisor/workflows/upf/test/upf-1.sh`. + +.. 
code-block:: console + + bash upf1-test.sh p3b5 summit-world From 7996c8f5532b29f8f32cd8fba7ecc3054e4ea2da Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 13 Nov 2019 15:37:55 -0600 Subject: [PATCH 056/331] New initialize_parameters(default_model) argument --- Pilot1/Uno/uno_baseline_keras2.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Pilot1/Uno/uno_baseline_keras2.py b/Pilot1/Uno/uno_baseline_keras2.py index 0ec9201b..7c08aa77 100644 --- a/Pilot1/Uno/uno_baseline_keras2.py +++ b/Pilot1/Uno/uno_baseline_keras2.py @@ -265,10 +265,10 @@ def build_model(loader, args, permanent_dropout=True, silent=False): return Model(inputs, output) -def initialize_parameters(): +def initialize_parameters(default_model='uno_default_model.txt'): # Build benchmark object - unoBmk = benchmark.BenchmarkUno(benchmark.file_path, 'uno_default_model.txt', 'keras', + unoBmk = benchmark.BenchmarkUno(benchmark.file_path, default_model, 'keras', prog='uno_baseline', desc='Build neural network based models to predict tumor response to single and paired drugs.') # Initialize parameters From af01de8b4868500e350a0e2d4e05bd1e14895cf9 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 13 Nov 2019 15:37:55 -0600 Subject: [PATCH 057/331] New initialize_parameters(default_model) argument --- Pilot1/Uno/uno_baseline_keras2.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Pilot1/Uno/uno_baseline_keras2.py b/Pilot1/Uno/uno_baseline_keras2.py index 722f9482..5af3673c 100644 --- a/Pilot1/Uno/uno_baseline_keras2.py +++ b/Pilot1/Uno/uno_baseline_keras2.py @@ -265,10 +265,10 @@ def build_model(loader, args, permanent_dropout=True, silent=False): return Model(inputs, output) -def initialize_parameters(): +def initialize_parameters(default_model='uno_default_model.txt'): # Build benchmark object - unoBmk = benchmark.BenchmarkUno(benchmark.file_path, 'uno_default_model.txt', 'keras', + unoBmk = benchmark.BenchmarkUno(benchmark.file_path, default_model, 'keras', prog='uno_baseline', desc='Build neural network based models to predict tumor response to single and paired drugs.') # Initialize parameters From 29529ef0c14b89325f74eed6832189da33fe72d0 Mon Sep 17 00:00:00 2001 From: Jamaludin Mohd Yusof Date: Wed, 13 Nov 2019 16:40:15 -0600 Subject: [PATCH 058/331] Initial commit to add default model file as argument to initialize_parameters. Rename candle.initialize_parameters to candle.finalize_parameters to avoid name contention. 
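With this change a driver script can point a benchmark at an alternative model description file without editing the benchmark itself. A minimal sketch using the Uno baseline (illustration only; 'my_experiment_model.txt' is a hypothetical file name and the module is assumed to be importable from Pilot1/Uno):

    import uno_baseline_keras2 as uno

    # unchanged call sites keep the shipped default, uno_default_model.txt
    params = uno.initialize_parameters()

    # callers may now substitute their own parameter file
    params = uno.initialize_parameters(default_model='my_experiment_model.txt')

Internally each initialize_parameters() builds its benchmark object from default_model and then calls candle.finalize_parameters() on it, which is the renamed candle.initialize_parameters().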
--- Pilot1/Combo/combo_baseline_keras2.py | 6 +++--- Pilot1/NT3/nt3_baseline_keras2.py | 6 +++--- Pilot1/P1B1/p1b1_baseline_keras2.py | 6 +++--- Pilot1/P1B2/p1b2_baseline_keras2.py | 6 +++--- Pilot1/P1B3/p1b3_baseline_keras2.py | 6 +++--- Pilot1/T29/t29res.py | 6 +++--- Pilot1/TC1/tc1_baseline_keras2.py | 6 +++--- Pilot1/Uno/uno_baseline_keras2.py | 6 +++--- Pilot1/UnoMT/unoMT_baseline_pytorch.py | 6 +++--- Pilot2/P2B1/p2b1_baseline_keras2.py | 6 +++--- Pilot3/P3B1/p3b1_baseline_keras2.py | 6 +++--- Pilot3/P3B2/p3b2_baseline_keras2.py | 6 +++--- Pilot3/P3B3/p3b3_baseline_keras2.py | 6 +++--- Pilot3/P3B4/p3b4.py | 2 -- Pilot3/P3B4/p3b4_baseline_keras2.py | 8 +++----- common/candle/__init__.py | 2 +- common/candle_keras/__init__.py | 2 +- common/default_utils.py | 2 +- 18 files changed, 45 insertions(+), 49 deletions(-) diff --git a/Pilot1/Combo/combo_baseline_keras2.py b/Pilot1/Combo/combo_baseline_keras2.py index 16f2e6f9..1a836d9e 100644 --- a/Pilot1/Combo/combo_baseline_keras2.py +++ b/Pilot1/Combo/combo_baseline_keras2.py @@ -644,15 +644,15 @@ def build_model(loader, args, verbose=False): return Model(inputs, output) -def initialize_parameters(): +def initialize_parameters(default_model = 'combo_default_model.txt'): # Build benchmark object - comboBmk = combo.BenchmarkCombo(combo.file_path, 'combo_default_model.txt', 'keras', + comboBmk = combo.BenchmarkCombo(combo.file_path, default_model, 'keras', prog='combo_baseline', desc = 'Build neural network based models to predict tumor response to drug pairs.') # Initialize parameters - gParameters = candle.initialize_parameters(comboBmk) + gParameters = candle.finalize_parameters(comboBmk) #combo.logger.info('Params: {}'.format(gParameters)) return gParameters diff --git a/Pilot1/NT3/nt3_baseline_keras2.py b/Pilot1/NT3/nt3_baseline_keras2.py index 1fc8e0dc..0877bd04 100644 --- a/Pilot1/NT3/nt3_baseline_keras2.py +++ b/Pilot1/NT3/nt3_baseline_keras2.py @@ -22,14 +22,14 @@ import nt3 as bmk import candle -def initialize_parameters(): +def initialize_parameters(default_model = 'nt3_default_model.txt'): # Build benchmark object - nt3Bmk = bmk.BenchmarkNT3(bmk.file_path, 'nt3_default_model.txt', 'keras', + nt3Bmk = bmk.BenchmarkNT3(bmk.file_path, default_model, 'keras', prog='nt3_baseline', desc='Multi-task (DNN) for data extraction from clinical reports - Pilot 3 Benchmark 1') # Initialize parameters - gParameters = candle.initialize_parameters(nt3Bmk) + gParameters = candle.finalize_parameters(nt3Bmk) #benchmark.logger.info('Params: {}'.format(gParameters)) return gParameters diff --git a/Pilot1/P1B1/p1b1_baseline_keras2.py b/Pilot1/P1B1/p1b1_baseline_keras2.py index 1b515c5e..34fca44f 100644 --- a/Pilot1/P1B1/p1b1_baseline_keras2.py +++ b/Pilot1/P1B1/p1b1_baseline_keras2.py @@ -104,14 +104,14 @@ def build_type_classifier(x_train, y_train, x_test, y_test): print(acc) return clf -def initialize_parameters(): +def initialize_parameters(default_model = 'p1b1_default_model.txt'): # Build benchmark object - p1b1Bmk = p1b1.BenchmarkP1B1(p1b1.file_path, 'p1b1_default_model.txt', 'keras', + p1b1Bmk = p1b1.BenchmarkP1B1(p1b1.file_path, default_model, 'keras', prog='p1b1_baseline', desc='Multi-task (DNN) for data extraction from clinical reports - Pilot 3 Benchmark 1') # Initialize parameters - gParameters = candle.initialize_parameters(p1b1Bmk) + gParameters = candle.finalize_parameters(p1b1Bmk) #p1b1.logger.info('Params: {}'.format(gParameters)) return gParameters diff --git a/Pilot1/P1B2/p1b2_baseline_keras2.py b/Pilot1/P1B2/p1b2_baseline_keras2.py 
index 504cd421..d4018453 100644 --- a/Pilot1/P1B2/p1b2_baseline_keras2.py +++ b/Pilot1/P1B2/p1b2_baseline_keras2.py @@ -17,14 +17,14 @@ import p1b2 import candle -def initialize_parameters(): +def initialize_parameters(default_model = 'p1b2_default_model.txt): # Build benchmark object - p1b2Bmk = p1b2.BenchmarkP1B2(p1b2.file_path, 'p1b2_default_model.txt', 'keras', + p1b2Bmk = p1b2.BenchmarkP1B2(p1b2.file_path, default_model, 'keras', prog='p1b2_baseline', desc='Train Classifier - Pilot 1 Benchmark 2') # Initialize parameters - gParameters = candle.initialize_parameters(p1b2Bmk) + gParameters = candle.finalize_parameters(p1b2Bmk) #p1b2.logger.info('Params: {}'.format(gParameters)) return gParameters diff --git a/Pilot1/P1B3/p1b3_baseline_keras2.py b/Pilot1/P1B3/p1b3_baseline_keras2.py index 8458684d..f46e02cb 100644 --- a/Pilot1/P1B3/p1b3_baseline_keras2.py +++ b/Pilot1/P1B3/p1b3_baseline_keras2.py @@ -28,14 +28,14 @@ #np.set_printoptions(threshold=np.nan) -def initialize_parameters(): +def initialize_parameters(default_model = 'p1b3_default_model.txt'): # Build benchmark object - p1b3Bmk = benchmark.BenchmarkP1B3(benchmark.file_path, 'p1b3_default_model.txt', 'keras', + p1b3Bmk = benchmark.BenchmarkP1B3(benchmark.file_path, default_model, 'keras', prog='p1b3_baseline', desc='Multi-task (DNN) for data extraction from clinical reports - Pilot 3 Benchmark 1') # Initialize parameters - gParameters = candle.initialize_parameters(p1b3Bmk) + gParameters = candle.finalize_parameters(p1b3Bmk) #benchmark.logger.info('Params: {}'.format(gParameters)) return gParameters diff --git a/Pilot1/T29/t29res.py b/Pilot1/T29/t29res.py index b66e41be..e5a2dfb0 100644 --- a/Pilot1/T29/t29res.py +++ b/Pilot1/T29/t29res.py @@ -24,8 +24,8 @@ import candle # candle -def initialize_parameters(): - t29_common = candle.Benchmark(file_path, 't29_default_model.txt','keras', +def initialize_parameters(default_model = 't29_default_model.txt'): + t29_common = candle.Benchmark(file_path, default_model,'keras', prog='t29res.py',desc='resnet') # Need a pointer to the docs showing what is provided @@ -41,7 +41,7 @@ def initialize_parameters(): 'help':'Residual connection distance between dense layers.'} ] t29_common.additional_definitions = additional_definitions - gParameters = candle.initialize_parameters(t29_common) + gParameters = candle.finalize_parameters(t29_common) return gParameters diff --git a/Pilot1/TC1/tc1_baseline_keras2.py b/Pilot1/TC1/tc1_baseline_keras2.py index bbb90057..6b7252cf 100644 --- a/Pilot1/TC1/tc1_baseline_keras2.py +++ b/Pilot1/TC1/tc1_baseline_keras2.py @@ -30,14 +30,14 @@ import candle -def initialize_parameters(): +def initialize_parameters(default_model = 'tc1_default_model.txt'): # Build benchmark object - tc1Bmk = bmk.BenchmarkTC1(file_path, 'tc1_default_model.txt', 'keras', + tc1Bmk = bmk.BenchmarkTC1(file_path, default_model, 'keras', prog='tc1_baseline', desc='Multi-task (DNN) for data extraction from clinical reports - Pilot 3 Benchmark 1') # Initialize parameters - gParameters = candle.initialize_parameters(tc1Bmk) + gParameters = candle.finalize_parameters(tc1Bmk) #benchmark.logger.info('Params: {}'.format(gParameters)) return gParameters diff --git a/Pilot1/Uno/uno_baseline_keras2.py b/Pilot1/Uno/uno_baseline_keras2.py index 722f9482..1aa1f081 100644 --- a/Pilot1/Uno/uno_baseline_keras2.py +++ b/Pilot1/Uno/uno_baseline_keras2.py @@ -265,14 +265,14 @@ def build_model(loader, args, permanent_dropout=True, silent=False): return Model(inputs, output) -def initialize_parameters(): +def 
initialize_parameters(default_model = 'uno_default_model.txt'): # Build benchmark object - unoBmk = benchmark.BenchmarkUno(benchmark.file_path, 'uno_default_model.txt', 'keras', + unoBmk = benchmark.BenchmarkUno(benchmark.file_path, default_model, 'keras', prog='uno_baseline', desc='Build neural network based models to predict tumor response to single and paired drugs.') # Initialize parameters - gParameters = candle.initialize_parameters(unoBmk) + gParameters = candle.finalize_parameters(unoBmk) # benchmark.logger.info('Params: {}'.format(gParameters)) return gParameters diff --git a/Pilot1/UnoMT/unoMT_baseline_pytorch.py b/Pilot1/UnoMT/unoMT_baseline_pytorch.py index f9e698b0..abdbf5ef 100644 --- a/Pilot1/UnoMT/unoMT_baseline_pytorch.py +++ b/Pilot1/UnoMT/unoMT_baseline_pytorch.py @@ -21,16 +21,16 @@ np.set_printoptions(precision=4) -def initialize_parameters(): +def initialize_parameters(default_model = 'unoMT_default_model.txt'): # Build benchmark object - unoMTb = unoMT.unoMTBk(unoMT.file_path, 'unoMT_default_model.txt', 'pytorch', + unoMTb = unoMT.unoMTBk(unoMT.file_path, default_model, 'pytorch', prog='unoMT_baseline', desc='Multi-task combined single and combo drug prediction for cross-study data - Pilot 1') print("Created unoMT benchmark") # Initialize parameters - gParameters = candle.initialize_parameters(unoMTb) + gParameters = candle.finalize_parameters(unoMTb) print("Parameters initialized") diff --git a/Pilot2/P2B1/p2b1_baseline_keras2.py b/Pilot2/P2B1/p2b1_baseline_keras2.py index a897d6aa..524c2cf7 100644 --- a/Pilot2/P2B1/p2b1_baseline_keras2.py +++ b/Pilot2/P2B1/p2b1_baseline_keras2.py @@ -40,14 +40,14 @@ def str2bool(v): return v.lower() in ("yes", "true", "t", "1") -def initialize_parameters(): +def initialize_parameters(default_model = 'p2b1_default_model.txt'): # Build benchmark object - p2b1Bmk = p2b1.BenchmarkP2B1(p2b1.file_path, 'p2b1_default_model.txt', 'keras', + p2b1Bmk = p2b1.BenchmarkP2B1(p2b1.file_path, default_model, 'keras', prog='p2b1_baseline', desc='Train Molecular Frame Autoencoder - Pilot 2 Benchmark 1') # Initialize parameters - GP = candle.initialize_parameters(p2b1Bmk) + GP = candle.finalize_parameters(p2b1Bmk) #p2b1.logger.info('Params: {}'.format(gParameters)) print ('\nTraining parameters:') diff --git a/Pilot3/P3B1/p3b1_baseline_keras2.py b/Pilot3/P3B1/p3b1_baseline_keras2.py index 1cae3c19..d1d469ca 100644 --- a/Pilot3/P3B1/p3b1_baseline_keras2.py +++ b/Pilot3/P3B1/p3b1_baseline_keras2.py @@ -14,14 +14,14 @@ import p3b1 as bmk import candle -def initialize_parameters(): +def initialize_parameters(default_model = 'p3b1_default_model.txt'): # Build benchmark object - p3b1Bmk = bmk.BenchmarkP3B1(bmk.file_path, 'p3b1_default_model.txt', 'keras', + p3b1Bmk = bmk.BenchmarkP3B1(bmk.file_path, default_model, 'keras', prog='p3b1_baseline', desc='Multi-task (DNN) for data extraction from clinical reports - Pilot 3 Benchmark 1') # Initialize parameters - gParameters = candle.initialize_parameters(p3b1Bmk) + gParameters = candle.finalize_parameters(p3b1Bmk) #bmk.logger.info('Params: {}'.format(gParameters)) return gParameters diff --git a/Pilot3/P3B2/p3b2_baseline_keras2.py b/Pilot3/P3B2/p3b2_baseline_keras2.py index 824b3be1..83be99ef 100644 --- a/Pilot3/P3B2/p3b2_baseline_keras2.py +++ b/Pilot3/P3B2/p3b2_baseline_keras2.py @@ -15,14 +15,14 @@ import p3b2 as bmk import candle -def initialize_parameters(): +def initialize_parameters(default_model = 'p3b2_default_model.txt'): # Build benchmark object - p3b2Bmk = bmk.BenchmarkP3B2(bmk.file_path, 
'p3b2_default_model.txt', 'keras', + p3b2Bmk = bmk.BenchmarkP3B2(bmk.file_path, default_model, 'keras', prog='p3b2_baseline', desc='Multi-task (DNN) for data extraction from clinical reports - Pilot 3 Benchmark 1') # Initialize parameters - gParameters = candle.initialize_parameters(p3b2Bmk) + gParameters = candle.finalize_parameters(p3b2Bmk) #bmk.logger.info('Params: {}'.format(gParameters)) return gParameters diff --git a/Pilot3/P3B3/p3b3_baseline_keras2.py b/Pilot3/P3B3/p3b3_baseline_keras2.py index b8bd9e3d..c8227adb 100644 --- a/Pilot3/P3B3/p3b3_baseline_keras2.py +++ b/Pilot3/P3B3/p3b3_baseline_keras2.py @@ -34,14 +34,14 @@ -def initialize_parameters(): +def initialize_parameters(default_model = 'p3b3_default_model.txt'): # Build benchmark object - p3b3Bmk = bmk.BenchmarkP3B3(bmk.file_path, 'p3b3_default_model.txt', 'keras', + p3b3Bmk = bmk.BenchmarkP3B3(bmk.file_path, default_model, 'keras', prog='p3b3_baseline', desc='Multi-task CNN for data extraction from clinical reports - Pilot 3 Benchmark 3') # Initialize parameters - gParameters = candle.initialize_parameters(p3b3Bmk) + gParameters = candle.finalize_parameters(p3b3Bmk) #bmk.logger.info('Params: {}'.format(gParameters)) return gParameters diff --git a/Pilot3/P3B4/p3b4.py b/Pilot3/P3B4/p3b4.py index 430f9fd9..08498acc 100644 --- a/Pilot3/P3B4/p3b4.py +++ b/Pilot3/P3B4/p3b4.py @@ -14,8 +14,6 @@ import candle - - required = [ 'learning_rate', 'batch_size', 'epochs', 'dropout', \ 'optimizer', 'wv_len', \ diff --git a/Pilot3/P3B4/p3b4_baseline_keras2.py b/Pilot3/P3B4/p3b4_baseline_keras2.py index 5caacdf0..19b64d03 100644 --- a/Pilot3/P3B4/p3b4_baseline_keras2.py +++ b/Pilot3/P3B4/p3b4_baseline_keras2.py @@ -12,14 +12,14 @@ import p3b4 as bmk import candle -def initialize_parameters(): +def initialize_parameters(default_model = 'p3b4_default_model.txt' ): # Build benchmark object - p3b3Bmk = bmk.BenchmarkP3B3(bmk.file_path, 'p3b4_default_model.txt', 'keras', + p3b3Bmk = bmk.BenchmarkP3B3(bmk.file_path, default_model, 'keras', prog='p3b4_baseline', desc='Hierarchical Convolutional Attention Networks for data extraction from clinical reports - Pilot 3 Benchmark 4') # Initialize parameters - gParameters = candle.initialize_parameters(p3b3Bmk) + gParameters = candle.finalize_parameters(p3b3Bmk) #bmk.logger.info('Params: {}'.format(gParameters)) return gParameters @@ -74,8 +74,6 @@ def run(gParameters): test_x = np.load( fpath + '/test_X.npy' ) test_y = np.load( fpath + '/test_Y.npy' ) - - num_classes = [] for task in range( len( train_y[ 0, : ] ) ): cat = np.unique( train_y[ :, task ] ) diff --git a/common/candle/__init__.py b/common/candle/__init__.py index 486ef1ef..0a16bfe1 100644 --- a/common/candle/__init__.py +++ b/common/candle/__init__.py @@ -18,7 +18,7 @@ from default_utils import ArgumentStruct from default_utils import Benchmark from default_utils import str2bool -from default_utils import initialize_parameters +from default_utils import finalize_parameters from default_utils import fetch_file from default_utils import verify_path from default_utils import keras_default_config diff --git a/common/candle_keras/__init__.py b/common/candle_keras/__init__.py index bcf15874..bf37f7ec 100644 --- a/common/candle_keras/__init__.py +++ b/common/candle_keras/__init__.py @@ -18,7 +18,7 @@ from default_utils import ArgumentStruct from default_utils import Benchmark from default_utils import str2bool -from default_utils import initialize_parameters +from default_utils import finalize_parameters from default_utils import fetch_file from 
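The patches above all make the same two changes: the default model file becomes an overridable default_model argument, and candle.initialize_parameters is renamed to candle.finalize_parameters. A minimal sketch of the resulting entry point, with a placeholder module foo and class BenchmarkFoo that are not part of this repository:

    import candle
    import foo as bmk  # placeholder module exposing file_path and BenchmarkFoo

    def initialize_parameters(default_model='foo_default_model.txt'):
        # Build the benchmark object from the (overridable) default model file
        fooBmk = bmk.BenchmarkFoo(bmk.file_path, default_model, 'keras',
                                  prog='foo_baseline', desc='Example benchmark')
        # Merge config-file defaults with command-line overrides
        gParameters = candle.finalize_parameters(fooBmk)
        return gParameters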
default_utils import verify_path from default_utils import keras_default_config diff --git a/common/default_utils.py b/common/default_utils.py index 143e227e..8f90066a 100644 --- a/common/default_utils.py +++ b/common/default_utils.py @@ -318,7 +318,7 @@ def set_seed(seed): random.seed(seed) -def initialize_parameters(bmk): +def finalize_parameters(bmk): """Utility to parse parameters in common as well as parmeters particular to each benchmark. From a8aaa1b367c592c0a480cf9229b9d99e87dba4a6 Mon Sep 17 00:00:00 2001 From: Hyunseung Yoo Date: Thu, 14 Nov 2019 10:02:13 -0600 Subject: [PATCH 059/331] add early stopping --- Pilot1/Uno/uno.py | 4 ++++ Pilot1/Uno/uno_baseline_keras2.py | 3 +++ 2 files changed, 7 insertions(+) diff --git a/Pilot1/Uno/uno.py b/Pilot1/Uno/uno.py index d4731e50..d246d58e 100644 --- a/Pilot1/Uno/uno.py +++ b/Pilot1/Uno/uno.py @@ -140,6 +140,10 @@ def set_locals(self): 'type': float, 'default': None, 'help': 'base learning rate'}, + {'name': 'es', + 'type': candle.str2bool, + 'default': False, + 'help': 'early stopping on val_loss'}, {'name': 'cp', 'type': candle.str2bool, 'default': False, diff --git a/Pilot1/Uno/uno_baseline_keras2.py b/Pilot1/Uno/uno_baseline_keras2.py index 0ec9201b..401a9254 100644 --- a/Pilot1/Uno/uno_baseline_keras2.py +++ b/Pilot1/Uno/uno_baseline_keras2.py @@ -427,6 +427,7 @@ def warmup_scheduler(epoch): candle_monitor = candle.CandleRemoteMonitor(params=params) timeout_monitor = candle.TerminateOnTimeOut(params['timeout']) + es_monitor = keras.callbacks.EarlyStopping(patience=10, verbose=1) reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=0.00001) warmup_lr = LearningRateScheduler(warmup_scheduler) @@ -435,6 +436,8 @@ def warmup_scheduler(epoch): history_logger = LoggingCallback(logger.debug) callbacks = [candle_monitor, timeout_monitor, history_logger] + if args.es: + callbacks.append(es_monitor) if args.reduce_lr: callbacks.append(reduce_lr) if args.warmup_lr: From 3c2901db359f8eeb4d428023a9ab08790330c075 Mon Sep 17 00:00:00 2001 From: Hyunseung Yoo Date: Thu, 14 Nov 2019 11:09:33 -0600 Subject: [PATCH 060/331] auto-detect input size --- Pilot1/Uno/uno_data.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/Pilot1/Uno/uno_data.py b/Pilot1/Uno/uno_data.py index b25a3748..8cc6561d 100644 --- a/Pilot1/Uno/uno_data.py +++ b/Pilot1/Uno/uno_data.py @@ -958,12 +958,9 @@ def __init__(self, partition='train', filename=None, batch_size=32, shuffle=Fals self.single = single self.agg_dose = agg_dose self.target = agg_dose if agg_dose is not None else 'Growth' - # 4 inputs for single drug model (cell, dose1, descriptor, fingerprint) - # 7 inputs for drug pair model (cell, dose1, dose1, dr1.descriptor, dr1.fingerprint, dr2.descriptor, dr2.fingerprint) - self.input_size = 4 if self.single else 7 - self.input_size = 2 if agg_dose else self.input_size self.store = pd.HDFStore(filename, mode='r') + self.input_size = len(list(filter(lambda x: x.startswith('/x_train'), self.store.keys()))) y = self.store.select('y_{}'.format(self.partition)) self.index = y.index self.size = len(self.index) From 7121416f0db9f2a3ded842fa75934b4efedd35d5 Mon Sep 17 00:00:00 2001 From: Andrew Weisman Date: Thu, 14 Nov 2019 13:05:25 -0500 Subject: [PATCH 061/331] Made data directory for caching data user definable; fixed sum(fractions)=1 check by adding a tolerance in generate_index_distribution_from_fraction() --- common/candle/__init__.py | 1 + common/file_utils.py | 9 +++++---- common/uq_utils.py | 5 ++++- 3 files changed, 10 
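The "auto-detect input size" change above infers how many x_train_* arrays an exported HDF5 file carries instead of hard-coding the 4/7/2 cases. A small stand-alone illustration of the idiom (the file name is illustrative); note that pandas HDFStore.keys() returns keys with a leading '/', which is why the filter matches '/x_train':

    import pandas as pd

    store = pd.HDFStore('topN.uno.h5', mode='r')
    input_size = len([k for k in store.keys() if k.startswith('/x_train')])
    print('feeding {} input arrays per batch'.format(input_size))
    store.close()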
insertions(+), 5 deletions(-) diff --git a/common/candle/__init__.py b/common/candle/__init__.py index 486ef1ef..feb5ee30 100644 --- a/common/candle/__init__.py +++ b/common/candle/__init__.py @@ -47,6 +47,7 @@ from uq_utils import computation_of_valid_calibration_interval from uq_utils import applying_calibration from uq_utils import overprediction_check +from uq_utils import generate_index_distribution # import benchmark-dependent utils diff --git a/common/file_utils.py b/common/file_utils.py index 04085dc3..b506f03c 100644 --- a/common/file_utils.py +++ b/common/file_utils.py @@ -39,7 +39,8 @@ def chunk_read(response, chunk_size=8192, reporthook=None): def get_file(fname, origin, untar=False, - md5_hash=None, cache_subdir='common'): + md5_hash=None, datadir='../Data/common'): + #md5_hash=None, cache_subdir='common', datadir='../Data/common'): """ Downloads a file from a URL if it not already in the cache. Passing the MD5 hash will verify the file after download as well as if it is already present in the cache. @@ -62,9 +63,9 @@ def get_file(fname, origin, untar=False, Path to the downloaded file """ - file_path = os.path.dirname(os.path.realpath(__file__)) - datadir_base = os.path.expanduser(os.path.join(file_path, '..', 'Data')) - datadir = os.path.join(datadir_base, cache_subdir) + #file_path = os.path.dirname(os.path.realpath(__file__)) + #datadir_base = os.path.expanduser(os.path.join(file_path, '..', 'Data')) + #datadir = os.path.join(datadir_base, cache_subdir) if not os.path.exists(datadir): os.makedirs(datadir) diff --git a/common/uq_utils.py b/common/uq_utils.py index d0ab46c3..9497cf94 100644 --- a/common/uq_utils.py +++ b/common/uq_utils.py @@ -77,6 +77,8 @@ def generate_index_distribution_from_fraction(numTrain, numTest, numValidation, Indices for data in testing (if merging) """ + tol = 1e-7 + # Extract required parameters fractionTrain = params['uq_train_fr'] fractionValidation = params['uq_valid_fr'] @@ -90,7 +92,8 @@ def generate_index_distribution_from_fraction(numTrain, numTest, numValidation, raise ValueError('uq_test_fr is not in (0, 1) range. uq_test_fr: ', fractionTest) fractionSum = fractionTrain + fractionValidation + fractionTest - if (fractionSum > 1.) or (fractionSum < 1.): + #if (fractionSum > 1.) or (fractionSum < 1.): + if abs(fractionSum-1.) > tol: raise ValueError('Specified UQ fractions (uq_train_fr, uq_valid_fr, uq_test_fr) do not add up to 1. No cross-validation partition is computed ! 
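The uq_utils.py change above replaces an exact equality test on the fraction sum with a small tolerance because fractions that should sum to 1 routinely miss it by a floating-point rounding error. A quick worked check:

    fractions = [0.7, 0.2, 0.1]
    total = sum(fractions)
    print(total)                    # 0.9999999999999999 with IEEE-754 doubles
    tol = 1e-7
    print(total == 1.0)             # False: the old check would raise here
    print(abs(total - 1.0) > tol)   # False: the new check accepts the split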
sum:', fractionSum) # Determine data size and block size From 32cecd0768545efe9fbcdfe7893d86ddfc008d76 Mon Sep 17 00:00:00 2001 From: Andrew Weisman Date: Thu, 14 Nov 2019 13:10:03 -0500 Subject: [PATCH 062/331] Undid my change using environment variable configuration file to uno_baseline_keras2.py --- Pilot1/Uno/uno_baseline_keras2.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Pilot1/Uno/uno_baseline_keras2.py b/Pilot1/Uno/uno_baseline_keras2.py index 4bf784c1..98e806b4 100644 --- a/Pilot1/Uno/uno_baseline_keras2.py +++ b/Pilot1/Uno/uno_baseline_keras2.py @@ -268,10 +268,10 @@ def build_model(loader, args, permanent_dropout=True, silent=False): def initialize_parameters(): # Build benchmark object - #mymodel_common = candle.Benchmark(file_path,os.getenv("DEFAULT_PARAMS_FILE"),'keras',prog='myprog',desc='My model') - #unoBmk = benchmark.BenchmarkUno(benchmark.file_path, 'uno_default_model.txt', 'keras', - unoBmk = benchmark.BenchmarkUno(benchmark.file_path, os.getenv("DEFAULT_PARAMS_FILE"), 'keras', - prog='uno_baseline', desc='Build neural network based models to predict tumor response to single and paired drugs.') + mymodel_common = candle.Benchmark(file_path,os.getenv("DEFAULT_PARAMS_FILE"),'keras',prog='myprog',desc='My model') + unoBmk = benchmark.BenchmarkUno(benchmark.file_path, 'uno_default_model.txt', 'keras', + #unoBmk = benchmark.BenchmarkUno(benchmark.file_path, os.getenv("DEFAULT_PARAMS_FILE"), 'keras', + #prog='uno_baseline', desc='Build neural network based models to predict tumor response to single and paired drugs.') # Initialize parameters gParameters = candle.initialize_parameters(unoBmk) From 5b0bd8dd8943b4a19db757eafbeff6f58207cdec Mon Sep 17 00:00:00 2001 From: Hyunseung Yoo Date: Wed, 5 Jun 2019 12:52:23 -0500 Subject: [PATCH 063/331] wip milestone 13 --- Pilot1/Uno/topN_to_uno.py | 94 +++++++++++++++++++++++++++++++ Pilot1/Uno/uno_baseline_keras2.py | 2 +- Pilot1/Uno/uno_data.py | 12 ++-- 3 files changed, 103 insertions(+), 5 deletions(-) create mode 100644 Pilot1/Uno/topN_to_uno.py diff --git a/Pilot1/Uno/topN_to_uno.py b/Pilot1/Uno/topN_to_uno.py new file mode 100644 index 00000000..1f7c2b6a --- /dev/null +++ b/Pilot1/Uno/topN_to_uno.py @@ -0,0 +1,94 @@ +import argparse +import json +import pandas as pd +import numpy as np + + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument('--dataframe_from', type=str, default='top21_dataframe_8x8.csv', + help='Dataframe file name contains all data points') + parser.add_argument('--plan', type=str, default='plan.json', + help='Plan data file') + parser.add_argument('--node', type=str, default=None, + help='node number to execute') + + args, unparsed = parser.parse_known_args() + return args, unparsed + + +def read_plan(filename, node): + print("reading {} file for node {}".format(filename, node)) + with open(filename, 'r') as plan_file: + plan = json.load(plan_file) + if node in plan: + return plan[node] + else: + raise Exception('Node index {} was not found in plan file') + + +def build_masks(args, df): + if args.node is None: + raise Exception('Node id is not given') + + plan = read_plan(args.plan, args.node) + mask = {} + for partition in ['train', 'val']: + _mask = df['Sample'] == None + for i, element in enumerate(plan[partition]): + cl_filter = element['CELL'] + dr_filter = element['DRUG'] + __mask = df['Sample'].isin(cl_filter) & df['Drug1'].isin(dr_filter) + _mask = _mask | __mask + mask[partition] = _mask + + return mask['train'], mask['val'] + + +def 
training_mask(df): + return np.random.rand(len(df)) < 0.8 + + +def read_dataframe(args): + df = pd.read_csv(args.dataframe_from, low_memory=False, na_values='na').fillna(0) + df.rename(columns={'SAMPLE': 'Sample', 'DRUG': 'Drug1'}, inplace=True) + df_y = df[['AUC', 'Sample', 'Drug1']] + + cols = df.columns.to_list() + cl_columns = list(filter(lambda x: x.startswith('GE_'), cols)) + dd_columns = list(filter(lambda x: x.startswith('DD_'), cols)) + + df_cl = df.loc[:, cl_columns] + df_dd = df.loc[:, dd_columns] + + return df_y, df_cl, df_dd + + +def build_dataframe(args): + df_y, df_cl, df_dd = read_dataframe(args) + + # mask = training_mask(df_y) + train_mask, val_mask = build_masks(args, df_y) + + y_train = pd.DataFrame(data=df_y[train_mask].reset_index(drop=True)) + y_val = pd.DataFrame(data=df_y[val_mask].reset_index(drop=True)) + + x_train_0 = df_cl[train_mask].reset_index(drop=True) + x_train_1 = df_dd[train_mask].reset_index(drop=True) + + x_val_0 = df_cl[val_mask].reset_index(drop=True) + x_val_1 = df_dd[val_mask].reset_index(drop=True) + + # store + store = pd.HDFStore('topN.uno.h5', 'w') + store.put('y_train', y_train) + store.put('y_val', y_val) + store.put('x_train_0', x_train_0) + store.put('x_train_1', x_train_1) + store.put('x_val_0', x_val_0) + store.put('x_val_1', x_val_1) + + +if __name__ == '__main__': + parsed, unparsed = parse_arguments() + build_dataframe(parsed) diff --git a/Pilot1/Uno/uno_baseline_keras2.py b/Pilot1/Uno/uno_baseline_keras2.py index 5af3673c..3ab072f0 100644 --- a/Pilot1/Uno/uno_baseline_keras2.py +++ b/Pilot1/Uno/uno_baseline_keras2.py @@ -444,7 +444,7 @@ def warmup_scheduler(epoch): if args.tb: callbacks.append(tensorboard) if args.save_weights: - callbacks.append(SimpleWeightSaver(args.save_path + '/' + args.save_weights)) + callbacks.append(MultiGPUCheckpoint(args.save_weights)) if args.use_exported_data is not None: train_gen = DataFeeder(filename=args.use_exported_data, batch_size=args.batch_size, shuffle=args.shuffle, single=args.single, agg_dose=args.agg_dose) diff --git a/Pilot1/Uno/uno_data.py b/Pilot1/Uno/uno_data.py index 5ede815e..52450fb2 100644 --- a/Pilot1/Uno/uno_data.py +++ b/Pilot1/Uno/uno_data.py @@ -955,7 +955,7 @@ def __init__(self, partition='train', filename=None, batch_size=32, shuffle=Fals # 4 inputs for single drug model (cell, dose1, descriptor, fingerprint) # 7 inputs for drug pair model (cell, dose1, dose1, dr1.descriptor, dr1.fingerprint, dr2.descriptor, dr2.fingerprint) self.input_size = 4 if self.single else 7 - self.input_size = 3 if agg_dose else self.input_size + self.input_size = 2 if agg_dose else self.input_size self.store = pd.HDFStore(filename, mode='r') y = self.store.select('y_{}'.format(self.partition)) @@ -973,7 +973,7 @@ def __getitem__(self, idx): start = self.index_map[idx] * self.batch_size stop = (self.index_map[idx] + 1) * self.batch_size x = [self.store.select('x_{0}_{1}'.format(self.partition, i), start=start, stop=stop) for i in range(self.input_size)] - y = self.store.select('y_{}'.format(self.partition), start=start, stop=stop, columns=[self.target]) + y = self.store.select('y_{}'.format(self.partition), start=start, stop=stop)[self.target] return x, y def reset(self): @@ -982,8 +982,12 @@ def reset(self): pass def get_response(self, copy=False): - self.index = [item for step in range(self.steps) for item in range(self.index_map[step] * self.batch_size, (self.index_map[step] + 1) * self.batch_size)] - df = self.store.get('y_{}'.format(self.partition)).iloc[self.index,:] + if self.shuffle: + 
self.index = [item for step in range(self.steps) for item in range(self.index_map[step] * self.batch_size, (self.index_map[step] + 1) * self.batch_size)] + df = self.store.get('y_{}'.format(self.partition)).iloc[self.index,:] + else: + df = self.store.get('y_{}'.format(self.partition)) + if self.agg_dose is None: df['Dose1'] = self.store.get('x_{}_0'.format(self.partition)).iloc[self.index,:] if not self.single: From 1aef83c23b359fa6110929a657e6e7ba82bc81cd Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 19 Jun 2019 09:38:09 -0500 Subject: [PATCH 064/331] Small fix to Exception --- Pilot1/Uno/topN_to_uno.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Pilot1/Uno/topN_to_uno.py b/Pilot1/Uno/topN_to_uno.py index 1f7c2b6a..ffc153d1 100644 --- a/Pilot1/Uno/topN_to_uno.py +++ b/Pilot1/Uno/topN_to_uno.py @@ -24,7 +24,7 @@ def read_plan(filename, node): if node in plan: return plan[node] else: - raise Exception('Node index {} was not found in plan file') + raise Exception('Node index "{}" was not found in plan file'.format(node)) def build_masks(args, df): From 536c83522454a12ea59581f7760daeb7abffe43d Mon Sep 17 00:00:00 2001 From: Hyunseung Yoo Date: Thu, 27 Jun 2019 22:10:19 -0500 Subject: [PATCH 065/331] read hdf format master dataframe --- Pilot1/Uno/topN_to_uno.py | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/Pilot1/Uno/topN_to_uno.py b/Pilot1/Uno/topN_to_uno.py index ffc153d1..87c03a9e 100644 --- a/Pilot1/Uno/topN_to_uno.py +++ b/Pilot1/Uno/topN_to_uno.py @@ -1,4 +1,5 @@ import argparse +import os import json import pandas as pd import numpy as np @@ -36,8 +37,8 @@ def build_masks(args, df): for partition in ['train', 'val']: _mask = df['Sample'] == None for i, element in enumerate(plan[partition]): - cl_filter = element['CELL'] - dr_filter = element['DRUG'] + cl_filter = element['cell'] + dr_filter = element['drug'] __mask = df['Sample'].isin(cl_filter) & df['Drug1'].isin(dr_filter) _mask = _mask | __mask mask[partition] = _mask @@ -49,7 +50,7 @@ def training_mask(df): return np.random.rand(len(df)) < 0.8 -def read_dataframe(args): +def read_dataframe_from_csv(args): df = pd.read_csv(args.dataframe_from, low_memory=False, na_values='na').fillna(0) df.rename(columns={'SAMPLE': 'Sample', 'DRUG': 'Drug1'}, inplace=True) df_y = df[['AUC', 'Sample', 'Drug1']] @@ -64,8 +65,28 @@ def read_dataframe(args): return df_y, df_cl, df_dd +def read_dataframe_from_hdf(args): + store = pd.HDFStore(args.dataframe_from, 'r') + df = store.get('df') + df.rename(columns={'CELL': 'Sample', 'DRUG': 'Drug1'}, inplace=True) + df_y = df[['AUC', 'Sample', 'Drug1']] + + cols = df.columns.to_list() + cl_columns = list(filter(lambda x: x.startswith('GE_'), cols)) + dd_columns = list(filter(lambda x: x.startswith('DD_'), cols)) + + df_cl = df.loc[:, cl_columns] + df_dd = df.loc[:, dd_columns] + + return df_y, df_cl, df_dd + + def build_dataframe(args): - df_y, df_cl, df_dd = read_dataframe(args) + _, ext = os.path.splitext(args.dataframe_from) + if ext == '.h5' or ext == '.hdf5': + df_y, df_cl, df_dd = read_dataframe_from_hdf(args) + else: + df_y, df_cl, df_dd = read_dataframe_from_csv(args) # mask = training_mask(df_y) train_mask, val_mask = build_masks(args, df_y) From 104c34af5cf03fbffc0fbd7ca7e8a5423551a629 Mon Sep 17 00:00:00 2001 From: Hyunseung Yoo Date: Thu, 27 Jun 2019 22:32:34 -0500 Subject: [PATCH 066/331] add dose_aggregated AUC prediction model --- Pilot1/Uno/uno_auc_model.txt | 39 ++++++++++++++++++++++++++++++++++++ 1 file 
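To make the plan-driven split in build_masks() concrete, here is a tiny self-contained sketch of the masking idiom with toy data; the plan keys follow the lower-case 'cell'/'drug' spelling adopted in the patch above:

    import pandas as pd

    df_y = pd.DataFrame({'Sample': ['CL1', 'CL2', 'CL3'],
                         'Drug1':  ['D1',  'D2',  'D1']})
    plan_train = [{'cell': ['CL1', 'CL2'], 'drug': ['D1']}]  # one block of a plan partition

    mask = pd.Series(False, index=df_y.index)
    for element in plan_train:
        # a row is kept when both its cell line and its drug appear in the block
        mask |= df_y['Sample'].isin(element['cell']) & df_y['Drug1'].isin(element['drug'])
    print(df_y[mask])  # only the CL1/D1 row survives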
changed, 39 insertions(+) create mode 100644 Pilot1/Uno/uno_auc_model.txt diff --git a/Pilot1/Uno/uno_auc_model.txt b/Pilot1/Uno/uno_auc_model.txt new file mode 100644 index 00000000..00d2224e --- /dev/null +++ b/Pilot1/Uno/uno_auc_model.txt @@ -0,0 +1,39 @@ +[Global_Params] +train_sources=['CCLE'] +test_sources=['train'] +cell_types=None +cell_features=['rnaseq'] +drug_features=['descriptors'] +dense=[1000, 1000, 1000] +dense_feature_layers=[1000, 1000, 1000] +activation='relu' +loss='mse' +optimizer='adam' +scaling='std' +drop=0 +epochs=50 +batch_size=512 +validation_split=0.2 +cv=1 +max_val_loss=1.0 +learning_rate=None +base_lr=None +residual=False +reduce_lr=False +warmup_lr=False +batch_normalization=False +feature_subsample=0 +rng_seed=2018 +save_path='save/uno' +no_gen=False +verbose=False +no_response_source=True +no_feature_source=True +use_landmark_genes=True +agg_dose='AUC' +preprocess_rnaseq='source_scale' +single=True + +[Monitor_Params] +solr_root='' +timeout=3600 From f63971e89cb35c371e94e340cc1bc957b68403a8 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 16 Jul 2019 13:05:20 -0500 Subject: [PATCH 067/331] Create cache directory if it does not exist --- Pilot1/Uno/uno_data.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Pilot1/Uno/uno_data.py b/Pilot1/Uno/uno_data.py index 52450fb2..c15e217e 100644 --- a/Pilot1/Uno/uno_data.py +++ b/Pilot1/Uno/uno_data.py @@ -654,6 +654,10 @@ def save_to_cache(self, cache, params): for k in ['self', 'cache', 'single']: if k in params: del params[k] + dirname = os.path.dirname(cache) + if not os.path.exists(dirname): + logger.debug('Creating directory for cache: %s', dirname) + os.mkdir(dirname) param_fname = '{}.params.json'.format(cache) with open(param_fname, 'w') as param_file: json.dump(params, param_file, sort_keys=True) From 5f60f3df4a172106aebc822c62e7aa48f2c8749c Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 16 Jul 2019 14:09:40 -0500 Subject: [PATCH 068/331] Fix typos --- common/default_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/common/default_utils.py b/common/default_utils.py index 143e227e..5ea5bfaa 100644 --- a/common/default_utils.py +++ b/common/default_utils.py @@ -319,19 +319,19 @@ def set_seed(seed): def initialize_parameters(bmk): - """Utility to parse parameters in common as well as parmeters + """Utility to parse parameters in common as well as parameters particular to each benchmark. Parameters ---------- bmk : benchmark object Object that has benchmark filepaths and specifications - + Return ---------- gParameters : python dictionary Dictionary with all the parameters necessary to run the benchmark. - Command line overwrites config file especifications + Command line overwrites config file specifications """ # Parse common parameters From 8f41b50ab48259c632abd122740c0dab1320b93b Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 16 Jul 2019 14:38:13 -0500 Subject: [PATCH 069/331] Post questions regarding CombinedDataLoader.load_from_cache() --- Pilot1/Uno/uno_data.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Pilot1/Uno/uno_data.py b/Pilot1/Uno/uno_data.py index c15e217e..1488b6a8 100644 --- a/Pilot1/Uno/uno_data.py +++ b/Pilot1/Uno/uno_data.py @@ -622,6 +622,7 @@ def __init__(self, seed=SEED): self.seed = seed def load_from_cache(self, cache, params): + """ NOTE: How does this function return an error? (False?) 
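The cache-directory fix above relies on os.mkdir, which creates only the final path component and raises if the directory appears between the exists() check and the call. A slightly more defensive variant, offered as an editorial sketch rather than the committed change:

    import os

    def ensure_cache_dir(cache):
        # cache is a path prefix such as 'cache/top21'
        dirname = os.path.dirname(cache)
        if dirname:  # guard against a bare filename, where dirname is ''
            os.makedirs(dirname, exist_ok=True)  # also creates intermediate directories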
-Wozniak """ param_fname = '{}.params.json'.format(cache) if not os.path.isfile(param_fname): logger.warning('Cache parameter file does not exist: %s', param_fname) @@ -648,6 +649,7 @@ def load_from_cache(self, cache, params): self.__dict__.update(obj.__dict__) logger.info('Loaded data from cache: %s', fname) return True + # NOTE: This is unreachable -Wozniak return False def save_to_cache(self, cache, params): From 882e5eb75aa57a6dccb0aa2ed4642f99aeb70a7a Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 16 Jul 2019 14:38:24 -0500 Subject: [PATCH 070/331] Fix typo --- Pilot1/Uno/uno_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Pilot1/Uno/uno_data.py b/Pilot1/Uno/uno_data.py index 1488b6a8..1406a8de 100644 --- a/Pilot1/Uno/uno_data.py +++ b/Pilot1/Uno/uno_data.py @@ -636,7 +636,7 @@ def load_from_cache(self, cache, params): ignore_keys = ['cache', 'partition_by', 'single'] equal, diffs = dict_compare(params, cached_params, ignore_keys) if not equal: - logger.warning('Cache parameter mismatch: %s\nSaved: %s\nAttemptd to load: %s', diffs, cached_params, params) + logger.warning('Cache parameter mismatch: %s\nSaved: %s\nAttempted to load: %s', diffs, cached_params, params) logger.warning('\nRemove %s to rebuild data cache.\n', param_fname) raise ValueError('Could not load from a cache with incompatible keys:', diffs) else: From 3386be12662cbb436941d35dc20ea2b9706c8970 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Thu, 18 Jul 2019 09:41:21 -0500 Subject: [PATCH 071/331] Improve log messages --- Pilot1/Uno/uno_baseline_keras2.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Pilot1/Uno/uno_baseline_keras2.py b/Pilot1/Uno/uno_baseline_keras2.py index 3ab072f0..7c08aa77 100644 --- a/Pilot1/Uno/uno_baseline_keras2.py +++ b/Pilot1/Uno/uno_baseline_keras2.py @@ -403,7 +403,7 @@ def warmup_scheduler(epoch): template_model = build_model(loader, args, silent=True) if args.initial_weights: - logger.info("Loading weights from {}".format(args.initial_weights)) + logger.info("Loading initial weights from {}".format(args.initial_weights)) template_model.load_weights(args.initial_weights) if len(args.gpus) > 1: @@ -444,6 +444,7 @@ def warmup_scheduler(epoch): if args.tb: callbacks.append(tensorboard) if args.save_weights: + logger.info("Will save weights to: " + args.save_weights) callbacks.append(MultiGPUCheckpoint(args.save_weights)) if args.use_exported_data is not None: From 91b4c1f9118d1cedb3602b40f9144cef85142518 Mon Sep 17 00:00:00 2001 From: Hyunseung Yoo Date: Thu, 1 Aug 2019 11:37:39 -0500 Subject: [PATCH 072/331] loocv data util --- Pilot1/Uno/loocv_data_util.py | 91 +++++++++++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) create mode 100644 Pilot1/Uno/loocv_data_util.py diff --git a/Pilot1/Uno/loocv_data_util.py b/Pilot1/Uno/loocv_data_util.py new file mode 100644 index 00000000..d42a41fb --- /dev/null +++ b/Pilot1/Uno/loocv_data_util.py @@ -0,0 +1,91 @@ +import argparse +import json +import pandas as pd +import numpy as np + + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument('--dataframe_from', type=str, default='GDSC.h5', + help='Dataframe file name contains all data points') + parser.add_argument('--plan', type=str, default='plan.json', + help='Plan data file') + parser.add_argument('--node', type=str, default=None, + help='node number to execute') + + args, unparsed = parser.parse_known_args() + return args, unparsed + + +def read_plan(filename, node): + print("reading {} file for node 
{}".format(filename, node)) + with open(filename, 'r') as plan_file: + plan = json.load(plan_file) + if node in plan: + return plan[node] + else: + raise Exception('Node index "{}" was not found in plan file'.format(node)) + + +def build_masks(args, df): + if args.node is None: + raise Exception('Node id is not given') + + plan = read_plan(args.plan, args.node) + mask = {} + for partition in ['train', 'val']: + _mask = df['Sample'] is None + for i, element in enumerate(plan[partition]): + cl_filter = element['cell'] + dr_filter = element['drug'] + __mask = df['Sample'].isin(cl_filter) & df['Drug1'].isin(dr_filter) + _mask = _mask | __mask + mask[partition] = _mask + + return mask['train'], mask['val'] + + +def training_mask(df): + return np.random.rand(len(df)) < 0.8 + + +def build_dataframe(args): + store = pd.HDFStore(args.dataframe_from, 'r') + df_y = store.get('y_train') + df_ds = store.get('x_train_0') + df_cl = store.get('x_train_1') + df_dd = store.get('x_train_2') + df_fp = store.get('x_train_3') + + train_mask, val_mask = build_masks(args, df_y) + + y_train = pd.DataFrame(data=df_y[train_mask].reset_index(drop=True)) + y_val = pd.DataFrame(data=df_y[val_mask].reset_index(drop=True)) + + x_train_0 = df_ds[train_mask].reset_index(drop=True) + x_train_1 = df_cl[train_mask].reset_index(drop=True) + x_train_2 = df_dd[train_mask].reset_index(drop=True) + x_train_3 = df_fp[train_mask].reset_index(drop=True) + + x_val_0 = df_ds[val_mask].reset_index(drop=True) + x_val_1 = df_cl[val_mask].reset_index(drop=True) + x_val_2 = df_dd[val_mask].reset_index(drop=True) + x_val_3 = df_fp[val_mask].reset_index(drop=True) + + # store + store = pd.HDFStore('topN.uno.h5', 'w') + store.put('y_train', y_train) + store.put('y_val', y_val) + store.put('x_train_0', x_train_0) + store.put('x_train_1', x_train_1) + store.put('x_train_2', x_train_2) + store.put('x_train_3', x_train_3) + store.put('x_val_0', x_val_0) + store.put('x_val_1', x_val_1) + store.put('x_val_2', x_val_2) + store.put('x_val_3', x_val_3) + + +if __name__ == '__main__': + parsed, unparsed = parse_arguments() + build_dataframe(parsed) From ec4ae35f05242b5eb0f5dcb9ce16fa4e93b9c9b7 Mon Sep 17 00:00:00 2001 From: Hyunseung Yoo Date: Thu, 1 Aug 2019 23:40:38 -0500 Subject: [PATCH 073/331] use table format --- Pilot1/Uno/loocv_data_util.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/Pilot1/Uno/loocv_data_util.py b/Pilot1/Uno/loocv_data_util.py index d42a41fb..412dba5b 100644 --- a/Pilot1/Uno/loocv_data_util.py +++ b/Pilot1/Uno/loocv_data_util.py @@ -74,16 +74,16 @@ def build_dataframe(args): # store store = pd.HDFStore('topN.uno.h5', 'w') - store.put('y_train', y_train) - store.put('y_val', y_val) - store.put('x_train_0', x_train_0) - store.put('x_train_1', x_train_1) - store.put('x_train_2', x_train_2) - store.put('x_train_3', x_train_3) - store.put('x_val_0', x_val_0) - store.put('x_val_1', x_val_1) - store.put('x_val_2', x_val_2) - store.put('x_val_3', x_val_3) + store.put('y_train', y_train, format='t') + store.put('y_val', y_val, format='t') + store.put('x_train_0', x_train_0, format='t') + store.put('x_train_1', x_train_1, format='t') + store.put('x_train_2', x_train_2, format='t') + store.put('x_train_3', x_train_3, format='t') + store.put('x_val_0', x_val_0, format='t') + store.put('x_val_1', x_val_1, format='t') + store.put('x_val_2', x_val_2, format='t') + store.put('x_val_3', x_val_3, format='t') if __name__ == '__main__': From 7208db3bf43d6134f87d9b1fd221b2e3cd4c614c Mon Sep 17 00:00:00 
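The "use table format" change above writes every frame with format='t' (shorthand for 'table'), which lines up with how DataFeeder reads the same file back in batches via select(start=..., stop=...). A compact round-trip sketch with illustrative file and key names:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(np.arange(10).reshape(5, 2), columns=['a', 'b'])

    with pd.HDFStore('demo.uno.h5', 'w') as store:
        store.put('y_train', df, format='table')
    with pd.HDFStore('demo.uno.h5', 'r') as store:
        batch = store.select('y_train', start=2, stop=4)  # rows 2 and 3 only
    print(batch)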
2001 From: Hyunseung Yoo Date: Sun, 4 Aug 2019 10:22:55 -0500 Subject: [PATCH 074/331] add fom default model --- Pilot1/Uno/uno_fom_model.txt | 38 ++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 Pilot1/Uno/uno_fom_model.txt diff --git a/Pilot1/Uno/uno_fom_model.txt b/Pilot1/Uno/uno_fom_model.txt new file mode 100644 index 00000000..cf66baae --- /dev/null +++ b/Pilot1/Uno/uno_fom_model.txt @@ -0,0 +1,38 @@ +[Global_Params] +train_sources=['GDSC'] +test_sources=['train'] +cell_types=None +cell_features=['rnaseq'] +drug_features=['descriptors', 'fingerprints'] +dense=[1000, 1000, 1000] +dense_feature_layers=[1000, 1000, 1000] +activation='relu' +loss='mse' +optimizer='adam' +scaling='std' +drop=0 +epochs=50 +batch_size=512 +validation_split=0.2 +cv=1 +max_val_loss=1.0 +learning_rate=None +base_lr=None +residual=False +reduce_lr=False +warmup_lr=False +batch_normalization=False +feature_subsample=0 +rng_seed=2018 +save_path='save/uno' +no_gen=False +verbose = False +use_landmark_genes=True +preprocess_rnaseq='source_scale' +no_feature_source=True +no_response_source=True +single=True + +[Monitor_Params] +solr_root='' +timeout=-1 From 9f668e567cb8ab091f4744a94eaed89c2b705490 Mon Sep 17 00:00:00 2001 From: Hyunseung Yoo Date: Tue, 13 Aug 2019 15:22:54 -0500 Subject: [PATCH 075/331] generate random split when node is not given; fix hdfstore issues --- Pilot1/Uno/topN_to_uno.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/Pilot1/Uno/topN_to_uno.py b/Pilot1/Uno/topN_to_uno.py index 87c03a9e..5ab5f3ff 100644 --- a/Pilot1/Uno/topN_to_uno.py +++ b/Pilot1/Uno/topN_to_uno.py @@ -30,7 +30,9 @@ def read_plan(filename, node): def build_masks(args, df): if args.node is None: - raise Exception('Node id is not given') + print('node is None. 
Generate Random split') + mask = np.random.rand(len(df)) < 0.8 + return mask, ~mask plan = read_plan(args.plan, args.node) mask = {} @@ -96,18 +98,20 @@ def build_dataframe(args): x_train_0 = df_cl[train_mask].reset_index(drop=True) x_train_1 = df_dd[train_mask].reset_index(drop=True) + x_train_1.columns = [''] * len(x_train_1.columns) x_val_0 = df_cl[val_mask].reset_index(drop=True) x_val_1 = df_dd[val_mask].reset_index(drop=True) + x_val_1.columns = [''] * len(x_val_1.columns) # store - store = pd.HDFStore('topN.uno.h5', 'w') - store.put('y_train', y_train) - store.put('y_val', y_val) - store.put('x_train_0', x_train_0) - store.put('x_train_1', x_train_1) - store.put('x_val_0', x_val_0) - store.put('x_val_1', x_val_1) + store = pd.HDFStore('topN.uno.h5', 'w', complevel=9, complib='blosc:snappy') + store.put('y_train', y_train, format='table') + store.put('y_val', y_val, format='table') + store.put('x_train_0', x_train_0, format='table') + store.put('x_train_1', x_train_1, format='table') + store.put('x_val_0', x_val_0, format='table') + store.put('x_val_1', x_val_1, format='table') if __name__ == '__main__': From 61af6279e2f5fb7d7a802768b168339e9fe214b3 Mon Sep 17 00:00:00 2001 From: Hyunseung Yoo Date: Thu, 15 Aug 2019 09:33:09 -0500 Subject: [PATCH 076/331] set timeout unlimited --- Pilot1/Uno/uno_auc_model.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Pilot1/Uno/uno_auc_model.txt b/Pilot1/Uno/uno_auc_model.txt index 00d2224e..23f93ba8 100644 --- a/Pilot1/Uno/uno_auc_model.txt +++ b/Pilot1/Uno/uno_auc_model.txt @@ -36,4 +36,4 @@ single=True [Monitor_Params] solr_root='' -timeout=3600 +timeout=-1 From 4baea0857c4a544f0cba4d350c4b1be837539eda Mon Sep 17 00:00:00 2001 From: Hyunseung Yoo Date: Wed, 21 Aug 2019 09:32:09 -0500 Subject: [PATCH 077/331] set hyper-params for auc training --- Pilot1/Uno/uno_auc_model.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Pilot1/Uno/uno_auc_model.txt b/Pilot1/Uno/uno_auc_model.txt index 23f93ba8..4a803b43 100644 --- a/Pilot1/Uno/uno_auc_model.txt +++ b/Pilot1/Uno/uno_auc_model.txt @@ -8,15 +8,15 @@ dense=[1000, 1000, 1000] dense_feature_layers=[1000, 1000, 1000] activation='relu' loss='mse' -optimizer='adam' +optimizer='sgd' scaling='std' drop=0 epochs=50 -batch_size=512 +batch_size=32 validation_split=0.2 cv=1 max_val_loss=1.0 -learning_rate=None +learning_rate=0.0001 base_lr=None residual=False reduce_lr=False From b057a9dfeec22f56e87cd0352a1d5ab4e69393aa Mon Sep 17 00:00:00 2001 From: Hyunseung Yoo Date: Thu, 22 Aug 2019 15:55:34 -0500 Subject: [PATCH 078/331] code cleanup --- Pilot1/Uno/topN_to_uno.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Pilot1/Uno/topN_to_uno.py b/Pilot1/Uno/topN_to_uno.py index 5ab5f3ff..dd81d9f3 100644 --- a/Pilot1/Uno/topN_to_uno.py +++ b/Pilot1/Uno/topN_to_uno.py @@ -31,13 +31,13 @@ def read_plan(filename, node): def build_masks(args, df): if args.node is None: print('node is None. 
Generate Random split') - mask = np.random.rand(len(df)) < 0.8 + mask = training_mask(df) return mask, ~mask plan = read_plan(args.plan, args.node) mask = {} for partition in ['train', 'val']: - _mask = df['Sample'] == None + _mask = df['Sample'] is None for i, element in enumerate(plan[partition]): cl_filter = element['cell'] dr_filter = element['drug'] From dd14fdb197c408c74c59139ca0fe0ac788277047 Mon Sep 17 00:00:00 2001 From: Hyunseung Yoo Date: Fri, 23 Aug 2019 00:43:13 -0500 Subject: [PATCH 079/331] use plangen api --- Pilot1/Uno/plangen.py | 1489 +++++++++++++++++++++++++++++++++++++ Pilot1/Uno/topN_to_uno.py | 87 ++- 2 files changed, 1560 insertions(+), 16 deletions(-) create mode 100644 Pilot1/Uno/plangen.py diff --git a/Pilot1/Uno/plangen.py b/Pilot1/Uno/plangen.py new file mode 100644 index 00000000..5eccdcca --- /dev/null +++ b/Pilot1/Uno/plangen.py @@ -0,0 +1,1489 @@ + +from collections import deque +from collections import namedtuple +from enum import Enum +import glob +import itertools as it +import json +import numpy as np +import os +import sys +import sqlite3 +from sqlite3 import Error as db_Error + +# import planargs + +from abc import ABC, abstractmethod # abstract class support +from collections import OrderedDict +from scipy.special import comb +from pprint import pprint as pp +from datetime import datetime + +ISO_TIMESTAMP = "seconds" # timestamp to ISO string +ISO_TIMESTAMP_ENCODE = "%Y-%m-%dT%H:%M:%S" # ISO string to timestamp +DEBUG_SQL = False + +def isempty(path): + """Determine whether the given directory is empty.""" + flist = glob.glob(os.path.join(path,'*')) + return flist == [] + + +def validate_args(args): + """Validate the execution arguments as defined in planargs.py. + + This function validates input arguments defined in the 'args' namespace. + The inputs are lists series of feature-set names (fs_names), files + (fs_paths) and partitioning attributes (fs_parts). fs_names and fs_files + must designate the same number of parameters. For example: + + --fs_names CELL DRUG --fs_paths cells.txt drugs.txt + + The CELL name is paired with the cells.txt file, DRUG with drugs.txt, etc. + Currently, this one for one correspondence also applies to the fs_part arg, + which specifies the number of partitions the feature-set list is broken + into at every level of the plan generation recursion. A complete example + might look like this: + + --fsnames CELL DRUG --fs_paths cells.txt drugs.txt --fs_parts 2 2 + + An output directory for the plan in any of its formats is given by out_dir. + An input directory may be specified via in_dir to simplify the coding of + fs_paths. Otherwise, feature-set files must be fully specified. Each of the + files is read and returned. + + Returns + Upon success, a tuple is returned. It contains: + + t[0] - the generator class implementing the appropriate partition() + function. + + t[1] - a list of feature-set entry lists is returned. All entries + are stripped of white-space, all white-space lines have been removed. + For example: + + [[CELL1 ... CELLn] [DRUG1 ... DRUGn]] + + Additionally, an args.lines list is created where each entry contains + the entry count of the corresponding fs_paths file argument. 
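One caution on the "code cleanup" patch above, offered as an editorial observation rather than a change in the series: on a pandas Series, "is None" is a Python identity test and always yields a single False, whereas the replaced "== None" produced an element-wise all-False Series. The later _mask | __mask still broadcasts correctly either way, but the two expressions are not equivalent:

    import pandas as pd

    s = pd.Series(['CL1', 'CL2'])
    print(s is None)                            # False, one plain bool
    print((s == None).tolist())                 # [False, False], element-wise
    print((False | s.isin(['CL1'])).tolist())   # [True, False], the OR still broadcasts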
+ """ + params = {} + verbose = args.verbose + + fs_names_len = len(args.fs_names) + fs_paths_len = len(args.fs_paths) + fs_parts_len = len(args.fs_parts) + + nbr_feature_sets = fs_names_len + test_lengths = [fs_names_len, fs_paths_len, fs_parts_len] + reqd_lengths = [nbr_feature_sets] * 3 + + if test_lengths != reqd_lengths: + sys.exit("Error: The lengths of all feature set definition args (fs_<>) must be identical") + + if nbr_feature_sets <= 1: + sys.exit("Error: Partitioning requires multiple feature sets") + + for nparts in args.fs_parts: + if nparts < 1 or nparts >= 8: + sys.exit("Error: Invalid partitioning value %d" % nparts) + + # validate input and output directories + if args.in_dir and not os.path.isdir(args.in_dir): + sys.exit("Error: --in_dir must designate a directory, '%s' is not valid" % args.in_dir) + + if not os.path.isdir(args.out_dir): + sys.exit("Error: --out_dir must designate a directory, '%s' is not valid" % args.out_dir) + + if not args.overwrite and not isempty(args.out_dir): + sys.exit("Error: --out_dir '%s' is not empty, --overwrite not specified" % args.out_dir) + + if verbose: + print("Writing plan information to %s" % os.path.abspath(args.out_dir)) + + # expand, validate and load input feature-set content lists + fs_content = [] + args.fs_lines = [] + file_error = False + if args.in_dir == None: + args.in_dir = '' # prepare for use in os.path.join() + + for i, path in enumerate(args.fs_paths): + fullpath = os.path.join(args.in_dir, path) + if not os.path.exists(fullpath): + file_error = True + print("Error: %s file not found" % fullpath) + else: + with open(fullpath, 'r') as f: # read text and sanitize + raw_lines = f.readlines() + + text = [line.strip() for line in raw_lines] + text = [l for l in text if l != ''] + fs_content.append(text) + args.fs_lines.append(len(text)) + + if verbose: + print("Loading '%s' feature set definition from %s - %d lines" + % (args.fs_names[i], fullpath, len(text))) + + if file_error: + sys.exit("Terminating due to error") + + # construct a partitioning object exporting a partion() function + if args.partition_strategy == 'leaveout': + generator = LeaveoutSubsetGenerator() + + # return feature-set contents lists + return generator, fs_content + + +class SubsetGenerator(ABC): + """Abstract class implementing a data partitioning method. + + The SubsetGenerator class provides a template for subclasses that implement + mechanisms for dividing sets of lists into sublists for the purpose of + defining unique ML training and validation sets. + + Subclasses must implement those methods defined as @abstractmethod. + The validate() function provided here does a sanity test for all anticipated + partitioning schemes. Subclasses should implement their specializations. + """ + + def __init__(self, name=''): + self.name = name + self.term_msg = "Terminating due to error" + + @abstractmethod + def partition( + self, + base, + size=None, + count=None, + name='-unspecified-' + ): + """Partition a feature-set array. + + Partition the 'base', a list of elements, using the abstract arguments + 'size' and 'count' to tailor the implementation's algorithm. 'name' is + used in error reporting and is optional. + """ + validate(self, base, size, count, name) + return [] + + def get_plan_label(self, plan_dict, root_name): + root = plan_dict[root_name] + return root['label'] + + def _validation_error(self, base_len, size, count, name='-unspecified-'): + """Provide a common error reporting function. 
""" + print("Base list length: %d requested %d sublists of length %d" % + (base_len, count, size)) + + def validate(self, base, size=None, count=None, name='-unspecified-'): + """Provide basic request validation, specific generators may impose + additional requirements. + """ + berror = False + base_len = len(base) + + if size == None or size <= 0 or size > base_len: + berror = True + else: + unique_combos = comb(base_len, size) # implements N take K + if count > unique_combos: + berror = True + if berror: + SubsetGenerator._validation_error(self, base_len, size, count, name) + + return not berror + +# +# UNDER EVALUATION ????????????????????????????????????????????????????? +# + +class IterativeSubsetGenerator(SubsetGenerator): + """ Tom Brettin method... subset generation via iteration over base""" + def __init__(self): + SubsetGenerator.__init__(self, 'IterativeSubsetGenerator') + + def partition(self, base, size=None, count=0, name=None): + """ """ + + if size is None: + print("Error: Unspecified list partitioning size") + sys.exit(3) + + """ + base_len = len(base) + if count == 0: # a simplification useful in the iterative approach + count = base_len + """ + + is_valid = SubsetGenerator.validate(self, base, size, count, name) + if not is_valid: + print(self.term_msg) + sys.exit(1) + + if count > base_len: + SubsetGenerator._validation_error(self, base_len, size, count, name) + print(self.term_msg) + sys.exit(2) + + np_base = np.array(base) + selected_sublists = [] + omit_size = base_len - size + increment = min(size, omit_size) + + # omit consecutive blocks of feature-name entries + for i in range(count): + org = i * increment + if org >= base_len: + org = org % base_len + if org == 0 and i > 0: + print("Warning: %d sublists of %s completed short of the requested %d" + % (i, name, count)) + break + + end = org + size + sublist = np_base.take(range(org, end), mode='wrap') + print(sublist) + selected_sublists.append(sublist) + + return selected_sublists + + +class LeaveoutSubsetGenerator(SubsetGenerator): + """CANDLE milestone 13 style feature set partitioning. + + All SubsetGenerator subclasses are required to implement partition(), + plan_init() and plan_term() functions. + """ + + def __init__(self): + SubsetGenerator.__init__(self, 'LeaveoutSubsetGenerator') + self.strategy = "leaveout" + + def plan_init(self, fs_names, fs_paths, fs_lines, fs_parts, maxdepth, root_name='1'): + """Initialize - collect plan metadata """ + currtime = datetime.now() + details = {'fs_names': fs_names, 'fs_filepaths':fs_paths, 'fs_parts': fs_parts} + details['create_date'] = currtime.isoformat(timespec=ISO_TIMESTAMP) + details['strategy'] = self.strategy + + label = '' + for i in range(len(fs_names)): + if i != 0: + label += '_' + s = '{}{}-p{}'.format(fs_names[i], fs_lines[i], fs_parts[i]) + label += s + + if maxdepth > 0: + label += '-maxdepth{}'.format(maxdepth) + + details['label'] = label + plan_dict = OrderedDict() + plan_dict[root_name] = details + return root_name, plan_dict + + def plan_term(self, plan_dict, root_name, nbr_subplans): + """Completion - post plan summary metadata """ + meta = plan_dict[root_name] + meta['nbr_subplans'] = nbr_subplans + + + def partition(self, base, size='n/a', count=None, name=None): + """Partition a feature-set list into lists of equal sized elements. + + This partitioner accepts a list of feature-set names and returns + 'count' lists, the elements evenly divided between these lists. 
+ The last sublist will contain more or fewer elements if the base + list cannot be evenly divided. + + Args + base: A list of feature-set names. + size: Ignored, not used in this implementation. + count: The number of equal sized partitions requested, required. + name: A tag used for debug/error tracing. Not used in this + implementation. + + These arguments are common to all partition functions defined in + SubsetGenerator subclasses. + + Returns + When the input 'base' list contains a number of entries equal to or + greater than 'count', a list of 'count' sublists is returned. For + example: + + [[CELL1, ..., CELL4], [CELL5, ..., CELL7]] + + Otherwise the base list is returned as a list of lists, each list + containing one feature from the input list. This implementation + maintains compatibility with the "standard" return format discussed + above. + """ + + base_len = len(base) + if base_len < count: # can partition any further? + return [[feature] for feature in base] + + size = base_len // count + sublists = [] + + for i in range(count): + org = i * size + end = org + size + if i != count - 1: + part = base[org:end] + else: + part = base[org:] + sublists.append(part) + + return sublists + +#------------------------------------------------------------------------------ +# Database support, table and column definitions, DDL and DML +# Refer to the plan_prep() function for a discussion of the "planstat" and +# "runhist" tables defined below. +#------------------------------------------------------------------------------ + +class RunType(Enum): + RUN_ALL = 0 + RESTART = 1 + +class RunStat(Enum): # subplan execution status + SCHEDULED = 'scheduled' + COMPLETE = 'complete' + +# planstat table, rows are returned via the PlanstatRow namedtuple + +_planstat_ddl = """ + CREATE TABLE IF NOT EXISTS planstat ( + plan_name TEXT NOT NULL PRIMARY KEY, + create_date TEXT NOT NULL, + feature_sets TEXT NOT NULL, + partitions TEXT NOT NULL, + nbr_subplans INTEGER + ); """ + +PlanstatRow = namedtuple('PlanstatRow', + [ + 'rowid', + 'plan_name', + 'create_date', + 'feature_sets', + 'partitions', + 'nbr_subplans' + ] +) + +_select_row_from_planstat = """ + SELECT rowid, + plan_name, create_date, feature_sets, partitions, nbr_subplans + FROM planstat + WHERE plan_name='{}' + """ + +_insert_planstat_plan = """ + INSERT INTO planstat ( + plan_name, create_date, feature_sets, partitions, nbr_subplans) + VALUES ('{}', '{}', '{}', '{}', {}) + """ + +_delete_planstat_plan = """ + DELETE FROM planstat where rowid = {} + """ + +# runhist table, rows are returned via the RunhistRow namedtuple + +_runhist_ddl = """ + CREATE TABLE IF NOT EXISTS runhist ( + plan_id INTEGER NOT NULL, + subplan_id TEXT NOT NULL, + status TEXT NOT NULL, + start_time TEXT NOT NULL, + stop_time TEXT, + run_mins INT, + mae REAL, + mse REAL, + r_square REAL, + other_info TEXT, + weights_fn TEXT, + PRIMARY KEY (plan_id, subplan_id) + ); """ + +RunhistRow = namedtuple('RunhistRow', + [ + 'plan_id', + 'subplan_id', + 'status', + 'start_time', + 'stop_time', + 'run_mins', + 'mae', + 'mse', + 'r_square', + 'other_info', + 'weights_fn' + ] +) + +_select_row_from_runhist = """ + SELECT plan_id, subplan_id, status, + start_time, stop_time, run_mins, + mae, mse, r_square, other_info, weights_fn + FROM runhist + WHERE plan_id = {} and subplan_id = '{}' + """ + +_insupd_scheduled_runhist = """ + REPLACE INTO runhist(plan_id, subplan_id, status, start_time, + stop_time, run_mins, mae, mse, r_square, other_info, weights_fn) + VALUES({}, '{}', '{}', '{}', 
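Returning to LeaveoutSubsetGenerator.partition() described above, a quick check of its behavior, assuming plangen.py is importable and using toy cell names:

    from plangen import LeaveoutSubsetGenerator

    cells = ['CELL1', 'CELL2', 'CELL3', 'CELL4', 'CELL5', 'CELL6', 'CELL7']
    gen = LeaveoutSubsetGenerator()

    print(gen.partition(cells, count=2))
    # [['CELL1', 'CELL2', 'CELL3'], ['CELL4', 'CELL5', 'CELL6', 'CELL7']]
    print(gen.partition(cells, count=8))
    # fewer items than partitions: one feature per sublist, [['CELL1'], ['CELL2'], ...]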
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL) + """ + +_insupd_completed_runhist = """ + UPDATE runhist SET + status = '{}', + stop_time = '{}', + run_mins = {}, + mae = {}, + mse = {}, + r_square = {}, + other_info = '{}', + weights_fn = '{}' + WHERE + plan_id = {} AND subplan_id='{}' + """ + +_delete_from_runhistory = """ + DELETE FROM runhist where plan_id = {} + """ + +#------------------------------------------------------------------------------ +# "Plan management" Database functions +# +# db_connect - establish database connection returning conn handle +# execute_sql_stmt - execute a SQL statement with optional error trap +# plan_prep - prepare for the execution of a multi-step "plan" +# start_subplan - start a subplan, (ex. '1.4.8'), write RunhistRow +# stop_subplan - stop a subplan, update RunhistRow +# get_subplan_runhist - return a RunhistRow for a given subplan +# plan_remove - remove all database records for the named plan +#------------------------------------------------------------------------------ + +def execute_sql_stmt(conn, stmt, cursor=None, trap_exception=False): + """Execute a SQL statement. + + This is a convenience function that wraps the execution of a given SQL + statement with exception handling and cleanup logic. + + Args + conn: An open database connection handle + stmt: A fully instantiated SQL statement + + cursor: Optionally, a cursor managed by the caller. If + local cursor is used. Provide a cursor if you must + operate on it after completion, fetchall() for example. + + trap_exception: By default exceptions raised by the database must be + handled by the caller. If True, errors are reflected + by the boolean return value and the cursor and/or + connection handle provided by the caller are closed.. + + Returns + False indicates that an exception occurred, else True. + """ + + if cursor: + lclcsr = cursor + else: + lclcsr = conn.cursor() + try: + if DEBUG_SQL: + with open("plangen_db.log", "a") as fp: + fp.write("STMT: " + stmt + "\n") + + db_exception = False + lclcsr.execute(stmt) + + except db_Error as e: + db_exception = True + print('execute_sql_stmt:', stmt) + print('execute_sql_stmt:', e) + if not trap_exception: + raise + finally: + if not cursor: + lclcsr.close() + + if db_exception: + if cursor: + cursor.close() + conn.close() + + return not db_exception + + +def db_connect(db_path): + """Connect to the plan management database. + + Establish a connection to the sqlite3 database contained in the named file. + A plan management database is created and populated at db_path if the file + does not exist. + + Args + db_path: A relative or absolute path or ":memory:" + + Returns + A connection handle is returned upon success, else None + """ + + if db_path == ':memory:' or not os.path.exists(db_path): + prev_allocated = False + else: + prev_allocated = True + + try: + conn = sqlite3.connect(db_path) + except db_Error as error: + print('db_connect', error) + raise + + # create plan management tables on initial database allocation + if conn and not prev_allocated: + complete = execute_sql_stmt(conn, _planstat_ddl) + complete &= execute_sql_stmt(conn, _runhist_ddl) + + if complete: + conn.commit() + else: + conn.close() + conn = None + return conn + + +def plan_remove(db_path, plan_path): + """Delete the named plan from the plan managment database. + + The relative plan name is extracted from the plan_path by removing the + leading directories and the trailing filetype suffix from the given + plan_path. 
The planstat row is retrieved and the associated rowid is + the plan_id identifying the target runhist table rows. + + Returns + Zero indicates deletion complete, -1 if the plan name is not matched. + """ + + status = 0 + conn = db_connect(db_path) + plan_key = _get_planstat_key(plan_path) + stmt = _select_row_from_planstat.format(plan_key) + csr = conn.cursor() + execute_sql_stmt(conn, stmt, cursor=csr) + nrow = csr.rowcount + row = csr.fetchone() + + print("%d run history rows deleted" % nrow) + + if not row: + print("Error: CLEANUP request failed - %s has not been run" % plan_key) + status = -1 + else: + plan_rec = PlanstatRow._make(row) # column-name addressable + rowid = plan_rec.rowid # the unique rowid is the plan uniquifier + _delete_runhistory(conn, rowid) + stmt = _delete_planstat_plan.format(rowid) + status = execute_sql_stmt(conn, stmt) + + csr.close() + conn.close() + return status + + +def plan_prep(db_path, plan_path, run_type=RunType.RUN_ALL): + """Prepare to run a plan, a hierarchy of interdependent subplans. + + Plan names and related information are stored in the planstat (PLAN STATUS) + table. There is one row for each plan submitted. A positive, unique integer + called the 'rowid' is assigned to table rows by the database manager. The + rowid of a planstat table row is defined here as the "plan_id". The plan_id + together with a textual "subplan_id" (example: '1.2.4') form a composite + key that is the primary key of the runhist (RUN HISTORY) table. The purpose + of this function is to register the plan and return the associated plan_id. + + RunTypes + When a new plan is presented it is registered in the planstat table and + during its execution a large number of runhist (RUN HISTORY) table + entries are created and then updated. To prevent unintended loss of + data one of the following "RunTypes" is specified on the initial + plan_prep() call and again on subsequent start_subplan() calls. + + Specify RUN_ALL on the first attempt to run a plan. If the plan name + is already registered, the request fails and neither the planstat or + runstat tables are changed. + + Specify RESTART if a prior attempt to run a plan did not complete. The + presence of a corresponding planstat record is verified. start_subplan() + returns a SKIP status if the associated runhist row (if any) is marked + COMPLETE. + + Args + db_path: plan management database path (relative or absolute) + plan_path: JSON plan file (relative or absolute) + run_type: RunType.RUN_ALL, the default, or RunType.RESTART + + Returns + A negative value indicates a fatal error. + + Otherwise the integer returned is the plan_id used together with a + subplan_id string used in subsequent start_subplan(), stop_subplan() + and get_subplan_hist() calls. 
+ """ + + # load the plan and retrieve identity info + plan_dict = load_plan(plan_path) + create_date = get_plan_create_date(plan_dict) + feature_sets = get_plan_fs_names(plan_dict) + partitions = get_plan_fs_parts(plan_dict) + nbr_subplans = get_plan_nbr_subplans(plan_dict) + + # de termine if a plan of the given name has already been registered + conn = db_connect(db_path) + plan_key = _get_planstat_key(plan_path) + stmt = _select_row_from_planstat.format(plan_key) + csr = conn.cursor() + execute_sql_stmt(conn, stmt, cursor=csr) + row = csr.fetchone() + + if not row: + rowid = -1 + else: + plan_rec = PlanstatRow._make(row) # column-name addressable + rowid = plan_rec.rowid # the unique rowid will be the uniquifier returned + + # compare run_type to initial expectations + error = False + + if run_type == RunType.RUN_ALL and rowid > 0: + print("Error: RUN_ALL specified but plan: %s has already been defined" % plan_key) + error = True + + elif run_type == RunType.RESTART and rowid < 0: + print("Warning: RESTART specified but plan: %s has not been previously run" % plan_key) + + elif rowid > 0 and create_date != create_date: # DEBUG ???????????????????????????????????? plan_rec.create_date: + print("Error: RESTART specified but the signature of the previously defined plan: %s does not match" % plan_key) + error = True + + # register new plans acquiring the uniquifying plan_id used to compose runhistory table keys + if not error and rowid < 0: + feature_sets = str(feature_sets) + feature_sets = feature_sets.replace("'", "") # create string literal from list of str + partitions = str(partitions) # create string literal from list of int + + stmt = _insert_planstat_plan.format( + plan_key, + create_date, + feature_sets, + partitions, + nbr_subplans + ) + + status = execute_sql_stmt(conn, stmt, cursor=csr) + rowid = csr.lastrowid + + # cleanup resources and return uniquifier or error indicator + csr.close() + conn.commit() + + if error: + return -1 + else: + return rowid + + +def start_subplan(db_path, plan_path, plan_id=None, subplan_id=None, run_type=None): + """Schedule the execution of a subplan. + + This function writes a RunhistRow record to the runhist table indicating that + the named plan/subplan has been SCHEDULED. The row includes the "start time". + If the given run_type is RESTART, it is possible that the subplan has already + run, as indicated by the status returned. + + Args + db_path: plan management database path (relative or absolute) + plan_path: JSON plan file (relative or absolute) + plan_id: the plan identifier returned by plan_prep() + subplan_id the subplan identifier ex. '1 4.8' + run_type: RunType.RUN_ALL or RunType.RESTART + + Returns + Zero indicates that a RunhistRow record has been created to represent + the subplan. -1 is returned from a RESTART call if the a RunhistRow + already exists for the plan/subplan and is marked COMPLETE. 
+    """
+
+    conn = db_connect(db_path)
+    csr = conn.cursor()
+    skip = False
+
+    # skip previously completed work if RESTART
+    if run_type == RunType.RESTART:
+        stmt = _select_row_from_runhist.format(plan_id, subplan_id)
+        execute_sql_stmt(conn, stmt, cursor=csr)
+        row = csr.fetchone()
+
+        if row:
+            runhist_rec = RunhistRow._make(row)
+            if runhist_rec.status == RunStat.COMPLETE.name:
+                skip = True
+
+    # construct/reinit a new runhist record
+    if not skip:
+        currtime = datetime.now()
+        start_time = currtime.isoformat(timespec=ISO_TIMESTAMP)
+
+        stmt = _insupd_scheduled_runhist.format(
+            plan_id,
+            subplan_id,
+            RunStat.SCHEDULED.name,
+            start_time
+        )
+
+        execute_sql_stmt(conn, stmt, cursor=csr)
+
+    csr.close()
+    conn.commit()
+    conn.close()
+
+    if skip:
+        return -1
+    else:
+        return 0
+
+
+def stop_subplan(db_path, plan_id=None, subplan_id=None, comp_info_dict={}):
+    """Complete the execution of a subplan.
+
+    This function updates the RunhistRow record created by start_subplan(),
+    setting the status to COMPLETE, the completion timestamp, and the "user
+    fields" (such as MAE, MSE, R2) returned by the model.
+
+    A comp_dict dictionary is populated with the names and default values
+    for columns implemented in the RunhistRow table. Values matching those
+    names are extracted from comp_info_dict and written to the table.
+
+    Args
+        db_path:    plan management database path (relative or absolute)
+        plan_id:    the plan identifier returned by plan_prep()
+        subplan_id: the subplan identifier, e.g. '1.4.8'
+        comp_info_dict: supplemental completion data dictionary
+    """
+
+    conn = db_connect(db_path)
+    csr = conn.cursor()
+    curr_time = datetime.now()
+    stop_time = curr_time.isoformat(timespec=ISO_TIMESTAMP)
+
+    comp_dict = dict(mae=0.0, mse=0.0, r_square=0.0, weights_fn='N/A', unprocessed='')
+    remainder = _acquire_actuals(comp_dict, comp_info_dict)
+
+    if len(remainder) == 0:
+        other_info = ''
+    else:
+        other_info = json.dumps(remainder)
+
+    # fetch row to retrieve schedule info
+    stmt = _select_row_from_runhist.format(plan_id, subplan_id)
+    execute_sql_stmt(conn, stmt, csr)
+    row = csr.fetchone()
+
+    if row:     # expected, caller error if already marked COMPLETE
+        runhist_rec = RunhistRow._make(row)
+        if runhist_rec.status != RunStat.COMPLETE.name:
+            start_time = datetime.strptime(runhist_rec.start_time, ISO_TIMESTAMP_ENCODE)
+            duration = curr_time - start_time
+            run_mins = int((duration.total_seconds() + 59) / 60)
+
+            # update runhist record
+            stmt = _insupd_completed_runhist.format(
+                # column values
+                RunStat.COMPLETE.name,
+                stop_time,
+                run_mins,
+                comp_dict['mae'],
+                comp_dict['mse'],
+                comp_dict['r_square'],
+                other_info,
+                comp_dict['weights_fn'],
+                # key spec
+                plan_id,
+                subplan_id
+            )
+
+            execute_sql_stmt(conn, stmt)
+
+    # cleanup
+    csr.close()
+    conn.commit()
+    conn.close()
+
+
+def get_subplan_runhist(db_path, plan_id=None, subplan_id=None):
+    """Return the RunhistRow record for a given plan/subplan.
+
+    Args
+        db_path:    plan management database path (relative or absolute)
+        plan_id:    the plan identifier returned by plan_prep()
+        subplan_id: the subplan identifier, e.g. '1.4.8'
+
+    Returns
+        The RunhistRow associated with the given plan/subplan is returned if
+        found, otherwise None.
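+
+    Example
+        A minimal sketch; the identifiers shown are hypothetical placeholders:
+
+            row = get_subplan_runhist('plan_mgmt.db', plan_id=plan_id,
+                                      subplan_id='1.2')
+            if row is not None:
+                print(row.status, row.start_time)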
+ """ + conn = db_connect(db_path) + stmt = _select_row_from_runhist.format(plan_id, subplan_id) + csr = conn.cursor() + execute_sql_stmt(conn, stmt, csr) + row = csr.fetchone() + + if not row: + plan_rec = None + else: + plan_rec = RunhistRow._make(row) + + return plan_rec + +def _acquire_actuals(dft_dict, actuals_dict): + """Extract values from dictionary overlaying defaults.""" + actuals = actuals_dict.copy() + for key, value in dft_dict.items(): + if key in actuals: + dft_dict[key] = actuals[key] + actuals.pop(key) + + return actuals # possibly empty + + +def _get_planstat_key(plan_path): + """Extract the name portion of a plan from a filepath.""" + basename = os.path.basename(plan_path) + basepfx = basename.split(sep='.') + return basepfx[0] + + +def _delete_runhistory(conn, plan_id): + """Delete RunhistRows containing the given plan_id.""" + csr = conn.cursor() + stmt = _delete_from_runhistory.format(plan_id) + execute_sql_stmt(conn, stmt, cursor=csr, trap_exception=True) + rowcount = csr.rowcount + print("CLEANUP processing removed %d run history records" % rowcount) + csr.close() + return rowcount + + +#------------------------------------------------------------------------------ +# Plan navigation, content retrieval +#------------------------------------------------------------------------------ + +def load_plan(filepath): + """Load a JSON transfer learning plan. + + The named JSON tranfer learning plan file is loaded in a manner that preserves + the entry order imposed when the plan was created. This allows the root entry + to be easily located regardless of the plan entry naming scheme in use. + + Args + filepath: A relative or absolute path to the JSON file. + + Returns + An entry-ordered plan in OrderedDict format is returned. + """ + + with open(filepath, 'r') as f: + ordered_plan_dict = json.load(f, object_pairs_hook=OrderedDict) + return ordered_plan_dict + +def get_plan_create_date(plan_dict): + _, value = _get_first_entry(plan_dict) + return value['create_date'] + +def get_plan_fs_names(plan_dict): + _, value = _get_first_entry(plan_dict) + return value['fs_names'] + +def get_plan_fs_parts(plan_dict): + _, value = _get_first_entry(plan_dict) + return value['fs_parts'] + +def get_plan_nbr_subplans(plan_dict): + _, value = _get_first_entry(plan_dict) + return value['nbr_subplans'] + +def _get_first_entry(ordered_dict): + key, value = next(iter(ordered_dict.items())) + return key, value + +def get_subplan(plan_dict, subplan_id=None): + """Retrieve the content of a named subplan or the root plan. + + Args + plan_dict: The plan dictionary as returned by load_plan(). + subplan_id: The name of the desired subplan. Omit this arg to acquire + the content and name of the plan tree root. + + Returns + A (content, subplan_id) pair is returned. The returned name is useful when + using default arguments to retrieve the root plan. + """ + + if subplan_id is None: + subplan_id, content = _get_first_entry(plan_dict) + else: + content = plan_dict.get(subplan_id) + return content, subplan_id + + +def get_predecessor(plan_dict, subplan_id): + """Acquire the name of the predecessor (parent) of a given subplan. + + The plan tree is a true tree. All subplans have exactly one + predecessor/parent. Use this function to walk 'up' the tree. + + Args + plan_dict: The plan dictionary as returned by load_plan(). + subplan_id: The name of the target subplan. + + Returns + The name of the parent subplan is returned. If the root plan name + is specified None is returned. 
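+
+    Example
+        A minimal sketch assuming the dotted subplan naming scheme:
+
+            get_predecessor(plan_dict, '1.2.4')   # returns '1.2'
+            get_predecessor(plan_dict, '1')       # returns None (root)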
+    """
+
+    segments = subplan_id.split(sep='.')
+    if len(segments) <= 1:
+        subplan_id = None
+    else:
+        segments.pop()
+        subplan_id = '.'.join(segments)
+    return subplan_id
+
+
+def get_successors(plan_dict, subplan_id):
+    """Acquire the names of the successors (children) of a given subplan.
+
+    All subplans other than 'leaf' subplans have at least one successor. Use
+    this function to walk 'down' one or more plan subtrees.
+
+    Args
+        plan_dict:  The plan dictionary as returned by load_plan().
+        subplan_id: The name of the target subplan.
+
+    Returns
+        A list of the names of all successors (children) of the given subplan
+        is returned. The list may be empty.
+    """
+    successor_names = []
+    for i in it.count(start=1):
+        new_name = subplan_id + '.' + str(i)
+        value = plan_dict.get(new_name)
+        if not value:
+            break
+        successor_names.append(new_name)
+
+    return successor_names
+
+
+def _get_named_set(plan_dict, subplan_id, section_tag, fs_name, collector, parent_features=None):
+    """Collect the feature lists named fs_name from the given section of a
+    subplan, optionally walking up through its parent subplans."""
+
+    while True:
+        content, _ = get_subplan(plan_dict, subplan_id)
+        assert(content)
+
+        section = content[section_tag]
+        for i, section_features in enumerate(section):
+            feature_list = section_features[fs_name]
+            collector.insert(i, feature_list)
+
+        if not parent_features:
+            break
+
+        # visit parent node, root has no feature information and ends upward traversal
+        subplan_id = get_predecessor(plan_dict, subplan_id)
+        grand_parent_id = get_predecessor(plan_dict, subplan_id)
+
+        if not grand_parent_id:
+            break
+
+
+def get_subplan_features(plan_dict, subplan_id, parent_features=False):
+    """Return train and validation features associated with a named subplan.
+
+    Args
+        plan_dict:  The plan dictionary as returned by load_plan().
+        subplan_id: The name of the target subplan
+        parent_features: True or False
+
+    Returns
+        The result is a four-tuple (t0, t1, t2, t3) constructed as follows.
+        Some applications may choose to discard some of the returns, t0 and
+        t1, for example.
+
+        t0 - the result dictionary which is disassembled as follows
+        t1 - a list of feature names found in the train/validate sets
+        t2 - training feature set dictionary as described below
+        t3 - validation feature set dictionary as described below
+
+        t2 and t3 are dictionaries that represent one or more training sets
+        and one or more validation sets, respectively. The key of each entry
+        is a feature-set name as returned in the t1 list, ['cell', 'drug'] for
+        example. The value of each is a list of lists.
+
+        Consider a training feature set dictionary returned as follows:
+
+        {
+            'cell': [[C1, C2, C3, C4], [C5, C6, C7, C8]],
+            'drug': [[D1, D2], [D3, D4]]
+        }
+
+        The feature sets defined here are the combination of (cell[0], drug[0])
+        and (cell[1], drug[1]). The lengths, i.e. the number of sublists in each
+        dictionary entry, are always equal.
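+
+    Example
+        A minimal sketch; the subplan name is a hypothetical placeholder:
+
+            result, fs_names, train, val = get_subplan_features(
+                plan_dict, '1.2', parent_features=False)
+            # e.g. fs_names == ['cell', 'drug']; train['cell'] and
+            # train['drug'] are parallel lists of feature-name lists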
+ """ + + # acquire feature_set names populated in the plan + content, _ = get_subplan(plan_dict, subplan_id) + if not content: + return None, None + + # peek inside the training set to capture active feature-set names + train_set = content['train'][0] + fs_names = [name for name in train_set.keys()] + + # categorize the results + result = {} + result[0] = fs_names + result['train'] = {} + result['val'] = {} + + for set_name, pf in [('train', True), ('val', False)]: + if pf == True: + pf = parent_features + + for fs_name in fs_names: + collector = [] + _get_named_set( + plan_dict, + subplan_id, + set_name, + fs_name, + collector, + parent_features=pf + ) + + result[set_name][fs_name] = collector + + return result, result[0], result['train'], result['val'] + +#------------------------------------------------------------------------------ +# Plan construction +#------------------------------------------------------------------------------ + +def build_dictionary_from_lists(seq_list, names): + """Create a dictionary with 'names' as labels and 'seq_list' values.""" + dict = {} + for seq, tag in zip(seq_list, names): + dict[tag] = list(seq) + return dict + + +def build_plan_tree(args, feature_set_content, parent_plan_id='', depth=0, data_pfx='', plan_pfx=''): + """Generate a plan supporting training, transfer-learning, resume-training. + + ADD GENERAL DOC + + This function is recursive. + + Arguments: + args: A namespace capturing the values of command line arguments + and parameter values derived from those arguments. Refer to + validate_args(). + + feature_set_content: This is a list of sublists, where each sublist + contains the names of the nth group of feature-set elements. + + parent_plan_id: This is the name of the parent's plan. The name + is extended with '.nn' at each level of the recursion to + ensure that parentage/liniage is fully conveyed in each + (subplan) plan_id. + + depth: Specify 0 on the root call. This arg can be used to + determine/set the current level of the recursion. + + data_pfx: Reserved for constructing feature-set name files. + plan_pfx: Reserved for constructing plan control files. + + Returns + args.plan_dict contains a dictionary representing the plan. This may be + JSONized. + + The number of planning steps (nbr of subplans in the plan tree) is explicitly + returned. 
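+
+    Example
+        A minimal sketch; args is assumed to have been prepared by
+        planargs.parse_arguments()/validate_args() and plan_init(), and
+        cell_names/drug_names stand in for real feature-name lists:
+
+            steps = build_plan_tree(args, [cell_names, drug_names],
+                                    parent_plan_id=root_name,
+                                    data_pfx='DATA.1', plan_pfx='PLAN.1')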
+ """ + curr_depth = depth + 1 + if args.maxdepth > 0 and curr_depth >= args.maxdepth: + return 0 + + all_parts = [] + + #flat_partitions = [] # preserve, used for file-based approach + #files = [] # preserve, used for file-based approach + #sequence = 0 # preserve, used for file-based approach + xxx = False + + for i in range(len(args.fs_names)): + group = feature_set_content[i] + count = args.fs_parts[i] + feature_set_name = args.fs_names[i] + partitions = args.generator.partition(feature_set_content[i], count=count) + all_parts.append(partitions) + + # acquire a cross-product of all feature-set partitions + parts_xprod = np.array(list(it.product(*all_parts))) + steps = len(parts_xprod) + + if steps > 1: + substeps = 0 + for step in range(steps): + train = [] + val = [] + + # split into validation and training components + for i, plan in enumerate(parts_xprod): + section = build_dictionary_from_lists(plan, args.fs_names) + if i == step: + val.append(section) + else: + train.append(section) + + # generate next depth/level (successor) plans + curr_plan_id = '{}.{}'.format(parent_plan_id, step + 1) + args.plan_dict[curr_plan_id] = {'val': val, 'train': train} + data_name = '{}.{}'.format(data_pfx, step + 1) + plan_name = '{}.{}'.format(plan_pfx, step + 1) + + # depth-first, shorthand representation of tree showing first feature names + if args.debug: + indent = ' ' * (depth * 4) + print(indent, curr_plan_id) + indent += ' ' * 4 + fs = parts_xprod[step] + for i in range(len(fs)): + print(indent, args.fs_names[i], 'count:', len(fs[i]), 'first:', fs[i][0]) + + substeps += build_plan_tree( + args, + parts_xprod[step], + parent_plan_id=curr_plan_id, + depth=curr_depth, + data_pfx=data_name, + plan_pfx=plan_name + ) + + steps += substeps + return steps + + """ + # THIS IS A WORK-IN-PROGRESS ... 
GENERATING FILES FOR DATA AND PLAN + + files.append([]) + files_ndx = len(files) - 1 + + for j in range(len(partitions)): + part = partitions[j] + flat_partitions.append(part) + if len(part) == 0: + sys.exit("big trouble ?????????????") + + sequence += 1 + file_name = '{}.{}.{}'.format(data_pfx, sequence, feature_set_name) + print("writing file %s with %d entries" % (file_name, len(part))) # write out 'part' + #write_file(file_name, part) + pair = (feature_set_name, file_name) + files[files_ndx].append(pair) + + file_xprod = np.array(list(it.product(*files))) + nbr_plans = len(file_xprod) + + for seq in range(nbr_plans): + plan_string = '' + + for ndx, curr in enumerate(file_xprod): + if ndx == seq: + plan_string += '--val (' + else: + plan_string += '--inc (' + for (tag, fname) in curr: + plan_string += '{}-{} '.format(tag, fname) + plan_string += ')' + + file_name = '{}.{}'.format(plan_pfx, seq + 1) + print(file_name) + plan_lines = list(plan_string) + #write_file(file_name, plan_lines) + + # construct list of omitted feature entries + + for seq in range(nbr_plans): + omitted_feature_content = [] + org = 0 + + for i in partition_spec: + omitted_feature_content.append(flat_partitions[org]) + org = i + + data_name = '{}.{}'.format(data_pfx, seq + 1) + plan_name = '{}.{}'.format(plan_pfx, seq + 1) + + steps = build_plan_tree( + args, + omitted_feature_content, + parent_plan_id=curr_plan_id, + depth=curr_depth, + data_pfx=data_name, + plan_pfx=plan_name + ) + return + """ + +def write_file(fname, title, string_list): + """Write text expressed as an array of lines to file.""" + with open(fname, 'w') as f: + for line in string_list: + f.write(line) + +def write_dict_to_json(dictionary, fname): + """Write dictionary to a json file.""" + with open(fname, 'w') as f: + json.dump(dictionary, f) + +#---------------------------------------------------------------------------------- +# various hard-coded lists, test cases - the synthetic feature-sets remain useful +#---------------------------------------------------------------------------------- + +""" +synthetic_cell_names = ['cell_' + '%04d' % (x) for x in range(1000)] +synthetic_drug_names = ['drug_' + '%04d' % (x) for x in range(1000)] +""" + +#---------------------------------------------------------------------------------- +# mainline +#---------------------------------------------------------------------------------- + +def main(): + # Acquire and validate arguments + args = planargs.parse_arguments() + args.json = True # the only available option thus far + + generator, feature_set_content = validate_args(args) + args.generator = generator + + root_name, args.plan_dict = generator.plan_init( + fs_names = args.fs_names, # validated cmdline arg + fs_paths = args.fs_paths, # validated cmdline arg + fs_lines = args.fs_lines, # created by validate_args + fs_parts = args.fs_parts, # validated cmdline arg + maxdepth = args.maxdepth + ) + + # feature_set_content = [cell_names, drug_names] + # feature_set_content = [synthetic_cell_names, synthetic_drug_names] + + # remove by-1 dimensions, they do not need to be represented in the plan explicitly + while True: + try: + ndx = args.fs_parts.index(1) + args.fs_names.pop(ndx) + args.fs_paths.pop(ndx) + args.fs_lines.pop(ndx) + args.fs_parts.pop(ndx) + except ValueError: + break + + # Plan generation + data_fname_pfx = os.path.join(args.out_dir, 'DATA.1') + plan_fname_pfx = os.path.join(args.out_dir, 'PLAN.1') + + steps = build_plan_tree( + args, # command line argument namespace + feature_set_content, # 
for example [[cell1 ... celln] [drug1 ... drugn]] + parent_plan_id=root_name, # name of root plan, subplan names created from this stem + data_pfx=data_fname_pfx, # DATA file prefix, building block for feature name files + plan_pfx=plan_fname_pfx # PLAN file prefix, building block for plan name files + ) + + generator.plan_term(args.plan_dict, root_name, steps) + print("Plan generation complete, total steps: %d" % steps) + + if args.json: + label = args.generator.get_plan_label(args.plan_dict, root_name) + qualified_name = 'plangen_' + label + '.json' + json_file_name = os.path.join(args.out_dir, qualified_name) + json_abspath = os.path.abspath(json_file_name) + write_dict_to_json(args.plan_dict, json_abspath) + print("%s JSON file written" % json_abspath) + + if args.print_tree: + print("Plan dictionary generated") + pp(args.plan_dict, width=160) # DEBUG comment this out for large plans + + if args.test: + test1(json_abspath, "test1_sql.db") + # test2(json_abspath, "test2_sql.db") + +#---------------------------------------------------------------------------------- +# test plan navigation and subplan entry retrieval +#---------------------------------------------------------------------------------- + +def test2(plan_path, db_path): + run_type = RunType.RESTART + #run_type = RunType.RUN_ALL + + plan_name = os.path.basename(plan_path) + plan_id = plan_prep(db_path, plan_name, run_type) + + plan_dict = load_plan(plan_path) + metadata, root_name = get_subplan(plan_dict) + + queue = deque() + queue.append(root_name) + + print("Test2 start") + for iloop in it.count(start = 0): + if len(queue) == 0: + print("Test2 complete - proc loop count: %d" % iloop) + break + + curr_subplan = queue.popleft() + successor_names = get_successors(plan_dict, curr_subplan) + for successor in successor_names: + queue.append(successor) + + if len(curr_subplan) == 1: + continue + + status = start_subplan( + db_path, + plan_path, + plan_id=plan_id, + subplan_id=curr_subplan, + run_type=run_type + ) + + if status < 0: + continue + + completion_status = dict(mse=1.1, mae=2.2, r_square=.555) + + stop_subplan( + db_path, + plan_id=plan_id, + subplan_id=curr_subplan, + comp_info_dict=completion_status + ) + print("Completing subplan %6d" % iloop) + +#---------------------------------------------------------------------------------- +# +def test1(plan_path, db_path): + run_type = RunType.RESTART + #run_type = RunType.RUN_ALL + + plan_name = os.path.basename(plan_path) + plan_id = plan_prep(db_path, plan_name, run_type) + + if (plan_id < 0): + sys.exit("Terminating due to database detected error") + + print("\nBegin plan navigation and subplan retrieval test") + plan_dict = load_plan(plan_path) + + # plan root name value returned when subplan_id= is omitted + metadata, root_name = get_subplan(plan_dict) + + # the root has no parent / predecessor + parent_name = get_predecessor(plan_dict, root_name) + print("Demonstrate that root \'%s\' predecessor is not defined: %s" % (root_name, parent_name)) + + # the root contains metadata, it is not a run specification + successor_names = get_successors(plan_dict, root_name) + print("\nThe first runable configurations are defined in %s\n" % successor_names) + + # the root is the predecessor of these first level runables + for sname in successor_names: + parent_name = get_predecessor(plan_dict, sname) + print("The parent of %s is %s" % (sname, parent_name)) + + # run the right subtree + print("\nRun the rightmost subtree \n") + for i in it.count(start = 1): + listlen = 
len(successor_names) + if listlen == 0: + break + + for name in successor_names: + status = start_subplan( + db_path, + plan_path, + plan_id=plan_id, + subplan_id=name, + run_type=run_type + ) + + if status < 0: + print("subplan: %s skipped, previously processed" % name) + + select_one = successor_names[listlen - 1] + parent_name = get_predecessor(plan_dict, select_one) + print("%-16s is a successor of %-16s - all successors: %s" % (select_one, parent_name, successor_names)) + +# ??????????????????????????????????????????????????????????? + value,_ = get_subplan(plan_dict, select_one) + + if i < 3: + for pf in [False, True]: + _, fs_name_list, train_list, val_list = get_subplan_features(plan_dict, select_one, parent_features=pf) + print("\nsubplan original:", select_one, "parent features:", pf) + pp(plan_dict[select_one]) + print("\nflattened TRAIN") + pp(train_list) + print("\nflattened VAL") + pp(val_list) + +# ??????????????????????????????????????????????????????????? + + # test retrieval api + row = get_subplan_runhist(db_path, plan_id=plan_id, subplan_id=select_one) + #print(row) + + # post subplan termination + completion_status = dict(mse=1.1, mae=2.2, r_square=.555, misc='no such column', data=123) + + stop_subplan( + db_path, + plan_id=plan_id, + subplan_id=select_one, + comp_info_dict=completion_status + ) + + successor_names = get_successors(plan_dict, select_one) + + print("\nEnd of branch reached") +# plan_remove(db_path, "plangen_cell8-p2_drug8-p2.json") + +#---------------------------------------------------------------------------------- + +if __name__ == "__main__": + main() diff --git a/Pilot1/Uno/topN_to_uno.py b/Pilot1/Uno/topN_to_uno.py index dd81d9f3..d9a2d6d0 100644 --- a/Pilot1/Uno/topN_to_uno.py +++ b/Pilot1/Uno/topN_to_uno.py @@ -13,6 +13,10 @@ def parse_arguments(): help='Plan data file') parser.add_argument('--node', type=str, default=None, help='node number to execute') + parser.add_argument('--incremental', action='store_true', + help='True for building dataset incrementally') + parser.add_argument('--fold', type=str, default=None, + help='pre-calculated indexes for cross fold validation') args, unparsed = parser.parse_known_args() return args, unparsed @@ -22,29 +26,64 @@ def read_plan(filename, node): print("reading {} file for node {}".format(filename, node)) with open(filename, 'r') as plan_file: plan = json.load(plan_file) + if node is None: + return plan + if node in plan: return plan[node] else: raise Exception('Node index "{}" was not found in plan file'.format(node)) +# def build_masks(args, df): +# if args.node is None: +# print('node is None. Generate Random split') +# mask = training_mask(df) +# return mask, ~mask +# +# plan = read_plan(args.plan, args.node) +# mask = {} +# for partition in ['train', 'val']: +# _mask = df['Sample'] is None +# for i, element in enumerate(plan[partition]): +# cl_filter = element['cell'] +# dr_filter = element['drug'] +# __mask = df['Sample'].isin(cl_filter) & df['Drug1'].isin(dr_filter) +# _mask = _mask | __mask +# mask[partition] = _mask +# +# return mask['train'], mask['val'] + + def build_masks(args, df): if args.node is None: print('node is None. 
Generate Random split') mask = training_mask(df) return mask, ~mask - plan = read_plan(args.plan, args.node) + print('from new build_mask: {} {} {}'.format(args.plan, args.node, args.incremental)) + import plangen + plan = read_plan(args.plan, None) + ids = {} mask = {} + _, _, ids['train'], ids['val'] = plangen.get_subplan_features(plan, args.node, args.incremental) + for partition in ['train', 'val']: _mask = df['Sample'] is None - for i, element in enumerate(plan[partition]): - cl_filter = element['cell'] - dr_filter = element['drug'] - __mask = df['Sample'].isin(cl_filter) & df['Drug1'].isin(dr_filter) + for i in range(len(ids[partition]['cell'])): + if 'cell' in ids[partition] and 'drug' in ids[partition]: + cl_filter = ids[partition]['cell'][i] + dr_filter = ids[partition]['drug'][i] + __mask = df['Sample'].isin(cl_filter) & df['Drug1'].isin(dr_filter) + elif 'cell' in ids[partition]: + cl_filter = ids[partition]['cell'][i] + __mask = df['Sample'].isin(cl_filter) + elif 'drug' in ids[partition]: + dr_filter = ids[partition]['drug'][i] + __mask = df['Drug1'].isin(dr_filter) + _mask = _mask | __mask mask[partition] = _mask - return mask['train'], mask['val'] @@ -90,19 +129,35 @@ def build_dataframe(args): else: df_y, df_cl, df_dd = read_dataframe_from_csv(args) - # mask = training_mask(df_y) - train_mask, val_mask = build_masks(args, df_y) + if args.fold is not None: + tr_id = pd.read_csv('{}_tr_id.csv'.format(args.fold)) + vl_id = pd.read_csv('{}_vl_id.csv'.format(args.fold)) + tr_idx = tr_id.iloc[:, 0].dropna().values.astype(int).tolist() + vl_idx = vl_id.iloc[:, 0].dropna().values.astype(int).tolist() + + y_train = df_y.iloc[tr_idx, :] + y_val = df_y.iloc[vl_idx, :] + + x_train_0 = df_cl.iloc[tr_idx, :] + x_train_1 = df_dd.iloc[tr_idx, :] + x_train_1.columns = [''] * len(x_train_1.columns) + + x_val_0 = df_cl.iloc[vl_idx, :] + x_val_1 = df_dd.iloc[vl_idx, :] + x_val_1.columns = [''] * len(x_val_1.columns) + else: + train_mask, val_mask = build_masks(args, df_y) - y_train = pd.DataFrame(data=df_y[train_mask].reset_index(drop=True)) - y_val = pd.DataFrame(data=df_y[val_mask].reset_index(drop=True)) + y_train = pd.DataFrame(data=df_y[train_mask].reset_index(drop=True)) + y_val = pd.DataFrame(data=df_y[val_mask].reset_index(drop=True)) - x_train_0 = df_cl[train_mask].reset_index(drop=True) - x_train_1 = df_dd[train_mask].reset_index(drop=True) - x_train_1.columns = [''] * len(x_train_1.columns) + x_train_0 = df_cl[train_mask].reset_index(drop=True) + x_train_1 = df_dd[train_mask].reset_index(drop=True) + x_train_1.columns = [''] * len(x_train_1.columns) - x_val_0 = df_cl[val_mask].reset_index(drop=True) - x_val_1 = df_dd[val_mask].reset_index(drop=True) - x_val_1.columns = [''] * len(x_val_1.columns) + x_val_0 = df_cl[val_mask].reset_index(drop=True) + x_val_1 = df_dd[val_mask].reset_index(drop=True) + x_val_1.columns = [''] * len(x_val_1.columns) # store store = pd.HDFStore('topN.uno.h5', 'w', complevel=9, complib='blosc:snappy') From 2af4bc0f29a3a1411b0dca790698360a9c4f4b8b Mon Sep 17 00:00:00 2001 From: Hyunseung Yoo Date: Sun, 25 Aug 2019 06:12:02 -0500 Subject: [PATCH 080/331] add support for feather format --- Pilot1/Uno/topN_to_uno.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/Pilot1/Uno/topN_to_uno.py b/Pilot1/Uno/topN_to_uno.py index d9a2d6d0..e0957b55 100644 --- a/Pilot1/Uno/topN_to_uno.py +++ b/Pilot1/Uno/topN_to_uno.py @@ -93,7 +93,22 @@ def training_mask(df): def read_dataframe_from_csv(args): df = pd.read_csv(args.dataframe_from, 
low_memory=False, na_values='na').fillna(0) - df.rename(columns={'SAMPLE': 'Sample', 'DRUG': 'Drug1'}, inplace=True) + df.rename(columns={'CELL': 'Sample', 'DRUG': 'Drug1'}, inplace=True) + df_y = df[['AUC', 'Sample', 'Drug1']] + + cols = df.columns.to_list() + cl_columns = list(filter(lambda x: x.startswith('GE_'), cols)) + dd_columns = list(filter(lambda x: x.startswith('DD_'), cols)) + + df_cl = df.loc[:, cl_columns] + df_dd = df.loc[:, dd_columns] + + return df_y, df_cl, df_dd + + +def read_dataframe_from_feather(args): + df = pd.read_feather(args.dataframe_from).fillna(0) + df.rename(columns={'CELL': 'Sample', 'DRUG': 'Drug1'}, inplace=True) df_y = df[['AUC', 'Sample', 'Drug1']] cols = df.columns.to_list() @@ -126,6 +141,8 @@ def build_dataframe(args): _, ext = os.path.splitext(args.dataframe_from) if ext == '.h5' or ext == '.hdf5': df_y, df_cl, df_dd = read_dataframe_from_hdf(args) + elif ext == '.feather': + df_y, df_cl, df_dd = read_dataframe_from_feather(args) else: df_y, df_cl, df_dd = read_dataframe_from_csv(args) From 56d6e746c6e84f0e713700d60b919fe60bfc3a16 Mon Sep 17 00:00:00 2001 From: Hyunseung Yoo Date: Wed, 28 Aug 2019 14:13:59 -0500 Subject: [PATCH 081/331] add dataframe index. --- Pilot1/Uno/topN_to_uno.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/Pilot1/Uno/topN_to_uno.py b/Pilot1/Uno/topN_to_uno.py index e0957b55..a4bafcc4 100644 --- a/Pilot1/Uno/topN_to_uno.py +++ b/Pilot1/Uno/topN_to_uno.py @@ -69,7 +69,7 @@ def build_masks(args, df): _, _, ids['train'], ids['val'] = plangen.get_subplan_features(plan, args.node, args.incremental) for partition in ['train', 'val']: - _mask = df['Sample'] is None + _mask = df['Sample'] == None for i in range(len(ids[partition]['cell'])): if 'cell' in ids[partition] and 'drug' in ids[partition]: cl_filter = ids[partition]['cell'][i] @@ -81,7 +81,6 @@ def build_masks(args, df): elif 'drug' in ids[partition]: dr_filter = ids[partition]['drug'][i] __mask = df['Drug1'].isin(dr_filter) - _mask = _mask | __mask mask[partition] = _mask return mask['train'], mask['val'] @@ -152,15 +151,15 @@ def build_dataframe(args): tr_idx = tr_id.iloc[:, 0].dropna().values.astype(int).tolist() vl_idx = vl_id.iloc[:, 0].dropna().values.astype(int).tolist() - y_train = df_y.iloc[tr_idx, :] - y_val = df_y.iloc[vl_idx, :] + y_train = df_y.iloc[tr_idx, :].reset_index(drop=True) + y_val = df_y.iloc[vl_idx, :].reset_index(drop=True) - x_train_0 = df_cl.iloc[tr_idx, :] - x_train_1 = df_dd.iloc[tr_idx, :] + x_train_0 = df_cl.iloc[tr_idx, :].reset_index(drop=True) + x_train_1 = df_dd.iloc[tr_idx, :].reset_index(drop=True) x_train_1.columns = [''] * len(x_train_1.columns) - x_val_0 = df_cl.iloc[vl_idx, :] - x_val_1 = df_dd.iloc[vl_idx, :] + x_val_0 = df_cl.iloc[vl_idx, :].reset_index(drop=True) + x_val_1 = df_dd.iloc[vl_idx, :].reset_index(drop=True) x_val_1.columns = [''] * len(x_val_1.columns) else: train_mask, val_mask = build_masks(args, df_y) From 35d688bd4af90c43ec0e28fbd34e12e12299e55f Mon Sep 17 00:00:00 2001 From: brettin Date: Tue, 27 Aug 2019 07:41:10 -0400 Subject: [PATCH 082/331] using information from milestone12 HPO --- Pilot1/Uno/uno_auc_model.txt | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/Pilot1/Uno/uno_auc_model.txt b/Pilot1/Uno/uno_auc_model.txt index 4a803b43..7789f732 100644 --- a/Pilot1/Uno/uno_auc_model.txt +++ b/Pilot1/Uno/uno_auc_model.txt @@ -4,13 +4,13 @@ test_sources=['train'] cell_types=None cell_features=['rnaseq'] drug_features=['descriptors'] 
-dense=[1000, 1000, 1000] +dense=[1000, 1000, 1000, 1000, 1000] dense_feature_layers=[1000, 1000, 1000] activation='relu' loss='mse' -optimizer='sgd' +optimizer='adamax' scaling='std' -drop=0 +drop=.1 epochs=50 batch_size=32 validation_split=0.2 @@ -18,22 +18,28 @@ cv=1 max_val_loss=1.0 learning_rate=0.0001 base_lr=None +agg_dose='AUC' residual=False -reduce_lr=False -warmup_lr=False +reduce_lr=True +warmup_lr=True batch_normalization=False feature_subsample=0 rng_seed=2018 -save_path='save/uno' no_gen=False verbose=False -no_response_source=True -no_feature_source=True -use_landmark_genes=True -agg_dose='AUC' + + preprocess_rnaseq='source_scale' +gpus=1 +use_landmark_genes=True +no_feature_source=True +no_response_source=True +cp=True +save_path='/ccs/home/brettin/project_work/brettin/milestone13/save/uno' + single=True +timeout=-1 [Monitor_Params] solr_root='' -timeout=-1 + From 7f7e1b9dd036dc679593e9e5f73c3002317aaed6 Mon Sep 17 00:00:00 2001 From: Harry Yoo Date: Wed, 28 Aug 2019 16:34:42 -0400 Subject: [PATCH 083/331] remove hard-coded path --- Pilot1/Uno/uno_auc_model.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Pilot1/Uno/uno_auc_model.txt b/Pilot1/Uno/uno_auc_model.txt index 7789f732..2d9158a0 100644 --- a/Pilot1/Uno/uno_auc_model.txt +++ b/Pilot1/Uno/uno_auc_model.txt @@ -35,7 +35,7 @@ use_landmark_genes=True no_feature_source=True no_response_source=True cp=True -save_path='/ccs/home/brettin/project_work/brettin/milestone13/save/uno' +save_path='save/uno' single=True timeout=-1 From 60ef84e120ad71d564c8814ddbb210140da1e53e Mon Sep 17 00:00:00 2001 From: Hyunseung Yoo Date: Wed, 4 Sep 2019 21:29:21 -0500 Subject: [PATCH 084/331] close filepointer --- Pilot1/Uno/topN_to_uno.py | 1 + 1 file changed, 1 insertion(+) diff --git a/Pilot1/Uno/topN_to_uno.py b/Pilot1/Uno/topN_to_uno.py index a4bafcc4..dc85ec0a 100644 --- a/Pilot1/Uno/topN_to_uno.py +++ b/Pilot1/Uno/topN_to_uno.py @@ -183,6 +183,7 @@ def build_dataframe(args): store.put('x_train_1', x_train_1, format='table') store.put('x_val_0', x_val_0, format='table') store.put('x_val_1', x_val_1, format='table') + store.close() if __name__ == '__main__': From daa6f6f840b13d198f55b04b850aaf34dc5b6630 Mon Sep 17 00:00:00 2001 From: Hyunseung Yoo Date: Fri, 6 Sep 2019 13:19:26 -0500 Subject: [PATCH 085/331] output file name can be changed by --ouput --- Pilot1/Uno/topN_to_uno.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Pilot1/Uno/topN_to_uno.py b/Pilot1/Uno/topN_to_uno.py index dc85ec0a..42ef4c12 100644 --- a/Pilot1/Uno/topN_to_uno.py +++ b/Pilot1/Uno/topN_to_uno.py @@ -17,6 +17,8 @@ def parse_arguments(): help='True for building dataset incrementally') parser.add_argument('--fold', type=str, default=None, help='pre-calculated indexes for cross fold validation') + parser.add_argument('--output', type=str, default='topN.uno.h5', + help='output filename') args, unparsed = parser.parse_known_args() return args, unparsed @@ -176,7 +178,7 @@ def build_dataframe(args): x_val_1.columns = [''] * len(x_val_1.columns) # store - store = pd.HDFStore('topN.uno.h5', 'w', complevel=9, complib='blosc:snappy') + store = pd.HDFStore(args.output, 'w', complevel=9, complib='blosc:snappy') store.put('y_train', y_train, format='table') store.put('y_val', y_val, format='table') store.put('x_train_0', x_train_0, format='table') From 2e8eceac3ae1e2448465b589ff76379776f03b65 Mon Sep 17 00:00:00 2001 From: Hyunseung Yoo Date: Thu, 19 Sep 2019 11:16:53 -0500 Subject: [PATCH 086/331] add AUC training example --- 
Pilot1/Uno/README.AUC.md | 137 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 137 insertions(+) create mode 100644 Pilot1/Uno/README.AUC.md diff --git a/Pilot1/Uno/README.AUC.md b/Pilot1/Uno/README.AUC.md new file mode 100644 index 00000000..ae4bce2a --- /dev/null +++ b/Pilot1/Uno/README.AUC.md @@ -0,0 +1,137 @@ +# Training with static datafile +Use static datafile prebuilt and shared at `/vol/ml/hsyoo/shared/top_21_auc_1fold.uno.h5` + +``` +python uno_baseline_keras2.py --config_file uno_auc_model.txt --cache cache/top6_auc --use_exported_data /vol/ml/hsyoo/shared/top_21_auc_1fold.uno.h5 +``` + +The log will look like below, + +``` +Using TensorFlow backend. +Importing candle utils for keras +Configuration file: /ssd1/homes/hsyoo/projects/CANDLE/Benchmarks/Pilot1/Uno/uno_auc_model.txt +{'activation': 'relu', + 'agg_dose': 'AUC', + 'base_lr': None, + 'batch_normalization': False, + 'batch_size': 32, + 'cell_features': ['rnaseq'], + 'cell_types': None, + 'cp': True, + 'cv': 1, + 'dense': [1000, 1000, 1000, 1000, 1000], + 'dense_feature_layers': [1000, 1000, 1000], + 'drop': 0.1, + 'drug_features': ['descriptors'], + 'epochs': 50, + 'feature_subsample': 0, + 'gpus': 1, + 'learning_rate': 0.0001, + 'loss': 'mse', + 'max_val_loss': 1.0, + 'no_feature_source': True, + 'no_gen': False, + 'no_response_source': True, + 'optimizer': 'adamax', + 'preprocess_rnaseq': 'source_scale', + 'reduce_lr': True, + 'residual': False, + 'rng_seed': 2018, + 'save_path': 'save/uno', + 'scaling': 'std', + 'single': True, + 'solr_root': '', + 'test_sources': ['train'], + 'timeout': -1, + 'train_sources': ['CCLE'], + 'use_landmark_genes': True, + 'validation_split': 0.2, + 'verbose': False, + 'warmup_lr': True} +Params: +{'activation': 'relu', + 'agg_dose': 'AUC', + 'base_lr': None, + 'batch_normalization': False, + 'batch_size': 32, + 'by_cell': None, + 'by_drug': None, + 'cache': 'cache/top6_auc', + 'cell_feature_subset_path': '', + 'cell_features': ['rnaseq'], + 'cell_subset_path': '', + 'cell_types': None, + 'config_file': 'uno_auc_model.txt', + 'cp': True, + 'cv': 1, + 'datatype': , + 'dense': [1000, 1000, 1000, 1000, 1000], + 'dense_feature_layers': [1000, 1000, 1000], + 'drop': 0.1, + 'drug_feature_subset_path': '', + 'drug_features': ['descriptors'], + 'drug_median_response_max': 1, + 'drug_median_response_min': -1, + 'drug_subset_path': '', + 'epochs': 50, + 'experiment_id': 'EXP000', + 'export_csv': None, + 'export_data': None, + 'feature_subsample': 0, + 'feature_subset_path': '', + 'gpus': [], + 'growth_bins': 0, + 'initial_weights': None, + 'learning_rate': 0.0001, + 'logfile': None, + 'loss': 'mse', + 'max_val_loss': 1.0, + 'no_feature_source': True, + 'no_gen': False, + 'no_response_source': True, + 'optimizer': 'adamax', + 'output_dir': '/ssd1/homes/hsyoo/projects/CANDLE/Benchmarks/Pilot1/Uno/Output/EXP000/RUN000', + 'partition_by': None, + 'preprocess_rnaseq': 'source_scale', + 'reduce_lr': True, + 'residual': False, + 'rng_seed': 2018, + 'run_id': 'RUN000', + 'save_path': 'save/uno', + 'save_weights': None, + 'scaling': 'std', + 'shuffle': False, + 'single': True, + 'solr_root': '', + 'tb': False, + 'tb_prefix': 'tb', + 'test_sources': ['train'], + 'timeout': -1, + 'train_bool': True, + 'train_sources': ['CCLE'], + 'use_exported_data': '/vol/ml/hsyoo/shared/top_21_auc_1fold.uno.h5', + 'use_filtered_genes': False, + 'use_landmark_genes': True, + 'validation_split': 0.2, + 'verbose': None, + 'warmup_lr': True} + + ... 
+Total params: 16,224,001 +Trainable params: 16,224,001 +Non-trainable params: 0 +... +Between random pairs in y_val: + mse: 0.0474 + mae: 0.1619 + r2: -1.0103 + corr: -0.0051 +Data points per epoch: train = 423952, val = 52994 +Steps per epoch: train = 13248, val = 1656 +Epoch 1/50 +13248/13248 [==============================] - 198s 15ms/step - loss: 0.0235 - mae: 0.1048 - r2: -0.1311 - val_loss: 0.0145 - val_mae: 0.0903 - val_r2: 0.3393 +Current time ....198.278 +Epoch 2/50 +... +``` From 063766ee0cebb27f6c6430307f9bfebaf0561001 Mon Sep 17 00:00:00 2001 From: Hyunseung Yoo Date: Sun, 8 Sep 2019 23:08:43 -0500 Subject: [PATCH 087/331] handle edge case when validation partition is smaller than batch size --- Pilot1/Uno/uno_data.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Pilot1/Uno/uno_data.py b/Pilot1/Uno/uno_data.py index 1406a8de..b25a3748 100644 --- a/Pilot1/Uno/uno_data.py +++ b/Pilot1/Uno/uno_data.py @@ -967,7 +967,11 @@ def __init__(self, partition='train', filename=None, batch_size=32, shuffle=Fals y = self.store.select('y_{}'.format(self.partition)) self.index = y.index self.size = len(self.index) - self.steps = self.size // self.batch_size + if self.size >= self.batch_size: + self.steps = self.size // self.batch_size + else: + self.steps = 1 + self.batch_size = self.size self.index_map = np.arange(self.steps) if self.shuffle: np.random.shuffle(self.index_map) From 0f00168ff5b9aab6d087d40857b03259b6b81951 Mon Sep 17 00:00:00 2001 From: Hyunseung Yoo Date: Thu, 19 Sep 2019 11:48:51 -0500 Subject: [PATCH 088/331] update data file location --- Pilot1/Uno/README.AUC.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Pilot1/Uno/README.AUC.md b/Pilot1/Uno/README.AUC.md index ae4bce2a..b80fee7c 100644 --- a/Pilot1/Uno/README.AUC.md +++ b/Pilot1/Uno/README.AUC.md @@ -1,8 +1,8 @@ # Training with static datafile -Use static datafile prebuilt and shared at `/vol/ml/hsyoo/shared/top_21_auc_1fold.uno.h5` +Use static datafile prebuilt and shared at `http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/top_21_auc_1fold.uno.h5` ``` -python uno_baseline_keras2.py --config_file uno_auc_model.txt --cache cache/top6_auc --use_exported_data /vol/ml/hsyoo/shared/top_21_auc_1fold.uno.h5 +python uno_baseline_keras2.py --config_file uno_auc_model.txt --cache cache/top6_auc --use_exported_data top_21_auc_1fold.uno.h5 ``` The log will look like below, @@ -110,7 +110,7 @@ Params: 'timeout': -1, 'train_bool': True, 'train_sources': ['CCLE'], - 'use_exported_data': '/vol/ml/hsyoo/shared/top_21_auc_1fold.uno.h5', + 'use_exported_data': 'top_21_auc_1fold.uno.h5', 'use_filtered_genes': False, 'use_landmark_genes': True, 'validation_split': 0.2, From 8f5cc6b8c39dcc618674c00e687e63c8758d5150 Mon Sep 17 00:00:00 2001 From: Hyunseung Yoo Date: Thu, 14 Nov 2019 10:02:13 -0600 Subject: [PATCH 089/331] add early stopping --- Pilot1/Uno/uno.py | 4 ++++ Pilot1/Uno/uno_baseline_keras2.py | 3 +++ 2 files changed, 7 insertions(+) diff --git a/Pilot1/Uno/uno.py b/Pilot1/Uno/uno.py index d4731e50..d246d58e 100644 --- a/Pilot1/Uno/uno.py +++ b/Pilot1/Uno/uno.py @@ -140,6 +140,10 @@ def set_locals(self): 'type': float, 'default': None, 'help': 'base learning rate'}, + {'name': 'es', + 'type': candle.str2bool, + 'default': False, + 'help': 'early stopping on val_loss'}, {'name': 'cp', 'type': candle.str2bool, 'default': False, diff --git a/Pilot1/Uno/uno_baseline_keras2.py b/Pilot1/Uno/uno_baseline_keras2.py index 7c08aa77..f87a6a36 100644 --- 
a/Pilot1/Uno/uno_baseline_keras2.py +++ b/Pilot1/Uno/uno_baseline_keras2.py @@ -427,6 +427,7 @@ def warmup_scheduler(epoch): candle_monitor = candle.CandleRemoteMonitor(params=params) timeout_monitor = candle.TerminateOnTimeOut(params['timeout']) + es_monitor = keras.callbacks.EarlyStopping(patience=10, verbose=1) reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=0.00001) warmup_lr = LearningRateScheduler(warmup_scheduler) @@ -435,6 +436,8 @@ def warmup_scheduler(epoch): history_logger = LoggingCallback(logger.debug) callbacks = [candle_monitor, timeout_monitor, history_logger] + if args.es: + callbacks.append(es_monitor) if args.reduce_lr: callbacks.append(reduce_lr) if args.warmup_lr: From 98dfd74a5ae8e042cc63a75426cf7b1acb1486cd Mon Sep 17 00:00:00 2001 From: Hyunseung Yoo Date: Thu, 14 Nov 2019 11:09:33 -0600 Subject: [PATCH 090/331] auto-detect input size --- Pilot1/Uno/uno_data.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/Pilot1/Uno/uno_data.py b/Pilot1/Uno/uno_data.py index b25a3748..8cc6561d 100644 --- a/Pilot1/Uno/uno_data.py +++ b/Pilot1/Uno/uno_data.py @@ -958,12 +958,9 @@ def __init__(self, partition='train', filename=None, batch_size=32, shuffle=Fals self.single = single self.agg_dose = agg_dose self.target = agg_dose if agg_dose is not None else 'Growth' - # 4 inputs for single drug model (cell, dose1, descriptor, fingerprint) - # 7 inputs for drug pair model (cell, dose1, dose1, dr1.descriptor, dr1.fingerprint, dr2.descriptor, dr2.fingerprint) - self.input_size = 4 if self.single else 7 - self.input_size = 2 if agg_dose else self.input_size self.store = pd.HDFStore(filename, mode='r') + self.input_size = len(list(filter(lambda x: x.startswith('/x_train'), self.store.keys()))) y = self.store.select('y_{}'.format(self.partition)) self.index = y.index self.size = len(self.index) From 2a693b063b620aa31ab78a36415190a3b5a643a8 Mon Sep 17 00:00:00 2001 From: Hyunseung Yoo Date: Wed, 5 Jun 2019 12:52:23 -0500 Subject: [PATCH 091/331] wip milestone 13 --- Pilot1/Uno/topN_to_uno.py | 94 +++++++++++++++++++++++++++++++ Pilot1/Uno/uno_baseline_keras2.py | 2 +- Pilot1/Uno/uno_data.py | 12 ++-- 3 files changed, 103 insertions(+), 5 deletions(-) create mode 100644 Pilot1/Uno/topN_to_uno.py diff --git a/Pilot1/Uno/topN_to_uno.py b/Pilot1/Uno/topN_to_uno.py new file mode 100644 index 00000000..1f7c2b6a --- /dev/null +++ b/Pilot1/Uno/topN_to_uno.py @@ -0,0 +1,94 @@ +import argparse +import json +import pandas as pd +import numpy as np + + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument('--dataframe_from', type=str, default='top21_dataframe_8x8.csv', + help='Dataframe file name contains all data points') + parser.add_argument('--plan', type=str, default='plan.json', + help='Plan data file') + parser.add_argument('--node', type=str, default=None, + help='node number to execute') + + args, unparsed = parser.parse_known_args() + return args, unparsed + + +def read_plan(filename, node): + print("reading {} file for node {}".format(filename, node)) + with open(filename, 'r') as plan_file: + plan = json.load(plan_file) + if node in plan: + return plan[node] + else: + raise Exception('Node index {} was not found in plan file') + + +def build_masks(args, df): + if args.node is None: + raise Exception('Node id is not given') + + plan = read_plan(args.plan, args.node) + mask = {} + for partition in ['train', 'val']: + _mask = df['Sample'] == None + for i, element in enumerate(plan[partition]): + cl_filter = 
element['CELL'] + dr_filter = element['DRUG'] + __mask = df['Sample'].isin(cl_filter) & df['Drug1'].isin(dr_filter) + _mask = _mask | __mask + mask[partition] = _mask + + return mask['train'], mask['val'] + + +def training_mask(df): + return np.random.rand(len(df)) < 0.8 + + +def read_dataframe(args): + df = pd.read_csv(args.dataframe_from, low_memory=False, na_values='na').fillna(0) + df.rename(columns={'SAMPLE': 'Sample', 'DRUG': 'Drug1'}, inplace=True) + df_y = df[['AUC', 'Sample', 'Drug1']] + + cols = df.columns.to_list() + cl_columns = list(filter(lambda x: x.startswith('GE_'), cols)) + dd_columns = list(filter(lambda x: x.startswith('DD_'), cols)) + + df_cl = df.loc[:, cl_columns] + df_dd = df.loc[:, dd_columns] + + return df_y, df_cl, df_dd + + +def build_dataframe(args): + df_y, df_cl, df_dd = read_dataframe(args) + + # mask = training_mask(df_y) + train_mask, val_mask = build_masks(args, df_y) + + y_train = pd.DataFrame(data=df_y[train_mask].reset_index(drop=True)) + y_val = pd.DataFrame(data=df_y[val_mask].reset_index(drop=True)) + + x_train_0 = df_cl[train_mask].reset_index(drop=True) + x_train_1 = df_dd[train_mask].reset_index(drop=True) + + x_val_0 = df_cl[val_mask].reset_index(drop=True) + x_val_1 = df_dd[val_mask].reset_index(drop=True) + + # store + store = pd.HDFStore('topN.uno.h5', 'w') + store.put('y_train', y_train) + store.put('y_val', y_val) + store.put('x_train_0', x_train_0) + store.put('x_train_1', x_train_1) + store.put('x_val_0', x_val_0) + store.put('x_val_1', x_val_1) + + +if __name__ == '__main__': + parsed, unparsed = parse_arguments() + build_dataframe(parsed) diff --git a/Pilot1/Uno/uno_baseline_keras2.py b/Pilot1/Uno/uno_baseline_keras2.py index 1aa1f081..9ea9c8b8 100644 --- a/Pilot1/Uno/uno_baseline_keras2.py +++ b/Pilot1/Uno/uno_baseline_keras2.py @@ -444,7 +444,7 @@ def warmup_scheduler(epoch): if args.tb: callbacks.append(tensorboard) if args.save_weights: - callbacks.append(SimpleWeightSaver(args.save_path + '/' + args.save_weights)) + callbacks.append(MultiGPUCheckpoint(args.save_weights)) if args.use_exported_data is not None: train_gen = DataFeeder(filename=args.use_exported_data, batch_size=args.batch_size, shuffle=args.shuffle, single=args.single, agg_dose=args.agg_dose) diff --git a/Pilot1/Uno/uno_data.py b/Pilot1/Uno/uno_data.py index 5ede815e..52450fb2 100644 --- a/Pilot1/Uno/uno_data.py +++ b/Pilot1/Uno/uno_data.py @@ -955,7 +955,7 @@ def __init__(self, partition='train', filename=None, batch_size=32, shuffle=Fals # 4 inputs for single drug model (cell, dose1, descriptor, fingerprint) # 7 inputs for drug pair model (cell, dose1, dose1, dr1.descriptor, dr1.fingerprint, dr2.descriptor, dr2.fingerprint) self.input_size = 4 if self.single else 7 - self.input_size = 3 if agg_dose else self.input_size + self.input_size = 2 if agg_dose else self.input_size self.store = pd.HDFStore(filename, mode='r') y = self.store.select('y_{}'.format(self.partition)) @@ -973,7 +973,7 @@ def __getitem__(self, idx): start = self.index_map[idx] * self.batch_size stop = (self.index_map[idx] + 1) * self.batch_size x = [self.store.select('x_{0}_{1}'.format(self.partition, i), start=start, stop=stop) for i in range(self.input_size)] - y = self.store.select('y_{}'.format(self.partition), start=start, stop=stop, columns=[self.target]) + y = self.store.select('y_{}'.format(self.partition), start=start, stop=stop)[self.target] return x, y def reset(self): @@ -982,8 +982,12 @@ def reset(self): pass def get_response(self, copy=False): - self.index = [item for step in 
range(self.steps) for item in range(self.index_map[step] * self.batch_size, (self.index_map[step] + 1) * self.batch_size)] - df = self.store.get('y_{}'.format(self.partition)).iloc[self.index,:] + if self.shuffle: + self.index = [item for step in range(self.steps) for item in range(self.index_map[step] * self.batch_size, (self.index_map[step] + 1) * self.batch_size)] + df = self.store.get('y_{}'.format(self.partition)).iloc[self.index,:] + else: + df = self.store.get('y_{}'.format(self.partition)) + if self.agg_dose is None: df['Dose1'] = self.store.get('x_{}_0'.format(self.partition)).iloc[self.index,:] if not self.single: From 691a8b3984c57151ed087edd0f1ae997c3b9bab8 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 19 Jun 2019 09:38:09 -0500 Subject: [PATCH 092/331] Small fix to Exception --- Pilot1/Uno/topN_to_uno.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Pilot1/Uno/topN_to_uno.py b/Pilot1/Uno/topN_to_uno.py index 1f7c2b6a..ffc153d1 100644 --- a/Pilot1/Uno/topN_to_uno.py +++ b/Pilot1/Uno/topN_to_uno.py @@ -24,7 +24,7 @@ def read_plan(filename, node): if node in plan: return plan[node] else: - raise Exception('Node index {} was not found in plan file') + raise Exception('Node index "{}" was not found in plan file'.format(node)) def build_masks(args, df): From 40bb60474f037df65f2482f6b40d9ba87aa0af3b Mon Sep 17 00:00:00 2001 From: Hyunseung Yoo Date: Thu, 27 Jun 2019 22:10:19 -0500 Subject: [PATCH 093/331] read hdf format master dataframe --- Pilot1/Uno/topN_to_uno.py | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/Pilot1/Uno/topN_to_uno.py b/Pilot1/Uno/topN_to_uno.py index ffc153d1..87c03a9e 100644 --- a/Pilot1/Uno/topN_to_uno.py +++ b/Pilot1/Uno/topN_to_uno.py @@ -1,4 +1,5 @@ import argparse +import os import json import pandas as pd import numpy as np @@ -36,8 +37,8 @@ def build_masks(args, df): for partition in ['train', 'val']: _mask = df['Sample'] == None for i, element in enumerate(plan[partition]): - cl_filter = element['CELL'] - dr_filter = element['DRUG'] + cl_filter = element['cell'] + dr_filter = element['drug'] __mask = df['Sample'].isin(cl_filter) & df['Drug1'].isin(dr_filter) _mask = _mask | __mask mask[partition] = _mask @@ -49,7 +50,7 @@ def training_mask(df): return np.random.rand(len(df)) < 0.8 -def read_dataframe(args): +def read_dataframe_from_csv(args): df = pd.read_csv(args.dataframe_from, low_memory=False, na_values='na').fillna(0) df.rename(columns={'SAMPLE': 'Sample', 'DRUG': 'Drug1'}, inplace=True) df_y = df[['AUC', 'Sample', 'Drug1']] @@ -64,8 +65,28 @@ def read_dataframe(args): return df_y, df_cl, df_dd +def read_dataframe_from_hdf(args): + store = pd.HDFStore(args.dataframe_from, 'r') + df = store.get('df') + df.rename(columns={'CELL': 'Sample', 'DRUG': 'Drug1'}, inplace=True) + df_y = df[['AUC', 'Sample', 'Drug1']] + + cols = df.columns.to_list() + cl_columns = list(filter(lambda x: x.startswith('GE_'), cols)) + dd_columns = list(filter(lambda x: x.startswith('DD_'), cols)) + + df_cl = df.loc[:, cl_columns] + df_dd = df.loc[:, dd_columns] + + return df_y, df_cl, df_dd + + def build_dataframe(args): - df_y, df_cl, df_dd = read_dataframe(args) + _, ext = os.path.splitext(args.dataframe_from) + if ext == '.h5' or ext == '.hdf5': + df_y, df_cl, df_dd = read_dataframe_from_hdf(args) + else: + df_y, df_cl, df_dd = read_dataframe_from_csv(args) # mask = training_mask(df_y) train_mask, val_mask = build_masks(args, df_y) From 569836c16fdb48a2e3eba21436992009e44d7b4d Mon Sep 17 
00:00:00 2001 From: Hyunseung Yoo Date: Thu, 27 Jun 2019 22:32:34 -0500 Subject: [PATCH 094/331] add dose_aggregated AUC prediction model --- Pilot1/Uno/uno_auc_model.txt | 39 ++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 Pilot1/Uno/uno_auc_model.txt diff --git a/Pilot1/Uno/uno_auc_model.txt b/Pilot1/Uno/uno_auc_model.txt new file mode 100644 index 00000000..00d2224e --- /dev/null +++ b/Pilot1/Uno/uno_auc_model.txt @@ -0,0 +1,39 @@ +[Global_Params] +train_sources=['CCLE'] +test_sources=['train'] +cell_types=None +cell_features=['rnaseq'] +drug_features=['descriptors'] +dense=[1000, 1000, 1000] +dense_feature_layers=[1000, 1000, 1000] +activation='relu' +loss='mse' +optimizer='adam' +scaling='std' +drop=0 +epochs=50 +batch_size=512 +validation_split=0.2 +cv=1 +max_val_loss=1.0 +learning_rate=None +base_lr=None +residual=False +reduce_lr=False +warmup_lr=False +batch_normalization=False +feature_subsample=0 +rng_seed=2018 +save_path='save/uno' +no_gen=False +verbose=False +no_response_source=True +no_feature_source=True +use_landmark_genes=True +agg_dose='AUC' +preprocess_rnaseq='source_scale' +single=True + +[Monitor_Params] +solr_root='' +timeout=3600 From e25be3b9293bb02abc0331c7cba4f7802eba649b Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 16 Jul 2019 13:05:20 -0500 Subject: [PATCH 095/331] Create cache directory if it does not exist --- Pilot1/Uno/uno_data.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Pilot1/Uno/uno_data.py b/Pilot1/Uno/uno_data.py index 52450fb2..c15e217e 100644 --- a/Pilot1/Uno/uno_data.py +++ b/Pilot1/Uno/uno_data.py @@ -654,6 +654,10 @@ def save_to_cache(self, cache, params): for k in ['self', 'cache', 'single']: if k in params: del params[k] + dirname = os.path.dirname(cache) + if not os.path.exists(dirname): + logger.debug('Creating directory for cache: %s', dirname) + os.mkdir(dirname) param_fname = '{}.params.json'.format(cache) with open(param_fname, 'w') as param_file: json.dump(params, param_file, sort_keys=True) From 317ed93470882d82953c9ce64dab296aa71b05e0 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 16 Jul 2019 14:09:40 -0500 Subject: [PATCH 096/331] Fix typos --- common/default_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/common/default_utils.py b/common/default_utils.py index 8f90066a..35b0d483 100644 --- a/common/default_utils.py +++ b/common/default_utils.py @@ -319,19 +319,19 @@ def set_seed(seed): def finalize_parameters(bmk): - """Utility to parse parameters in common as well as parmeters + """Utility to parse parameters in common as well as parameters particular to each benchmark. Parameters ---------- bmk : benchmark object Object that has benchmark filepaths and specifications - + Return ---------- gParameters : python dictionary Dictionary with all the parameters necessary to run the benchmark. 
- Command line overwrites config file especifications + Command line overwrites config file specifications """ # Parse common parameters From b42dec3832a3f3391e803b9639d3aa3168a86f36 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 16 Jul 2019 14:38:13 -0500 Subject: [PATCH 097/331] Post questions regarding CombinedDataLoader.load_from_cache() --- Pilot1/Uno/uno_data.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Pilot1/Uno/uno_data.py b/Pilot1/Uno/uno_data.py index c15e217e..1488b6a8 100644 --- a/Pilot1/Uno/uno_data.py +++ b/Pilot1/Uno/uno_data.py @@ -622,6 +622,7 @@ def __init__(self, seed=SEED): self.seed = seed def load_from_cache(self, cache, params): + """ NOTE: How does this function return an error? (False?) -Wozniak """ param_fname = '{}.params.json'.format(cache) if not os.path.isfile(param_fname): logger.warning('Cache parameter file does not exist: %s', param_fname) @@ -648,6 +649,7 @@ def load_from_cache(self, cache, params): self.__dict__.update(obj.__dict__) logger.info('Loaded data from cache: %s', fname) return True + # NOTE: This is unreachable -Wozniak return False def save_to_cache(self, cache, params): From de97e859772412a76a5718f6a7ea4afc082c727a Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Tue, 16 Jul 2019 14:38:24 -0500 Subject: [PATCH 098/331] Fix typo --- Pilot1/Uno/uno_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Pilot1/Uno/uno_data.py b/Pilot1/Uno/uno_data.py index 1488b6a8..1406a8de 100644 --- a/Pilot1/Uno/uno_data.py +++ b/Pilot1/Uno/uno_data.py @@ -636,7 +636,7 @@ def load_from_cache(self, cache, params): ignore_keys = ['cache', 'partition_by', 'single'] equal, diffs = dict_compare(params, cached_params, ignore_keys) if not equal: - logger.warning('Cache parameter mismatch: %s\nSaved: %s\nAttemptd to load: %s', diffs, cached_params, params) + logger.warning('Cache parameter mismatch: %s\nSaved: %s\nAttempted to load: %s', diffs, cached_params, params) logger.warning('\nRemove %s to rebuild data cache.\n', param_fname) raise ValueError('Could not load from a cache with incompatible keys:', diffs) else: From b66d38f41974d459155c5dba4833a67509165bdd Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Thu, 18 Jul 2019 09:41:21 -0500 Subject: [PATCH 099/331] Improve log messages --- Pilot1/Uno/uno_baseline_keras2.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Pilot1/Uno/uno_baseline_keras2.py b/Pilot1/Uno/uno_baseline_keras2.py index 9ea9c8b8..be3f8f47 100644 --- a/Pilot1/Uno/uno_baseline_keras2.py +++ b/Pilot1/Uno/uno_baseline_keras2.py @@ -403,7 +403,7 @@ def warmup_scheduler(epoch): template_model = build_model(loader, args, silent=True) if args.initial_weights: - logger.info("Loading weights from {}".format(args.initial_weights)) + logger.info("Loading initial weights from {}".format(args.initial_weights)) template_model.load_weights(args.initial_weights) if len(args.gpus) > 1: @@ -444,6 +444,7 @@ def warmup_scheduler(epoch): if args.tb: callbacks.append(tensorboard) if args.save_weights: + logger.info("Will save weights to: " + args.save_weights) callbacks.append(MultiGPUCheckpoint(args.save_weights)) if args.use_exported_data is not None: From e5d79e7deb45cce2da40c514e8f479519fec4917 Mon Sep 17 00:00:00 2001 From: Hyunseung Yoo Date: Thu, 1 Aug 2019 11:37:39 -0500 Subject: [PATCH 100/331] loocv data util --- Pilot1/Uno/loocv_data_util.py | 91 +++++++++++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) create mode 100644 Pilot1/Uno/loocv_data_util.py diff --git 
a/Pilot1/Uno/loocv_data_util.py b/Pilot1/Uno/loocv_data_util.py new file mode 100644 index 00000000..d42a41fb --- /dev/null +++ b/Pilot1/Uno/loocv_data_util.py @@ -0,0 +1,91 @@ +import argparse +import json +import pandas as pd +import numpy as np + + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument('--dataframe_from', type=str, default='GDSC.h5', + help='Dataframe file name contains all data points') + parser.add_argument('--plan', type=str, default='plan.json', + help='Plan data file') + parser.add_argument('--node', type=str, default=None, + help='node number to execute') + + args, unparsed = parser.parse_known_args() + return args, unparsed + + +def read_plan(filename, node): + print("reading {} file for node {}".format(filename, node)) + with open(filename, 'r') as plan_file: + plan = json.load(plan_file) + if node in plan: + return plan[node] + else: + raise Exception('Node index "{}" was not found in plan file'.format(node)) + + +def build_masks(args, df): + if args.node is None: + raise Exception('Node id is not given') + + plan = read_plan(args.plan, args.node) + mask = {} + for partition in ['train', 'val']: + _mask = df['Sample'] is None + for i, element in enumerate(plan[partition]): + cl_filter = element['cell'] + dr_filter = element['drug'] + __mask = df['Sample'].isin(cl_filter) & df['Drug1'].isin(dr_filter) + _mask = _mask | __mask + mask[partition] = _mask + + return mask['train'], mask['val'] + + +def training_mask(df): + return np.random.rand(len(df)) < 0.8 + + +def build_dataframe(args): + store = pd.HDFStore(args.dataframe_from, 'r') + df_y = store.get('y_train') + df_ds = store.get('x_train_0') + df_cl = store.get('x_train_1') + df_dd = store.get('x_train_2') + df_fp = store.get('x_train_3') + + train_mask, val_mask = build_masks(args, df_y) + + y_train = pd.DataFrame(data=df_y[train_mask].reset_index(drop=True)) + y_val = pd.DataFrame(data=df_y[val_mask].reset_index(drop=True)) + + x_train_0 = df_ds[train_mask].reset_index(drop=True) + x_train_1 = df_cl[train_mask].reset_index(drop=True) + x_train_2 = df_dd[train_mask].reset_index(drop=True) + x_train_3 = df_fp[train_mask].reset_index(drop=True) + + x_val_0 = df_ds[val_mask].reset_index(drop=True) + x_val_1 = df_cl[val_mask].reset_index(drop=True) + x_val_2 = df_dd[val_mask].reset_index(drop=True) + x_val_3 = df_fp[val_mask].reset_index(drop=True) + + # store + store = pd.HDFStore('topN.uno.h5', 'w') + store.put('y_train', y_train) + store.put('y_val', y_val) + store.put('x_train_0', x_train_0) + store.put('x_train_1', x_train_1) + store.put('x_train_2', x_train_2) + store.put('x_train_3', x_train_3) + store.put('x_val_0', x_val_0) + store.put('x_val_1', x_val_1) + store.put('x_val_2', x_val_2) + store.put('x_val_3', x_val_3) + + +if __name__ == '__main__': + parsed, unparsed = parse_arguments() + build_dataframe(parsed) From ded070d26a620d68a103c0befcd6a4c93bc1660f Mon Sep 17 00:00:00 2001 From: Hyunseung Yoo Date: Thu, 1 Aug 2019 23:40:38 -0500 Subject: [PATCH 101/331] use table format --- Pilot1/Uno/loocv_data_util.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/Pilot1/Uno/loocv_data_util.py b/Pilot1/Uno/loocv_data_util.py index d42a41fb..412dba5b 100644 --- a/Pilot1/Uno/loocv_data_util.py +++ b/Pilot1/Uno/loocv_data_util.py @@ -74,16 +74,16 @@ def build_dataframe(args): # store store = pd.HDFStore('topN.uno.h5', 'w') - store.put('y_train', y_train) - store.put('y_val', y_val) - store.put('x_train_0', x_train_0) - store.put('x_train_1', 
x_train_1) - store.put('x_train_2', x_train_2) - store.put('x_train_3', x_train_3) - store.put('x_val_0', x_val_0) - store.put('x_val_1', x_val_1) - store.put('x_val_2', x_val_2) - store.put('x_val_3', x_val_3) + store.put('y_train', y_train, format='t') + store.put('y_val', y_val, format='t') + store.put('x_train_0', x_train_0, format='t') + store.put('x_train_1', x_train_1, format='t') + store.put('x_train_2', x_train_2, format='t') + store.put('x_train_3', x_train_3, format='t') + store.put('x_val_0', x_val_0, format='t') + store.put('x_val_1', x_val_1, format='t') + store.put('x_val_2', x_val_2, format='t') + store.put('x_val_3', x_val_3, format='t') if __name__ == '__main__': From 259c59bf39ff5d929be51fb1dde1137604067028 Mon Sep 17 00:00:00 2001 From: Hyunseung Yoo Date: Sun, 4 Aug 2019 10:22:55 -0500 Subject: [PATCH 102/331] add fom default model --- Pilot1/Uno/uno_fom_model.txt | 38 ++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 Pilot1/Uno/uno_fom_model.txt diff --git a/Pilot1/Uno/uno_fom_model.txt b/Pilot1/Uno/uno_fom_model.txt new file mode 100644 index 00000000..cf66baae --- /dev/null +++ b/Pilot1/Uno/uno_fom_model.txt @@ -0,0 +1,38 @@ +[Global_Params] +train_sources=['GDSC'] +test_sources=['train'] +cell_types=None +cell_features=['rnaseq'] +drug_features=['descriptors', 'fingerprints'] +dense=[1000, 1000, 1000] +dense_feature_layers=[1000, 1000, 1000] +activation='relu' +loss='mse' +optimizer='adam' +scaling='std' +drop=0 +epochs=50 +batch_size=512 +validation_split=0.2 +cv=1 +max_val_loss=1.0 +learning_rate=None +base_lr=None +residual=False +reduce_lr=False +warmup_lr=False +batch_normalization=False +feature_subsample=0 +rng_seed=2018 +save_path='save/uno' +no_gen=False +verbose = False +use_landmark_genes=True +preprocess_rnaseq='source_scale' +no_feature_source=True +no_response_source=True +single=True + +[Monitor_Params] +solr_root='' +timeout=-1 From 131b04a96549061ad729472d014c61598a19f0bc Mon Sep 17 00:00:00 2001 From: Hyunseung Yoo Date: Tue, 13 Aug 2019 15:22:54 -0500 Subject: [PATCH 103/331] generate random split when node is not given; fix hdfstore issues --- Pilot1/Uno/topN_to_uno.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/Pilot1/Uno/topN_to_uno.py b/Pilot1/Uno/topN_to_uno.py index 87c03a9e..5ab5f3ff 100644 --- a/Pilot1/Uno/topN_to_uno.py +++ b/Pilot1/Uno/topN_to_uno.py @@ -30,7 +30,9 @@ def read_plan(filename, node): def build_masks(args, df): if args.node is None: - raise Exception('Node id is not given') + print('node is None. 
Generate Random split') + mask = np.random.rand(len(df)) < 0.8 + return mask, ~mask plan = read_plan(args.plan, args.node) mask = {} @@ -96,18 +98,20 @@ def build_dataframe(args): x_train_0 = df_cl[train_mask].reset_index(drop=True) x_train_1 = df_dd[train_mask].reset_index(drop=True) + x_train_1.columns = [''] * len(x_train_1.columns) x_val_0 = df_cl[val_mask].reset_index(drop=True) x_val_1 = df_dd[val_mask].reset_index(drop=True) + x_val_1.columns = [''] * len(x_val_1.columns) # store - store = pd.HDFStore('topN.uno.h5', 'w') - store.put('y_train', y_train) - store.put('y_val', y_val) - store.put('x_train_0', x_train_0) - store.put('x_train_1', x_train_1) - store.put('x_val_0', x_val_0) - store.put('x_val_1', x_val_1) + store = pd.HDFStore('topN.uno.h5', 'w', complevel=9, complib='blosc:snappy') + store.put('y_train', y_train, format='table') + store.put('y_val', y_val, format='table') + store.put('x_train_0', x_train_0, format='table') + store.put('x_train_1', x_train_1, format='table') + store.put('x_val_0', x_val_0, format='table') + store.put('x_val_1', x_val_1, format='table') if __name__ == '__main__': From 042f673148969846a9f1de8708cd7e3fc7ccb822 Mon Sep 17 00:00:00 2001 From: Hyunseung Yoo Date: Thu, 15 Aug 2019 09:33:09 -0500 Subject: [PATCH 104/331] set timeout unlimited --- Pilot1/Uno/uno_auc_model.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Pilot1/Uno/uno_auc_model.txt b/Pilot1/Uno/uno_auc_model.txt index 00d2224e..23f93ba8 100644 --- a/Pilot1/Uno/uno_auc_model.txt +++ b/Pilot1/Uno/uno_auc_model.txt @@ -36,4 +36,4 @@ single=True [Monitor_Params] solr_root='' -timeout=3600 +timeout=-1 From 6a9c9ea579538799a7a4a6dbce12c3c29627e952 Mon Sep 17 00:00:00 2001 From: Hyunseung Yoo Date: Wed, 21 Aug 2019 09:32:09 -0500 Subject: [PATCH 105/331] set hyper-params for auc training --- Pilot1/Uno/uno_auc_model.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Pilot1/Uno/uno_auc_model.txt b/Pilot1/Uno/uno_auc_model.txt index 23f93ba8..4a803b43 100644 --- a/Pilot1/Uno/uno_auc_model.txt +++ b/Pilot1/Uno/uno_auc_model.txt @@ -8,15 +8,15 @@ dense=[1000, 1000, 1000] dense_feature_layers=[1000, 1000, 1000] activation='relu' loss='mse' -optimizer='adam' +optimizer='sgd' scaling='std' drop=0 epochs=50 -batch_size=512 +batch_size=32 validation_split=0.2 cv=1 max_val_loss=1.0 -learning_rate=None +learning_rate=0.0001 base_lr=None residual=False reduce_lr=False From 86abd85f946b65cddebcb10974d5897e0029f14c Mon Sep 17 00:00:00 2001 From: Hyunseung Yoo Date: Thu, 22 Aug 2019 15:55:34 -0500 Subject: [PATCH 106/331] code cleanup --- Pilot1/Uno/topN_to_uno.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Pilot1/Uno/topN_to_uno.py b/Pilot1/Uno/topN_to_uno.py index 5ab5f3ff..dd81d9f3 100644 --- a/Pilot1/Uno/topN_to_uno.py +++ b/Pilot1/Uno/topN_to_uno.py @@ -31,13 +31,13 @@ def read_plan(filename, node): def build_masks(args, df): if args.node is None: print('node is None. 
Generate Random split') - mask = np.random.rand(len(df)) < 0.8 + mask = training_mask(df) return mask, ~mask plan = read_plan(args.plan, args.node) mask = {} for partition in ['train', 'val']: - _mask = df['Sample'] == None + _mask = df['Sample'] is None for i, element in enumerate(plan[partition]): cl_filter = element['cell'] dr_filter = element['drug'] From 16fb63f9fd4eace1ad54a1228335e3d19ead237a Mon Sep 17 00:00:00 2001 From: Hyunseung Yoo Date: Fri, 23 Aug 2019 00:43:13 -0500 Subject: [PATCH 107/331] use plangen api --- Pilot1/Uno/plangen.py | 1489 +++++++++++++++++++++++++++++++++++++ Pilot1/Uno/topN_to_uno.py | 87 ++- 2 files changed, 1560 insertions(+), 16 deletions(-) create mode 100644 Pilot1/Uno/plangen.py diff --git a/Pilot1/Uno/plangen.py b/Pilot1/Uno/plangen.py new file mode 100644 index 00000000..5eccdcca --- /dev/null +++ b/Pilot1/Uno/plangen.py @@ -0,0 +1,1489 @@ + +from collections import deque +from collections import namedtuple +from enum import Enum +import glob +import itertools as it +import json +import numpy as np +import os +import sys +import sqlite3 +from sqlite3 import Error as db_Error + +# import planargs + +from abc import ABC, abstractmethod # abstract class support +from collections import OrderedDict +from scipy.special import comb +from pprint import pprint as pp +from datetime import datetime + +ISO_TIMESTAMP = "seconds" # timestamp to ISO string +ISO_TIMESTAMP_ENCODE = "%Y-%m-%dT%H:%M:%S" # ISO string to timestamp +DEBUG_SQL = False + +def isempty(path): + """Determine whether the given directory is empty.""" + flist = glob.glob(os.path.join(path,'*')) + return flist == [] + + +def validate_args(args): + """Validate the execution arguments as defined in planargs.py. + + This function validates input arguments defined in the 'args' namespace. + The inputs are lists series of feature-set names (fs_names), files + (fs_paths) and partitioning attributes (fs_parts). fs_names and fs_files + must designate the same number of parameters. For example: + + --fs_names CELL DRUG --fs_paths cells.txt drugs.txt + + The CELL name is paired with the cells.txt file, DRUG with drugs.txt, etc. + Currently, this one for one correspondence also applies to the fs_part arg, + which specifies the number of partitions the feature-set list is broken + into at every level of the plan generation recursion. A complete example + might look like this: + + --fsnames CELL DRUG --fs_paths cells.txt drugs.txt --fs_parts 2 2 + + An output directory for the plan in any of its formats is given by out_dir. + An input directory may be specified via in_dir to simplify the coding of + fs_paths. Otherwise, feature-set files must be fully specified. Each of the + files is read and returned. + + Returns + Upon success, a tuple is returned. It contains: + + t[0] - the generator class implementing the appropriate partition() + function. + + t[1] - a list of feature-set entry lists is returned. All entries + are stripped of white-space, all white-space lines have been removed. + For example: + + [[CELL1 ... CELLn] [DRUG1 ... DRUGn]] + + Additionally, an args.lines list is created where each entry contains + the entry count of the corresponding fs_paths file argument. 
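+    (In the code this per-file line-count list is stored as args.fs_lines,
+    which main() later passes to plan_init() as its fs_lines argument.)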
+ """ + params = {} + verbose = args.verbose + + fs_names_len = len(args.fs_names) + fs_paths_len = len(args.fs_paths) + fs_parts_len = len(args.fs_parts) + + nbr_feature_sets = fs_names_len + test_lengths = [fs_names_len, fs_paths_len, fs_parts_len] + reqd_lengths = [nbr_feature_sets] * 3 + + if test_lengths != reqd_lengths: + sys.exit("Error: The lengths of all feature set definition args (fs_<>) must be identical") + + if nbr_feature_sets <= 1: + sys.exit("Error: Partitioning requires multiple feature sets") + + for nparts in args.fs_parts: + if nparts < 1 or nparts >= 8: + sys.exit("Error: Invalid partitioning value %d" % nparts) + + # validate input and output directories + if args.in_dir and not os.path.isdir(args.in_dir): + sys.exit("Error: --in_dir must designate a directory, '%s' is not valid" % args.in_dir) + + if not os.path.isdir(args.out_dir): + sys.exit("Error: --out_dir must designate a directory, '%s' is not valid" % args.out_dir) + + if not args.overwrite and not isempty(args.out_dir): + sys.exit("Error: --out_dir '%s' is not empty, --overwrite not specified" % args.out_dir) + + if verbose: + print("Writing plan information to %s" % os.path.abspath(args.out_dir)) + + # expand, validate and load input feature-set content lists + fs_content = [] + args.fs_lines = [] + file_error = False + if args.in_dir == None: + args.in_dir = '' # prepare for use in os.path.join() + + for i, path in enumerate(args.fs_paths): + fullpath = os.path.join(args.in_dir, path) + if not os.path.exists(fullpath): + file_error = True + print("Error: %s file not found" % fullpath) + else: + with open(fullpath, 'r') as f: # read text and sanitize + raw_lines = f.readlines() + + text = [line.strip() for line in raw_lines] + text = [l for l in text if l != ''] + fs_content.append(text) + args.fs_lines.append(len(text)) + + if verbose: + print("Loading '%s' feature set definition from %s - %d lines" + % (args.fs_names[i], fullpath, len(text))) + + if file_error: + sys.exit("Terminating due to error") + + # construct a partitioning object exporting a partion() function + if args.partition_strategy == 'leaveout': + generator = LeaveoutSubsetGenerator() + + # return feature-set contents lists + return generator, fs_content + + +class SubsetGenerator(ABC): + """Abstract class implementing a data partitioning method. + + The SubsetGenerator class provides a template for subclasses that implement + mechanisms for dividing sets of lists into sublists for the purpose of + defining unique ML training and validation sets. + + Subclasses must implement those methods defined as @abstractmethod. + The validate() function provided here does a sanity test for all anticipated + partitioning schemes. Subclasses should implement their specializations. + """ + + def __init__(self, name=''): + self.name = name + self.term_msg = "Terminating due to error" + + @abstractmethod + def partition( + self, + base, + size=None, + count=None, + name='-unspecified-' + ): + """Partition a feature-set array. + + Partition the 'base', a list of elements, using the abstract arguments + 'size' and 'count' to tailor the implementation's algorithm. 'name' is + used in error reporting and is optional. + """ + validate(self, base, size, count, name) + return [] + + def get_plan_label(self, plan_dict, root_name): + root = plan_dict[root_name] + return root['label'] + + def _validation_error(self, base_len, size, count, name='-unspecified-'): + """Provide a common error reporting function. 
""" + print("Base list length: %d requested %d sublists of length %d" % + (base_len, count, size)) + + def validate(self, base, size=None, count=None, name='-unspecified-'): + """Provide basic request validation, specific generators may impose + additional requirements. + """ + berror = False + base_len = len(base) + + if size == None or size <= 0 or size > base_len: + berror = True + else: + unique_combos = comb(base_len, size) # implements N take K + if count > unique_combos: + berror = True + if berror: + SubsetGenerator._validation_error(self, base_len, size, count, name) + + return not berror + +# +# UNDER EVALUATION ????????????????????????????????????????????????????? +# + +class IterativeSubsetGenerator(SubsetGenerator): + """ Tom Brettin method... subset generation via iteration over base""" + def __init__(self): + SubsetGenerator.__init__(self, 'IterativeSubsetGenerator') + + def partition(self, base, size=None, count=0, name=None): + """ """ + + if size is None: + print("Error: Unspecified list partitioning size") + sys.exit(3) + + """ + base_len = len(base) + if count == 0: # a simplification useful in the iterative approach + count = base_len + """ + + is_valid = SubsetGenerator.validate(self, base, size, count, name) + if not is_valid: + print(self.term_msg) + sys.exit(1) + + if count > base_len: + SubsetGenerator._validation_error(self, base_len, size, count, name) + print(self.term_msg) + sys.exit(2) + + np_base = np.array(base) + selected_sublists = [] + omit_size = base_len - size + increment = min(size, omit_size) + + # omit consecutive blocks of feature-name entries + for i in range(count): + org = i * increment + if org >= base_len: + org = org % base_len + if org == 0 and i > 0: + print("Warning: %d sublists of %s completed short of the requested %d" + % (i, name, count)) + break + + end = org + size + sublist = np_base.take(range(org, end), mode='wrap') + print(sublist) + selected_sublists.append(sublist) + + return selected_sublists + + +class LeaveoutSubsetGenerator(SubsetGenerator): + """CANDLE milestone 13 style feature set partitioning. + + All SubsetGenerator subclasses are required to implement partition(), + plan_init() and plan_term() functions. + """ + + def __init__(self): + SubsetGenerator.__init__(self, 'LeaveoutSubsetGenerator') + self.strategy = "leaveout" + + def plan_init(self, fs_names, fs_paths, fs_lines, fs_parts, maxdepth, root_name='1'): + """Initialize - collect plan metadata """ + currtime = datetime.now() + details = {'fs_names': fs_names, 'fs_filepaths':fs_paths, 'fs_parts': fs_parts} + details['create_date'] = currtime.isoformat(timespec=ISO_TIMESTAMP) + details['strategy'] = self.strategy + + label = '' + for i in range(len(fs_names)): + if i != 0: + label += '_' + s = '{}{}-p{}'.format(fs_names[i], fs_lines[i], fs_parts[i]) + label += s + + if maxdepth > 0: + label += '-maxdepth{}'.format(maxdepth) + + details['label'] = label + plan_dict = OrderedDict() + plan_dict[root_name] = details + return root_name, plan_dict + + def plan_term(self, plan_dict, root_name, nbr_subplans): + """Completion - post plan summary metadata """ + meta = plan_dict[root_name] + meta['nbr_subplans'] = nbr_subplans + + + def partition(self, base, size='n/a', count=None, name=None): + """Partition a feature-set list into lists of equal sized elements. + + This partitioner accepts a list of feature-set names and returns + 'count' lists, the elements evenly divided between these lists. 
+ The last sublist will contain more or fewer elements if the base + list cannot be evenly divided. + + Args + base: A list of feature-set names. + size: Ignored, not used in this implementation. + count: The number of equal sized partitions requested, required. + name: A tag used for debug/error tracing. Not used in this + implementation. + + These arguments are common to all partition functions defined in + SubsetGenerator subclasses. + + Returns + When the input 'base' list contains a number of entries equal to or + greater than 'count', a list of 'count' sublists is returned. For + example: + + [[CELL1, ..., CELL4], [CELL5, ..., CELL7]] + + Otherwise the base list is returned as a list of lists, each list + containing one feature from the input list. This implementation + maintains compatibility with the "standard" return format discussed + above. + """ + + base_len = len(base) + if base_len < count: # can partition any further? + return [[feature] for feature in base] + + size = base_len // count + sublists = [] + + for i in range(count): + org = i * size + end = org + size + if i != count - 1: + part = base[org:end] + else: + part = base[org:] + sublists.append(part) + + return sublists + +#------------------------------------------------------------------------------ +# Database support, table and column definitions, DDL and DML +# Refer to the plan_prep() function for a discussion of the "planstat" and +# "runhist" tables defined below. +#------------------------------------------------------------------------------ + +class RunType(Enum): + RUN_ALL = 0 + RESTART = 1 + +class RunStat(Enum): # subplan execution status + SCHEDULED = 'scheduled' + COMPLETE = 'complete' + +# planstat table, rows are returned via the PlanstatRow namedtuple + +_planstat_ddl = """ + CREATE TABLE IF NOT EXISTS planstat ( + plan_name TEXT NOT NULL PRIMARY KEY, + create_date TEXT NOT NULL, + feature_sets TEXT NOT NULL, + partitions TEXT NOT NULL, + nbr_subplans INTEGER + ); """ + +PlanstatRow = namedtuple('PlanstatRow', + [ + 'rowid', + 'plan_name', + 'create_date', + 'feature_sets', + 'partitions', + 'nbr_subplans' + ] +) + +_select_row_from_planstat = """ + SELECT rowid, + plan_name, create_date, feature_sets, partitions, nbr_subplans + FROM planstat + WHERE plan_name='{}' + """ + +_insert_planstat_plan = """ + INSERT INTO planstat ( + plan_name, create_date, feature_sets, partitions, nbr_subplans) + VALUES ('{}', '{}', '{}', '{}', {}) + """ + +_delete_planstat_plan = """ + DELETE FROM planstat where rowid = {} + """ + +# runhist table, rows are returned via the RunhistRow namedtuple + +_runhist_ddl = """ + CREATE TABLE IF NOT EXISTS runhist ( + plan_id INTEGER NOT NULL, + subplan_id TEXT NOT NULL, + status TEXT NOT NULL, + start_time TEXT NOT NULL, + stop_time TEXT, + run_mins INT, + mae REAL, + mse REAL, + r_square REAL, + other_info TEXT, + weights_fn TEXT, + PRIMARY KEY (plan_id, subplan_id) + ); """ + +RunhistRow = namedtuple('RunhistRow', + [ + 'plan_id', + 'subplan_id', + 'status', + 'start_time', + 'stop_time', + 'run_mins', + 'mae', + 'mse', + 'r_square', + 'other_info', + 'weights_fn' + ] +) + +_select_row_from_runhist = """ + SELECT plan_id, subplan_id, status, + start_time, stop_time, run_mins, + mae, mse, r_square, other_info, weights_fn + FROM runhist + WHERE plan_id = {} and subplan_id = '{}' + """ + +_insupd_scheduled_runhist = """ + REPLACE INTO runhist(plan_id, subplan_id, status, start_time, + stop_time, run_mins, mae, mse, r_square, other_info, weights_fn) + VALUES({}, '{}', '{}', '{}', 
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL) + """ + +_insupd_completed_runhist = """ + UPDATE runhist SET + status = '{}', + stop_time = '{}', + run_mins = {}, + mae = {}, + mse = {}, + r_square = {}, + other_info = '{}', + weights_fn = '{}' + WHERE + plan_id = {} AND subplan_id='{}' + """ + +_delete_from_runhistory = """ + DELETE FROM runhist where plan_id = {} + """ + +#------------------------------------------------------------------------------ +# "Plan management" Database functions +# +# db_connect - establish database connection returning conn handle +# execute_sql_stmt - execute a SQL statement with optional error trap +# plan_prep - prepare for the execution of a multi-step "plan" +# start_subplan - start a subplan, (ex. '1.4.8'), write RunhistRow +# stop_subplan - stop a subplan, update RunhistRow +# get_subplan_runhist - return a RunhistRow for a given subplan +# plan_remove - remove all database records for the named plan +#------------------------------------------------------------------------------ + +def execute_sql_stmt(conn, stmt, cursor=None, trap_exception=False): + """Execute a SQL statement. + + This is a convenience function that wraps the execution of a given SQL + statement with exception handling and cleanup logic. + + Args + conn: An open database connection handle + stmt: A fully instantiated SQL statement + + cursor: Optionally, a cursor managed by the caller. If + local cursor is used. Provide a cursor if you must + operate on it after completion, fetchall() for example. + + trap_exception: By default exceptions raised by the database must be + handled by the caller. If True, errors are reflected + by the boolean return value and the cursor and/or + connection handle provided by the caller are closed.. + + Returns + False indicates that an exception occurred, else True. + """ + + if cursor: + lclcsr = cursor + else: + lclcsr = conn.cursor() + try: + if DEBUG_SQL: + with open("plangen_db.log", "a") as fp: + fp.write("STMT: " + stmt + "\n") + + db_exception = False + lclcsr.execute(stmt) + + except db_Error as e: + db_exception = True + print('execute_sql_stmt:', stmt) + print('execute_sql_stmt:', e) + if not trap_exception: + raise + finally: + if not cursor: + lclcsr.close() + + if db_exception: + if cursor: + cursor.close() + conn.close() + + return not db_exception + + +def db_connect(db_path): + """Connect to the plan management database. + + Establish a connection to the sqlite3 database contained in the named file. + A plan management database is created and populated at db_path if the file + does not exist. + + Args + db_path: A relative or absolute path or ":memory:" + + Returns + A connection handle is returned upon success, else None + """ + + if db_path == ':memory:' or not os.path.exists(db_path): + prev_allocated = False + else: + prev_allocated = True + + try: + conn = sqlite3.connect(db_path) + except db_Error as error: + print('db_connect', error) + raise + + # create plan management tables on initial database allocation + if conn and not prev_allocated: + complete = execute_sql_stmt(conn, _planstat_ddl) + complete &= execute_sql_stmt(conn, _runhist_ddl) + + if complete: + conn.commit() + else: + conn.close() + conn = None + return conn + + +def plan_remove(db_path, plan_path): + """Delete the named plan from the plan managment database. + + The relative plan name is extracted from the plan_path by removing the + leading directories and the trailing filetype suffix from the given + plan_path. 
The planstat row is retrieved and the associated rowid is + the plan_id identifying the target runhist table rows. + + Returns + Zero indicates deletion complete, -1 if the plan name is not matched. + """ + + status = 0 + conn = db_connect(db_path) + plan_key = _get_planstat_key(plan_path) + stmt = _select_row_from_planstat.format(plan_key) + csr = conn.cursor() + execute_sql_stmt(conn, stmt, cursor=csr) + nrow = csr.rowcount + row = csr.fetchone() + + print("%d run history rows deleted" % nrow) + + if not row: + print("Error: CLEANUP request failed - %s has not been run" % plan_key) + status = -1 + else: + plan_rec = PlanstatRow._make(row) # column-name addressable + rowid = plan_rec.rowid # the unique rowid is the plan uniquifier + _delete_runhistory(conn, rowid) + stmt = _delete_planstat_plan.format(rowid) + status = execute_sql_stmt(conn, stmt) + + csr.close() + conn.close() + return status + + +def plan_prep(db_path, plan_path, run_type=RunType.RUN_ALL): + """Prepare to run a plan, a hierarchy of interdependent subplans. + + Plan names and related information are stored in the planstat (PLAN STATUS) + table. There is one row for each plan submitted. A positive, unique integer + called the 'rowid' is assigned to table rows by the database manager. The + rowid of a planstat table row is defined here as the "plan_id". The plan_id + together with a textual "subplan_id" (example: '1.2.4') form a composite + key that is the primary key of the runhist (RUN HISTORY) table. The purpose + of this function is to register the plan and return the associated plan_id. + + RunTypes + When a new plan is presented it is registered in the planstat table and + during its execution a large number of runhist (RUN HISTORY) table + entries are created and then updated. To prevent unintended loss of + data one of the following "RunTypes" is specified on the initial + plan_prep() call and again on subsequent start_subplan() calls. + + Specify RUN_ALL on the first attempt to run a plan. If the plan name + is already registered, the request fails and neither the planstat or + runstat tables are changed. + + Specify RESTART if a prior attempt to run a plan did not complete. The + presence of a corresponding planstat record is verified. start_subplan() + returns a SKIP status if the associated runhist row (if any) is marked + COMPLETE. + + Args + db_path: plan management database path (relative or absolute) + plan_path: JSON plan file (relative or absolute) + run_type: RunType.RUN_ALL, the default, or RunType.RESTART + + Returns + A negative value indicates a fatal error. + + Otherwise the integer returned is the plan_id used together with a + subplan_id string used in subsequent start_subplan(), stop_subplan() + and get_subplan_hist() calls. 
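+    Example
+        An illustrative sketch only; the database path, plan file name and
+        metric values below are hypothetical, mirroring the pattern used by
+        the test1() driver at the bottom of this module:
+
+            plan_id = plan_prep('plan_mgmt.db', 'plangen_CELL8-p2_DRUG8-p2.json')
+            if plan_id >= 0:
+                status = start_subplan('plan_mgmt.db',
+                                       'plangen_CELL8-p2_DRUG8-p2.json',
+                                       plan_id=plan_id, subplan_id='1.1',
+                                       run_type=RunType.RUN_ALL)
+                if status == 0:
+                    # ... train subplan '1.1', then record its completion ...
+                    stop_subplan('plan_mgmt.db', plan_id=plan_id, subplan_id='1.1',
+                                 comp_info_dict=dict(mae=0.02, mse=0.001,
+                                                     r_square=0.9))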
+ """ + + # load the plan and retrieve identity info + plan_dict = load_plan(plan_path) + create_date = get_plan_create_date(plan_dict) + feature_sets = get_plan_fs_names(plan_dict) + partitions = get_plan_fs_parts(plan_dict) + nbr_subplans = get_plan_nbr_subplans(plan_dict) + + # de termine if a plan of the given name has already been registered + conn = db_connect(db_path) + plan_key = _get_planstat_key(plan_path) + stmt = _select_row_from_planstat.format(plan_key) + csr = conn.cursor() + execute_sql_stmt(conn, stmt, cursor=csr) + row = csr.fetchone() + + if not row: + rowid = -1 + else: + plan_rec = PlanstatRow._make(row) # column-name addressable + rowid = plan_rec.rowid # the unique rowid will be the uniquifier returned + + # compare run_type to initial expectations + error = False + + if run_type == RunType.RUN_ALL and rowid > 0: + print("Error: RUN_ALL specified but plan: %s has already been defined" % plan_key) + error = True + + elif run_type == RunType.RESTART and rowid < 0: + print("Warning: RESTART specified but plan: %s has not been previously run" % plan_key) + + elif rowid > 0 and create_date != create_date: # DEBUG ???????????????????????????????????? plan_rec.create_date: + print("Error: RESTART specified but the signature of the previously defined plan: %s does not match" % plan_key) + error = True + + # register new plans acquiring the uniquifying plan_id used to compose runhistory table keys + if not error and rowid < 0: + feature_sets = str(feature_sets) + feature_sets = feature_sets.replace("'", "") # create string literal from list of str + partitions = str(partitions) # create string literal from list of int + + stmt = _insert_planstat_plan.format( + plan_key, + create_date, + feature_sets, + partitions, + nbr_subplans + ) + + status = execute_sql_stmt(conn, stmt, cursor=csr) + rowid = csr.lastrowid + + # cleanup resources and return uniquifier or error indicator + csr.close() + conn.commit() + + if error: + return -1 + else: + return rowid + + +def start_subplan(db_path, plan_path, plan_id=None, subplan_id=None, run_type=None): + """Schedule the execution of a subplan. + + This function writes a RunhistRow record to the runhist table indicating that + the named plan/subplan has been SCHEDULED. The row includes the "start time". + If the given run_type is RESTART, it is possible that the subplan has already + run, as indicated by the status returned. + + Args + db_path: plan management database path (relative or absolute) + plan_path: JSON plan file (relative or absolute) + plan_id: the plan identifier returned by plan_prep() + subplan_id the subplan identifier ex. '1 4.8' + run_type: RunType.RUN_ALL or RunType.RESTART + + Returns + Zero indicates that a RunhistRow record has been created to represent + the subplan. -1 is returned from a RESTART call if the a RunhistRow + already exists for the plan/subplan and is marked COMPLETE. 
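+    Example
+        A sketch of the RESTART pattern, assuming db_path, plan_path and
+        plan_id come from a prior plan_prep() call; the subplan id is
+        illustrative:
+
+            status = start_subplan(db_path, plan_path, plan_id=plan_id,
+                                   subplan_id='1.4.8', run_type=RunType.RESTART)
+            if status < 0:
+                print('subplan 1.4.8 already completed, skipping')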
+ """ + + conn = db_connect(db_path) + csr = conn.cursor() + skip = False + + # skip previously completed work if RESTART + if run_type == RunType.RESTART: + stmt = _select_row_from_runhist.format(plan_id, subplan_id) + execute_sql_stmt(conn, stmt, cursor=csr) + row = csr.fetchone() + + if row: + runhist_rec = RunhistRow._make(row) + if runhist_rec.status == RunStat.COMPLETE.name: + skip = True + + # construct/reinit a new runhist record + if not skip: + currtime = datetime.now() + start_time = currtime.isoformat(timespec=ISO_TIMESTAMP) + + stmt = _insupd_scheduled_runhist.format( + plan_id, + subplan_id, + RunStat.SCHEDULED.name, + start_time + ) + + execute_sql_stmt(conn, stmt, cursor=csr) + + csr.close() + conn.commit() + conn.close() + + if skip: + return -1 + else: + return 0 + + +def stop_subplan(db_path, plan_id=None, subplan_id=None, comp_info_dict={}): + """Complete the execution of a subplan. + + This function updates the RunhistRow record created by start_subplan() + updating the status to COMPLETE, the completion timestamp, and "user + fields" (such as MAE, MSE, R2) returned by the model. + + A comp_dict dictionary is populated with the names and default values + for columns implemented in the RunhistRow table. Values matching those + names are extracted from the comp_info_dict are written to the table. + + Args + db_path: plan management database path (relative or absolute) + plan_path: JSON plan file (relative or absolute) + plan_id: the plan identifier returned by plan_prep() + comp_info_dict: supplemental completion data dictionar + """ + + conn = db_connect(db_path) + csr = conn.cursor() + curr_time = datetime.now() + stop_time = curr_time.isoformat(timespec=ISO_TIMESTAMP) + + comp_dict = dict(mae=0.0, mse=0.0, r_square=0.0, weights_fn='N/A', unprocessed='') + remainder = _acquire_actuals(comp_dict, comp_info_dict) + + if len(remainder) == 0: + other_info = '' + else: + other_info = json.dumps(remainder) + + # fetch row to retrieve schedule info + stmt = _select_row_from_runhist.format(plan_id, subplan_id) + execute_sql_stmt(conn, stmt, csr) + row = csr.fetchone() + + if row: # expected, caller error if already marked COMPLETED + runhist_rec = RunhistRow._make(row) + if runhist_rec.status != RunStat.COMPLETE.name: + start_time = datetime.strptime(runhist_rec.start_time, ISO_TIMESTAMP_ENCODE) + duration = curr_time - start_time + run_mins = int((duration.total_seconds() + 59) / 60) + + # update runhist record + stmt = _insupd_completed_runhist.format( + # column values + RunStat.COMPLETE.name, + stop_time, + run_mins, + comp_dict['mae'], + comp_dict['mse'], + comp_dict['r_square'], + other_info, + comp_dict['weights_fn'], + # key spec + plan_id, + subplan_id + ) + + execute_sql_stmt(conn, stmt) + + # cleanup + csr.close() + conn.commit() + conn.close() + + +def get_subplan_runhist(db_path, plan_id=None, subplan_id=None): + """Return the RunhistRow record for a given plan/subplan. + + Args + db_path: plan management database path (relative or absolute) + plan_id: the plan identifier returned by plan_prep() + subplan_id the subplan identifier ex. '1 4.8' + + Returns + The RunhistRow associated with the given plan/subplan is returned if + found. 
+ """ + conn = db_connect(db_path) + stmt = _select_row_from_runhist.format(plan_id, subplan_id) + csr = conn.cursor() + execute_sql_stmt(conn, stmt, csr) + row = csr.fetchone() + + if not row: + plan_rec = None + else: + plan_rec = RunhistRow._make(row) + + return plan_rec + +def _acquire_actuals(dft_dict, actuals_dict): + """Extract values from dictionary overlaying defaults.""" + actuals = actuals_dict.copy() + for key, value in dft_dict.items(): + if key in actuals: + dft_dict[key] = actuals[key] + actuals.pop(key) + + return actuals # possibly empty + + +def _get_planstat_key(plan_path): + """Extract the name portion of a plan from a filepath.""" + basename = os.path.basename(plan_path) + basepfx = basename.split(sep='.') + return basepfx[0] + + +def _delete_runhistory(conn, plan_id): + """Delete RunhistRows containing the given plan_id.""" + csr = conn.cursor() + stmt = _delete_from_runhistory.format(plan_id) + execute_sql_stmt(conn, stmt, cursor=csr, trap_exception=True) + rowcount = csr.rowcount + print("CLEANUP processing removed %d run history records" % rowcount) + csr.close() + return rowcount + + +#------------------------------------------------------------------------------ +# Plan navigation, content retrieval +#------------------------------------------------------------------------------ + +def load_plan(filepath): + """Load a JSON transfer learning plan. + + The named JSON tranfer learning plan file is loaded in a manner that preserves + the entry order imposed when the plan was created. This allows the root entry + to be easily located regardless of the plan entry naming scheme in use. + + Args + filepath: A relative or absolute path to the JSON file. + + Returns + An entry-ordered plan in OrderedDict format is returned. + """ + + with open(filepath, 'r') as f: + ordered_plan_dict = json.load(f, object_pairs_hook=OrderedDict) + return ordered_plan_dict + +def get_plan_create_date(plan_dict): + _, value = _get_first_entry(plan_dict) + return value['create_date'] + +def get_plan_fs_names(plan_dict): + _, value = _get_first_entry(plan_dict) + return value['fs_names'] + +def get_plan_fs_parts(plan_dict): + _, value = _get_first_entry(plan_dict) + return value['fs_parts'] + +def get_plan_nbr_subplans(plan_dict): + _, value = _get_first_entry(plan_dict) + return value['nbr_subplans'] + +def _get_first_entry(ordered_dict): + key, value = next(iter(ordered_dict.items())) + return key, value + +def get_subplan(plan_dict, subplan_id=None): + """Retrieve the content of a named subplan or the root plan. + + Args + plan_dict: The plan dictionary as returned by load_plan(). + subplan_id: The name of the desired subplan. Omit this arg to acquire + the content and name of the plan tree root. + + Returns + A (content, subplan_id) pair is returned. The returned name is useful when + using default arguments to retrieve the root plan. + """ + + if subplan_id is None: + subplan_id, content = _get_first_entry(plan_dict) + else: + content = plan_dict.get(subplan_id) + return content, subplan_id + + +def get_predecessor(plan_dict, subplan_id): + """Acquire the name of the predecessor (parent) of a given subplan. + + The plan tree is a true tree. All subplans have exactly one + predecessor/parent. Use this function to walk 'up' the tree. + + Args + plan_dict: The plan dictionary as returned by load_plan(). + subplan_id: The name of the target subplan. + + Returns + The name of the parent subplan is returned. If the root plan name + is specified None is returned. 
+ """ + + segments = subplan_id.split(sep='.') + if len(segments) <= 1: + subplan_id = None + else: + segments.pop() + subplan_id = '.'.join(segments) + return subplan_id + + +def get_successors(plan_dict, subplan_id): + """Acquire the names of the successors (children) of a given subplan. + + All subplans other than 'leaf' subplans have at least one successor. Use + this function to walk 'down' one or more plan subtrees. + + Args + plan_dict: The plan dictionary as returned by load_plan(). + subplan_id: The name of the target subplan. + + Returns + A list of the names of all successors (children) of the given subplan + is returned. The list may be empty. + """ + successor_names = [] + for i in it.count(start=1): + new_name = subplan_id + '.' + str(i) + value = plan_dict.get(new_name) + if not value: + break + successor_names.append(new_name) + + return successor_names + + +def _get_named_set(plan_dict, subplan_id, section_tag, fs_name, collector, parent_features=None): + """ """ + + while True: + content, _ = get_subplan(plan_dict, subplan_id) + assert(content) + + section = content[section_tag] + for i, section_features in enumerate(section): + feature_list = section_features[fs_name] + collector.insert(i, feature_list) + + if not parent_features: + break + + # visit parent node, root has no feature information and ends upward traversal + subplan_id = get_predecessor(plan_dict, subplan_id) + grand_parent_id = get_predecessor(plan_dict, subplan_id) + + if not grand_parent_id: + break + + +def get_subplan_features(plan_dict, subplan_id, parent_features=False): + """Return train and validation features associated with a named subplan. + + Args + plan_dict: The plan dictionary as returned by load_plan()x. + subplan_id: The name of the target subplan + parent_features: True or False + + Returns + The result is four-tuple (t0, t1, t2, t30) constructed as follows. + Some applications may choose to discard some of the returns, t0 and + t1, for example. + + t0 - the result dictionary which is disassmbled as follows + t1 - a list of feature names found in the train/validate sets + t2 - training feature set dictionary as described below + t3 - validation feature set dictionary as described below + + t2 and t3 are dictionaries that represent one or more training sets + and one or more validation sets, respectively. The key of each entry + is a feature-set name as returned in the t1 list, ['cell', 'drug'] for + example. The value of each is a list of lists. + + Consider a training feature set dictionary returned as follows: + + { + 'cell': [[C1, C2, C3, C4], [C5, C6, C7, C8]], + 'drug': [[ [D1, D2] , [D3, D4]] + } + + The feature sets defined here are the combination of (cell[0], drug[0]) + and (cell[1], drug[1]). The lenghts, i.e. number of sublists of each + dictionary entry are always equal. 
+ """ + + # acquire feature_set names populated in the plan + content, _ = get_subplan(plan_dict, subplan_id) + if not content: + return None, None + + # peek inside the training set to capture active feature-set names + train_set = content['train'][0] + fs_names = [name for name in train_set.keys()] + + # categorize the results + result = {} + result[0] = fs_names + result['train'] = {} + result['val'] = {} + + for set_name, pf in [('train', True), ('val', False)]: + if pf == True: + pf = parent_features + + for fs_name in fs_names: + collector = [] + _get_named_set( + plan_dict, + subplan_id, + set_name, + fs_name, + collector, + parent_features=pf + ) + + result[set_name][fs_name] = collector + + return result, result[0], result['train'], result['val'] + +#------------------------------------------------------------------------------ +# Plan construction +#------------------------------------------------------------------------------ + +def build_dictionary_from_lists(seq_list, names): + """Create a dictionary with 'names' as labels and 'seq_list' values.""" + dict = {} + for seq, tag in zip(seq_list, names): + dict[tag] = list(seq) + return dict + + +def build_plan_tree(args, feature_set_content, parent_plan_id='', depth=0, data_pfx='', plan_pfx=''): + """Generate a plan supporting training, transfer-learning, resume-training. + + ADD GENERAL DOC + + This function is recursive. + + Arguments: + args: A namespace capturing the values of command line arguments + and parameter values derived from those arguments. Refer to + validate_args(). + + feature_set_content: This is a list of sublists, where each sublist + contains the names of the nth group of feature-set elements. + + parent_plan_id: This is the name of the parent's plan. The name + is extended with '.nn' at each level of the recursion to + ensure that parentage/liniage is fully conveyed in each + (subplan) plan_id. + + depth: Specify 0 on the root call. This arg can be used to + determine/set the current level of the recursion. + + data_pfx: Reserved for constructing feature-set name files. + plan_pfx: Reserved for constructing plan control files. + + Returns + args.plan_dict contains a dictionary representing the plan. This may be + JSONized. + + The number of planning steps (nbr of subplans in the plan tree) is explicitly + returned. 
+ """ + curr_depth = depth + 1 + if args.maxdepth > 0 and curr_depth >= args.maxdepth: + return 0 + + all_parts = [] + + #flat_partitions = [] # preserve, used for file-based approach + #files = [] # preserve, used for file-based approach + #sequence = 0 # preserve, used for file-based approach + xxx = False + + for i in range(len(args.fs_names)): + group = feature_set_content[i] + count = args.fs_parts[i] + feature_set_name = args.fs_names[i] + partitions = args.generator.partition(feature_set_content[i], count=count) + all_parts.append(partitions) + + # acquire a cross-product of all feature-set partitions + parts_xprod = np.array(list(it.product(*all_parts))) + steps = len(parts_xprod) + + if steps > 1: + substeps = 0 + for step in range(steps): + train = [] + val = [] + + # split into validation and training components + for i, plan in enumerate(parts_xprod): + section = build_dictionary_from_lists(plan, args.fs_names) + if i == step: + val.append(section) + else: + train.append(section) + + # generate next depth/level (successor) plans + curr_plan_id = '{}.{}'.format(parent_plan_id, step + 1) + args.plan_dict[curr_plan_id] = {'val': val, 'train': train} + data_name = '{}.{}'.format(data_pfx, step + 1) + plan_name = '{}.{}'.format(plan_pfx, step + 1) + + # depth-first, shorthand representation of tree showing first feature names + if args.debug: + indent = ' ' * (depth * 4) + print(indent, curr_plan_id) + indent += ' ' * 4 + fs = parts_xprod[step] + for i in range(len(fs)): + print(indent, args.fs_names[i], 'count:', len(fs[i]), 'first:', fs[i][0]) + + substeps += build_plan_tree( + args, + parts_xprod[step], + parent_plan_id=curr_plan_id, + depth=curr_depth, + data_pfx=data_name, + plan_pfx=plan_name + ) + + steps += substeps + return steps + + """ + # THIS IS A WORK-IN-PROGRESS ... 
GENERATING FILES FOR DATA AND PLAN + + files.append([]) + files_ndx = len(files) - 1 + + for j in range(len(partitions)): + part = partitions[j] + flat_partitions.append(part) + if len(part) == 0: + sys.exit("big trouble ?????????????") + + sequence += 1 + file_name = '{}.{}.{}'.format(data_pfx, sequence, feature_set_name) + print("writing file %s with %d entries" % (file_name, len(part))) # write out 'part' + #write_file(file_name, part) + pair = (feature_set_name, file_name) + files[files_ndx].append(pair) + + file_xprod = np.array(list(it.product(*files))) + nbr_plans = len(file_xprod) + + for seq in range(nbr_plans): + plan_string = '' + + for ndx, curr in enumerate(file_xprod): + if ndx == seq: + plan_string += '--val (' + else: + plan_string += '--inc (' + for (tag, fname) in curr: + plan_string += '{}-{} '.format(tag, fname) + plan_string += ')' + + file_name = '{}.{}'.format(plan_pfx, seq + 1) + print(file_name) + plan_lines = list(plan_string) + #write_file(file_name, plan_lines) + + # construct list of omitted feature entries + + for seq in range(nbr_plans): + omitted_feature_content = [] + org = 0 + + for i in partition_spec: + omitted_feature_content.append(flat_partitions[org]) + org = i + + data_name = '{}.{}'.format(data_pfx, seq + 1) + plan_name = '{}.{}'.format(plan_pfx, seq + 1) + + steps = build_plan_tree( + args, + omitted_feature_content, + parent_plan_id=curr_plan_id, + depth=curr_depth, + data_pfx=data_name, + plan_pfx=plan_name + ) + return + """ + +def write_file(fname, title, string_list): + """Write text expressed as an array of lines to file.""" + with open(fname, 'w') as f: + for line in string_list: + f.write(line) + +def write_dict_to_json(dictionary, fname): + """Write dictionary to a json file.""" + with open(fname, 'w') as f: + json.dump(dictionary, f) + +#---------------------------------------------------------------------------------- +# various hard-coded lists, test cases - the synthetic feature-sets remain useful +#---------------------------------------------------------------------------------- + +""" +synthetic_cell_names = ['cell_' + '%04d' % (x) for x in range(1000)] +synthetic_drug_names = ['drug_' + '%04d' % (x) for x in range(1000)] +""" + +#---------------------------------------------------------------------------------- +# mainline +#---------------------------------------------------------------------------------- + +def main(): + # Acquire and validate arguments + args = planargs.parse_arguments() + args.json = True # the only available option thus far + + generator, feature_set_content = validate_args(args) + args.generator = generator + + root_name, args.plan_dict = generator.plan_init( + fs_names = args.fs_names, # validated cmdline arg + fs_paths = args.fs_paths, # validated cmdline arg + fs_lines = args.fs_lines, # created by validate_args + fs_parts = args.fs_parts, # validated cmdline arg + maxdepth = args.maxdepth + ) + + # feature_set_content = [cell_names, drug_names] + # feature_set_content = [synthetic_cell_names, synthetic_drug_names] + + # remove by-1 dimensions, they do not need to be represented in the plan explicitly + while True: + try: + ndx = args.fs_parts.index(1) + args.fs_names.pop(ndx) + args.fs_paths.pop(ndx) + args.fs_lines.pop(ndx) + args.fs_parts.pop(ndx) + except ValueError: + break + + # Plan generation + data_fname_pfx = os.path.join(args.out_dir, 'DATA.1') + plan_fname_pfx = os.path.join(args.out_dir, 'PLAN.1') + + steps = build_plan_tree( + args, # command line argument namespace + feature_set_content, # 
for example [[cell1 ... celln] [drug1 ... drugn]] + parent_plan_id=root_name, # name of root plan, subplan names created from this stem + data_pfx=data_fname_pfx, # DATA file prefix, building block for feature name files + plan_pfx=plan_fname_pfx # PLAN file prefix, building block for plan name files + ) + + generator.plan_term(args.plan_dict, root_name, steps) + print("Plan generation complete, total steps: %d" % steps) + + if args.json: + label = args.generator.get_plan_label(args.plan_dict, root_name) + qualified_name = 'plangen_' + label + '.json' + json_file_name = os.path.join(args.out_dir, qualified_name) + json_abspath = os.path.abspath(json_file_name) + write_dict_to_json(args.plan_dict, json_abspath) + print("%s JSON file written" % json_abspath) + + if args.print_tree: + print("Plan dictionary generated") + pp(args.plan_dict, width=160) # DEBUG comment this out for large plans + + if args.test: + test1(json_abspath, "test1_sql.db") + # test2(json_abspath, "test2_sql.db") + +#---------------------------------------------------------------------------------- +# test plan navigation and subplan entry retrieval +#---------------------------------------------------------------------------------- + +def test2(plan_path, db_path): + run_type = RunType.RESTART + #run_type = RunType.RUN_ALL + + plan_name = os.path.basename(plan_path) + plan_id = plan_prep(db_path, plan_name, run_type) + + plan_dict = load_plan(plan_path) + metadata, root_name = get_subplan(plan_dict) + + queue = deque() + queue.append(root_name) + + print("Test2 start") + for iloop in it.count(start = 0): + if len(queue) == 0: + print("Test2 complete - proc loop count: %d" % iloop) + break + + curr_subplan = queue.popleft() + successor_names = get_successors(plan_dict, curr_subplan) + for successor in successor_names: + queue.append(successor) + + if len(curr_subplan) == 1: + continue + + status = start_subplan( + db_path, + plan_path, + plan_id=plan_id, + subplan_id=curr_subplan, + run_type=run_type + ) + + if status < 0: + continue + + completion_status = dict(mse=1.1, mae=2.2, r_square=.555) + + stop_subplan( + db_path, + plan_id=plan_id, + subplan_id=curr_subplan, + comp_info_dict=completion_status + ) + print("Completing subplan %6d" % iloop) + +#---------------------------------------------------------------------------------- +# +def test1(plan_path, db_path): + run_type = RunType.RESTART + #run_type = RunType.RUN_ALL + + plan_name = os.path.basename(plan_path) + plan_id = plan_prep(db_path, plan_name, run_type) + + if (plan_id < 0): + sys.exit("Terminating due to database detected error") + + print("\nBegin plan navigation and subplan retrieval test") + plan_dict = load_plan(plan_path) + + # plan root name value returned when subplan_id= is omitted + metadata, root_name = get_subplan(plan_dict) + + # the root has no parent / predecessor + parent_name = get_predecessor(plan_dict, root_name) + print("Demonstrate that root \'%s\' predecessor is not defined: %s" % (root_name, parent_name)) + + # the root contains metadata, it is not a run specification + successor_names = get_successors(plan_dict, root_name) + print("\nThe first runable configurations are defined in %s\n" % successor_names) + + # the root is the predecessor of these first level runables + for sname in successor_names: + parent_name = get_predecessor(plan_dict, sname) + print("The parent of %s is %s" % (sname, parent_name)) + + # run the right subtree + print("\nRun the rightmost subtree \n") + for i in it.count(start = 1): + listlen = 
len(successor_names) + if listlen == 0: + break + + for name in successor_names: + status = start_subplan( + db_path, + plan_path, + plan_id=plan_id, + subplan_id=name, + run_type=run_type + ) + + if status < 0: + print("subplan: %s skipped, previously processed" % name) + + select_one = successor_names[listlen - 1] + parent_name = get_predecessor(plan_dict, select_one) + print("%-16s is a successor of %-16s - all successors: %s" % (select_one, parent_name, successor_names)) + +# ??????????????????????????????????????????????????????????? + value,_ = get_subplan(plan_dict, select_one) + + if i < 3: + for pf in [False, True]: + _, fs_name_list, train_list, val_list = get_subplan_features(plan_dict, select_one, parent_features=pf) + print("\nsubplan original:", select_one, "parent features:", pf) + pp(plan_dict[select_one]) + print("\nflattened TRAIN") + pp(train_list) + print("\nflattened VAL") + pp(val_list) + +# ??????????????????????????????????????????????????????????? + + # test retrieval api + row = get_subplan_runhist(db_path, plan_id=plan_id, subplan_id=select_one) + #print(row) + + # post subplan termination + completion_status = dict(mse=1.1, mae=2.2, r_square=.555, misc='no such column', data=123) + + stop_subplan( + db_path, + plan_id=plan_id, + subplan_id=select_one, + comp_info_dict=completion_status + ) + + successor_names = get_successors(plan_dict, select_one) + + print("\nEnd of branch reached") +# plan_remove(db_path, "plangen_cell8-p2_drug8-p2.json") + +#---------------------------------------------------------------------------------- + +if __name__ == "__main__": + main() diff --git a/Pilot1/Uno/topN_to_uno.py b/Pilot1/Uno/topN_to_uno.py index dd81d9f3..d9a2d6d0 100644 --- a/Pilot1/Uno/topN_to_uno.py +++ b/Pilot1/Uno/topN_to_uno.py @@ -13,6 +13,10 @@ def parse_arguments(): help='Plan data file') parser.add_argument('--node', type=str, default=None, help='node number to execute') + parser.add_argument('--incremental', action='store_true', + help='True for building dataset incrementally') + parser.add_argument('--fold', type=str, default=None, + help='pre-calculated indexes for cross fold validation') args, unparsed = parser.parse_known_args() return args, unparsed @@ -22,29 +26,64 @@ def read_plan(filename, node): print("reading {} file for node {}".format(filename, node)) with open(filename, 'r') as plan_file: plan = json.load(plan_file) + if node is None: + return plan + if node in plan: return plan[node] else: raise Exception('Node index "{}" was not found in plan file'.format(node)) +# def build_masks(args, df): +# if args.node is None: +# print('node is None. Generate Random split') +# mask = training_mask(df) +# return mask, ~mask +# +# plan = read_plan(args.plan, args.node) +# mask = {} +# for partition in ['train', 'val']: +# _mask = df['Sample'] is None +# for i, element in enumerate(plan[partition]): +# cl_filter = element['cell'] +# dr_filter = element['drug'] +# __mask = df['Sample'].isin(cl_filter) & df['Drug1'].isin(dr_filter) +# _mask = _mask | __mask +# mask[partition] = _mask +# +# return mask['train'], mask['val'] + + def build_masks(args, df): if args.node is None: print('node is None. 
Generate Random split') mask = training_mask(df) return mask, ~mask - plan = read_plan(args.plan, args.node) + print('from new build_mask: {} {} {}'.format(args.plan, args.node, args.incremental)) + import plangen + plan = read_plan(args.plan, None) + ids = {} mask = {} + _, _, ids['train'], ids['val'] = plangen.get_subplan_features(plan, args.node, args.incremental) + for partition in ['train', 'val']: _mask = df['Sample'] is None - for i, element in enumerate(plan[partition]): - cl_filter = element['cell'] - dr_filter = element['drug'] - __mask = df['Sample'].isin(cl_filter) & df['Drug1'].isin(dr_filter) + for i in range(len(ids[partition]['cell'])): + if 'cell' in ids[partition] and 'drug' in ids[partition]: + cl_filter = ids[partition]['cell'][i] + dr_filter = ids[partition]['drug'][i] + __mask = df['Sample'].isin(cl_filter) & df['Drug1'].isin(dr_filter) + elif 'cell' in ids[partition]: + cl_filter = ids[partition]['cell'][i] + __mask = df['Sample'].isin(cl_filter) + elif 'drug' in ids[partition]: + dr_filter = ids[partition]['drug'][i] + __mask = df['Drug1'].isin(dr_filter) + _mask = _mask | __mask mask[partition] = _mask - return mask['train'], mask['val'] @@ -90,19 +129,35 @@ def build_dataframe(args): else: df_y, df_cl, df_dd = read_dataframe_from_csv(args) - # mask = training_mask(df_y) - train_mask, val_mask = build_masks(args, df_y) + if args.fold is not None: + tr_id = pd.read_csv('{}_tr_id.csv'.format(args.fold)) + vl_id = pd.read_csv('{}_vl_id.csv'.format(args.fold)) + tr_idx = tr_id.iloc[:, 0].dropna().values.astype(int).tolist() + vl_idx = vl_id.iloc[:, 0].dropna().values.astype(int).tolist() + + y_train = df_y.iloc[tr_idx, :] + y_val = df_y.iloc[vl_idx, :] + + x_train_0 = df_cl.iloc[tr_idx, :] + x_train_1 = df_dd.iloc[tr_idx, :] + x_train_1.columns = [''] * len(x_train_1.columns) + + x_val_0 = df_cl.iloc[vl_idx, :] + x_val_1 = df_dd.iloc[vl_idx, :] + x_val_1.columns = [''] * len(x_val_1.columns) + else: + train_mask, val_mask = build_masks(args, df_y) - y_train = pd.DataFrame(data=df_y[train_mask].reset_index(drop=True)) - y_val = pd.DataFrame(data=df_y[val_mask].reset_index(drop=True)) + y_train = pd.DataFrame(data=df_y[train_mask].reset_index(drop=True)) + y_val = pd.DataFrame(data=df_y[val_mask].reset_index(drop=True)) - x_train_0 = df_cl[train_mask].reset_index(drop=True) - x_train_1 = df_dd[train_mask].reset_index(drop=True) - x_train_1.columns = [''] * len(x_train_1.columns) + x_train_0 = df_cl[train_mask].reset_index(drop=True) + x_train_1 = df_dd[train_mask].reset_index(drop=True) + x_train_1.columns = [''] * len(x_train_1.columns) - x_val_0 = df_cl[val_mask].reset_index(drop=True) - x_val_1 = df_dd[val_mask].reset_index(drop=True) - x_val_1.columns = [''] * len(x_val_1.columns) + x_val_0 = df_cl[val_mask].reset_index(drop=True) + x_val_1 = df_dd[val_mask].reset_index(drop=True) + x_val_1.columns = [''] * len(x_val_1.columns) # store store = pd.HDFStore('topN.uno.h5', 'w', complevel=9, complib='blosc:snappy') From 1cad699279894f12bda8ca352e4cc0e5efccfb95 Mon Sep 17 00:00:00 2001 From: Hyunseung Yoo Date: Sun, 25 Aug 2019 06:12:02 -0500 Subject: [PATCH 108/331] add support for feather format --- Pilot1/Uno/topN_to_uno.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/Pilot1/Uno/topN_to_uno.py b/Pilot1/Uno/topN_to_uno.py index d9a2d6d0..e0957b55 100644 --- a/Pilot1/Uno/topN_to_uno.py +++ b/Pilot1/Uno/topN_to_uno.py @@ -93,7 +93,22 @@ def training_mask(df): def read_dataframe_from_csv(args): df = pd.read_csv(args.dataframe_from, 
low_memory=False, na_values='na').fillna(0) - df.rename(columns={'SAMPLE': 'Sample', 'DRUG': 'Drug1'}, inplace=True) + df.rename(columns={'CELL': 'Sample', 'DRUG': 'Drug1'}, inplace=True) + df_y = df[['AUC', 'Sample', 'Drug1']] + + cols = df.columns.to_list() + cl_columns = list(filter(lambda x: x.startswith('GE_'), cols)) + dd_columns = list(filter(lambda x: x.startswith('DD_'), cols)) + + df_cl = df.loc[:, cl_columns] + df_dd = df.loc[:, dd_columns] + + return df_y, df_cl, df_dd + + +def read_dataframe_from_feather(args): + df = pd.read_feather(args.dataframe_from).fillna(0) + df.rename(columns={'CELL': 'Sample', 'DRUG': 'Drug1'}, inplace=True) df_y = df[['AUC', 'Sample', 'Drug1']] cols = df.columns.to_list() @@ -126,6 +141,8 @@ def build_dataframe(args): _, ext = os.path.splitext(args.dataframe_from) if ext == '.h5' or ext == '.hdf5': df_y, df_cl, df_dd = read_dataframe_from_hdf(args) + elif ext == '.feather': + df_y, df_cl, df_dd = read_dataframe_from_feather(args) else: df_y, df_cl, df_dd = read_dataframe_from_csv(args) From 88cd80c7d92116d285afb491b831bb2ca03aebde Mon Sep 17 00:00:00 2001 From: Hyunseung Yoo Date: Wed, 28 Aug 2019 14:13:59 -0500 Subject: [PATCH 109/331] add dataframe index. --- Pilot1/Uno/topN_to_uno.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/Pilot1/Uno/topN_to_uno.py b/Pilot1/Uno/topN_to_uno.py index e0957b55..a4bafcc4 100644 --- a/Pilot1/Uno/topN_to_uno.py +++ b/Pilot1/Uno/topN_to_uno.py @@ -69,7 +69,7 @@ def build_masks(args, df): _, _, ids['train'], ids['val'] = plangen.get_subplan_features(plan, args.node, args.incremental) for partition in ['train', 'val']: - _mask = df['Sample'] is None + _mask = df['Sample'] == None for i in range(len(ids[partition]['cell'])): if 'cell' in ids[partition] and 'drug' in ids[partition]: cl_filter = ids[partition]['cell'][i] @@ -81,7 +81,6 @@ def build_masks(args, df): elif 'drug' in ids[partition]: dr_filter = ids[partition]['drug'][i] __mask = df['Drug1'].isin(dr_filter) - _mask = _mask | __mask mask[partition] = _mask return mask['train'], mask['val'] @@ -152,15 +151,15 @@ def build_dataframe(args): tr_idx = tr_id.iloc[:, 0].dropna().values.astype(int).tolist() vl_idx = vl_id.iloc[:, 0].dropna().values.astype(int).tolist() - y_train = df_y.iloc[tr_idx, :] - y_val = df_y.iloc[vl_idx, :] + y_train = df_y.iloc[tr_idx, :].reset_index(drop=True) + y_val = df_y.iloc[vl_idx, :].reset_index(drop=True) - x_train_0 = df_cl.iloc[tr_idx, :] - x_train_1 = df_dd.iloc[tr_idx, :] + x_train_0 = df_cl.iloc[tr_idx, :].reset_index(drop=True) + x_train_1 = df_dd.iloc[tr_idx, :].reset_index(drop=True) x_train_1.columns = [''] * len(x_train_1.columns) - x_val_0 = df_cl.iloc[vl_idx, :] - x_val_1 = df_dd.iloc[vl_idx, :] + x_val_0 = df_cl.iloc[vl_idx, :].reset_index(drop=True) + x_val_1 = df_dd.iloc[vl_idx, :].reset_index(drop=True) x_val_1.columns = [''] * len(x_val_1.columns) else: train_mask, val_mask = build_masks(args, df_y) From f1afad301d3666f6ea44d0aefb06bf13ff63c6c8 Mon Sep 17 00:00:00 2001 From: brettin Date: Tue, 27 Aug 2019 07:41:10 -0400 Subject: [PATCH 110/331] using information from milestone12 HPO --- Pilot1/Uno/uno_auc_model.txt | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/Pilot1/Uno/uno_auc_model.txt b/Pilot1/Uno/uno_auc_model.txt index 4a803b43..7789f732 100644 --- a/Pilot1/Uno/uno_auc_model.txt +++ b/Pilot1/Uno/uno_auc_model.txt @@ -4,13 +4,13 @@ test_sources=['train'] cell_types=None cell_features=['rnaseq'] drug_features=['descriptors'] 
-dense=[1000, 1000, 1000]
+dense=[1000, 1000, 1000, 1000, 1000]
 dense_feature_layers=[1000, 1000, 1000]
 activation='relu'
 loss='mse'
-optimizer='sgd'
+optimizer='adamax'
 scaling='std'
-drop=0
+drop=.1
 epochs=50
 batch_size=32
 validation_split=0.2
@@ -18,22 +18,28 @@ cv=1
 max_val_loss=1.0
 learning_rate=0.0001
 base_lr=None
+agg_dose='AUC'
 residual=False
-reduce_lr=False
-warmup_lr=False
+reduce_lr=True
+warmup_lr=True
 batch_normalization=False
 feature_subsample=0
 rng_seed=2018
-save_path='save/uno'
 no_gen=False
 verbose=False
-no_response_source=True
-no_feature_source=True
-use_landmark_genes=True
-agg_dose='AUC'
+
+
 preprocess_rnaseq='source_scale'
+gpus=1
+use_landmark_genes=True
+no_feature_source=True
+no_response_source=True
+cp=True
+save_path='/ccs/home/brettin/project_work/brettin/milestone13/save/uno'
+
 single=True
+timeout=-1

 [Monitor_Params]
 solr_root=''
-timeout=-1
+

From bf0e73a7a63136762473e23cce58d98ca57ba436 Mon Sep 17 00:00:00 2001
From: Harry Yoo
Date: Wed, 28 Aug 2019 16:34:42 -0400
Subject: [PATCH 111/331] remove hard-coded path

---
 Pilot1/Uno/uno_auc_model.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Pilot1/Uno/uno_auc_model.txt b/Pilot1/Uno/uno_auc_model.txt
index 7789f732..2d9158a0 100644
--- a/Pilot1/Uno/uno_auc_model.txt
+++ b/Pilot1/Uno/uno_auc_model.txt
@@ -35,7 +35,7 @@ use_landmark_genes=True
 no_feature_source=True
 no_response_source=True
 cp=True
-save_path='/ccs/home/brettin/project_work/brettin/milestone13/save/uno'
+save_path='save/uno'

 single=True
 timeout=-1

From 00683b89301a9017abb300407991abdaf5b0cac2 Mon Sep 17 00:00:00 2001
From: Hyunseung Yoo
Date: Wed, 4 Sep 2019 21:29:21 -0500
Subject: [PATCH 112/331] close filepointer

---
 Pilot1/Uno/topN_to_uno.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Pilot1/Uno/topN_to_uno.py b/Pilot1/Uno/topN_to_uno.py
index a4bafcc4..dc85ec0a 100644
--- a/Pilot1/Uno/topN_to_uno.py
+++ b/Pilot1/Uno/topN_to_uno.py
@@ -183,6 +183,7 @@ def build_dataframe(args):
     store.put('x_train_1', x_train_1, format='table')
     store.put('x_val_0', x_val_0, format='table')
     store.put('x_val_1', x_val_1, format='table')
+    store.close()


 if __name__ == '__main__':

From b56c4b8252fa6a546e837c1ac03b3dbe973fe986 Mon Sep 17 00:00:00 2001
From: Hyunseung Yoo
Date: Fri, 6 Sep 2019 13:19:26 -0500
Subject: [PATCH 113/331] output file name can be changed by --output

---
 Pilot1/Uno/topN_to_uno.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/Pilot1/Uno/topN_to_uno.py b/Pilot1/Uno/topN_to_uno.py
index dc85ec0a..42ef4c12 100644
--- a/Pilot1/Uno/topN_to_uno.py
+++ b/Pilot1/Uno/topN_to_uno.py
@@ -17,6 +17,8 @@ def parse_arguments():
                         help='True for building dataset incrementally')
     parser.add_argument('--fold', type=str, default=None,
                         help='pre-calculated indexes for cross fold validation')
+    parser.add_argument('--output', type=str, default='topN.uno.h5',
+                        help='output filename')

     args, unparsed = parser.parse_known_args()
     return args, unparsed
@@ -176,7 +178,7 @@ def build_dataframe(args):
     x_val_1.columns = [''] * len(x_val_1.columns)

     # store
-    store = pd.HDFStore('topN.uno.h5', 'w', complevel=9, complib='blosc:snappy')
+    store = pd.HDFStore(args.output, 'w', complevel=9, complib='blosc:snappy')
     store.put('y_train', y_train, format='table')
     store.put('y_val', y_val, format='table')
     store.put('x_train_0', x_train_0, format='table')

From 7a57d3b8280d28bd45bde682439218e9723d25b9 Mon Sep 17 00:00:00 2001
From: Hyunseung Yoo
Date: Thu, 19 Sep 2019 11:16:53 -0500
Subject: [PATCH 114/331] add AUC training example

---
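Before the AUC training example, a quick way to sanity-check the store written by `topN_to_uno.py` (a minimal sketch, assuming the default `topN.uno.h5` output name and the `y_*`/`x_*` keys written above):

```python
import pandas as pd

# Spot-check the exported HDF5 store (assumes the default name 'topN.uno.h5'
# and the y_train/y_val/x_train_*/x_val_* keys written by topN_to_uno.py).
with pd.HDFStore('topN.uno.h5', mode='r') as store:
    print(store.keys())
    y_train = store.get('y_train')
    y_val = store.get('y_val')
    x_train_0 = store.get('x_train_0')  # cell (GE_*) features
    x_train_1 = store.get('x_train_1')  # drug (DD_*) features
    assert len(y_train) == len(x_train_0) == len(x_train_1)
    print('train rows:', len(y_train), 'val rows:', len(y_val))
    print('cell width:', x_train_0.shape[1], 'drug width:', x_train_1.shape[1])
```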
Pilot1/Uno/README.AUC.md | 137 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 137 insertions(+) create mode 100644 Pilot1/Uno/README.AUC.md diff --git a/Pilot1/Uno/README.AUC.md b/Pilot1/Uno/README.AUC.md new file mode 100644 index 00000000..ae4bce2a --- /dev/null +++ b/Pilot1/Uno/README.AUC.md @@ -0,0 +1,137 @@ +# Training with static datafile +Use static datafile prebuilt and shared at `/vol/ml/hsyoo/shared/top_21_auc_1fold.uno.h5` + +``` +python uno_baseline_keras2.py --config_file uno_auc_model.txt --cache cache/top6_auc --use_exported_data /vol/ml/hsyoo/shared/top_21_auc_1fold.uno.h5 +``` + +The log will look like below, + +``` +Using TensorFlow backend. +Importing candle utils for keras +Configuration file: /ssd1/homes/hsyoo/projects/CANDLE/Benchmarks/Pilot1/Uno/uno_auc_model.txt +{'activation': 'relu', + 'agg_dose': 'AUC', + 'base_lr': None, + 'batch_normalization': False, + 'batch_size': 32, + 'cell_features': ['rnaseq'], + 'cell_types': None, + 'cp': True, + 'cv': 1, + 'dense': [1000, 1000, 1000, 1000, 1000], + 'dense_feature_layers': [1000, 1000, 1000], + 'drop': 0.1, + 'drug_features': ['descriptors'], + 'epochs': 50, + 'feature_subsample': 0, + 'gpus': 1, + 'learning_rate': 0.0001, + 'loss': 'mse', + 'max_val_loss': 1.0, + 'no_feature_source': True, + 'no_gen': False, + 'no_response_source': True, + 'optimizer': 'adamax', + 'preprocess_rnaseq': 'source_scale', + 'reduce_lr': True, + 'residual': False, + 'rng_seed': 2018, + 'save_path': 'save/uno', + 'scaling': 'std', + 'single': True, + 'solr_root': '', + 'test_sources': ['train'], + 'timeout': -1, + 'train_sources': ['CCLE'], + 'use_landmark_genes': True, + 'validation_split': 0.2, + 'verbose': False, + 'warmup_lr': True} +Params: +{'activation': 'relu', + 'agg_dose': 'AUC', + 'base_lr': None, + 'batch_normalization': False, + 'batch_size': 32, + 'by_cell': None, + 'by_drug': None, + 'cache': 'cache/top6_auc', + 'cell_feature_subset_path': '', + 'cell_features': ['rnaseq'], + 'cell_subset_path': '', + 'cell_types': None, + 'config_file': 'uno_auc_model.txt', + 'cp': True, + 'cv': 1, + 'datatype': , + 'dense': [1000, 1000, 1000, 1000, 1000], + 'dense_feature_layers': [1000, 1000, 1000], + 'drop': 0.1, + 'drug_feature_subset_path': '', + 'drug_features': ['descriptors'], + 'drug_median_response_max': 1, + 'drug_median_response_min': -1, + 'drug_subset_path': '', + 'epochs': 50, + 'experiment_id': 'EXP000', + 'export_csv': None, + 'export_data': None, + 'feature_subsample': 0, + 'feature_subset_path': '', + 'gpus': [], + 'growth_bins': 0, + 'initial_weights': None, + 'learning_rate': 0.0001, + 'logfile': None, + 'loss': 'mse', + 'max_val_loss': 1.0, + 'no_feature_source': True, + 'no_gen': False, + 'no_response_source': True, + 'optimizer': 'adamax', + 'output_dir': '/ssd1/homes/hsyoo/projects/CANDLE/Benchmarks/Pilot1/Uno/Output/EXP000/RUN000', + 'partition_by': None, + 'preprocess_rnaseq': 'source_scale', + 'reduce_lr': True, + 'residual': False, + 'rng_seed': 2018, + 'run_id': 'RUN000', + 'save_path': 'save/uno', + 'save_weights': None, + 'scaling': 'std', + 'shuffle': False, + 'single': True, + 'solr_root': '', + 'tb': False, + 'tb_prefix': 'tb', + 'test_sources': ['train'], + 'timeout': -1, + 'train_bool': True, + 'train_sources': ['CCLE'], + 'use_exported_data': '/vol/ml/hsyoo/shared/top_21_auc_1fold.uno.h5', + 'use_filtered_genes': False, + 'use_landmark_genes': True, + 'validation_split': 0.2, + 'verbose': None, + 'warmup_lr': True} + + ... 
+Total params: 16,224,001 +Trainable params: 16,224,001 +Non-trainable params: 0 +... +Between random pairs in y_val: + mse: 0.0474 + mae: 0.1619 + r2: -1.0103 + corr: -0.0051 +Data points per epoch: train = 423952, val = 52994 +Steps per epoch: train = 13248, val = 1656 +Epoch 1/50 +13248/13248 [==============================] - 198s 15ms/step - loss: 0.0235 - mae: 0.1048 - r2: -0.1311 - val_loss: 0.0145 - val_mae: 0.0903 - val_r2: 0.3393 +Current time ....198.278 +Epoch 2/50 +... +``` From 9f895f766878a413eca90495840647672cf7f0cf Mon Sep 17 00:00:00 2001 From: Hyunseung Yoo Date: Sun, 8 Sep 2019 23:08:43 -0500 Subject: [PATCH 115/331] handle edge case when validation partition is smaller than batch size --- Pilot1/Uno/uno_data.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Pilot1/Uno/uno_data.py b/Pilot1/Uno/uno_data.py index 1406a8de..b25a3748 100644 --- a/Pilot1/Uno/uno_data.py +++ b/Pilot1/Uno/uno_data.py @@ -967,7 +967,11 @@ def __init__(self, partition='train', filename=None, batch_size=32, shuffle=Fals y = self.store.select('y_{}'.format(self.partition)) self.index = y.index self.size = len(self.index) - self.steps = self.size // self.batch_size + if self.size >= self.batch_size: + self.steps = self.size // self.batch_size + else: + self.steps = 1 + self.batch_size = self.size self.index_map = np.arange(self.steps) if self.shuffle: np.random.shuffle(self.index_map) From 38228261cc6abfd55773d940bca68b4b6017badd Mon Sep 17 00:00:00 2001 From: Hyunseung Yoo Date: Thu, 19 Sep 2019 11:48:51 -0500 Subject: [PATCH 116/331] update data file location --- Pilot1/Uno/README.AUC.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Pilot1/Uno/README.AUC.md b/Pilot1/Uno/README.AUC.md index ae4bce2a..b80fee7c 100644 --- a/Pilot1/Uno/README.AUC.md +++ b/Pilot1/Uno/README.AUC.md @@ -1,8 +1,8 @@ # Training with static datafile -Use static datafile prebuilt and shared at `/vol/ml/hsyoo/shared/top_21_auc_1fold.uno.h5` +Use static datafile prebuilt and shared at `http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/top_21_auc_1fold.uno.h5` ``` -python uno_baseline_keras2.py --config_file uno_auc_model.txt --cache cache/top6_auc --use_exported_data /vol/ml/hsyoo/shared/top_21_auc_1fold.uno.h5 +python uno_baseline_keras2.py --config_file uno_auc_model.txt --cache cache/top6_auc --use_exported_data top_21_auc_1fold.uno.h5 ``` The log will look like below, @@ -110,7 +110,7 @@ Params: 'timeout': -1, 'train_bool': True, 'train_sources': ['CCLE'], - 'use_exported_data': '/vol/ml/hsyoo/shared/top_21_auc_1fold.uno.h5', + 'use_exported_data': 'top_21_auc_1fold.uno.h5', 'use_filtered_genes': False, 'use_landmark_genes': True, 'validation_split': 0.2, From 55096050697629f3038b991c0b3aa552dec87865 Mon Sep 17 00:00:00 2001 From: Hyunseung Yoo Date: Thu, 14 Nov 2019 10:02:13 -0600 Subject: [PATCH 117/331] add early stopping --- Pilot1/Uno/uno.py | 4 ++++ Pilot1/Uno/uno_baseline_keras2.py | 3 +++ 2 files changed, 7 insertions(+) diff --git a/Pilot1/Uno/uno.py b/Pilot1/Uno/uno.py index d4731e50..d246d58e 100644 --- a/Pilot1/Uno/uno.py +++ b/Pilot1/Uno/uno.py @@ -140,6 +140,10 @@ def set_locals(self): 'type': float, 'default': None, 'help': 'base learning rate'}, + {'name': 'es', + 'type': candle.str2bool, + 'default': False, + 'help': 'early stopping on val_loss'}, {'name': 'cp', 'type': candle.str2bool, 'default': False, diff --git a/Pilot1/Uno/uno_baseline_keras2.py b/Pilot1/Uno/uno_baseline_keras2.py index be3f8f47..03e32864 100644 --- 
a/Pilot1/Uno/uno_baseline_keras2.py +++ b/Pilot1/Uno/uno_baseline_keras2.py @@ -427,6 +427,7 @@ def warmup_scheduler(epoch): candle_monitor = candle.CandleRemoteMonitor(params=params) timeout_monitor = candle.TerminateOnTimeOut(params['timeout']) + es_monitor = keras.callbacks.EarlyStopping(patience=10, verbose=1) reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=0.00001) warmup_lr = LearningRateScheduler(warmup_scheduler) @@ -435,6 +436,8 @@ def warmup_scheduler(epoch): history_logger = LoggingCallback(logger.debug) callbacks = [candle_monitor, timeout_monitor, history_logger] + if args.es: + callbacks.append(es_monitor) if args.reduce_lr: callbacks.append(reduce_lr) if args.warmup_lr: From 7ad0b417ecf150814818ee69f586bad32a213394 Mon Sep 17 00:00:00 2001 From: Hyunseung Yoo Date: Thu, 14 Nov 2019 11:09:33 -0600 Subject: [PATCH 118/331] auto-detect input size --- Pilot1/Uno/uno_data.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/Pilot1/Uno/uno_data.py b/Pilot1/Uno/uno_data.py index b25a3748..8cc6561d 100644 --- a/Pilot1/Uno/uno_data.py +++ b/Pilot1/Uno/uno_data.py @@ -958,12 +958,9 @@ def __init__(self, partition='train', filename=None, batch_size=32, shuffle=Fals self.single = single self.agg_dose = agg_dose self.target = agg_dose if agg_dose is not None else 'Growth' - # 4 inputs for single drug model (cell, dose1, descriptor, fingerprint) - # 7 inputs for drug pair model (cell, dose1, dose1, dr1.descriptor, dr1.fingerprint, dr2.descriptor, dr2.fingerprint) - self.input_size = 4 if self.single else 7 - self.input_size = 2 if agg_dose else self.input_size self.store = pd.HDFStore(filename, mode='r') + self.input_size = len(list(filter(lambda x: x.startswith('/x_train'), self.store.keys()))) y = self.store.select('y_{}'.format(self.partition)) self.index = y.index self.size = len(self.index) From d900dbb1838894deeb7a2a15f500af411e85cb46 Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Wed, 19 Jun 2019 09:38:09 -0500 Subject: [PATCH 119/331] Small fix to Exception --- Pilot1/Uno/topN_to_uno.py | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/Pilot1/Uno/topN_to_uno.py b/Pilot1/Uno/topN_to_uno.py index 42ef4c12..cfe58358 100644 --- a/Pilot1/Uno/topN_to_uno.py +++ b/Pilot1/Uno/topN_to_uno.py @@ -37,26 +37,6 @@ def read_plan(filename, node): raise Exception('Node index "{}" was not found in plan file'.format(node)) -# def build_masks(args, df): -# if args.node is None: -# print('node is None. Generate Random split') -# mask = training_mask(df) -# return mask, ~mask -# -# plan = read_plan(args.plan, args.node) -# mask = {} -# for partition in ['train', 'val']: -# _mask = df['Sample'] is None -# for i, element in enumerate(plan[partition]): -# cl_filter = element['cell'] -# dr_filter = element['drug'] -# __mask = df['Sample'].isin(cl_filter) & df['Drug1'].isin(dr_filter) -# _mask = _mask | __mask -# mask[partition] = _mask -# -# return mask['train'], mask['val'] - - def build_masks(args, df): if args.node is None: print('node is None. 
Generate Random split') From 240524904ea803c4884f8df2fd2bb6b3fe970a1f Mon Sep 17 00:00:00 2001 From: Hyunseung Yoo Date: Thu, 15 Aug 2019 09:33:09 -0500 Subject: [PATCH 120/331] set timeout unlimited --- Pilot1/Uno/uno_auc_model.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/Pilot1/Uno/uno_auc_model.txt b/Pilot1/Uno/uno_auc_model.txt index 2d9158a0..23b28522 100644 --- a/Pilot1/Uno/uno_auc_model.txt +++ b/Pilot1/Uno/uno_auc_model.txt @@ -42,4 +42,3 @@ timeout=-1 [Monitor_Params] solr_root='' - From 64b864660c5a82ee94151c3cc310146aab65e8c8 Mon Sep 17 00:00:00 2001 From: Hyunseung Yoo Date: Fri, 23 Aug 2019 00:43:13 -0500 Subject: [PATCH 121/331] use plangen api --- Pilot1/Uno/topN_to_uno.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/Pilot1/Uno/topN_to_uno.py b/Pilot1/Uno/topN_to_uno.py index cfe58358..42ef4c12 100644 --- a/Pilot1/Uno/topN_to_uno.py +++ b/Pilot1/Uno/topN_to_uno.py @@ -37,6 +37,26 @@ def read_plan(filename, node): raise Exception('Node index "{}" was not found in plan file'.format(node)) +# def build_masks(args, df): +# if args.node is None: +# print('node is None. Generate Random split') +# mask = training_mask(df) +# return mask, ~mask +# +# plan = read_plan(args.plan, args.node) +# mask = {} +# for partition in ['train', 'val']: +# _mask = df['Sample'] is None +# for i, element in enumerate(plan[partition]): +# cl_filter = element['cell'] +# dr_filter = element['drug'] +# __mask = df['Sample'].isin(cl_filter) & df['Drug1'].isin(dr_filter) +# _mask = _mask | __mask +# mask[partition] = _mask +# +# return mask['train'], mask['val'] + + def build_masks(args, df): if args.node is None: print('node is None. Generate Random split') From 938a770f7c63dfafe9ebc8b46d8722fc6fecfd2b Mon Sep 17 00:00:00 2001 From: Austin Clyde Date: Thu, 21 Nov 2019 09:04:54 -0600 Subject: [PATCH 122/331] added mordred --- Pilot1/Uno/uno.py | 2 +- Pilot1/Uno/uno_data.py | 38 +++++++++++++++++++++++++++++++++++++- 2 files changed, 38 insertions(+), 2 deletions(-) diff --git a/Pilot1/Uno/uno.py b/Pilot1/Uno/uno.py index d246d58e..05b0dc76 100644 --- a/Pilot1/Uno/uno.py +++ b/Pilot1/Uno/uno.py @@ -48,7 +48,7 @@ def set_locals(self): 'help': 'use rnaseq cell line feature set or none at all'}, {'name': 'drug_features', 'nargs': '+', - 'choices': ['descriptors', 'fingerprints', 'none'], + 'choices': ['descriptors', 'fingerprints', 'none', 'mordred'], 'help': 'use dragon7 descriptors or fingerprint descriptors for drug features or none at all'}, {'name': 'by_cell', 'type': str, diff --git a/Pilot1/Uno/uno_data.py b/Pilot1/Uno/uno_data.py index 8cc6561d..dec46ffe 100644 --- a/Pilot1/Uno/uno_data.py +++ b/Pilot1/Uno/uno_data.py @@ -281,6 +281,38 @@ def load_drug_data(ncols=None, scaling='std', imputing='mean', dropna=None, add_ return df_desc, df_fp +def load_mordred_descriptors(ncols=None, scaling='std', imputing='mean', dropna=None, add_prefix=True, feature_subset=None): + path = get_file(DATA_URL + 'extended_combined_mordred_descriptors') + + df = pd.read_csv(path, engine='c', sep='\t', na_values=['na', '-', '']) + df.iloc[:, 1:] = df.iloc[:, 1:].apply(pd.to_numeric, errors='coerce') + df.iloc[:, 1:] = df.iloc[:, 1:].astype(np.float32) + + df1 = pd.DataFrame(df.loc[:, 'DRUG']) + df1.rename(columns={'DRUG': 'Drug'}, inplace=True) + + df2 = df.drop('DRUG', 1) + if add_prefix: + df2 = df2.add_prefix('dragon7.') + + df2 = impute_and_scale(df2, scaling, imputing) + + df_desc = pd.concat([df1, df2], axis=1) + + df1 = pd.DataFrame(df_desc.loc[:, 'Drug']) + df2 = 
df_desc.drop('Drug', 1) + if add_prefix: + df2 = df2.add_prefix('dragon7.') + if feature_subset: + df2 = df2[[x for x in df2.columns if x in feature_subset]] + df2 = impute_and_scale(df2, scaling=scaling, imputing=imputing, dropna=dropna) + df_desc = pd.concat([df1, df2], axis=1) + + logger.info('Loaded Mordred drug descriptors: %s', df_desc.shape) + + return df_desc + + def load_drug_descriptors(ncols=None, scaling='std', imputing='mean', dropna=None, add_prefix=True, feature_subset=None): df_info = load_drug_info() df_info['Drug'] = df_info['PUBCHEM'] @@ -878,13 +910,17 @@ def load(self, cache=None, ncols=None, scaling='std', dropna=None, df_drug_desc = load_drug_descriptors(ncols=ncols, scaling=scaling, dropna=dropna, feature_subset=drug_feature_subset) elif fea == 'fingerprints': df_drug_fp = load_drug_fingerprints(ncols=ncols, scaling=scaling, dropna=dropna, feature_subset=drug_feature_subset) + elif fea == 'mordred' : + df_drug_mordred = load_mordred_descriptors(ncols=ncols, scaling=scaling, dropna=dropna, feature_subset=drug_feature_subset) + # df_drug_desc, df_drug_fp = load_drug_data(ncols=ncols, scaling=scaling, dropna=dropna) cell_df_dict = {'rnaseq': 'df_cell_rnaseq'} drug_df_dict = {'descriptors': 'df_drug_desc', - 'fingerprints': 'df_drug_fp'} + 'fingerprints': 'df_drug_fp', + 'mordred' : 'df_drug_mordred'} # df_cell_ids = df_cell_rnaseq[['Sample']].drop_duplicates() # df_drug_ids = pd.concat([df_drug_desc[['Drug']], df_drug_fp[['Drug']]]).drop_duplicates() From 6c193fe169be9da61eae330379f15a97e39f6707 Mon Sep 17 00:00:00 2001 From: Austin Clyde Date: Thu, 21 Nov 2019 13:10:35 -0600 Subject: [PATCH 123/331] changed prefix for mordred --- Pilot1/Uno/uno_data.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Pilot1/Uno/uno_data.py b/Pilot1/Uno/uno_data.py index dec46ffe..a5fbeeb9 100644 --- a/Pilot1/Uno/uno_data.py +++ b/Pilot1/Uno/uno_data.py @@ -293,7 +293,7 @@ def load_mordred_descriptors(ncols=None, scaling='std', imputing='mean', dropna= df2 = df.drop('DRUG', 1) if add_prefix: - df2 = df2.add_prefix('dragon7.') + df2 = df2.add_prefix('mordred.') df2 = impute_and_scale(df2, scaling, imputing) @@ -302,7 +302,7 @@ def load_mordred_descriptors(ncols=None, scaling='std', imputing='mean', dropna= df1 = pd.DataFrame(df_desc.loc[:, 'Drug']) df2 = df_desc.drop('Drug', 1) if add_prefix: - df2 = df2.add_prefix('dragon7.') + df2 = df2.add_prefix('mordred.') if feature_subset: df2 = df2[[x for x in df2.columns if x in feature_subset]] df2 = impute_and_scale(df2, scaling=scaling, imputing=imputing, dropna=dropna) From f2c2c819e3a5484690cb38543cd64e7d4653819c Mon Sep 17 00:00:00 2001 From: Jamal Date: Mon, 25 Nov 2019 13:46:51 -0700 Subject: [PATCH 124/331] Fixed misordered imports, hardcoded path in for output. 
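For context, the import fix below follows the repository's usual pattern: `candle` lives under `common/`, so the path append has to run before the import. A minimal sketch of that pattern (names follow the P3B5 layout shown in the hunk below):

```python
import os
import sys

# common/ (where the candle package lives) must be on sys.path
# before `import candle` can succeed.
file_path = os.path.dirname(os.path.realpath(__file__))
lib_path = os.path.abspath(os.path.join(file_path, '..', '..', 'common'))
sys.path.append(lib_path)

import candle  # noqa: E402  -- intentionally imported after the path setup
```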
--- Pilot3/P3B5/p3b5.py | 3 +-- Pilot3/P3B5/p3b5_baseline_pytorch.py | 8 ++++---- Pilot3/P3B5/p3b5_default_model.txt | 2 +- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/Pilot3/P3B5/p3b5.py b/Pilot3/P3B5/p3b5.py index aba56cf9..4740638f 100644 --- a/Pilot3/P3B5/p3b5.py +++ b/Pilot3/P3B5/p3b5.py @@ -1,12 +1,11 @@ import os import sys -import candle - file_path = os.path.dirname(os.path.realpath(__file__)) lib_path2 = os.path.abspath(os.path.join(file_path, '..', '..', 'common')) sys.path.append(lib_path2) +import candle REQUIRED = [ 'learning_rate', diff --git a/Pilot3/P3B5/p3b5_baseline_pytorch.py b/Pilot3/P3B5/p3b5_baseline_pytorch.py index 77415250..134e769a 100644 --- a/Pilot3/P3B5/p3b5_baseline_pytorch.py +++ b/Pilot3/P3B5/p3b5_baseline_pytorch.py @@ -1,12 +1,12 @@ -import candle -import p3b5 as bmk - import torch import torch.nn as nn from torch import optim import torch.nn.functional as F from torch.utils.data import DataLoader +import p3b5 as bmk +import candle + from darts.api.config import banner from darts.data.p3b3 import P3B3 from darts.modules.network import Network @@ -31,7 +31,7 @@ def initialize_parameters(): ) # Initialize parameters - gParameters = candle.initialize_parameters(p3b5_bench) + gParameters = candle.finalize_parameters(p3b5_bench) #bmk.logger.info('Params: {}'.format(gParameters)) return gParameters diff --git a/Pilot3/P3B5/p3b5_default_model.txt b/Pilot3/P3B5/p3b5_default_model.txt index 6ad42c5c..786092bb 100644 --- a/Pilot3/P3B5/p3b5_default_model.txt +++ b/Pilot3/P3B5/p3b5_default_model.txt @@ -2,7 +2,7 @@ model_name = 'p3b5' unrolled = True data_url = 'ftp://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot3/' -savepath = '/gpfs/alpine/proj-shared/med107/yngtodd/src/checkout/Benchmarks/Pilot3/P3B5' +savepath = '.' 
log_interval = 10 train_data = 'P3B3_data.tar.gz' learning_rate = 0.01 From 20a8fd88360cefb86078721e3483c566de57e273 Mon Sep 17 00:00:00 2001 From: Andrew Weisman Date: Tue, 10 Dec 2019 20:04:04 -0500 Subject: [PATCH 125/331] Undid old changes to Pilot1/Uno/uno_baseline_keras2.py --- Pilot1/Uno/uno_baseline_keras2.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/Pilot1/Uno/uno_baseline_keras2.py b/Pilot1/Uno/uno_baseline_keras2.py index 165a818d..03e32864 100644 --- a/Pilot1/Uno/uno_baseline_keras2.py +++ b/Pilot1/Uno/uno_baseline_keras2.py @@ -268,10 +268,6 @@ def build_model(loader, args, permanent_dropout=True, silent=False): def initialize_parameters(default_model = 'uno_default_model.txt'): # Build benchmark object -# mymodel_common = candle.Benchmark(file_path,os.getenv("DEFAULT_PARAMS_FILE"),'keras',prog='myprog',desc='My model') -# unoBmk = benchmark.BenchmarkUno(benchmark.file_path, 'uno_default_model.txt', 'keras', -# #unoBmk = benchmark.BenchmarkUno(benchmark.file_path, os.getenv("DEFAULT_PARAMS_FILE"), 'keras', -# #prog='uno_baseline', desc='Build neural network based models to predict tumor response to single and paired drugs.') unoBmk = benchmark.BenchmarkUno(benchmark.file_path, default_model, 'keras', prog='uno_baseline', desc='Build neural network based models to predict tumor response to single and paired drugs.') From ccaa03c8071cb1c8928567921531116196f8bead Mon Sep 17 00:00:00 2001 From: Andrew Weisman Date: Wed, 11 Dec 2019 00:21:57 -0500 Subject: [PATCH 126/331] Looks like I broke benchmarks by removing cache_subdir, but implementing a fix that shouldn't great them and also allow for setting datadir to be an absolute path --- common/file_utils.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/common/file_utils.py b/common/file_utils.py index b506f03c..526a4373 100644 --- a/common/file_utils.py +++ b/common/file_utils.py @@ -39,8 +39,9 @@ def chunk_read(response, chunk_size=8192, reporthook=None): def get_file(fname, origin, untar=False, - md5_hash=None, datadir='../Data/common'): + #md5_hash=None, datadir='../Data/common'): #md5_hash=None, cache_subdir='common', datadir='../Data/common'): + md5_hash=None, cache_subdir='common', datadir=None): # datadir argument was never actually used so changing it to None """ Downloads a file from a URL if it not already in the cache. Passing the MD5 hash will verify the file after download as well as if it is already present in the cache. @@ -57,15 +58,19 @@ def get_file(fname, origin, untar=False, MD5 hash of the file for verification cache_subdir : string directory being used as the cache + datadir : string + if set, datadir becomes its setting (which could be e.g. 
an absolute path) and cache_subdir no longer matters Returns ---------- Path to the downloaded file """ - #file_path = os.path.dirname(os.path.realpath(__file__)) - #datadir_base = os.path.expanduser(os.path.join(file_path, '..', 'Data')) - #datadir = os.path.join(datadir_base, cache_subdir) + if datadir is None: + file_path = os.path.dirname(os.path.realpath(__file__)) + datadir_base = os.path.expanduser(os.path.join(file_path, '..', 'Data')) + datadir = os.path.join(datadir_base, cache_subdir) + if not os.path.exists(datadir): os.makedirs(datadir) From 13bc138e2f5b35452f1db8bfe121516598177fea Mon Sep 17 00:00:00 2001 From: Harry Yoo Date: Thu, 2 Jan 2020 12:11:07 -0800 Subject: [PATCH 127/331] infer on single & dose aggregated models --- Pilot1/Uno/uno_infer.py | 40 +++++++++++++++++++++++++++------------- 1 file changed, 27 insertions(+), 13 deletions(-) diff --git a/Pilot1/Uno/uno_infer.py b/Pilot1/Uno/uno_infer.py index 2bf43d1e..f4dcd62d 100644 --- a/Pilot1/Uno/uno_infer.py +++ b/Pilot1/Uno/uno_infer.py @@ -20,13 +20,18 @@ def log_evaluation(metric_outputs, description='Comparing y_true and y_pred:'): def get_parser(): parser = argparse.ArgumentParser(description='Uno infer script') parser.add_argument("--data", + required=True, help="data file to infer on. expect exported file from uno_baseline_keras2.py") - parser.add_argument("--model_file", help="json model description file") + parser.add_argument("--model_file", required=True, help="json model description file") parser.add_argument("--weights_file", help="model weights file") parser.add_argument("--partition", default='all', choices=['train', 'val', 'all'], help="partition of test dataset") parser.add_argument("-n", "--n_pred", type=int, default=1, help="the number of predictions to make") + parser.add_argument("--single", default=False, help="do not use drug pair representation") + parser.add_argument("--agg_dose", default=None, + choices=['AUC', 'IC50', 'HS', 'AAC1', 'AUC1', 'DSS1'], + help="use dose-independent response data with the specified aggregation metric") return parser @@ -49,18 +54,21 @@ def main(): cv_y_list = [] df_pred_list = [] cv_stats = {'mae': [], 'mse': [], 'r2': [], 'corr': []} + target = args.agg_dose or 'Growth' + for cv in range(args.n_pred): cv_pred = [] dataset = ['train', 'val'] if args.partition == 'all' else [args.partition] for partition in dataset: - test_gen = DataFeeder(filename=args.data, partition=partition, batch_size=1024) - y_test_pred = model.predict_generator(test_gen, test_gen.steps) + test_gen = DataFeeder(filename=args.data, partition=partition, batch_size=1024, single=args.single, agg_dose=args.agg_dose) + y_test_pred = model.predict_generator(test_gen, test_gen.steps + 1) + y_test_pred = y_test_pred[:test_gen.size] y_test_pred = y_test_pred.flatten() df_y = test_gen.get_response(copy=True) - y_test = df_y['Growth'].values + y_test = df_y[target].values - df_pred = df_y.assign(PredictedGrowth=y_test_pred, GrowthError=y_test_pred - y_test) + df_pred = df_y.assign(**{f'Predicted{target}': y_test_pred, f'{target}Error': y_test_pred - y_test}) df_pred_list.append(df_pred) test_gen.close() @@ -70,7 +78,7 @@ def main(): cv_pred_list.append(np.concatenate(cv_pred)) # calcuate stats for mse, mae, r2, corr - scores = evaluate_prediction(df_pred['Growth'], df_pred['PredictedGrowth']) + scores = evaluate_prediction(df_pred[target], df_pred[f'Predicted{target}']) # log_evaluation(scores, description=cv) [cv_stats[key].append(scores[key]) for key in scores.keys()] @@ -78,21 +86,27 @@ def main(): 
cv_y = pd.concat(cv_y_list) # save to tsv - df_pred.sort_values(['Sample', 'Drug1', 'Drug2', 'Dose1', 'Dose2', 'Growth'], inplace=True) + headers = ['Sample', 'Drug1'] + if not args.single: headers.append('Drug2') + if not args.agg_dose: headers.append('Dose1') + if not args.single and not args.agg_dose: headers.append('Dose2') + headers.append(target) + + df_pred.sort_values(headers, inplace=True) df_pred.to_csv('uno_pred.all.tsv', sep='\t', index=False, float_format='%.6g') df_sum = cv_y.assign() - df_sum['PredGrowthMean'] = np.mean(cv_pred_list, axis=0) - df_sum['PredGrowthStd'] = np.std(cv_pred_list, axis=0) - df_sum['PredGrowthMin'] = np.min(cv_pred_list, axis=0) - df_sum['PredGrowthMax'] = np.max(cv_pred_list, axis=0) + df_sum[f'Pred{target}Mean'] = np.mean(cv_pred_list, axis=0) + df_sum[f'Pred{target}Std'] = np.std(cv_pred_list, axis=0) + df_sum[f'Pred{target}Min'] = np.min(cv_pred_list, axis=0) + df_sum[f'Pred{target}Max'] = np.max(cv_pred_list, axis=0) df_sum.to_csv('uno_pred.tsv', index=False, sep='\t', float_format='%.6g') - # scores = evaluate_prediction(df_sum['Growth'], df_sum['PredGrowthMean']) - scores = evaluate_prediction(df_pred['Growth'], df_pred['PredictedGrowth']) + scores = evaluate_prediction(df_pred[f'{target}'], df_pred[f'Predicted{target}']) log_evaluation(scores, description='Testing on data from {} on partition {} ({} rows)'.format(args.data, args.partition, len(cv_y))) + print(' mean std min max') for key in ['mse', 'mae', 'r2', 'corr']: print('{}: {:.4f}, {:.4f}, {:.4f}, {:.4f}'.format(key, np.around(np.mean(cv_stats[key], axis=0), decimals=4), From 34b2494ad46dc2945ee5fef659ccd1cf97bd6db2 Mon Sep 17 00:00:00 2001 From: Harry Yoo Date: Thu, 2 Jan 2020 13:36:56 -0800 Subject: [PATCH 128/331] remove obsolete utility script --- Pilot1/Uno/loocv_data_util.py | 91 ----------------------------------- 1 file changed, 91 deletions(-) delete mode 100644 Pilot1/Uno/loocv_data_util.py diff --git a/Pilot1/Uno/loocv_data_util.py b/Pilot1/Uno/loocv_data_util.py deleted file mode 100644 index 412dba5b..00000000 --- a/Pilot1/Uno/loocv_data_util.py +++ /dev/null @@ -1,91 +0,0 @@ -import argparse -import json -import pandas as pd -import numpy as np - - -def parse_arguments(): - parser = argparse.ArgumentParser() - parser.add_argument('--dataframe_from', type=str, default='GDSC.h5', - help='Dataframe file name contains all data points') - parser.add_argument('--plan', type=str, default='plan.json', - help='Plan data file') - parser.add_argument('--node', type=str, default=None, - help='node number to execute') - - args, unparsed = parser.parse_known_args() - return args, unparsed - - -def read_plan(filename, node): - print("reading {} file for node {}".format(filename, node)) - with open(filename, 'r') as plan_file: - plan = json.load(plan_file) - if node in plan: - return plan[node] - else: - raise Exception('Node index "{}" was not found in plan file'.format(node)) - - -def build_masks(args, df): - if args.node is None: - raise Exception('Node id is not given') - - plan = read_plan(args.plan, args.node) - mask = {} - for partition in ['train', 'val']: - _mask = df['Sample'] is None - for i, element in enumerate(plan[partition]): - cl_filter = element['cell'] - dr_filter = element['drug'] - __mask = df['Sample'].isin(cl_filter) & df['Drug1'].isin(dr_filter) - _mask = _mask | __mask - mask[partition] = _mask - - return mask['train'], mask['val'] - - -def training_mask(df): - return np.random.rand(len(df)) < 0.8 - - -def build_dataframe(args): - store = 
pd.HDFStore(args.dataframe_from, 'r') - df_y = store.get('y_train') - df_ds = store.get('x_train_0') - df_cl = store.get('x_train_1') - df_dd = store.get('x_train_2') - df_fp = store.get('x_train_3') - - train_mask, val_mask = build_masks(args, df_y) - - y_train = pd.DataFrame(data=df_y[train_mask].reset_index(drop=True)) - y_val = pd.DataFrame(data=df_y[val_mask].reset_index(drop=True)) - - x_train_0 = df_ds[train_mask].reset_index(drop=True) - x_train_1 = df_cl[train_mask].reset_index(drop=True) - x_train_2 = df_dd[train_mask].reset_index(drop=True) - x_train_3 = df_fp[train_mask].reset_index(drop=True) - - x_val_0 = df_ds[val_mask].reset_index(drop=True) - x_val_1 = df_cl[val_mask].reset_index(drop=True) - x_val_2 = df_dd[val_mask].reset_index(drop=True) - x_val_3 = df_fp[val_mask].reset_index(drop=True) - - # store - store = pd.HDFStore('topN.uno.h5', 'w') - store.put('y_train', y_train, format='t') - store.put('y_val', y_val, format='t') - store.put('x_train_0', x_train_0, format='t') - store.put('x_train_1', x_train_1, format='t') - store.put('x_train_2', x_train_2, format='t') - store.put('x_train_3', x_train_3, format='t') - store.put('x_val_0', x_val_0, format='t') - store.put('x_val_1', x_val_1, format='t') - store.put('x_val_2', x_val_2, format='t') - store.put('x_val_3', x_val_3, format='t') - - -if __name__ == '__main__': - parsed, unparsed = parse_arguments() - build_dataframe(parsed) From 775d6aa20e78f797f1b3dc92705dfdf4988f6e37 Mon Sep 17 00:00:00 2001 From: Harry Yoo Date: Thu, 2 Jan 2020 14:37:16 -0800 Subject: [PATCH 129/331] update README --- Pilot1/Uno/README.AUC.md | 110 ++++++++++++++++++++++----------------- 1 file changed, 62 insertions(+), 48 deletions(-) diff --git a/Pilot1/Uno/README.AUC.md b/Pilot1/Uno/README.AUC.md index b80fee7c..48ce0b54 100644 --- a/Pilot1/Uno/README.AUC.md +++ b/Pilot1/Uno/README.AUC.md @@ -1,54 +1,25 @@ -# Training with static datafile -Use static datafile prebuilt and shared at `http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/top_21_auc_1fold.uno.h5` +# Predicting AUC values for Top21 cancer types + +## Data prep +A static dataset is prebuilt and available at `http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/top_21_auc_1fold.uno.h5`. Along with the datset file, you will also need a cache file (a byproduct of data-building process) to skip the data-building process. ``` -python uno_baseline_keras2.py --config_file uno_auc_model.txt --cache cache/top6_auc --use_exported_data top_21_auc_1fold.uno.h5 +$ wget http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/top_21_auc_1fold.uno.h5 +$ mkdir -p cache +$ cd cache +$ wget http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/top6.cache.tgz +$ tar xvzf top6.cache.tgz +$ cd - ``` -The log will look like below, +## Training ``` -Using TensorFlow backend. 
-Importing candle utils for keras -Configuration file: /ssd1/homes/hsyoo/projects/CANDLE/Benchmarks/Pilot1/Uno/uno_auc_model.txt -{'activation': 'relu', - 'agg_dose': 'AUC', - 'base_lr': None, - 'batch_normalization': False, - 'batch_size': 32, - 'cell_features': ['rnaseq'], - 'cell_types': None, - 'cp': True, - 'cv': 1, - 'dense': [1000, 1000, 1000, 1000, 1000], - 'dense_feature_layers': [1000, 1000, 1000], - 'drop': 0.1, - 'drug_features': ['descriptors'], - 'epochs': 50, - 'feature_subsample': 0, - 'gpus': 1, - 'learning_rate': 0.0001, - 'loss': 'mse', - 'max_val_loss': 1.0, - 'no_feature_source': True, - 'no_gen': False, - 'no_response_source': True, - 'optimizer': 'adamax', - 'preprocess_rnaseq': 'source_scale', - 'reduce_lr': True, - 'residual': False, - 'rng_seed': 2018, - 'save_path': 'save/uno', - 'scaling': 'std', - 'single': True, - 'solr_root': '', - 'test_sources': ['train'], - 'timeout': -1, - 'train_sources': ['CCLE'], - 'use_landmark_genes': True, - 'validation_split': 0.2, - 'verbose': False, - 'warmup_lr': True} +python uno_baseline_keras2.py --config_file uno_auc_model.txt \ + --cache cache/top6_auc \ + --use_exported_data top_21_auc_1fold.uno.h5 + +... Params: {'activation': 'relu', 'agg_dose': 'AUC', @@ -130,8 +101,51 @@ Between random pairs in y_val: Data points per epoch: train = 423952, val = 52994 Steps per epoch: train = 13248, val = 1656 Epoch 1/50 -13248/13248 [==============================] - 198s 15ms/step - loss: 0.0235 - mae: 0.1048 - r2: -0.1311 - val_loss: 0.0145 - val_mae: 0.0903 - val_r2: 0.3393 -Current time ....198.278 -Epoch 2/50 +13248/13248 [==============================] - 102s 8ms/step - loss: 0.0268 - mae: 0.0794 - r2: -0.2754 - val_loss: 0.0092 - val_mae: 0.0725 - val_r2: 0.5657 +Current time ....101.892 +... +13248/13248 [==============================] - 102s 8ms/step - loss: 0.004572, lr: 0.000010, mae: 0.046159, r2: 0.782253, val_loss: 0.005335, val_mae: 0.049082, val_r2: 0.748585 +Comparing y_true and y_pred: + mse: 0.0053 + mae: 0.0490 + r2: 0.7742 + corr: 0.8800 +``` + + +## Inference +The script `uno_infer.py` takes a couple of parameters for inferences. You are required to specify a datafile (the same dataset for training, `top_21_auc_1fold.uno.h5` in this case), model file, and trained weights. You can choose a partition as a inference input (training, validation, or all) and number of predictions for each data points (-n). +``` +$ python uno_infer.py --data top_21_auc_1fold.uno.h5 \ + --model_file top21_ref/model.json \ + --weights_file top21_ref/weights.h5 \ + --partition val \ + -n 30 \ + --single True \ + --agg_dose AUC ... + mse: 0.0058 + mae: 0.0505 + r2: 0.7543 + corr: 0.8688 + mean std min max +mse: 0.0058, 0.0000, 0.0058, 0.0058 +mae: 0.0505, 0.0001, 0.0504, 0.0506 +r2: 0.7543, 0.0007, 0.7527, 0.7557 +corr: 0.8688, 0.0004, 0.8679, 0.8696 +``` + +After the inference script completes, you should be able to find `uno_pred.all.tsv` and `uno_pred.tsv` files, which contains all predicted value and error, and aggregated statistics for each data point respectively. 
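The per-row aggregates reported in `uno_pred.tsv` can be reproduced from `uno_pred.all.tsv` with a short pandas sketch (column names as in the listings below; tab-separated output from `uno_infer.py`):

```python
import pandas as pd

# Rebuild the per-(Sample, Drug1) summary that uno_pred.tsv reports,
# starting from the row-level predictions in uno_pred.all.tsv.
pred = pd.read_csv('uno_pred.all.tsv', sep='\t')
summary = (pred.groupby(['AUC', 'Sample', 'Drug1'])['PredictedAUC']
               .agg(PredAUCMean='mean', PredAUCStd='std',
                    PredAUCMin='min', PredAUCMax='max')
               .reset_index())
print(summary.head())
```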
See below for example, +``` +$ head -n 4 uno_pred.all.tsv +AUC Sample Drug1 PredictedAUC AUCError +0.7153 CCLE.22RV1 CCLE.1 0.726853 0.011553 +0.7153 CCLE.22RV1 CCLE.1 0.745033 0.0297334 +0.7153 CCLE.22RV1 CCLE.1 0.752899 0.0375985 + +$ head -n 4 uno_pred.tsv +AUC Sample Drug1 PredAUCMean PredAUCStd PredAUCMin PredAUCMax +0.918 CTRP.HCC-1438 CTRP.302 0.954987 0.0109111 0.938283 0.983576 +0.6474 NCI60.IGR-OV1 NSC.757440 0.680934 0.0279046 0.644829 0.755912 +0.5675 NCI60.CCRF-CEM NSC.381866 0.591151 0.0228838 0.553855 0.645553 ``` From 17d9e410e0a08824024e82286359a84caa0eed92 Mon Sep 17 00:00:00 2001 From: Harry Yoo Date: Thu, 9 Jan 2020 16:33:36 -0600 Subject: [PATCH 130/331] add support for parquet format. remove commented lines --- Pilot1/Uno/topN_to_uno.py | 37 +++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/Pilot1/Uno/topN_to_uno.py b/Pilot1/Uno/topN_to_uno.py index 42ef4c12..4c81c539 100644 --- a/Pilot1/Uno/topN_to_uno.py +++ b/Pilot1/Uno/topN_to_uno.py @@ -37,26 +37,6 @@ def read_plan(filename, node): raise Exception('Node index "{}" was not found in plan file'.format(node)) -# def build_masks(args, df): -# if args.node is None: -# print('node is None. Generate Random split') -# mask = training_mask(df) -# return mask, ~mask -# -# plan = read_plan(args.plan, args.node) -# mask = {} -# for partition in ['train', 'val']: -# _mask = df['Sample'] is None -# for i, element in enumerate(plan[partition]): -# cl_filter = element['cell'] -# dr_filter = element['drug'] -# __mask = df['Sample'].isin(cl_filter) & df['Drug1'].isin(dr_filter) -# _mask = _mask | __mask -# mask[partition] = _mask -# -# return mask['train'], mask['val'] - - def build_masks(args, df): if args.node is None: print('node is None. Generate Random split') @@ -122,6 +102,21 @@ def read_dataframe_from_feather(args): return df_y, df_cl, df_dd +def read_dataframe_from_parquet(args): + df = pd.read_parquet(args.dataframe_from).fillna(0) + df.rename(columns={'CELL': 'Sample', 'DRUG': 'Drug1'}, inplace=True) + df_y = df[['AUC', 'Sample', 'Drug1']] + + cols = df.columns.to_list() + cl_columns = list(filter(lambda x: x.startswith('GE_'), cols)) + dd_columns = list(filter(lambda x: x.startswith('DD_'), cols)) + + df_cl = df.loc[:, cl_columns] + df_dd = df.loc[:, dd_columns] + + return df_y, df_cl, df_dd + + def read_dataframe_from_hdf(args): store = pd.HDFStore(args.dataframe_from, 'r') df = store.get('df') @@ -144,6 +139,8 @@ def build_dataframe(args): df_y, df_cl, df_dd = read_dataframe_from_hdf(args) elif ext == '.feather': df_y, df_cl, df_dd = read_dataframe_from_feather(args) + elif ext == '.parquet': + df_y, df_cl, df_dd = read_dataframe_from_parquet(args) else: df_y, df_cl, df_dd = read_dataframe_from_csv(args) From ce5ef01f226a87c9926f1f6d64b87f7bbf41f380 Mon Sep 17 00:00:00 2001 From: Harry Yoo Date: Fri, 10 Jan 2020 10:44:22 -0600 Subject: [PATCH 131/331] refactoring functions --- Pilot1/Uno/topN_to_uno.py | 73 ++++++++------------------------------- 1 file changed, 15 insertions(+), 58 deletions(-) diff --git a/Pilot1/Uno/topN_to_uno.py b/Pilot1/Uno/topN_to_uno.py index 4c81c539..7e4d1d8d 100644 --- a/Pilot1/Uno/topN_to_uno.py +++ b/Pilot1/Uno/topN_to_uno.py @@ -40,7 +40,7 @@ def read_plan(filename, node): def build_masks(args, df): if args.node is None: print('node is None. 
Generate Random split') - mask = training_mask(df) + mask = get_random_mask(df) return mask, ~mask print('from new build_mask: {} {} {}'.format(args.plan, args.node, args.incremental)) @@ -68,58 +68,23 @@ def build_masks(args, df): return mask['train'], mask['val'] -def training_mask(df): +def get_random_mask(df): return np.random.rand(len(df)) < 0.8 -def read_dataframe_from_csv(args): - df = pd.read_csv(args.dataframe_from, low_memory=False, na_values='na').fillna(0) - df.rename(columns={'CELL': 'Sample', 'DRUG': 'Drug1'}, inplace=True) - df_y = df[['AUC', 'Sample', 'Drug1']] - - cols = df.columns.to_list() - cl_columns = list(filter(lambda x: x.startswith('GE_'), cols)) - dd_columns = list(filter(lambda x: x.startswith('DD_'), cols)) - - df_cl = df.loc[:, cl_columns] - df_dd = df.loc[:, dd_columns] - - return df_y, df_cl, df_dd - - -def read_dataframe_from_feather(args): - df = pd.read_feather(args.dataframe_from).fillna(0) - df.rename(columns={'CELL': 'Sample', 'DRUG': 'Drug1'}, inplace=True) - df_y = df[['AUC', 'Sample', 'Drug1']] - - cols = df.columns.to_list() - cl_columns = list(filter(lambda x: x.startswith('GE_'), cols)) - dd_columns = list(filter(lambda x: x.startswith('DD_'), cols)) - - df_cl = df.loc[:, cl_columns] - df_dd = df.loc[:, dd_columns] - - return df_y, df_cl, df_dd - - -def read_dataframe_from_parquet(args): - df = pd.read_parquet(args.dataframe_from).fillna(0) - df.rename(columns={'CELL': 'Sample', 'DRUG': 'Drug1'}, inplace=True) - df_y = df[['AUC', 'Sample', 'Drug1']] - - cols = df.columns.to_list() - cl_columns = list(filter(lambda x: x.startswith('GE_'), cols)) - dd_columns = list(filter(lambda x: x.startswith('DD_'), cols)) - - df_cl = df.loc[:, cl_columns] - df_dd = df.loc[:, dd_columns] - - return df_y, df_cl, df_dd - +def read_dataframe(args): + _, ext = os.path.splitext(args.dataframe_from) + if ext == '.h5' or ext == '.hdf5': + store = pd.HDFStore(args.dataframe_from, 'r') + df = store.get('df') + store.close() + elif ext == '.feather': + df = pd.read_feather(args.dataframe_from).fillna(0) + elif ext == '.parquet': + df = pd.read_parquet(args.dataframe_from).fillna(0) + else: + df = pd.read_csv(args.dataframe_from, low_memory=False, na_values='na').fillna(0) -def read_dataframe_from_hdf(args): - store = pd.HDFStore(args.dataframe_from, 'r') - df = store.get('df') df.rename(columns={'CELL': 'Sample', 'DRUG': 'Drug1'}, inplace=True) df_y = df[['AUC', 'Sample', 'Drug1']] @@ -134,15 +99,7 @@ def read_dataframe_from_hdf(args): def build_dataframe(args): - _, ext = os.path.splitext(args.dataframe_from) - if ext == '.h5' or ext == '.hdf5': - df_y, df_cl, df_dd = read_dataframe_from_hdf(args) - elif ext == '.feather': - df_y, df_cl, df_dd = read_dataframe_from_feather(args) - elif ext == '.parquet': - df_y, df_cl, df_dd = read_dataframe_from_parquet(args) - else: - df_y, df_cl, df_dd = read_dataframe_from_csv(args) + df_y, df_cl, df_dd = read_dataframe(args) if args.fold is not None: tr_id = pd.read_csv('{}_tr_id.csv'.format(args.fold)) From bcc63d360d2a8c802a0d0f6b03d4973fb677c6e9 Mon Sep 17 00:00:00 2001 From: Harry Yoo Date: Fri, 10 Jan 2020 11:37:09 -0600 Subject: [PATCH 132/331] filter by selected cell and drug labels --- Pilot1/Uno/topN_to_uno.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/Pilot1/Uno/topN_to_uno.py b/Pilot1/Uno/topN_to_uno.py index 7e4d1d8d..234362ef 100644 --- a/Pilot1/Uno/topN_to_uno.py +++ b/Pilot1/Uno/topN_to_uno.py @@ -17,6 +17,10 @@ def parse_arguments(): help='True for building dataset incrementally') 
parser.add_argument('--fold', type=str, default=None, help='pre-calculated indexes for cross fold validation') + parser.add_argument('--cell_feature_selection', default=None, + help='Plain text list for cell feature filtering. one item per line') + parser.add_argument('--drug_feature_selection', default=None, + help='Plain text list for drug feature filtering. one item per line') parser.add_argument('--output', type=str, default='topN.uno.h5', help='output filename') @@ -92,6 +96,14 @@ def read_dataframe(args): cl_columns = list(filter(lambda x: x.startswith('GE_'), cols)) dd_columns = list(filter(lambda x: x.startswith('DD_'), cols)) + if args.cell_feature_selection is not None: + features = set(pd.read_csv(args.cell_feature_selection, skip_blank_lines=True, header=None)[0].to_list()) + cl_columns = list(filter(lambda x: x in features, cl_columns)) + + if args.drug_feature_selection is not None: + features = set(pd.read_csv(args.drug_feature_selection, skip_blank_lines=True, header=None)[0].to_list()) + dd_columns = list(filter(lambda x: x in features, dd_columns)) + df_cl = df.loc[:, cl_columns] df_dd = df.loc[:, dd_columns] From 8899e942b917cd37071c97c0906ab5bcd4fd03b7 Mon Sep 17 00:00:00 2001 From: Harry Yoo Date: Fri, 10 Jan 2020 13:20:34 -0600 Subject: [PATCH 133/331] add test partition for fold case --- Pilot1/Uno/topN_to_uno.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/Pilot1/Uno/topN_to_uno.py b/Pilot1/Uno/topN_to_uno.py index 234362ef..1482e0ab 100644 --- a/Pilot1/Uno/topN_to_uno.py +++ b/Pilot1/Uno/topN_to_uno.py @@ -118,9 +118,11 @@ def build_dataframe(args): vl_id = pd.read_csv('{}_vl_id.csv'.format(args.fold)) tr_idx = tr_id.iloc[:, 0].dropna().values.astype(int).tolist() vl_idx = vl_id.iloc[:, 0].dropna().values.astype(int).tolist() + tr_vl_idx = tr_idx + vl_idx y_train = df_y.iloc[tr_idx, :].reset_index(drop=True) y_val = df_y.iloc[vl_idx, :].reset_index(drop=True) + y_test = df_y.loc[~df_y.index.isin(tr_vl_idx), :].reset_index(drop=True) x_train_0 = df_cl.iloc[tr_idx, :].reset_index(drop=True) x_train_1 = df_dd.iloc[tr_idx, :].reset_index(drop=True) @@ -129,6 +131,10 @@ def build_dataframe(args): x_val_0 = df_cl.iloc[vl_idx, :].reset_index(drop=True) x_val_1 = df_dd.iloc[vl_idx, :].reset_index(drop=True) x_val_1.columns = [''] * len(x_val_1.columns) + + x_test_0 = df_cl.iloc[~df_cl.index.isin(tr_vl_idx), :].reset_index(drop=True) + x_test_1 = df_dd.iloc[~df_dd.index.isin(tr_vl_idx), :].reset_index(drop=True) + x_test_1.columns = [''] * len(x_val_1.columns) else: train_mask, val_mask = build_masks(args, df_y) @@ -151,6 +157,10 @@ def build_dataframe(args): store.put('x_train_1', x_train_1, format='table') store.put('x_val_0', x_val_0, format='table') store.put('x_val_1', x_val_1, format='table') + if y_test is not None: + store.put('y_test', y_test, format='table') + store.put('x_test_0', x_test_0, format='table') + store.put('x_test_1', x_test_1, format='table') store.close() From 0f617d2c69c587ec821d91a9897eadbcb5ef8b33 Mon Sep 17 00:00:00 2001 From: Harry Yoo Date: Tue, 14 Jan 2020 16:28:37 -0600 Subject: [PATCH 134/331] save input feature shapes in exported data and use to rebuild loader/model --- Pilot1/Uno/topN_to_uno.py | 9 +++++++++ Pilot1/Uno/uno_baseline_keras2.py | 14 +++++++++++--- Pilot1/Uno/uno_data.py | 14 +++++++++++++- 3 files changed, 33 insertions(+), 4 deletions(-) diff --git a/Pilot1/Uno/topN_to_uno.py b/Pilot1/Uno/topN_to_uno.py index 1482e0ab..6aaf2e31 100644 --- a/Pilot1/Uno/topN_to_uno.py +++ b/Pilot1/Uno/topN_to_uno.py @@ -1,6 
+1,7 @@ import argparse import os import json +from collections import OrderedDict import pandas as pd import numpy as np @@ -157,6 +158,14 @@ def build_dataframe(args): store.put('x_train_1', x_train_1, format='table') store.put('x_val_0', x_val_0, format='table') store.put('x_val_1', x_val_1, format='table') + + # keep input feature list and shape + cl_width = len(df_cl.columns) + dd_width = len(df_dd.columns) + store.put('model', pd.DataFrame()) + store.get_storer('model').attrs.input_features = OrderedDict([('cell.rnaseq', 'cell.rnaseq'), ('drug1.descriptors', 'drug.descriptors')]) + store.get_storer('model').attrs.feature_shapes = OrderedDict([('cell.rnaseq', (cl_width,)), ('drug.descriptors', (dd_width,))]) + if y_test is not None: store.put('y_test', y_test, format='table') store.put('x_test_0', x_test_0, format='table') diff --git a/Pilot1/Uno/uno_baseline_keras2.py b/Pilot1/Uno/uno_baseline_keras2.py index 03e32864..7d700dbc 100644 --- a/Pilot1/Uno/uno_baseline_keras2.py +++ b/Pilot1/Uno/uno_baseline_keras2.py @@ -318,6 +318,7 @@ def run(params): test_sources=args.test_sources, embed_feature_source=not args.no_feature_source, encode_response_source=not args.no_response_source, + use_exported_data=args.use_exported_data, ) target = args.agg_dose or 'Growth' @@ -366,13 +367,20 @@ def run(params): store.append('y_{}'.format(partition), y.astype({target: 'float32'}), format='table', data_column=True, min_itemsize=config_min_itemsize) logger.info('Generating {} dataset. {} / {}'.format(partition, i, gen.steps)) + + # save input_features and feature_shapes from loader + store.put('model', pd.DataFrame()) + store.get_storer('model').attrs.input_features = loader.input_features + store.get_storer('model').attrs.feature_shapes = loader.feature_shapes + store.close() logger.info('Completed generating {}'.format(fname)) return - loader.partition_data(cv_folds=args.cv, train_split=train_split, val_split=val_split, - cell_types=args.cell_types, by_cell=args.by_cell, by_drug=args.by_drug, - cell_subset_path=args.cell_subset_path, drug_subset_path=args.drug_subset_path) + if args.use_exported_data is None: + loader.partition_data(cv_folds=args.cv, train_split=train_split, val_split=val_split, + cell_types=args.cell_types, by_cell=args.by_cell, by_drug=args.by_drug, + cell_subset_path=args.cell_subset_path, drug_subset_path=args.drug_subset_path) model = build_model(loader, args) logger.info('Combined model:') diff --git a/Pilot1/Uno/uno_data.py b/Pilot1/Uno/uno_data.py index a5fbeeb9..b88ceb41 100644 --- a/Pilot1/Uno/uno_data.py +++ b/Pilot1/Uno/uno_data.py @@ -837,7 +837,7 @@ def load(self, cache=None, ncols=None, scaling='std', dropna=None, cell_feature_subset_path=None, drug_feature_subset_path=None, drug_lower_response=1, drug_upper_response=-1, drug_response_span=0, drug_median_response_min=-1, drug_median_response_max=1, - use_landmark_genes=False, use_filtered_genes=False, + use_landmark_genes=False, use_filtered_genes=False, use_exported_data=None, preprocess_rnaseq=None, single=False, # train_sources=['GDSC', 'CTRP', 'ALMANAC', 'NCI60'], train_sources=['GDSC', 'CTRP', 'ALMANAC'], @@ -859,6 +859,18 @@ def load(self, cache=None, ncols=None, scaling='std', dropna=None, self.build_feature_list(single=single) return + # rebuild cache equivalent from the exported dataset + if use_exported_data is not None: + with pd.HDFStore(use_exported_data, 'r') as store: + if '/model' in store.keys(): + self.input_features = store.get_storer('model').attrs.input_features + self.feature_shapes = 
store.get_storer('model').attrs.feature_shapes + self.input_dim = sum([np.prod(self.feature_shapes[x]) for x in self.input_features.values()]) + return + else: + logger.warning('\nExported dataset does not have model info. Please rebuild the dataset.\n') + raise ValueError('Could not load model info from the dataset:', use_exported_data) + logger.info('Loading data from scratch ...') if agg_dose: From 2976ca1dffa7ce1f8db560e61544f753fcdf9e67 Mon Sep 17 00:00:00 2001 From: Harry Yoo Date: Wed, 15 Jan 2020 16:57:12 -0600 Subject: [PATCH 135/331] use difference dense network for cell/drug inputs --- Pilot1/Uno/uno.py | 10 ++++++++++ Pilot1/Uno/uno_baseline_keras2.py | 9 ++++++++- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/Pilot1/Uno/uno.py b/Pilot1/Uno/uno.py index 05b0dc76..6fdfbb73 100644 --- a/Pilot1/Uno/uno.py +++ b/Pilot1/Uno/uno.py @@ -100,6 +100,16 @@ def set_locals(self): 'nargs': '+', 'type': int, 'help': 'number of neurons in intermediate dense layers in the feature encoding submodels'}, + {'name': 'dense_cell_feature_layers', + 'nargs': '+', + 'type': int, + 'default': None, + 'help': 'number of neurons in intermediate dense layers in the cell feature encoding submodels'}, + {'name': 'dense_drug_feature_layers', + 'nargs': '+', + 'type': int, + 'default': None, + 'help': 'number of neurons in intermediate dense layers in the drug feature encoding submodels'}, {'name': 'use_landmark_genes', 'type': candle.str2bool, 'default': False, diff --git a/Pilot1/Uno/uno_baseline_keras2.py b/Pilot1/Uno/uno_baseline_keras2.py index 7d700dbc..a220a59f 100644 --- a/Pilot1/Uno/uno_baseline_keras2.py +++ b/Pilot1/Uno/uno_baseline_keras2.py @@ -223,8 +223,15 @@ def build_model(loader, args, permanent_dropout=True, silent=False): for fea_type, shape in loader.feature_shapes.items(): base_type = fea_type.split('.')[0] if base_type in ['cell', 'drug']: + if args.dense_cell_feature_layers is not None and base_type == 'cell': + dense_feature_layers = args.dense_cell_feature_layers + elif args.dense_drug_feature_layers is not None and base_type == 'drug': + dense_feature_layers = args.dense_drug_feature_layers + else: + dense_feature_layers = args.dense_feature_layers + box = build_feature_model(input_shape=shape, name=fea_type, - dense_layers=args.dense_feature_layers, + dense_layers=dense_feature_layers, dropout_rate=dropout_rate, permanent_dropout=permanent_dropout) if not silent: logger.debug('Feature encoding submodel for %s:', fea_type) From 75ea791b8cd9e9fb62954476e800c0e076ef6966 Mon Sep 17 00:00:00 2001 From: Harry Yoo Date: Wed, 15 Jan 2020 16:58:23 -0600 Subject: [PATCH 136/331] use test partition for prediction when available --- Pilot1/Uno/uno_baseline_keras2.py | 22 +++++++++++++++------- Pilot1/Uno/uno_data.py | 9 +++++++-- 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/Pilot1/Uno/uno_baseline_keras2.py b/Pilot1/Uno/uno_baseline_keras2.py index a220a59f..9b3ed462 100644 --- a/Pilot1/Uno/uno_baseline_keras2.py +++ b/Pilot1/Uno/uno_baseline_keras2.py @@ -468,6 +468,7 @@ def warmup_scheduler(epoch): if args.use_exported_data is not None: train_gen = DataFeeder(filename=args.use_exported_data, batch_size=args.batch_size, shuffle=args.shuffle, single=args.single, agg_dose=args.agg_dose) val_gen = DataFeeder(partition='val', filename=args.use_exported_data, batch_size=args.batch_size, shuffle=args.shuffle, single=args.single, agg_dose=args.agg_dose) + test_gen = DataFeeder(partition='test', filename=args.use_exported_data, batch_size=args.batch_size, 
shuffle=args.shuffle, single=args.single, agg_dose=args.agg_dose) else: train_gen = CombinedDataGenerator(loader, fold=fold, batch_size=args.batch_size, shuffle=args.shuffle, single=args.single) val_gen = CombinedDataGenerator(loader, partition='val', fold=fold, batch_size=args.batch_size, shuffle=args.shuffle, single=args.single) @@ -487,20 +488,27 @@ def warmup_scheduler(epoch): callbacks=callbacks, validation_data=(x_val_list, y_val)) else: - logger.info('Data points per epoch: train = %d, val = %d', train_gen.size, val_gen.size) - logger.info('Steps per epoch: train = %d, val = %d', train_gen.steps, val_gen.steps) + logger.info('Data points per epoch: train = %d, val = %d, test = %d', train_gen.size, val_gen.size, test_gen.size) + logger.info('Steps per epoch: train = %d, val = %d, test = %d', train_gen.steps, val_gen.steps, test_gen.steps) history = model.fit_generator(train_gen, train_gen.steps, epochs=args.epochs, callbacks=callbacks, validation_data=val_gen, validation_steps=val_gen.steps) - if args.no_gen: - y_val_pred = model.predict(x_val_list, batch_size=args.batch_size) + # prediction on holdout(test) when exists or use validation set + if len(test_gen) > 0: + df_val = test_gen.get_response(copy=True) + y_val = df_val[target].values + y_val_pred = model.predict_generator(test_gen, test_gen.steps + 1) + y_val_pred = y_val_pred[:test_gen.size] else: - val_gen.reset() - y_val_pred = model.predict_generator(val_gen, val_gen.steps + 1) - y_val_pred = y_val_pred[:val_gen.size] + if args.no_gen: + y_val_pred = model.predict(x_val_list, batch_size=args.batch_size) + else: + val_gen.reset() + y_val_pred = model.predict_generator(val_gen, val_gen.steps + 1) + y_val_pred = y_val_pred[:val_gen.size] y_val_pred = y_val_pred.flatten() diff --git a/Pilot1/Uno/uno_data.py b/Pilot1/Uno/uno_data.py index b88ceb41..e687ca4a 100644 --- a/Pilot1/Uno/uno_data.py +++ b/Pilot1/Uno/uno_data.py @@ -866,6 +866,7 @@ def load(self, cache=None, ncols=None, scaling='std', dropna=None, self.input_features = store.get_storer('model').attrs.input_features self.feature_shapes = store.get_storer('model').attrs.feature_shapes self.input_dim = sum([np.prod(self.feature_shapes[x]) for x in self.input_features.values()]) + self.test_sep_sources = [] return else: logger.warning('\nExported dataset does not have model info. 
Please rebuild the dataset.\n') @@ -1009,8 +1010,12 @@ def __init__(self, partition='train', filename=None, batch_size=32, shuffle=Fals self.store = pd.HDFStore(filename, mode='r') self.input_size = len(list(filter(lambda x: x.startswith('/x_train'), self.store.keys()))) - y = self.store.select('y_{}'.format(self.partition)) - self.index = y.index + try: + y = self.store.select('y_{}'.format(self.partition)) + self.index = y.index + except KeyError: + self.index = [] + self.size = len(self.index) if self.size >= self.batch_size: self.steps = self.size // self.batch_size From 4d6b6212b55250977414c8ef28fb2b8cfe5219fc Mon Sep 17 00:00:00 2001 From: Harry Yoo Date: Wed, 15 Jan 2020 17:12:11 -0600 Subject: [PATCH 137/331] add to ignored key list --- Pilot1/Uno/uno_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Pilot1/Uno/uno_data.py b/Pilot1/Uno/uno_data.py index e687ca4a..599d1abe 100644 --- a/Pilot1/Uno/uno_data.py +++ b/Pilot1/Uno/uno_data.py @@ -665,7 +665,7 @@ def load_from_cache(self, cache, params): except json.JSONDecodeError as e: logger.warning('Could not decode parameter file %s: %s', param_fname, e) return False - ignore_keys = ['cache', 'partition_by', 'single'] + ignore_keys = ['cache', 'partition_by', 'single', 'use_exported_data'] equal, diffs = dict_compare(params, cached_params, ignore_keys) if not equal: logger.warning('Cache parameter mismatch: %s\nSaved: %s\nAttempted to load: %s', diffs, cached_params, params) From 2be1c341c128ac08c5d13a8b21b8a0e287730521 Mon Sep 17 00:00:00 2001 From: Harry Yoo Date: Wed, 15 Jan 2020 18:20:31 -0600 Subject: [PATCH 138/331] use candle lib for plot_history function --- Pilot1/Uno/uno_baseline_keras2.py | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/Pilot1/Uno/uno_baseline_keras2.py b/Pilot1/Uno/uno_baseline_keras2.py index 9b3ed462..07b32e72 100644 --- a/Pilot1/Uno/uno_baseline_keras2.py +++ b/Pilot1/Uno/uno_baseline_keras2.py @@ -135,20 +135,6 @@ def log_evaluation(metric_outputs, description='Comparing y_true and y_pred:'): logger.info(' {}: {:.4f}'.format(metric, value)) -#def plot_history(out, history, metric='loss', title=None): -# title = title or 'model {}'.format(metric) -# val_metric = 'val_{}'.format(metric) -# plt.figure(figsize=(8, 6)) -# plt.plot(history.history[metric], marker='o') -# plt.plot(history.history[val_metric], marker='d') -# plt.title(title) -# plt.ylabel(metric) -# plt.xlabel('epoch') -# plt.legend(['train_{}'.format(metric), 'val_{}'.format(metric)], loc='upper center') -# png = '{}.plot.{}.png'.format(out, metric) -# plt.savefig(png, bbox_inches='tight') -# - class LoggingCallback(Callback): def __init__(self, print_fcn=print): Callback.__init__(self) @@ -521,9 +507,9 @@ def warmup_scheduler(epoch): df_pred_list.append(df_val) if hasattr(history, 'loss'): - plot_history(prefix, history, 'loss') + candle.plot_history(prefix, history, 'loss') if hasattr(history, 'r2'): - plot_history(prefix, history, 'r2') + candle.plot_history(prefix, history, 'r2') pred_fname = prefix + '.predicted.tsv' df_pred = pd.concat(df_pred_list) From 54388e9995d2b5a0338633452d6f9ae28c45f43b Mon Sep 17 00:00:00 2001 From: Harry Yoo Date: Wed, 15 Jan 2020 18:30:37 -0600 Subject: [PATCH 139/331] add plot_metrics function in candle lib, usecase in uno --- Pilot1/Uno/uno_baseline_keras2.py | 5 +-- common/candle/__init__.py | 1 + common/viz_utils.py | 66 +++++++++++++++++++++++++++++++ 3 files changed, 68 insertions(+), 4 deletions(-) diff --git 
a/Pilot1/Uno/uno_baseline_keras2.py b/Pilot1/Uno/uno_baseline_keras2.py index 07b32e72..ce755d26 100644 --- a/Pilot1/Uno/uno_baseline_keras2.py +++ b/Pilot1/Uno/uno_baseline_keras2.py @@ -506,10 +506,7 @@ def warmup_scheduler(epoch): df_val[target + 'Error'] = y_val_pred - y_val df_pred_list.append(df_val) - if hasattr(history, 'loss'): - candle.plot_history(prefix, history, 'loss') - if hasattr(history, 'r2'): - candle.plot_history(prefix, history, 'r2') + candle.plot_metrics(history, title=None, skip_ep=0, outdir='./save/', add_lr=True) pred_fname = prefix + '.predicted.tsv' df_pred = pd.concat(df_pred_list) diff --git a/common/candle/__init__.py b/common/candle/__init__.py index 627a5a40..5e0f4ca0 100644 --- a/common/candle/__init__.py +++ b/common/candle/__init__.py @@ -73,6 +73,7 @@ from keras_utils import mae from keras_utils import mse + from viz_utils import plot_metrics from solr_keras import CandleRemoteMonitor from solr_keras import compute_trainable_params diff --git a/common/viz_utils.py b/common/viz_utils.py index 2ca87eae..cb24a2b6 100644 --- a/common/viz_utils.py +++ b/common/viz_utils.py @@ -1,3 +1,4 @@ +from pathlib import Path import matplotlib as mpl mpl.use('Agg') import matplotlib.pyplot as plt @@ -359,3 +360,68 @@ def plot_percentile_predictions(Ypred, Ypred_Lp, Ypred_Hp, percentile_list, pred plt.close() print('Generated plot: ', figprefix + '_percentile_predictions.png') + +# plot training and validation metrics together and generate one chart per metrics +def plot_metrics(history, title=None, skip_ep=0, outdir='.', add_lr=False): + """ Plots keras training curves history. + Args: + skip_ep: number of epochs to skip when plotting metrics + add_lr: add curve of learning rate progression over epochs + """ + + def capitalize_metric(met): + return ' '.join(s.capitalize() for s in met.split('_')) + + all_metrics = list(history.history.keys()) + pr_metrics = ['_'.join(m.split('_')[1:]) for m in all_metrics if 'val' in m] + + epochs = np.asarray(history.epoch) + 1 + if len(epochs) <= skip_ep: + skip_ep = 0 + eps = epochs[skip_ep:] + hh = history.history + + for p, m in enumerate(pr_metrics): + metric_name = m + metric_name_val = 'val_' + m + + y_tr = hh[metric_name][skip_ep:] + y_vl = hh[metric_name_val][skip_ep:] + + ymin = min(set(y_tr).union(y_vl)) + ymax = max(set(y_tr).union(y_vl)) + lim = (ymax - ymin) * 0.1 + ymin, ymax = ymin - lim, ymax + lim + + # Start figure + fig, ax1 = plt.subplots() + + # Plot metrics + ax1.plot(eps, y_tr, color='b', marker='.', linestyle='-', linewidth=1, alpha=0.6, label=capitalize_metric(metric_name)) + ax1.plot(eps, y_vl, color='r', marker='.', linestyle='--', linewidth=1, alpha=0.6, label=capitalize_metric(metric_name_val)) + ax1.set_xlabel('Epoch') + ax1.set_ylabel(capitalize_metric(metric_name)) + ax1.set_xlim([min(eps) - 1, max(eps) + 1]) + ax1.set_ylim([ymin, ymax]) + ax1.tick_params('y', colors='k') + + # Add learning rate + if (add_lr is True) and ('lr' in hh): + ax2 = ax1.twinx() + ax2.plot(eps, hh['lr'][skip_ep:], color='g', marker='.', linestyle=':', linewidth=1, + alpha=0.6, markersize=5, label='LR') + ax2.set_ylabel('Learning rate', color='g', fontsize=12) + + ax2.set_yscale('log') + ax2.tick_params('y', colors='g') + + ax1.grid(True) + legend = ax1.legend(loc='best', prop={'size': 10}) + frame = legend.get_frame() + frame.set_facecolor('0.95') + if title is not None: + plt.title(title) + + figpath = Path(outdir) / (metric_name + '.png') + plt.savefig(figpath, bbox_inches='tight') + plt.close() From 
3902b5f13ece5ff0cdeab127c965be52542f398f Mon Sep 17 00:00:00 2001 From: Harry Yoo Date: Thu, 16 Jan 2020 09:06:57 -0600 Subject: [PATCH 140/331] use sample size for checking availability --- Pilot1/Uno/uno_baseline_keras2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Pilot1/Uno/uno_baseline_keras2.py b/Pilot1/Uno/uno_baseline_keras2.py index ce755d26..3819b295 100644 --- a/Pilot1/Uno/uno_baseline_keras2.py +++ b/Pilot1/Uno/uno_baseline_keras2.py @@ -483,7 +483,7 @@ def warmup_scheduler(epoch): validation_steps=val_gen.steps) # prediction on holdout(test) when exists or use validation set - if len(test_gen) > 0: + if test_gen.size > 0: df_val = test_gen.get_response(copy=True) y_val = df_val[target].values y_val_pred = model.predict_generator(test_gen, test_gen.steps + 1) From 196f09da816be5c183165c9d31cf065eb53658af Mon Sep 17 00:00:00 2001 From: Harry Yoo Date: Thu, 16 Jan 2020 09:34:29 -0600 Subject: [PATCH 141/331] fix lint issues --- Pilot1/Uno/uno.py | 12 ++++++------ Pilot1/Uno/uno_baseline_keras2.py | 16 +++++----------- Pilot1/Uno/uno_data.py | 11 +++++------ 3 files changed, 16 insertions(+), 23 deletions(-) diff --git a/Pilot1/Uno/uno.py b/Pilot1/Uno/uno.py index 6fdfbb73..2794e20f 100644 --- a/Pilot1/Uno/uno.py +++ b/Pilot1/Uno/uno.py @@ -206,14 +206,14 @@ def set_locals(self): 'type': int, 'default': 0, 'help': 'number of bins to use when discretizing growth response'}, - {'name' : 'initial_weights', - 'type' : str, + {'name': 'initial_weights', + 'type': str, 'default': None, - 'help' : 'file name of initial weights'}, - {'name' : 'save_weights', + 'help': 'file name of initial weights'}, + {'name': 'save_weights', 'type': str, - 'default' : None, - 'help': 'name of file to save weights to' } + 'default': None, + 'help': 'name of file to save weights to'} ] required = [ diff --git a/Pilot1/Uno/uno_baseline_keras2.py b/Pilot1/Uno/uno_baseline_keras2.py index 3819b295..2a4f656e 100644 --- a/Pilot1/Uno/uno_baseline_keras2.py +++ b/Pilot1/Uno/uno_baseline_keras2.py @@ -18,10 +18,6 @@ from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error from scipy.stats.stats import pearsonr -# For non-interactive plotting -import matplotlib as mpl -import matplotlib.pyplot as plt - import uno as benchmark import candle @@ -29,7 +25,6 @@ from uno_data import CombinedDataLoader, CombinedDataGenerator, DataFeeder -mpl.use('Agg') logger = logging.getLogger(__name__) os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' @@ -187,7 +182,8 @@ def build_feature_model(input_shape, name='', dense_layers=[1000, 1000], model = Model(x_input, h, name=name) return model -class SimpleWeightSaver(Callback): + +class SimpleWeightSaver(Callback): def __init__(self, fname): self.fname = fname @@ -200,7 +196,6 @@ def set_model(self, model): def on_train_end(self, logs={}): self.model.save_weights(self.fname) - def build_model(loader, args, permanent_dropout=True, silent=False): @@ -258,7 +253,7 @@ def build_model(loader, args, permanent_dropout=True, silent=False): return Model(inputs, output) -def initialize_parameters(default_model = 'uno_default_model.txt'): +def initialize_parameters(default_model='uno_default_model.txt'): # Build benchmark object unoBmk = benchmark.BenchmarkUno(benchmark.file_path, default_model, 'keras', @@ -282,7 +277,7 @@ def run(params): ext = extension_from_parameters(args) verify_path(args.save_path) prefix = args.save_path + ext - logfile = args.logfile if args.logfile else prefix+'.log' + logfile = args.logfile if args.logfile else prefix + '.log' 
set_up_logger(logfile, args.verbose) logger.info('Params: {}'.format(params)) @@ -420,7 +415,6 @@ def warmup_scheduler(epoch): if args.learning_rate: K.set_value(optimizer.lr, args.learning_rate) - model.compile(loss=args.loss, optimizer=optimizer, metrics=[mae, r2]) # calculate trainable and non-trainable params @@ -435,7 +429,7 @@ def warmup_scheduler(epoch): checkpointer = MultiGPUCheckpoint(prefix + cv_ext + '.model.h5', save_best_only=True) tensorboard = TensorBoard(log_dir="tb/{}{}{}".format(args.tb_prefix, ext, cv_ext)) history_logger = LoggingCallback(logger.debug) - + callbacks = [candle_monitor, timeout_monitor, history_logger] if args.es: callbacks.append(es_monitor) diff --git a/Pilot1/Uno/uno_data.py b/Pilot1/Uno/uno_data.py index 599d1abe..c0b67172 100644 --- a/Pilot1/Uno/uno_data.py +++ b/Pilot1/Uno/uno_data.py @@ -923,17 +923,16 @@ def load(self, cache=None, ncols=None, scaling='std', dropna=None, df_drug_desc = load_drug_descriptors(ncols=ncols, scaling=scaling, dropna=dropna, feature_subset=drug_feature_subset) elif fea == 'fingerprints': df_drug_fp = load_drug_fingerprints(ncols=ncols, scaling=scaling, dropna=dropna, feature_subset=drug_feature_subset) - elif fea == 'mordred' : + elif fea == 'mordred': df_drug_mordred = load_mordred_descriptors(ncols=ncols, scaling=scaling, dropna=dropna, feature_subset=drug_feature_subset) - # df_drug_desc, df_drug_fp = load_drug_data(ncols=ncols, scaling=scaling, dropna=dropna) cell_df_dict = {'rnaseq': 'df_cell_rnaseq'} drug_df_dict = {'descriptors': 'df_drug_desc', 'fingerprints': 'df_drug_fp', - 'mordred' : 'df_drug_mordred'} + 'mordred': 'df_drug_mordred'} # df_cell_ids = df_cell_rnaseq[['Sample']].drop_duplicates() # df_drug_ids = pd.concat([df_drug_desc[['Drug']], df_drug_fp[['Drug']]]).drop_duplicates() @@ -1044,14 +1043,14 @@ def reset(self): def get_response(self, copy=False): if self.shuffle: self.index = [item for step in range(self.steps) for item in range(self.index_map[step] * self.batch_size, (self.index_map[step] + 1) * self.batch_size)] - df = self.store.get('y_{}'.format(self.partition)).iloc[self.index,:] + df = self.store.get('y_{}'.format(self.partition)).iloc[self.index, :] else: df = self.store.get('y_{}'.format(self.partition)) if self.agg_dose is None: - df['Dose1'] = self.store.get('x_{}_0'.format(self.partition)).iloc[self.index,:] + df['Dose1'] = self.store.get('x_{}_0'.format(self.partition)).iloc[self.index, :] if not self.single: - df['Dose2'] = self.store.get('x_{}_1'.format(self.partition)).iloc[self.index,:] + df['Dose2'] = self.store.get('x_{}_1'.format(self.partition)).iloc[self.index, :] return df.copy() if copy else df def close(self): From f94de3cbd541ab0be4a3212fce2e756cd273707a Mon Sep 17 00:00:00 2001 From: Harry Yoo Date: Thu, 16 Jan 2020 10:06:10 -0800 Subject: [PATCH 142/331] error handling since test partition can be optional --- Pilot1/Uno/uno_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Pilot1/Uno/uno_data.py b/Pilot1/Uno/uno_data.py index c0b67172..46d02187 100644 --- a/Pilot1/Uno/uno_data.py +++ b/Pilot1/Uno/uno_data.py @@ -1071,7 +1071,7 @@ def __init__(self, data, partition='train', fold=0, source=None, batch_size=32, elif partition == 'val': index = data.val_indexes[fold] else: - index = data.test_indexes[fold] + index = data.test_indexes[fold] if hasattr(data, 'test_indexes') else [] if source: df = data.df_response[['Source']].iloc[index, :] From b5b1ea1ce74daf8f6486b5e1adb471a35263b34a Mon Sep 17 00:00:00 2001 From: Harry Yoo Date: Thu, 16 Jan 2020 
11:03:08 -0800 Subject: [PATCH 143/331] fix build error; add closing single quote --- Pilot1/P1B2/p1b2_baseline_keras2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Pilot1/P1B2/p1b2_baseline_keras2.py b/Pilot1/P1B2/p1b2_baseline_keras2.py index d4018453..642d5f22 100644 --- a/Pilot1/P1B2/p1b2_baseline_keras2.py +++ b/Pilot1/P1B2/p1b2_baseline_keras2.py @@ -17,7 +17,7 @@ import p1b2 import candle -def initialize_parameters(default_model = 'p1b2_default_model.txt): +def initialize_parameters(default_model = 'p1b2_default_model.txt'): # Build benchmark object p1b2Bmk = p1b2.BenchmarkP1B2(p1b2.file_path, default_model, 'keras', From 6d9728623eaac49cf28f71df7f9d5105ae21eaa8 Mon Sep 17 00:00:00 2001 From: "Gounley, John P" Date: Tue, 11 Feb 2020 09:00:43 -0500 Subject: [PATCH 144/331] Restructure p3b4 data flow. Add training of embedding as settable parameter --- Pilot3/P3B4/p3b4.py | 2 +- Pilot3/P3B4/p3b4_baseline_keras2.py | 5 +- Pilot3/P3B4/p3b4_default_model.txt | 4 +- Pilot3/P3B4/tf_mthcan.py | 132 +++++++++++++--------------- 4 files changed, 67 insertions(+), 76 deletions(-) diff --git a/Pilot3/P3B4/p3b4.py b/Pilot3/P3B4/p3b4.py index 08498acc..8098104d 100644 --- a/Pilot3/P3B4/p3b4.py +++ b/Pilot3/P3B4/p3b4.py @@ -17,7 +17,7 @@ required = [ 'learning_rate', 'batch_size', 'epochs', 'dropout', \ 'optimizer', 'wv_len', \ - 'attention_size'] + 'attention_size', 'embed_train'] diff --git a/Pilot3/P3B4/p3b4_baseline_keras2.py b/Pilot3/P3B4/p3b4_baseline_keras2.py index 19b64d03..90e0900d 100644 --- a/Pilot3/P3B4/p3b4_baseline_keras2.py +++ b/Pilot3/P3B4/p3b4_baseline_keras2.py @@ -52,7 +52,7 @@ def run(gParameters): batch_size = gParameters[ 'batch_size' ] epochs = gParameters[ 'epochs' ] dropout = gParameters[ 'dropout' ] - + embed_train = gParameters[ 'embed_train' ] optimizer = gParameters[ 'optimizer' ] if optimizer == 0: @@ -121,7 +121,8 @@ def run(gParameters): attention_size= attention_size, dropout_rate = dropout, lr = learning_rate, - optimizer= optimizer + optimizer= optimizer, + embed_train = embed_train ) ret = model.train( diff --git a/Pilot3/P3B4/p3b4_default_model.txt b/Pilot3/P3B4/p3b4_default_model.txt index 02044931..76f66fed 100644 --- a/Pilot3/P3B4/p3b4_default_model.txt +++ b/Pilot3/P3B4/p3b4_default_model.txt @@ -3,10 +3,10 @@ data_url = 'ftp://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot3/' train_data = 'P3B3_data.tar.gz' model_name = 'p3b4' batch_size = 64 -epochs = 1 +epochs = 10 optimizer = 0 learning_rate = 0.0001 wv_len = 50 attention_size = 500 dropout = 0.1 - +embed_train = False diff --git a/Pilot3/P3B4/tf_mthcan.py b/Pilot3/P3B4/tf_mthcan.py index ec3f8af6..0b11881e 100644 --- a/Pilot3/P3B4/tf_mthcan.py +++ b/Pilot3/P3B4/tf_mthcan.py @@ -13,7 +13,8 @@ def __init__(self): class hcan(object): def __init__(self,embedding_matrix,num_classes,max_sents,max_words, - attention_size=512,dropout_rate=0.9,activation=tf.nn.elu,lr=0.0001, optimizer= 'adam'): + attention_size=512,dropout_rate=0.9,activation=tf.nn.elu,lr=0.0001, + optimizer= 'adam', embed_train = True): tf.reset_default_graph() @@ -27,62 +28,17 @@ def __init__(self,embedding_matrix,num_classes,max_sents,max_words, self.attention_size = attention_size self.activation = activation self.num_tasks = len(num_classes) + self.embed_train = embed_train #doc input - self.doc_input = tf.placeholder(tf.int32, shape=[None,max_sents,max_words]) - doc_embeds = tf.map_fn(self._attention_step,self.doc_input,dtype=tf.float32) - - #classification functions - logits = [] - self.predictions = [] - for 
i in range(self.num_tasks): - logit = tf.layers.dense(doc_embeds,num_classes[i], - kernel_initializer=tf.contrib.layers.xavier_initializer()) - logits.append(logit) - self.predictions.append(tf.nn.softmax(logit)) - - #loss, accuracy, and training functions - self.labels = [] - self.loss = 0 - for i in range(self.num_tasks): - label = tf.placeholder(tf.int32,shape=[None]) - self.labels.append(label) - loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits[i],labels=label)) - self.loss += loss/self.num_tasks - # self.optimizer = tf.train.AdamOptimizer(lr,0.9,0.99).minimize(self.loss) - if optimizer == 'adam': - self.optimizer = tf.train.AdamOptimizer(lr,0.9,0.99).minimize(self.loss) - elif optimizer == 'sgd': - self.optimizer = tf.train.GradientDescentOptimizer( lr ).minimize( self.loss ) - elif optimizer == 'adadelta': - self.optimizer = tf.train.AdadeltaOptimizer( learning_rate= lr ).minimize( self.loss ) - else: - self.optimizer = tf.train.RMSPropOptimizer( lr ).minimize( self.loss ) - - #init op - config = tf.ConfigProto() - config.gpu_options.allow_growth = True - self.saver = tf.train.Saver() - self.sess = tf.Session(config=config) - self.sess.run(tf.global_variables_initializer()) - - def _attention_step(self,doc): - - words_per_line = tf.reduce_sum(tf.sign(doc),1) - num_lines = tf.reduce_sum(tf.sign(words_per_line)) - max_words_ = tf.reduce_max(words_per_line) - doc_input_reduced = doc[:num_lines,:max_words_] - num_words = words_per_line[:num_lines] + self.doc_input = tf.placeholder(tf.int32, shape=[None,max_sents,max_words]) # batch x sents x words + batch_size = tf.shape(self.doc_input)[0] + doc_input_reshape = tf.reshape(self.doc_input,(-1,max_words)) # batch*sents x words #word embeddings word_embeds = tf.gather(tf.get_variable('embeddings',initializer=self.embedding_matrix, - dtype=tf.float32),doc_input_reduced) - word_embeds = tf.nn.dropout(word_embeds,self.dropout) - - #masking - mask_base = tf.cast(tf.sequence_mask(num_words,max_words_),tf.float32) - mask = tf.tile(tf.expand_dims(mask_base,2),[1,1,self.attention_size]) - mask2 = tf.tile(tf.expand_dims(mask_base,2),[1,1,max_words_]) + dtype=tf.float32, trainable=self.embed_train),doc_input_reshape) + word_embeds = tf.nn.dropout(word_embeds,self.dropout) # batch*sents x words x attention_size #word self attention Q = tf.layers.conv1d(word_embeds,self.attention_size,1,padding='same', @@ -92,31 +48,27 @@ def _attention_step(self,doc): V = tf.layers.conv1d(word_embeds,self.attention_size,1,padding='same', activation=self.activation,kernel_initializer=tf.contrib.layers.xavier_initializer()) - Q = tf.where(tf.equal(mask,0),tf.zeros_like(Q),Q) - K = tf.where(tf.equal(mask,0),tf.zeros_like(K),K) - V = tf.where(tf.equal(mask,0),tf.zeros_like(V),V) - outputs = tf.matmul(Q,tf.transpose(K,[0, 2, 1])) outputs = outputs/(K.get_shape().as_list()[-1]**0.5) outputs = tf.where(tf.equal(outputs,0),tf.ones_like(outputs)*-1000,outputs) outputs = tf.nn.dropout(tf.nn.softmax(outputs),self.dropout) - outputs = tf.where(tf.equal(mask2,0),tf.zeros_like(outputs),outputs) - outputs = tf.matmul(outputs,V) - outputs = tf.where(tf.equal(mask,0),tf.zeros_like(outputs),outputs) + outputs = tf.matmul(outputs,V) # batch*sents x words x attention_size #word target attention - Q = tf.get_variable('word_Q',(1,1,self.attention_size), + Q = tf.get_variable('word_Q',(1,self.attention_size,1), tf.float32,tf.orthogonal_initializer()) - Q = tf.tile(Q,[num_lines,1,1]) + Q = tf.tile(Q,[batch_size*max_sents,1,1]) V = outputs - outputs = 
tf.matmul(Q,tf.transpose(outputs,[0, 2, 1])) + outputs = tf.matmul(outputs,Q) # batch*sents x words x 1 outputs = outputs/(K.get_shape().as_list()[-1]**0.5) outputs = tf.where(tf.equal(outputs,0),tf.ones_like(outputs)*-1000,outputs) outputs = tf.nn.dropout(tf.nn.softmax(outputs),self.dropout) - outputs = tf.matmul(outputs,V) - sent_embeds = tf.transpose(outputs,[1,0,2]) - sent_embeds = tf.nn.dropout(sent_embeds,self.dropout) + outputs = tf.expand_dims(tf.squeeze(outputs,[2]),1) # batch*sents x 1 x words + outputs = tf.matmul(outputs,V) # batch*sents x 1 x attention_size + + sent_embeds = tf.reshape(outputs,(-1,max_sents,self.attention_size)) + sent_embeds = tf.nn.dropout(sent_embeds,self.dropout) # batch x sents x attention_size #sent self attention Q = tf.layers.conv1d(sent_embeds,self.attention_size,1,padding='same', @@ -128,21 +80,57 @@ def _attention_step(self,doc): outputs = tf.matmul(Q,tf.transpose(K,[0, 2, 1])) outputs = outputs/(K.get_shape().as_list()[-1]**0.5) + outputs = tf.where(tf.equal(outputs,0),tf.ones_like(outputs)*-1000,outputs) outputs = tf.nn.dropout(tf.nn.softmax(outputs),self.dropout) - outputs = tf.matmul(outputs,V) + outputs = tf.matmul(outputs,V) # batch x sents x attention_size #sent target attention - Q = tf.get_variable('sent_Q',(1,1,self.attention_size), + Q = tf.get_variable('sent_Q',(1,self.attention_size,1), tf.float32,tf.orthogonal_initializer()) + Q = tf.tile(Q,[batch_size,1,1]) V = outputs - outputs = tf.matmul(Q,tf.transpose(outputs,[0, 2, 1])) + outputs = tf.matmul(outputs,Q) # batch x sents x 1 outputs = outputs/(K.get_shape().as_list()[-1]**0.5) + outputs = tf.where(tf.equal(outputs,0),tf.ones_like(outputs)*-1000,outputs) outputs = tf.nn.dropout(tf.nn.softmax(outputs),self.dropout) - outputs = tf.matmul(outputs,V) - doc_embed = tf.nn.dropout(tf.squeeze(outputs,[0]),self.dropout) + outputs = tf.expand_dims(tf.squeeze(outputs,[2]),1) # batch x 1 x sents + outputs = tf.matmul(outputs,V) # batch x 1 x attention_size + doc_embeds = tf.nn.dropout(tf.squeeze(outputs,[1]),self.dropout) # batch x attention_size - return tf.squeeze(doc_embed,[0]) + #classification functions + logits = [] + self.predictions = [] + for i in range(self.num_tasks): + logit = tf.layers.dense(doc_embeds,num_classes[i], + kernel_initializer=tf.contrib.layers.xavier_initializer()) + logits.append(logit) + self.predictions.append(tf.nn.softmax(logit)) + + #loss, accuracy, and training functions + self.labels = [] + self.loss = 0 + for i in range(self.num_tasks): + label = tf.placeholder(tf.int32,shape=[None]) + self.labels.append(label) + loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits[i],labels=label)) + self.loss += loss/self.num_tasks + # self.optimizer = tf.train.AdamOptimizer(lr,0.9,0.99).minimize(self.loss) + if optimizer == 'adam': + self.optimizer = tf.train.AdamOptimizer(lr,0.9,0.99).minimize(self.loss) + elif optimizer == 'sgd': + self.optimizer = tf.train.GradientDescentOptimizer( lr ).minimize( self.loss ) + elif optimizer == 'adadelta': + self.optimizer = tf.train.AdadeltaOptimizer( learning_rate= lr ).minimize( self.loss ) + else: + self.optimizer = tf.train.RMSPropOptimizer( lr ).minimize( self.loss ) + + #init op + config = tf.ConfigProto() + config.gpu_options.allow_growth = True + self.saver = tf.train.Saver() + self.sess = tf.Session(config=config) + self.sess.run(tf.global_variables_initializer()) def train(self,data,labels,batch_size=100,epochs=50,validation_data=None): @@ -298,6 +286,7 @@ def load(self,filename): #create data 
vocab = np.random.rand(vocab_size,embedding_size) + vocab[0,:] = 0 X = np.random.randint(0,vocab_size,(train_samples+test_samples,max_lines,max_words)) #optional masking @@ -328,3 +317,4 @@ def load(self,filename): print(history.history) + From 413d5b17dbeee25499cf125199663f4739efb682 Mon Sep 17 00:00:00 2001 From: "Gounley, John P" Date: Tue, 11 Feb 2020 10:45:18 -0500 Subject: [PATCH 145/331] Add new version of clipping scheme --- Pilot3/P3B4/tf_mthcan.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/Pilot3/P3B4/tf_mthcan.py b/Pilot3/P3B4/tf_mthcan.py index 0b11881e..2d6155ba 100644 --- a/Pilot3/P3B4/tf_mthcan.py +++ b/Pilot3/P3B4/tf_mthcan.py @@ -33,7 +33,14 @@ def __init__(self,embedding_matrix,num_classes,max_sents,max_words, #doc input self.doc_input = tf.placeholder(tf.int32, shape=[None,max_sents,max_words]) # batch x sents x words batch_size = tf.shape(self.doc_input)[0] - doc_input_reshape = tf.reshape(self.doc_input,(-1,max_words)) # batch*sents x words + + words_per_sent = tf.reduce_sum(tf.sign(self.doc_input),2) # batch X sents + max_words_ = tf.reduce_max(words_per_sent) + sents_per_doc = tf.reduce_sum(tf.sign(words_per_sent),1) # batch + max_sents_ = tf.reduce_max(sents_per_doc) + doc_input_reduced = self.doc_input[:,:max_sents_,:max_words_] #clip + + doc_input_reshape = tf.reshape(doc_input_reduced,(-1,max_words_)) # batch*sents x words #word embeddings word_embeds = tf.gather(tf.get_variable('embeddings',initializer=self.embedding_matrix, @@ -55,19 +62,18 @@ def __init__(self,embedding_matrix,num_classes,max_sents,max_words, outputs = tf.matmul(outputs,V) # batch*sents x words x attention_size #word target attention - Q = tf.get_variable('word_Q',(1,self.attention_size,1), + Q = tf.get_variable('word_Q',(1,1,self.attention_size), tf.float32,tf.orthogonal_initializer()) - Q = tf.tile(Q,[batch_size*max_sents,1,1]) + Q = tf.tile(Q,[batch_size*max_sents_,1,1]) V = outputs - outputs = tf.matmul(outputs,Q) # batch*sents x words x 1 + outputs = tf.matmul(Q,tf.transpose(outputs,[0, 2, 1])) outputs = outputs/(K.get_shape().as_list()[-1]**0.5) outputs = tf.where(tf.equal(outputs,0),tf.ones_like(outputs)*-1000,outputs) outputs = tf.nn.dropout(tf.nn.softmax(outputs),self.dropout) - outputs = tf.expand_dims(tf.squeeze(outputs,[2]),1) # batch*sents x 1 x words outputs = tf.matmul(outputs,V) # batch*sents x 1 x attention_size - sent_embeds = tf.reshape(outputs,(-1,max_sents,self.attention_size)) + sent_embeds = tf.reshape(outputs,(-1,max_sents_,self.attention_size)) sent_embeds = tf.nn.dropout(sent_embeds,self.dropout) # batch x sents x attention_size #sent self attention @@ -85,16 +91,15 @@ def __init__(self,embedding_matrix,num_classes,max_sents,max_words, outputs = tf.matmul(outputs,V) # batch x sents x attention_size #sent target attention - Q = tf.get_variable('sent_Q',(1,self.attention_size,1), + Q = tf.get_variable('sent_Q',(1,1,self.attention_size), tf.float32,tf.orthogonal_initializer()) Q = tf.tile(Q,[batch_size,1,1]) V = outputs - outputs = tf.matmul(outputs,Q) # batch x sents x 1 + outputs = tf.matmul(Q,tf.transpose(outputs,[0, 2, 1])) outputs = outputs/(K.get_shape().as_list()[-1]**0.5) outputs = tf.where(tf.equal(outputs,0),tf.ones_like(outputs)*-1000,outputs) outputs = tf.nn.dropout(tf.nn.softmax(outputs),self.dropout) - outputs = tf.expand_dims(tf.squeeze(outputs,[2]),1) # batch x 1 x sents outputs = tf.matmul(outputs,V) # batch x 1 x attention_size doc_embeds = tf.nn.dropout(tf.squeeze(outputs,[1]),self.dropout) # batch x 
attention_size From abe8668e5a87b5e7163831bb194800147e6c23ff Mon Sep 17 00:00:00 2001 From: Brettin Date: Fri, 14 Feb 2020 04:16:27 -0800 Subject: [PATCH 146/331] new files --- Pilot1/Attn1/attn_bin_working_jan7_h5.py | 568 +++++++++++++++++++++++ Pilot1/Attn1/attn_bin_working_jan7_h5.sh | 14 + Pilot1/Attn1/cmd1.sh | 17 + Pilot1/Attn1/cmd2.sh | 5 + 4 files changed, 604 insertions(+) create mode 100644 Pilot1/Attn1/attn_bin_working_jan7_h5.py create mode 100755 Pilot1/Attn1/attn_bin_working_jan7_h5.sh create mode 100755 Pilot1/Attn1/cmd1.sh create mode 100755 Pilot1/Attn1/cmd2.sh diff --git a/Pilot1/Attn1/attn_bin_working_jan7_h5.py b/Pilot1/Attn1/attn_bin_working_jan7_h5.py new file mode 100644 index 00000000..570fc94f --- /dev/null +++ b/Pilot1/Attn1/attn_bin_working_jan7_h5.py @@ -0,0 +1,568 @@ +import itertools +import pandas as pd +import numpy as np +import os +import sys +import gzip +import argparse +import sklearn + +import matplotlib +matplotlib.use('Agg') + +import matplotlib.pyplot as plt + +import tensorflow as tf + +import keras as ke +from keras import backend as K + +from keras.layers import Input, Dense, Dropout, Activation, BatchNormalization +from keras.optimizers import SGD, Adam, RMSprop, Adadelta +from keras.models import Sequential, Model, model_from_json, model_from_yaml +from keras.utils import np_utils, multi_gpu_model + +from keras.callbacks import ModelCheckpoint, CSVLogger, ReduceLROnPlateau, EarlyStopping + +from sklearn.utils.class_weight import compute_class_weight +from sklearn.model_selection import train_test_split +from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, roc_auc_score, confusion_matrix, balanced_accuracy_score, classification_report +from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler +from sklearn.metrics import recall_score, auc, roc_curve, f1_score, precision_recall_curve + + +file_path = os.path.dirname(os.path.realpath(__file__)) +lib_path = os.path.abspath(os.path.join(file_path, '..', '..', 'common')) +sys.path.append(lib_path) + +psr = argparse.ArgumentParser(description='input agg csv file') +psr.add_argument('--in', default='in_file') +psr.add_argument('--ep', type=int, default=400) +psr.add_argument('--save_dir', default=".") +args=vars(psr.parse_args()) +if not args['save_dir'].endswith('/'): + args['save_dir'] = args['save_dir'] + '/' +print(args) + +EPOCH = args['ep'] +BATCH = 32 +nb_classes = 2 + +data_path = args['in'] + +# df_toss = (pd.read_csv(data_path,nrows=1).values) + +# print('df_toss:', df_toss.shape) + +# PL = df_toss.size +# PS = PL - 1 + +# print('PL=',PL) + +#PL = 6213 # 38 + 60483 +#PS = 6212 # 60483 +DR = 0.2 # Dropout rate + +def r2(y_true, y_pred): + SS_res = K.sum(K.square(y_true - y_pred)) + SS_tot = K.sum(K.square(y_true - K.mean(y_true))) + return (1 - SS_res/(SS_tot + K.epsilon())) + + + +def tf_auc(y_true, y_pred): + auc = tf.metrics.auc(y_true, y_pred)[1] + K.get_session().run(tf.local_variables_initializer()) + return auc + + +#from sklearn.metrics import roc_auc_score +#import tensorflow as tf + +def auroc( y_true, y_pred ) : + score = tf.py_func( lambda y_true, y_pred : roc_auc_score( y_true, y_pred, average='macro', sample_weight=None).astype('float32'), + [y_true, y_pred], + 'float32', + stateful=False, + name='sklearnAUC' ) + return score + + +def load_data(): + + # start change # + if args['in'].endswith('h5') or args['in'].endswith('hdf5'): + print ('processing h5 in file {}'.format(args['in'])) + + df_x_train_0 = pd.read_hdf(args['in'], 
'x_train_0').astype(np.float32) + df_x_train_1 = pd.read_hdf(args['in'], 'x_train_1').astype(np.float32) + X_train = pd.concat([df_x_train_0, df_x_train_1], axis=1, sort=False) + del df_x_train_0, df_x_train_1 + + df_x_test_0 = pd.read_hdf(args['in'], 'x_test_0').astype(np.float32) + df_x_test_1 = pd.read_hdf(args['in'], 'x_test_1').astype(np.float32) + X_test = pd.concat([df_x_test_0, df_x_test_1], axis=1, sort=False) + del df_x_test_0, df_x_test_1 + + df_x_val_0 = pd.read_hdf(args['in'], 'x_val_0').astype(np.float32) + df_x_val_1 = pd.read_hdf(args['in'], 'x_val_1').astype(np.float32) + X_val = pd.concat([df_x_val_0, df_x_val_1], axis=1, sort=False) + del df_x_val_0, df_x_val_1 + + Y_train = pd.read_hdf(args['in'], 'y_train') + Y_test = pd.read_hdf(args['in'], 'y_test') + Y_val = pd.read_hdf(args['in'], 'y_val') + + # assumes AUC is in the third column at index 2 + # df_y = df['AUC'].astype('int') + # df_x = df.iloc[:,3:].astype(np.float32) + + # assumes dataframe has already been scaled + # scaler = StandardScaler() + # df_x = scaler.fit_transform(df_x) + + else: + print ('expecting in file file suffix h5') + sys.exit() + + + print('x_train shape:', X_train.shape) + print('x_test shape:', X_test.shape) + + return X_train, Y_train, X_val, Y_val, X_test, Y_test + + +X_train, _Y_train, X_val, _Y_val, X_test, _Y_test = load_data() +# move this inside the load_data function +Y_train = _Y_train['AUC'] +Y_test = _Y_test['AUC'] +Y_val = _Y_val['AUC'] + +Y_train_neg, Y_train_pos = np.bincount(Y_train) +Y_test_neg, Y_test_pos = np.bincount(Y_test) +Y_val_neg, Y_val_pos = np.bincount(Y_val) + +Y_train_total = Y_train_neg + Y_train_pos +Y_test_total = Y_test_neg + Y_test_pos +Y_val_total = Y_val_neg + Y_val_pos + +total = Y_train_total + Y_test_total + Y_val_total +neg = Y_train_neg + Y_test_neg + Y_val_neg +pos = Y_train_pos + Y_test_pos + Y_val_pos + +print('Examples:\n Total: {}\n Positive: {} ({:.2f}% of total)\n'.format( + total, pos, 100 * pos / total)) + +Y_train = np_utils.to_categorical(Y_train,nb_classes) +Y_test = np_utils.to_categorical(Y_test,nb_classes) +Y_val = np_utils.to_categorical(Y_val,nb_classes) + +# ----------------------- from stack overflow + +y_integers = np.argmax(Y_train, axis=1) +class_weights = compute_class_weight('balanced', np.unique(y_integers), y_integers) +d_class_weights = dict(enumerate(class_weights)) + +print('X_train shape:', X_train.shape) +print('X_test shape:', X_test.shape) + +print('Y_train shape:', Y_train.shape) +print('Y_test shape:', Y_test.shape) + +PS=X_train.shape[1] +inputs = Input(shape=(PS,)) + +x = Dense(1000, activation='relu')(inputs) +x = BatchNormalization()(x) + +a = Dense(1000, activation='relu')(x) +a = BatchNormalization()(a) + +b = Dense(1000, activation='softmax')(x) +x = ke.layers.multiply([a,b]) + +x = Dense(500, activation='relu')(x) +x = BatchNormalization()(x) +x = Dropout(DR)(x) + +x = Dense(250, activation='relu')(x) +x = BatchNormalization()(x) +x = Dropout(DR)(x) + +x = Dense(125, activation='relu')(x) +x = BatchNormalization()(x) +x = Dropout(DR)(x) + +x = Dense(60, activation='relu')(x) +x = BatchNormalization()(x) +x = Dropout(DR)(x) + +x = Dense(30, activation='relu')(x) +x = BatchNormalization()(x) +x = Dropout(DR)(x) + +outputs = Dense(2, activation='softmax')(x) + +model = Model(inputs=inputs, outputs=outputs) + +model.summary() + +#parallel_model = multi_gpu_model(model, gpus=4) +#parallel_model.compile(loss='mean_squared_error', +# optimizer=SGD(lr=0.0001, momentum=0.9), +# metrics=['mae',r2]) + 
+model.compile(loss='categorical_crossentropy', + optimizer=SGD(lr=0.00001, momentum=0.9), +# optimizer=Adam(lr=0.00001), +# optimizer=RMSprop(lr=0.0001), +# optimizer=Adadelta(), + metrics=['acc',tf_auc]) + +# set up a bunch of callbacks to do work during model training.. + +checkpointer = ModelCheckpoint(filepath=args['save_dir'] + 'Agg_attn_bin.autosave.model.h5', verbose=1, save_weights_only=False, save_best_only=True) +csv_logger = CSVLogger(args['save_dir'] + 'Agg_attn_bin.training.log') +reduce_lr = ReduceLROnPlateau(monitor='val_tf_auc', factor=0.20, patience=40, verbose=1, mode='auto', min_delta=0.0001, cooldown=3, min_lr=0.000000001) +early_stop = EarlyStopping(monitor='val_tf_auc', patience=200, verbose=1, mode='auto') + + + +#history = parallel_model.fit(X_train, Y_train, + +history = model.fit(X_train, Y_train, class_weight=d_class_weights, + batch_size=BATCH, + epochs=EPOCH, + verbose=1, + validation_data=(X_val, Y_val), + callbacks = [checkpointer, csv_logger, reduce_lr, early_stop]) + + +score = model.evaluate(X_test, Y_test, verbose=0) + +Y_predict = model.predict(X_test) + +threshold = 0.5 + +Y_pred_int = (Y_predict[:,0] < threshold).astype(np.int) +Y_test_int = (Y_test[:,0] < threshold).astype(np.int) + +print ('creating table of predictions') +f = open(args['save_dir'] + 'Agg_attn_bin.predictions.tsv', 'w') +for index, row in _Y_test.iterrows(): + if row['AUC'] == 1: + if Y_pred_int[index] == 1: + call='TP' + else: + call='FN' + if row['AUC'] == 0: + if Y_pred_int[index] == 0: + call = 'TN' + else: + call = 'FP' + # 1 TN 0 0.6323 NCI60.786-0 NSC.256439 NSC.102816 + print(index, "\t", call, "\t", Y_pred_int[index], "\t", row['AUC'], "\t", row['Sample'], "\t", row['Drug1'], file=f) +f.close() + +#print(Y_test[:,0]) +#print(Y_predict[:,0]) + +false_pos_rate, true_pos_rate, thresholds = roc_curve(Y_test[:,0], Y_predict[:,0]) + +#print(thresholds) + +roc_auc = auc(false_pos_rate, true_pos_rate) + +auc_keras = roc_auc +fpr_keras = false_pos_rate +tpr_keras = true_pos_rate + +print ('creating figure 1 at ', args['save_dir'] + 'Agg_attn_bin.auroc.pdf') +plt.figure(1) +plt.plot([0, 1], [0, 1], 'k--', label="No Skill") +plt.plot(fpr_keras, tpr_keras, label='Keras (area = {:.3f})'.format(auc_keras)) +plt.xlabel('False positive rate') +plt.ylabel('True positive rate') +plt.title('ROC curve') +plt.legend(loc='best') + +plt.savefig(args['save_dir'] + 'Agg_attn_bin.auroc.pdf', bbox_inches='tight') +plt.close() + + +# Zoom in view of the upper left corner. 
+print ('creating figure 2 at ', args['save_dir'] + 'Agg_attn_bin.auroc2.pdf') +plt.figure(2) +plt.xlim(0, 0.2) +plt.ylim(0.8, 1) +plt.plot([0, 1], [0, 1], 'k--', label="No Skill") +plt.plot(fpr_keras, tpr_keras, label='Keras (area = {:.3f})'.format(auc_keras)) +plt.xlabel('False positive rate') +plt.ylabel('True positive rate') +plt.title('ROC curve (zoomed in at top left)') +plt.legend(loc='best') + +plt.savefig(args['save_dir'] + 'Agg_attn_bin.auroc2.pdf', bbox_inches='tight') +plt.close() + + +f1 = f1_score(Y_test_int, Y_pred_int) + +precision, recall, thresholds = precision_recall_curve(Y_test[:,0], Y_predict[:,0]) + +#print(thresholds) + +pr_auc = auc(recall, precision) + +pr_keras = pr_auc +precision_keras = precision +recall_keras = recall + +print +print + +print('f1=%.3f auroc=%.3f aucpr=%.3f' % (f1, auc_keras, pr_keras)) + +print ('creating figure 3 at ', args['save_dir'] + 'Agg_attn_bin.aurpr.pdf') +plt.figure(1) +no_skill = len(Y_test_int[Y_test_int==1]) / len(Y_test_int) +plt.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill') +plt.plot(recall_keras, precision_keras, label='PR Keras (area = {:.3f})'.format(pr_keras)) +plt.xlabel('Recall') +plt.ylabel('Precision') +plt.title('PR curve') +plt.legend(loc='best') + +plt.savefig(args['save_dir'] + 'Agg_attn_bin.aurpr.pdf', bbox_inches='tight') + +plt.close() + + +def plot_confusion_matrix(cm, classes, + normalize=False, + title='Confusion matrix', + cmap=plt.cm.Blues): + """ + This function prints and plots the confusion matrix. + Normalization can be applied by setting `normalize=True`. + """ + if normalize: + cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] + print("Normalized confusion matrix") + else: + print('Confusion matrix, without normalization') + + print(cm) + + plt.imshow(cm, interpolation='nearest', cmap=cmap) + plt.title(title) + plt.colorbar() + tick_marks = np.arange(len(classes)) + plt.xticks(tick_marks, classes, rotation=45) + plt.yticks(tick_marks, classes) + + fmt = '.2f' if normalize else 'd' + thresh = cm.max() / 2. + for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])): + plt.text(j, i, format(cm[i, j], fmt), + horizontalalignment="center", + color="white" if cm[i, j] > thresh else "black") + + plt.ylabel('True label') + plt.xlabel('Predicted label') + plt.tight_layout() + +class_names=["Non-Response","Response"] + +# Compute confusion matrix +cnf_matrix = sklearn.metrics.confusion_matrix(Y_test_int, Y_pred_int) +np.set_printoptions(precision=2) + +# Plot non-normalized confusion matrix +#plt.figure() +print ('creating figure 4 at ', args['save_dir'] + 'Agg_attn_bin.confusion_without_norm.pdf') +plot_confusion_matrix(cnf_matrix, classes=class_names, + title='Confusion matrix, without normalization') +plt.savefig(args['save_dir'] + 'Agg_attn_bin.confusion_without_norm.pdf', bbox_inches='tight') + +plt.close() + + + +def plot_confusion_matrix(cm, classes, + normalize=False, + title='Confusion matrix', + cmap=plt.cm.Blues): + """ + This function prints and plots the confusion matrix. + Normalization can be applied by setting `normalize=True`. 
+ """ + if normalize: + cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] + print("Normalized confusion matrix") + else: + print('Confusion matrix, without normalization') + + print(cm) + + plt.imshow(cm, interpolation='nearest', cmap=cmap) + plt.title(title) + plt.colorbar() + tick_marks = np.arange(len(classes)) + plt.xticks(tick_marks, classes, rotation=45) + plt.yticks(tick_marks, classes) + + fmt = '.2f' if normalize else 'd' + thresh = cm.max() / 2. + for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])): + plt.text(j, i, format(cm[i, j], fmt), + horizontalalignment="center", + color="white" if cm[i, j] > thresh else "black") + + plt.ylabel('True label') + plt.xlabel('Predicted label') + plt.tight_layout() + +class_names=["Non-Response","Response"] + +# Compute confusion matrix +cnf_matrix = sklearn.metrics.confusion_matrix(Y_test_int, Y_pred_int) +np.set_printoptions(precision=2) + +# Plot non-normalized confusion matrix +#plt.figure() +plot_confusion_matrix(cnf_matrix, classes=class_names, + title='Confusion matrix, without normalization') +plt.savefig(args['save_dir'] + 'Agg_attn_bin.confusion_without_norm.pdf', bbox_inches='tight') + +plt.close() + +# Plot normalized confusion matrix +#plt.figure() +plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True, + title='Normalized confusion matrix') +plt.savefig(args['save_dir'] + 'Agg_attn_bin.confusion_with_norm.pdf', bbox_inches='tight') + +plt.close() + + +print('Examples:\n Total: {}\n Positive: {} ({:.2f}% of total)\n'.format( + total, pos, 100 * pos / total)) + + +print(sklearn.metrics.roc_auc_score(Y_test_int, Y_pred_int)) + +print(sklearn.metrics.balanced_accuracy_score(Y_test_int, Y_pred_int)) + +print(sklearn.metrics.classification_report(Y_test_int, Y_pred_int)) + +print(sklearn.metrics.confusion_matrix(Y_test_int, Y_pred_int)) + +print("score") +print(score) + +#exit() + +# summarize history for accuracy +plt.plot(history.history['acc']) +plt.plot(history.history['val_acc']) +plt.title('Model Accuracy') +plt.ylabel('accuracy') +plt.xlabel('epoch') +plt.legend(['train', 'test'], loc='upper left') + +plt.savefig(args['save_dir'] + 'Agg_attn_bin.accuracy.png', bbox_inches='tight') +plt.savefig(args['save_dir'] + 'Agg_attn_bin.accuracy.pdf', bbox_inches='tight') + +plt.close() + +# summarize history for loss +plt.plot(history.history['loss']) +plt.plot(history.history['val_loss']) +plt.title('Model Loss') +plt.ylabel('loss') +plt.xlabel('epoch') +plt.legend(['train', 'test'], loc='upper left') + +plt.savefig(args['save_dir'] + 'Agg_attn_bin.loss.png', bbox_inches='tight') +plt.savefig(args['save_dir'] + 'Agg_attn_bin.loss.pdf', bbox_inches='tight') + + +print('Test val_loss:', score[0]) +print('Test accuracy:', score[1]) + +# serialize model to JSON +model_json = model.to_json() +with open(args['save_dir'] + "Agg_attn_bin.model.json", "w") as json_file: + json_file.write(model_json) + +# serialize model to YAML +model_yaml = model.to_yaml() +with open(args['save_dir'] + "Agg_attn_bin.model.yaml", "w") as yaml_file: + yaml_file.write(model_yaml) + + +# serialize weights to HDF5 +model.save_weights(args['save_dir'] + "Agg_attn_bin.model.h5") +print("Saved model to disk") + +# load json and create model +json_file = open(args['save_dir'] + 'Agg_attn_bin.model.json', 'r') +loaded_model_json = json_file.read() +json_file.close() +loaded_model_json = model_from_json(loaded_model_json) + + +# load yaml and create model +yaml_file = open(args['save_dir'] + 'Agg_attn_bin.model.yaml', 'r') 
+loaded_model_yaml = yaml_file.read() +yaml_file.close() +loaded_model_yaml = model_from_yaml(loaded_model_yaml) + + +# load weights into new model +loaded_model_json.load_weights(args['save_dir'] + "Agg_attn_bin.model.h5") +print("Loaded json model from disk") + +# evaluate json loaded model on test data +loaded_model_json.compile(loss='binary_crossentropy', optimizer='SGD', metrics=['accuracy']) +score_json = loaded_model_json.evaluate(X_test, Y_test, verbose=0) + +print('json Validation loss:', score_json[0]) +print('json Validation accuracy:', score_json[1]) + +print("json %s: %.2f%%" % (loaded_model_json.metrics_names[1], score_json[1]*100)) + + +# load weights into new model +loaded_model_yaml.load_weights(args['save_dir'] + "Agg_attn_bin.model.h5") +print("Loaded yaml model from disk") + +# evaluate loaded model on test data +loaded_model_yaml.compile(loss='binary_crossentropy', optimizer='SGD', metrics=['accuracy']) +score_yaml = loaded_model_yaml.evaluate(X_test, Y_test, verbose=0) + +print('yaml Validation loss:', score_yaml[0]) +print('yaml Validation accuracy:', score_yaml[1]) + +print("yaml %s: %.2f%%" % (loaded_model_yaml.metrics_names[1], score_yaml[1]*100)) + +# predict using loaded yaml model on test and training data + +predict_yaml_train = loaded_model_yaml.predict(X_train) + +predict_yaml_test = loaded_model_yaml.predict(X_test) + + +print('Yaml_train_shape:', predict_yaml_train.shape) +print('Yaml_test_shape:', predict_yaml_test.shape) + + +predict_yaml_train_classes = np.argmax(predict_yaml_train, axis=1) +predict_yaml_test_classes = np.argmax(predict_yaml_test, axis=1) + +np.savetxt(args['save_dir'] + "Agg_attn_bin_predict_yaml_train.csv", predict_yaml_train, delimiter=",", fmt="%.3f") +np.savetxt(args['save_dir'] + "Agg_attn_bin_predict_yaml_test.csv", predict_yaml_test, delimiter=",", fmt="%.3f") + +np.savetxt(args['save_dir'] + "Agg_attn_bin_predict_yaml_train_classes.csv", predict_yaml_train_classes, delimiter=",",fmt="%d") +np.savetxt(args['save_dir'] + "Agg_attn_bin_predict_yaml_test_classes.csv", predict_yaml_test_classes, delimiter=",",fmt="%d") diff --git a/Pilot1/Attn1/attn_bin_working_jan7_h5.sh b/Pilot1/Attn1/attn_bin_working_jan7_h5.sh new file mode 100755 index 00000000..8aa341ab --- /dev/null +++ b/Pilot1/Attn1/attn_bin_working_jan7_h5.sh @@ -0,0 +1,14 @@ +prefix=/scratch/brettin/Agg_attn_bin_iter2 + +m=$1 +echo $m + +device=$(($m % 8)) +n="0$m" + +export CUDA_VISIBLE_DEVICES=$device +mkdir -p $prefix/save/$n + +python attn_bin_working_jan7_h5.py --in $prefix/top21_r10/top_21_1fold_"$n".h5 \ + --ep 200 \ + --save_dir $prefix/save/"$n"/ > $prefix/save/$n.log diff --git a/Pilot1/Attn1/cmd1.sh b/Pilot1/Attn1/cmd1.sh new file mode 100755 index 00000000..104543d2 --- /dev/null +++ b/Pilot1/Attn1/cmd1.sh @@ -0,0 +1,17 @@ +prefix=/scratch/brettin/Agg_attn_bin_iter1 +# prefix=$HOME + +for m in $(seq -w 0 7); do + + device=$(($m % 8)) + n="00$m" + + export CUDA_VISIBLE_DEVICES=$device + mkdir -p $prefix/save/$n + + python attn_bin_working_jan7_h5.py --in /scratch/data/benchmarks/binary_811_splits/top_21_1fold_"$n".h5 \ + --ep 200 \ + --save_dir $prefix/save/"$n"/ > $prefix/save/$n.log & + + sleep 2 +done diff --git a/Pilot1/Attn1/cmd2.sh b/Pilot1/Attn1/cmd2.sh new file mode 100755 index 00000000..ca4dc21f --- /dev/null +++ b/Pilot1/Attn1/cmd2.sh @@ -0,0 +1,5 @@ + +for n in $(cat $1) ; do + echo $n + ./attn_bin_working_jan7_h5.sh $n +done From c8f65a209b8fe61fcaaba5191bcb093d9d8f5440 Mon Sep 17 00:00:00 2001 From: Harry Yoo Date: Fri, 14 Feb 2020 12:45:58 -0600 
Subject: [PATCH 147/331] update readme --- Pilot1/Uno/README.AUC.md | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/Pilot1/Uno/README.AUC.md b/Pilot1/Uno/README.AUC.md index 48ce0b54..4145423a 100644 --- a/Pilot1/Uno/README.AUC.md +++ b/Pilot1/Uno/README.AUC.md @@ -1,22 +1,16 @@ # Predicting AUC values for Top21 cancer types ## Data prep -A static dataset is prebuilt and available at `http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/top_21_auc_1fold.uno.h5`. Along with the datset file, you will also need a cache file (a byproduct of data-building process) to skip the data-building process. +A static dataset is prebuilt and available at `http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/top_21_auc_1fold.uno.h5`. ``` $ wget http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/top_21_auc_1fold.uno.h5 -$ mkdir -p cache -$ cd cache -$ wget http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/top6.cache.tgz -$ tar xvzf top6.cache.tgz -$ cd - ``` ## Training ``` python uno_baseline_keras2.py --config_file uno_auc_model.txt \ - --cache cache/top6_auc \ --use_exported_data top_21_auc_1fold.uno.h5 ... From 1849fcd0afc21cca2c38a9ddac55379ea25ec653 Mon Sep 17 00:00:00 2001 From: Harry Yoo Date: Fri, 14 Feb 2020 12:53:47 -0600 Subject: [PATCH 148/331] update readme --- Pilot1/Uno/README.AUC.md | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/Pilot1/Uno/README.AUC.md b/Pilot1/Uno/README.AUC.md index 4145423a..3b308ece 100644 --- a/Pilot1/Uno/README.AUC.md +++ b/Pilot1/Uno/README.AUC.md @@ -11,7 +11,7 @@ $ wget http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/top_21_auc ## Training ``` python uno_baseline_keras2.py --config_file uno_auc_model.txt \ - --use_exported_data top_21_auc_1fold.uno.h5 + --use_exported_data top_21_auc_1fold.uno.h5 --es True ... 
Params: @@ -22,7 +22,7 @@ Params: 'batch_size': 32, 'by_cell': None, 'by_drug': None, - 'cache': 'cache/top6_auc', + 'cache': None, 'cell_feature_subset_path': '', 'cell_features': ['rnaseq'], 'cell_subset_path': '', @@ -32,6 +32,8 @@ Params: 'cv': 1, 'datatype': , 'dense': [1000, 1000, 1000, 1000, 1000], + 'dense_cell_feature_layers': None, + 'dense_drug_feature_layers': None, 'dense_feature_layers': [1000, 1000, 1000], 'drop': 0.1, 'drug_feature_subset_path': '', @@ -40,6 +42,7 @@ Params: 'drug_median_response_min': -1, 'drug_subset_path': '', 'epochs': 50, + 'es': True, 'experiment_id': 'EXP000', 'export_csv': None, 'export_data': None, @@ -59,6 +62,7 @@ Params: 'output_dir': '/ssd1/homes/hsyoo/projects/CANDLE/Benchmarks/Pilot1/Uno/Output/EXP000/RUN000', 'partition_by': None, 'preprocess_rnaseq': 'source_scale', + 'profiling': False, 'reduce_lr': True, 'residual': False, 'rng_seed': 2018, @@ -92,8 +96,8 @@ Between random pairs in y_val: mae: 0.1619 r2: -1.0103 corr: -0.0051 -Data points per epoch: train = 423952, val = 52994 -Steps per epoch: train = 13248, val = 1656 +Data points per epoch: train = 423952, val = 52994, test = 52994 +Steps per epoch: train = 13248, val = 1656, test = 1656 Epoch 1/50 13248/13248 [==============================] - 102s 8ms/step - loss: 0.0268 - mae: 0.0794 - r2: -0.2754 - val_loss: 0.0092 - val_mae: 0.0725 - val_r2: 0.5657 Current time ....101.892 From 9dcf72154953239dfa695308c1d4618da90057ec Mon Sep 17 00:00:00 2001 From: brettin Date: Sat, 15 Feb 2020 13:06:38 -0500 Subject: [PATCH 149/331] enable 6 jobs per node --- Pilot1/Attn1/attn_bin_working_jan7_h5.sh | 38 +++++++++++++++++++----- 1 file changed, 30 insertions(+), 8 deletions(-) diff --git a/Pilot1/Attn1/attn_bin_working_jan7_h5.sh b/Pilot1/Attn1/attn_bin_working_jan7_h5.sh index 8aa341ab..a195bc92 100755 --- a/Pilot1/Attn1/attn_bin_working_jan7_h5.sh +++ b/Pilot1/Attn1/attn_bin_working_jan7_h5.sh @@ -1,14 +1,36 @@ -prefix=/scratch/brettin/Agg_attn_bin_iter2 +#!/bin/bash + +prefix="/gpfs/alpine/scratch/brettin/med106" +local_prefix="/mnt/bb/$USER" m=$1 echo $m -device=$(($m % 8)) -n="0$m" +for i in $(cat $m) ; do + device=$(($i % 6)) + # n="0$i" + n="00$i" + + export CUDA_VISIBLE_DEVICES=$device + mkdir -p "$prefix"/save/"$n" + mkdir -p "$local_prefix"/save/"$n" + mkdir -p "$local_prefix"/top21_baseline + + echo "copying files to $local_prefix/top21_baseline" + + cp "$prefix"/Data_sets/top21_baseline/top_21_1fold_"$n".h5 \ + $local_prefix/top21_baseline/ + + ls $local_prefix/top21_baseline + + echo "running attn_bin_working_jan7_h5.py --in $local_prefix/top21_baseline/top_21_1fold_"$n".h5" + python attn_bin_working_jan7_h5.py --in $local_prefix/top21_baseline/top_21_1fold_"$n".h5 \ + --ep 2 \ + --save_dir "$local_prefix"/save/"$n"/ > "$local_prefix"/save/"$n".log & + sleep 2 +done -export CUDA_VISIBLE_DEVICES=$device -mkdir -p $prefix/save/$n +wait -python attn_bin_working_jan7_h5.py --in $prefix/top21_r10/top_21_1fold_"$n".h5 \ - --ep 200 \ - --save_dir $prefix/save/"$n"/ > $prefix/save/$n.log +echo "running cp -r $local_prefix/save/* $prefix/save/" +cp -r $local_prefix/save/* $prefix/save/ From 923794a58ead98e25072b62206af89052dee4df4 Mon Sep 17 00:00:00 2001 From: brettin Date: Sat, 15 Feb 2020 13:06:59 -0500 Subject: [PATCH 150/331] new file --- Pilot1/Attn1/attn_bsub.sh | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100755 Pilot1/Attn1/attn_bsub.sh diff --git a/Pilot1/Attn1/attn_bsub.sh b/Pilot1/Attn1/attn_bsub.sh new file mode 100755 index 00000000..722e3f0c --- /dev/null 
+++ b/Pilot1/Attn1/attn_bsub.sh @@ -0,0 +1,15 @@ +#!/bin/bash +#BSUB -W 1:00 +#BSUB -nnodes 1 +#BSUB -P med106 +#BSUB -alloc_flags NVME + +module load gcc/4.8.5 +module load spectrum-mpi/10.3.0.1-20190611 +module load cuda/10.1.168 +export PATH="/ccs/proj/med106/gounley1/summit/miniconda37/bin:$PATH" + + +# This is in testing +jsrun -n 1 -a 1 -c 42 -g 6 ./attn_bin_working_jan7_h5.sh 0 > attn1.log 2>&1 & + From 95a60f0e4beae53fd4edb1059a7574d5fac4bc02 Mon Sep 17 00:00:00 2001 From: brettin Date: Sat, 15 Feb 2020 13:09:33 -0500 Subject: [PATCH 151/331] new file --- Pilot1/Attn1/0 | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100755 Pilot1/Attn1/0 diff --git a/Pilot1/Attn1/0 b/Pilot1/Attn1/0 new file mode 100755 index 00000000..e8371f00 --- /dev/null +++ b/Pilot1/Attn1/0 @@ -0,0 +1,6 @@ +0 +1 +2 +3 +4 +5 From 0fbe1dc54516592527031c53a07023a69b18b260 Mon Sep 17 00:00:00 2001 From: brettin Date: Thu, 20 Feb 2020 08:53:38 -0500 Subject: [PATCH 152/331] make jsrun resource set aware for summit --- Pilot1/Attn1/attn_bin_working_jan7_h5.sh | 45 ++++++++++++++++-------- 1 file changed, 30 insertions(+), 15 deletions(-) diff --git a/Pilot1/Attn1/attn_bin_working_jan7_h5.sh b/Pilot1/Attn1/attn_bin_working_jan7_h5.sh index a195bc92..3d75e083 100755 --- a/Pilot1/Attn1/attn_bin_working_jan7_h5.sh +++ b/Pilot1/Attn1/attn_bin_working_jan7_h5.sh @@ -4,30 +4,45 @@ prefix="/gpfs/alpine/scratch/brettin/med106" local_prefix="/mnt/bb/$USER" m=$1 -echo $m +datadir=$2 + +echo "input arg file: $m" +echo "input datadir: $datadir" for i in $(cat $m) ; do + device=$(($i % 6)) - # n="0$i" - n="00$i" + + # pad with zeros to conform to input file names + if [ $i -lt 10 ] ; then + n=00"$i" + else + n=0"$i" + fi export CUDA_VISIBLE_DEVICES=$device - mkdir -p "$prefix"/save/"$n" - mkdir -p "$local_prefix"/save/"$n" - mkdir -p "$local_prefix"/top21_baseline - echo "copying files to $local_prefix/top21_baseline" + # should test if JSM_GPU_ASSIGNMENTS is empty + if [ $JSM_GPU_ASSIGNMENTS -eq $device ] ; then + echo "processing line value $i from infile $m using device $device on input $n" + mkdir -p "$prefix"/save/"$datadir"/"$n" + mkdir -p "$local_prefix"/save/"$datadir"/"$n" + mkdir -p "$local_prefix"/"$datadir" + + echo "copying files to $local_prefix/$datadir" + cp "$prefix"/Data_sets/"$datadir"/top_21_1fold_"$n".h5 \ + $local_prefix/"$datadir"/ + + ls $local_prefix/"$datadir" - cp "$prefix"/Data_sets/top21_baseline/top_21_1fold_"$n".h5 \ - $local_prefix/top21_baseline/ + echo "running attn_bin_working_jan7_h5.py --in $local_prefix/$datadir/top_21_1fold_"$n".h5" + python attn_bin_working_jan7_h5.py --in $local_prefix/"$datadir"/top_21_1fold_"$n".h5 \ + --ep 200 \ + --save_dir "$local_prefix"/save/"$datadir"/"$n"/ > "$local_prefix"/save/"$datadir"/"$n".log & + sleep 2 - ls $local_prefix/top21_baseline + fi - echo "running attn_bin_working_jan7_h5.py --in $local_prefix/top21_baseline/top_21_1fold_"$n".h5" - python attn_bin_working_jan7_h5.py --in $local_prefix/top21_baseline/top_21_1fold_"$n".h5 \ - --ep 2 \ - --save_dir "$local_prefix"/save/"$n"/ > "$local_prefix"/save/"$n".log & - sleep 2 done wait From f861770430eaa06ed962a68e24090488df44722a Mon Sep 17 00:00:00 2001 From: brettin Date: Thu, 20 Feb 2020 08:54:30 -0500 Subject: [PATCH 153/331] optimized resource sets --- Pilot1/Attn1/attn_bsub.sh | 50 +++++++++++++++++++++++++++++++++++---- 1 file changed, 46 insertions(+), 4 deletions(-) diff --git a/Pilot1/Attn1/attn_bsub.sh b/Pilot1/Attn1/attn_bsub.sh index 722e3f0c..5f6bdae4 100755 --- 
a/Pilot1/Attn1/attn_bsub.sh +++ b/Pilot1/Attn1/attn_bsub.sh @@ -1,15 +1,57 @@ #!/bin/bash -#BSUB -W 1:00 -#BSUB -nnodes 1 +#BSUB -W 12:00 +#BSUB -nnodes 160 #BSUB -P med106 #BSUB -alloc_flags NVME +#BSUB -J attn1 + +# need 92 nodes for 12 hr run +# with 12 hour run should be able to do 180 (15*12) epochs +# +# at 17 nodes per data set, need to run 6 datasets (102 nodes) +# module load gcc/4.8.5 module load spectrum-mpi/10.3.0.1-20190611 module load cuda/10.1.168 export PATH="/ccs/proj/med106/gounley1/summit/miniconda37/bin:$PATH" +for i in $(seq 1 16) ; do + jsrun -n 6 -a 1 -c 7 -g 1 ./attn_bin_working_jan7_h5.sh $i top21_baseline > attn1.top21_baseline."$i".log 2>&1 & +done + +for i in $(seq 1 16) ; do + jsrun -n 6 -a 1 -c 7 -g 1 ./attn_bin_working_jan7_h5.sh $i top21_r.0_baseline > attn1.top21_r.0_baseline."$i".log 2>&1 & +done + +for i in $(seq 1 16) ; do + jsrun -n 6 -a 1 -c 7 -g 1 ./attn_bin_working_jan7_h5.sh $i top21_r.0_gap1 > attn1.top21_r.0_gap1."$i".log 2>&1 & +done + +for i in $(seq 1 16) ; do + jsrun -n 6 -a 1 -c 7 -g 1 ./attn_bin_working_jan7_h5.sh $i top21_r.0_gap2 > attn1.top21_r.0_gap2."$i".log 2>&1 & +done + +for i in $(seq 1 16) ; do + jsrun -n 6 -a 1 -c 7 -g 1 ./attn_bin_working_jan7_h5.sh $i top21_r.5_baseline > attn1.top21_r.5_baseline."$i".log 2>&1 & +done + +for i in $(seq 1 16) ; do + jsrun -n 6 -a 1 -c 7 -g 1 ./attn_bin_working_jan7_h5.sh $i top21_r.5_gap1 > attn1.top21_r.5_gap1."$i".log 2>&1 & +done + +for i in $(seq 1 16) ; do + jsrun -n 6 -a 1 -c 7 -g 1 ./attn_bin_working_jan7_h5.sh $i top21_r.5_gap2 > attn1.top21_r.5_gap2."$i".log 2>&1 & +done + +for i in $(seq 1 16) ; do + jsrun -n 6 -a 1 -c 7 -g 1 ./attn_bin_working_jan7_h5.sh $i top21_r.9_baseline > attn1.top21_r.9_baseline."$i".log 2>&1 & +done -# This is in testing -jsrun -n 1 -a 1 -c 42 -g 6 ./attn_bin_working_jan7_h5.sh 0 > attn1.log 2>&1 & +for i in $(seq 1 16) ; do + jsrun -n 6 -a 1 -c 7 -g 1 ./attn_bin_working_jan7_h5.sh $i top21_r.9_gap1 > attn1.top21_r.9_gap1."$i".log 2>&1 & +done +for i in $(seq 1 16) ; do + jsrun -n 6 -a 1 -c 7 -g 1 ./attn_bin_working_jan7_h5.sh $i top21_r.9_gap2 > attn1.top21_r.9_gap2."$i".log 2>&1 & +done From b4879b70978b6915a3eaf3c907361761dca3f15d Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Wed, 26 Feb 2020 12:56:27 -0600 Subject: [PATCH 154/331] First pass at making P1: Attn1 candle compliant --- Pilot1/Attn1/attn.py | 268 +++++++++++ Pilot1/Attn1/attn_baseline_keras2.py | 679 +++++++++++++++++++++++++++ Pilot1/Attn1/attn_default_model.txt | 30 ++ 3 files changed, 977 insertions(+) create mode 100644 Pilot1/Attn1/attn.py create mode 100644 Pilot1/Attn1/attn_baseline_keras2.py create mode 100644 Pilot1/Attn1/attn_default_model.txt diff --git a/Pilot1/Attn1/attn.py b/Pilot1/Attn1/attn.py new file mode 100644 index 00000000..d6af6a8b --- /dev/null +++ b/Pilot1/Attn1/attn.py @@ -0,0 +1,268 @@ +from __future__ import print_function + +import os +import sys +import logging + +import pandas as pd +import numpy as np + +from sklearn.metrics import mean_squared_error +from sklearn.metrics import r2_score +from scipy.stats.stats import pearsonr + +file_path = os.path.dirname(os.path.realpath(__file__)) +#lib_path = os.path.abspath(os.path.join(file_path, '..')) +#sys.path.append(lib_path) +lib_path2 = os.path.abspath(os.path.join(file_path, '..', '..', 'common')) +sys.path.append(lib_path2) + +import candle + +logger = logging.getLogger(__name__) +candle.set_parallelism_threads() + +additional_definitions = [ +{'name':'latent_dim', + 'action':'store', + 'type': int, + 
'help':'latent dimensions'}, +{'name':'model', + 'default':'ae', + 'choices':['ae', 'vae', 'cvae'], + 'help':'model to use: ae, vae, cvae'}, +{'name':'use_landmark_genes', + 'type': candle.str2bool, + 'default': False, + 'help':'use the 978 landmark genes from LINCS (L1000) as expression features'}, +{'name':'residual', + 'type': candle.str2bool, + 'default': False, + 'help':'add skip connections to the layers'}, +{'name':'reduce_lr', + 'type': candle.str2bool, + 'default': False, + 'help':'reduce learning rate on plateau'}, +{'name':'warmup_lr', + 'type': candle.str2bool, + 'default': False, + 'help':'gradually increase learning rate on start'}, +{'name':'base_lr', + 'type': float, + 'help':'base learning rate'}, +{'name':'epsilon_std', + 'type': float, + 'help':'epsilon std for sampling latent noise'}, +{'name':'cp', + 'type': candle.str2bool, + 'default': False, + 'help':'checkpoint models with best val_loss'}, +#{'name':'shuffle', + #'type': candle.str2bool, + #'default': False, + #'help':'shuffle data'}, +{'name':'tb', + 'type': candle.str2bool, + 'default': False, + 'help':'use tensorboard'}, +{'name':'tsne', + 'type': candle.str2bool, + 'default': False, + 'help':'generate tsne plot of the latent representation'} +] + +required = [ + 'activation', + 'batch_size', + 'dense', + 'drop', + 'epochs', + 'initialization', + 'learning_rate', + 'loss', + 'noise_factor', + 'optimizer', + 'rng_seed', + 'model', + 'scaling', + 'validation_split', + 'latent_dim', + 'feature_subsample', + 'batch_normalization', + 'epsilon_std', + 'solr_root', + 'timeout' + ] + +class BenchmarkAttn(candle.Benchmark): + + def set_locals(self): + """Functionality to set variables specific for the benchmark + - required: set of required parameters for the benchmark. + - additional_definitions: list of dictionaries describing the additional parameters for the + benchmark. 
+ """ + + if required is not None: + self.required = set(required) + if additional_definitions is not None: + self.additional_definitions = additional_definitions + +def extension_from_parameters(params, framework=''): + """Construct string for saving model with annotation of parameters""" + ext = framework + ext += '.{}'.format(params['model']) + for i, n in enumerate(params['dense']): + if n: + ext += '.D{}={}'.format(i+1, n) + ext += '.A={}'.format(params['activation']) + ext += '.B={}'.format(params['batch_size']) + ext += '.E={}'.format(params['epochs']) + ext += '.L={}'.format(params['latent_dim']) + ext += '.LR={}'.format(params['learning_rate']) + ext += '.S={}'.format(params['scaling']) + if params['epsilon_std'] != 1.0: + ext += '.EPS={}'.format(params['epsilon_std']) + if params['drop']: + ext += '.DR={}'.format(params['drop']) + if params['batch_normalization']: + ext += '.BN' + if params['warmup_lr']: + ext += '.WU_LR' + if params['reduce_lr']: + ext += '.Re_LR' + if params['residual']: + ext += '.Res' + + return ext +def load_data(params, seed): + + # start change # + if params['in'].endswith('h5') or params['in'].endswith('hdf5'): + print ('processing h5 in file {}'.format(params['in'])) + + df_x_train_0 = pd.read_hdf(params['in'], 'x_train_0').astype(np.float32) + df_x_train_1 = pd.read_hdf(params['in'], 'x_train_1').astype(np.float32) + X_train = pd.concat([df_x_train_0, df_x_train_1], axis=1, sort=False) + del df_x_train_0, df_x_train_1 + + df_x_test_0 = pd.read_hdf(params['in'], 'x_test_0').astype(np.float32) + df_x_test_1 = pd.read_hdf(params['in'], 'x_test_1').astype(np.float32) + X_test = pd.concat([df_x_test_0, df_x_test_1], axis=1, sort=False) + del df_x_test_0, df_x_test_1 + + df_x_val_0 = pd.read_hdf(params['in'], 'x_val_0').astype(np.float32) + df_x_val_1 = pd.read_hdf(params['in'], 'x_val_1').astype(np.float32) + X_val = pd.concat([df_x_val_0, df_x_val_1], axis=1, sort=False) + del df_x_val_0, df_x_val_1 + + Y_train = pd.read_hdf(params['in'], 'y_train') + Y_test = pd.read_hdf(params['in'], 'y_test') + Y_val = pd.read_hdf(params['in'], 'y_val') + + # assumes AUC is in the third column at index 2 + # df_y = df['AUC'].astype('int') + # df_x = df.iloc[:,3:].astype(np.float32) + + # assumes dataframe has already been scaled + # scaler = StandardScaler() + # df_x = scaler.fit_transform(df_x) + + else: + print ('expecting in file file suffix h5') + sys.exit() + + + print('x_train shape:', X_train.shape) + print('x_test shape:', X_test.shape) + + return X_train, Y_train, X_val, Y_val, X_test, Y_test + + # start change # + if params['in'].endswith('h5') or params['in'].endswith('hdf5'): + print ('processing h5 in file {}'.format(params['in'])) + + df_x_train_0 = pd.read_hdf(params['in'], 'x_train_0').astype(np.float32) + df_x_train_1 = pd.read_hdf(params['in'], 'x_train_1').astype(np.float32) + X_train = pd.concat([df_x_train_0, df_x_train_1], axis=1, sort=False) + del df_x_train_0, df_x_train_1 + + df_x_test_0 = pd.read_hdf(params['in'], 'x_test_0').astype(np.float32) + df_x_test_1 = pd.read_hdf(params['in'], 'x_test_1').astype(np.float32) + X_test = pd.concat([df_x_test_0, df_x_test_1], axis=1, sort=False) + del df_x_test_0, df_x_test_1 + + df_x_val_0 = pd.read_hdf(params['in'], 'x_val_0').astype(np.float32) + df_x_val_1 = pd.read_hdf(params['in'], 'x_val_1').astype(np.float32) + X_val = pd.concat([df_x_val_0, df_x_val_1], axis=1, sort=False) + del df_x_val_0, df_x_val_1 + + Y_train = pd.read_hdf(params['in'], 'y_train') + Y_test = pd.read_hdf(params['in'], 'y_test') 
+ Y_val = pd.read_hdf(params['in'], 'y_val') + + # assumes AUC is in the third column at index 2 + # df_y = df['AUC'].astype('int') + # df_x = df.iloc[:,3:].astype(np.float32) + + # assumes dataframe has already been scaled + # scaler = StandardScaler() + # df_x = scaler.fit_transform(df_x) + + else: + print ('expecting in file file suffix h5') + sys.exit() + + + print('x_train shape:', X_train.shape) + print('x_test shape:', X_test.shape) + + return X_train, Y_train, X_val, Y_val, X_test, Y_test + + + + + +def load_data_orig(params, seed): + if params['with_type']: + drop_cols = ['case_id'] + onehot_cols = ['cancer_type'] + else: + drop_cols = ['case_id', 'cancer_type'] + onehot_cols = None + + if params['use_landmark_genes']: + lincs_file = 'lincs1000.tsv' + lincs_path = candle.fetch_file(url_p1b1 + lincs_file) + df_l1000 = pd.read_csv(lincs_path, sep='\t') + usecols = df_l1000['gdc'] + drop_cols = None + else: + usecols = None + + return candle.load_X_data(params['url_attn'], params['file_train'], params['file_test'], + drop_cols=drop_cols, + onehot_cols=onehot_cols, + usecols=usecols, + n_cols=params['feature_subsample'], + shuffle=params['shuffle'], + scaling=params['scaling'], + validation_split=params['validation_split'], + dtype=params['datatype'], + seed=seed) + + +def evaluate_autoencoder(y_pred, y_test): + try: + mse = mean_squared_error(y_pred, y_test) + r2 = r2_score(y_test, y_pred) + corr, _ = pearsonr(y_pred.flatten(), y_test.flatten()) + # print('Mean squared error: {}%'.format(mse)) + except: + #when nan or something else breaks mean_squared_error computation + # we may check earlier before computation also: + #np.isnan(y_pred).any() or np.isnan(y_test).any()): + r2 = 0 + mse = 0 + corr = 0 + return {'mse': mse, 'r2_score': r2, 'correlation': corr} diff --git a/Pilot1/Attn1/attn_baseline_keras2.py b/Pilot1/Attn1/attn_baseline_keras2.py new file mode 100644 index 00000000..b354b168 --- /dev/null +++ b/Pilot1/Attn1/attn_baseline_keras2.py @@ -0,0 +1,679 @@ +from __future__ import print_function + +import numpy as np + +import keras +from keras import backend as K +from keras import optimizers +from keras.models import Model +from keras.layers import BatchNormalization, Dense, Dropout, Input, Lambda +from keras.callbacks import Callback, ModelCheckpoint, ReduceLROnPlateau, LearningRateScheduler, TensorBoard +from keras.metrics import binary_crossentropy, mean_squared_error +from scipy.stats.stats import pearsonr +from sklearn.manifold import TSNE + +import warnings +with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=DeprecationWarning) + from sklearn.metrics import r2_score + from sklearn.metrics import accuracy_score + +import matplotlib as mpl +mpl.use('Agg') +import matplotlib.pyplot as plt + +## Imports from actual +import itertools +import pandas as pd +import numpy as np +import os +import sys +import gzip +import argparse +import sklearn + +import matplotlib +matplotlib.use('Agg') + +import matplotlib.pyplot as plt + +import tensorflow as tf + +import keras as ke +from keras import backend as K + +from keras.layers import Input, Dense, Dropout, Activation, BatchNormalization +from keras.optimizers import SGD, Adam, RMSprop, Adadelta +from keras.models import Sequential, Model, model_from_json, model_from_yaml +from keras.utils import np_utils, multi_gpu_model + +from keras.callbacks import ModelCheckpoint, CSVLogger, ReduceLROnPlateau, EarlyStopping + +from sklearn.utils.class_weight import compute_class_weight +from sklearn.model_selection 
import train_test_split +from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, roc_auc_score, confusion_matrix, balanced_accuracy_score, classification_report +from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler +from sklearn.metrics import recall_score, auc, roc_curve, f1_score, precision_recall_curve +### + + + + +import attn +import candle + +np.set_printoptions(precision=4) + + +def r2(y_true, y_pred): + SS_res = K.sum(K.square(y_true - y_pred)) + SS_tot = K.sum(K.square(y_true - K.mean(y_true))) + return (1 - SS_res/(SS_tot + K.epsilon())) + + + +def tf_auc(y_true, y_pred): + auc = tf.metrics.auc(y_true, y_pred)[1] + K.get_session().run(tf.local_variables_initializer()) + return auc + + +#from sklearn.metrics import roc_auc_score +#import tensorflow as tf + +def auroc( y_true, y_pred ) : + score = tf.py_func( lambda y_true, y_pred : roc_auc_score( y_true, y_pred, average='macro', sample_weight=None).astype('float32'), + [y_true, y_pred], + 'float32', + stateful=False, + name='sklearnAUC' ) + return score + + +def covariance(x, y): + return K.mean(x * y) - K.mean(x) * K.mean(y) + + +def corr(y_true, y_pred): + cov = covariance(y_true, y_pred) + var1 = covariance(y_true, y_true) + var2 = covariance(y_pred, y_pred) + return cov / (K.sqrt(var1 * var2) + K.epsilon()) + + +def xent(y_true, y_pred): + return binary_crossentropy(y_true, y_pred) + + +def mse(y_true, y_pred): + return mean_squared_error(y_true, y_pred) + + +class MetricHistory(Callback): + def on_epoch_begin(self, epoch, logs=None): + print("\n") + + def on_epoch_end(self, epoch, logs=None): + y_pred = self.model.predict(self.validation_data[0]) + r2 = r2_score(self.validation_data[1], y_pred) + corr, _ = pearsonr(self.validation_data[1].flatten(), y_pred.flatten()) + print("\nval_r2:", r2) + print(y_pred.shape) + print("\nval_corr:", corr, "val_r2:", r2) + print("\n") + + +class LoggingCallback(Callback): + def __init__(self, print_fcn=print): + Callback.__init__(self) + self.print_fcn = print_fcn + + def on_epoch_end(self, epoch, logs={}): + msg = "[Epoch: %i] %s" % (epoch, ", ".join("%s: %f" % (k, v) for k, v in sorted(logs.items()))) + self.print_fcn(msg) + +def build_type_classifier(x_train, y_train, x_test, y_test): + y_train = np.argmax(y_train, axis=1) + y_test = np.argmax(y_test, axis=1) + from xgboost import XGBClassifier + clf = XGBClassifier(max_depth=6, n_estimators=100) + clf.fit(x_train, y_train, eval_set=[(x_train, y_train), (x_test, y_test)], verbose=False) + y_pred = clf.predict(x_test) + acc = accuracy_score(y_test, y_pred) + print(acc) + return clf + +def initialize_parameters(default_model = 'attn_default_model.txt'): + + # Build benchmark object + attnBmk = attn.BenchmarkAttn(attn.file_path, default_model, 'keras', + prog='attn_baseline', desc='Multi-task (DNN) for data extraction from clinical reports - Pilot 3 Benchmark 1') + + # Initialize parameters + gParameters = candle.finalize_parameters(attnBmk) + #attn.logger.info('Params: {}'.format(gParameters)) + + return gParameters + +def save_cache(cache_file, x_train, y_train, x_val, y_val, x_test, y_test, x_labels, y_labels): + with h5py.File(cache_file, 'w') as hf: + hf.create_dataset("x_train", data=x_train) + hf.create_dataset("y_train", data=y_train) + hf.create_dataset("x_val", data=x_val) + hf.create_dataset("y_val", data=y_val) + hf.create_dataset("x_test", data=x_test) + hf.create_dataset("y_test", data=y_test) + hf.create_dataset("x_labels", (len(x_labels), 1), 'S100', data=[x.encode("ascii", 
"ignore") for x in x_labels]) + hf.create_dataset("y_labels", (len(y_labels), 1), 'S100', data=[x.encode("ascii", "ignore") for x in y_labels]) + + +def load_cache(cache_file): + with h5py.File(cache_file, 'r') as hf: + x_train = hf['x_train'][:] + y_train = hf['y_train'][:] + x_val = hf['x_val'][:] + y_val = hf['y_val'][:] + x_test = hf['x_test'][:] + y_test = hf['y_test'][:] + x_labels = [x[0].decode('unicode_escape') for x in hf['x_labels'][:]] + y_labels = [x[0].decode('unicode_escape') for x in hf['y_labels'][:]] + return x_train, y_train, x_val, y_val, x_test, y_test, x_labels, y_labels + + +def run(params): +## + nb_classes = 2 +## + + + args = candle.ArgumentStruct(**params) + seed = args.rng_seed + candle.set_seed(seed) + + # Construct extension to save model + ext = attn.extension_from_parameters(params, '.keras') + candle.verify_path(params['save_path']) + prefix = '{}{}'.format(params['save_path'], ext) + logfile = params['logfile'] if params['logfile'] else prefix+'.log' + candle.set_up_logger(logfile, attn.logger, params['verbose']) + attn.logger.info('Params: {}'.format(params)) + + # Get default parameters for initialization and optimizer functions + keras_defaults = candle.keras_default_config() + + ## + X_train, _Y_train, X_val, _Y_val, X_test, _Y_test = attn.load_data(params, seed) + + # move this inside the load_data function + Y_train = _Y_train['AUC'] + Y_test = _Y_test['AUC'] + Y_val = _Y_val['AUC'] + + Y_train_neg, Y_train_pos = np.bincount(Y_train) + Y_test_neg, Y_test_pos = np.bincount(Y_test) + Y_val_neg, Y_val_pos = np.bincount(Y_val) + + Y_train_total = Y_train_neg + Y_train_pos + Y_test_total = Y_test_neg + Y_test_pos + Y_val_total = Y_val_neg + Y_val_pos + + total = Y_train_total + Y_test_total + Y_val_total + neg = Y_train_neg + Y_test_neg + Y_val_neg + pos = Y_train_pos + Y_test_pos + Y_val_pos + + print('Examples:\n Total: {}\n Positive: {} ({:.2f}% of total)\n'.format( + total, pos, 100 * pos / total)) + + Y_train = np_utils.to_categorical(Y_train,nb_classes) + Y_test = np_utils.to_categorical(Y_test,nb_classes) + Y_val = np_utils.to_categorical(Y_val,nb_classes) + + # ----------------------- from stack overflow + + y_integers = np.argmax(Y_train, axis=1) + class_weights = compute_class_weight('balanced', np.unique(y_integers), y_integers) + d_class_weights = dict(enumerate(class_weights)) + + print('X_train shape:', X_train.shape) + print('X_test shape:', X_test.shape) + + print('Y_train shape:', Y_train.shape) + print('Y_test shape:', Y_test.shape) + + PS=X_train.shape[1] + inputs = Input(shape=(PS,)) + DR = params['drop'] + x = Dense(1000, activation='relu')(inputs) + x = BatchNormalization()(x) + + a = Dense(1000, activation='relu')(x) + a = BatchNormalization()(a) + + b = Dense(1000, activation='softmax')(x) + x = ke.layers.multiply([a,b]) + + x = Dense(500, activation='relu')(x) + x = BatchNormalization()(x) + x = Dropout(DR)(x) + + x = Dense(250, activation='relu')(x) + x = BatchNormalization()(x) + x = Dropout(DR)(x) + + x = Dense(125, activation='relu')(x) + x = BatchNormalization()(x) + x = Dropout(DR)(x) + + x = Dense(60, activation='relu')(x) + x = BatchNormalization()(x) + x = Dropout(DR)(x) + + x = Dense(30, activation='relu')(x) + x = BatchNormalization()(x) + x = Dropout(DR)(x) + + outputs = Dense(2, activation='softmax')(x) + + model = Model(inputs=inputs, outputs=outputs) + + model.summary() + + #parallel_model = multi_gpu_model(model, gpus=4) + #parallel_model.compile(loss='mean_squared_error', + # optimizer=SGD(lr=0.0001, 
momentum=0.9), + # metrics=['mae',r2]) + + model.compile(loss=params['loss'], + optimizer=SGD(lr=0.00001, momentum=0.9), + # optimizer=Adam(lr=0.00001), + # optimizer=RMSprop(lr=0.0001), + # optimizer=Adadelta(), + metrics=['acc',tf_auc]) + + # set up a bunch of callbacks to do work during model training.. + + if not os.path.exists(params['save_dir']): + os.makedirs(params['save_dir']) + + checkpointer = ModelCheckpoint(filepath=params['save_dir'] + 'Agg_attn_bin.autosave.model.h5', verbose=1, save_weights_only=False, save_best_only=True) + csv_logger = CSVLogger('{}/Agg_attn_bin.training.log'.format(params['save_dir'])) + reduce_lr = ReduceLROnPlateau(monitor='val_tf_auc', factor=0.20, patience=40, verbose=1, mode='auto', min_delta=0.0001, cooldown=3, min_lr=0.000000001) + early_stop = EarlyStopping(monitor='val_tf_auc', patience=200, verbose=1, mode='auto') + candle_monitor = candle.CandleRemoteMonitor(params=params) + + candle_monitor = candle.CandleRemoteMonitor(params=params) + timeout_monitor = candle.TerminateOnTimeOut(params['timeout']) + history_logger = LoggingCallback(attn.logger.debug) + + callbacks = [candle_monitor, timeout_monitor, csv_logger, history_logger] + + if params['reduce_lr']: + callbacks.append(reduce_lr) + + if params['cp']: + callbacks.append(checkpointer) + + if params['early_stop']: + callbacks.append(early_stop) + + #history = parallel_model.fit(X_train, Y_train, + epochs = params['epochs'] + batch_size=params['batch_size'] + history = model.fit(X_train, Y_train, class_weight=d_class_weights, + batch_size=batch_size, + epochs=epochs, + verbose=1, + validation_data=(X_val, Y_val), + callbacks = callbacks) + + + score = model.evaluate(X_test, Y_test, verbose=0) + + Y_predict = model.predict(X_test) + + post_process(params, X_train, X_test, Y_test, _Y_test, Y_predict, pos, total, score, history, model) + + ### + + attn.logger.handlers = [] + + return history + + +def post_process(params, X_train, X_test, Y_test, _Y_test, Y_predict, pos, total, score, history, model): + + threshold = 0.5 + + Y_pred_int = (Y_predict[:,0] < threshold).astype(np.int) + Y_test_int = (Y_test[:,0] < threshold).astype(np.int) + + print ('creating table of predictions') + f = open(params['save_dir'] + 'Agg_attn_bin.predictions.tsv', 'w') + for index, row in _Y_test.iterrows(): + if row['AUC'] == 1: + if Y_pred_int[index] == 1: + call='TP' + else: + call='FN' + if row['AUC'] == 0: + if Y_pred_int[index] == 0: + call = 'TN' + else: + call = 'FP' + # 1 TN 0 0.6323 NCI60.786-0 NSC.256439 NSC.102816 + print(index, "\t", call, "\t", Y_pred_int[index], "\t", row['AUC'], "\t", row['Sample'], "\t", row['Drug1'], file=f) + f.close() + + + false_pos_rate, true_pos_rate, thresholds = roc_curve(Y_test[:,0], Y_predict[:,0]) + + #print(thresholds) + + roc_auc = auc(false_pos_rate, true_pos_rate) + + auc_keras = roc_auc + fpr_keras = false_pos_rate + tpr_keras = true_pos_rate + + print ('creating figure 1 at ', params['save_dir'] + 'Agg_attn_bin.auroc.pdf') + plt.figure(1) + plt.plot([0, 1], [0, 1], 'k--', label="No Skill") + plt.plot(fpr_keras, tpr_keras, label='Keras (area = {:.3f})'.format(auc_keras)) + plt.xlabel('False positive rate') + plt.ylabel('True positive rate') + plt.title('ROC curve') + plt.legend(loc='best') + + plt.savefig(params['save_dir'] + 'Agg_attn_bin.auroc.pdf', bbox_inches='tight') + plt.close() + + + # Zoom in view of the upper left corner. 
+ print ('creating figure 2 at ', params['save_dir'] + 'Agg_attn_bin.auroc2.pdf') + plt.figure(2) + plt.xlim(0, 0.2) + plt.ylim(0.8, 1) + plt.plot([0, 1], [0, 1], 'k--', label="No Skill") + plt.plot(fpr_keras, tpr_keras, label='Keras (area = {:.3f})'.format(auc_keras)) + plt.xlabel('False positive rate') + plt.ylabel('True positive rate') + plt.title('ROC curve (zoomed in at top left)') + plt.legend(loc='best') + + plt.savefig(params['save_dir'] + 'Agg_attn_bin.auroc2.pdf', bbox_inches='tight') + plt.close() + + + f1 = f1_score(Y_test_int, Y_pred_int) + + precision, recall, thresholds = precision_recall_curve(Y_test[:,0], Y_predict[:,0]) + + #print(thresholds) + + pr_auc = auc(recall, precision) + + pr_keras = pr_auc + precision_keras = precision + recall_keras = recall + + print + print + + print('f1=%.3f auroc=%.3f aucpr=%.3f' % (f1, auc_keras, pr_keras)) + + print ('creating figure 3 at ', params['save_dir'] + 'Agg_attn_bin.aurpr.pdf') + plt.figure(1) + no_skill = len(Y_test_int[Y_test_int==1]) / len(Y_test_int) + plt.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill') + plt.plot(recall_keras, precision_keras, label='PR Keras (area = {:.3f})'.format(pr_keras)) + plt.xlabel('Recall') + plt.ylabel('Precision') + plt.title('PR curve') + plt.legend(loc='best') + + plt.savefig(params['save_dir'] + 'Agg_attn_bin.aurpr.pdf', bbox_inches='tight') + + plt.close() + + + def plot_confusion_matrix(cm, classes, + normalize=False, + title='Confusion matrix', + cmap=plt.cm.Blues): + """ + This function prints and plots the confusion matrix. + Normalization can be applied by setting `normalize=True`. + """ + if normalize: + cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] + print("Normalized confusion matrix") + else: + print('Confusion matrix, without normalization') + + print(cm) + + plt.imshow(cm, interpolation='nearest', cmap=cmap) + plt.title(title) + plt.colorbar() + tick_marks = np.arange(len(classes)) + plt.xticks(tick_marks, classes, rotation=45) + plt.yticks(tick_marks, classes) + + fmt = '.2f' if normalize else 'd' + thresh = cm.max() / 2. + for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])): + plt.text(j, i, format(cm[i, j], fmt), + horizontalalignment="center", + color="white" if cm[i, j] > thresh else "black") + + plt.ylabel('True label') + plt.xlabel('Predicted label') + plt.tight_layout() + + class_names=["Non-Response","Response"] + + # Compute confusion matrix + cnf_matrix = sklearn.metrics.confusion_matrix(Y_test_int, Y_pred_int) + np.set_printoptions(precision=2) + + # Plot non-normalized confusion matrix + #plt.figure() + print ('creating figure 4 at ', params['save_dir'] + 'Agg_attn_bin.confusion_without_norm.pdf') + plot_confusion_matrix(cnf_matrix, classes=class_names, + title='Confusion matrix, without normalization') + plt.savefig(params['save_dir'] + 'Agg_attn_bin.confusion_without_norm.pdf', bbox_inches='tight') + + plt.close() + + + + def plot_confusion_matrix(cm, classes, + normalize=False, + title='Confusion matrix', + cmap=plt.cm.Blues): + """ + This function prints and plots the confusion matrix. + Normalization can be applied by setting `normalize=True`. 
+ """ + if normalize: + cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] + print("Normalized confusion matrix") + else: + print('Confusion matrix, without normalization') + + print(cm) + + plt.imshow(cm, interpolation='nearest', cmap=cmap) + plt.title(title) + plt.colorbar() + tick_marks = np.arange(len(classes)) + plt.xticks(tick_marks, classes, rotation=45) + plt.yticks(tick_marks, classes) + + fmt = '.2f' if normalize else 'd' + thresh = cm.max() / 2. + for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])): + plt.text(j, i, format(cm[i, j], fmt), + horizontalalignment="center", + color="white" if cm[i, j] > thresh else "black") + + plt.ylabel('True label') + plt.xlabel('Predicted label') + plt.tight_layout() + + class_names=["Non-Response","Response"] + + # Compute confusion matrix + cnf_matrix = sklearn.metrics.confusion_matrix(Y_test_int, Y_pred_int) + np.set_printoptions(precision=2) + + # Plot non-normalized confusion matrix + #plt.figure() + plot_confusion_matrix(cnf_matrix, classes=class_names, + title='Confusion matrix, without normalization') + plt.savefig(params['save_dir'] + 'Agg_attn_bin.confusion_without_norm.pdf', bbox_inches='tight') + + plt.close() + + # Plot normalized confusion matrix + #plt.figure() + plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True, + title='Normalized confusion matrix') + plt.savefig(params['save_dir'] + 'Agg_attn_bin.confusion_with_norm.pdf', bbox_inches='tight') + + plt.close() + + + print('Examples:\n Total: {}\n Positive: {} ({:.2f}% of total)\n'.format( + total, pos, 100 * pos / total)) + + + print(sklearn.metrics.roc_auc_score(Y_test_int, Y_pred_int)) + + print(sklearn.metrics.balanced_accuracy_score(Y_test_int, Y_pred_int)) + + print(sklearn.metrics.classification_report(Y_test_int, Y_pred_int)) + + print(sklearn.metrics.confusion_matrix(Y_test_int, Y_pred_int)) + + print("score") + print(score) + + #exit() + + # summarize history for accuracy + plt.plot(history.history['acc']) + plt.plot(history.history['val_acc']) + plt.title('Model Accuracy') + plt.ylabel('accuracy') + plt.xlabel('epoch') + plt.legend(['train', 'test'], loc='upper left') + + plt.savefig(params['save_dir'] + 'Agg_attn_bin.accuracy.png', bbox_inches='tight') + plt.savefig(params['save_dir'] + 'Agg_attn_bin.accuracy.pdf', bbox_inches='tight') + + plt.close() + + # summarize history for loss + plt.plot(history.history['loss']) + plt.plot(history.history['val_loss']) + plt.title('Model Loss') + plt.ylabel('loss') + plt.xlabel('epoch') + plt.legend(['train', 'test'], loc='upper left') + + plt.savefig(params['save_dir'] + 'Agg_attn_bin.loss.png', bbox_inches='tight') + plt.savefig(params['save_dir'] + 'Agg_attn_bin.loss.pdf', bbox_inches='tight') + + + print('Test val_loss:', score[0]) + print('Test accuracy:', score[1]) + + # serialize model to JSON + model_json = model.to_json() + with open(params['save_dir'] + "Agg_attn_bin.model.json", "w") as json_file: + json_file.write(model_json) + + # serialize model to YAML + model_yaml = model.to_yaml() + with open(params['save_dir'] + "Agg_attn_bin.model.yaml", "w") as yaml_file: + yaml_file.write(model_yaml) + + + # serialize weights to HDF5 + model.save_weights(params['save_dir'] + "Agg_attn_bin.model.h5") + print("Saved model to disk") + + # load json and create model + json_file = open(params['save_dir'] + 'Agg_attn_bin.model.json', 'r') + loaded_model_json = json_file.read() + json_file.close() + loaded_model_json = model_from_json(loaded_model_json) + + + # load yaml and create model + 
yaml_file = open(params['save_dir'] + 'Agg_attn_bin.model.yaml', 'r') + loaded_model_yaml = yaml_file.read() + yaml_file.close() + loaded_model_yaml = model_from_yaml(loaded_model_yaml) + + + # load weights into new model + loaded_model_json.load_weights(params['save_dir'] + "Agg_attn_bin.model.h5") + print("Loaded json model from disk") + + # evaluate json loaded model on test data + loaded_model_json.compile(loss='binary_crossentropy', optimizer=params['optimizer'], metrics=['accuracy']) + score_json = loaded_model_json.evaluate(X_test, Y_test, verbose=0) + + print('json Validation loss:', score_json[0]) + print('json Validation accuracy:', score_json[1]) + + print("json %s: %.2f%%" % (loaded_model_json.metrics_names[1], score_json[1]*100)) + + + # load weights into new model + loaded_model_yaml.load_weights(params['save_dir'] + "Agg_attn_bin.model.h5") + print("Loaded yaml model from disk") + + # evaluate loaded model on test data + loaded_model_yaml.compile(loss='binary_crossentropy', optimizer=params['optimizer'], metrics=['accuracy']) + score_yaml = loaded_model_yaml.evaluate(X_test, Y_test, verbose=0) + + print('yaml Validation loss:', score_yaml[0]) + print('yaml Validation accuracy:', score_yaml[1]) + + print("yaml %s: %.2f%%" % (loaded_model_yaml.metrics_names[1], score_yaml[1]*100)) + + # predict using loaded yaml model on test and training data + + predict_yaml_train = loaded_model_yaml.predict(X_train) + + predict_yaml_test = loaded_model_yaml.predict(X_test) + + + print('Yaml_train_shape:', predict_yaml_train.shape) + print('Yaml_test_shape:', predict_yaml_test.shape) + + + predict_yaml_train_classes = np.argmax(predict_yaml_train, axis=1) + predict_yaml_test_classes = np.argmax(predict_yaml_test, axis=1) + + np.savetxt(params['save_dir'] + "Agg_attn_bin_predict_yaml_train.csv", predict_yaml_train, delimiter=",", fmt="%.3f") + np.savetxt(params['save_dir'] + "Agg_attn_bin_predict_yaml_test.csv", predict_yaml_test, delimiter=",", fmt="%.3f") + + np.savetxt(params['save_dir'] + "Agg_attn_bin_predict_yaml_train_classes.csv", predict_yaml_train_classes, delimiter=",",fmt="%d") + np.savetxt(params['save_dir'] + "Agg_attn_bin_predict_yaml_test_classes.csv", predict_yaml_test_classes, delimiter=",",fmt="%d") + + + + +def main(): + params = initialize_parameters() + run(params) + + +if __name__ == '__main__': + main() + if K.backend() == 'tensorflow': + K.clear_session() diff --git a/Pilot1/Attn1/attn_default_model.txt b/Pilot1/Attn1/attn_default_model.txt new file mode 100644 index 00000000..3e66fa0e --- /dev/null +++ b/Pilot1/Attn1/attn_default_model.txt @@ -0,0 +1,30 @@ +[Global_Params] +model_name='attn' +dense=[2000, 600] +batch_size=32 +epochs=1 +activation='relu' +loss='categorical_crossentropy' +optimizer='sgd' +drop=0.2 +learning_rate=None +scaling='minmax' +model='ae' +noise_factor=0 +validation_split=0.1 +epsilon_std=1.0 +rng_seed=2017 +initialization='glorot_uniform' +latent_dim=2 +batch_normalization=False +in='top_21_1fold_001.h5' +save_path='candle_save' +save_dir='./save/001/' +cp=True +early_stop=True +reduce_lr=True +feature_subsample=0 + +[Monitor_Params] +solr_root='' +timeout=3600 From f373df72a2d90a46dcf0e469c825128335a2d4f0 Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Wed, 26 Feb 2020 14:47:16 -0600 Subject: [PATCH 155/331] o remove model hyperparameter --- Pilot1/Attn1/attn.py | 58 ----------------------------- Pilot1/Attn1/attn_default_model.txt | 2 - 2 files changed, 60 deletions(-) diff --git a/Pilot1/Attn1/attn.py b/Pilot1/Attn1/attn.py index 
d6af6a8b..d7e08da2 100644 --- a/Pilot1/Attn1/attn.py +++ b/Pilot1/Attn1/attn.py @@ -27,14 +27,6 @@ 'action':'store', 'type': int, 'help':'latent dimensions'}, -{'name':'model', - 'default':'ae', - 'choices':['ae', 'vae', 'cvae'], - 'help':'model to use: ae, vae, cvae'}, -{'name':'use_landmark_genes', - 'type': candle.str2bool, - 'default': False, - 'help':'use the 978 landmark genes from LINCS (L1000) as expression features'}, {'name':'residual', 'type': candle.str2bool, 'default': False, @@ -80,14 +72,11 @@ 'initialization', 'learning_rate', 'loss', - 'noise_factor', 'optimizer', 'rng_seed', - 'model', 'scaling', 'validation_split', 'latent_dim', - 'feature_subsample', 'batch_normalization', 'epsilon_std', 'solr_root', @@ -111,7 +100,6 @@ def set_locals(self): def extension_from_parameters(params, framework=''): """Construct string for saving model with annotation of parameters""" ext = framework - ext += '.{}'.format(params['model']) for i, n in enumerate(params['dense']): if n: ext += '.D{}={}'.format(i+1, n) @@ -220,49 +208,3 @@ def load_data(params, seed): return X_train, Y_train, X_val, Y_val, X_test, Y_test - - - -def load_data_orig(params, seed): - if params['with_type']: - drop_cols = ['case_id'] - onehot_cols = ['cancer_type'] - else: - drop_cols = ['case_id', 'cancer_type'] - onehot_cols = None - - if params['use_landmark_genes']: - lincs_file = 'lincs1000.tsv' - lincs_path = candle.fetch_file(url_p1b1 + lincs_file) - df_l1000 = pd.read_csv(lincs_path, sep='\t') - usecols = df_l1000['gdc'] - drop_cols = None - else: - usecols = None - - return candle.load_X_data(params['url_attn'], params['file_train'], params['file_test'], - drop_cols=drop_cols, - onehot_cols=onehot_cols, - usecols=usecols, - n_cols=params['feature_subsample'], - shuffle=params['shuffle'], - scaling=params['scaling'], - validation_split=params['validation_split'], - dtype=params['datatype'], - seed=seed) - - -def evaluate_autoencoder(y_pred, y_test): - try: - mse = mean_squared_error(y_pred, y_test) - r2 = r2_score(y_test, y_pred) - corr, _ = pearsonr(y_pred.flatten(), y_test.flatten()) - # print('Mean squared error: {}%'.format(mse)) - except: - #when nan or something else breaks mean_squared_error computation - # we may check earlier before computation also: - #np.isnan(y_pred).any() or np.isnan(y_test).any()): - r2 = 0 - mse = 0 - corr = 0 - return {'mse': mse, 'r2_score': r2, 'correlation': corr} diff --git a/Pilot1/Attn1/attn_default_model.txt b/Pilot1/Attn1/attn_default_model.txt index 3e66fa0e..e957f74b 100644 --- a/Pilot1/Attn1/attn_default_model.txt +++ b/Pilot1/Attn1/attn_default_model.txt @@ -9,8 +9,6 @@ optimizer='sgd' drop=0.2 learning_rate=None scaling='minmax' -model='ae' -noise_factor=0 validation_split=0.1 epsilon_std=1.0 rng_seed=2017 From 827ae4de8d6f35cae39b1e280d828bd7726c7082 Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Wed, 26 Feb 2020 15:03:22 -0600 Subject: [PATCH 156/331] o Clean imports and add nb_classes as hyperparameters --- Pilot1/Attn1/attn.py | 1 + Pilot1/Attn1/attn_baseline_keras2.py | 52 +++++----------------------- Pilot1/Attn1/attn_default_model.txt | 1 + 3 files changed, 11 insertions(+), 43 deletions(-) diff --git a/Pilot1/Attn1/attn.py b/Pilot1/Attn1/attn.py index d7e08da2..ec53e528 100644 --- a/Pilot1/Attn1/attn.py +++ b/Pilot1/Attn1/attn.py @@ -109,6 +109,7 @@ def extension_from_parameters(params, framework=''): ext += '.L={}'.format(params['latent_dim']) ext += '.LR={}'.format(params['learning_rate']) ext += '.S={}'.format(params['scaling']) + if 
params['epsilon_std'] != 1.0: ext += '.EPS={}'.format(params['epsilon_std']) if params['drop']: diff --git a/Pilot1/Attn1/attn_baseline_keras2.py b/Pilot1/Attn1/attn_baseline_keras2.py index b354b168..e518204c 100644 --- a/Pilot1/Attn1/attn_baseline_keras2.py +++ b/Pilot1/Attn1/attn_baseline_keras2.py @@ -1,28 +1,5 @@ from __future__ import print_function -import numpy as np - -import keras -from keras import backend as K -from keras import optimizers -from keras.models import Model -from keras.layers import BatchNormalization, Dense, Dropout, Input, Lambda -from keras.callbacks import Callback, ModelCheckpoint, ReduceLROnPlateau, LearningRateScheduler, TensorBoard -from keras.metrics import binary_crossentropy, mean_squared_error -from scipy.stats.stats import pearsonr -from sklearn.manifold import TSNE - -import warnings -with warnings.catch_warnings(): - warnings.filterwarnings("ignore", category=DeprecationWarning) - from sklearn.metrics import r2_score - from sklearn.metrics import accuracy_score - -import matplotlib as mpl -mpl.use('Agg') -import matplotlib.pyplot as plt - -## Imports from actual import itertools import pandas as pd import numpy as np @@ -47,24 +24,19 @@ from keras.models import Sequential, Model, model_from_json, model_from_yaml from keras.utils import np_utils, multi_gpu_model -from keras.callbacks import ModelCheckpoint, CSVLogger, ReduceLROnPlateau, EarlyStopping +from keras.callbacks import Callback, ModelCheckpoint, CSVLogger, ReduceLROnPlateau, EarlyStopping, TensorBoard from sklearn.utils.class_weight import compute_class_weight from sklearn.model_selection import train_test_split from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, roc_auc_score, confusion_matrix, balanced_accuracy_score, classification_report from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler from sklearn.metrics import recall_score, auc, roc_curve, f1_score, precision_recall_curve -### - - - import attn import candle np.set_printoptions(precision=4) - def r2(y_true, y_pred): SS_res = K.sum(K.square(y_true - y_pred)) SS_tot = K.sum(K.square(y_true - K.mean(y_true))) @@ -181,11 +153,6 @@ def load_cache(cache_file): def run(params): -## - nb_classes = 2 -## - - args = candle.ArgumentStruct(**params) seed = args.rng_seed candle.set_seed(seed) @@ -224,12 +191,12 @@ def run(params): print('Examples:\n Total: {}\n Positive: {} ({:.2f}% of total)\n'.format( total, pos, 100 * pos / total)) + nb_classes = params['nb_classes'] + Y_train = np_utils.to_categorical(Y_train,nb_classes) Y_test = np_utils.to_categorical(Y_test,nb_classes) Y_val = np_utils.to_categorical(Y_val,nb_classes) - # ----------------------- from stack overflow - y_integers = np.argmax(Y_train, axis=1) class_weights = compute_class_weight('balanced', np.unique(y_integers), y_integers) d_class_weights = dict(enumerate(class_weights)) @@ -242,7 +209,10 @@ def run(params): PS=X_train.shape[1] inputs = Input(shape=(PS,)) + DR = params['drop'] + + #TODO: specify dense and activation via hyperparameters x = Dense(1000, activation='relu')(inputs) x = BatchNormalization()(x) @@ -282,7 +252,7 @@ def run(params): #parallel_model.compile(loss='mean_squared_error', # optimizer=SGD(lr=0.0001, momentum=0.9), # metrics=['mae',r2]) - + # TODO: specify optimizer via hyperparameters model.compile(loss=params['loss'], optimizer=SGD(lr=0.00001, momentum=0.9), # optimizer=Adam(lr=0.00001), @@ -316,7 +286,6 @@ def run(params): if params['early_stop']: callbacks.append(early_stop) - #history = 
parallel_model.fit(X_train, Y_train, epochs = params['epochs'] batch_size=params['batch_size'] history = model.fit(X_train, Y_train, class_weight=d_class_weights, @@ -331,10 +300,10 @@ def run(params): Y_predict = model.predict(X_test) + # see big fuction below, creates plots etc. + # TODO: Break post_process into multiple functions post_process(params, X_train, X_test, Y_test, _Y_test, Y_predict, pos, total, score, history, model) - ### - attn.logger.handlers = [] return history @@ -416,9 +385,6 @@ def post_process(params, X_train, X_test, Y_test, _Y_test, Y_predict, pos, total precision_keras = precision recall_keras = recall - print - print - print('f1=%.3f auroc=%.3f aucpr=%.3f' % (f1, auc_keras, pr_keras)) print ('creating figure 3 at ', params['save_dir'] + 'Agg_attn_bin.aurpr.pdf') diff --git a/Pilot1/Attn1/attn_default_model.txt b/Pilot1/Attn1/attn_default_model.txt index e957f74b..b5909005 100644 --- a/Pilot1/Attn1/attn_default_model.txt +++ b/Pilot1/Attn1/attn_default_model.txt @@ -22,6 +22,7 @@ cp=True early_stop=True reduce_lr=True feature_subsample=0 +nb_classes=2 [Monitor_Params] solr_root='' From 951fa54d1185f667b0365c865ca82c2b81855b7c Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Wed, 26 Feb 2020 17:16:12 -0600 Subject: [PATCH 157/331] o Add README --- Pilot1/Attn1/README.md | 173 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 173 insertions(+) create mode 100644 Pilot1/Attn1/README.md diff --git a/Pilot1/Attn1/README.md b/Pilot1/Attn1/README.md new file mode 100644 index 00000000..c79dca8b --- /dev/null +++ b/Pilot1/Attn1/README.md @@ -0,0 +1,173 @@ +The Pilot1 Attn Benchmark loads the hdf5 file specified by hyperparameter "in" specifying files of the format: + +top_21_1fold_001.h5, top_21_1fold_002.h5 ..top_21_1fold_0kk.h5 + +## Sample run: + +``` +python attn_baseline_keras2.py +``` + +... +processing h5 in file top_21_1fold_001.h5 +x_train shape: (271915, 6212) +x_test shape: (33989, 6212) +Examples: +Total: 339893 +Positive: 12269 (3.61% of total) + +X_train shape: (271915, 6212) +X_test shape: (33989, 6212) +Y_train shape: (271915, 2) +Y_test shape: (33989, 2) +Instructions for updating: +If using Keras pass \*\_constraint arguments to layers. 
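The summary that follows shows the heart of the Attn1 architecture: a shared 1000-unit hidden layer (dense_1 plus batch_normalization_1) feeds both a ReLU branch (dense_2, then batch_normalization_2) and a softmax branch (dense_3), and multiply_1 combines the two elementwise as an attention gate before the deeper classifier stack. A minimal sketch of just that gating block, assuming standalone Keras 2 as imported by attn_baseline_keras2.py (the function name and width default are illustrative, not taken from the repository):

```
# Sketch of the Attn1-style attention gate only, not the full benchmark model.
from keras.layers import Input, Dense, BatchNormalization, multiply
from keras.models import Model

def attention_gate(input_dim, width=1000):
    inputs = Input(shape=(input_dim,))
    h = BatchNormalization()(Dense(width, activation='relu')(inputs))
    a = BatchNormalization()(Dense(width, activation='relu')(h))  # feature branch
    b = Dense(width, activation='softmax')(h)                     # attention weights
    gated = multiply([a, b])                                      # elementwise gate
    return Model(inputs, gated, name='attn_gate_sketch')
```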
+Model: "model_1" + +--- + +# Layer (type) Output Shape Param # Connected to + +input_1 (InputLayer) (None, 6212) 0 + +--- + +dense_1 (Dense) (None, 1000) 6213000 input_1[0][0] + +--- + +batch_normalization_1 (BatchNor (None, 1000) 4000 dense_1[0][0] + +--- + +dense_2 (Dense) (None, 1000) 1001000 batch_normalization_1[0][0] + +--- + +batch_normalization_2 (BatchNor (None, 1000) 4000 dense_2[0][0] + +--- + +dense_3 (Dense) (None, 1000) 1001000 batch_normalization_1[0][0] + +--- + +multiply_1 (Multiply) (None, 1000) 0 batch_normalization_2[0][0] + dense_3[0][0] + +--- + +dense_4 (Dense) (None, 500) 500500 multiply_1[0][0] + +--- + +batch_normalization_3 (BatchNor (None, 500) 2000 dense_4[0][0] + +--- + +dropout_1 (Dropout) (None, 500) 0 batch_normalization_3[0][0] + +--- + +dense_5 (Dense) (None, 250) 125250 dropout_1[0][0] + +--- + +batch_normalization_4 (BatchNor (None, 250) 1000 dense_5[0][0] + +--- + +dropout_2 (Dropout) (None, 250) 0 batch_normalization_4[0][0] + +--- + +dense_6 (Dense) (None, 125) 31375 dropout_2[0][0] + +--- + +batch_normalization_5 (BatchNor (None, 125) 500 dense_6[0][0] + +--- + +dropout_3 (Dropout) (None, 125) 0 batch_normalization_5[0][0] + +--- + +dense_7 (Dense) (None, 60) 7560 dropout_3[0][0] + +--- + +batch_normalization_6 (BatchNor (None, 60) 240 dense_7[0][0] + +--- + +dropout_4 (Dropout) (None, 60) 0 batch_normalization_6[0][0] + +--- + +dense_8 (Dense) (None, 30) 1830 dropout_4[0][0] + +--- + +batch_normalization_7 (BatchNor (None, 30) 120 dense_8[0][0] + +--- + +dropout_5 (Dropout) (None, 30) 0 batch_normalization_7[0][0] + +--- + +# dense_9 (Dense) (None, 2) 62 dropout_5[0][0] + +Total params: 8,893,437 +Trainable params: 8,887,507 +Non-trainable params: 5,930 +.. +.. +271915/271915 [==============================] - 631s 2ms/step - loss: 0.8681 - acc: 0.5548 - tf_auc: 0.5371 - val_loss: 0.6010 - val_acc: 0.8365 - val_tf_auc: 0.5743 +Current time ....631.567 + +Epoch 00001: val_loss improved from inf to 0.60103, saving model to ./save/001/Agg_attn_bin.autosave.model.h5 +creating table of predictions +creating figure 1 at ./save/001/Agg_attn_bin.auroc.pdf +creating figure 2 at ./save/001/Agg_attn_bin.auroc2.pdf +f1=0.234 auroc=0.841 aucpr=0.990 +creating figure 3 at ./save/001/Agg_attn_bin.aurpr.pdf +creating figure 4 at ./save/001/Agg_attn_bin.confusion_without_norm.pdf +Confusion matrix, without normalization +[[27591 5190][ 360 848]] +Confusion matrix, without normalization +[[27591 5190][ 360 848]] +Normalized confusion matrix +[[0.84 0.16][0.3 0.7 ]] +Examples: +Total: 339893 +Positive: 12269 (3.61% of total) + +0.7718316679565835 +0.7718316679565836 +precision recall f1-score support + + 0 0.99 0.84 0.91 32781 + 1 0.14 0.70 0.23 1208 + +micro avg 0.84 0.84 0.84 33989 +macro avg 0.56 0.77 0.57 33989 +weighted avg 0.96 0.84 0.88 33989 + +[[27591 5190][ 360 848]] +score +[0.5760348070144456, 0.8367118835449219, 0.5936741828918457] +Test val_loss: 0.5760348070144456 +Test accuracy: 0.8367118835449219 +Saved model to disk +Loaded json model from disk +json Validation loss: 0.560062773128295 +json Validation accuracy: 0.8367118835449219 +json accuracy: 83.67% +Loaded yaml model from disk +yaml Validation loss: 0.560062773128295 +yaml Validation accuracy: 0.8367118835449219 +yaml accuracy: 83.67% +Yaml_train_shape: (271915, 2) +Yaml_test_shape: (33989, 2) From 648a446a4d910dcd4a571eec1555cd7bfae33584 Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Thu, 27 Feb 2020 11:20:20 -0600 Subject: [PATCH 158/331] o More descriptive booleans: Thanks Jamal for 
suggestion. o Make optimizer via hyperparameters --- Pilot1/Attn1/attn.py | 4 ++-- Pilot1/Attn1/attn_baseline_keras2.py | 19 ++++++++++++++++--- Pilot1/Attn1/attn_default_model.txt | 9 +++++---- 3 files changed, 23 insertions(+), 9 deletions(-) diff --git a/Pilot1/Attn1/attn.py b/Pilot1/Attn1/attn.py index ec53e528..91dd1f2e 100644 --- a/Pilot1/Attn1/attn.py +++ b/Pilot1/Attn1/attn.py @@ -45,7 +45,7 @@ {'name':'epsilon_std', 'type': float, 'help':'epsilon std for sampling latent noise'}, -{'name':'cp', +{'name':'use_cp', 'type': candle.str2bool, 'default': False, 'help':'checkpoint models with best val_loss'}, @@ -53,7 +53,7 @@ #'type': candle.str2bool, #'default': False, #'help':'shuffle data'}, -{'name':'tb', +{'name':'use_tb', 'type': candle.str2bool, 'default': False, 'help':'use tensorboard'}, diff --git a/Pilot1/Attn1/attn_baseline_keras2.py b/Pilot1/Attn1/attn_baseline_keras2.py index e518204c..568a1d8c 100644 --- a/Pilot1/Attn1/attn_baseline_keras2.py +++ b/Pilot1/Attn1/attn_baseline_keras2.py @@ -252,7 +252,17 @@ def run(params): #parallel_model.compile(loss='mean_squared_error', # optimizer=SGD(lr=0.0001, momentum=0.9), # metrics=['mae',r2]) - # TODO: specify optimizer via hyperparameters + if params['optimizer'] == 'sgd': + optimizer = SGD(params, lr=params['learning_rate'], momentum=params['momentum']) + elif params['optimizer'] == 'adam': + optimizer = Adam(params, lr=params['learning_rate']) + elif params['optimizer'] == 'rmsprop': + optimizer = RMSProp(params, lr=params['learning_rate']) + elif params['optimizer'] == 'adadelta': + optimizer = Adadelta() + else: + optimizer=SGD(lr=0.00001, momentum=0.9) + model.compile(loss=params['loss'], optimizer=SGD(lr=0.00001, momentum=0.9), # optimizer=Adam(lr=0.00001), @@ -273,6 +283,8 @@ def run(params): candle_monitor = candle.CandleRemoteMonitor(params=params) timeout_monitor = candle.TerminateOnTimeOut(params['timeout']) + tensorboard = TensorBoard(log_dir="tb/tb{}".format(ext)) + history_logger = LoggingCallback(attn.logger.debug) callbacks = [candle_monitor, timeout_monitor, csv_logger, history_logger] @@ -280,9 +292,10 @@ def run(params): if params['reduce_lr']: callbacks.append(reduce_lr) - if params['cp']: + if params['use_cp']: callbacks.append(checkpointer) - + if params['use_tb']: + callbacks.append(tensorboard) if params['early_stop']: callbacks.append(early_stop) diff --git a/Pilot1/Attn1/attn_default_model.txt b/Pilot1/Attn1/attn_default_model.txt index b5909005..5e95a72e 100644 --- a/Pilot1/Attn1/attn_default_model.txt +++ b/Pilot1/Attn1/attn_default_model.txt @@ -7,7 +7,8 @@ activation='relu' loss='categorical_crossentropy' optimizer='sgd' drop=0.2 -learning_rate=None +learning_rate=0.00001 +momentum=0.9 scaling='minmax' validation_split=0.1 epsilon_std=1.0 @@ -15,14 +16,14 @@ rng_seed=2017 initialization='glorot_uniform' latent_dim=2 batch_normalization=False -in='top_21_1fold_001.h5' save_path='candle_save' -save_dir='./save/001/' -cp=True +use_cp=False early_stop=True reduce_lr=True feature_subsample=0 nb_classes=2 +save_dir='./save/001/' +in='top_21_1fold_001.h5' [Monitor_Params] solr_root='' From 805609b5eed3623c2598b1032100efc333bd9570 Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Thu, 27 Feb 2020 11:29:53 -0600 Subject: [PATCH 159/331] o Fix README --- Pilot1/Attn1/README.md | 148 ++++++++++++++--------------------------- 1 file changed, 51 insertions(+), 97 deletions(-) diff --git a/Pilot1/Attn1/README.md b/Pilot1/Attn1/README.md index c79dca8b..0a11e239 100644 --- a/Pilot1/Attn1/README.md +++ 
b/Pilot1/Attn1/README.md @@ -7,7 +7,6 @@ top_21_1fold_001.h5, top_21_1fold_002.h5 ..top_21_1fold_0kk.h5 ``` python attn_baseline_keras2.py ``` - ... processing h5 in file top_21_1fold_001.h5 x_train shape: (271915, 6212) @@ -21,103 +20,58 @@ X_test shape: (33989, 6212) Y_train shape: (271915, 2) Y_test shape: (33989, 2) Instructions for updating: -If using Keras pass \*\_constraint arguments to layers. +If using Keras pass *_constraint arguments to layers. Model: "model_1" - ---- - -# Layer (type) Output Shape Param # Connected to - -input_1 (InputLayer) (None, 6212) 0 - ---- - -dense_1 (Dense) (None, 1000) 6213000 input_1[0][0] - ---- - -batch_normalization_1 (BatchNor (None, 1000) 4000 dense_1[0][0] - ---- - -dense_2 (Dense) (None, 1000) 1001000 batch_normalization_1[0][0] - ---- - -batch_normalization_2 (BatchNor (None, 1000) 4000 dense_2[0][0] - ---- - -dense_3 (Dense) (None, 1000) 1001000 batch_normalization_1[0][0] - ---- - -multiply_1 (Multiply) (None, 1000) 0 batch_normalization_2[0][0] - dense_3[0][0] - ---- - -dense_4 (Dense) (None, 500) 500500 multiply_1[0][0] - ---- - -batch_normalization_3 (BatchNor (None, 500) 2000 dense_4[0][0] - ---- - -dropout_1 (Dropout) (None, 500) 0 batch_normalization_3[0][0] - ---- - -dense_5 (Dense) (None, 250) 125250 dropout_1[0][0] - ---- - -batch_normalization_4 (BatchNor (None, 250) 1000 dense_5[0][0] - ---- - -dropout_2 (Dropout) (None, 250) 0 batch_normalization_4[0][0] - ---- - -dense_6 (Dense) (None, 125) 31375 dropout_2[0][0] - ---- - -batch_normalization_5 (BatchNor (None, 125) 500 dense_6[0][0] - ---- - -dropout_3 (Dropout) (None, 125) 0 batch_normalization_5[0][0] - ---- - -dense_7 (Dense) (None, 60) 7560 dropout_3[0][0] - ---- - -batch_normalization_6 (BatchNor (None, 60) 240 dense_7[0][0] - ---- - -dropout_4 (Dropout) (None, 60) 0 batch_normalization_6[0][0] - ---- - -dense_8 (Dense) (None, 30) 1830 dropout_4[0][0] - ---- - -batch_normalization_7 (BatchNor (None, 30) 120 dense_8[0][0] - ---- - -dropout_5 (Dropout) (None, 30) 0 batch_normalization_7[0][0] - ---- - -# dense_9 (Dense) (None, 2) 62 dropout_5[0][0] +__________________________________________________________________________________________________ +Layer (type) Output Shape Param # Connected to +================================================================================================== +input_1 (InputLayer) (None, 6212) 0 +__________________________________________________________________________________________________ +dense_1 (Dense) (None, 1000) 6213000 input_1[0][0] +__________________________________________________________________________________________________ +batch_normalization_1 (BatchNor (None, 1000) 4000 dense_1[0][0] +__________________________________________________________________________________________________ +dense_2 (Dense) (None, 1000) 1001000 batch_normalization_1[0][0] +__________________________________________________________________________________________________ +batch_normalization_2 (BatchNor (None, 1000) 4000 dense_2[0][0] +__________________________________________________________________________________________________ +dense_3 (Dense) (None, 1000) 1001000 batch_normalization_1[0][0] +__________________________________________________________________________________________________ +multiply_1 (Multiply) (None, 1000) 0 batch_normalization_2[0][0] + dense_3[0][0] +__________________________________________________________________________________________________ +dense_4 (Dense) (None, 500) 500500 multiply_1[0][0] 
+__________________________________________________________________________________________________ +batch_normalization_3 (BatchNor (None, 500) 2000 dense_4[0][0] +__________________________________________________________________________________________________ +dropout_1 (Dropout) (None, 500) 0 batch_normalization_3[0][0] +__________________________________________________________________________________________________ +dense_5 (Dense) (None, 250) 125250 dropout_1[0][0] +__________________________________________________________________________________________________ +batch_normalization_4 (BatchNor (None, 250) 1000 dense_5[0][0] +__________________________________________________________________________________________________ +dropout_2 (Dropout) (None, 250) 0 batch_normalization_4[0][0] +__________________________________________________________________________________________________ +dense_6 (Dense) (None, 125) 31375 dropout_2[0][0] +__________________________________________________________________________________________________ +batch_normalization_5 (BatchNor (None, 125) 500 dense_6[0][0] +__________________________________________________________________________________________________ +dropout_3 (Dropout) (None, 125) 0 batch_normalization_5[0][0] +__________________________________________________________________________________________________ +dense_7 (Dense) (None, 60) 7560 dropout_3[0][0] +__________________________________________________________________________________________________ +batch_normalization_6 (BatchNor (None, 60) 240 dense_7[0][0] +__________________________________________________________________________________________________ +dropout_4 (Dropout) (None, 60) 0 batch_normalization_6[0][0] +__________________________________________________________________________________________________ +dense_8 (Dense) (None, 30) 1830 dropout_4[0][0] +__________________________________________________________________________________________________ +batch_normalization_7 (BatchNor (None, 30) 120 dense_8[0][0] +__________________________________________________________________________________________________ +dropout_5 (Dropout) (None, 30) 0 batch_normalization_7[0][0] +__________________________________________________________________________________________________ +dense_9 (Dense) (None, 2) 62 dropout_5[0][0] +================================================================================================== Total params: 8,893,437 Trainable params: 8,887,507 From e58e409fade42bc0ac16e3278edb80ef14846069 Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Thu, 27 Feb 2020 11:41:43 -0600 Subject: [PATCH 160/331] o Update README --- Pilot1/Attn1/README.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/Pilot1/Attn1/README.md b/Pilot1/Attn1/README.md index 0a11e239..b582ebd6 100644 --- a/Pilot1/Attn1/README.md +++ b/Pilot1/Attn1/README.md @@ -1,14 +1,15 @@ -The Pilot1 Attn Benchmark loads the hdf5 file specified by hyperparameter "in" specifying files of the format: - -top_21_1fold_001.h5, top_21_1fold_002.h5 ..top_21_1fold_0kk.h5 +The Pilot1 Attn Benchmark requires an hdf5 file specified by the hyperparameter "in", name of this file for default case is: top_21_1fold_001.h5 +Any file of the form top_21_1fold_"ijk".h5 can be used as input ## Sample run: - ``` python attn_baseline_keras2.py ``` +Params: {'model_name': 'attn', 'dense': [2000, 600], 'batch_size': 32, 'epochs': 1, 'activation': 'relu', 'loss': 'categorical_crossentropy', 'optimizer': 
'sgd', 'drop': 0.2, 'learning_rate': 1e-05, 'momentum': 0.7, 'scaling': 'minmax', 'validation_split': 0.1, 'epsilon_std': 1.0, 'rng_seed': 2017, 'initialization': 'glorot_uniform', 'latent_dim': 2, 'batch_normalization': False, 'in': 'top_21_1fold_001.h5', 'save_path': 'candle_save', 'save_dir': './save/001/', 'use_cp': False, 'early_stop': True, 'reduce_lr': True, 'feature_subsample': 0, 'nb_classes': 2, 'solr_root': '', 'timeout': 3600, 'verbose': None, 'logfile': None, 'train_bool': True, 'experiment_id': 'EXP000', 'run_id': 'RUN000', 'shuffle': False, 'gpus': [], 'profiling': False, 'residual': False, 'warmup_lr': False, 'use_tb': False, 'tsne': False, 'datatype': , 'output_dir': '/nfs2/jain/Benchmarks/Pilot1/Attn/Output/EXP000/RUN000'} +... ... processing h5 in file top_21_1fold_001.h5 + x_train shape: (271915, 6212) x_test shape: (33989, 6212) Examples: From 2cc5aca606114da786fb96c99371a34d9e42de79 Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Thu, 27 Feb 2020 11:46:02 -0600 Subject: [PATCH 161/331] Update README.md Fix formating --- Pilot1/Attn1/README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Pilot1/Attn1/README.md b/Pilot1/Attn1/README.md index b582ebd6..0b5603b9 100644 --- a/Pilot1/Attn1/README.md +++ b/Pilot1/Attn1/README.md @@ -1,10 +1,10 @@ The Pilot1 Attn Benchmark requires an hdf5 file specified by the hyperparameter "in", name of this file for default case is: top_21_1fold_001.h5 + Any file of the form top_21_1fold_"ijk".h5 can be used as input ## Sample run: ``` python attn_baseline_keras2.py -``` Params: {'model_name': 'attn', 'dense': [2000, 600], 'batch_size': 32, 'epochs': 1, 'activation': 'relu', 'loss': 'categorical_crossentropy', 'optimizer': 'sgd', 'drop': 0.2, 'learning_rate': 1e-05, 'momentum': 0.7, 'scaling': 'minmax', 'validation_split': 0.1, 'epsilon_std': 1.0, 'rng_seed': 2017, 'initialization': 'glorot_uniform', 'latent_dim': 2, 'batch_normalization': False, 'in': 'top_21_1fold_001.h5', 'save_path': 'candle_save', 'save_dir': './save/001/', 'use_cp': False, 'early_stop': True, 'reduce_lr': True, 'feature_subsample': 0, 'nb_classes': 2, 'solr_root': '', 'timeout': 3600, 'verbose': None, 'logfile': None, 'train_bool': True, 'experiment_id': 'EXP000', 'run_id': 'RUN000', 'shuffle': False, 'gpus': [], 'profiling': False, 'residual': False, 'warmup_lr': False, 'use_tb': False, 'tsne': False, 'datatype': , 'output_dir': '/nfs2/jain/Benchmarks/Pilot1/Attn/Output/EXP000/RUN000'} ... ... 
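The sample runs above read one HDF5 file (top_21_1fold_001.h5 by default) whose layout attn.load_data expects: paired feature frames x_train_0/x_train_1, x_val_0/x_val_1, and x_test_0/x_test_1 that are concatenated column-wise, plus y_train, y_val, and y_test frames carrying the AUC labels. A short pandas sketch for inspecting such a file before a run; it assumes pandas with HDF5 (PyTables) support and is not part of the benchmark code:

```
# Sketch: list the keys attn.load_data reads and check the shapes.
import pandas as pd

with pd.HDFStore('top_21_1fold_001.h5', mode='r') as store:
    print(store.keys())  # expect /x_train_0, /x_train_1, ..., /y_train, /y_val, /y_test
    x_train = pd.concat([store['x_train_0'], store['x_train_1']], axis=1)
    y_train = store['y_train']
    print(x_train.shape, y_train.shape, list(y_train.columns))
```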
@@ -126,3 +126,4 @@ yaml Validation accuracy: 0.8367118835449219 yaml accuracy: 83.67% Yaml_train_shape: (271915, 2) Yaml_test_shape: (33989, 2) +``` From e99fdb566bb4730aba7611e03afa59778e5174d5 Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Thu, 27 Feb 2020 12:12:42 -0600 Subject: [PATCH 162/331] o Use common/keras_utils, using the build_optimizer function --- Pilot1/Attn1/attn_baseline_keras2.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/Pilot1/Attn1/attn_baseline_keras2.py b/Pilot1/Attn1/attn_baseline_keras2.py index 568a1d8c..9a0cbbdb 100644 --- a/Pilot1/Attn1/attn_baseline_keras2.py +++ b/Pilot1/Attn1/attn_baseline_keras2.py @@ -252,19 +252,16 @@ def run(params): #parallel_model.compile(loss='mean_squared_error', # optimizer=SGD(lr=0.0001, momentum=0.9), # metrics=['mae',r2]) - if params['optimizer'] == 'sgd': - optimizer = SGD(params, lr=params['learning_rate'], momentum=params['momentum']) - elif params['optimizer'] == 'adam': - optimizer = Adam(params, lr=params['learning_rate']) - elif params['optimizer'] == 'rmsprop': - optimizer = RMSProp(params, lr=params['learning_rate']) - elif params['optimizer'] == 'adadelta': - optimizer = Adadelta() - else: - optimizer=SGD(lr=0.00001, momentum=0.9) + kerasDefaults = candle.keras_default_config() + if params['momentum']: + callbacks.append(checkpointer) + kerasDefaults['momentum_sgd'] = params['momentum'] + optimizer = candle.build_optimizer(params['optimizer'], params['learning_rate'], params['momentum'], kerasDefaults) + model.compile(loss=params['loss'], - optimizer=SGD(lr=0.00001, momentum=0.9), + optimizer=optimizer, + # SGD(lr=0.00001, momentum=0.9), # optimizer=Adam(lr=0.00001), # optimizer=RMSprop(lr=0.0001), # optimizer=Adadelta(), From 72cc79120baa0da02d624afc698e6ef3a947d4f2 Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Thu, 27 Feb 2020 12:14:54 -0600 Subject: [PATCH 163/331] o Fix typo --- Pilot1/Attn1/attn_baseline_keras2.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/Pilot1/Attn1/attn_baseline_keras2.py b/Pilot1/Attn1/attn_baseline_keras2.py index 9a0cbbdb..b6f917db 100644 --- a/Pilot1/Attn1/attn_baseline_keras2.py +++ b/Pilot1/Attn1/attn_baseline_keras2.py @@ -254,11 +254,10 @@ def run(params): # metrics=['mae',r2]) kerasDefaults = candle.keras_default_config() if params['momentum']: - callbacks.append(checkpointer) - kerasDefaults['momentum_sgd'] = params['momentum'] + kerasDefaults['momentum_sgd'] = params['momentum'] + + optimizer = candle.build_optimizer(params['optimizer'], params['learning_rate'], kerasDefaults) - optimizer = candle.build_optimizer(params['optimizer'], params['learning_rate'], params['momentum'], kerasDefaults) - model.compile(loss=params['loss'], optimizer=optimizer, # SGD(lr=0.00001, momentum=0.9), From 488c21fafb8a62e9b753f82eb236462f779369d7 Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Thu, 27 Feb 2020 18:52:28 -0600 Subject: [PATCH 164/331] Update README.md --- Pilot1/Attn1/README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Pilot1/Attn1/README.md b/Pilot1/Attn1/README.md index 0b5603b9..75598173 100644 --- a/Pilot1/Attn1/README.md +++ b/Pilot1/Attn1/README.md @@ -1,5 +1,8 @@ The Pilot1 Attn Benchmark requires an hdf5 file specified by the hyperparameter "in", name of this file for default case is: top_21_1fold_001.h5 +Download the file here: +wget http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/top_21_1fold_001.h5 + Any file of the form top_21_1fold_"ijk".h5 can be used as input ## Sample 
run: From 8854477046694d86e19e73d686ad02e83ddfcabe Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Thu, 27 Feb 2020 18:52:45 -0600 Subject: [PATCH 165/331] Update README.md --- Pilot1/Attn1/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Pilot1/Attn1/README.md b/Pilot1/Attn1/README.md index 75598173..acd5799f 100644 --- a/Pilot1/Attn1/README.md +++ b/Pilot1/Attn1/README.md @@ -1,7 +1,7 @@ The Pilot1 Attn Benchmark requires an hdf5 file specified by the hyperparameter "in", name of this file for default case is: top_21_1fold_001.h5 Download the file here: -wget http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/top_21_1fold_001.h5 +wget http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/top_21_1fold_001.h5 (~4GB) Any file of the form top_21_1fold_"ijk".h5 can be used as input From a858c784bea0ebf1e8e6b2576f508da4dcc1cc80 Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Thu, 27 Feb 2020 20:29:15 -0600 Subject: [PATCH 166/331] Auto-download input file --- Pilot1/Attn1/README.md | 57 +++++++++++++++-------------- Pilot1/Attn1/attn.py | 45 ++++++++++++----------- Pilot1/Attn1/attn_default_model.txt | 3 +- 3 files changed, 55 insertions(+), 50 deletions(-) diff --git a/Pilot1/Attn1/README.md b/Pilot1/Attn1/README.md index acd5799f..a9fa4ced 100644 --- a/Pilot1/Attn1/README.md +++ b/Pilot1/Attn1/README.md @@ -1,11 +1,12 @@ The Pilot1 Attn Benchmark requires an hdf5 file specified by the hyperparameter "in", name of this file for default case is: top_21_1fold_001.h5 -Download the file here: -wget http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/top_21_1fold_001.h5 (~4GB) +Benchmark auto downloads the file below: +http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/top_21_1fold_001.h5 (~4GB) -Any file of the form top_21_1fold_"ijk".h5 can be used as input +Any file of the form top*21_1fold*"ijk".h5 can be used as input ## Sample run: + ``` python attn_baseline_keras2.py Params: {'model_name': 'attn', 'dense': [2000, 600], 'batch_size': 32, 'epochs': 1, 'activation': 'relu', 'loss': 'categorical_crossentropy', 'optimizer': 'sgd', 'drop': 0.2, 'learning_rate': 1e-05, 'momentum': 0.7, 'scaling': 'minmax', 'validation_split': 0.1, 'epsilon_std': 1.0, 'rng_seed': 2017, 'initialization': 'glorot_uniform', 'latent_dim': 2, 'batch_normalization': False, 'in': 'top_21_1fold_001.h5', 'save_path': 'candle_save', 'save_dir': './save/001/', 'use_cp': False, 'early_stop': True, 'reduce_lr': True, 'feature_subsample': 0, 'nb_classes': 2, 'solr_root': '', 'timeout': 3600, 'verbose': None, 'logfile': None, 'train_bool': True, 'experiment_id': 'EXP000', 'run_id': 'RUN000', 'shuffle': False, 'gpus': [], 'profiling': False, 'residual': False, 'warmup_lr': False, 'use_tb': False, 'tsne': False, 'datatype': , 'output_dir': '/nfs2/jain/Benchmarks/Pilot1/Attn/Output/EXP000/RUN000'} @@ -27,54 +28,54 @@ Instructions for updating: If using Keras pass *_constraint arguments to layers. 
Model: "model_1" __________________________________________________________________________________________________ -Layer (type) Output Shape Param # Connected to +Layer (type) Output Shape Param # Connected to ================================================================================================== -input_1 (InputLayer) (None, 6212) 0 +input_1 (InputLayer) (None, 6212) 0 __________________________________________________________________________________________________ -dense_1 (Dense) (None, 1000) 6213000 input_1[0][0] +dense_1 (Dense) (None, 1000) 6213000 input_1[0][0] __________________________________________________________________________________________________ -batch_normalization_1 (BatchNor (None, 1000) 4000 dense_1[0][0] +batch_normalization_1 (BatchNor (None, 1000) 4000 dense_1[0][0] __________________________________________________________________________________________________ -dense_2 (Dense) (None, 1000) 1001000 batch_normalization_1[0][0] +dense_2 (Dense) (None, 1000) 1001000 batch_normalization_1[0][0] __________________________________________________________________________________________________ -batch_normalization_2 (BatchNor (None, 1000) 4000 dense_2[0][0] +batch_normalization_2 (BatchNor (None, 1000) 4000 dense_2[0][0] __________________________________________________________________________________________________ -dense_3 (Dense) (None, 1000) 1001000 batch_normalization_1[0][0] +dense_3 (Dense) (None, 1000) 1001000 batch_normalization_1[0][0] __________________________________________________________________________________________________ -multiply_1 (Multiply) (None, 1000) 0 batch_normalization_2[0][0] - dense_3[0][0] +multiply_1 (Multiply) (None, 1000) 0 batch_normalization_2[0][0] + dense_3[0][0] __________________________________________________________________________________________________ -dense_4 (Dense) (None, 500) 500500 multiply_1[0][0] +dense_4 (Dense) (None, 500) 500500 multiply_1[0][0] __________________________________________________________________________________________________ -batch_normalization_3 (BatchNor (None, 500) 2000 dense_4[0][0] +batch_normalization_3 (BatchNor (None, 500) 2000 dense_4[0][0] __________________________________________________________________________________________________ -dropout_1 (Dropout) (None, 500) 0 batch_normalization_3[0][0] +dropout_1 (Dropout) (None, 500) 0 batch_normalization_3[0][0] __________________________________________________________________________________________________ -dense_5 (Dense) (None, 250) 125250 dropout_1[0][0] +dense_5 (Dense) (None, 250) 125250 dropout_1[0][0] __________________________________________________________________________________________________ -batch_normalization_4 (BatchNor (None, 250) 1000 dense_5[0][0] +batch_normalization_4 (BatchNor (None, 250) 1000 dense_5[0][0] __________________________________________________________________________________________________ -dropout_2 (Dropout) (None, 250) 0 batch_normalization_4[0][0] +dropout_2 (Dropout) (None, 250) 0 batch_normalization_4[0][0] __________________________________________________________________________________________________ -dense_6 (Dense) (None, 125) 31375 dropout_2[0][0] +dense_6 (Dense) (None, 125) 31375 dropout_2[0][0] __________________________________________________________________________________________________ -batch_normalization_5 (BatchNor (None, 125) 500 dense_6[0][0] +batch_normalization_5 (BatchNor (None, 125) 500 dense_6[0][0] 
__________________________________________________________________________________________________ -dropout_3 (Dropout) (None, 125) 0 batch_normalization_5[0][0] +dropout_3 (Dropout) (None, 125) 0 batch_normalization_5[0][0] __________________________________________________________________________________________________ -dense_7 (Dense) (None, 60) 7560 dropout_3[0][0] +dense_7 (Dense) (None, 60) 7560 dropout_3[0][0] __________________________________________________________________________________________________ -batch_normalization_6 (BatchNor (None, 60) 240 dense_7[0][0] +batch_normalization_6 (BatchNor (None, 60) 240 dense_7[0][0] __________________________________________________________________________________________________ -dropout_4 (Dropout) (None, 60) 0 batch_normalization_6[0][0] +dropout_4 (Dropout) (None, 60) 0 batch_normalization_6[0][0] __________________________________________________________________________________________________ -dense_8 (Dense) (None, 30) 1830 dropout_4[0][0] +dense_8 (Dense) (None, 30) 1830 dropout_4[0][0] __________________________________________________________________________________________________ -batch_normalization_7 (BatchNor (None, 30) 120 dense_8[0][0] +batch_normalization_7 (BatchNor (None, 30) 120 dense_8[0][0] __________________________________________________________________________________________________ -dropout_5 (Dropout) (None, 30) 0 batch_normalization_7[0][0] +dropout_5 (Dropout) (None, 30) 0 batch_normalization_7[0][0] __________________________________________________________________________________________________ -dense_9 (Dense) (None, 2) 62 dropout_5[0][0] +dense_9 (Dense) (None, 2) 62 dropout_5[0][0] ================================================================================================== Total params: 8,893,437 diff --git a/Pilot1/Attn1/attn.py b/Pilot1/Attn1/attn.py index 91dd1f2e..101cd5e8 100644 --- a/Pilot1/Attn1/attn.py +++ b/Pilot1/Attn1/attn.py @@ -130,24 +130,28 @@ def load_data(params, seed): if params['in'].endswith('h5') or params['in'].endswith('hdf5'): print ('processing h5 in file {}'.format(params['in'])) - df_x_train_0 = pd.read_hdf(params['in'], 'x_train_0').astype(np.float32) - df_x_train_1 = pd.read_hdf(params['in'], 'x_train_1').astype(np.float32) + url = params['data_url'] + file_train = params['in'] + train_file = candle.get_file(file_train, url+file_train, cache_subdir='Pilot1') + + df_x_train_0 = pd.read_hdf(train_file, 'x_train_0').astype(np.float32) + df_x_train_1 = pd.read_hdf(train_file, 'x_train_1').astype(np.float32) X_train = pd.concat([df_x_train_0, df_x_train_1], axis=1, sort=False) del df_x_train_0, df_x_train_1 - df_x_test_0 = pd.read_hdf(params['in'], 'x_test_0').astype(np.float32) - df_x_test_1 = pd.read_hdf(params['in'], 'x_test_1').astype(np.float32) + df_x_test_0 = pd.read_hdf(train_file, 'x_test_0').astype(np.float32) + df_x_test_1 = pd.read_hdf(train_file, 'x_test_1').astype(np.float32) X_test = pd.concat([df_x_test_0, df_x_test_1], axis=1, sort=False) del df_x_test_0, df_x_test_1 - df_x_val_0 = pd.read_hdf(params['in'], 'x_val_0').astype(np.float32) - df_x_val_1 = pd.read_hdf(params['in'], 'x_val_1').astype(np.float32) + df_x_val_0 = pd.read_hdf(train_file, 'x_val_0').astype(np.float32) + df_x_val_1 = pd.read_hdf(train_file, 'x_val_1').astype(np.float32) X_val = pd.concat([df_x_val_0, df_x_val_1], axis=1, sort=False) del df_x_val_0, df_x_val_1 - Y_train = pd.read_hdf(params['in'], 'y_train') - Y_test = pd.read_hdf(params['in'], 'y_test') - Y_val = 
pd.read_hdf(params['in'], 'y_val') + Y_train = pd.read_hdf(train_file, 'y_train') + Y_test = pd.read_hdf(train_file, 'y_test') + Y_val = pd.read_hdf(train_file, 'y_val') # assumes AUC is in the third column at index 2 # df_y = df['AUC'].astype('int') @@ -156,7 +160,6 @@ def load_data(params, seed): # assumes dataframe has already been scaled # scaler = StandardScaler() # df_x = scaler.fit_transform(df_x) - else: print ('expecting in file file suffix h5') sys.exit() @@ -168,27 +171,27 @@ def load_data(params, seed): return X_train, Y_train, X_val, Y_val, X_test, Y_test # start change # - if params['in'].endswith('h5') or params['in'].endswith('hdf5'): - print ('processing h5 in file {}'.format(params['in'])) + if train_file.endswith('h5') or train_file.endswith('hdf5'): + print ('processing h5 in file {}'.format(train_file)) - df_x_train_0 = pd.read_hdf(params['in'], 'x_train_0').astype(np.float32) - df_x_train_1 = pd.read_hdf(params['in'], 'x_train_1').astype(np.float32) + df_x_train_0 = pd.read_hdf(train_file, 'x_train_0').astype(np.float32) + df_x_train_1 = pd.read_hdf(train_file, 'x_train_1').astype(np.float32) X_train = pd.concat([df_x_train_0, df_x_train_1], axis=1, sort=False) del df_x_train_0, df_x_train_1 - df_x_test_0 = pd.read_hdf(params['in'], 'x_test_0').astype(np.float32) - df_x_test_1 = pd.read_hdf(params['in'], 'x_test_1').astype(np.float32) + df_x_test_0 = pd.read_hdf(train_file, 'x_test_0').astype(np.float32) + df_x_test_1 = pd.read_hdf(train_file, 'x_test_1').astype(np.float32) X_test = pd.concat([df_x_test_0, df_x_test_1], axis=1, sort=False) del df_x_test_0, df_x_test_1 - df_x_val_0 = pd.read_hdf(params['in'], 'x_val_0').astype(np.float32) - df_x_val_1 = pd.read_hdf(params['in'], 'x_val_1').astype(np.float32) + df_x_val_0 = pd.read_hdf(train_file, 'x_val_0').astype(np.float32) + df_x_val_1 = pd.read_hdf(train_file, 'x_val_1').astype(np.float32) X_val = pd.concat([df_x_val_0, df_x_val_1], axis=1, sort=False) del df_x_val_0, df_x_val_1 - Y_train = pd.read_hdf(params['in'], 'y_train') - Y_test = pd.read_hdf(params['in'], 'y_test') - Y_val = pd.read_hdf(params['in'], 'y_val') + Y_train = pd.read_hdf(train_file, 'y_train') + Y_test = pd.read_hdf(train_file, 'y_test') + Y_val = pd.read_hdf(train_file, 'y_val') # assumes AUC is in the third column at index 2 # df_y = df['AUC'].astype('int') diff --git a/Pilot1/Attn1/attn_default_model.txt b/Pilot1/Attn1/attn_default_model.txt index 5e95a72e..0d2ef410 100644 --- a/Pilot1/Attn1/attn_default_model.txt +++ b/Pilot1/Attn1/attn_default_model.txt @@ -1,4 +1,6 @@ [Global_Params] +data_url = 'ftp://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/' +in='top_21_1fold_001.h5' model_name='attn' dense=[2000, 600] batch_size=32 @@ -23,7 +25,6 @@ reduce_lr=True feature_subsample=0 nb_classes=2 save_dir='./save/001/' -in='top_21_1fold_001.h5' [Monitor_Params] solr_root='' From e9c29f956fa091d35c2fbe1ca962b568d673a975 Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Mon, 23 Mar 2020 07:42:17 -0500 Subject: [PATCH 167/331] o CANDLE compliant Model of COVID-19 --- Models/ADRP/README.md | 144 ++++++++ Models/ADRP/adrp.py | 178 ++++++++++ Models/ADRP/adrp_baseline_keras2.py | 487 ++++++++++++++++++++++++++++ Models/ADRP/adrp_default_model.txt | 31 ++ 4 files changed, 840 insertions(+) create mode 100644 Models/ADRP/README.md create mode 100644 Models/ADRP/adrp.py create mode 100644 Models/ADRP/adrp_baseline_keras2.py create mode 100644 Models/ADRP/adrp_default_model.txt diff --git a/Models/ADRP/README.md b/Models/ADRP/README.md new file 
mode 100644 index 00000000..9cfc536d --- /dev/null +++ b/Models/ADRP/README.md @@ -0,0 +1,144 @@ +The Pilot1 ADRP Benchmark loads a csv file + +Benchmark auto downloads the file below: +http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/ (~500MB) + +## Sample run: + +``` +$ python adrp_baseline_keras2.py +Using TensorFlow backend. +Importing candle utils for keras +Configuration file: /home/jain/CANDLE/Benchmarks/Models/ADRP/adrp_default_model.txt +{'activation': 'relu', + 'batch_normalization': False, + 'batch_size': 32, + 'data_url': 'ftp://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/', + 'dense': [250, 125, 60, 30], + 'drop': 0.1, + 'early_stop': True, + 'epochs': 1, + 'epsilon_std': 1.0, + 'feature_subsample': 0, + 'in': 'adrp-p1.csv', + 'initialization': 'glorot_uniform', + 'latent_dim': 2, + 'learning_rate': 0.0001, + 'loss': 'mean_squared_error', + 'model_name': 'adrp', + 'momentum': 0.9, + 'nb_classes': 2, + 'optimizer': 'sgd', + 'reduce_lr': True, + 'rng_seed': 2017, + 'save_dir': './save/001/', + 'save_path': 'candle_save', + 'scaling': 'minmax', + 'solr_root': '', + 'timeout': 3600, + 'use_cp': False, + 'validation_split': 0.1} +Params: +{'activation': 'relu', + 'batch_normalization': False, + 'batch_size': 32, + 'data_url': 'ftp://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/', + 'datatype': , + 'dense': [250, 125, 60, 30], + 'drop': 0.1, + 'early_stop': True, + 'epochs': 1, + 'epsilon_std': 1.0, + 'experiment_id': 'EXP000', + 'feature_subsample': 0, + 'gpus': [], + 'in': 'adrp-p1.csv', + 'initialization': 'glorot_uniform', + 'latent_dim': 2, + 'learning_rate': 0.0001, + 'logfile': None, + 'loss': 'mean_squared_error', + 'model_name': 'adrp', + 'momentum': 0.9, + 'nb_classes': 2, + 'optimizer': 'sgd', + 'output_dir': '/home/jain/CANDLE/Benchmarks/Models/ADRP/Output/EXP000/RUN000', + 'profiling': False, + 'reduce_lr': True, + 'residual': False, + 'rng_seed': 2017, + 'run_id': 'RUN000', + 'save_dir': './save/001/', + 'save_path': 'candle_save', + 'scaling': 'minmax', + 'shuffle': False, + 'solr_root': '', + 'timeout': 3600, + 'train_bool': True, + 'tsne': False, + 'use_cp': False, + 'use_tb': False, + 'validation_split': 0.1, + 'verbose': None, + 'warmup_lr': False} +WARNING:tensorflow:From /home/jain/CANDLE/Benchmarks/common/keras_utils.py:51: The name tf.set_random_seed is deprecated. Please use tf.compat.v1.set_random_seed instead. 
+ +Params: {'data_url': 'ftp://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/', 'in': 'adrp-p1.csv', 'model_name': 'adrp', 'dense': [250, 125, 60, 30], 'batch_size': 32, 'epochs': 1, 'activation': 'relu', 'loss': 'mean_squared_error', 'optimizer': 'sgd', 'drop': 0.1, 'learning_rate': 0.0001, 'momentum': 0.9, 'scaling': 'minmax', 'validation_split': 0.1, 'epsilon_std': 1.0, 'rng_seed': 2017, 'initialization': 'glorot_uniform', 'latent_dim': 2, 'batch_normalization': False, 'save_path': 'candle_save', 'use_cp': False, 'early_stop': True, 'reduce_lr': True, 'feature_subsample': 0, 'nb_classes': 2, 'save_dir': './save/001/', 'solr_root': '', 'timeout': 3600, 'verbose': None, 'logfile': None, 'train_bool': True, 'experiment_id': 'EXP000', 'run_id': 'RUN000', 'shuffle': False, 'gpus': [], 'profiling': False, 'residual': False, 'warmup_lr': False, 'use_tb': False, 'tsne': False, 'datatype': , 'output_dir': '/home/jain/CANDLE/Benchmarks/Models/ADRP/Output/EXP000/RUN000'} +processing csv in file adrp-p1.csv +PL= 1614 +X_train shape: (27447, 1613) +X_test shape: (6862, 1613) +Y_train shape: (27447,) +Y_test shape: (6862,) +WARNING:tensorflow:From /home/jain/.local/lib/python3.7/site-packages/tensorflow_core/python/ops/resource_variable_ops.py:1630: calling BaseResourceVariable.__init__ (from tensorflow.python.ops.resource_variable_ops) with constraint is deprecated and will be removed in a future version. +Instructions for updating: +If using Keras pass *_constraint arguments to layers. +Model: "model_1" +_________________________________________________________________ +Layer (type) Output Shape Param # +================================================================= +input_1 (InputLayer) (None, 1613) 0 +_________________________________________________________________ +dense_1 (Dense) (None, 250) 403500 +_________________________________________________________________ +dropout_1 (Dropout) (None, 250) 0 +_________________________________________________________________ +dense_2 (Dense) (None, 125) 31375 +_________________________________________________________________ +dropout_2 (Dropout) (None, 125) 0 +_________________________________________________________________ +dense_3 (Dense) (None, 60) 7560 +_________________________________________________________________ +dropout_3 (Dropout) (None, 60) 0 +_________________________________________________________________ +dense_4 (Dense) (None, 30) 1830 +_________________________________________________________________ +dropout_4 (Dropout) (None, 30) 0 +_________________________________________________________________ +dense_5 (Dense) (None, 1) 31 +================================================================= +Total params: 444,296 +Trainable params: 444,296 +Non-trainable params: 0 +_________________________________________________________________ +/home/jain/.local/lib/python3.7/site-packages/keras/callbacks/callbacks.py:998: UserWarning: `epsilon` argument is deprecated and will be removed, use `min_delta` instead. 
+ warnings.warn('`epsilon` argument is deprecated and ' +2020-03-23 07:27:52.405136: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1 +2020-03-23 07:27:52.407468: E tensorflow/stream_executor/cuda/cuda_driver.cc:318] failed call to cuInit: CUDA_ERROR_UNKNOWN: unknown error +2020-03-23 07:27:52.407497: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (jain): /proc/driver/nvidia/version does not exist +2020-03-23 07:27:52.407740: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 +2020-03-23 07:27:52.417402: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2808000000 Hz +2020-03-23 07:27:52.417634: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x3308680 initialized for platform Host (this does not guarantee that XLA will be used). Devices: +2020-03-23 07:27:52.417654: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Host, Default Version +WARNING:tensorflow:From /home/jain/.local/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:422: The name tf.global_variables is deprecated. Please use tf.compat.v1.global_variables instead. + +Train on 27447 samples, validate on 6862 samples +Epoch 1/1 +27447/27447 [==============================] - 4s 148us/step - loss: 3.4695 - mae: 1.3269 - r2: -2.1720 - val_loss: 1.2343 - val_mae: 0.9235 - val_r2: -0.1880 + +Epoch 00001: val_loss improved from inf to 1.23431, saving model to agg_adrp.autosave.model.h5 +[1.2343122459159792, 0.9235042333602905, -0.18803702294826508] +dict_keys(['val_loss', 'val_mae', 'val_r2', 'loss', 'mae', 'r2', 'lr']) +Test val_loss: 1.2343122459159792 +Test val_mae: 0.9235042333602905 +``` diff --git a/Models/ADRP/adrp.py b/Models/ADRP/adrp.py new file mode 100644 index 00000000..73054c78 --- /dev/null +++ b/Models/ADRP/adrp.py @@ -0,0 +1,178 @@ +from __future__ import print_function + +import os +import sys +import logging + +import pandas as pd +import numpy as np + +from sklearn.metrics import mean_squared_error +from sklearn.metrics import r2_score +from scipy.stats.stats import pearsonr + +from sklearn.model_selection import train_test_split +from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error +from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler + + +file_path = os.path.dirname(os.path.realpath(__file__)) +# lib_path = os.path.abspath(os.path.join(file_path, '..')) +# sys.path.append(lib_path) +lib_path2 = os.path.abspath(os.path.join(file_path, "..", "..", "common")) +sys.path.append(lib_path2) + +import candle + +logger = logging.getLogger(__name__) +candle.set_parallelism_threads() + +additional_definitions = [ + {"name": "latent_dim", "action": "store", "type": int, "help": "latent dimensions"}, + { + "name": "residual", + "type": candle.str2bool, + "default": False, + "help": "add skip connections to the layers", + }, + { + "name": "reduce_lr", + "type": candle.str2bool, + "default": False, + "help": "reduce learning rate on plateau", + }, + { + "name": "warmup_lr", + "type": candle.str2bool, + "default": False, + "help": "gradually increase learning rate on start", + }, + {"name": "base_lr", "type": float, "help": "base learning rate"}, + { + "name": "epsilon_std", + "type": float, + "help": "epsilon std for sampling latent noise", + }, + { + "name": "use_cp", + "type": 
candle.str2bool, + "default": False, + "help": "checkpoint models with best val_loss", + }, + # {'name':'shuffle', + #'type': candle.str2bool, + #'default': False, + #'help':'shuffle data'}, + { + "name": "use_tb", + "type": candle.str2bool, + "default": False, + "help": "use tensorboard", + }, + { + "name": "tsne", + "type": candle.str2bool, + "default": False, + "help": "generate tsne plot of the latent representation", + }, +] + +required = [ + "activation", + "batch_size", + "dense", + "drop", + "epochs", + "initialization", + "learning_rate", + "loss", + "optimizer", + "rng_seed", + "scaling", + "validation_split", + "latent_dim", + "batch_normalization", + "epsilon_std", + "solr_root", + "timeout", +] + + +class BenchmarkAdrp(candle.Benchmark): + def set_locals(self): + """Functionality to set variables specific for the benchmark + - required: set of required parameters for the benchmark. + - additional_definitions: list of dictionaries describing the additional parameters for the + benchmark. + """ + + if required is not None: + self.required = set(required) + if additional_definitions is not None: + self.additional_definitions = additional_definitions + + +def extension_from_parameters(params, framework=""): + """Construct string for saving model with annotation of parameters""" + ext = framework + for i, n in enumerate(params["dense"]): + if n: + ext += ".D{}={}".format(i + 1, n) + ext += ".A={}".format(params["activation"]) + ext += ".B={}".format(params["batch_size"]) + ext += ".E={}".format(params["epochs"]) + ext += ".L={}".format(params["latent_dim"]) + ext += ".LR={}".format(params["learning_rate"]) + ext += ".S={}".format(params["scaling"]) + + if params["epsilon_std"] != 1.0: + ext += ".EPS={}".format(params["epsilon_std"]) + if params["drop"]: + ext += ".DR={}".format(params["drop"]) + if params["batch_normalization"]: + ext += ".BN" + if params["warmup_lr"]: + ext += ".WU_LR" + if params["reduce_lr"]: + ext += ".Re_LR" + if params["residual"]: + ext += ".Res" + + return ext + + +def load_data(params, seed): + + # start change # + if params["in"].endswith("csv") or params["in"].endswith("csv"): + print("processing csv in file {}".format(params["in"])) + + url = params["data_url"] + file_train = params["in"] + train_file = candle.get_file( + file_train, url + file_train, cache_subdir="Pilot1" + ) + df = (pd.read_csv(train_file, skiprows=1).values).astype("float32") + + PL = df.shape[1] + PL -= 1 + print("PL=", PL) + + PS = PL - 1 + + df_y = df[:, 0].astype("float32") + df_x = df[:, 1:PL].astype(np.float32) + + df_y.shape + df_x.shape + scaler = StandardScaler() + df_x = scaler.fit_transform(df_x) + + X_train, X_test, Y_train, Y_test = train_test_split( + df_x, df_y, test_size=0.20, random_state=42 + ) + else: + print("expecting in file file suffix csv") + sys.exit() + + return X_train, Y_train, X_test, Y_test, PS diff --git a/Models/ADRP/adrp_baseline_keras2.py b/Models/ADRP/adrp_baseline_keras2.py new file mode 100644 index 00000000..bc5e61d5 --- /dev/null +++ b/Models/ADRP/adrp_baseline_keras2.py @@ -0,0 +1,487 @@ +from __future__ import print_function + +import itertools +import pandas as pd +import numpy as np +import os +import sys +import gzip +import argparse +import sklearn + +import matplotlib + +matplotlib.use("Agg") + +import matplotlib.pyplot as plt + +import tensorflow as tf + +import keras as ke +from keras import backend as K + +from keras.layers import Input, Dense, Dropout, Activation, BatchNormalization +from keras.optimizers import SGD, Adam, RMSprop, 
Adadelta +from keras.models import Sequential, Model, model_from_json, model_from_yaml +from keras.utils import np_utils, multi_gpu_model + +from keras.callbacks import ( + Callback, + ModelCheckpoint, + CSVLogger, + ReduceLROnPlateau, + EarlyStopping, + TensorBoard, +) + +from sklearn.utils.class_weight import compute_class_weight +from sklearn.model_selection import train_test_split +from sklearn.metrics import ( + r2_score, + mean_squared_error, + mean_absolute_error, + roc_auc_score, + confusion_matrix, + balanced_accuracy_score, + classification_report, +) +from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler +from sklearn.metrics import ( + recall_score, + auc, + roc_curve, + f1_score, + precision_recall_curve, +) + +import adrp +import candle + +np.set_printoptions(precision=4) + + +def r2(y_true, y_pred): + SS_res = K.sum(K.square(y_true - y_pred)) + SS_tot = K.sum(K.square(y_true - K.mean(y_true))) + return 1 - SS_res / (SS_tot + K.epsilon()) + + +def tf_auc(y_true, y_pred): + auc = tf.metrics.auc(y_true, y_pred)[1] + K.get_session().run(tf.local_variables_initializer()) + return auc + + +# from sklearn.metrics import roc_auc_score +# import tensorflow as tf + + +def auroc(y_true, y_pred): + score = tf.py_func( + lambda y_true, y_pred: roc_auc_score( + y_true, y_pred, average="macro", sample_weight=None + ).astype("float32"), + [y_true, y_pred], + "float32", + stateful=False, + name="sklearnAUC", + ) + return score + + +def covariance(x, y): + return K.mean(x * y) - K.mean(x) * K.mean(y) + + +def corr(y_true, y_pred): + cov = covariance(y_true, y_pred) + var1 = covariance(y_true, y_true) + var2 = covariance(y_pred, y_pred) + return cov / (K.sqrt(var1 * var2) + K.epsilon()) + + +def xent(y_true, y_pred): + return binary_crossentropy(y_true, y_pred) + + +def mse(y_true, y_pred): + return mean_squared_error(y_true, y_pred) + + +class MetricHistory(Callback): + def on_epoch_begin(self, epoch, logs=None): + print("\n") + + def on_epoch_end(self, epoch, logs=None): + y_pred = self.model.predict(self.validation_data[0]) + r2 = r2_score(self.validation_data[1], y_pred) + corr, _ = pearsonr(self.validation_data[1].flatten(), y_pred.flatten()) + print("\nval_r2:", r2) + print(y_pred.shape) + print("\nval_corr:", corr, "val_r2:", r2) + print("\n") + + +class LoggingCallback(Callback): + def __init__(self, print_fcn=print): + Callback.__init__(self) + self.print_fcn = print_fcn + + def on_epoch_end(self, epoch, logs={}): + msg = "[Epoch: %i] %s" % ( + epoch, + ", ".join("%s: %f" % (k, v) for k, v in sorted(logs.items())), + ) + self.print_fcn(msg) + + +def build_type_classifier(x_train, y_train, x_test, y_test): + y_train = np.argmax(y_train, axis=1) + y_test = np.argmax(y_test, axis=1) + from xgboost import XGBClassifier + + clf = XGBClassifier(max_depth=6, n_estimators=100) + clf.fit( + x_train, y_train, eval_set=[(x_train, y_train), (x_test, y_test)], verbose=False + ) + y_pred = clf.predict(x_test) + acc = accuracy_score(y_test, y_pred) + print(acc) + return clf + + +def initialize_parameters(default_model="adrp_default_model.txt"): + + # Build benchmark object + adrpBmk = adrp.BenchmarkAdrp( + adrp.file_path, + default_model, + "keras", + prog="adrp_baseline", + desc="Multi-task (DNN) for data extraction from clinical reports - Pilot 3 Benchmark 1", + ) + + # Initialize parameters + gParameters = candle.finalize_parameters(adrpBmk) + # adrp.logger.info('Params: {}'.format(gParameters)) + + return gParameters + + +def save_cache( + cache_file, x_train, y_train, 
x_val, y_val, x_test, y_test, x_labels, y_labels +): + with h5py.File(cache_file, "w") as hf: + hf.create_dataset("x_train", data=x_train) + hf.create_dataset("y_train", data=y_train) + hf.create_dataset("x_val", data=x_val) + hf.create_dataset("y_val", data=y_val) + hf.create_dataset("x_test", data=x_test) + hf.create_dataset("y_test", data=y_test) + hf.create_dataset( + "x_labels", + (len(x_labels), 1), + "S100", + data=[x.encode("ascii", "ignore") for x in x_labels], + ) + hf.create_dataset( + "y_labels", + (len(y_labels), 1), + "S100", + data=[x.encode("ascii", "ignore") for x in y_labels], + ) + + +def load_cache(cache_file): + with h5py.File(cache_file, "r") as hf: + x_train = hf["x_train"][:] + y_train = hf["y_train"][:] + x_val = hf["x_val"][:] + y_val = hf["y_val"][:] + x_test = hf["x_test"][:] + y_test = hf["y_test"][:] + x_labels = [x[0].decode("unicode_escape") for x in hf["x_labels"][:]] + y_labels = [x[0].decode("unicode_escape") for x in hf["y_labels"][:]] + return x_train, y_train, x_val, y_val, x_test, y_test, x_labels, y_labels + + +def run(params): + args = candle.ArgumentStruct(**params) + seed = args.rng_seed + candle.set_seed(seed) + + # Construct extension to save model + ext = adrp.extension_from_parameters(params, ".keras") + candle.verify_path(params["save_path"]) + prefix = "{}{}".format(params["save_path"], ext) + logfile = params["logfile"] if params["logfile"] else prefix + ".log" + candle.set_up_logger(logfile, adrp.logger, params["verbose"]) + adrp.logger.info("Params: {}".format(params)) + + # Get default parameters for initialization and optimizer functions + keras_defaults = candle.keras_default_config() + + ## + X_train, Y_train, X_test, Y_test, PS = adrp.load_data(params, seed) + + print("X_train shape:", X_train.shape) + print("X_test shape:", X_test.shape) + + print("Y_train shape:", Y_train.shape) + print("Y_test shape:", Y_test.shape) + + # Initialize weights and learning rule + initializer_weights = candle.build_initializer( + params["initialization"], keras_defaults, seed + ) + initializer_bias = candle.build_initializer("constant", keras_defaults, 0.0) + + activation = params["activation"] + # TODO: set output_dim + output_dim = 1 + + # TODO: Use dense_layers for creating inputs/outputs + dense_layers = params["dense"] + + inputs = Input(shape=(PS,)) + + if dense_layers != None: + if type(dense_layers) != list: + dense_layers = list(dense_layers) + for i, l in enumerate(dense_layers): + if i == 0: + x = Dense( + l, + activation=activation, + kernel_initializer=initializer_weights, + bias_initializer=initializer_bias, + )(inputs) + else: + x = Dense( + l, + activation=activation, + kernel_initializer=initializer_weights, + bias_initializer=initializer_bias, + )(x) + if params["drop"]: + x = Dropout(params["drop"])(x) + output = Dense( + output_dim, + activation=activation, + kernel_initializer=initializer_weights, + bias_initializer=initializer_bias, + )(x) + else: + output = Dense( + output_dim, + activation=activation, + kernel_initializer=initializer_weights, + bias_initializer=initializer_bias, + )(inputs) + + # x = Dense(250, activation=ac)(inputs) + + # x = Dropout(DR)(x) + # x = Dense(125, activation=ac)(x) + # x = Dropout(DR)(x) + # x = Dense(60, activation=ac)(x) + # x = Dropout(DR)(x) + # x = Dense(30, activation=ac)(x) + # x = Dropout(DR)(x) + # outputs = Dense(1, activation=ac)(x) + + model = Model(inputs=inputs, outputs=output) + + model.summary() + + kerasDefaults = candle.keras_default_config() + if params["momentum"]: + 
kerasDefaults["momentum_sgd"] = params["momentum"] + + optimizer = candle.build_optimizer( + params["optimizer"], params["learning_rate"], kerasDefaults + ) + + model.compile( + loss=params["loss"], optimizer=optimizer, metrics=["mae", r2], + ) + + # set up a bunch of callbacks to do work during model training.. + + checkpointer = ModelCheckpoint( + filepath="agg_adrp.autosave.model.h5", + verbose=1, + save_weights_only=False, + save_best_only=True, + ) + csv_logger = CSVLogger("agg_adrp.training.log") + reduce_lr = ReduceLROnPlateau( + monitor="val_loss", + factor=0.75, + patience=20, + verbose=1, + mode="auto", + epsilon=0.0001, + cooldown=3, + min_lr=0.000000001, + ) + early_stop = EarlyStopping(monitor="val_loss", patience=100, verbose=1, mode="auto") + + # history = parallel_model.fit(X_train, Y_train, + epochs = params["epochs"] + batch_size = params["batch_size"] + + history = model.fit( + X_train, + Y_train, + batch_size=batch_size, + epochs=epochs, + verbose=1, + validation_data=(X_test, Y_test), + callbacks=[checkpointer, csv_logger, reduce_lr, early_stop], + ) + + score = model.evaluate(X_test, Y_test, verbose=0) + + print(score) + + print(history.history.keys()) + + # see big fuction below, creates plots etc. + # TODO: Break post_process into multiple functions + post_process(params, X_train, X_test, Y_test, score, history, model) + + adrp.logger.handlers = [] + + return history + + +def post_process(params, X_train, X_test, Y_test, score, history, model): + + # summarize history for MAE + plt.plot(history.history["mae"]) + plt.plot(history.history["val_mae"]) + plt.title("Model Mean Absolute Error") + plt.ylabel("mae") + plt.xlabel("epoch") + plt.legend(["train", "test"], loc="upper left") + + plt.savefig("agg_adrp.mae.png", bbox_inches="tight") + plt.savefig("agg_adrp.mae.pdf", bbox_inches="tight") + + plt.close() + + # summarize history for loss + plt.plot(history.history["loss"]) + plt.plot(history.history["val_loss"]) + plt.title("Model Loss") + plt.ylabel("loss") + plt.xlabel("epoch") + plt.legend(["train", "test"], loc="upper left") + + plt.savefig("agg_adrp.loss.png", bbox_inches="tight") + plt.savefig("agg_adrp.loss.pdf", bbox_inches="tight") + + plt.close() + + print("Test val_loss:", score[0]) + print("Test val_mae:", score[1]) + + exit() + + # serialize model to JSON + model_json = model.to_json() + with open("agg_adrp.model.json", "w") as json_file: + json_file.write(model_json) + + # serialize model to YAML + model_yaml = model.to_yaml() + with open("agg_adrp.model.yaml", "w") as yaml_file: + yaml_file.write(model_yaml) + + # serialize weights to HDF5 + model.save_weights("agg_adrp.model.h5") + print("Saved model to disk") + + exit() + + # load json and create model + json_file = open("agg_adrp.model.json", "r") + loaded_model_json = json_file.read() + json_file.close() + loaded_model_json = model_from_json(loaded_model_json) + + # load yaml and create model + yaml_file = open("agg_adrp.model.yaml", "r") + loaded_model_yaml = yaml_file.read() + yaml_file.close() + loaded_model_yaml = model_from_yaml(loaded_model_yaml) + + # load weights into new model + loaded_model_json.load_weights("agg_adrp.model.h5") + print("Loaded json model from disk") + + # evaluate json loaded model on test data + loaded_model_json.compile( + loss="binary_crossentropy", optimizer="SGD", metrics=["mean_absolute_error"] + ) + score_json = loaded_model_json.evaluate(X_test, Y_test, verbose=0) + + print("json Validation loss:", score_json[0]) + print("json Validation mae:", score_json[1]) + + 
print("json %s: %.2f%%" % (loaded_model_json.metrics_names[1], score_json[1] * 100)) + + # load weights into new model + loaded_model_yaml.load_weights("agg_adrp.model.h5") + print("Loaded yaml model from disk") + + # evaluate loaded model on test data + loaded_model_yaml.compile( + loss="binary_crossentropy", optimizer="SGD", metrics=["mean_absolute_error"] + ) + score_yaml = loaded_model_yaml.evaluate(X_test, Y_test, verbose=0) + + print("yaml Validation loss:", score_yaml[0]) + print("yaml Validation mae:", score_yaml[1]) + + print("yaml %s: %.2f%%" % (loaded_model_yaml.metrics_names[1], score_yaml[1] * 100)) + + # predict using loaded yaml model on test and training data + + predict_yaml_train = loaded_model_yaml.predict(X_train) + + predict_yaml_test = loaded_model_yaml.predict(X_test) + + print("Yaml_train_shape:", predict_yaml_train.shape) + print("Yaml_test_shape:", predict_yaml_test.shape) + + predict_yaml_train_classes = np.argmax(predict_yaml_train, axis=1) + predict_yaml_test_classes = np.argmax(predict_yaml_test, axis=1) + + np.savetxt("predict_yaml_train.csv", predict_yaml_train, delimiter=",", fmt="%.3f") + np.savetxt("predict_yaml_test.csv", predict_yaml_test, delimiter=",", fmt="%.3f") + + np.savetxt( + "predict_yaml_train_classes.csv", + predict_yaml_train_classes, + delimiter=",", + fmt="%d", + ) + np.savetxt( + "predict_yaml_test_classes.csv", + predict_yaml_test_classes, + delimiter=",", + fmt="%d", + ) + + +def main(): + params = initialize_parameters() + run(params) + + +if __name__ == "__main__": + main() + if K.backend() == "tensorflow": + K.clear_session() diff --git a/Models/ADRP/adrp_default_model.txt b/Models/ADRP/adrp_default_model.txt new file mode 100644 index 00000000..cef0160b --- /dev/null +++ b/Models/ADRP/adrp_default_model.txt @@ -0,0 +1,31 @@ +[Global_Params] +data_url = 'ftp://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/' +in='adrp-p1.csv' +model_name='adrp' +dense=[250, 125, 60, 30] +batch_size=32 +epochs=1 +activation='relu' +loss='mean_squared_error' +optimizer='sgd' +drop=0.1 +learning_rate=0.0001 +momentum=0.9 +scaling='minmax' +validation_split=0.1 +epsilon_std=1.0 +rng_seed=2017 +initialization='glorot_uniform' +latent_dim=2 +batch_normalization=False +save_path='candle_save' +use_cp=False +early_stop=True +reduce_lr=True +feature_subsample=0 +nb_classes=2 +save_dir='./save/001/' + +[Monitor_Params] +solr_root='' +timeout=3600 From f621553f0c0341a8096af2b0728cf80a842f8ab2 Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Mon, 23 Mar 2020 14:39:07 -0500 Subject: [PATCH 168/331] o Move from Models to example folder o Use save_path --- {Models => examples}/ADRP/README.md | 35 +++++++------- {Models => examples}/ADRP/adrp.py | 0 .../ADRP/adrp_baseline_keras2.py | 46 ++++++++++++------- .../ADRP/adrp_default_model.txt | 3 +- 4 files changed, 47 insertions(+), 37 deletions(-) rename {Models => examples}/ADRP/README.md (77%) rename {Models => examples}/ADRP/adrp.py (100%) rename {Models => examples}/ADRP/adrp_baseline_keras2.py (90%) rename {Models => examples}/ADRP/adrp_default_model.txt (91%) diff --git a/Models/ADRP/README.md b/examples/ADRP/README.md similarity index 77% rename from Models/ADRP/README.md rename to examples/ADRP/README.md index 9cfc536d..9d687e4d 100644 --- a/Models/ADRP/README.md +++ b/examples/ADRP/README.md @@ -6,10 +6,10 @@ http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/ (~500MB) ## Sample run: ``` -$ python adrp_baseline_keras2.py +$ python adrp_baseline_keras2.py Using TensorFlow backend. 
Importing candle utils for keras -Configuration file: /home/jain/CANDLE/Benchmarks/Models/ADRP/adrp_default_model.txt +Configuration file: /home/jain/CANDLE/fork/Benchmarks/examples/ADRP/adrp_default_model.txt {'activation': 'relu', 'batch_normalization': False, 'batch_size': 32, @@ -31,8 +31,7 @@ Configuration file: /home/jain/CANDLE/Benchmarks/Models/ADRP/adrp_default_model 'optimizer': 'sgd', 'reduce_lr': True, 'rng_seed': 2017, - 'save_dir': './save/001/', - 'save_path': 'candle_save', + 'save_path': './001/', 'scaling': 'minmax', 'solr_root': '', 'timeout': 3600, @@ -62,14 +61,13 @@ Params: 'momentum': 0.9, 'nb_classes': 2, 'optimizer': 'sgd', - 'output_dir': '/home/jain/CANDLE/Benchmarks/Models/ADRP/Output/EXP000/RUN000', + 'output_dir': '/home/jain/CANDLE/fork/Benchmarks/examples/ADRP/Output/EXP000/RUN000', 'profiling': False, 'reduce_lr': True, 'residual': False, 'rng_seed': 2017, 'run_id': 'RUN000', - 'save_dir': './save/001/', - 'save_path': 'candle_save', + 'save_path': './001/', 'scaling': 'minmax', 'shuffle': False, 'solr_root': '', @@ -81,9 +79,9 @@ Params: 'validation_split': 0.1, 'verbose': None, 'warmup_lr': False} -WARNING:tensorflow:From /home/jain/CANDLE/Benchmarks/common/keras_utils.py:51: The name tf.set_random_seed is deprecated. Please use tf.compat.v1.set_random_seed instead. +WARNING:tensorflow:From /home/jain/CANDLE/fork/Benchmarks/common/keras_utils.py:51: The name tf.set_random_seed is deprecated. Please use tf.compat.v1.set_random_seed instead. -Params: {'data_url': 'ftp://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/', 'in': 'adrp-p1.csv', 'model_name': 'adrp', 'dense': [250, 125, 60, 30], 'batch_size': 32, 'epochs': 1, 'activation': 'relu', 'loss': 'mean_squared_error', 'optimizer': 'sgd', 'drop': 0.1, 'learning_rate': 0.0001, 'momentum': 0.9, 'scaling': 'minmax', 'validation_split': 0.1, 'epsilon_std': 1.0, 'rng_seed': 2017, 'initialization': 'glorot_uniform', 'latent_dim': 2, 'batch_normalization': False, 'save_path': 'candle_save', 'use_cp': False, 'early_stop': True, 'reduce_lr': True, 'feature_subsample': 0, 'nb_classes': 2, 'save_dir': './save/001/', 'solr_root': '', 'timeout': 3600, 'verbose': None, 'logfile': None, 'train_bool': True, 'experiment_id': 'EXP000', 'run_id': 'RUN000', 'shuffle': False, 'gpus': [], 'profiling': False, 'residual': False, 'warmup_lr': False, 'use_tb': False, 'tsne': False, 'datatype': , 'output_dir': '/home/jain/CANDLE/Benchmarks/Models/ADRP/Output/EXP000/RUN000'} +Params: {'data_url': 'ftp://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/', 'in': 'adrp-p1.csv', 'model_name': 'adrp', 'dense': [250, 125, 60, 30], 'batch_size': 32, 'epochs': 1, 'activation': 'relu', 'loss': 'mean_squared_error', 'optimizer': 'sgd', 'drop': 0.1, 'learning_rate': 0.0001, 'momentum': 0.9, 'scaling': 'minmax', 'validation_split': 0.1, 'epsilon_std': 1.0, 'rng_seed': 2017, 'initialization': 'glorot_uniform', 'latent_dim': 2, 'batch_normalization': False, 'save_path': './001/', 'use_cp': False, 'early_stop': True, 'reduce_lr': True, 'feature_subsample': 0, 'nb_classes': 2, 'solr_root': '', 'timeout': 3600, 'verbose': None, 'logfile': None, 'train_bool': True, 'experiment_id': 'EXP000', 'run_id': 'RUN000', 'shuffle': False, 'gpus': [], 'profiling': False, 'residual': False, 'warmup_lr': False, 'use_tb': False, 'tsne': False, 'datatype': , 'output_dir': '/home/jain/CANDLE/fork/Benchmarks/examples/ADRP/Output/EXP000/RUN000'} processing csv in file adrp-p1.csv PL= 1614 X_train shape: (27447, 1613) @@ -123,22 +121,23 @@ Non-trainable 
params: 0 _________________________________________________________________ /home/jain/.local/lib/python3.7/site-packages/keras/callbacks/callbacks.py:998: UserWarning: `epsilon` argument is deprecated and will be removed, use `min_delta` instead. warnings.warn('`epsilon` argument is deprecated and ' -2020-03-23 07:27:52.405136: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1 -2020-03-23 07:27:52.407468: E tensorflow/stream_executor/cuda/cuda_driver.cc:318] failed call to cuInit: CUDA_ERROR_UNKNOWN: unknown error -2020-03-23 07:27:52.407497: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (jain): /proc/driver/nvidia/version does not exist -2020-03-23 07:27:52.407740: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 -2020-03-23 07:27:52.417402: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2808000000 Hz -2020-03-23 07:27:52.417634: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x3308680 initialized for platform Host (this does not guarantee that XLA will be used). Devices: -2020-03-23 07:27:52.417654: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Host, Default Version +2020-03-23 14:36:20.461062: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1 +2020-03-23 14:36:20.463626: E tensorflow/stream_executor/cuda/cuda_driver.cc:318] failed call to cuInit: CUDA_ERROR_UNKNOWN: unknown error +2020-03-23 14:36:20.463720: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (jain): /proc/driver/nvidia/version does not exist +2020-03-23 14:36:20.464039: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 +2020-03-23 14:36:20.475490: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2808000000 Hz +2020-03-23 14:36:20.475685: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x2dab430 initialized for platform Host (this does not guarantee that XLA will be used). Devices: +2020-03-23 14:36:20.475708: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Host, Default Version WARNING:tensorflow:From /home/jain/.local/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:422: The name tf.global_variables is deprecated. Please use tf.compat.v1.global_variables instead. 
Train on 27447 samples, validate on 6862 samples Epoch 1/1 -27447/27447 [==============================] - 4s 148us/step - loss: 3.4695 - mae: 1.3269 - r2: -2.1720 - val_loss: 1.2343 - val_mae: 0.9235 - val_r2: -0.1880 +27447/27447 [==============================] - 5s 173us/step - loss: 3.4695 - mae: 1.3269 - r2: -2.1720 - val_loss: 1.2343 - val_mae: 0.9235 - val_r2: -0.1880 -Epoch 00001: val_loss improved from inf to 1.23431, saving model to agg_adrp.autosave.model.h5 +Epoch 00001: val_loss improved from inf to 1.23431, saving model to ./001/agg_adrp.autosave.model.h5 [1.2343122459159792, 0.9235042333602905, -0.18803702294826508] dict_keys(['val_loss', 'val_mae', 'val_r2', 'loss', 'mae', 'r2', 'lr']) +saving to path: ./001/ Test val_loss: 1.2343122459159792 Test val_mae: 0.9235042333602905 ``` diff --git a/Models/ADRP/adrp.py b/examples/ADRP/adrp.py similarity index 100% rename from Models/ADRP/adrp.py rename to examples/ADRP/adrp.py diff --git a/Models/ADRP/adrp_baseline_keras2.py b/examples/ADRP/adrp_baseline_keras2.py similarity index 90% rename from Models/ADRP/adrp_baseline_keras2.py rename to examples/ADRP/adrp_baseline_keras2.py index bc5e61d5..dc060a38 100644 --- a/Models/ADRP/adrp_baseline_keras2.py +++ b/examples/ADRP/adrp_baseline_keras2.py @@ -309,12 +309,12 @@ def run(params): # set up a bunch of callbacks to do work during model training.. checkpointer = ModelCheckpoint( - filepath="agg_adrp.autosave.model.h5", + filepath=params["save_path"] + "agg_adrp.autosave.model.h5", verbose=1, save_weights_only=False, save_best_only=True, ) - csv_logger = CSVLogger("agg_adrp.training.log") + csv_logger = CSVLogger(params["save_path"] + "agg_adrp.training.log") reduce_lr = ReduceLROnPlateau( monitor="val_loss", factor=0.75, @@ -357,6 +357,8 @@ def run(params): def post_process(params, X_train, X_test, Y_test, score, history, model): + save_path = params["save_path"] + print("saving to path: ", save_path) # summarize history for MAE plt.plot(history.history["mae"]) @@ -366,8 +368,8 @@ def post_process(params, X_train, X_test, Y_test, score, history, model): plt.xlabel("epoch") plt.legend(["train", "test"], loc="upper left") - plt.savefig("agg_adrp.mae.png", bbox_inches="tight") - plt.savefig("agg_adrp.mae.pdf", bbox_inches="tight") + plt.savefig(save_path + "agg_adrp.mae.png", bbox_inches="tight") + plt.savefig(save_path + "agg_adrp.mae.pdf", bbox_inches="tight") plt.close() @@ -379,8 +381,8 @@ def post_process(params, X_train, X_test, Y_test, score, history, model): plt.xlabel("epoch") plt.legend(["train", "test"], loc="upper left") - plt.savefig("agg_adrp.loss.png", bbox_inches="tight") - plt.savefig("agg_adrp.loss.pdf", bbox_inches="tight") + plt.savefig(save_path + "agg_adrp.loss.png", bbox_inches="tight") + plt.savefig(save_path + "agg_adrp.loss.pdf", bbox_inches="tight") plt.close() @@ -391,34 +393,34 @@ def post_process(params, X_train, X_test, Y_test, score, history, model): # serialize model to JSON model_json = model.to_json() - with open("agg_adrp.model.json", "w") as json_file: + with open(save_path + "agg_adrp.model.json", "w") as json_file: json_file.write(model_json) # serialize model to YAML model_yaml = model.to_yaml() - with open("agg_adrp.model.yaml", "w") as yaml_file: + with open(save_path + "agg_adrp.model.yaml", "w") as yaml_file: yaml_file.write(model_yaml) # serialize weights to HDF5 - model.save_weights("agg_adrp.model.h5") + model.save_weights(save_path + "agg_adrp.model.h5") print("Saved model to disk") exit() # load json and create model - json_file = 
open("agg_adrp.model.json", "r") + json_file = open(save_path + "agg_adrp.model.json", "r") loaded_model_json = json_file.read() json_file.close() loaded_model_json = model_from_json(loaded_model_json) # load yaml and create model - yaml_file = open("agg_adrp.model.yaml", "r") + yaml_file = open(save_path + "agg_adrp.model.yaml", "r") loaded_model_yaml = yaml_file.read() yaml_file.close() loaded_model_yaml = model_from_yaml(loaded_model_yaml) # load weights into new model - loaded_model_json.load_weights("agg_adrp.model.h5") + loaded_model_json.load_weights(save_path + "agg_adrp.model.h5") print("Loaded json model from disk") # evaluate json loaded model on test data @@ -433,7 +435,7 @@ def post_process(params, X_train, X_test, Y_test, score, history, model): print("json %s: %.2f%%" % (loaded_model_json.metrics_names[1], score_json[1] * 100)) # load weights into new model - loaded_model_yaml.load_weights("agg_adrp.model.h5") + loaded_model_yaml.load_weights(save_path + "agg_adrp.model.h5") print("Loaded yaml model from disk") # evaluate loaded model on test data @@ -459,17 +461,27 @@ def post_process(params, X_train, X_test, Y_test, score, history, model): predict_yaml_train_classes = np.argmax(predict_yaml_train, axis=1) predict_yaml_test_classes = np.argmax(predict_yaml_test, axis=1) - np.savetxt("predict_yaml_train.csv", predict_yaml_train, delimiter=",", fmt="%.3f") - np.savetxt("predict_yaml_test.csv", predict_yaml_test, delimiter=",", fmt="%.3f") + np.savetxt( + save_path + "predict_yaml_train.csv", + predict_yaml_train, + delimiter=",", + fmt="%.3f", + ) + np.savetxt( + save_path + "predict_yaml_test.csv", + predict_yaml_test, + delimiter=",", + fmt="%.3f", + ) np.savetxt( - "predict_yaml_train_classes.csv", + save_path + "predict_yaml_train_classes.csv", predict_yaml_train_classes, delimiter=",", fmt="%d", ) np.savetxt( - "predict_yaml_test_classes.csv", + save_path + "predict_yaml_test_classes.csv", predict_yaml_test_classes, delimiter=",", fmt="%d", diff --git a/Models/ADRP/adrp_default_model.txt b/examples/ADRP/adrp_default_model.txt similarity index 91% rename from Models/ADRP/adrp_default_model.txt rename to examples/ADRP/adrp_default_model.txt index cef0160b..39f23f70 100644 --- a/Models/ADRP/adrp_default_model.txt +++ b/examples/ADRP/adrp_default_model.txt @@ -18,13 +18,12 @@ rng_seed=2017 initialization='glorot_uniform' latent_dim=2 batch_normalization=False -save_path='candle_save' +save_path='./001/' use_cp=False early_stop=True reduce_lr=True feature_subsample=0 nb_classes=2 -save_dir='./save/001/' [Monitor_Params] solr_root='' From bb645758e690956b211ee6fb8a25f381e99d4177 Mon Sep 17 00:00:00 2001 From: Todd Young Date: Wed, 1 Apr 2020 09:44:20 -0400 Subject: [PATCH 169/331] Port linear darts This gives us the pieces necessary for handling linear networks in DARTS similar to the Uno benchmark. 
--- Pilot3/P3B5/darts/genotypes.py | 12 +- Pilot3/P3B5/darts/metrics/__init__.py | 2 + .../P3B5/darts/metrics/multitask_accuracy.py | 18 ++ Pilot3/P3B5/darts/metrics/multitask_loss.py | 21 ++ Pilot3/P3B5/darts/metrics/topk_accuracy.py | 27 ++ Pilot3/P3B5/darts/modules/linear/__init__.py | 0 .../P3B5/darts/modules/linear/linear_cell.py | 62 +++++ .../modules/linear/linear_mixed_layer.py | 41 +++ Pilot3/P3B5/darts/modules/linear_network.py | 254 ++++++++++++++++++ .../P3B5/darts/modules/operations/linear.py | 111 ++++++++ 10 files changed, 547 insertions(+), 1 deletion(-) create mode 100644 Pilot3/P3B5/darts/metrics/__init__.py create mode 100644 Pilot3/P3B5/darts/metrics/multitask_accuracy.py create mode 100644 Pilot3/P3B5/darts/metrics/multitask_loss.py create mode 100644 Pilot3/P3B5/darts/metrics/topk_accuracy.py create mode 100644 Pilot3/P3B5/darts/modules/linear/__init__.py create mode 100644 Pilot3/P3B5/darts/modules/linear/linear_cell.py create mode 100644 Pilot3/P3B5/darts/modules/linear/linear_mixed_layer.py create mode 100644 Pilot3/P3B5/darts/modules/linear_network.py create mode 100644 Pilot3/P3B5/darts/modules/operations/linear.py diff --git a/Pilot3/P3B5/darts/genotypes.py b/Pilot3/P3B5/darts/genotypes.py index caf874e7..e96681be 100644 --- a/Pilot3/P3B5/darts/genotypes.py +++ b/Pilot3/P3B5/darts/genotypes.py @@ -16,6 +16,16 @@ ] +LINEAR_PRIMITIVES = [ + 'linear_block', + 'skip_connect', + 'linear_conv', + 'linear_drop', + 'encoder', + 'none', +] + + AmoebaNet = Genotype( normal=[ ('avg_pool_3', 0), @@ -113,4 +123,4 @@ ('sep_conv_3', 2) ], reduce_concat=[2, 3, 4] -) \ No newline at end of file +) diff --git a/Pilot3/P3B5/darts/metrics/__init__.py b/Pilot3/P3B5/darts/metrics/__init__.py new file mode 100644 index 00000000..0e02fb01 --- /dev/null +++ b/Pilot3/P3B5/darts/metrics/__init__.py @@ -0,0 +1,2 @@ +from .topk_accuracy import accuracy_topk +from .topk_accuracy import multitask_accuracy_topk diff --git a/Pilot3/P3B5/darts/metrics/multitask_accuracy.py b/Pilot3/P3B5/darts/metrics/multitask_accuracy.py new file mode 100644 index 00000000..3f89599f --- /dev/null +++ b/Pilot3/P3B5/darts/metrics/multitask_accuracy.py @@ -0,0 +1,18 @@ +import darts.functional as F +from darts.api.metrics.average import MultitaskAveragingSupervisedMetric + + +class MultitaskAccuracy(MultitaskAveragingSupervisedMetric): + """ Multitask Classification accuracy """ + + def __init__(self, scope="train"): + super().__init__("accuracy", scope=scope) + + def _value_function(self, x_input, y_true, y_pred): + """ Return classification accuracy of input """ + return F.multitask_accuracy(y_true, y_pred) + + +def create(): + """ darts factory function """ + return MultitaskAccuracy() diff --git a/Pilot3/P3B5/darts/metrics/multitask_loss.py b/Pilot3/P3B5/darts/metrics/multitask_loss.py new file mode 100644 index 00000000..be2a3c54 --- /dev/null +++ b/Pilot3/P3B5/darts/metrics/multitask_loss.py @@ -0,0 +1,21 @@ +import torch.nn as nn + +import darts.functional as F +from darts.api.metrics.average import MultitaskAveragingSupervisedMetric + + +class MultitaskLoss(MultitaskAveragingSupervisedMetric): + """ Multitask Classification loss """ + + def __init__(self, scope="train", criterion=nn.CrossEntropyLoss()): + super().__init__("loss", scope=scope) + self.criterion = criterion + + def _value_function(self, x_input, y_true, y_pred, reduce=None): + """ Return loss value of input """ + return F.multitask_loss(y_true, y_pred, criterion=self.criterion, reduce=reduce) + + +def create(): + """ darts factory function 
""" + return MultitaskLoss() diff --git a/Pilot3/P3B5/darts/metrics/topk_accuracy.py b/Pilot3/P3B5/darts/metrics/topk_accuracy.py new file mode 100644 index 00000000..59b54142 --- /dev/null +++ b/Pilot3/P3B5/darts/metrics/topk_accuracy.py @@ -0,0 +1,27 @@ +import torch + + +def accuracy_topk(output, target, topk=(1,)): + """Computes the accuracy over the k top predictions for the specified values of k""" + with torch.no_grad(): + maxk = max(topk) + batch_size = target.size(0) + + _, pred = output.topk(maxk, 1, True, True) + pred = pred.t() + correct = pred.eq(target.view(1, -1).expand_as(pred)) + + res = [] + for k in topk: + correct_k = correct[:k].view(-1).float().sum(0, keepdim=True) + res.append(correct_k.mul_(100.0 / batch_size)) + return res + + +def multitask_accuracy_topk(output, target, topk=(1,)): + """Compute the topk accuracy for multitask problems""" + topk_accuracies = {} + for key, value in target.items(): + topk_accuracies[key] = accuracy_topk(output[key], target[key], topk) + + return topk_accuracies diff --git a/Pilot3/P3B5/darts/modules/linear/__init__.py b/Pilot3/P3B5/darts/modules/linear/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/Pilot3/P3B5/darts/modules/linear/linear_cell.py b/Pilot3/P3B5/darts/modules/linear/linear_cell.py new file mode 100644 index 00000000..4854d485 --- /dev/null +++ b/Pilot3/P3B5/darts/modules/linear/linear_cell.py @@ -0,0 +1,62 @@ +import torch +import torch.nn as nn + +from darts.api import Model +from darts.modules.linear.linear_mixed_layer import MixedLayer + + +class Cell(Model): + + def __init__(self, num_nodes, multiplier, cpp, cp, c, reduction, reduction_prev): + """ + :param steps: 4, number of layers inside a cell + :param multiplier: 4 + :param cpp: 48 + :param cp: 48 + :param c: 16 + :param reduction: indicates whether to reduce the output maps width + :param reduction_prev: when previous cell reduced width, s1_d = s0_d//2 + in order to keep same shape between s1 and s0, we adopt prep0 layer to + reduce the s0 width by half. 
+ """ + super(Cell, self).__init__() + + # indicating current cell is reduction or not + self.reduction = reduction + self.reduction_prev = reduction_prev + + # steps inside a cell + self.num_nodes = num_nodes # 4 + self.multiplier = multiplier # 4 + + self.layers = nn.ModuleList() + + for i in range(self.num_nodes): + # for each i inside cell, it connects with all previous output + # plus previous two cells' output + for j in range(2 + i): + # for reduction cell, it will reduce the heading 2 inputs only + stride = 2 if reduction and j < 2 else 1 + layer = MixedLayer(c, stride) + self.layers.append(layer) + + def forward(self, s0, s1, weights): + """ + :param s0: + :param s1: + :param weights: [14, 8] + :return: + """ + states = [s0, s1] + offset = 0 + # for each node, receive input from all previous intermediate nodes and s0, s1 + for i in range(self.num_nodes): # 4 + # [40, 16, 32, 32] + #s = sum(self.layers[offset + j](h, weights[offset + j]) for j, h in enumerate(states)) + offset += len(states) + # append one state since s is the elem-wise addition of all output + #states.append(s) + #print('node:',i, s.shape, self.reduction) + + # concat along dim=channel + return torch.cat(states[-self.multiplier:], dim=1) # 6 of [40, 16, 32, 32] diff --git a/Pilot3/P3B5/darts/modules/linear/linear_mixed_layer.py b/Pilot3/P3B5/darts/modules/linear/linear_mixed_layer.py new file mode 100644 index 00000000..61d6a5ab --- /dev/null +++ b/Pilot3/P3B5/darts/modules/linear/linear_mixed_layer.py @@ -0,0 +1,41 @@ +import torch +import torch.nn as nn + +from darts.api import Model +from darts.genotypes import LINEAR_PRIMITIVES +from darts.modules.operations.linear import OPS + + +class MixedLayer(Model): + """ A mixture of 8 unit types + + We use weights to aggregate these outputs while training. + and softmax to select the strongest edges while inference. 
+ """ + def __init__(self, c, stride): + super(MixedLayer, self).__init__() + self.reset(c, stride) + + def reset(self, c, stride): + self.layers = nn.ModuleList() + + for primitive in LINEAR_PRIMITIVES: + layer = OPS[primitive](c, stride, False) + + if 'pool' in primitive: + layer = nn.Sequential(layer, nn.BatchNorm1d(c, affine=False)) + + self.layers.append(layer) + + def forward(self, x, weights): + """ + Parameters + ---------- + x : torch.tensor + Data + + Weights : torch.tensor + alpha, [op_num:8], the output = sum of alpha * op(x) + """ + x = [w * layer(x) for w, layer in zip(weights, self.layers)] + return sum(x) diff --git a/Pilot3/P3B5/darts/modules/linear_network.py b/Pilot3/P3B5/darts/modules/linear_network.py new file mode 100644 index 00000000..3fb73c92 --- /dev/null +++ b/Pilot3/P3B5/darts/modules/linear_network.py @@ -0,0 +1,254 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +from darts.api import Model +from darts.modules.linear.linear_cell import Cell +from darts.modules.classifier import MultitaskClassifier +from darts.genotypes import LINEAR_PRIMITIVES, Genotype + + +class Hyperparameters: + c = 100 # 8 + num_nodes = 2 + num_cells = 3 + channel_multiplier = 2 + stem_channel_multiplier = 2 +# input_dim = 5270 + input_dim = 5270 +# gene_dim = 942 + intermediate_dim = 100 + + +class Network(Model): + """ Collection of cells """ + + def __init__(self, tasks, criterion, device='cpu', hyperparams=Hyperparameters()): + super(Network, self).__init__() + self.tasks = tasks + self.criterion = criterion + self.device = device + self.c = hyperparams.c + self.num_cells = hyperparams.num_cells + self.num_nodes = hyperparams.num_nodes + self.channel_multiplier = hyperparams.channel_multiplier + + # stem_multiplier is for stem network, + # and multiplier is for general cell + c_curr = hyperparams.stem_channel_multiplier * self.c # 3*16 + # stem network, convert 3 channel to c_curr + self.stem = nn.Sequential( + nn.Linear( + hyperparams.input_dim, hyperparams.intermediate_dim + ), + ).to(self.device) + + # c_curr means a factor of the output channels of current cell + # output channels = multiplier * c_curr + cpp, cp, c_curr = c_curr, c_curr, self.c + self.cells = nn.ModuleList() + reduction_prev = False + for i in range(hyperparams.num_cells): + + # for layer in the middle [1/3, 2/3], reduce via stride=2 + if i in [hyperparams.num_cells // 3, 2 * hyperparams.num_cells // 3]: + c_curr *= 2 + reduction = True + else: + reduction = False + + # [cp, h, h] => [multiplier*c_curr, h/h//2, h/h//2] + # the output channels = multiplier * c_curr + cell = Cell( + hyperparams.num_nodes, + hyperparams.channel_multiplier, + cpp, + cp, + c_curr, + reduction, + reduction_prev + ).to(self.device) + # update reduction_prev + reduction_prev = reduction + self.cells += [cell] + cpp, cp = cp, hyperparams.channel_multiplier * c_curr + + # adaptive pooling output size to 1x1 + # since cp records last cell's output channels + # it indicates the input channel number + # self.classifier = self.fc_layers(cp, tasks) + #self.classifier = MultitaskClassifier(cp, tasks) + self.classifier = MultitaskClassifier(500, tasks) # 500 + + # k is the total number of edges inside single cell, 14 + k = sum(1 for i in range(self.num_nodes) for j in range(2 + i)) + num_ops = len(LINEAR_PRIMITIVES) # 8 + + # TODO + # this kind of implementation will add alpha into self.parameters() + # it has num k of alpha parameters, and each alpha shape: [num_ops] + # it requires grad and can be converted to cpu/gpu 
automatically + self.alpha_normal = nn.Parameter(torch.randn(k, num_ops)) + self.alpha_reduce = nn.Parameter(torch.randn(k, num_ops)) + + with torch.no_grad(): + # initialize to smaller value + self.alpha_normal.mul_(1e-3) + self.alpha_reduce.mul_(1e-3) + + self._arch_parameters = [ + self.alpha_normal, + self.alpha_reduce, + ] + + def fc_layers(self, cp, tasks): + """ Create fully connnected layers for each task """ + fc_layers = {} + for task, dim in tasks.items(): + fc_layers[task] = nn.Linear(cp, dim).to(self.device) + return fc_layers + + def new(self): + """ Create a new model initialzed with current alpha parameters. + + Weights are left untouched. + + Returns + ------- + model : Network + New model initialized with current alpha. + """ + model = Network( + self.tasks, + self.criterion + ).to(self.device) + + for x, y in zip(model.arch_parameters(), self.arch_parameters()): + x.data.copy_(y.data) + + return model + + def forward(self, x): + """ + in: torch.Size([3, 3, 32, 32]) + stem: torch.Size([3, 48, 32, 32]) + cell: 0 torch.Size([3, 64, 32, 32]) False + cell: 1 torch.Size([3, 64, 32, 32]) False + cell: 2 torch.Size([3, 128, 16, 16]) True + cell: 3 torch.Size([3, 128, 16, 16]) False + cell: 4 torch.Size([3, 128, 16, 16]) False + cell: 5 torch.Size([3, 256, 8, 8]) True + cell: 6 torch.Size([3, 256, 8, 8]) False + cell: 7 torch.Size([3, 256, 8, 8]) False + pool: torch.Size([16, 256, 1, 1]) + linear: [b, 10] + :param x: + :return: + """ + #print('network in:', x.shape) + # s0 & s1 means the last cells' output + s0 = s1 = self.stem(x) # [b, 3, 32, 32] => [b, 48, 32, 32] + #print('network stem:', s0.shape) + #print('network stem1:', s1.shape) + + for i, cell in enumerate(self.cells): + # weights are shared across all reduction cell or normal cell + # according to current cell's type, it choose which architecture parameters + # to use + if cell.reduction: # if current cell is reduction cell + weights = F.softmax(self.alpha_reduce, dim=-1) + else: + weights = F.softmax(self.alpha_normal, dim=-1) # [14, 8] + # execute cell() firstly and then assign s0=s1, s1=result + s0, s1 = s1, cell(s0, s1, weights) # [40, 64, 32, 32] + #print('cell:',i, s1.shape, cell.reduction, cell.reduction_prev) + #print('\n') + + # s1 is the last cell's output + #out = self.global_pooling(s1) + out = s1 + # logits = {} + # for task, fc in self.classifier.items(): + # logits[task] = fc(out.view(out.size(0), -1)) + logits = self.classifier(out.view(out.size(0), -1)) + + return logits + + def loss_value(self, x_data, y_true, y_pred, reduce='mean'): + """ Calculate a value of loss function """ + y_pred = self(x_data) + + losses = {} + for key, value in y_true.items(): + losses[key] = F.nll_loss(F.log_softmax(y_pred[key], dim=1), y_true[key]) + + if reduce: + total = 0 + for _, value in losses.items(): + total += value + + if reduce == "mean": + losses = total / len(losses) + elif reduce == "sum": + losses = total + + return losses + + def arch_parameters(self): + return self._arch_parameters + + def genotype(self): + """ + :return: + """ + def _parse(weights): + """ + :param weights: [14, 8] + :return: + """ + gene = [] + n = 2 + start = 0 + for i in range(self.num_nodes): # for each node + end = start + n + W = weights[start:end].copy() # [2, 8], [3, 8], ... 
+ edges = sorted(range(i + 2), # i+2 is the number of connection for node i + key=lambda x: -max(W[x][k] # by descending order + for k in range(len(W[x])) # get strongest ops + if k != LINEAR_PRIMITIVES.index('none')) + )[:2] # only has two inputs + for j in edges: # for every input nodes j of current node i + k_best = None + for k in range(len(W[j])): # get strongest ops for current input j->i + if k != LINEAR_PRIMITIVES.index('none'): + if k_best is None or W[j][k] > W[j][k_best]: + k_best = k + gene.append((LINEAR_PRIMITIVES[k_best], j)) # save ops and input node + start = end + n += 1 + return gene + + gene_normal = _parse(F.softmax(self.alpha_normal, dim=-1).data.cpu().numpy()) + gene_reduce = _parse(F.softmax(self.alpha_reduce, dim=-1).data.cpu().numpy()) + + concat = range(2 + self.num_nodes - self.channel_multiplier, self.num_nodes + 2) + genotype = Genotype( + normal=gene_normal, normal_concat=concat, + reduce=gene_reduce, reduce_concat=concat + ) + + return genotype + + +def new(c, num_classes, num_layers, criterion, device, steps=4, multiplier=4, stem_multiplier=3): + """ + create a new model and initialize it with current alpha parameters. + However, its weights are left untouched. + :return: + """ + model = Network(c, num_classes, num_layers, criterion, steps, multiplier, stem_multiplier).to(device) + + for x, y in zip(model_new.arch_parameters(), self.arch_parameters()): + x.data.copy_(y.data) + + return model diff --git a/Pilot3/P3B5/darts/modules/operations/linear.py b/Pilot3/P3B5/darts/modules/operations/linear.py new file mode 100644 index 00000000..3695b502 --- /dev/null +++ b/Pilot3/P3B5/darts/modules/operations/linear.py @@ -0,0 +1,111 @@ +""" +Linear operations. +""" + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from darts.api import Model + + +OPS = { + 'none' : lambda c, stride, affice: Zero(), + 'skip_connect' : lambda c, stride, affine: Identity(), + 'linear_block' : lambda c, stride, affine: LinearBlock(c, c, affine=affine), + 'linear_conv' : lambda c, stride, affine: LinearConv(c, c, 1), + 'linear_drop' : lambda c, stride, affine: LinearDrop(c, c, 1), + 'encoder' : lambda c, stride, affine: Encoder(c, c, 1), +} + + +class LinearBlock(Model): + """ Linear block consisting of two fully connected layers + + Example + ------- + x: torch.Size([2, 10, 12]) + out: [batch_size, c_out, d//2] + out: torch.Size([2, 10, 6]) + """ + + def __init__(self, c_in, c_out, affine=True): + super(LinearBlock, self).__init__() + assert c_out % 2 == 0 + + self.fc1 = nn.Linear(c_in, c_in * 2) + self.fc2 = nn.Linear(c_in * 2, c_out) + + def forward(self, x): + x = torch.relu(x) + x = self.fc1(x) + out = self.fc2(x) + return out + + +class LinearDrop(Model): + """ Linear block with dropout """ + + def __init__(self, c_in, c_out, affine=True): + super(LinearDrop, self).__init__() + assert c_out % 2 == 0 + + self.fc1 = nn.Linear(c_in, c_in * 2) + self.fc2 = nn.Linear(c_in * 2, c_out) + + def forward(self, x): + x = torch.relu(x) + x = F.dropout(self.fc1(x)) + out = F.dropout(self.fc2(x)) + return out + + +class Encoder(Model): + """ Linear encoder """ + + def __init__(self, c_in, c_out, affine=True): + super(Encoder, self).__init__() + assert c_out % 2 == 0 + + self.fc1 = nn.Linear(c_in, c_in // 2) + self.fc2 = nn.Linear(c_in // 2, c_in) + + def forward(self, x): + x = torch.relu(x) + x = self.fc1(x) + return self.fc2(x) + + +class LinearConv(Model): + """ Linear => Conv => Linear """ + + def __init__(self, c_in, c_out, kernel_size): + super(LinearConv, 
self).__init__() + self.fc_1 = nn.Linear(c_in, c_in) + self.conv = nn.Conv1d(c_in, c_in, kernel_size) + self.fc_2 = nn.Linear(c_in, c_out) + + def forward(self, x): + x = torch.relu(x) + x = self.fc_1(x) + x = self.conv(x) + return x + + +class Identity(Model): + + def __init__(self): + super(Identity, self).__init__() + + def forward(self, x): + return x + + +class Zero(nn.Module): + """ Zero tensor by stride """ + + def __init__(self): + super(Zero, self).__init__() + + def forward(self, x): + return x From 6ccb4f3c78bda3ceaf6cd3ef476c76e98e7b7e1f Mon Sep 17 00:00:00 2001 From: Todd Young Date: Wed, 1 Apr 2020 10:20:58 -0400 Subject: [PATCH 170/331] Add utils This makes handling device placement a bit easier and gives us the ability to log a single task accuracy. --- Pilot3/P3B5/darts/utils/logging.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/Pilot3/P3B5/darts/utils/logging.py b/Pilot3/P3B5/darts/utils/logging.py index 81561286..a45e106c 100644 --- a/Pilot3/P3B5/darts/utils/logging.py +++ b/Pilot3/P3B5/darts/utils/logging.py @@ -25,3 +25,21 @@ def log_accuracy(accuracy, split: str='train'): ) logger.info(acc_info) + + +def log_single_accuracy(accuracy, split: str='train'): + """ Log the average accuracy for a single task + + Parameters + ---------- + accuracy: darts.MultitaskAccuracyMeter + Current accuracy meter state + + split: str + Either training of testing + """ + acc_info = ( + f">>> {split.upper()} Accuracy - Response: {accuracy.get_avg_accuracy('response'):.4f}, " + ) + + logger.info(acc_info) From 5578b9e0388f8134ee6511869f0bb2b20e1c849f Mon Sep 17 00:00:00 2001 From: Todd Young Date: Wed, 1 Apr 2020 10:22:10 -0400 Subject: [PATCH 171/331] Add utils This gives us an easy way to handle device placement for tensors and a way to log a single task accuracy metric. 
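A brief usage sketch of the two helpers this patch introduces (to_device and SeedControl). It assumes the darts.utils modules added below are importable as a package; it is illustrative only.

import torch

from darts.utils.random import SeedControl
from darts.utils.tensor import to_device

# Fix the Python hash, random, NumPy, and PyTorch seeds in one call.
seeds = SeedControl()
seeds.fix_all_seeds(2017)
print(seeds.get_seeds())

# Recursively move a nested batch (dicts/lists/tuples of tensors) to a device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
batch = {'tokens': torch.zeros(4, 10, dtype=torch.long),
         'labels': (torch.ones(4), torch.zeros(4))}
batch = to_device(batch, device)
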
--- Pilot3/P3B5/darts/utils/random.py | 40 +++++++++++++++++++++++++++++++ Pilot3/P3B5/darts/utils/tensor.py | 17 +++++++++++++ 2 files changed, 57 insertions(+) create mode 100644 Pilot3/P3B5/darts/utils/random.py create mode 100644 Pilot3/P3B5/darts/utils/tensor.py diff --git a/Pilot3/P3B5/darts/utils/random.py b/Pilot3/P3B5/darts/utils/random.py new file mode 100644 index 00000000..75682a05 --- /dev/null +++ b/Pilot3/P3B5/darts/utils/random.py @@ -0,0 +1,40 @@ +import os +import torch +import random +import numpy as np + + +class Seeds: + pythonhash = 0 + pythonrand = 0 + numpy = 0 + torch = 0 + + +class SeedControl: + + def __init__(self, seeds=Seeds()): + self.s = seeds + + def fix_all_seeds(self, seed: int): + """Fix all seeds to the same seed""" + self.s.pythonhash = seed + self.s.pythonrand = seed + self.s.numpy = seed + self.s.torch = seed + self.set_seeds() + + def set_seeds(self): + os.environ['PYTHONHASHSEED'] = str(self.s.pythonhash) + random.seed(self.s.pythonrand) + np.random.seed(self.s.numpy) + torch.random.manual_seed(self.s.torch) + + def get_seeds(self): + return { + 'PythonHash': self.s.pythonhash, + 'PythonRand': self.s.pythonrand, + 'Numpy': self.s.numpy, + 'Torch': self.s.torch + } + diff --git a/Pilot3/P3B5/darts/utils/tensor.py b/Pilot3/P3B5/darts/utils/tensor.py new file mode 100644 index 00000000..77257713 --- /dev/null +++ b/Pilot3/P3B5/darts/utils/tensor.py @@ -0,0 +1,17 @@ +import torch + + +def to_device(tensor, device: torch.device): + """ Convert tensor-like object to given PyTorch device """ + if tensor is None: + return tensor + elif isinstance(tensor, torch.Tensor): + return tensor.to(device) + elif isinstance(tensor, dict): + return {k: to_device(v, device) for k, v in tensor.items()} + elif isinstance(tensor, list): + return [to_device(v, device) for v in tensor] + elif isinstance(tensor, tuple): + return tuple(to_device(v, device) for v in tensor) + else: + raise NotImplementedError \ No newline at end of file From f274e804c039e2a6e6b24f4a3399e3d8618956d2 Mon Sep 17 00:00:00 2001 From: Jamal Date: Wed, 1 Apr 2020 22:44:31 -0600 Subject: [PATCH 172/331] Added switch for TF2 random seed function. 
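In isolation, the switch looks like the sketch below. Note that the patch compares version strings directly, while this sketch parses the major version number, which avoids lexicographic comparison surprises; it is illustrative only, not the repository's code.

import tensorflow as tf

def set_tf_seed(seed):
    """Seed TensorFlow across the 1.x / 2.x API rename."""
    major = int(tf.__version__.split('.')[0])
    if major < 2:
        tf.set_random_seed(seed)    # TF 1.x name
    else:
        tf.random.set_seed(seed)    # TF 2.x name

set_tf_seed(2017)
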
--- common/keras_utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/common/keras_utils.py b/common/keras_utils.py index 06119051..c57c7583 100644 --- a/common/keras_utils.py +++ b/common/keras_utils.py @@ -48,7 +48,10 @@ def set_seed(seed): if K.backend() == 'tensorflow': import tensorflow as tf - tf.set_random_seed(seed) + if tf.__version__ < "2.0.0": + tf.set_random_seed(seed) + else: + tf.random.set_seed(seed) def get_function(name): From 6a66624bfa46a8e989f6d1dd56318a16e7cd8d3b Mon Sep 17 00:00:00 2001 From: Jamal Date: Wed, 1 Apr 2020 22:48:42 -0600 Subject: [PATCH 173/331] First draft of Milestone 16 functions and test script --- common/P1_utils.py | 565 +++++++++++++++++++++++++++++ common/candle/__init__.py | 17 +- common/data_preprocessing_utils.py | 133 +++++++ common/feature_selection_utils.py | 177 +++++++++ examples/M16/M16_test.py | 111 ++++++ 5 files changed, 1002 insertions(+), 1 deletion(-) create mode 100644 common/P1_utils.py create mode 100644 common/data_preprocessing_utils.py create mode 100644 common/feature_selection_utils.py create mode 100644 examples/M16/M16_test.py diff --git a/common/P1_utils.py b/common/P1_utils.py new file mode 100644 index 00000000..94871e8a --- /dev/null +++ b/common/P1_utils.py @@ -0,0 +1,565 @@ +import sys +import pandas as pd +import numpy as np +import patsy +import numpy.linalg as la +from sklearn.feature_selection import mutual_info_regression +import statsmodels.api as sm + + +################### Auxiliary functions of COXEN start here #################### + + + +def calculate_concordance_correlation_coefficient(u, v): + ''' + This function calculates the concordance correlation coefficient between two input 1-D numpy arrays. + + Parameters: + ----------- + u: 1-D numpy array of a variable + v: 1-D numpy array of a variable + + Returns: + -------- + ccc: a numeric value of concordance correlation coefficient between the two input variables. + ''' + a = 2 * np.mean((u - np.mean(u)) * (v - np.mean(v))) + b = np.mean(np.square(u - np.mean(u))) + np.mean(np.square(v - np.mean(v))) + np.square(np.mean(u) - np.mean(v)) + ccc = a/b + return ccc + + + +def generalization_feature_selection(data1, data2, measure, cutoff): + ''' + This function uses the Pearson correlation coefficient to select the features that are generalizable + between data1 and data2. + + Parameters: + ----------- + data1: 2D numpy array of the first dataset with a size of (n_samples_1, n_features) + data2: 2D numpy array of the second dataset with a size of (n_samples_2, n_features) + measure: string. 'pearson' indicates the Pearson correlation coefficient; + 'ccc' indicates the concordance correlation coefficient. Default is 'pearson'. + cutoff: a positive number for selecting generalizable features. If cutoff < 1, this function selects + the features with a correlation coefficient >= cutoff. If cutoff >= 1, it must be an + integer indicating the number of features to be selected based on correlation coefficient. + + Returns: + -------- + fid: 1-D numpy array containing the indices of selected features. 
+ ''' + cor1 = np.corrcoef(np.transpose(data1)) + cor2 = np.corrcoef(np.transpose(data2)) + num = data1.shape[1] + cor = [] + if measure == 'pearson': + for i in range(num): + cor.append(np.corrcoef(np.vstack((list(cor1[:i, i]) + list(cor1[(i + 1):, i]), + list(cor2[:i, i]) + list(cor2[(i + 1):, i]))))[0, 1]) + elif measure == 'ccc': + for i in range(num): + cor.append(calculate_concordance_correlation_coefficient(np.array(list(cor1[:i, i]) + list(cor1[(i + 1):, i])), + np.array(list(cor2[:i, i]) + list(cor2[(i + 1):, i])))) + cor = np.array(cor) + fid = np.argsort(-cor)[:int(cutoff)] + return fid + + + +################### Auxiliary functions of COXEN end here #################### + +def coxen_single_drug_gene_selection(source_data, target_data, drug_response_data, drug_response_col, tumor_col, + prediction_power_measure='pearson', num_predictive_gene=100, generalization_power_measure='ccc', + num_generalizable_gene=50, multi_drug_mode=False): + ''' + This function selects genes for drug response prediction using the COXEN approach. The COXEN approach is + designed for selecting genes to predict the response of tumor cells to a specific drug. This function + assumes no missing data exist. + + Parameters: + ----------- + source_data: pandas data frame of gene expressions of tumors, for which drug response is known. Its size is + [n_source_samples, n_features]. + target_data: pandas data frame of gene expressions of tumors, for which drug response needs to be predicted. + Its size is [n_target_samples, n_features]. source_data and target_data have the same set + of features and the orders of features must match. + drug_response_data: pandas data frame of drug response values for a drug. It must include a column of drug + response values and a column of tumor IDs. + drug_response_col: non-negative integer or string. If integer, it is the column index of drug response in + drug_response_data. If string, it is the column name of drug response. + tumor_col: non-negative integer or string. If integer, it is the column index of tumor IDs in drug_response_data. + If string, it is the column name of tumor IDs. + prediction_power_measure: string. 'pearson' uses the absolute value of Pearson correlation coefficient to + measure prediction power of gene; 'mutual_info' uses the mutual information to measure prediction power + of gene. Default is 'pearson'. + num_predictive_gene: positive integer indicating the number of predictive genes to be selected. + generalization_power_measure: string. 'pearson' indicates the Pearson correlation coefficient; + 'ccc' indicates the concordance correlation coefficient. Default is 'ccc'. + num_generalizable_gene: positive integer indicating the number of generalizable genes to be selected. + multi_drug_mode: boolean, indicating whether the function runs as an auxiliary function of COXEN + gene selection for multiple drugs. Default is False. + + Returns: + -------- + indices: 1-D numpy array containing the indices of selected genes, if multi_drug_mode is False; + 1-D numpy array of indices of sorting all genes according to their prediction power, if multi_drug_mode is True. 
+ ''' + + if isinstance(drug_response_col, str): + drug_response_col = np.where(drug_response_data.columns == drug_response_col)[0][0] + + if isinstance(tumor_col, str): + tumor_col = np.where(drug_response_data.columns == tumor_col)[0][0] + + drug_response_data = drug_response_data.copy() + drug_response_data = drug_response_data.iloc[np.where(np.isin(drug_response_data.iloc[:, tumor_col], + source_data.index))[0], :] + + source_data = source_data.copy() + source_data = source_data.iloc[np.where(np.isin(source_data.index, drug_response_data.iloc[:, tumor_col]))[0], :] + + source_std_id = select_features_by_variation(source_data, variation_measure='std', threshold=0.00000001) + target_std_id = select_features_by_variation(target_data, variation_measure='std', threshold=0.00000001) + std_id = np.sort(np.intersect1d(source_std_id, target_std_id)) + source_data = source_data.iloc[:, std_id] + target_data = target_data.copy() + target_data = target_data.iloc[:, std_id] + + # Perform the first step of COXEN approach to select predictive genes. To avoid exceeding the memory limit, + # the prediction power of genes is calculated in batches. + batchSize = 1000 + numBatch = int(np.ceil(source_data.shape[1]/batchSize)) + prediction_power = np.empty((source_data.shape[1], 1)) + prediction_power.fill(np.nan) + for i in range(numBatch): + startIndex = i*batchSize + endIndex = min((i+1)*batchSize, source_data.shape[1]) + + if prediction_power_measure == 'pearson': + cor_i = np.corrcoef(np.vstack((np.transpose(source_data.iloc[:, startIndex:endIndex].loc[drug_response_data.iloc[:, tumor_col], + :].values), np.reshape(drug_response_data.iloc[:, drug_response_col].values, (1, drug_response_data.shape[0]))))) + prediction_power[startIndex:endIndex, 0] = abs(cor_i[:-1, -1]) + + if prediction_power_measure == 'mutual_info': + mi = mutual_info_regression(X=source_data.iloc[:, startIndex:endIndex].loc[drug_response_data.iloc[:, tumor_col], :].values, + y=drug_response_data.iloc[:, drug_response_col].values) + prediction_power[startIndex:endIndex, 0] = mi + + if multi_drug_mode: + indices = np.argsort(-prediction_power[:, 0]) + return std_id[indices] + + num_predictive_gene = int(min(num_predictive_gene, source_data.shape[1])) + gid1 = np.argsort(-prediction_power[:, 0])[:num_predictive_gene] + + # keep only predictive genes for source and target data + source_data = source_data.iloc[:, gid1] + target_data = target_data.iloc[:, gid1] + num_generalizable_gene = int(min(num_generalizable_gene, len(gid1))) + # perform the second step of COXEN approach to select generalizable genes among the predictive genes + gid2 = generalization_feature_selection(source_data.values, target_data.values, generalization_power_measure, + num_generalizable_gene) + + indices = std_id[gid1[gid2]] + + return np.sort(indices) + +def coxen_multi_drug_gene_selection(source_data, target_data, drug_response_data, drug_response_col, tumor_col, drug_col, + prediction_power_measure='lm', num_predictive_gene=100, generalization_power_measure='ccc', + num_generalizable_gene=50, union_of_single_drug_selection=False): + ''' + This function uses the COXEN approach to select genes for predicting the response of multiple drugs. + It assumes no missing data exist. It works in three modes. + (1) If union_of_single_drug_selection is True, prediction_power_measure must be either 'pearson' or 'mutual_info'. 
+ This functions runs coxen_single_drug_gene_selection for every drug with the parameter setting and takes the + union of the selected genes of every drug as the output. The size of the selected gene set may be larger than + num_generalizable_gene. + (2) If union_of_single_drug_selection is False and prediction_power_measure is 'lm', this function uses a + linear model to fit the response of multiple drugs using the expression of a gene, while the drugs are + one-hot encoded. The p-value associated with the coefficient of gene expression is used as the prediction + power measure, according to which num_predictive_gene genes will be selected. Then, among the predictive + genes, num_generalizable_gene generalizable genes will be selected. + (3) If union_of_single_drug_selection is False and prediction_power_measure is 'pearson' or 'mutual_info', + for each drug this functions ranks the genes according to their power of predicting the + response of the drug. The union of an equal number of predictive genes for every drug will be generated, + and its size must be at least num_predictive_gene. Then, num_generalizable_gene generalizable genes + will be selected. + + Parameters: + ----------- + source_data: pandas data frame of gene expressions of tumors, for which drug response is known. Its size is + [n_source_samples, n_features]. + target_data: pandas data frame of gene expressions of tumors, for which drug response needs to be predicted. + Its size is [n_target_samples, n_features]. source_data and target_data have the same set + of features and the orders of features must match. + drug_response_data: pandas data frame of drug response that must include a column of drug response values, + a column of tumor IDs, and a column of drug IDs. + drug_response_col: non-negative integer or string. If integer, it is the column index of drug response in + drug_response_data. If string, it is the column name of drug response. + tumor_col: non-negative integer or string. If integer, it is the column index of tumor IDs in drug_response_data. + If string, it is the column name of tumor IDs. + drug_col: non-negative integer or string. If integer, it is the column index of drugs in drug_response_data. + If string, it is the column name of drugs. + prediction_power_measure: string. 'pearson' uses the absolute value of Pearson correlation coefficient to + measure prediction power of a gene; 'mutual_info' uses the mutual information to measure prediction power + of a gene; 'lm' uses the linear regression model to select predictive genes for multiple drugs. Default is 'lm'. + num_predictive_gene: positive integer indicating the number of predictive genes to be selected. + generalization_power_measure: string. 'pearson' indicates the Pearson correlation coefficient; + 'ccc' indicates the concordance correlation coefficient. Default is 'ccc'. + num_generalizable_gene: positive integer indicating the number of generalizable genes to be selected. + union_of_single_drug_selection: boolean, indicating whether the final gene set should be the union of genes + selected for every drug. + + Returns: + -------- + indices: 1-D numpy array containing the indices of selected genes. 
+ ''' + + if isinstance(drug_response_col, str): + drug_response_col = np.where(drug_response_data.columns == drug_response_col)[0][0] + + if isinstance(tumor_col, str): + tumor_col = np.where(drug_response_data.columns == tumor_col)[0][0] + + if isinstance(drug_col, str): + drug_col = np.where(drug_response_data.columns == drug_col)[0][0] + + drug_response_data = drug_response_data.copy() + drug_response_data = drug_response_data.iloc[np.where(np.isin(drug_response_data.iloc[:, tumor_col], + source_data.index))[0], :] + drugs = np.unique(drug_response_data.iloc[:, drug_col]) + + source_data = source_data.copy() + source_data = source_data.iloc[np.where(np.isin(source_data.index, drug_response_data.iloc[:, tumor_col]))[0], :] + + source_std_id = select_features_by_variation(source_data, variation_measure='std', threshold=0.00000001) + target_std_id = select_features_by_variation(target_data, variation_measure='std', threshold=0.00000001) + std_id = np.sort(np.intersect1d(source_std_id, target_std_id)) + source_data = source_data.iloc[:, std_id] + target_data = target_data.copy() + target_data = target_data.iloc[:, std_id] + + num_predictive_gene = int(min(num_predictive_gene, source_data.shape[1])) + + if union_of_single_drug_selection: + if prediction_power_measure != 'pearson' and prediction_power_measure != 'mutual_info': + print('pearson or mutual_info must be used as prediction_power_measure for taking the union of selected genes of every drugs') + sys.exit(1) + gid1 = np.array([]).astype(np.int64) + for d in drugs: + idd = np.where(drug_response_data.iloc[:, drug_col] == d)[0] + response_d = drug_response_data.iloc[idd, :] + gid2 = coxen_single_drug_gene_selection(source_data, target_data, response_d, drug_response_col, tumor_col, + prediction_power_measure, num_predictive_gene, generalization_power_measure, num_generalizable_gene) + gid1 = np.union1d(gid1, gid2) + return np.sort(std_id[gid1]) + + if prediction_power_measure == 'lm': + pvalue = np.empty((source_data.shape[1], 1)) + pvalue.fill(np.nan) + drug_m = np.identity(len(drugs)) + drug_m = pd.DataFrame(drug_m, index=drugs) + drug_sample = drug_m.loc[drug_response_data.iloc[:, drug_col], :].values + for i in range(source_data.shape[1]): + ge_sample = source_data.iloc[:, i].loc[drug_response_data.iloc[:, tumor_col]].values + sample = np.hstack((np.reshape(ge_sample, (len(ge_sample), 1)), drug_sample)) + sample = sm.add_constant(sample) + mod = sm.OLS(drug_response_data.iloc[:, drug_response_col].values, sample) + try: + res = mod.fit() + pvalue[i, 0] = res.pvalues[1] + except: + pvalue[i, 0] = 1 + + gid1 = np.argsort(pvalue[:, 0])[:num_predictive_gene] + + elif prediction_power_measure == 'pearson' or prediction_power_measure == 'mutual_info': + gene_rank = np.empty((len(drugs), source_data.shape[1])) + gene_rank.fill(np.nan) + gene_rank = pd.DataFrame(gene_rank, index=drugs) + for d in range(len(drugs)): + idd = np.where(drug_response_data.iloc[:, drug_col] == drugs[d])[0] + response_d = drug_response_data.iloc[idd, :] + temp_rank = coxen_single_drug_gene_selection(source_data, target_data, response_d, + drug_response_col, tumor_col, prediction_power_measure, num_predictive_gene=None, + generalization_power_measure=None, num_generalizable_gene=None, multi_drug_mode=True) + gene_rank.iloc[d, :len(temp_rank)] = temp_rank + for i in range(int(np.ceil(num_predictive_gene/len(drugs))), source_data.shape[1]+1): + gid1 = np.unique(np.reshape(gene_rank.iloc[:, :i].values, (1, gene_rank.shape[0]*i))[0, :]) + gid1 = 
gid1[np.where(np.invert(np.isnan(gid1)))[0]] + if len(gid1) >= num_predictive_gene: + break + gid1 = gid1.astype(np.int64) + + # keep only predictive genes for source and target data + source_data = source_data.iloc[:, gid1] + target_data = target_data.iloc[:, gid1] + num_generalizable_gene = int(min(num_generalizable_gene, len(gid1))) + + # perform the second step of COXEN approach to select generalizable genes among the predictive genes + gid2 = generalization_feature_selection(source_data.values, target_data.values, generalization_power_measure, + num_generalizable_gene) + + indices = std_id[gid1[gid2]] + + return np.sort(indices) + +def generate_gene_set_data(data, genes, gene_name_type='entrez', gene_set_category='c6.all', metric='mean', + standardize=False): + ''' + This function generates genomic data summarized at the gene set level. + + Parameters: + ----------- + data: numpy array or pandas data frame of numeric values, with a shape of [n_samples, n_features]. + genes: 1-D array or list of gene names with a length of n_features. It indicates which gene a genomic + feature belongs to. + gene_name_type: string, indicating the type of gene name used in genes. 'entrez' indicates Entrez gene ID and + 'symbols' indicates HGNC gene symbol. Default is 'symbols'. + gene_set_category: string, indicating the gene sets for which data will be calculated. 'c2.cgp' indicates gene sets + affected by chemical and genetic perturbations; 'c2.cp.biocarta' indicates BioCarta gene sets; 'c2.cp.kegg' + indicates KEGG gene sets; 'c2.cp.pid' indicates PID gene sets; 'c2.cp.reactome' indicates Reactome gene sets; + 'c5.bp' indicates GO biological processes; 'c5.cc' indicates GO cellular components; 'c5.mf' indicates + GO molecular functions; 'c6.all' indicates oncogenic signatures. Default is 'c6.all'. + metric: string, indicating the way to calculate gene-set-level data. 'mean' calculates the mean of gene + features belonging to the same gene set. 'sum' calculates the summation of gene features belonging + to the same gene set. 'max' calculates the maximum of gene features. 'min' calculates the minimum + of gene features. 'abs_mean' calculates the mean of absolute values. 'abs_maximum' calculates + the maximum of absolute values. Default is 'mean'. + standardize: boolean, indicating whether to standardize features before calculation. Standardization transforms + each feature to have a zero mean and a unit standard deviation. + + Returns: + -------- + gene_set_data: a data frame of calculated gene-set-level data. Column names are the gene set names. 
+ ''' + + sample_name = None + if isinstance(data, pd.DataFrame): + sample_name = data.index + data = data.values + elif not isinstance(data, np.ndarray): + print('Input data must be a numpy array or pandas data frame') + sys.exit(1) + + if standardize: + scaler = StandardScaler() + data = scaler.fit_transform(data) + + genes = [str(i) for i in genes] + + if gene_name_type == 'entrez': + gene_set_category = gene_set_category + '.v7.0.entrez.gmt' + if gene_name_type == 'symbols': + gene_set_category = gene_set_category + '.v7.0.symbols.gmt' + f = open('../../Data/examples/Gene_Sets/MSigDB.v7.0/' + gene_set_category, 'r') + x = f.readlines() + gene_sets = {} + for i in range(len(x)): + temp = x[i].split('\n')[0].split('\t') + gene_sets[temp[0]] = temp[2:] + + gene_set_data = np.empty((data.shape[0], len(gene_sets))) + gene_set_data.fill(np.nan) + gene_set_names = np.array(list(gene_sets.keys())) + for i in range(len(gene_set_names)): + idi = np.where(np.isin(genes, gene_sets[gene_set_names[i]]))[0] + if len(idi) > 0: + if metric == 'sum': + gene_set_data[:, i] = np.nansum(data[:, idi], axis=1) + elif metric == 'max': + gene_set_data[:, i] = np.nanmax(data[:, idi], axis=1) + elif metric == 'min': + gene_set_data[:, i] = np.nanmin(data[:, idi], axis=1) + elif metric == 'abs_mean': + gene_set_data[:, i] = np.nanmean(np.absolute(data[:, idi]), axis=1) + elif metric == 'abs_maximum': + gene_set_data[:, i] = np.nanmax(np.absolute(data[:, idi]), axis=1) + else: #'mean' + gene_set_data[:, i] = np.nanmean(data[:, idi], axis=1) + + if sample_name is None: + gene_set_data = pd.DataFrame(gene_set_data, columns=gene_set_names) + else: + gene_set_data = pd.DataFrame(gene_set_data, columns=gene_set_names, index=sample_name) + keep_id = np.where(np.sum(np.invert(pd.isna(gene_set_data)), axis=0) > 0)[0] + gene_set_data = gene_set_data.iloc[:, keep_id] + + return gene_set_data + +################### Auxiliary functions of ComBat start here #################### + + + +def design_mat(mod, numerical_covariates, batch_levels): + # require levels to make sure they are in the same order as we use in the + # rest of the script. 
+ design = patsy.dmatrix("~ 0 + C(batch, levels=%s)" % str(batch_levels), + mod, return_type="dataframe") + + mod = mod.drop(["batch"], axis=1) + numerical_covariates = list(numerical_covariates) + sys.stderr.write("found %i batches\n" % design.shape[1]) + other_cols = [c for i, c in enumerate(mod.columns) + if not i in numerical_covariates] + factor_matrix = mod[other_cols] + design = pd.concat((design, factor_matrix), axis=1) + if numerical_covariates is not None: + sys.stderr.write("found %i numerical covariates...\n" + % len(numerical_covariates)) + for i, nC in enumerate(numerical_covariates): + cname = mod.columns[nC] + sys.stderr.write("\t{0}\n".format(cname)) + design[cname] = mod[mod.columns[nC]] + sys.stderr.write("found %i categorical variables:" % len(other_cols)) + sys.stderr.write("\t" + ", ".join(other_cols) + '\n') + return design + + +def it_sol(sdat, g_hat, d_hat, g_bar, t2, a, b, conv=0.0001): + n = (1 - np.isnan(sdat)).sum(axis=1) + g_old = g_hat.copy() + d_old = d_hat.copy() + + change = 1 + count = 0 + while change > conv: + # print g_hat.shape, g_bar.shape, t2.shape + g_new = postmean(g_hat, g_bar, n, d_old, t2) + sum2 = ((sdat - np.dot(g_new.values.reshape((g_new.shape[0], 1)), np.ones((1, sdat.shape[1])))) ** 2).sum( + axis=1) + d_new = postvar(sum2, n, a, b) + + change = max((abs(g_new - g_old) / g_old).max(), (abs(d_new - d_old) / d_old).max()) + g_old = g_new # .copy() + d_old = d_new # .copy() + count = count + 1 + adjust = (g_new, d_new) + return adjust + + +def aprior(gamma_hat): + m = gamma_hat.mean() + s2 = gamma_hat.var() + return (2 * s2 + m ** 2) / s2 + + +def bprior(gamma_hat): + m = gamma_hat.mean() + s2 = gamma_hat.var() + return (m * s2 + m ** 3) / s2 + + +def postmean(g_hat, g_bar, n, d_star, t2): + return (t2 * n * g_hat + d_star * g_bar) / (t2 * n + d_star) + + +def postvar(sum2, n, a, b): + return (0.5 * sum2 + b) / (n / 2.0 + a - 1.0) + + + +################### Auxiliary functions of ComBat end here #################### + +def combat_batch_effect_removal(data, batch_labels, model=None, numerical_covariates=None): + ''' + This function corrects for batch effect in data. + + Parameters: + ----------- + data: pandas data frame of numeric values, with a size of (n_features, n_samples) + batch_labels: pandas series, with a length of n_samples. It should provide the batch labels of samples. + Its indices are the same as the column names (sample names) in "data". + model: an object of patsy.design_info.DesignMatrix. It is a design matrix describing the covariate + information on the samples that could cause batch effects. If not provided, this function + will attempt to coarsely correct just based on the information provided in "batch". + numerical_covariates: a list of the names of covariates in "model" that are numerical rather than + categorical. + + Returns: + -------- + corrected : pandas data frame of numeric values, with a size of (n_features, n_samples). It is + the data with batch effects corrected. 
+ ''' + + if isinstance(numerical_covariates, str): + numerical_covariates = [numerical_covariates] + if numerical_covariates is None: + numerical_covariates = [] + + if model is not None and isinstance(model, pd.DataFrame): + model["batch"] = list(batch_labels) + else: + model = pd.DataFrame({'batch': batch_labels}) + + batch_items = model.groupby("batch").groups.items() + batch_levels = [k for k, v in batch_items] + batch_info = [v for k, v in batch_items] + n_batch = len(batch_info) + n_batches = np.array([len(v) for v in batch_info]) + n_array = float(sum(n_batches)) + + # drop intercept + drop_cols = [cname for cname, inter in ((model == 1).all()).iteritems() if inter == True] + drop_idxs = [list(model.columns).index(cdrop) for cdrop in drop_cols] + model = model[[c for c in model.columns if not c in drop_cols]] + numerical_covariates = [list(model.columns).index(c) if isinstance(c, str) else c + for c in numerical_covariates if not c in drop_cols] + + design = design_mat(model, numerical_covariates, batch_levels) + + sys.stderr.write("Standardizing Data across genes.\n") + B_hat = np.dot(np.dot(la.inv(np.dot(design.T, design)), design.T), data.T) + grand_mean = np.dot((n_batches / n_array).T, B_hat[:n_batch, :]) + var_pooled = np.dot(((data - np.dot(design, B_hat).T) ** 2), np.ones((int(n_array), 1)) / int(n_array)) + + stand_mean = np.dot(grand_mean.T.reshape((len(grand_mean), 1)), np.ones((1, int(n_array)))) + tmp = np.array(design.copy()) + tmp[:, :n_batch] = 0 + stand_mean += np.dot(tmp, B_hat).T + + s_data = ((data - stand_mean) / np.dot(np.sqrt(var_pooled), np.ones((1, int(n_array))))) + + sys.stderr.write("Fitting L/S model and finding priors\n") + batch_design = design[design.columns[:n_batch]] + gamma_hat = np.dot(np.dot(la.inv(np.dot(batch_design.T, batch_design)), batch_design.T), s_data.T) + + delta_hat = [] + + for i, batch_idxs in enumerate(batch_info): + delta_hat.append(s_data[batch_idxs].var(axis=1)) + + gamma_bar = gamma_hat.mean(axis=1) + t2 = gamma_hat.var(axis=1) + + a_prior = list(map(aprior, delta_hat)) + b_prior = list(map(bprior, delta_hat)) + + sys.stderr.write("Finding parametric adjustments\n") + gamma_star, delta_star = [], [] + for i, batch_idxs in enumerate(batch_info): + temp = it_sol(s_data[batch_idxs], gamma_hat[i], + delta_hat[i], gamma_bar[i], t2[i], a_prior[i], b_prior[i]) + + gamma_star.append(temp[0]) + delta_star.append(temp[1]) + + sys.stdout.write("Adjusting data\n") + bayesdata = s_data + gamma_star = np.array(gamma_star) + delta_star = np.array(delta_star) + + for j, batch_idxs in enumerate(batch_info): + dsq = np.sqrt(delta_star[j, :]) + dsq = dsq.reshape((len(dsq), 1)) + denom = np.dot(dsq, np.ones((1, n_batches[j]))) + numer = np.array(bayesdata[batch_idxs] - np.dot(batch_design.loc[batch_idxs], gamma_star).T) + + bayesdata[batch_idxs] = numer / denom + + vpsq = np.sqrt(var_pooled).reshape((len(var_pooled), 1)) + bayesdata = bayesdata * np.dot(vpsq, np.ones((1, int(n_array)))) + stand_mean + + return bayesdata diff --git a/common/candle/__init__.py b/common/candle/__init__.py index 5e0f4ca0..059d0df5 100644 --- a/common/candle/__init__.py +++ b/common/candle/__init__.py @@ -49,10 +49,25 @@ from uq_utils import overprediction_check from uq_utils import generate_index_distribution -#profiling +# import from profiling_utils from profiling_utils import start_profiling from profiling_utils import stop_profiling +# import from data_preprocessing_utils +from data_preprocessing_utils import quantile_normalization +from data_preprocessing_utils 
import generate_cross_validation_partition + +# feature selection +from feature_selection_utils import select_features_by_missing_values +from feature_selection_utils import select_features_by_variation +from feature_selection_utils import select_decorrelated_features + +# P1-specific +from P1_utils import coxen_single_drug_gene_selection +from P1_utils import coxen_multi_drug_gene_selection +from P1_utils import generate_gene_set_data +from P1_utils import combat_batch_effect_removal + # import benchmark-dependent utils import sys if 'keras' in sys.modules: diff --git a/common/data_preprocessing_utils.py b/common/data_preprocessing_utils.py new file mode 100644 index 00000000..04cefc58 --- /dev/null +++ b/common/data_preprocessing_utils.py @@ -0,0 +1,133 @@ +import sys +import pandas as pd +import numpy as np +import numpy.linalg as la +from scipy import stats +from collections import Counter + +def quantile_normalization(data): + ''' + This function does quantile normalization to input data. After normalization, the samples (rows) in output + data follow the same distribution, which is the average distribution calculated based on all samples. + This function allows missing values, and assume missing values occur at random. + + Parameters: + ----------- + data: numpy array or pandas data frame of numeric values, with a shape of [n_samples, n_features]. + + Returns: + -------- + norm_data: numpy array or pandas data frame containing the data after quantile normalization. + ''' + + colnames = None + rownames = None + if isinstance(data, pd.DataFrame): + colnames = data.columns + rownames = data.index + data = data.values + elif not isinstance(data, np.ndarray): + print('Input data must be a numpy array or pandas data frame') + sys.exit(1) + + norm_data = data.copy() + nan_mask = np.isnan(norm_data) + if np.sum(nan_mask) > 0: + n_samples, n_features = norm_data.shape + for i in range(n_samples): + idi_nan = np.where(np.isnan(norm_data[i, :]))[0] + if len(idi_nan) > 0: + idi = np.setdiff1d(range(n_features), idi_nan) + norm_data[i, idi_nan] = np.random.choice(norm_data[i, idi], size=len(idi_nan), replace=True) + + quantiles = np.mean(np.sort(norm_data, axis=1), axis=0) + ranks = np.apply_along_axis(stats.rankdata, 1, norm_data) + rank_indices = ranks.astype(int) - 1 + norm_data = quantiles[rank_indices] + + if np.sum(nan_mask) > 0: + row_id, col_id = np.where(nan_mask) + norm_data[row_id, col_id] = np.nan + + if colnames is not None and rownames is not None: + norm_data = pd.DataFrame(norm_data, columns=colnames, index=rownames) + + return norm_data + +def generate_cross_validation_partition(group_label, n_folds=5, n_repeats=1, portions=None, random_seed=None): + ''' + This function generates partition indices of samples for cross-validation analysis. + + Parameters: + ----------- + group_label: 1-D array or list of group labels of samples. If there are no groups in samples, a list of + sample indices can be supplied for generating partitions based on individual samples rather than sample groups. + n_folds: positive integer larger than 1, indicating the number of folds for cross-validation. Default is 5. + n_repeats: positive integer, indicating how many times the n_folds cross-validation should be repeated. + So the total number of cross-validation trials is n_folds * n_repeats. Default is 1. + portions: 1-D array or list of positive integers, indicating the number of data folds in each set + (e.g. training set, testing set, or validation set) after partitioning. 
The summation of elements + in portions must be equal to n_folds. Default is [1, n_folds - 1]. + random_seed: positive integer, the seed for random generator. Default is None. + + Returns: + -------- + partition: list of n_folds * n_repeats lists, each of which contains len(portions) sample index lists for + a cross-validation trial. + ''' + + group_counter = Counter(group_label) + unique_label = np.array(list(group_counter.keys())) + n_group = len(unique_label) + if n_group < n_folds: + print('The number of groups in labels can not be smaller than the number of folds.') + sys.exit(1) + sorted_label = np.array(sorted(unique_label, key=lambda x: group_counter[x], reverse=True)) + + if portions is None: + portions = [1, n_folds - 1] + else: + if np.sum(portions) != n_folds: + print('The summation of elements in portions must be equal to n_folds') + sys.exit(1) + + if random_seed is not None: + np.random.seed(random_seed) + + n_set = len(portions) + partition = [] + for r in range(n_repeats): + + if r == 0 and random_seed is None: + label = sorted_label.copy() + else: + idr = np.random.permutation(n_group) + label = sorted_label[idr] + + folds = [[] for _ in range(n_folds)] + fold_size = np.zeros((n_folds, )) + + for g in range(n_group): + f = np.argmin(fold_size) + folds[f].append(label[g]) + fold_size[f] += group_counter[label[g]] + + for f in range(n_folds): + folds[f] = list(np.where(np.isin(group_label, folds[f]))[0]) + + a = list(range(n_folds)) + list(range(n_folds)) + for f in range(n_folds): + temp = [] + end = f + for s in range(n_set): + start = end + end = start + portions[s] + t = [] + for i in range(start, end): + t = t + folds[a[i]] + temp.append(sorted(t)) + partition.append(temp) + + return partition + + diff --git a/common/feature_selection_utils.py b/common/feature_selection_utils.py new file mode 100644 index 00000000..ee1c47da --- /dev/null +++ b/common/feature_selection_utils.py @@ -0,0 +1,177 @@ +import sys +import pandas as pd +import numpy as np +import numpy.linalg as la +from astropy.stats import median_absolute_deviation +import matplotlib.pyplot as plt + +def select_features_by_missing_values(data, threshold=0.1): + ''' + This function returns the indices of the features whose missing rates are smaller than the threshold. + + Parameters: + ----------- + data: numpy array or pandas data frame of numeric values, with a shape of [n_samples, n_features] + threshold: float in the range of [0, 1]. Features with a missing rate smaller than threshold will be selected. + Default is 0.1 + + Returns: + -------- + indices: 1-D numpy array containing the indices of selected features + ''' + + if isinstance(data, pd.DataFrame): + data = data.values + elif not isinstance(data, np.ndarray): + print('Input data must be a numpy array or pandas data frame') + sys.exit(1) + + missing_rate = np.sum(np.isnan(data), axis=0) / data.shape[0] + indices = np.where(missing_rate < threshold)[0] + + indices = np.sort(indices) + + return indices + +def select_features_by_variation(data, variation_measure='var', threshold=None, portion=None, draw_histogram=False, + bins=100, log=False): + ''' + This function evaluates the variations of individual features and returns the indices of features with large + variations. Missing values are ignored in evaluating variation. + + Parameters: + ----------- + data: numpy array or pandas data frame of numeric values, with a shape of [n_samples, n_features]. + variation_metric: string indicating the metric used for evaluating feature variation. 
'var' indicates variance; + 'std' indicates standard deviation; 'mad' indicates median absolute deviation. Default is 'var'. + threshold: float. Features with a variation larger than threshold will be selected. Default is None. + portion: float in the range of [0, 1]. It is the portion of features to be selected based on variation. + The number of selected features will be the smaller of int(portion * n_features) and the total number of + features with non-missing variations. Default is None. threshold and portion can not take real values + and be used simultaneously. + draw_histogram: boolean, whether to draw a histogram of feature variations. Default is False. + bins: positive integer, the number of bins in the histogram. Default is the smaller of 50 and the number of + features with non-missing variations. + log: boolean, indicating whether the histogram should be drawn on log scale. + + + Returns: + -------- + indices: 1-D numpy array containing the indices of selected features. If both threshold and + portion are None, indices will be an empty array. + ''' + + if isinstance(data, pd.DataFrame): + data = data.values + elif not isinstance(data, np.ndarray): + print('Input data must be a numpy array or pandas data frame') + sys.exit(1) + + if variation_measure == 'std': + v_all = np.nanstd(a=data, axis=0) + elif variation_measure == 'mad': + v_all = median_absolute_deviation(data=data, axis=0, ignore_nan=True) + else: + v_all = np.nanvar(a=data, axis=0) + + indices = np.where(np.invert(np.isnan(v_all)))[0] + v = v_all[indices] + + if draw_histogram: + if len(v) < 50: + print('There must be at least 50 features with variation measures to draw a histogram') + else: + bins = int(min(bins, len(v))) + _ = plt.hist(v, bins=bins, log=log) + plt.show() + + if threshold is None and portion is None: + return np.array([]) + elif threshold is not None and portion is not None: + print('threshold and portion can not be used simultaneously. Only one of them can take a real value') + sys.exit(1) + + if threshold is not None: + indices = indices[np.where(v > threshold)[0]] + else: + n_f = int(min(portion * data.shape[1], len(v))) + indices = indices[np.argsort(-v)[:n_f]] + + indices = np.sort(indices) + + return indices + +def select_decorrelated_features(data, method='pearson', threshold=None, random_seed=None): + ''' + This function selects features whose mutual absolute correlation coefficients are smaller than a threshold. + It allows missing values in data. The correlation coefficient of two features are calculated based on + the observations that are not missing in both features. Features with only one or no value present and + features with a zero standard deviation are not considered for selection. + + Parameters: + ----------- + data: numpy array or pandas data frame of numeric values, with a shape of [n_samples, n_features]. + method: string indicating the method used for calculating correlation coefficient. 'pearson' indicates Pearson + correlation coefficient; 'kendall' indicates Kendall Tau correlation coefficient; 'spearman' indicates + Spearman rank correlation coefficient. Default is 'pearson'. + threshold: float. If two features have an absolute correlation coefficient higher than threshold, + one of the features is removed. If threshold is None, a feature is removed only when the two features + are exactly identical. Default is None. + random_seed: positive integer, seed of random generator for ordering the features. 
If it is None, features + are not re-ordered before feature selection and thus the first feature is always selected. Default is None. + + Returns: + -------- + indices: 1-D numpy array containing the indices of selected features. + ''' + + if isinstance(data, np.ndarray): + data = pd.DataFrame(data) + elif not isinstance(data, pd.DataFrame): + print('Input data must be a numpy array or pandas data frame') + sys.exit(1) + + present = np.where(np.sum(np.invert(pd.isna(data)), axis=0) > 1)[0] + present = present[np.where(np.nanstd(data.iloc[:, present].values, axis=0) > 0)[0]] + + data = data.iloc[:, present] + + num_f = data.shape[1] + if random_seed is not None: + np.random.seed(random_seed) + random_order = np.random.permutation(num_f) + data = data.iloc[:, random_order] + + if threshold is not None: + if np.sum(pd.isna(data).values) == 0 and method == 'pearson': + cor = np.corrcoef(data.values, rowvar=False) + else: + cor = data.corr(method=method).values + else: + data = data.values + + rm = np.full(num_f, False) + index = 0 + while index < num_f-1: + if rm[index]: + index += 1 + continue + idi = np.array(range(index+1, num_f)) + idi = idi[np.where(rm[idi] == False)[0]] + if len(idi) > 0: + if threshold is None: + idi = idi[np.where(np.sum(np.isnan(data[:, idi]) ^ np.isnan(data[:, index][:, np.newaxis]), axis=0) == 0)[0]] + if len(idi) > 0: + idi = idi[np.where(np.nansum(abs(data[:, idi] - data[:, index][:, np.newaxis]), axis=0) == 0)[0]] + else: + idi = idi[np.where(abs(cor[index, idi]) >= threshold)[0]] + if len(idi) > 0: + rm[idi] = True + index += 1 + + indices = np.where(rm == False)[0] + if random_seed is not None: + indices = random_order[indices] + indices = np.sort(present[indices]) + + return indices diff --git a/examples/M16/M16_test.py b/examples/M16/M16_test.py new file mode 100644 index 00000000..47dbe90a --- /dev/null +++ b/examples/M16/M16_test.py @@ -0,0 +1,111 @@ +import os +import sys +import pandas as pd +import numpy as np +import keras + +#from Milestone_16_Functions import select_features_by_missing_values, select_features_by_variation, select_decorrelated_features, \ +#quantile_normalization, generate_cross_validation_partition, generate_gene_set_data, combat_batch_effect_removal + +file_path = os.path.dirname(os.path.realpath(__file__)) +# lib_path = os.path.abspath(os.path.join(file_path, '..')) +# sys.path.append(lib_path) +lib_path2 = os.path.abspath(os.path.join(file_path, "..", "..", "common")) +sys.path.append(lib_path2) + +import candle + +# download all the data if needed from the repo +data_url = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Data_For_Testing/' +file_name = 'small_drug_descriptor_data_unique_samples.txt' +unique_samples = candle.get_file(file_name, data_url+file_name, cache_subdir='examples') + +file_name = 'small_drug_response_data.txt' +response_data = candle.get_file(file_name, data_url+file_name, cache_subdir='examples') + +file_name = 'Gene_Expression_Full_Data_Unique_Samples.txt' +gene_expression = candle.get_file(file_name, data_url+file_name, cache_subdir='examples') + +file_name = 'CCLE_NCI60_Gene_Expression_Full_Data.txt' +ccle_nci60 = candle.get_file(file_name, data_url+file_name, cache_subdir='examples') + +# download all the gene_set files needed +data_url = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Gene_Sets/MSigDB.v7.0/' +for gene_set_category in 
['c2.cgp','c2.cp.biocarta','c2.cp.kegg','c2.cp.pid','c2.cp.reactome','c5.bp','c5.cc','c5.mf','c6.all']: + for gene_name_type in ['entrez', 'symbols']: + file_name = gene_set_category+'.v7.0.'+gene_name_type+'.gmt' + local_file = candle.get_file(file_name, data_url+file_name, cache_subdir='examples/Gene_Sets/MSigDB.v7.0') + + +# Select features based on_missing_values + + +data = pd.read_csv(unique_samples, sep='\t', engine='c', + na_values=['na', '-', ''], header=0, index_col=0, low_memory=False) +id = candle.select_features_by_missing_values(data, threshold=0.1) +id = candle.select_features_by_missing_values(data.values, threshold=0.3) + + + +# Select features based on variation + +data = pd.read_csv(unique_samples, sep='\t', engine='c', + na_values=['na', '-', ''], header=0, index_col=0, low_memory=False) +id = candle.select_features_by_variation(data, variation_measure='var', threshold=100, portion=None, + draw_histogram=False) +id = candle.select_features_by_variation(data, variation_measure='std', portion=0.2) + + + +# Select uncorrelated features + +data = pd.read_csv(unique_samples, sep='\t', engine='c', + na_values=['na', '-', ''], header=0, index_col=0, low_memory=False) +id = candle.select_decorrelated_features(data, method='pearson', threshold=None, random_seed=None) +id = candle.select_decorrelated_features(data, method='spearman', threshold=0.8, random_seed=10) + + + +# Generate cross-validation partitions of data + +data = pd.read_csv(response_data, + sep='\t', engine='c', na_values=['na', '-', ''], header=0, index_col=None, low_memory=False) +p = candle.generate_cross_validation_partition(range(10), n_folds=5, n_repeats=2, portions=None, random_seed=None) +p = candle.generate_cross_validation_partition(data.CELL, n_folds=5, n_repeats=1, portions=[1, 1, 1, 2], random_seed=1) + + + +# Generate gene-set-level data + +data = pd.read_csv(gene_expression, sep='\t', engine='c', + na_values=['na', '-', ''], header=0, index_col=[0, 1], low_memory=False) +data = data.iloc[:5000, :] +gene_set_data = candle.generate_gene_set_data(np.transpose(data), [i[0] for i in data.index], gene_name_type='entrez', + gene_set_category='c6.all', metric='mean', standardize=False) +gene_set_data = candle.generate_gene_set_data(np.transpose(data.values), [i[1] for i in data.index], gene_name_type='symbol', + gene_set_category='c2.cp.kegg', metric='sum', standardize=False) + + + +# Quantile normalization of gene expression data + +data = pd.read_csv(gene_expression, sep='\t', engine='c', + na_values=['na', '-', ''], header=0, index_col=[0, 1], low_memory=False) +norm_data = candle.quantile_normalization(np.transpose(data)) + + + +# Combat batch normalization on gene expression data + +data = pd.read_csv(ccle_nci60, + sep='\t', engine='c', na_values=['na', '-', ''], header=0, index_col=[0, 1], low_memory=False) + +resource = np.array([i.split('.')[0] for i in data.columns]) +id = np.where(resource == 'NCI60')[0] +norm_data_NCI60 = candle.quantile_normalization(np.transpose(data.iloc[:, id])) +id = np.where(resource == 'CCLE')[0] +norm_data_CCLE = candle.quantile_normalization(np.transpose(data.iloc[:, id])) +norm_data = pd.concat((norm_data_NCI60, norm_data_CCLE), axis=0) +norm_data = np.transpose(norm_data) +corrected_data = candle.combat_batch_effect_removal(norm_data, pd.Series([i.split('.')[0] for i in norm_data.columns], index=norm_data.columns)) + From 6946ce298bebddef85cde7409430bf1ddf50eba2 Mon Sep 17 00:00:00 2001 From: Jamal Date: Wed, 1 Apr 2020 23:27:55 -0600 Subject: [PATCH 174/331] Added 
data_dir to arguments (autogenerated path). Added some diagnostic output. --- common/P1_utils.py | 4 ++-- examples/M16/M16_test.py | 39 ++++++++++++++++++++++++++++----------- 2 files changed, 30 insertions(+), 13 deletions(-) diff --git a/common/P1_utils.py b/common/P1_utils.py index 94871e8a..217ff4bc 100644 --- a/common/P1_utils.py +++ b/common/P1_utils.py @@ -307,7 +307,7 @@ def coxen_multi_drug_gene_selection(source_data, target_data, drug_response_data return np.sort(indices) def generate_gene_set_data(data, genes, gene_name_type='entrez', gene_set_category='c6.all', metric='mean', - standardize=False): + standardize=False, data_dir='../../Data/examples/Gene_Sets/MSigDB.v7.0/'): ''' This function generates genomic data summarized at the gene set level. @@ -354,7 +354,7 @@ def generate_gene_set_data(data, genes, gene_name_type='entrez', gene_set_catego gene_set_category = gene_set_category + '.v7.0.entrez.gmt' if gene_name_type == 'symbols': gene_set_category = gene_set_category + '.v7.0.symbols.gmt' - f = open('../../Data/examples/Gene_Sets/MSigDB.v7.0/' + gene_set_category, 'r') + f = open(data_dir + gene_set_category, 'r') x = f.readlines() gene_sets = {} for i in range(len(x)): diff --git a/examples/M16/M16_test.py b/examples/M16/M16_test.py index 47dbe90a..62fdc6be 100644 --- a/examples/M16/M16_test.py +++ b/examples/M16/M16_test.py @@ -35,34 +35,51 @@ for gene_name_type in ['entrez', 'symbols']: file_name = gene_set_category+'.v7.0.'+gene_name_type+'.gmt' local_file = candle.get_file(file_name, data_url+file_name, cache_subdir='examples/Gene_Sets/MSigDB.v7.0') - +# extract base directory for gene_set data files +data_dir = local_file.split(file_name)[0] +print('Gene Set data is locally stored at ',data_dir) # Select features based on_missing_values - +print('Original dataframe') data = pd.read_csv(unique_samples, sep='\t', engine='c', na_values=['na', '-', ''], header=0, index_col=0, low_memory=False) +print(data) + +print('Testing select_features_by_missing values') +print('Threshold - 0.1') id = candle.select_features_by_missing_values(data, threshold=0.1) +print(id) +print('Threshold - 0.3') id = candle.select_features_by_missing_values(data.values, threshold=0.3) - - +print(id) # Select features based on variation -data = pd.read_csv(unique_samples, sep='\t', engine='c', - na_values=['na', '-', ''], header=0, index_col=0, low_memory=False) +#data = pd.read_csv(unique_samples, sep='\t', engine='c', +# na_values=['na', '-', ''], header=0, index_col=0, low_memory=False) +print('Testing select_features_by_variation') +print('Variabce, 100') id = candle.select_features_by_variation(data, variation_measure='var', threshold=100, portion=None, draw_histogram=False) +print(id) +print('std, 0.2') id = candle.select_features_by_variation(data, variation_measure='std', portion=0.2) +print(id) # Select uncorrelated features -data = pd.read_csv(unique_samples, sep='\t', engine='c', - na_values=['na', '-', ''], header=0, index_col=0, low_memory=False) +#data = pd.read_csv(unique_samples, sep='\t', engine='c', +# na_values=['na', '-', ''], header=0, index_col=0, low_memory=False) +print('Testing select_decorrelated_features') +print('Pearson') id = candle.select_decorrelated_features(data, method='pearson', threshold=None, random_seed=None) +print(id) +print('Spearman') id = candle.select_decorrelated_features(data, method='spearman', threshold=0.8, random_seed=10) +print(id) @@ -81,9 +98,9 @@ na_values=['na', '-', ''], header=0, index_col=[0, 1], low_memory=False) data = data.iloc[:5000, 
:] gene_set_data = candle.generate_gene_set_data(np.transpose(data), [i[0] for i in data.index], gene_name_type='entrez', - gene_set_category='c6.all', metric='mean', standardize=False) -gene_set_data = candle.generate_gene_set_data(np.transpose(data.values), [i[1] for i in data.index], gene_name_type='symbol', - gene_set_category='c2.cp.kegg', metric='sum', standardize=False) + gene_set_category='c6.all', metric='mean', standardize=False, data_dir=data_dir) +gene_set_data = candle.generate_gene_set_data(np.transpose(data.values), [i[1] for i in data.index], gene_name_type='symbols', + gene_set_category='c2.cp.kegg', metric='sum', standardize=False, data_dir=data_dir) From c55a249885477df2a7e9321ffa101f01ebd3ec9a Mon Sep 17 00:00:00 2001 From: Jamal Date: Wed, 1 Apr 2020 23:43:10 -0600 Subject: [PATCH 175/331] Added more output, fixed some typos --- examples/M16/M16_test.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/examples/M16/M16_test.py b/examples/M16/M16_test.py index 62fdc6be..9839cc99 100644 --- a/examples/M16/M16_test.py +++ b/examples/M16/M16_test.py @@ -49,23 +49,23 @@ print('Testing select_features_by_missing values') print('Threshold - 0.1') id = candle.select_features_by_missing_values(data, threshold=0.1) -print(id) +print('Column IDs', id) print('Threshold - 0.3') id = candle.select_features_by_missing_values(data.values, threshold=0.3) -print(id) +print('Column IDs', id) # Select features based on variation #data = pd.read_csv(unique_samples, sep='\t', engine='c', # na_values=['na', '-', ''], header=0, index_col=0, low_memory=False) print('Testing select_features_by_variation') -print('Variabce, 100') +print('Variance, 100') id = candle.select_features_by_variation(data, variation_measure='var', threshold=100, portion=None, draw_histogram=False) -print(id) +print('Column IDs', id) print('std, 0.2') id = candle.select_features_by_variation(data, variation_measure='std', portion=0.2) -print(id) +print('Column IDs', id) @@ -76,15 +76,16 @@ print('Testing select_decorrelated_features') print('Pearson') id = candle.select_decorrelated_features(data, method='pearson', threshold=None, random_seed=None) -print(id) +print('Column IDs', id) print('Spearman') id = candle.select_decorrelated_features(data, method='spearman', threshold=0.8, random_seed=10) -print(id) +print('Column IDs', id) # Generate cross-validation partitions of data +print('Testing generate_cross_validation_partition') data = pd.read_csv(response_data, sep='\t', engine='c', na_values=['na', '-', ''], header=0, index_col=None, low_memory=False) p = candle.generate_cross_validation_partition(range(10), n_folds=5, n_repeats=2, portions=None, random_seed=None) @@ -94,6 +95,7 @@ # Generate gene-set-level data +print('Testing generate_gene_set_data') data = pd.read_csv(gene_expression, sep='\t', engine='c', na_values=['na', '-', ''], header=0, index_col=[0, 1], low_memory=False) data = data.iloc[:5000, :] @@ -106,6 +108,7 @@ # Quantile normalization of gene expression data +print('Testing quantile_normalization') data = pd.read_csv(gene_expression, sep='\t', engine='c', na_values=['na', '-', ''], header=0, index_col=[0, 1], low_memory=False) norm_data = candle.quantile_normalization(np.transpose(data)) @@ -114,6 +117,7 @@ # Combat batch normalization on gene expression data +print('Testing combat_batch_effect_removal') data = pd.read_csv(ccle_nci60, sep='\t', engine='c', na_values=['na', '-', ''], header=0, index_col=[0, 1], low_memory=False) From 
c380aaf164587e3132738b903fce53b1ce21d595 Mon Sep 17 00:00:00 2001 From: Cristina Garcia Cardona Date: Thu, 2 Apr 2020 08:27:17 -0600 Subject: [PATCH 176/331] addded abstention functionality in new uq keras script --- Pilot1/Attn1/attn.py | 1 + Pilot1/Attn1/attn_default_model.txt | 2 +- common/candle/__init__.py | 8 + common/candle_keras/__init__.py | 8 + common/uq_keras_utils.py | 269 ++++++++++++++++++++++++++++ 5 files changed, 287 insertions(+), 1 deletion(-) create mode 100644 common/uq_keras_utils.py diff --git a/Pilot1/Attn1/attn.py b/Pilot1/Attn1/attn.py index 101cd5e8..951f89c7 100644 --- a/Pilot1/Attn1/attn.py +++ b/Pilot1/Attn1/attn.py @@ -97,6 +97,7 @@ def set_locals(self): if additional_definitions is not None: self.additional_definitions = additional_definitions + def extension_from_parameters(params, framework=''): """Construct string for saving model with annotation of parameters""" ext = framework diff --git a/Pilot1/Attn1/attn_default_model.txt b/Pilot1/Attn1/attn_default_model.txt index 0d2ef410..e6d76f98 100644 --- a/Pilot1/Attn1/attn_default_model.txt +++ b/Pilot1/Attn1/attn_default_model.txt @@ -1,5 +1,5 @@ [Global_Params] -data_url = 'ftp://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/' +data_url = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/' in='top_21_1fold_001.h5' model_name='attn' dense=[2000, 600] diff --git a/common/candle/__init__.py b/common/candle/__init__.py index 5e0f4ca0..b70b1ad9 100644 --- a/common/candle/__init__.py +++ b/common/candle/__init__.py @@ -78,6 +78,14 @@ from solr_keras import CandleRemoteMonitor from solr_keras import compute_trainable_params from solr_keras import TerminateOnTimeOut + + from uq_keras_utils import abstention_variable_initialization + from uq_keras_utils import abstention_loss + from uq_keras_utils import abs_acc + from uq_keras_utils import acc_class1 + from uq_keras_utils import abs_acc_class1 + from uq_keras_utils import modify_labels + from uq_keras_utils import AbstentionAdapt_Callback elif 'torch' in sys.modules: print ('Importing candle utils for pytorch') diff --git a/common/candle_keras/__init__.py b/common/candle_keras/__init__.py index bf37f7ec..faf7d949 100644 --- a/common/candle_keras/__init__.py +++ b/common/candle_keras/__init__.py @@ -67,3 +67,11 @@ from solr_keras import compute_trainable_params from solr_keras import TerminateOnTimeOut +#import from uq_keras_utils +from uq_keras_utils import abstention_variable_initialization +from uq_keras_utils import abstention_loss +from uq_keras_utils import abs_acc +from uq_keras_utils import acc_class1 +from uq_keras_utils import abs_acc_class1 +from uq_keras_utils import modify_labels +from uq_keras_utils import AbstentionAdapt_Callback diff --git a/common/uq_keras_utils.py b/common/uq_keras_utils.py new file mode 100644 index 00000000..1d777343 --- /dev/null +++ b/common/uq_keras_utils.py @@ -0,0 +1,269 @@ +from __future__ import absolute_import + + +from keras import backend as K + +from keras.callbacks import Callback + +from keras.utils import np_utils + +import numpy as np + +################################################################### + +# For Abstention Model + +# These are the parameters of the abstention loss +mu = None # Factor weighting abstention term in cost function (auto tunes) +mask = None # Mask for abstention: it is 1 on the output associated to the + # abstention class and 0 otherwise +nb_classes = None # integer or vector of integers with the index of the abstention class + +def 
abstention_variable_initialization(mu0, mask_, nb_classes_): + """ Function that initializes parameters of the abstention loss + + Parameters + ---------- + mu0 : float + Initial weight of abstention term in cost function + mask_ : ndarray + Numpy array to use as initialiser for global mask variable + nb_classes_ : int or ndarray + Integer or numpy array defining indices of the abstention class + """ + + global mu, mask, nb_classes + + # Parameter Initialization + mu = K.variable(value=mu0) # Weight of abstention term + mask = K.variable(value=mask_) # Mask to compute cost + nb_classes = K.variable(value=nb_classes_, dtype='int64') # integer or vector of integers with the index of the abstention class + + +def abstention_loss(y_true, y_pred): + """ Function to compute abstention loss. It is composed by two terms: (i) original loss of the multiclass classification problem, (ii) cost associated to the abstaining samples. + + Parameters + ---------- + y_true : keras tensor + True values to predict + y_pred : keras tensor + Prediction made by the model. It is assumed that this keras tensor includes extra columns to store the abstaining classes. + """ + cost = 0 + base_pred = (1-mask)*y_pred + base_true = y_true + base_cost = K.categorical_crossentropy(base_true,base_pred) + abs_pred = K.mean(mask*y_pred, axis=-1) + cost = (1.-abs_pred)*base_cost - mu*K.log(1.-abs_pred) + + return cost + + +def abs_acc(y_true, y_pred): + """ Function to estimate accuracy over the predicted samples after removing the samples where the model is abstaining. + + Parameters + ---------- + y_true : keras tensor + True values to predict + y_pred : keras tensor + Prediction made by the model. It is assumed that this keras tensor includes extra columns to store the abstaining classes. + """ + + # matching in original classes + true_pred = K.sum(K.cast(K.equal(K.argmax(y_true, axis=-1), K.argmax(y_pred, axis=-1)), 'int64')) + + # total abstention + total_abs = K.sum(K.cast(K.equal(K.argmax(y_pred, axis=-1), nb_classes), 'int64')) + + # total predicted in original classes + total_pred = K.sum(K.cast(K.equal(K.argmax(y_pred, axis=-1), K.argmax(y_pred, axis=-1)), 'int64')) + + return true_pred/(total_pred - total_abs) + + +def acc_class1(y_true, y_pred): + """ Function to estimate accuracy over the class 1 prediction. This estimation is global (i.e. abstaining samples are not removed) + + Parameters + ---------- + y_true : keras tensor + True values to predict + y_pred : keras tensor + Prediction made by the model. It is assumed that this keras tensor includes extra columns to store the abstaining classes. + """ + + # Find samples in ground truth belonging to class 1 + ytrueint = K.argmax(y_true, axis=-1) + + # Compute total number of ground truth samples in class 1 + total_true1 = K.sum(ytrueint) + + # Find samples in prediction belonging to class 1 + ypredint = K.argmax(y_pred[:,:2], axis=-1) + + # Find correctly predicted class 1 samples + true1_pred = K.sum(ytrueint*ypredint) + + # Compute accuracy in class 1 + acc = true1_pred / total_true1 + + # Since there are so few samples in class 1 + # it is possible that ground truth does not + # have any sample in class 1, leading to a divide + # by zero and not valid accuracy + # Therefore, for the accuracy to be valid + # total_true1 should be greater than zero + # otherwise, return 0. 
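+    # K.switch below evaluates to `acc` when the condition holds and to a zero
+    # tensor of matching dtype otherwise, so the metric stays well defined.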
+ + condition = K.greater(total_true1, 0) + + return K.switch(condition, acc, K.zeros_like(acc, dtype=acc.dtype)) + + +def abs_acc_class1(y_true, y_pred): + """ Function to estimate accuracy over the class 1 prediction after removing the samples where the model is abstaining + + Parameters + ---------- + y_true : keras tensor + True values to predict + y_pred : keras tensor + Prediction made by the model. It is assumed that this keras tensor includes extra columns to store the abstaining classes. + """ + + # Find locations of true 1 prediction + ytrueint = K.argmax(y_true, axis=-1) + + # Find locations that are predicted (not abstained) + mask_pred = K.cast(K.not_equal(K.argmax(y_pred, axis=-1), nb_classes), 'int64') + + # Compute total number of ground truth samples in class 1 filtering abstaining predictions + total_true1 = K.sum(ytrueint * mask_pred) + + # matching in original class 1 after removing abstention + true1_pred = K.sum(mask_pred * ytrueint * K.cast(K.equal(K.argmax(y_true, axis=-1), K.argmax(y_pred, axis=-1)), 'int64')) + + # Compute accuracy in class 1 + acc = true1_pred / total_true1 + + # Since there are so few samples in class 1 + # it is possible that ground truth does not + # have any sample in class 1, leading to a divide + # by zero and not valid accuracy + # Therefore, for the accuracy to be valid + # total_true1 should be greater than zero + # otherwise, return 0. + + condition = K.greater(total_true1, 0) + + return K.switch(condition, acc, K.zeros_like(acc, dtype=acc.dtype)) + + +class AbstentionAdapt_Callback(Callback): + """ This callback is used to adapt the parameter mu in the abstention loss. + Factor mu (weight of the abstention term in the abstention loss) is increased or decreased to try to match the accuracy set as target. + The accuracy to use must be specified as the 'monitor' argument in the initialization of the callback. It could be: the accuracy without abstention samples (abs_acc), the accuracy over class 1 without abstention samples (abs_acc_class1), etc. + If the current monitored accuracy is smaller than the target set, mu increases to promote more abstention. + If the current monitored accuracy is greater than the target set, mu decreases to promote more predictions (less abstention). + """ + + def __init__(self, monitor, init_abs_epoch=4, scale_factor=0.95, target_acc=0.95): + """ Initializer of the AbstentionAdapt_Callback. + Parameters + ---------- + monitor : keras metric + Metric to monitor during the run and use as base to adapt the weight of the abstention term (i.e. mu) in the asbstention cost function + init_abs_epoch : integer + Value of the epochs to start adjusting the weight of the abstention term (i.e. mu). Default: 4. + scale_factor: float + Factor to scale (increase by dividing or decrease by multiplying) the weight of the abstention term (i.e. mu). Default: 0.95. + target_acc: float + Target accuracy to achieve in the current training. Default: 0.95. + """ + super(AbstentionAdapt_Callback, self).__init__() + + self.monitor = monitor + self.init_abs_epoch = init_abs_epoch # epoch to init abstention + self.scale_factor = scale_factor # factor to scale mu (weight for abstention term in cost function) + self.target_acc = target_acc # target accuracy (value specified as parameter of the run) + self.muvalues = [] # array to store mu evolution + + + def on_epoch_end(self, epoch, logs=None): + """ Initializer of the AbstentionAdapt_Callback. + Parameters + ---------- + epoch : integer + Current epoch in training. 
+ logs : keras logs + Metrics stored during current keras training. + """ + + new_mu_val = K.get_value(mu) + if epoch > self.init_abs_epoch: + + current = logs.get(self.monitor) + + if current is None: + warnings.warn( 'Abstention Adapt conditioned on metric `%s` ' 'which is not available. Available metrics are: %s' % (self.monitor, ','.join(list(logs.keys()))), RuntimeWarning) + else: + # modify mu as needed + if current > self.target_acc: #increase abstention penalty + new_mu_val /= self.scale_factor + elif current < self.target_acc: #decrease abstention penalty + new_mu_val *= self.scale_factor + + K.set_value(mu, new_mu_val) + self.muvalues.append( new_mu_val ) + + #print('epoch: %d, mu: %f' % (epoch, new_mu_val)) + + +def modify_labels(numclasses_out, ytrain, ytest, yval): + """ This function generates a categorical representation with a class added for indicating abstention. + + Parameters + ---------- + numclasses_out : integer + Original number of classes + 1 abstention class + ytrain : ndarray + Numpy array of the classes (labels) in the training set + ytest : ndarray + Numpy array of the classes (labels) in the testing set + yval : ndarray + Numpy array of the classes (labels) in the validation set + """ + + classestrain = np.max(ytrain) + 1 + classestest = np.max(ytest) + 1 + classesval = np.max(yval) + 1 + + assert( classestrain == classestest ) + assert( classesval == classestest ) + assert( (classestrain+1) == numclasses_out ) # In this case only one other slot for abstention is created + + labels_train = np_utils.to_categorical( ytrain, numclasses_out ) + labels_test = np_utils.to_categorical( ytest, numclasses_out ) + labels_val = np_utils.to_categorical( yval, numclasses_out ) + + # For sanity check + mask_vec = np.zeros(labels_train.shape) + mask_vec[:,-1] = 1 + i = np.random.choice(range(labels_train.shape[0])) + sanity_check = mask_vec[i,:]*labels_train[i,:] + print(sanity_check.shape) + if ytrain.ndim > 1: + ll = ytrain.shape[1] + else: + ll = 0 + + for i in range( ll ): + for j in range( numclasses_out ): + if sanity_check[i,j] == 1: + print('Problem at ',i,j) + + return labels_train, labels_test, labels_val + +################################################################### From 5ad46edf17f621fa4edb1a7a21fc4decf54a3000 Mon Sep 17 00:00:00 2001 From: Harry Yoo Date: Thu, 2 Apr 2020 13:18:26 -0500 Subject: [PATCH 177/331] handle both tf.keras and keras models --- common/solr_keras.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/common/solr_keras.py b/common/solr_keras.py index a4944b77..f3d0fdfd 100644 --- a/common/solr_keras.py +++ b/common/solr_keras.py @@ -5,7 +5,6 @@ import numpy as np import requests -from keras import backend as K from keras.callbacks import Callback @@ -20,10 +19,17 @@ def compute_trainable_params(model): ---------- python dictionary that contains trainable_params, non_trainable_params and total_params """ + if str(type(model)).startswith(" Date: Thu, 2 Apr 2020 16:33:29 -0600 Subject: [PATCH 178/331] Bug fixes, improved test output. 
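A minimal illustrative sketch of the keras/tf.keras dispatch added to compute_trainable_params in the preceding patch; the helper name is hypothetical and both backends are assumed to expose count_params:

    import numpy as np

    def count_trainable_params(model):
        # Standalone keras models have type strings like "<class 'keras.engine...'>";
        # anything else is treated here as a tf.keras model.
        if str(type(model)).startswith("<class 'keras."):
            from keras import backend as K
        else:
            from tensorflow.keras import backend as K
        trainable = int(np.sum([K.count_params(w) for w in model.trainable_weights]))
        non_trainable = int(np.sum([K.count_params(w) for w in model.non_trainable_weights]))
        return {'trainable_params': trainable,
                'non_trainable_params': non_trainable,
                'total_params': trainable + non_trainable}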
--- common/P1_utils.py | 6 +- examples/M16/M16_test.py | 167 ++++++++++++++++++++++++++------------- 2 files changed, 114 insertions(+), 59 deletions(-) diff --git a/common/P1_utils.py b/common/P1_utils.py index 217ff4bc..211489f6 100644 --- a/common/P1_utils.py +++ b/common/P1_utils.py @@ -1,11 +1,13 @@ import sys -import pandas as pd import numpy as np -import patsy import numpy.linalg as la +import pandas as pd +import patsy from sklearn.feature_selection import mutual_info_regression +from sklearn.preprocessing import StandardScaler import statsmodels.api as sm +from feature_selection_utils import select_features_by_variation ################### Auxiliary functions of COXEN start here #################### diff --git a/examples/M16/M16_test.py b/examples/M16/M16_test.py index 9839cc99..6825ceff 100644 --- a/examples/M16/M16_test.py +++ b/examples/M16/M16_test.py @@ -3,22 +3,23 @@ import pandas as pd import numpy as np import keras +import warnings +warnings.filterwarnings("ignore") + -#from Milestone_16_Functions import select_features_by_missing_values, select_features_by_variation, select_decorrelated_features, \ -#quantile_normalization, generate_cross_validation_partition, generate_gene_set_data, combat_batch_effect_removal file_path = os.path.dirname(os.path.realpath(__file__)) -# lib_path = os.path.abspath(os.path.join(file_path, '..')) -# sys.path.append(lib_path) lib_path2 = os.path.abspath(os.path.join(file_path, "..", "..", "common")) sys.path.append(lib_path2) + + import candle # download all the data if needed from the repo data_url = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Data_For_Testing/' file_name = 'small_drug_descriptor_data_unique_samples.txt' -unique_samples = candle.get_file(file_name, data_url+file_name, cache_subdir='examples') +drug_descriptor = candle.get_file(file_name, data_url+file_name, cache_subdir='examples') file_name = 'small_drug_response_data.txt' response_data = candle.get_file(file_name, data_url+file_name, cache_subdir='examples') @@ -29,6 +30,8 @@ file_name = 'CCLE_NCI60_Gene_Expression_Full_Data.txt' ccle_nci60 = candle.get_file(file_name, data_url+file_name, cache_subdir='examples') + + # download all the gene_set files needed data_url = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Gene_Sets/MSigDB.v7.0/' for gene_set_category in ['c2.cgp','c2.cp.biocarta','c2.cp.kegg','c2.cp.pid','c2.cp.reactome','c5.bp','c5.cc','c5.mf','c6.all']: @@ -37,96 +40,146 @@ local_file = candle.get_file(file_name, data_url+file_name, cache_subdir='examples/Gene_Sets/MSigDB.v7.0') # extract base directory for gene_set data files data_dir = local_file.split(file_name)[0] -print('Gene Set data is locally stored at ',data_dir) +print('Gene Set data is locally stored at ', data_dir) -# Select features based on_missing_values -print('Original dataframe') -data = pd.read_csv(unique_samples, sep='\t', engine='c', - na_values=['na', '-', ''], header=0, index_col=0, low_memory=False) -print(data) -print('Testing select_features_by_missing values') -print('Threshold - 0.1') +# Select features based on_missing_values +print('\n') +print('Testing select_features_by_missing_values') +print('Drug descriptor dataframe includes 10 drugs (rows) and 10 drug descriptor features (columns)') +data = pd.read_csv(drug_descriptor, sep='\t', engine='c', na_values=['na', '-', ''], header=0, index_col=0, low_memory=False) +print(data) +print('Select features with 
missing rates smaller than 0.1') id = candle.select_features_by_missing_values(data, threshold=0.1) -print('Column IDs', id) -print('Threshold - 0.3') +print('Feature IDs', id) +print('Select features with missing rates smaller than 0.3') id = candle.select_features_by_missing_values(data.values, threshold=0.3) -print('Column IDs', id) +print('Feature IDs', id) + -# Select features based on variation -#data = pd.read_csv(unique_samples, sep='\t', engine='c', -# na_values=['na', '-', ''], header=0, index_col=0, low_memory=False) +# Select features based on variation +print('\n') print('Testing select_features_by_variation') -print('Variance, 100') -id = candle.select_features_by_variation(data, variation_measure='var', threshold=100, portion=None, - draw_histogram=False) -print('Column IDs', id) -print('std, 0.2') +print('Select features with a variance larger than 100') +id = candle.select_features_by_variation(data, variation_measure='var', threshold=100, portion=None, draw_histogram=False) +print('Feature IDs', id) +print('Select the top 2 features with the largest standard deviation') id = candle.select_features_by_variation(data, variation_measure='std', portion=0.2) -print('Column IDs', id) - +print('Feature IDs', id) -# Select uncorrelated features -#data = pd.read_csv(unique_samples, sep='\t', engine='c', -# na_values=['na', '-', ''], header=0, index_col=0, low_memory=False) +# Select decorrelated features +print('\n') print('Testing select_decorrelated_features') -print('Pearson') -id = candle.select_decorrelated_features(data, method='pearson', threshold=None, random_seed=None) -print('Column IDs', id) -print('Spearman') +print('Select features that are not identical to each other and are not all missing.') +id = candle.select_decorrelated_features(data, threshold=None, random_seed=None) +print('Feature IDs', id) +print('Select features whose absolute mutual Spearman correlation coefficient is smaller than 0.8') id = candle.select_decorrelated_features(data, method='spearman', threshold=0.8, random_seed=10) -print('Column IDs', id) +print('Feature IDs', id) # Generate cross-validation partitions of data - +print('\n') print('Testing generate_cross_validation_partition') -data = pd.read_csv(response_data, - sep='\t', engine='c', na_values=['na', '-', ''], header=0, index_col=None, low_memory=False) +print('Generate 5-fold cross-validation partition of 10 samples twice') p = candle.generate_cross_validation_partition(range(10), n_folds=5, n_repeats=2, portions=None, random_seed=None) +print(p) +print('Drug response data of 5 cell lines treated by various drugs.') +data = pd.read_csv(response_data, sep='\t', engine='c', na_values=['na', '-', ''], header=0, index_col=None, low_memory=False) +print(data) +print('Generate partition indices to divide the data into 4 sets without shared cell lines for 5 times.') p = candle.generate_cross_validation_partition(data.CELL, n_folds=5, n_repeats=1, portions=[1, 1, 1, 2], random_seed=1) - - - -# Generate gene-set-level data - -print('Testing generate_gene_set_data') -data = pd.read_csv(gene_expression, sep='\t', engine='c', - na_values=['na', '-', ''], header=0, index_col=[0, 1], low_memory=False) -data = data.iloc[:5000, :] -gene_set_data = candle.generate_gene_set_data(np.transpose(data), [i[0] for i in data.index], gene_name_type='entrez', - gene_set_category='c6.all', metric='mean', standardize=False, data_dir=data_dir) -gene_set_data = candle.generate_gene_set_data(np.transpose(data.values), [i[1] for i in data.index], 
gene_name_type='symbols', - gene_set_category='c2.cp.kegg', metric='sum', standardize=False, data_dir=data_dir) +print(p) # Quantile normalization of gene expression data - +print('\n') print('Testing quantile_normalization') -data = pd.read_csv(gene_expression, sep='\t', engine='c', - na_values=['na', '-', ''], header=0, index_col=[0, 1], low_memory=False) +print('Gene expression data of 897 cell lines (columns) and 17741 genes (rows).') +data = pd.read_csv(gene_expression, sep='\t', engine='c', na_values=['na', '-', ''], header=0, index_col=[0, 1], low_memory=False) +print(data) +print('Before normalization') +third_quartile = data.quantile(0.75, axis=0) +print('Max difference of third quartile between cell lines is ' + str(np.round(a=np.max(third_quartile) - np.min(third_quartile), decimals=2))) +second_quartile = data.quantile(0.5, axis=0) +print('Max difference of median between cell lines is ' + str(np.round(a=np.max(second_quartile) - np.min(second_quartile), decimals=2))) +first_quartile = data.quantile(0.25, axis=0) +print('Max difference of first quartile between cell lines is ' + str(np.round(a=np.max(first_quartile) - np.min(first_quartile), decimals=2))) norm_data = candle.quantile_normalization(np.transpose(data)) +norm_data = np.transpose(norm_data) +print('After normalization') +third_quartile = norm_data.quantile(0.75, axis=0) +print('Max difference of third quartile between cell lines is ' + str(np.round(a=np.max(third_quartile) - np.min(third_quartile), decimals=2))) +second_quartile = norm_data.quantile(0.5, axis=0) +print('Max difference of median between cell lines is ' + str(np.round(a=np.max(second_quartile) - np.min(second_quartile), decimals=2))) +first_quartile = norm_data.quantile(0.25, axis=0) +print('Max difference of first quartile between cell lines is ' + str(np.round(a=np.max(first_quartile) - np.min(first_quartile), decimals=2))) -# Combat batch normalization on gene expression data +# Generate gene-set-level data +print('\n') +print('Testing generate_gene_set_data') +gene_set_data = candle.generate_gene_set_data(np.transpose(norm_data), [i[0] for i in norm_data.index], gene_name_type='entrez', + gene_set_category='c6.all', metric='mean', standardize=True, data_dir=data_dir) +print('Generate gene-set-level data of 897 cell lines and 189 oncogenic signature gene sets') +print(gene_set_data) +gene_set_data = candle.generate_gene_set_data(np.transpose(norm_data), [i[1] for i in norm_data.index], gene_name_type='symbols', + gene_set_category='c2.cp.kegg', metric='sum', standardize=True, data_dir=data_dir) +print('Generate gene-set-level data of 897 cell lines and 186 KEGG pathways') +print(gene_set_data) + -print('Testing combat_batch_effect_removal') -data = pd.read_csv(ccle_nci60, - sep='\t', engine='c', na_values=['na', '-', ''], header=0, index_col=[0, 1], low_memory=False) +# Combat batch normalization on gene expression data +print('\n') +print('Testing combat_batch_effect_removal') +print('Gene expression data of 60 NCI60 cell lines and 1018 CCLE cell lines with 17741 genes.') +data = pd.read_csv(ccle_nci60, sep='\t', engine='c', na_values=['na', '-', ''], header=0, index_col=[0, 1], low_memory=False) +print(data) resource = np.array([i.split('.')[0] for i in data.columns]) + +print('Before removal of batch effect between NCI60 and CCLE datasets') + +# Identify NCI60 cell lines and quantile normalize their gene expression data id = np.where(resource == 'NCI60')[0] norm_data_NCI60 = candle.quantile_normalization(np.transpose(data.iloc[:, id])) 
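+# The genes-by-cell-lines frame is transposed so that quantile_normalization runs on
+# a cell-lines-by-genes (samples-by-features) matrix for each subset.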
+print('Average third quartile of NCI60 cell lines is ' + str(np.round(a=np.mean(norm_data_NCI60.quantile(0.75, axis=1)), decimals=2))) +print('Average median of NCI60 cell lines is ' + str(np.round(a=np.mean(norm_data_NCI60.quantile(0.5, axis=1)), decimals=2))) +print('Average first quartile of NCI60 cell lines is ' + str(np.round(a=np.mean(norm_data_NCI60.quantile(0.25, axis=1)), decimals=2))) + +# Identify CCLE cell lines and quantile normalize their gene expression data id = np.where(resource == 'CCLE')[0] norm_data_CCLE = candle.quantile_normalization(np.transpose(data.iloc[:, id])) +print('Average third quartile of CCLE cell lines is ' + str(np.round(a=np.mean(norm_data_CCLE.quantile(0.75, axis=1)), decimals=2))) +print('Average median of CCLE cell lines is ' + str(np.round(a=np.mean(norm_data_CCLE.quantile(0.5, axis=1)), decimals=2))) +print('Average first quartile of CCLE cell lines is ' + str(np.round(a=np.mean(norm_data_CCLE.quantile(0.25, axis=1)), decimals=2))) + +# Combine normalized data of NCI60 cell lines and CCLE cell lines norm_data = pd.concat((norm_data_NCI60, norm_data_CCLE), axis=0) norm_data = np.transpose(norm_data) + +# Apply ComBat algorithm to remove the batch effect between NCI60 and CCLE corrected_data = candle.combat_batch_effect_removal(norm_data, pd.Series([i.split('.')[0] for i in norm_data.columns], index=norm_data.columns)) +print('After removal of batch effect between NCI60 and CCLE datasets') + +resource = np.array([i.split('.')[0] for i in corrected_data.columns]) +id = np.where(resource == 'NCI60')[0] +corrected_data_NCI60 = np.transpose(corrected_data.iloc[:, id]) +print('Average third quartile of NCI60 cell lines is ' + str(np.round(a=np.mean(corrected_data_NCI60.quantile(0.75, axis=1)), decimals=2))) +print('Average median of NCI60 cell lines is ' + str(np.round(a=np.mean(corrected_data_NCI60.quantile(0.5, axis=1)), decimals=2))) +print('Average first quartile of NCI60 cell lines is ' + str(np.round(a=np.mean(corrected_data_NCI60.quantile(0.25, axis=1)), decimals=2))) + +# Identify CCLE cell lines and quantile normalize their gene expression data +id = np.where(resource == 'CCLE')[0] +corrected_data_CCLE = np.transpose(corrected_data.iloc[:, id]) +print('Average third quartile of CCLE cell lines is ' + str(np.round(a=np.mean(corrected_data_CCLE.quantile(0.75, axis=1)), decimals=2))) +print('Average median of CCLE cell lines is ' + str(np.round(a=np.mean(corrected_data_CCLE.quantile(0.5, axis=1)), decimals=2))) +print('Average first quartile of CCLE cell lines is ' + str(np.round(a=np.mean(corrected_data_CCLE.quantile(0.25, axis=1)), decimals=2))) From 9866b9d65c2f878e3d14466a035a889fb3a9ef5c Mon Sep 17 00:00:00 2001 From: Cristina Garcia Cardona Date: Thu, 2 Apr 2020 17:26:40 -0600 Subject: [PATCH 179/331] Added abstention demo for attention model --- Pilot1/Attn1/attn.py | 2 +- Pilot1/Attn1/attn_abs_default_model.txt | 28 ++ Pilot1/Attn1/attn_abstention_keras2.py | 520 ++++++++++++++++++++++++ Pilot1/Attn1/attn_baseline_keras2.py | 389 +++++------------- Pilot1/Attn1/attn_default_model.txt | 9 +- Pilot1/Attn1/attn_viz_utils.py | 83 ++++ common/uq_keras_utils.py | 2 +- common/viz_utils.py | 2 + 8 files changed, 744 insertions(+), 291 deletions(-) create mode 100644 Pilot1/Attn1/attn_abs_default_model.txt create mode 100644 Pilot1/Attn1/attn_abstention_keras2.py create mode 100644 Pilot1/Attn1/attn_viz_utils.py diff --git a/Pilot1/Attn1/attn.py b/Pilot1/Attn1/attn.py index 951f89c7..32b91301 100644 --- a/Pilot1/Attn1/attn.py +++ 
b/Pilot1/Attn1/attn.py @@ -104,7 +104,7 @@ def extension_from_parameters(params, framework=''): for i, n in enumerate(params['dense']): if n: ext += '.D{}={}'.format(i+1, n) - ext += '.A={}'.format(params['activation']) + ext += '.A={}'.format(params['activation'][0]) ext += '.B={}'.format(params['batch_size']) ext += '.E={}'.format(params['epochs']) ext += '.L={}'.format(params['latent_dim']) diff --git a/Pilot1/Attn1/attn_abs_default_model.txt b/Pilot1/Attn1/attn_abs_default_model.txt new file mode 100644 index 00000000..608118cc --- /dev/null +++ b/Pilot1/Attn1/attn_abs_default_model.txt @@ -0,0 +1,28 @@ +[Global_Params] +data_url = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/' +in='top_21_1fold_001.h5' +model_name='attn_abs' +dense=[1000, 1000, 1000, 500, 250, 125, 60, 30, 2] +batch_size=32 +epochs=2 +activation=['relu', 'relu', 'softmax', 'relu', 'relu', 'relu', 'relu', 'relu', 'softmax'] +loss='categorical_crossentropy' +optimizer='sgd' +drop=0.2 +learning_rate=0.00001 +momentum=0.9 +validation_split=0.1 +rng_seed=2017 +use_cp=False +early_stop=True +reduce_lr=True +feature_subsample=0 +output_dir='save_abs/EXP01/' +experiment_id='01' +run_id='1' +save_path='save_abs/EXP01/' +target_abs_acc=0.85 + +[Monitor_Params] +solr_root='' +timeout=3600 diff --git a/Pilot1/Attn1/attn_abstention_keras2.py b/Pilot1/Attn1/attn_abstention_keras2.py new file mode 100644 index 00000000..e1b096ae --- /dev/null +++ b/Pilot1/Attn1/attn_abstention_keras2.py @@ -0,0 +1,520 @@ +from __future__ import print_function + +#import itertools +import pandas as pd +import numpy as np +import os +import sys +import gzip +import argparse +import sklearn + +import tensorflow as tf + +import keras as ke +from keras import backend as K + +from keras.layers import Input, Dense, Dropout, Activation, BatchNormalization +from keras.optimizers import SGD, Adam, RMSprop, Adadelta +from keras.models import Sequential, Model, model_from_json, model_from_yaml +from keras.utils import np_utils, multi_gpu_model + +from keras.callbacks import Callback, ModelCheckpoint, CSVLogger, ReduceLROnPlateau, EarlyStopping, TensorBoard + +from sklearn.utils.class_weight import compute_class_weight +from sklearn.model_selection import train_test_split +from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, roc_auc_score, confusion_matrix, balanced_accuracy_score, classification_report +from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler +from sklearn.metrics import recall_score, auc, roc_curve, f1_score, precision_recall_curve + +import attn +import candle + +import attn_viz_utils as attnviz + +np.set_printoptions(precision=4) + +additional_definitions = [ +{'name':'target_abs_acc', + 'type': float, + 'default': 0.7, + 'help':'target abstention accuracy'}, +{'name':'abs_scale_factor', + 'type': float, + 'default': 0.9, + 'help':'factor to increase or decrease weight for abstention term in cost function'} +] + +required = [ + 'activation', + 'batch_size', + 'dense', + 'drop', + 'epochs', + 'learning_rate', + 'loss', + 'optimizer', + 'rng_seed', + 'validation_split', + 'solr_root', + 'timeout', + 'target_abs_acc'] + + +class BenchmarkAttnAbs(candle.Benchmark): + def set_locals(self): + """Functionality to set variables specific for the benchmark + - required: set of required parameters for the benchmark. + - additional_definitions: list of dictionaries describing the additional parameters for the + benchmark. 
+ """ + + if required is not None: + self.required = set(required) + if additional_definitions is not None: + self.additional_definitions = additional_definitions + attn.additional_definitions + + + +def tf_auc(y_true, y_pred): + auc = tf.metrics.auc(y_true, y_pred)[1] + K.get_session().run(tf.local_variables_initializer()) + return auc + + +def auroc( y_true, y_pred ) : + score = tf.py_func( lambda y_true, y_pred : roc_auc_score( y_true, y_pred, average='macro', sample_weight=None).astype('float32'), + [y_true, y_pred], + 'float32', + stateful=False, + name='sklearnAUC' ) + return score + + +class MetricHistory(Callback): + def on_epoch_begin(self, epoch, logs=None): + print("\n") + + def on_epoch_end(self, epoch, logs=None): + y_pred = self.model.predict(self.validation_data[0]) + r2 = r2_score(self.validation_data[1], y_pred) + corr, _ = pearsonr(self.validation_data[1].flatten(), y_pred.flatten()) + print("\nval_r2:", r2) + print(y_pred.shape) + print("\nval_corr:", corr, "val_r2:", r2) + print("\n") + + +def build_type_classifier(x_train, y_train, x_test, y_test): + y_train = np.argmax(y_train, axis=1) + y_test = np.argmax(y_test, axis=1) + from xgboost import XGBClassifier + clf = XGBClassifier(max_depth=6, n_estimators=100) + clf.fit(x_train, y_train, eval_set=[(x_train, y_train), (x_test, y_test)], verbose=False) + y_pred = clf.predict(x_test) + acc = accuracy_score(y_test, y_pred) + print(acc) + return clf + + +def initialize_parameters(default_model = 'attn_abs_default_model.txt'): + + # Build benchmark object + attnAbsBmk = BenchmarkAttnAbs(attn.file_path, default_model, 'keras', + prog='attention_abstention', desc='Attention model with abstention - Pilot 1 Benchmark') + + # Initialize parameters + gParameters = candle.finalize_parameters(attnAbsBmk) + #attn.logger.info('Params: {}'.format(gParameters)) + + return gParameters + + +def save_cache(cache_file, x_train, y_train, x_val, y_val, x_test, y_test, x_labels, y_labels): + with h5py.File(cache_file, 'w') as hf: + hf.create_dataset("x_train", data=x_train) + hf.create_dataset("y_train", data=y_train) + hf.create_dataset("x_val", data=x_val) + hf.create_dataset("y_val", data=y_val) + hf.create_dataset("x_test", data=x_test) + hf.create_dataset("y_test", data=y_test) + hf.create_dataset("x_labels", (len(x_labels), 1), 'S100', data=[x.encode("ascii", "ignore") for x in x_labels]) + hf.create_dataset("y_labels", (len(y_labels), 1), 'S100', data=[x.encode("ascii", "ignore") for x in y_labels]) + + +def load_cache(cache_file): + with h5py.File(cache_file, 'r') as hf: + x_train = hf['x_train'][:] + y_train = hf['y_train'][:] + x_val = hf['x_val'][:] + y_val = hf['y_val'][:] + x_test = hf['x_test'][:] + y_test = hf['y_test'][:] + x_labels = [x[0].decode('unicode_escape') for x in hf['x_labels'][:]] + y_labels = [x[0].decode('unicode_escape') for x in hf['y_labels'][:]] + return x_train, y_train, x_val, y_val, x_test, y_test, x_labels, y_labels + +def extension_from_parameters(params, framework=''): + """Construct string for saving model with annotation of parameters""" + ext = framework + '.abs' + for i, n in enumerate(params['dense']): + if n: + ext += '.D{}={}'.format(i+1, n) + ext += '.A={}'.format(params['activation'][0]) + ext += '.B={}'.format(params['batch_size']) + ext += '.E={}'.format(params['epochs']) + ext += '.LR={}'.format(params['learning_rate']) + + if params['drop']: + ext += '.DR={}'.format(params['drop']) + if params['warmup_lr']: + ext += '.WU_LR' + if params['reduce_lr']: + ext += '.Re_LR' + if 
params['residual']: + ext += '.Res' + + return ext + + +def build_attention_model_with_abstention(params, PS): + + assert (len(params['dense']) == len(params['activation'])) + assert (len(params['dense']) > 3) + + DR = params['drop'] + inputs = Input(shape=(PS,)) + x = Dense(params['dense'][0], activation=params['activation'][0])(inputs) + x = BatchNormalization()(x) + a = Dense(params['dense'][1], activation=params['activation'][1])(x) + a = BatchNormalization()(a) + b = Dense(params['dense'][2], activation=params['activation'][2])(x) + x = ke.layers.multiply([a,b]) + + for i in range(3, len(params['dense'])-1): + x = Dense(params['dense'][i], activation=params['activation'][i])(x) + x = BatchNormalization()(x) + x = Dropout(DR)(x) + + # Abstention part + outputs = Dense(params['dense'][-1]+1, activation='sigmoid')(x) + model = Model(inputs=inputs, outputs=outputs) + model.summary() + + return model + + +def run(params): + args = candle.ArgumentStruct(**params) + seed = args.rng_seed + candle.set_seed(seed) + + # Construct extension to save model + ext = extension_from_parameters(params, 'keras') + candle.verify_path(params['save_path']) + prefix = '{}{}'.format(params['save_path'], ext) + logfile = params['logfile'] if params['logfile'] else prefix+'.log' + root_fname = 'Agg_attn_abs_bin' + candle.set_up_logger(logfile, attn.logger, params['verbose']) + attn.logger.info('Params: {}'.format(params)) + + # Get default parameters for initialization and optimizer functions + keras_defaults = candle.keras_default_config() + + ## + X_train, _Y_train, X_val, _Y_val, X_test, _Y_test = attn.load_data(params, seed) + + # move this inside the load_data function + Y_train = _Y_train['AUC'] + Y_test = _Y_test['AUC'] + Y_val = _Y_val['AUC'] + + Y_train_neg, Y_train_pos = np.bincount(Y_train) + Y_test_neg, Y_test_pos = np.bincount(Y_test) + Y_val_neg, Y_val_pos = np.bincount(Y_val) + + Y_train_total = Y_train_neg + Y_train_pos + Y_test_total = Y_test_neg + Y_test_pos + Y_val_total = Y_val_neg + Y_val_pos + + total = Y_train_total + Y_test_total + Y_val_total + neg = Y_train_neg + Y_test_neg + Y_val_neg + pos = Y_train_pos + Y_test_pos + Y_val_pos + + print('Examples:\n Total: {}\n Positive: {} ({:.2f}% of total)\n'.format( + total, pos, 100 * pos / total)) + + nb_classes = params['dense'][-1] + + # Convert classes to categorical with an extra slot for the abstaining class + Y_train, Y_test, Y_val = candle.modify_labels(nb_classes+1, Y_train, Y_test, Y_val) + + # Disable class weight (for initial testing of the abstention classifier) + #y_integers = np.argmax(Y_train, axis=1) + #class_weights = compute_class_weight('balanced', np.unique(y_integers), y_integers) + #d_class_weights = dict(enumerate(class_weights)) + + print('X_train shape:', X_train.shape) + print('X_test shape:', X_test.shape) + + print('Y_train shape:', Y_train.shape) + print('Y_test shape:', Y_test.shape) + + PS = X_train.shape[1] + model = build_attention_model_with_abstention(params, PS) + + # Configure abstention model + mask_ = np.zeros(nb_classes+1) + mask_[-1] = 1 + mu0 = 0.5 # In the long term this is not as important since mu auto tunes, however it may require a large number of epochs to converge if set far away from target + + candle.abstention_variable_initialization(mu0, mask_, nb_classes) + + #parallel_model = multi_gpu_model(model, gpus=4) + #parallel_model.compile(loss='mean_squared_error', + # optimizer=SGD(lr=0.0001, momentum=0.9), + # metrics=['mae',r2]) + kerasDefaults = candle.keras_default_config() + if 
params['momentum']: + kerasDefaults['momentum_sgd'] = params['momentum'] + + optimizer = candle.build_optimizer(params['optimizer'], params['learning_rate'], kerasDefaults) + + # compile model with abstention loss + model.compile(loss=candle.abstention_loss, optimizer=optimizer, metrics=['acc',tf_auc,candle.abs_acc,candle.acc_class1,candle.abs_acc_class1]) + + + # set up a bunch of callbacks to do work during model training.. + checkpointer = ModelCheckpoint(filepath=params['save_path'] + root_fname + '.autosave.model.h5', verbose=1, save_weights_only=False, save_best_only=True) + csv_logger = CSVLogger('{}/{}.training.log'.format(params['save_path'], root_fname)) + reduce_lr = ReduceLROnPlateau(monitor='val_tf_auc', factor=0.20, patience=40, verbose=1, mode='auto', min_delta=0.0001, cooldown=3, min_lr=0.000000001) + early_stop = EarlyStopping(monitor='val_tf_auc', patience=200, verbose=1, mode='auto') + candle_monitor = candle.CandleRemoteMonitor(params=params) + + candle_monitor = candle.CandleRemoteMonitor(params=params) + timeout_monitor = candle.TerminateOnTimeOut(params['timeout']) + tensorboard = TensorBoard(log_dir="tb/tb{}".format(ext)) + + history_logger = candle.LoggingCallback(attn.logger.debug) + + abstention_cbk = candle.AbstentionAdapt_Callback(monitor='val_abs_acc_class1', scale_factor=params['abs_scale_factor'], target_acc=params['target_abs_acc']) + + callbacks = [candle_monitor, timeout_monitor, csv_logger, history_logger, abstention_cbk] + + if params['reduce_lr']: + callbacks.append(reduce_lr) + + if params['use_cp']: + callbacks.append(checkpointer) + if params['use_tb']: + callbacks.append(tensorboard) + if params['early_stop']: + callbacks.append(early_stop) + + epochs = params['epochs'] + batch_size=params['batch_size'] + history = model.fit(X_train, Y_train, #class_weight=d_class_weights, + batch_size=batch_size, + epochs=epochs, + verbose=1, + validation_data=(X_val, Y_val), + callbacks = callbacks) + + # diagnostic plots + if 'loss' in history.history.keys(): + candle.plot_history(params['save_path'] + root_fname, history, 'loss') + if 'acc' in history.history.keys(): + candle.plot_history(params['save_path'] + root_fname, history, 'acc') + if 'abs_acc' in history.history.keys(): + candle.plot_history(params['save_path'] + root_fname, history, 'abs_acc') + # Plot mu evolution + fname = params['save_path'] + root_fname + '.mu.png' + xlabel='Epochs' + ylabel='Abstention Weight mu' + title='mu Evolution' + attnviz.plot_array(abstention_cbk.muvalues, xlabel, ylabel, title, fname) + + # Evaluate model + score = model.evaluate(X_test, Y_test, verbose=0) + Y_predict = model.predict(X_test) + evaluate_abstention(params, root_fname, nb_classes, Y_test, _Y_test, Y_predict, pos, total, score) + + save_and_test_saved_model(params, model, root_fname, X_train, X_test, Y_test) + + attn.logger.handlers = [] + + return history + + +def evaluate_abstention(params, root_fname, nb_classes, Y_test, _Y_test, Y_predict, pos, total, score): + Y_pred_int = np.argmax(Y_predict, axis=1).astype(np.int) + Y_test_int = np.argmax(Y_test, axis=1).astype(np.int) + + # Get samples where it abstains from predicting + Y_pred_abs = (Y_pred_int == nb_classes).astype(np.int) + + abs0 = 0 + abs1 = 0 + print ('creating table of predictions (with abstention)') + f = open(params['save_path'] + root_fname + '.predictions.tsv', 'w') + + for index, row in _Y_test.iterrows(): + + if row['AUC'] == 1: + if Y_pred_abs[index] == 1: # abstaining in this sample + call='ABS1' + abs1 += 1 + else: # Prediction is 
made (no abstention) + if Y_pred_int[index] == 1: + call='TP' + else: + call='FN' + if row['AUC'] == 0: + if Y_pred_abs[index] == 1: # abstaining in this sample + call='ABS0' + abs0 += 1 + else: # Prediction is made (no abstention) + if Y_pred_int[index] == 0: + call = 'TN' + else: + call = 'FP' + + print(index, "\t", call, "\t", Y_pred_int[index], "\t", row['AUC'], "\t", Y_pred_abs[index], "\t", row['Sample'], "\t", row['Drug1'], file=f) + + f.close() + + # Filtering samples by predictions made (i.e. leave just the predicted samples where there is NO abstention) + index_pred_noabs = (Y_pred_int < nb_classes) + Y_test_noabs = Y_test[index_pred_noabs,:2] + Y_test_int_noabs = Y_test_int[index_pred_noabs] + Y_pred_noabs = Y_predict[index_pred_noabs,:2] / np.sum(Y_predict[index_pred_noabs,:2], axis=1, keepdims=True) + Y_pred_int_noabs = Y_pred_int[index_pred_noabs] + false_pos_rate, true_pos_rate, thresholds = roc_curve(Y_test_noabs[:,0], Y_pred_noabs[:,0]) + + roc_auc = auc(false_pos_rate, true_pos_rate) + + auc_keras = roc_auc + fpr_keras = false_pos_rate + tpr_keras = true_pos_rate + + # ROC plots + fname = params['save_path'] + root_fname + '.auroc.pdf' + print ('creating figure at ', fname) + add_lbl = ' (after removing abstained samples) ' + attnviz.plot_ROC(fpr_keras, tpr_keras, auc_keras, fname, xlabel_add=add_lbl, ylabel_add=add_lbl) + # Zoom in view of the upper left corner. + fname = params['save_path'] + root_fname + '.auroc_zoom.pdf' + print ('creating figure at ', fname) + attnviz.plot_ROC(fpr_keras, tpr_keras, auc_keras, fname, xlabel_add=add_lbl, ylabel_add=add_lbl, zoom=True) + + f1 = f1_score(Y_test_int_noabs, Y_pred_int_noabs) + precision, recall, thresholds = precision_recall_curve(Y_test_noabs[:,0], Y_pred_noabs[:,0]) + pr_auc = auc(recall, precision) + pr_keras = pr_auc + precision_keras = precision + recall_keras = recall + print('f1=%.3f auroc=%.3f aucpr=%.3f' % (f1, auc_keras, pr_keras)) + # Plot RF + fname = params['save_path'] + root_fname + '.aurpr.pdf' + print ('creating figure at ', fname) + no_skill = len(Y_test_int_noabs[Y_test_int_noabs==1]) / len(Y_test_int_noabs) + attnviz.plot_RF(recall_keras, precision_keras, pr_keras, no_skill, fname, xlabel_add=add_lbl, ylabel_add=add_lbl) + + # Compute confusion matrix (complete) + cnf_matrix = sklearn.metrics.confusion_matrix(Y_test_int, Y_pred_int) + # Plot non-normalized confusion matrix + class_names=['Non-Response','Response', 'Abstain'] + fname = params['save_path'] + root_fname + '.confusion_without_norm.pdf' + attnviz.plot_confusion_matrix(cnf_matrix, fname, classes=class_names, title='Confusion matrix, without normalization') + # Plot normalized confusion matrix + fname = params['save_path'] + root_fname + '.confusion_with_norm.pdf' + attnviz.plot_confusion_matrix(cnf_matrix, fname, classes=class_names, normalize=True, title='Normalized confusion matrix') + + + print('Examples:\n Total: {}\n Positive: {} ({:.2f}% of total)\n'.format(total, pos, 100 * pos / total)) + total_pred = Y_pred_int_noabs.shape[0] + print('Abstention (in prediction): Label0: {} ({:.2f}% of total pred)\n Label1: {} ({:.2f}% of total pred)\n'.format(abs0, 100 * abs0 / total_pred, abs1, 100 * abs1 / total_pred)) + print(sklearn.metrics.roc_auc_score(Y_test_int_noabs, Y_pred_int_noabs)) + print(sklearn.metrics.balanced_accuracy_score(Y_test_int_noabs, Y_pred_int_noabs)) + print(sklearn.metrics.classification_report(Y_test_int_noabs, Y_pred_int_noabs)) + print(sklearn.metrics.confusion_matrix(Y_test_int_noabs, Y_pred_int_noabs)) + 
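+    # `score` holds the outputs of model.evaluate on the test set: index 0 is the
+    # abstention loss and index 1 the first compiled metric ('acc').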
print('Score: ', score) + print('Test val_loss (not abstained samples):', score[0]) + print('Test accuracy (not abstained samples):', score[1]) + + +def save_and_test_saved_model(params, model, root_fname, X_train, X_test, Y_test): + + # serialize model to JSON + model_json = model.to_json() + with open(params['save_path'] + root_fname + '.model.json', "w") as json_file: + json_file.write(model_json) + + # serialize model to YAML + model_yaml = model.to_yaml() + with open(params['save_path'] + root_fname + '.model.yaml', "w") as yaml_file: + + yaml_file.write(model_yaml) + + # serialize weights to HDF5 + model.save_weights(params['save_path'] + root_fname + '.model.h5') + print("Saved model to disk") + + # load json and create model + json_file = open(params['save_path'] + root_fname + '.model.json', 'r') + loaded_model_json = json_file.read() + json_file.close() + loaded_model_json = model_from_json(loaded_model_json) + + # load yaml and create model + yaml_file = open(params['save_path'] + root_fname + '.model.yaml', 'r') + loaded_model_yaml = yaml_file.read() + yaml_file.close() + loaded_model_yaml = model_from_yaml(loaded_model_yaml) + #yaml.load(input, Loader=yaml.FullLoader) + + # load weights into new model + loaded_model_json.load_weights(params['save_path'] + root_fname + '.model.h5') + #input = params['save_path'] + root_fname + '.model.h5' + #loaded_model_json.load(input, Loader=yaml.FullLoader) + #print("Loaded json model from disk") + + # evaluate json loaded model on test data + loaded_model_json.compile(loss=candle.abstention_loss, optimizer='SGD', metrics=[candle.abs_acc]) + score_json = loaded_model_json.evaluate(X_test, Y_test, verbose=0) + print('json Validation abstention loss:', score_json[0]) + print('json Validation abstention accuracy:', score_json[1]) + print("json %s: %.2f%%" % (loaded_model_json.metrics_names[1], score_json[1]*100)) + + # load weights into new model + loaded_model_yaml.load_weights(params['save_path'] + root_fname + '.model.h5') + print("Loaded yaml model from disk") + # evaluate yaml loaded model on test data + loaded_model_yaml.compile(loss=candle.abstention_loss, optimizer='SGD', metrics=[candle.abs_acc]) + score_yaml = loaded_model_yaml.evaluate(X_test, Y_test, verbose=0) + print('yaml Validation abstention loss:', score_yaml[0]) + print('yaml Validation abstention accuracy:', score_yaml[1]) + print("yaml %s: %.2f%%" % (loaded_model_yaml.metrics_names[1], score_yaml[1]*100)) + + # predict using loaded yaml model on test and training data + predict_yaml_train = loaded_model_yaml.predict(X_train) + predict_yaml_test = loaded_model_yaml.predict(X_test) + print('Yaml_train_shape:', predict_yaml_train.shape) + print('Yaml_test_shape:', predict_yaml_test.shape) + predict_yaml_train_classes = np.argmax(predict_yaml_train, axis=1) + predict_yaml_test_classes = np.argmax(predict_yaml_test, axis=1) + np.savetxt(params['save_path'] + root_fname + '_predict_yaml_train.csv', predict_yaml_train, delimiter=",", fmt="%.3f") + np.savetxt(params['save_path'] + root_fname + '_predict_yaml_test.csv', predict_yaml_test, delimiter=",", fmt="%.3f") + np.savetxt(params['save_path'] + root_fname + '_predict_yaml_train_classes.csv', predict_yaml_train_classes, delimiter=",",fmt="%d") + np.savetxt(params['save_path'] + root_fname + '_predict_yaml_test_classes.csv', predict_yaml_test_classes, delimiter=",",fmt="%d") + + +def main(): + params = initialize_parameters() + run(params) + + +if __name__ == '__main__': + main() + if K.backend() == 'tensorflow': + 
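+        # clear_session() destroys the current TF graph so backend resources are
+        # released cleanly at interpreter exit.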
K.clear_session() diff --git a/Pilot1/Attn1/attn_baseline_keras2.py b/Pilot1/Attn1/attn_baseline_keras2.py index b6f917db..478391a3 100644 --- a/Pilot1/Attn1/attn_baseline_keras2.py +++ b/Pilot1/Attn1/attn_baseline_keras2.py @@ -9,11 +9,6 @@ import argparse import sklearn -import matplotlib -matplotlib.use('Agg') - -import matplotlib.pyplot as plt - import tensorflow as tf import keras as ke @@ -35,6 +30,8 @@ import attn import candle +import attn_viz_utils as attnviz + np.set_printoptions(precision=4) def r2(y_true, y_pred): @@ -152,16 +149,43 @@ def load_cache(cache_file): return x_train, y_train, x_val, y_val, x_test, y_test, x_labels, y_labels +def build_attention_model(params, PS): + + assert (len(params['dense']) == len(params['activation'])) + assert (len(params['dense']) > 3) + + DR = params['drop'] + inputs = Input(shape=(PS,)) + x = Dense(params['dense'][0], activation=params['activation'][0])(inputs) + x = BatchNormalization()(x) + a = Dense(params['dense'][1], activation=params['activation'][1])(x) + a = BatchNormalization()(a) + b = Dense(params['dense'][2], activation=params['activation'][2])(x) + x = ke.layers.multiply([a,b]) + + for i in range(3, len(params['dense'])-1): + x = Dense(params['dense'][i], activation=params['activation'][i])(x) + x = BatchNormalization()(x) + x = Dropout(DR)(x) + + outputs = Dense(params['dense'][-1], activation=params['activation'][-1])(x) + model = Model(inputs=inputs, outputs=outputs) + model.summary() + + return model + + def run(params): args = candle.ArgumentStruct(**params) seed = args.rng_seed candle.set_seed(seed) # Construct extension to save model - ext = attn.extension_from_parameters(params, '.keras') + ext = attn.extension_from_parameters(params, 'keras') candle.verify_path(params['save_path']) prefix = '{}{}'.format(params['save_path'], ext) logfile = params['logfile'] if params['logfile'] else prefix+'.log' + root_fname = 'Agg_attn_bin' candle.set_up_logger(logfile, attn.logger, params['verbose']) attn.logger.info('Params: {}'.format(params)) @@ -191,7 +215,7 @@ def run(params): print('Examples:\n Total: {}\n Positive: {} ({:.2f}% of total)\n'.format( total, pos, 100 * pos / total)) - nb_classes = params['nb_classes'] + nb_classes = params['dense'][-1] Y_train = np_utils.to_categorical(Y_train,nb_classes) Y_test = np_utils.to_categorical(Y_test,nb_classes) @@ -208,45 +232,7 @@ def run(params): print('Y_test shape:', Y_test.shape) PS=X_train.shape[1] - inputs = Input(shape=(PS,)) - - DR = params['drop'] - - #TODO: specify dense and activation via hyperparameters - x = Dense(1000, activation='relu')(inputs) - x = BatchNormalization()(x) - - a = Dense(1000, activation='relu')(x) - a = BatchNormalization()(a) - - b = Dense(1000, activation='softmax')(x) - x = ke.layers.multiply([a,b]) - - x = Dense(500, activation='relu')(x) - x = BatchNormalization()(x) - x = Dropout(DR)(x) - - x = Dense(250, activation='relu')(x) - x = BatchNormalization()(x) - x = Dropout(DR)(x) - - x = Dense(125, activation='relu')(x) - x = BatchNormalization()(x) - x = Dropout(DR)(x) - - x = Dense(60, activation='relu')(x) - x = BatchNormalization()(x) - x = Dropout(DR)(x) - - x = Dense(30, activation='relu')(x) - x = BatchNormalization()(x) - x = Dropout(DR)(x) - - outputs = Dense(2, activation='softmax')(x) - - model = Model(inputs=inputs, outputs=outputs) - - model.summary() + model = build_attention_model(params, PS) #parallel_model = multi_gpu_model(model, gpus=4) #parallel_model.compile(loss='mean_squared_error', @@ -268,11 +254,8 @@ def run(params): # 
set up a bunch of callbacks to do work during model training.. - if not os.path.exists(params['save_dir']): - os.makedirs(params['save_dir']) - - checkpointer = ModelCheckpoint(filepath=params['save_dir'] + 'Agg_attn_bin.autosave.model.h5', verbose=1, save_weights_only=False, save_best_only=True) - csv_logger = CSVLogger('{}/Agg_attn_bin.training.log'.format(params['save_dir'])) + checkpointer = ModelCheckpoint(filepath=params['save_path'] + root_fname + '.autosave.model.h5', verbose=1, save_weights_only=False, save_best_only=True) + csv_logger = CSVLogger('{}/{}.training.log'.format(params['save_path'], root_fname)) reduce_lr = ReduceLROnPlateau(monitor='val_tf_auc', factor=0.20, patience=40, verbose=1, mode='auto', min_delta=0.0001, cooldown=3, min_lr=0.000000001) early_stop = EarlyStopping(monitor='val_tf_auc', patience=200, verbose=1, mode='auto') candle_monitor = candle.CandleRemoteMonitor(params=params) @@ -304,21 +287,28 @@ def run(params): validation_data=(X_val, Y_val), callbacks = callbacks) + # diagnostic plots + if 'loss' in history.history.keys(): + candle.plot_history(params['save_path'] + root_fname, history, 'loss') + if 'acc' in history.history.keys(): + candle.plot_history(params['save_path'] + root_fname, history, 'acc') + if 'tf_auc' in history.history.keys(): + candle.plot_history(params['save_path'] + root_fname, history, 'tf_auc') + # Evaluate model score = model.evaluate(X_test, Y_test, verbose=0) - Y_predict = model.predict(X_test) + + evaluate_model(params, root_fname, nb_classes, Y_test, _Y_test, Y_predict, pos, total, score) - # see big fuction below, creates plots etc. - # TODO: Break post_process into multiple functions - post_process(params, X_train, X_test, Y_test, _Y_test, Y_predict, pos, total, score, history, model) + save_and_test_saved_model(params, model, root_fname, X_train, X_test, Y_test) attn.logger.handlers = [] return history -def post_process(params, X_train, X_test, Y_test, _Y_test, Y_predict, pos, total, score, history, model): +def evaluate_model(params, root_fname, nb_classes, Y_test, _Y_test, Y_predict, pos, total, score): threshold = 0.5 @@ -326,7 +316,7 @@ def post_process(params, X_train, X_test, Y_test, _Y_test, Y_predict, pos, total Y_test_int = (Y_test[:,0] < threshold).astype(np.int) print ('creating table of predictions') - f = open(params['save_dir'] + 'Agg_attn_bin.predictions.tsv', 'w') + f = open(params['save_path'] + root_fname + '.predictions.tsv', 'w') for index, row in _Y_test.iterrows(): if row['AUC'] == 1: if Y_pred_int[index] == 1: @@ -344,303 +334,134 @@ def post_process(params, X_train, X_test, Y_test, _Y_test, Y_predict, pos, total false_pos_rate, true_pos_rate, thresholds = roc_curve(Y_test[:,0], Y_predict[:,0]) - #print(thresholds) - roc_auc = auc(false_pos_rate, true_pos_rate) - + auc_keras = roc_auc fpr_keras = false_pos_rate tpr_keras = true_pos_rate - - print ('creating figure 1 at ', params['save_dir'] + 'Agg_attn_bin.auroc.pdf') - plt.figure(1) - plt.plot([0, 1], [0, 1], 'k--', label="No Skill") - plt.plot(fpr_keras, tpr_keras, label='Keras (area = {:.3f})'.format(auc_keras)) - plt.xlabel('False positive rate') - plt.ylabel('True positive rate') - plt.title('ROC curve') - plt.legend(loc='best') - - plt.savefig(params['save_dir'] + 'Agg_attn_bin.auroc.pdf', bbox_inches='tight') - plt.close() - - + + # ROC plots + fname = params['save_path'] + root_fname + '.auroc.pdf' + print ('creating figure at ', fname) + attnviz.plot_ROC(fpr_keras, tpr_keras, auc_keras, fname) # Zoom in view of the upper left corner. 
- print ('creating figure 2 at ', params['save_dir'] + 'Agg_attn_bin.auroc2.pdf') - plt.figure(2) - plt.xlim(0, 0.2) - plt.ylim(0.8, 1) - plt.plot([0, 1], [0, 1], 'k--', label="No Skill") - plt.plot(fpr_keras, tpr_keras, label='Keras (area = {:.3f})'.format(auc_keras)) - plt.xlabel('False positive rate') - plt.ylabel('True positive rate') - plt.title('ROC curve (zoomed in at top left)') - plt.legend(loc='best') - - plt.savefig(params['save_dir'] + 'Agg_attn_bin.auroc2.pdf', bbox_inches='tight') - plt.close() - - + fname = params['save_path'] + root_fname + '.auroc_zoom.pdf' + print ('creating figure at ', fname) + attnviz.plot_ROC(fpr_keras, tpr_keras, auc_keras, fname, zoom=True) + f1 = f1_score(Y_test_int, Y_pred_int) - + precision, recall, thresholds = precision_recall_curve(Y_test[:,0], Y_predict[:,0]) - #print(thresholds) - pr_auc = auc(recall, precision) - + pr_keras = pr_auc precision_keras = precision recall_keras = recall - + print('f1=%.3f auroc=%.3f aucpr=%.3f' % (f1, auc_keras, pr_keras)) - - print ('creating figure 3 at ', params['save_dir'] + 'Agg_attn_bin.aurpr.pdf') - plt.figure(1) + # Plot RF + fname = params['save_path'] + root_fname + '.aurpr.pdf' + print ('creating figure at ', fname) no_skill = len(Y_test_int[Y_test_int==1]) / len(Y_test_int) - plt.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill') - plt.plot(recall_keras, precision_keras, label='PR Keras (area = {:.3f})'.format(pr_keras)) - plt.xlabel('Recall') - plt.ylabel('Precision') - plt.title('PR curve') - plt.legend(loc='best') - - plt.savefig(params['save_dir'] + 'Agg_attn_bin.aurpr.pdf', bbox_inches='tight') - - plt.close() - - - def plot_confusion_matrix(cm, classes, - normalize=False, - title='Confusion matrix', - cmap=plt.cm.Blues): - """ - This function prints and plots the confusion matrix. - Normalization can be applied by setting `normalize=True`. - """ - if normalize: - cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] - print("Normalized confusion matrix") - else: - print('Confusion matrix, without normalization') - - print(cm) - - plt.imshow(cm, interpolation='nearest', cmap=cmap) - plt.title(title) - plt.colorbar() - tick_marks = np.arange(len(classes)) - plt.xticks(tick_marks, classes, rotation=45) - plt.yticks(tick_marks, classes) - - fmt = '.2f' if normalize else 'd' - thresh = cm.max() / 2. - for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])): - plt.text(j, i, format(cm[i, j], fmt), - horizontalalignment="center", - color="white" if cm[i, j] > thresh else "black") - - plt.ylabel('True label') - plt.xlabel('Predicted label') - plt.tight_layout() - - class_names=["Non-Response","Response"] - + attnviz.plot_RF(recall_keras, precision_keras, pr_keras, no_skill, fname) + # Compute confusion matrix cnf_matrix = sklearn.metrics.confusion_matrix(Y_test_int, Y_pred_int) - np.set_printoptions(precision=2) - # Plot non-normalized confusion matrix - #plt.figure() - print ('creating figure 4 at ', params['save_dir'] + 'Agg_attn_bin.confusion_without_norm.pdf') - plot_confusion_matrix(cnf_matrix, classes=class_names, - title='Confusion matrix, without normalization') - plt.savefig(params['save_dir'] + 'Agg_attn_bin.confusion_without_norm.pdf', bbox_inches='tight') - - plt.close() - - - - def plot_confusion_matrix(cm, classes, - normalize=False, - title='Confusion matrix', - cmap=plt.cm.Blues): - """ - This function prints and plots the confusion matrix. - Normalization can be applied by setting `normalize=True`. 
- """ - if normalize: - cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] - print("Normalized confusion matrix") - else: - print('Confusion matrix, without normalization') - - print(cm) - - plt.imshow(cm, interpolation='nearest', cmap=cmap) - plt.title(title) - plt.colorbar() - tick_marks = np.arange(len(classes)) - plt.xticks(tick_marks, classes, rotation=45) - plt.yticks(tick_marks, classes) - - fmt = '.2f' if normalize else 'd' - thresh = cm.max() / 2. - for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])): - plt.text(j, i, format(cm[i, j], fmt), - horizontalalignment="center", - color="white" if cm[i, j] > thresh else "black") - - plt.ylabel('True label') - plt.xlabel('Predicted label') - plt.tight_layout() - class_names=["Non-Response","Response"] - - # Compute confusion matrix - cnf_matrix = sklearn.metrics.confusion_matrix(Y_test_int, Y_pred_int) - np.set_printoptions(precision=2) - - # Plot non-normalized confusion matrix - #plt.figure() - plot_confusion_matrix(cnf_matrix, classes=class_names, - title='Confusion matrix, without normalization') - plt.savefig(params['save_dir'] + 'Agg_attn_bin.confusion_without_norm.pdf', bbox_inches='tight') - - plt.close() - + fname = params['save_path'] + root_fname + '.confusion_without_norm.pdf' + print ('creating figure at ', fname) + attnviz.plot_confusion_matrix(cnf_matrix, fname, classes=class_names, title='Confusion matrix, without normalization') # Plot normalized confusion matrix - #plt.figure() - plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True, - title='Normalized confusion matrix') - plt.savefig(params['save_dir'] + 'Agg_attn_bin.confusion_with_norm.pdf', bbox_inches='tight') - - plt.close() - - + fname = params['save_path'] + root_fname + '.confusion_with_norm.pdf' + print ('creating figure at ', fname) + attnviz.plot_confusion_matrix(cnf_matrix, fname, classes=class_names, normalize=True, title='Normalized confusion matrix') + print('Examples:\n Total: {}\n Positive: {} ({:.2f}% of total)\n'.format( - total, pos, 100 * pos / total)) - - + total, pos, 100 * pos / total)) + print(sklearn.metrics.roc_auc_score(Y_test_int, Y_pred_int)) - + print(sklearn.metrics.balanced_accuracy_score(Y_test_int, Y_pred_int)) - + print(sklearn.metrics.classification_report(Y_test_int, Y_pred_int)) - + print(sklearn.metrics.confusion_matrix(Y_test_int, Y_pred_int)) - + print("score") print(score) - - #exit() - - # summarize history for accuracy - plt.plot(history.history['acc']) - plt.plot(history.history['val_acc']) - plt.title('Model Accuracy') - plt.ylabel('accuracy') - plt.xlabel('epoch') - plt.legend(['train', 'test'], loc='upper left') - - plt.savefig(params['save_dir'] + 'Agg_attn_bin.accuracy.png', bbox_inches='tight') - plt.savefig(params['save_dir'] + 'Agg_attn_bin.accuracy.pdf', bbox_inches='tight') - - plt.close() - - # summarize history for loss - plt.plot(history.history['loss']) - plt.plot(history.history['val_loss']) - plt.title('Model Loss') - plt.ylabel('loss') - plt.xlabel('epoch') - plt.legend(['train', 'test'], loc='upper left') - - plt.savefig(params['save_dir'] + 'Agg_attn_bin.loss.png', bbox_inches='tight') - plt.savefig(params['save_dir'] + 'Agg_attn_bin.loss.pdf', bbox_inches='tight') - - + print('Test val_loss:', score[0]) print('Test accuracy:', score[1]) + +def save_and_test_saved_model(params, model, root_fname, X_train, X_test, Y_test): + # serialize model to JSON model_json = model.to_json() - with open(params['save_dir'] + "Agg_attn_bin.model.json", "w") as json_file: - 
json_file.write(model_json) + with open(params['save_path'] + root_fname + ".model.json", "w") as json_file: + json_file.write(model_json) # serialize model to YAML model_yaml = model.to_yaml() - with open(params['save_dir'] + "Agg_attn_bin.model.yaml", "w") as yaml_file: - yaml_file.write(model_yaml) - + with open(params['save_path'] + root_fname + ".model.yaml", "w") as yaml_file: + yaml_file.write(model_yaml) # serialize weights to HDF5 - model.save_weights(params['save_dir'] + "Agg_attn_bin.model.h5") + model.save_weights(params['save_path'] + root_fname + ".model.h5") print("Saved model to disk") # load json and create model - json_file = open(params['save_dir'] + 'Agg_attn_bin.model.json', 'r') + json_file = open(params['save_path'] + root_fname + '.model.json', 'r') loaded_model_json = json_file.read() json_file.close() loaded_model_json = model_from_json(loaded_model_json) - - + # load yaml and create model - yaml_file = open(params['save_dir'] + 'Agg_attn_bin.model.yaml', 'r') + yaml_file = open(params['save_path'] + root_fname + '.model.yaml', 'r') loaded_model_yaml = yaml_file.read() yaml_file.close() loaded_model_yaml = model_from_yaml(loaded_model_yaml) - - + # load weights into new model - loaded_model_json.load_weights(params['save_dir'] + "Agg_attn_bin.model.h5") + loaded_model_json.load_weights(params['save_path'] + root_fname + ".model.h5") print("Loaded json model from disk") - + # evaluate json loaded model on test data loaded_model_json.compile(loss='binary_crossentropy', optimizer=params['optimizer'], metrics=['accuracy']) score_json = loaded_model_json.evaluate(X_test, Y_test, verbose=0) - + print('json Validation loss:', score_json[0]) print('json Validation accuracy:', score_json[1]) - + print("json %s: %.2f%%" % (loaded_model_json.metrics_names[1], score_json[1]*100)) - - + # load weights into new model - loaded_model_yaml.load_weights(params['save_dir'] + "Agg_attn_bin.model.h5") + loaded_model_yaml.load_weights(params['save_path'] + root_fname + ".model.h5") print("Loaded yaml model from disk") - + # evaluate loaded model on test data loaded_model_yaml.compile(loss='binary_crossentropy', optimizer=params['optimizer'], metrics=['accuracy']) score_yaml = loaded_model_yaml.evaluate(X_test, Y_test, verbose=0) - print('yaml Validation loss:', score_yaml[0]) print('yaml Validation accuracy:', score_yaml[1]) - print("yaml %s: %.2f%%" % (loaded_model_yaml.metrics_names[1], score_yaml[1]*100)) - + # predict using loaded yaml model on test and training data - predict_yaml_train = loaded_model_yaml.predict(X_train) - predict_yaml_test = loaded_model_yaml.predict(X_test) - - print('Yaml_train_shape:', predict_yaml_train.shape) print('Yaml_test_shape:', predict_yaml_test.shape) - - + predict_yaml_train_classes = np.argmax(predict_yaml_train, axis=1) predict_yaml_test_classes = np.argmax(predict_yaml_test, axis=1) - - np.savetxt(params['save_dir'] + "Agg_attn_bin_predict_yaml_train.csv", predict_yaml_train, delimiter=",", fmt="%.3f") - np.savetxt(params['save_dir'] + "Agg_attn_bin_predict_yaml_test.csv", predict_yaml_test, delimiter=",", fmt="%.3f") - - np.savetxt(params['save_dir'] + "Agg_attn_bin_predict_yaml_train_classes.csv", predict_yaml_train_classes, delimiter=",",fmt="%d") - np.savetxt(params['save_dir'] + "Agg_attn_bin_predict_yaml_test_classes.csv", predict_yaml_test_classes, delimiter=",",fmt="%d") - - + np.savetxt(params['save_path'] + root_fname + "_predict_yaml_train.csv", predict_yaml_train, delimiter=",", fmt="%.3f") + np.savetxt(params['save_path'] + 
root_fname + "_predict_yaml_test.csv", predict_yaml_test, delimiter=",", fmt="%.3f") + + np.savetxt(params['save_path'] + root_fname + "_predict_yaml_train_classes.csv", predict_yaml_train_classes, delimiter=",",fmt="%d") + np.savetxt(params['save_path'] + root_fname + "_predict_yaml_test_classes.csv", predict_yaml_test_classes, delimiter=",",fmt="%d") def main(): diff --git a/Pilot1/Attn1/attn_default_model.txt b/Pilot1/Attn1/attn_default_model.txt index e6d76f98..b20acf5a 100644 --- a/Pilot1/Attn1/attn_default_model.txt +++ b/Pilot1/Attn1/attn_default_model.txt @@ -2,10 +2,10 @@ data_url = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/' in='top_21_1fold_001.h5' model_name='attn' -dense=[2000, 600] +dense=[1000, 1000, 1000, 500, 250, 125, 60, 30, 2] batch_size=32 epochs=1 -activation='relu' +activation=['relu', 'relu', 'softmax', 'relu', 'relu', 'relu', 'relu', 'relu', 'softmax'] loss='categorical_crossentropy' optimizer='sgd' drop=0.2 @@ -18,13 +18,12 @@ rng_seed=2017 initialization='glorot_uniform' latent_dim=2 batch_normalization=False -save_path='candle_save' use_cp=False early_stop=True reduce_lr=True feature_subsample=0 -nb_classes=2 -save_dir='./save/001/' +output_dir='./save/001/' +save_path='./save/001/' [Monitor_Params] solr_root='' diff --git a/Pilot1/Attn1/attn_viz_utils.py b/Pilot1/Attn1/attn_viz_utils.py new file mode 100644 index 00000000..650985e7 --- /dev/null +++ b/Pilot1/Attn1/attn_viz_utils.py @@ -0,0 +1,83 @@ +from __future__ import print_function + +import itertools + +import numpy as np + +import matplotlib +matplotlib.use('Agg') +import matplotlib.pyplot as plt + + +np.set_printoptions(precision=2) + + +def plot_ROC(fpr_keras, tpr_keras, auc_keras, fname, xlabel_add='', ylabel_add='', zoom=False): + + plt.figure() + if zoom: + plt.xlim(0, 0.2) + plt.ylim(0.8, 1) + + plt.plot([0, 1], [0, 1], 'k--', label='No Skill') + plt.plot(fpr_keras, tpr_keras, label='Keras (area = {:.3f})'.format(auc_keras)) + plt.xlabel('False positive rate' + xlabel_add) + plt.ylabel('True positive rate' + ylabel_add) + plt.title('ROC curve') + plt.legend(loc='best') + plt.savefig(fname, bbox_inches='tight') + plt.close() + + +def plot_RF(recall_keras, precision_keras, pr_keras, no_skill, fname, xlabel_add='', ylabel_add=''): + + plt.figure() + plt.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill') + plt.plot(recall_keras, precision_keras, label='PR Keras (area = {:.3f})'.format(pr_keras)) + plt.xlabel('Recall' + xlabel_add) + plt.ylabel('Precision' + ylabel_add) + plt.title('PR curve') + plt.legend(loc='best') + plt.savefig(fname, bbox_inches='tight') + plt.close() + + +def plot_confusion_matrix(cm, fname, classes, normalize=False, title='Confusion matrix'): + """ + This function prints and plots the confusion matrix. Normalization can be applied by setting `normalize=True`. + """ + if normalize: + cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] + print('Normalized confusion matrix') + else: + print('Confusion matrix, without normalization') + print(cm) + + cmap=plt.cm.Blues + plt.imshow(cm, interpolation='nearest', cmap=cmap) + plt.title(title) + plt.colorbar() + tick_marks = np.arange(len(classes)) + plt.xticks(tick_marks, classes, rotation=45) + plt.yticks(tick_marks, classes) + + fmt = '.2f' if normalize else 'd' + thresh = cm.max() / 2. 
+ for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])): + plt.text(j, i, format(cm[i, j], fmt), horizontalalignment="center", color="white" if cm[i, j] > thresh else "black") + + plt.ylabel('True label') + plt.xlabel('Predicted label') + plt.tight_layout() + plt.savefig(fname, bbox_inches='tight') + plt.close() + +def plot_array(nparray, xlabel, ylabel, title, fname): + + plt.figure() + plt.plot(nparray, lw=3.) + plt.xlabel(xlabel) + plt.ylabel(ylabel) + plt.title(title) + plt.savefig(fname, bbox_inches='tight') + plt.close() diff --git a/common/uq_keras_utils.py b/common/uq_keras_utils.py index 1d777343..58e537e2 100644 --- a/common/uq_keras_utils.py +++ b/common/uq_keras_utils.py @@ -216,7 +216,7 @@ def on_epoch_end(self, epoch, logs=None): new_mu_val *= self.scale_factor K.set_value(mu, new_mu_val) - self.muvalues.append( new_mu_val ) + self.muvalues.append( new_mu_val ) #print('epoch: %d, mu: %f' % (epoch, new_mu_val)) diff --git a/common/viz_utils.py b/common/viz_utils.py index cb24a2b6..c4eed5e2 100644 --- a/common/viz_utils.py +++ b/common/viz_utils.py @@ -17,6 +17,7 @@ def plot_history(out, history, metric='loss', title=None, width=8, height=6): plt.legend(['train_{}'.format(metric), 'val_{}'.format(metric)], loc='upper center') png = '{}.plot.{}.png'.format(out, metric) plt.savefig(png, bbox_inches='tight') + plt.close() def plot_scatter(data, classes, out, width=10, height=8): cmap = plt.cm.get_cmap('gist_rainbow') @@ -25,6 +26,7 @@ def plot_scatter(data, classes, out, width=10, height=8): plt.colorbar() png = '{}.png'.format(out) plt.savefig(png, bbox_inches='tight') + plt.close() def plot_error(y_true, y_pred, batch, file_ext, file_pre='output_dir', subsample=1000): if batch % 10: From 21d915399192f890fb4c44facbd35887a88fb6b5 Mon Sep 17 00:00:00 2001 From: Cristina Garcia Cardona Date: Fri, 3 Apr 2020 15:18:17 -0600 Subject: [PATCH 180/331] Included functionality to modify last layer of keras model fo uq --- Pilot1/Attn1/attn_abstention_keras2.py | 34 ++++-------------- common/candle/__init__.py | 1 + common/candle_keras/__init__.py | 1 + common/uq_keras_utils.py | 50 ++++++++++++++++++++++++++ 4 files changed, 58 insertions(+), 28 deletions(-) diff --git a/Pilot1/Attn1/attn_abstention_keras2.py b/Pilot1/Attn1/attn_abstention_keras2.py index e1b096ae..abff61c8 100644 --- a/Pilot1/Attn1/attn_abstention_keras2.py +++ b/Pilot1/Attn1/attn_abstention_keras2.py @@ -32,6 +32,8 @@ import attn_viz_utils as attnviz +from attn_baseline_keras2 import build_attention_model + np.set_printoptions(precision=4) additional_definitions = [ @@ -177,33 +179,6 @@ def extension_from_parameters(params, framework=''): return ext -def build_attention_model_with_abstention(params, PS): - - assert (len(params['dense']) == len(params['activation'])) - assert (len(params['dense']) > 3) - - DR = params['drop'] - inputs = Input(shape=(PS,)) - x = Dense(params['dense'][0], activation=params['activation'][0])(inputs) - x = BatchNormalization()(x) - a = Dense(params['dense'][1], activation=params['activation'][1])(x) - a = BatchNormalization()(a) - b = Dense(params['dense'][2], activation=params['activation'][2])(x) - x = ke.layers.multiply([a,b]) - - for i in range(3, len(params['dense'])-1): - x = Dense(params['dense'][i], activation=params['activation'][i])(x) - x = BatchNormalization()(x) - x = Dropout(DR)(x) - - # Abstention part - outputs = Dense(params['dense'][-1]+1, activation='sigmoid')(x) - model = Model(inputs=inputs, outputs=outputs) - model.summary() - - return model - - def 
run(params): args = candle.ArgumentStruct(**params) seed = args.rng_seed @@ -261,7 +236,10 @@ def run(params): print('Y_test shape:', Y_test.shape) PS = X_train.shape[1] - model = build_attention_model_with_abstention(params, PS) + model = build_attention_model(params, PS) + model = candle.add_model_output(model, mode='abstain', num_add=1, activation='sigmoid') + print('Model after modifying layer for abstention') + model.summary() # Configure abstention model mask_ = np.zeros(nb_classes+1) diff --git a/common/candle/__init__.py b/common/candle/__init__.py index 7f5084f2..6ca22a6e 100644 --- a/common/candle/__init__.py +++ b/common/candle/__init__.py @@ -100,6 +100,7 @@ from uq_keras_utils import acc_class1 from uq_keras_utils import abs_acc_class1 from uq_keras_utils import modify_labels + from uq_keras_utils import add_model_output from uq_keras_utils import AbstentionAdapt_Callback elif 'torch' in sys.modules: diff --git a/common/candle_keras/__init__.py b/common/candle_keras/__init__.py index faf7d949..9c685f20 100644 --- a/common/candle_keras/__init__.py +++ b/common/candle_keras/__init__.py @@ -74,4 +74,5 @@ from uq_keras_utils import acc_class1 from uq_keras_utils import abs_acc_class1 from uq_keras_utils import modify_labels +from uq_keras_utils import add_model_output from uq_keras_utils import AbstentionAdapt_Callback diff --git a/common/uq_keras_utils.py b/common/uq_keras_utils.py index 58e537e2..af313711 100644 --- a/common/uq_keras_utils.py +++ b/common/uq_keras_utils.py @@ -5,6 +5,9 @@ from keras.callbacks import Callback +from keras.models import Model +from keras.layers import Dense + from keras.utils import np_utils import numpy as np @@ -267,3 +270,50 @@ def modify_labels(numclasses_out, ytrain, ytest, yval): return labels_train, labels_test, labels_val ################################################################### + +def add_model_output(modelIn, mode=None, num_add=None, activation=None): + + if mode is None: + return modelIn + + numlayers = len(modelIn.layers) + # Find last dense layer + i = -1 + while 'dense' not in (modelIn.layers[i].name) and ((i+numlayers) > 0): + i -= 1 + # Minimal verification about the validity of the layer found + assert ((i + numlayers) >= 0) + assert ('dense' in modelIn.layers[i].name) + + # Compute new output size + if mode is 'abstain': + assert num_add is not None + new_output_size = modelIn.layers[i].output_shape[-1] + num_add + elif mode is 'qtl': # for quantile UQ + new_output_size = 3 * modelIn.layers[i].output_shape[-1] + else: # for heteroscedastic UQ + new_output_size = 2 * modelIn.layers[i].output_shape[-1] + + # Recover current layer options + config = modelIn.layers[i].get_config() + # Update number of units + config['units'] = new_output_size + # Update activation function if requested + if activation is not None: + config['activation'] = activation + # Create new Dense layer + reconstructed_layer = Dense.from_config(config) + # Connect new Dense last layer to previous one-before-last layer + additional = reconstructed_layer(modelIn.layers[i-1].output) + # If the layer to replace is not the last layer, add the remainder layers + if i < -1: + for j in range(i+1, 0): + config_j = modelIn.layers[j].get_config() + aux_j = layers.deserialize({'class_name': modelIn.layers[j].__class__.__name__, + 'config': config_j}) + reconstructed_layer = aux_j.from_config(config_j) + additional = reconstructed_layer(additional) + + modelOut = Model(modelIn.input, additional) + + return modelOut From ae49a8b2a7db20e4b79d196c49e7e3a7f92301dd Mon 
Sep 17 00:00:00 2001 From: Cristina Garcia Cardona Date: Fri, 3 Apr 2020 16:37:14 -0600 Subject: [PATCH 181/331] added documentation for uq_keras_utils --- common/uq_keras_utils.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/common/uq_keras_utils.py b/common/uq_keras_utils.py index af313711..6047947e 100644 --- a/common/uq_keras_utils.py +++ b/common/uq_keras_utils.py @@ -272,6 +272,27 @@ def modify_labels(numclasses_out, ytrain, ytest, yval): ################################################################### def add_model_output(modelIn, mode=None, num_add=None, activation=None): + """ This function modifies the last dense layer in the passed keras model. The modification includes adding units and optionally changing the activation function. + + Parameters + ---------- + modelIn : keras model + Keras model to be modified. + mode : string + Mode to modify the layer. It could be: + 'abstain' for adding an arbitrary number of units for the abstention optimization strategy. + 'qtl' for quantile regression which needs the outputs to be tripled. + 'het' for heteroscedastic regression which needs the outputs to be doubled. (current implicit default: 'het') + num_add : integer + Number of units to add. This only applies to the 'abstain' mode. + activation : string + String with keras specification of activation function (e.g. 'relu', 'sigmoid', 'softmax', etc.) + + Returns + ---------- + modelOut : keras model + Keras model after last dense layer has been modified as specified. If no mode is specified, the model is returned unchanged. + """ if mode is None: return modelIn From 06b40738523349fb5a6c6ab4f773ac6262820818 Mon Sep 17 00:00:00 2001 From: yngtodd Date: Tue, 7 Apr 2020 23:32:45 -0400 Subject: [PATCH 182/331] Move darts to common This will allow darts to be used across benchmarks. Right now, we support both P3B5 and the uno benchmark.
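The sketch below is a minimal, illustrative use of the add_model_output helper documented in PATCH 181 above; it is not part of any patch in this series. The import path and the mode='abstain' call mirror the run() change in attn_abstention_keras2.py, while the toy feature size and layer widths are invented for the example, and the standalone keras API used throughout these patches is assumed.

# Minimal usage sketch for add_model_output (assumes standalone keras, as in these patches).
from keras.layers import Dense, Input
from keras.models import Model

from uq_keras_utils import add_model_output  # also re-exported as candle.add_model_output

# Toy two-class classifier; the 20-feature input and the layer width are arbitrary.
inputs = Input(shape=(20,))
hidden = Dense(32, activation='relu')(inputs)
outputs = Dense(2, activation='softmax')(hidden)
base_model = Model(inputs=inputs, outputs=outputs)

# mode='abstain' widens the last Dense layer by num_add units (2 -> 3 outputs here)
# and, since activation is given, switches that layer to a sigmoid, matching the
# candle.add_model_output call in attn_abstention_keras2.py above.
abst_model = add_model_output(base_model, mode='abstain', num_add=1, activation='sigmoid')
abst_model.summary()

# mode='qtl' would instead triple the last layer (quantile regression) and
# mode='het' (the implicit default) would double it (heteroscedastic regression).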
--- Pilot3/P3B5/darts/data/p3b3.py | 102 ------------------ Pilot3/P3B5/darts/utils/__init__.py | 0 {Pilot3/P3B5 => common}/darts/__init__.py | 0 {Pilot3/P3B5 => common}/darts/api/__init__.py | 0 {Pilot3/P3B5 => common}/darts/api/config.py | 0 {Pilot3/P3B5 => common}/darts/api/info.py | 0 {Pilot3/P3B5 => common}/darts/api/model.py | 0 {Pilot3/P3B5 => common}/darts/architecture.py | 0 {Pilot3/P3B5 => common}/darts/functional.py | 0 {Pilot3/P3B5 => common}/darts/genotypes.py | 0 .../data => common/darts/meters}/__init__.py | 0 .../P3B5 => common}/darts/meters/accuracy.py | 0 .../P3B5 => common}/darts/meters/average.py | 0 .../P3B5 => common}/darts/metrics/__init__.py | 0 .../darts/metrics/multitask_accuracy.py | 0 .../darts/metrics/multitask_loss.py | 0 .../darts/metrics/topk_accuracy.py | 0 .../darts/modules}/__init__.py | 0 {Pilot3/P3B5 => common}/darts/modules/cell.py | 0 .../darts/modules/classifier.py | 0 .../darts/modules/linear}/__init__.py | 0 .../darts/modules/linear/linear_cell.py | 0 .../modules/linear/linear_mixed_layer.py | 0 .../darts/modules/linear_network.py | 0 .../darts/modules/mixed_layer.py | 0 .../P3B5 => common}/darts/modules/network.py | 0 .../darts/modules/operations}/__init__.py | 0 .../darts/modules/operations/linear.py | 0 .../darts/modules/operations/original.py | 0 .../darts/storage}/__init__.py | 0 .../P3B5 => common}/darts/storage/genotype.py | 0 .../darts/utils}/__init__.py | 0 .../P3B5 => common}/darts/utils/logging.py | 0 {Pilot3/P3B5 => common}/darts/utils/random.py | 0 {Pilot3/P3B5 => common}/darts/utils/tensor.py | 0 {Pilot3/P3B5 => common}/darts/visualize.py | 0 36 files changed, 102 deletions(-) delete mode 100644 Pilot3/P3B5/darts/data/p3b3.py delete mode 100644 Pilot3/P3B5/darts/utils/__init__.py rename {Pilot3/P3B5 => common}/darts/__init__.py (100%) rename {Pilot3/P3B5 => common}/darts/api/__init__.py (100%) rename {Pilot3/P3B5 => common}/darts/api/config.py (100%) rename {Pilot3/P3B5 => common}/darts/api/info.py (100%) rename {Pilot3/P3B5 => common}/darts/api/model.py (100%) rename {Pilot3/P3B5 => common}/darts/architecture.py (100%) rename {Pilot3/P3B5 => common}/darts/functional.py (100%) rename {Pilot3/P3B5 => common}/darts/genotypes.py (100%) rename {Pilot3/P3B5/darts/data => common/darts/meters}/__init__.py (100%) rename {Pilot3/P3B5 => common}/darts/meters/accuracy.py (100%) rename {Pilot3/P3B5 => common}/darts/meters/average.py (100%) rename {Pilot3/P3B5 => common}/darts/metrics/__init__.py (100%) rename {Pilot3/P3B5 => common}/darts/metrics/multitask_accuracy.py (100%) rename {Pilot3/P3B5 => common}/darts/metrics/multitask_loss.py (100%) rename {Pilot3/P3B5 => common}/darts/metrics/topk_accuracy.py (100%) rename {Pilot3/P3B5/darts/meters => common/darts/modules}/__init__.py (100%) rename {Pilot3/P3B5 => common}/darts/modules/cell.py (100%) rename {Pilot3/P3B5 => common}/darts/modules/classifier.py (100%) rename {Pilot3/P3B5/darts/modules => common/darts/modules/linear}/__init__.py (100%) rename {Pilot3/P3B5 => common}/darts/modules/linear/linear_cell.py (100%) rename {Pilot3/P3B5 => common}/darts/modules/linear/linear_mixed_layer.py (100%) rename {Pilot3/P3B5 => common}/darts/modules/linear_network.py (100%) rename {Pilot3/P3B5 => common}/darts/modules/mixed_layer.py (100%) rename {Pilot3/P3B5 => common}/darts/modules/network.py (100%) rename {Pilot3/P3B5/darts/modules/linear => common/darts/modules/operations}/__init__.py (100%) rename {Pilot3/P3B5 => common}/darts/modules/operations/linear.py (100%) rename {Pilot3/P3B5 => 
common}/darts/modules/operations/original.py (100%) rename {Pilot3/P3B5/darts/modules/operations => common/darts/storage}/__init__.py (100%) rename {Pilot3/P3B5 => common}/darts/storage/genotype.py (100%) rename {Pilot3/P3B5/darts/storage => common/darts/utils}/__init__.py (100%) rename {Pilot3/P3B5 => common}/darts/utils/logging.py (100%) rename {Pilot3/P3B5 => common}/darts/utils/random.py (100%) rename {Pilot3/P3B5 => common}/darts/utils/tensor.py (100%) rename {Pilot3/P3B5 => common}/darts/visualize.py (100%) diff --git a/Pilot3/P3B5/darts/data/p3b3.py b/Pilot3/P3B5/darts/data/p3b3.py deleted file mode 100644 index 1285a158..00000000 --- a/Pilot3/P3B5/darts/data/p3b3.py +++ /dev/null @@ -1,102 +0,0 @@ -import os -import numpy as np -from torch.utils.data import Dataset - - -class P3B3(Dataset): - """P3B3 Synthetic Dataset. - - Args: - root: str - Root directory of dataset where CANDLE loads P3B3 data. - - partition: str - dataset partition to be loaded. - Must be either 'train' or 'test'. - """ - training_data_file = 'train_X.npy' - training_label_file = 'train_Y.npy' - test_data_file = 'test_X.npy' - test_label_file = 'test_Y.npy' - - def __init__(self, root, partition, subsite=True, - laterality=True, behavior=True, grade=True, - transform=None, target_transform=None): - self.root = root - self.partition = partition - self.transform = transform - self.target_transform = target_transform - self.subsite = subsite - self.laterality = laterality - self.behavior = behavior - self.grade = grade - - if self.partition == 'train': - data_file = self.training_data_file - label_file = self.training_label_file - elif self.partition == 'test': - data_file = self.test_data_file - label_file = self.test_label_file - else: - raise ValueError("Partition must either be 'train' or 'test'.") - - self.data = np.load(os.path.join(self.root, data_file)) - self.targets = self.get_targets(label_file) - - def __repr__(self): - fmt_str = 'Dataset ' + self.__class__.__name__ + '\n' - fmt_str += ' Number of datapoints: {}\n'.format(self.__len__()) - tmp = self.partition - fmt_str += ' Split: {}\n'.format(tmp) - fmt_str += ' Root Location: {}\n'.format(self.root) - return fmt_str - - def __len__(self): - return len(self.data) - - def load_data(self): - return self.data, self.targets - - def get_targets(self, label_file): - """Get dictionary of targets specified by user.""" - targets = np.load(os.path.join(self.root, label_file)) - - tasks = {} - if self.subsite: - tasks['subsite'] = targets[:, 0] - if self.laterality: - tasks['laterality'] = targets[:, 1] - if self.behavior: - tasks['behavior'] = targets[:, 2] - if self.grade: - tasks['grade'] = targets[:, 3] - - return tasks - - def __getitem__(self, idx): - """ - Parameters - ---------- - index : int - Index of the data to be loaded. - - Returns - ------- - (document, target) : tuple - where target is index of the target class. 
- """ - document = self.data[idx] - - if self.transform is not None: - document = self.transform(document) - - targets = {} - for key, value in self.targets.items(): - subset = value[idx] - - if self.target_transform is not None: - subset = self.target_transform(subset) - - targets[key] = subset - - return document, targets \ No newline at end of file diff --git a/Pilot3/P3B5/darts/utils/__init__.py b/Pilot3/P3B5/darts/utils/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/Pilot3/P3B5/darts/__init__.py b/common/darts/__init__.py similarity index 100% rename from Pilot3/P3B5/darts/__init__.py rename to common/darts/__init__.py diff --git a/Pilot3/P3B5/darts/api/__init__.py b/common/darts/api/__init__.py similarity index 100% rename from Pilot3/P3B5/darts/api/__init__.py rename to common/darts/api/__init__.py diff --git a/Pilot3/P3B5/darts/api/config.py b/common/darts/api/config.py similarity index 100% rename from Pilot3/P3B5/darts/api/config.py rename to common/darts/api/config.py diff --git a/Pilot3/P3B5/darts/api/info.py b/common/darts/api/info.py similarity index 100% rename from Pilot3/P3B5/darts/api/info.py rename to common/darts/api/info.py diff --git a/Pilot3/P3B5/darts/api/model.py b/common/darts/api/model.py similarity index 100% rename from Pilot3/P3B5/darts/api/model.py rename to common/darts/api/model.py diff --git a/Pilot3/P3B5/darts/architecture.py b/common/darts/architecture.py similarity index 100% rename from Pilot3/P3B5/darts/architecture.py rename to common/darts/architecture.py diff --git a/Pilot3/P3B5/darts/functional.py b/common/darts/functional.py similarity index 100% rename from Pilot3/P3B5/darts/functional.py rename to common/darts/functional.py diff --git a/Pilot3/P3B5/darts/genotypes.py b/common/darts/genotypes.py similarity index 100% rename from Pilot3/P3B5/darts/genotypes.py rename to common/darts/genotypes.py diff --git a/Pilot3/P3B5/darts/data/__init__.py b/common/darts/meters/__init__.py similarity index 100% rename from Pilot3/P3B5/darts/data/__init__.py rename to common/darts/meters/__init__.py diff --git a/Pilot3/P3B5/darts/meters/accuracy.py b/common/darts/meters/accuracy.py similarity index 100% rename from Pilot3/P3B5/darts/meters/accuracy.py rename to common/darts/meters/accuracy.py diff --git a/Pilot3/P3B5/darts/meters/average.py b/common/darts/meters/average.py similarity index 100% rename from Pilot3/P3B5/darts/meters/average.py rename to common/darts/meters/average.py diff --git a/Pilot3/P3B5/darts/metrics/__init__.py b/common/darts/metrics/__init__.py similarity index 100% rename from Pilot3/P3B5/darts/metrics/__init__.py rename to common/darts/metrics/__init__.py diff --git a/Pilot3/P3B5/darts/metrics/multitask_accuracy.py b/common/darts/metrics/multitask_accuracy.py similarity index 100% rename from Pilot3/P3B5/darts/metrics/multitask_accuracy.py rename to common/darts/metrics/multitask_accuracy.py diff --git a/Pilot3/P3B5/darts/metrics/multitask_loss.py b/common/darts/metrics/multitask_loss.py similarity index 100% rename from Pilot3/P3B5/darts/metrics/multitask_loss.py rename to common/darts/metrics/multitask_loss.py diff --git a/Pilot3/P3B5/darts/metrics/topk_accuracy.py b/common/darts/metrics/topk_accuracy.py similarity index 100% rename from Pilot3/P3B5/darts/metrics/topk_accuracy.py rename to common/darts/metrics/topk_accuracy.py diff --git a/Pilot3/P3B5/darts/meters/__init__.py b/common/darts/modules/__init__.py similarity index 100% rename from Pilot3/P3B5/darts/meters/__init__.py rename to 
common/darts/modules/__init__.py diff --git a/Pilot3/P3B5/darts/modules/cell.py b/common/darts/modules/cell.py similarity index 100% rename from Pilot3/P3B5/darts/modules/cell.py rename to common/darts/modules/cell.py diff --git a/Pilot3/P3B5/darts/modules/classifier.py b/common/darts/modules/classifier.py similarity index 100% rename from Pilot3/P3B5/darts/modules/classifier.py rename to common/darts/modules/classifier.py diff --git a/Pilot3/P3B5/darts/modules/__init__.py b/common/darts/modules/linear/__init__.py similarity index 100% rename from Pilot3/P3B5/darts/modules/__init__.py rename to common/darts/modules/linear/__init__.py diff --git a/Pilot3/P3B5/darts/modules/linear/linear_cell.py b/common/darts/modules/linear/linear_cell.py similarity index 100% rename from Pilot3/P3B5/darts/modules/linear/linear_cell.py rename to common/darts/modules/linear/linear_cell.py diff --git a/Pilot3/P3B5/darts/modules/linear/linear_mixed_layer.py b/common/darts/modules/linear/linear_mixed_layer.py similarity index 100% rename from Pilot3/P3B5/darts/modules/linear/linear_mixed_layer.py rename to common/darts/modules/linear/linear_mixed_layer.py diff --git a/Pilot3/P3B5/darts/modules/linear_network.py b/common/darts/modules/linear_network.py similarity index 100% rename from Pilot3/P3B5/darts/modules/linear_network.py rename to common/darts/modules/linear_network.py diff --git a/Pilot3/P3B5/darts/modules/mixed_layer.py b/common/darts/modules/mixed_layer.py similarity index 100% rename from Pilot3/P3B5/darts/modules/mixed_layer.py rename to common/darts/modules/mixed_layer.py diff --git a/Pilot3/P3B5/darts/modules/network.py b/common/darts/modules/network.py similarity index 100% rename from Pilot3/P3B5/darts/modules/network.py rename to common/darts/modules/network.py diff --git a/Pilot3/P3B5/darts/modules/linear/__init__.py b/common/darts/modules/operations/__init__.py similarity index 100% rename from Pilot3/P3B5/darts/modules/linear/__init__.py rename to common/darts/modules/operations/__init__.py diff --git a/Pilot3/P3B5/darts/modules/operations/linear.py b/common/darts/modules/operations/linear.py similarity index 100% rename from Pilot3/P3B5/darts/modules/operations/linear.py rename to common/darts/modules/operations/linear.py diff --git a/Pilot3/P3B5/darts/modules/operations/original.py b/common/darts/modules/operations/original.py similarity index 100% rename from Pilot3/P3B5/darts/modules/operations/original.py rename to common/darts/modules/operations/original.py diff --git a/Pilot3/P3B5/darts/modules/operations/__init__.py b/common/darts/storage/__init__.py similarity index 100% rename from Pilot3/P3B5/darts/modules/operations/__init__.py rename to common/darts/storage/__init__.py diff --git a/Pilot3/P3B5/darts/storage/genotype.py b/common/darts/storage/genotype.py similarity index 100% rename from Pilot3/P3B5/darts/storage/genotype.py rename to common/darts/storage/genotype.py diff --git a/Pilot3/P3B5/darts/storage/__init__.py b/common/darts/utils/__init__.py similarity index 100% rename from Pilot3/P3B5/darts/storage/__init__.py rename to common/darts/utils/__init__.py diff --git a/Pilot3/P3B5/darts/utils/logging.py b/common/darts/utils/logging.py similarity index 100% rename from Pilot3/P3B5/darts/utils/logging.py rename to common/darts/utils/logging.py diff --git a/Pilot3/P3B5/darts/utils/random.py b/common/darts/utils/random.py similarity index 100% rename from Pilot3/P3B5/darts/utils/random.py rename to common/darts/utils/random.py diff --git a/Pilot3/P3B5/darts/utils/tensor.py 
b/common/darts/utils/tensor.py similarity index 100% rename from Pilot3/P3B5/darts/utils/tensor.py rename to common/darts/utils/tensor.py diff --git a/Pilot3/P3B5/darts/visualize.py b/common/darts/visualize.py similarity index 100% rename from Pilot3/P3B5/darts/visualize.py rename to common/darts/visualize.py From a42677cc63fef886fe086036c7b731343eb5ec27 Mon Sep 17 00:00:00 2001 From: yngtodd Date: Tue, 7 Apr 2020 23:39:16 -0400 Subject: [PATCH 183/331] Fix hard coded parameter This was a carryover from the uno development stages. --- common/darts/modules/linear_network.py | 72 ++++++-------------------- 1 file changed, 17 insertions(+), 55 deletions(-) diff --git a/common/darts/modules/linear_network.py b/common/darts/modules/linear_network.py index 3fb73c92..80e43a2c 100644 --- a/common/darts/modules/linear_network.py +++ b/common/darts/modules/linear_network.py @@ -9,14 +9,12 @@ class Hyperparameters: - c = 100 # 8 - num_nodes = 2 - num_cells = 3 - channel_multiplier = 2 - stem_channel_multiplier = 2 -# input_dim = 5270 + c = 100 + num_nodes = 2 + num_cells = 3 + channel_multiplier = 1 + stem_channel_multiplier = 1 input_dim = 5270 -# gene_dim = 942 intermediate_dim = 100 @@ -35,7 +33,7 @@ def __init__(self, tasks, criterion, device='cpu', hyperparams=Hyperparameters() # stem_multiplier is for stem network, # and multiplier is for general cell - c_curr = hyperparams.stem_channel_multiplier * self.c # 3*16 + c_curr = hyperparams.stem_channel_multiplier * self.c # stem network, convert 3 channel to c_curr self.stem = nn.Sequential( nn.Linear( @@ -49,7 +47,6 @@ def __init__(self, tasks, criterion, device='cpu', hyperparams=Hyperparameters() self.cells = nn.ModuleList() reduction_prev = False for i in range(hyperparams.num_cells): - # for layer in the middle [1/3, 2/3], reduce via stride=2 if i in [hyperparams.num_cells // 3, 2 * hyperparams.num_cells // 3]: c_curr *= 2 @@ -60,12 +57,12 @@ def __init__(self, tasks, criterion, device='cpu', hyperparams=Hyperparameters() # [cp, h, h] => [multiplier*c_curr, h/h//2, h/h//2] # the output channels = multiplier * c_curr cell = Cell( - hyperparams.num_nodes, - hyperparams.channel_multiplier, - cpp, - cp, - c_curr, - reduction, + hyperparams.num_nodes, + hyperparams.channel_multiplier, + cpp, + cp, + c_curr, + reduction, reduction_prev ).to(self.device) # update reduction_prev @@ -73,21 +70,12 @@ def __init__(self, tasks, criterion, device='cpu', hyperparams=Hyperparameters() self.cells += [cell] cpp, cp = cp, hyperparams.channel_multiplier * c_curr - # adaptive pooling output size to 1x1 - # since cp records last cell's output channels - # it indicates the input channel number - # self.classifier = self.fc_layers(cp, tasks) - #self.classifier = MultitaskClassifier(cp, tasks) - self.classifier = MultitaskClassifier(500, tasks) # 500 + self.classifier = MultitaskClassifier(hyperparams.intermediate_dim, tasks) - # k is the total number of edges inside single cell, 14 + # k is the total number of edges inside single cell k = sum(1 for i in range(self.num_nodes) for j in range(2 + i)) num_ops = len(LINEAR_PRIMITIVES) # 8 - # TODO - # this kind of implementation will add alpha into self.parameters() - # it has num k of alpha parameters, and each alpha shape: [num_ops] - # it requires grad and can be converted to cpu/gpu automatically self.alpha_normal = nn.Parameter(torch.randn(k, num_ops)) self.alpha_reduce = nn.Parameter(torch.randn(k, num_ops)) @@ -106,7 +94,7 @@ def fc_layers(self, cp, tasks): fc_layers = {} for task, dim in tasks.items(): 
fc_layers[task] = nn.Linear(cp, dim).to(self.device) - return fc_layers + return fc_layers def new(self): """ Create a new model initialized with current alpha parameters. However, its weights are left untouched. :return: New model initialized with current alpha. """ model = Network( - self.tasks, + self.tasks, self.criterion ).to(self.device) @@ -129,27 +117,8 @@ def new(self): return model def forward(self, x): - """ - in: torch.Size([3, 3, 32, 32]) - stem: torch.Size([3, 48, 32, 32]) - cell: 0 torch.Size([3, 64, 32, 32]) False - cell: 1 torch.Size([3, 64, 32, 32]) False - cell: 2 torch.Size([3, 128, 16, 16]) True - cell: 3 torch.Size([3, 128, 16, 16]) False - cell: 4 torch.Size([3, 128, 16, 16]) False - cell: 5 torch.Size([3, 256, 8, 8]) True - cell: 6 torch.Size([3, 256, 8, 8]) False - cell: 7 torch.Size([3, 256, 8, 8]) False - pool: torch.Size([16, 256, 1, 1]) - linear: [b, 10] - :param x: - :return: - """ - #print('network in:', x.shape) # s0 & s1 means the last cells' output s0 = s1 = self.stem(x) # [b, 3, 32, 32] => [b, 48, 32, 32] - #print('network stem:', s0.shape) - #print('network stem1:', s1.shape) for i, cell in enumerate(self.cells): # weights are shared across all reduction cell or normal cell if cell.reduction: # if current cell is reduction cell weights = F.softmax(self.alpha_reduce, dim=-1) else: weights = F.softmax(self.alpha_normal, dim=-1) # [14, 8] # execute cell() firstly and then assign s0=s1, s1=result s0, s1 = s1, cell(s0, s1, weights) # [40, 64, 32, 32] - #print('cell:',i, s1.shape, cell.reduction, cell.reduction_prev) - #print('\n') # s1 is the last cell's output - #out = self.global_pooling(s1) - out = s1 - # logits = {} - # for task, fc in self.classifier.items(): - # logits[task] = fc(out.view(out.size(0), -1)) - logits = self.classifier(out.view(out.size(0), -1)) + logits = self.classifier(s1.view(s1.size(0), -1)) return logits From a6a83fc700c3d0fddd4c81b2270cff2da0eb7dc6 Mon Sep 17 00:00:00 2001 From: yngtodd Date: Tue, 7 Apr 2020 23:42:15 -0400 Subject: [PATCH 184/331] Rename operation module The placeholder 'original' (for the original paper's operations) was not very descriptive. These operations are all convolutional, renaming this accordingly makes more sense.
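The refactored forward() above and the MixedLayer it drives implement the standard DARTS continuous relaxation: each edge evaluates every candidate operation and blends the results with softmax-normalized architecture weights (the alpha parameters). The sketch below paraphrases that idea in stripped-down form; it is not code from these patches, and the two stand-in candidate ops and tensor shapes are invented for illustration.

# Stripped-down illustration of the DARTS mixed-operation idea used by the
# mixed_layer modules in these patches. The candidate ops here are placeholders;
# the real tables are the OPS dictionaries in darts/modules/operations/.
import torch
import torch.nn as nn
import torch.nn.functional as F

class ToyMixedLayer(nn.Module):
    def __init__(self, channels):
        super(ToyMixedLayer, self).__init__()
        self.ops = nn.ModuleList([
            nn.Conv1d(channels, channels, kernel_size=3, padding=1, bias=False),
            nn.Identity(),  # stands in for 'skip_connect'
        ])

    def forward(self, x, weights):
        # weights = softmax(alpha) for this edge; the output is the weighted blend,
        # the same pattern as sum(w * layer(x) ...) in mixed_layer.py.
        return sum(w * op(x) for w, op in zip(weights, self.ops))

alpha = torch.randn(2, requires_grad=True)   # one architecture weight per candidate op
layer = ToyMixedLayer(channels=8)
x = torch.randn(4, 8, 16)                    # [batch, channels, length], arbitrary sizes
out = layer(x, F.softmax(alpha, dim=-1))     # same shape as x
print(out.shape)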
--- common/darts/modules/cell.py | 4 +- common/darts/modules/classifier.py | 12 +- .../modules/linear/linear_mixed_layer.py | 4 +- common/darts/modules/mixed_layer.py | 2 +- common/darts/modules/operations/original.py | 167 ------------------ 5 files changed, 11 insertions(+), 178 deletions(-) delete mode 100644 common/darts/modules/operations/original.py diff --git a/common/darts/modules/cell.py b/common/darts/modules/cell.py index a25424cf..26415480 100644 --- a/common/darts/modules/cell.py +++ b/common/darts/modules/cell.py @@ -3,7 +3,7 @@ from darts.api import Model from darts.modules.mixed_layer import MixedLayer -from darts.modules.operations.original import ConvBlock, FactorizedReduce +from darts.modules.operations.convolution import ConvBlock, FactorizedReduce class Cell(Model): @@ -77,4 +77,4 @@ def forward(self, s0, s1, weights): #print('node:',i, s.shape, self.reduction) # concat along dim=channel - return torch.cat(states[-self.multiplier:], dim=1) # 6 of [40, 16, 32, 32] \ No newline at end of file + return torch.cat(states[-self.multiplier:], dim=1) # 6 of [40, 16, 32, 32] diff --git a/common/darts/modules/classifier.py b/common/darts/modules/classifier.py index 18910616..439c17f0 100644 --- a/common/darts/modules/classifier.py +++ b/common/darts/modules/classifier.py @@ -3,24 +3,24 @@ class MultitaskClassifier(nn.Module): - + def __init__(self, input_dim: int, tasks: Dict[str, int]): super(MultitaskClassifier, self).__init__() self.tasks = tasks - + for task, num_classes in tasks.items(): self.add_module( - task, + task, nn.Linear(input_dim, num_classes) ) - + def num_classes(self, task): """ Get number of classes for a task. """ return self.tasks[task] - + def forward(self, x): logits = {} for task, _ in self.tasks.items(): logits[task] = self._modules[task](x) - + return logits diff --git a/common/darts/modules/linear/linear_mixed_layer.py b/common/darts/modules/linear/linear_mixed_layer.py index 61d6a5ab..085fb56d 100644 --- a/common/darts/modules/linear/linear_mixed_layer.py +++ b/common/darts/modules/linear/linear_mixed_layer.py @@ -31,11 +31,11 @@ def forward(self, x, weights): """ Parameters ---------- - x : torch.tensor + x : torch.tensor Data Weights : torch.tensor alpha, [op_num:8], the output = sum of alpha * op(x) """ x = [w * layer(x) for w, layer in zip(weights, self.layers)] - return sum(x) + return sum(x) diff --git a/common/darts/modules/mixed_layer.py b/common/darts/modules/mixed_layer.py index 4ecd47d4..b95145c5 100644 --- a/common/darts/modules/mixed_layer.py +++ b/common/darts/modules/mixed_layer.py @@ -3,7 +3,7 @@ from darts.api import Model from darts.genotypes import PRIMITIVES -from darts.modules.operations.original import OPS +from darts.modules.operations.convolution import OPS class MixedLayer(Model): diff --git a/common/darts/modules/operations/original.py b/common/darts/modules/operations/original.py deleted file mode 100644 index 9bc5e14b..00000000 --- a/common/darts/modules/operations/original.py +++ /dev/null @@ -1,167 +0,0 @@ -""" -CNN NLP operations closely modeled after the original paper's vision task. 
-""" - -import torch -import torch.nn as nn - -from darts.api import Model - - -OPS = { - 'none' : lambda c, stride, affine: Zero(stride), - 'avg_pool_3' : lambda c, stride, affine: nn.AvgPool1d(3, stride=stride, padding=1, count_include_pad=False), - 'max_pool_3' : lambda c, stride, affine: nn.MaxPool1d(3, stride=stride, padding=1), - 'skip_connect': lambda c, stride, affine: Identity() if stride == 1 else FactorizedReduce(c, c, affine=affine), - 'sep_conv_3' : lambda c, stride, affine: SepConv(c, c, 3, stride, 1, affine=affine), - 'sep_conv_5' : lambda c, stride, affine: SepConv(c, c, 5, stride, 2, affine=affine), - 'sep_conv_7' : lambda c, stride, affine: SepConv(c, c, 7, stride, 3, affine=affine), - 'dil_conv_3' : lambda c, stride, affine: DilConv(c, c, 3, stride, 2, 2, affine=affine), - 'dil_conv_5' : lambda c, stride, affine: DilConv(c, c, 5, stride, 4, 2, affine=affine), - 'convblock_7' : lambda c, stride, affine: ConvBlock(c, c, 7, stride, 3, affine=affine), -} - - -class ConvBlock(Model): - """ ReLu -> Conv1d -> BatchNorm """ - - def __init__(self, c_in, c_out, kernel_size, stride, padding, affine=True): - super(ConvBlock, self).__init__() - - self.op = nn.Sequential( - nn.ReLU(inplace=False), - nn.Conv1d(c_in, c_out, kernel_size, stride=stride, padding=padding, bias=False), - nn.BatchNorm1d(c_out, affine=affine) - ) - - def forward(self, x): - return self.op(x) - - -class DilConv(Model): - """ ReLU Dilated Convolution """ - - def __init__(self, c_in, c_out, kernel_size, stride, padding, dilation, affine=True): - super(DilConv, self).__init__() - - self.op = nn.Sequential( - nn.ReLU(inplace=False), - - nn.Conv1d( - c_in, - c_in, - kernel_size=kernel_size, - stride=stride, - padding=padding, - dilation=dilation, - groups=c_in, - bias=False - ), - - nn.Conv1d( - c_in, - c_out, - kernel_size=1, - padding=0, - bias=False - ), - - nn.BatchNorm1d(c_out, affine=affine), - ) - - def forward(self, x): - return self.op(x) - - -class FactorizedReduce(Model): - """ Reduce the feature maps by half, maintaining number of channels - - Example - ------- - x: torch.Size([2, 10, 12]) - out: [batch_size, c_out, d//2] - out: torch.Size([2, 10, 6]) - """ - - def __init__(self, c_in, c_out, affine=True): - super(FactorizedReduce, self).__init__() - assert c_out % 2 == 0 - - self.conv_1 = nn.Conv1d(c_in, c_out // 2, 1, stride=2, padding=0, bias=False) - self.conv_2 = nn.Conv1d(c_in, c_out // 2, 1, stride=2, padding=0, bias=False) - self.bn = nn.BatchNorm1d(c_out, affine=affine) - - def forward(self, x): - x = torch.relu(x) - out = torch.cat([self.conv_1(x), self.conv_2(x[:, :, 1:])], dim=1) - out = self.bn(out) - return out - - -class Identity(Model): - - def __init__(self): - super(Identity, self).__init__() - - def forward(self, x): - return x - - -class SepConv(Model): - """ Separable Convolution Block """ - def __init__(self, c_in, c_out, kernel_size, stride, padding, affine=True): - super(SepConv, self).__init__() - - self.op = nn.Sequential( - nn.ReLU(inplace=False), - - nn.Conv1d( - c_in, - c_in, - kernel_size=kernel_size, - stride=stride, - padding=padding, - groups=c_in, - bias=False - ), - - nn.Conv1d( - c_in, - c_in, - kernel_size=1, - padding=0, - bias=False - ), - - nn.BatchNorm1d(c_in, affine=affine), - nn.ReLU(inplace=False), - - nn.Conv1d( - c_in, - c_in, - kernel_size=kernel_size, - stride=1, - padding=padding, - groups=c_in, - bias=False - ), - - nn.Conv1d(c_in, c_out, kernel_size=1, padding=0, bias=False), - nn.BatchNorm1d(c_out, affine=affine), - ) - - def forward(self, x): - return 
self.op(x) - - -class Zero(nn.Module): - """ Zero tensor by stride """ - - def __init__(self, stride): - super(Zero, self).__init__() - self.stride = stride - - def forward(self, x): - if self.stride == 1: - return x.mul(0.) - return x[:, :, ::self.stride].mul(0.) From 078401595b64b3b1c1cd23228dcabb437f65b910 Mon Sep 17 00:00:00 2001 From: yngtodd Date: Tue, 7 Apr 2020 23:43:52 -0400 Subject: [PATCH 185/331] Rename operations This is a more descriptive name than 'orginal'. --- .../darts/modules/operations/convolution.py | 167 ++++++++++++++++++ 1 file changed, 167 insertions(+) create mode 100644 common/darts/modules/operations/convolution.py diff --git a/common/darts/modules/operations/convolution.py b/common/darts/modules/operations/convolution.py new file mode 100644 index 00000000..9bc5e14b --- /dev/null +++ b/common/darts/modules/operations/convolution.py @@ -0,0 +1,167 @@ +""" +CNN NLP operations closely modeled after the original paper's vision task. +""" + +import torch +import torch.nn as nn + +from darts.api import Model + + +OPS = { + 'none' : lambda c, stride, affine: Zero(stride), + 'avg_pool_3' : lambda c, stride, affine: nn.AvgPool1d(3, stride=stride, padding=1, count_include_pad=False), + 'max_pool_3' : lambda c, stride, affine: nn.MaxPool1d(3, stride=stride, padding=1), + 'skip_connect': lambda c, stride, affine: Identity() if stride == 1 else FactorizedReduce(c, c, affine=affine), + 'sep_conv_3' : lambda c, stride, affine: SepConv(c, c, 3, stride, 1, affine=affine), + 'sep_conv_5' : lambda c, stride, affine: SepConv(c, c, 5, stride, 2, affine=affine), + 'sep_conv_7' : lambda c, stride, affine: SepConv(c, c, 7, stride, 3, affine=affine), + 'dil_conv_3' : lambda c, stride, affine: DilConv(c, c, 3, stride, 2, 2, affine=affine), + 'dil_conv_5' : lambda c, stride, affine: DilConv(c, c, 5, stride, 4, 2, affine=affine), + 'convblock_7' : lambda c, stride, affine: ConvBlock(c, c, 7, stride, 3, affine=affine), +} + + +class ConvBlock(Model): + """ ReLu -> Conv1d -> BatchNorm """ + + def __init__(self, c_in, c_out, kernel_size, stride, padding, affine=True): + super(ConvBlock, self).__init__() + + self.op = nn.Sequential( + nn.ReLU(inplace=False), + nn.Conv1d(c_in, c_out, kernel_size, stride=stride, padding=padding, bias=False), + nn.BatchNorm1d(c_out, affine=affine) + ) + + def forward(self, x): + return self.op(x) + + +class DilConv(Model): + """ ReLU Dilated Convolution """ + + def __init__(self, c_in, c_out, kernel_size, stride, padding, dilation, affine=True): + super(DilConv, self).__init__() + + self.op = nn.Sequential( + nn.ReLU(inplace=False), + + nn.Conv1d( + c_in, + c_in, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=c_in, + bias=False + ), + + nn.Conv1d( + c_in, + c_out, + kernel_size=1, + padding=0, + bias=False + ), + + nn.BatchNorm1d(c_out, affine=affine), + ) + + def forward(self, x): + return self.op(x) + + +class FactorizedReduce(Model): + """ Reduce the feature maps by half, maintaining number of channels + + Example + ------- + x: torch.Size([2, 10, 12]) + out: [batch_size, c_out, d//2] + out: torch.Size([2, 10, 6]) + """ + + def __init__(self, c_in, c_out, affine=True): + super(FactorizedReduce, self).__init__() + assert c_out % 2 == 0 + + self.conv_1 = nn.Conv1d(c_in, c_out // 2, 1, stride=2, padding=0, bias=False) + self.conv_2 = nn.Conv1d(c_in, c_out // 2, 1, stride=2, padding=0, bias=False) + self.bn = nn.BatchNorm1d(c_out, affine=affine) + + def forward(self, x): + x = torch.relu(x) + out = 
torch.cat([self.conv_1(x), self.conv_2(x[:, :, 1:])], dim=1) + out = self.bn(out) + return out + + +class Identity(Model): + + def __init__(self): + super(Identity, self).__init__() + + def forward(self, x): + return x + + +class SepConv(Model): + """ Separable Convolution Block """ + def __init__(self, c_in, c_out, kernel_size, stride, padding, affine=True): + super(SepConv, self).__init__() + + self.op = nn.Sequential( + nn.ReLU(inplace=False), + + nn.Conv1d( + c_in, + c_in, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=c_in, + bias=False + ), + + nn.Conv1d( + c_in, + c_in, + kernel_size=1, + padding=0, + bias=False + ), + + nn.BatchNorm1d(c_in, affine=affine), + nn.ReLU(inplace=False), + + nn.Conv1d( + c_in, + c_in, + kernel_size=kernel_size, + stride=1, + padding=padding, + groups=c_in, + bias=False + ), + + nn.Conv1d(c_in, c_out, kernel_size=1, padding=0, bias=False), + nn.BatchNorm1d(c_out, affine=affine), + ) + + def forward(self, x): + return self.op(x) + + +class Zero(nn.Module): + """ Zero tensor by stride """ + + def __init__(self, stride): + super(Zero, self).__init__() + self.stride = stride + + def forward(self, x): + if self.stride == 1: + return x.mul(0.) + return x[:, :, ::self.stride].mul(0.) From 2f6172c6e8abf2adc457e5dfbe8bc0b3686f592e Mon Sep 17 00:00:00 2001 From: yngtodd Date: Wed, 8 Apr 2020 00:28:45 -0400 Subject: [PATCH 186/331] Remove unnecessary constructor This was not used since there is now a second constructor in the model to give us a new instance with set alpha parameters. --- common/darts/modules/linear_network.py | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/common/darts/modules/linear_network.py b/common/darts/modules/linear_network.py index 80e43a2c..2eb6e146 100644 --- a/common/darts/modules/linear_network.py +++ b/common/darts/modules/linear_network.py @@ -14,14 +14,13 @@ class Hyperparameters: num_cells = 3 channel_multiplier = 1 stem_channel_multiplier = 1 - input_dim = 5270 intermediate_dim = 100 class Network(Model): """ Collection of cells """ - def __init__(self, tasks, criterion, device='cpu', hyperparams=Hyperparameters()): + def __init__(self, input_dim, tasks, criterion, device='cpu', hyperparams=Hyperparameters()): super(Network, self).__init__() self.tasks = tasks self.criterion = criterion @@ -34,10 +33,10 @@ def __init__(self, tasks, criterion, device='cpu', hyperparams=Hyperparameters() # stem_multiplier is for stem network, # and multiplier is for general cell c_curr = hyperparams.stem_channel_multiplier * self.c - # stem network, convert 3 channel to c_curr + self.stem = nn.Sequential( nn.Linear( - hyperparams.input_dim, hyperparams.intermediate_dim + input_dim, hyperparams.intermediate_dim ), ).to(self.device) @@ -201,16 +200,3 @@ def _parse(weights): return genotype - -def new(c, num_classes, num_layers, criterion, device, steps=4, multiplier=4, stem_multiplier=3): - """ - create a new model and initialize it with current alpha parameters. - However, its weights are left untouched. - :return: - """ - model = Network(c, num_classes, num_layers, criterion, steps, multiplier, stem_multiplier).to(device) - - for x, y in zip(model_new.arch_parameters(), self.arch_parameters()): - x.data.copy_(y.data) - - return model From 53e9c5c3a2908fb71721dd744a6ad91c8af5df87 Mon Sep 17 00:00:00 2001 From: yngtodd Date: Wed, 8 Apr 2020 00:35:15 -0400 Subject: [PATCH 187/331] Remove unnecessary comments Just tidying up a bit. 
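For reference, the factorized reduction defined in the operation set above can be exercised standalone. The sketch below uses plain torch only, with sizes taken from the class docstring's example; it shows how two stride-2 pointwise convolutions, the second applied to the input shifted by one step, halve the sequence length while preserving the channel count.

.. code-block:: python

    import torch
    import torch.nn as nn

    # Standalone sketch of the FactorizedReduce pattern shown above.
    c_in, c_out = 10, 10
    conv_1 = nn.Conv1d(c_in, c_out // 2, 1, stride=2, padding=0, bias=False)
    conv_2 = nn.Conv1d(c_in, c_out // 2, 1, stride=2, padding=0, bias=False)
    bn = nn.BatchNorm1d(c_out, affine=True)

    x = torch.relu(torch.randn(2, c_in, 12))   # [batch, channels, length]
    out = torch.cat([conv_1(x), conv_2(x[:, :, 1:])], dim=1)
    out = bn(out)
    print(out.shape)                           # torch.Size([2, 10, 6])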
--- common/darts/modules/linear/linear_cell.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/common/darts/modules/linear/linear_cell.py b/common/darts/modules/linear/linear_cell.py index 4854d485..ca1fe2be 100644 --- a/common/darts/modules/linear/linear_cell.py +++ b/common/darts/modules/linear/linear_cell.py @@ -49,14 +49,10 @@ def forward(self, s0, s1, weights): """ states = [s0, s1] offset = 0 - # for each node, receive input from all previous intermediate nodes and s0, s1 + # for each node, receive input from + # all previous intermediate nodes and s0, s1 for i in range(self.num_nodes): # 4 - # [40, 16, 32, 32] - #s = sum(self.layers[offset + j](h, weights[offset + j]) for j, h in enumerate(states)) offset += len(states) - # append one state since s is the elem-wise addition of all output - #states.append(s) - #print('node:',i, s.shape, self.reduction) # concat along dim=channel - return torch.cat(states[-self.multiplier:], dim=1) # 6 of [40, 16, 32, 32] + return torch.cat(states[-self.multiplier:], dim=1) From 70db7df26a01b0fa3e21651024f3cf697f8db494 Mon Sep 17 00:00:00 2001 From: yngtodd Date: Wed, 8 Apr 2020 00:55:24 -0400 Subject: [PATCH 188/331] Refactor directories This directory structure makes a bit more sense. Previously the modules directory had Python modules pertaining to the linear networks and the convolutional networks. This will decrease burden on people trying to navigate the codebase. --- common/darts/modules/conv/__init__.py | 0 common/darts/modules/{ => conv}/cell.py | 2 +- common/darts/modules/{ => conv}/mixed_layer.py | 0 common/darts/modules/{ => conv}/network.py | 2 +- common/darts/modules/{ => linear}/linear_network.py | 0 5 files changed, 2 insertions(+), 2 deletions(-) create mode 100644 common/darts/modules/conv/__init__.py rename common/darts/modules/{ => conv}/cell.py (98%) rename common/darts/modules/{ => conv}/mixed_layer.py (100%) rename common/darts/modules/{ => conv}/network.py (99%) rename common/darts/modules/{ => linear}/linear_network.py (100%) diff --git a/common/darts/modules/conv/__init__.py b/common/darts/modules/conv/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/common/darts/modules/cell.py b/common/darts/modules/conv/cell.py similarity index 98% rename from common/darts/modules/cell.py rename to common/darts/modules/conv/cell.py index 26415480..203b4fab 100644 --- a/common/darts/modules/cell.py +++ b/common/darts/modules/conv/cell.py @@ -2,7 +2,7 @@ import torch.nn as nn from darts.api import Model -from darts.modules.mixed_layer import MixedLayer +from darts.modules.conv.mixed_layer import MixedLayer from darts.modules.operations.convolution import ConvBlock, FactorizedReduce diff --git a/common/darts/modules/mixed_layer.py b/common/darts/modules/conv/mixed_layer.py similarity index 100% rename from common/darts/modules/mixed_layer.py rename to common/darts/modules/conv/mixed_layer.py diff --git a/common/darts/modules/network.py b/common/darts/modules/conv/network.py similarity index 99% rename from common/darts/modules/network.py rename to common/darts/modules/conv/network.py index 8b0ea92c..c4c30872 100644 --- a/common/darts/modules/network.py +++ b/common/darts/modules/conv/network.py @@ -3,7 +3,7 @@ import torch.nn.functional as F from darts.api import Model -from darts.modules.cell import Cell +from darts.modules.conv.cell import Cell from darts.modules.classifier import MultitaskClassifier from darts.genotypes import PRIMITIVES, Genotype diff --git 
a/common/darts/modules/linear_network.py b/common/darts/modules/linear/linear_network.py similarity index 100% rename from common/darts/modules/linear_network.py rename to common/darts/modules/linear/linear_network.py From eae246737ce04c0e029b66ac480e2a3d76b2cb47 Mon Sep 17 00:00:00 2001 From: yngtodd Date: Wed, 8 Apr 2020 01:10:16 -0400 Subject: [PATCH 189/331] Add publically available classes Setting the imports in the root __init__.py shows what classes and functions we consider to be pulbic. This will help users find what they need without having to understand everything in the library. --- common/darts/__init__.py | 11 +++++++++++ common/darts/modules/conv/network.py | 2 +- .../darts/modules/linear/{linear_cell.py => cell.py} | 2 +- .../linear/{linear_mixed_layer.py => mixed_layer.py} | 0 .../modules/linear/{linear_network.py => network.py} | 4 ++-- 5 files changed, 15 insertions(+), 4 deletions(-) rename common/darts/modules/linear/{linear_cell.py => cell.py} (96%) rename common/darts/modules/linear/{linear_mixed_layer.py => mixed_layer.py} (100%) rename common/darts/modules/linear/{linear_network.py => network.py} (98%) diff --git a/common/darts/__init__.py b/common/darts/__init__.py index 974de4cc..e709a763 100644 --- a/common/darts/__init__.py +++ b/common/darts/__init__.py @@ -1,3 +1,14 @@ __author__ = 'Todd Young' __email__ = 'youngmt1@ornl.gov' __version__ = '0.1.0' + +from .architecture import Architecture +from .modules.conv.network imoprt ConvNetwork +from .modules.linear.network import LinearNetwork + + +__all__ = [ + "Architecture", + "ConvNetwork", + "LinearNetwork", +] diff --git a/common/darts/modules/conv/network.py b/common/darts/modules/conv/network.py index c4c30872..13644a11 100644 --- a/common/darts/modules/conv/network.py +++ b/common/darts/modules/conv/network.py @@ -18,7 +18,7 @@ class Hyperparameters: embedding_dim = 1500 -class Network(Model): +class ConvNetwork(Model): """ Collection of cells """ def __init__(self, tasks, criterion, device='cpu', hyperparams=Hyperparameters()): diff --git a/common/darts/modules/linear/linear_cell.py b/common/darts/modules/linear/cell.py similarity index 96% rename from common/darts/modules/linear/linear_cell.py rename to common/darts/modules/linear/cell.py index ca1fe2be..b533c61b 100644 --- a/common/darts/modules/linear/linear_cell.py +++ b/common/darts/modules/linear/cell.py @@ -2,7 +2,7 @@ import torch.nn as nn from darts.api import Model -from darts.modules.linear.linear_mixed_layer import MixedLayer +from darts.modules.linear.mixed_layer import MixedLayer class Cell(Model): diff --git a/common/darts/modules/linear/linear_mixed_layer.py b/common/darts/modules/linear/mixed_layer.py similarity index 100% rename from common/darts/modules/linear/linear_mixed_layer.py rename to common/darts/modules/linear/mixed_layer.py diff --git a/common/darts/modules/linear/linear_network.py b/common/darts/modules/linear/network.py similarity index 98% rename from common/darts/modules/linear/linear_network.py rename to common/darts/modules/linear/network.py index 2eb6e146..061ca46e 100644 --- a/common/darts/modules/linear/linear_network.py +++ b/common/darts/modules/linear/network.py @@ -3,7 +3,7 @@ import torch.nn.functional as F from darts.api import Model -from darts.modules.linear.linear_cell import Cell +from darts.modules.linear.cell import Cell from darts.modules.classifier import MultitaskClassifier from darts.genotypes import LINEAR_PRIMITIVES, Genotype @@ -17,7 +17,7 @@ class Hyperparameters: intermediate_dim = 100 -class 
Network(Model): +class LinearNetwork(Model): """ Collection of cells """ def __init__(self, input_dim, tasks, criterion, device='cpu', hyperparams=Hyperparameters()): From a332d131ba59f51fb37f888bd978a3c6e0b53d10 Mon Sep 17 00:00:00 2001 From: yngtodd Date: Wed, 8 Apr 2020 01:13:36 -0400 Subject: [PATCH 190/331] Follow naming conventions Since conv is used at various other locations in the library, we should keep consistent here. --- common/darts/modules/conv/mixed_layer.py | 2 +- common/darts/modules/operations/{convolution.py => conv.py} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename common/darts/modules/operations/{convolution.py => conv.py} (100%) diff --git a/common/darts/modules/conv/mixed_layer.py b/common/darts/modules/conv/mixed_layer.py index b95145c5..8da4373b 100644 --- a/common/darts/modules/conv/mixed_layer.py +++ b/common/darts/modules/conv/mixed_layer.py @@ -3,7 +3,7 @@ from darts.api import Model from darts.genotypes import PRIMITIVES -from darts.modules.operations.convolution import OPS +from darts.modules.operations.conv import OPS class MixedLayer(Model): diff --git a/common/darts/modules/operations/convolution.py b/common/darts/modules/operations/conv.py similarity index 100% rename from common/darts/modules/operations/convolution.py rename to common/darts/modules/operations/conv.py From 3fc416e6354cab7e80d875537244678d9cc579c4 Mon Sep 17 00:00:00 2001 From: yngtodd Date: Wed, 8 Apr 2020 02:00:28 -0400 Subject: [PATCH 191/331] Add README This is the start of the DARTS README. Right now it is serving as an outline for the pull Candle Benchmarks pull request. As this work is being finished, I will add examples on how to use the library here. --- common/darts/README.rst | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 common/darts/README.rst diff --git a/common/darts/README.rst b/common/darts/README.rst new file mode 100644 index 00000000..89c5ec1b --- /dev/null +++ b/common/darts/README.rst @@ -0,0 +1,39 @@ +===== +DARTS +===== + +Differentiable architecture search + + +Notes +----- + +The following steps should be finished before merging the PR: + +[] Expert level `Network` with user defined primitives and stem +[] Examples +[] README overview of the library + +Expert Level Network +-------------------- + +The user must define: + +1. Fundamental operations +2. Ops constructor for fundamental operations +3. Primitives list + +Draft +----- + +.. code-block:: python + + class Network(stem, primitives, ops): + self.stem = stem + self.primitives = primitives + self ops = ops + + def _helper_init(self, ...): + """ Helper to construct the private member variables """ + raise NotImplementedError + From 46883fd37bf08ceaa6078439064159d3e1633eb5 Mon Sep 17 00:00:00 2001 From: yngtodd Date: Wed, 8 Apr 2020 02:03:14 -0400 Subject: [PATCH 192/331] Fix README The checklist was not showing properly. 
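As a rough illustration of the primitives/OPS pattern that the expert-level interface outlined in the README would build on (the names below are illustrative, not the library's public API): each primitive name maps to a factory taking (channels, stride, affine), and the primitives list fixes the order in which candidate operations are instantiated.

.. code-block:: python

    import torch
    import torch.nn as nn

    # Illustrative user-defined search space in the style of the OPS dict above.
    PRIMITIVES = ['skip_connect', 'conv_3', 'max_pool_3']

    OPS = {
        'skip_connect': lambda c, stride, affine: nn.Identity(),
        'conv_3':       lambda c, stride, affine: nn.Conv1d(c, c, 3, stride=stride, padding=1, bias=False),
        'max_pool_3':   lambda c, stride, affine: nn.MaxPool1d(3, stride=stride, padding=1),
    }

    c, stride = 8, 1
    candidates = nn.ModuleList([OPS[name](c, stride, True) for name in PRIMITIVES])

    x = torch.randn(4, c, 16)
    outputs = [op(x) for op in candidates]     # one output per candidate primitive
    print([tuple(o.shape) for o in outputs])   # three tensors of shape (4, 8, 16)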
--- common/darts/README.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/common/darts/README.rst b/common/darts/README.rst index 89c5ec1b..2a392975 100644 --- a/common/darts/README.rst +++ b/common/darts/README.rst @@ -10,9 +10,9 @@ Notes The following steps should be finished before merging the PR: -[] Expert level `Network` with user defined primitives and stem -[] Examples -[] README overview of the library +- [ ] Expert level `Network` with user defined primitives and stem +- [ ] Examples +- [ ] README overview of the library Expert Level Network -------------------- From f6cd5840909128d0fe7a79d5a47153c5966158e3 Mon Sep 17 00:00:00 2001 From: yngtodd Date: Wed, 8 Apr 2020 02:07:58 -0400 Subject: [PATCH 193/331] Update README Adding links to the authors' original paper. --- common/darts/README.rst | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/common/darts/README.rst b/common/darts/README.rst index 2a392975..855cc039 100644 --- a/common/darts/README.rst +++ b/common/darts/README.rst @@ -4,6 +4,9 @@ DARTS Differentiable architecture search +This is an adaptation of Hanxiao Liu et al's DARTS algorithm, extending +the work to handle convolutional neural networks for NLP problems and more. +Details of the original authors' approach can be found in their 2019 ICLR paper_. Notes ----- @@ -28,7 +31,10 @@ Draft .. code-block:: python - class Network(stem, primitives, ops): + class Network: + """ Expert mode network """ + + def __init__(self, stem, primitives, ops): self.stem = stem self.primitives = primitives self ops = ops @@ -37,3 +43,8 @@ Draft """ Helper to construct the private member variables """ raise NotImplementedError + +.. References +.. ---------- +.. _paper: https://openreview.net/forum?id=S1eYHoC5FX + From 1e45ec339a6de7d6a56ac78f578732278bd9b2e3 Mon Sep 17 00:00:00 2001 From: yngtodd Date: Wed, 8 Apr 2020 02:09:19 -0400 Subject: [PATCH 194/331] Fix README I will not stand even for busted pseudocode. --- common/darts/README.rst | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/common/darts/README.rst b/common/darts/README.rst index 855cc039..048eeb01 100644 --- a/common/darts/README.rst +++ b/common/darts/README.rst @@ -34,14 +34,14 @@ Draft class Network: """ Expert mode network """ - def __init__(self, stem, primitives, ops): - self.stem = stem - self.primitives = primitives - self ops = ops - - def _helper_init(self, ...): - """ Helper to construct the private member variables """ - raise NotImplementedError + def __init__(self, stem, primitives, ops): + self.stem = stem + self.primitives = primitives + self ops = ops + + def _helper_init(self, ...): + """ Helper to construct the private member variables """ + raise NotImplementedError .. References From 3adb24b57b954d1c27277626d0a29f279a07cb7e Mon Sep 17 00:00:00 2001 From: yngtodd Date: Wed, 8 Apr 2020 09:33:41 -0400 Subject: [PATCH 195/331] Fix typo Just fixing import mispelling. 
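The mixed layers these operations feed (renamed above to darts.modules.conv.mixed_layer and darts.modules.linear.mixed_layer) can be pictured with a simplified stand-in rather than the library code itself: candidate outputs are combined with softmax-normalized architecture weights, the continuous relaxation at the heart of DARTS.

.. code-block:: python

    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    class TinyMixedLayer(nn.Module):
        """Simplified stand-in for a DARTS mixed layer."""

        def __init__(self, c):
            super().__init__()
            self.ops = nn.ModuleList([
                nn.Identity(),
                nn.Conv1d(c, c, 3, padding=1, bias=False),
                nn.MaxPool1d(3, stride=1, padding=1),
            ])

        def forward(self, x, alphas):
            weights = F.softmax(alphas, dim=-1)      # architecture weights
            return sum(w * op(x) for w, op in zip(weights, self.ops))

    layer = TinyMixedLayer(c=8)
    alphas = torch.zeros(3, requires_grad=True)      # learnable architecture parameters
    out = layer(torch.randn(4, 8, 16), alphas)
    print(out.shape)                                 # torch.Size([4, 8, 16])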
--- common/darts/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/darts/__init__.py b/common/darts/__init__.py index e709a763..c775c273 100644 --- a/common/darts/__init__.py +++ b/common/darts/__init__.py @@ -3,7 +3,7 @@ __version__ = '0.1.0' from .architecture import Architecture -from .modules.conv.network imoprt ConvNetwork +from .modules.conv.network import ConvNetwork from .modules.linear.network import LinearNetwork From 7db6c99dab0530aff0f0aa3c32dbd16ae380e991 Mon Sep 17 00:00:00 2001 From: Cristina Garcia Cardona Date: Wed, 8 Apr 2020 10:02:35 -0600 Subject: [PATCH 196/331] added confusion matrix note attn abs --- Pilot1/Attn1/attn_abstention_keras2.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Pilot1/Attn1/attn_abstention_keras2.py b/Pilot1/Attn1/attn_abstention_keras2.py index abff61c8..6a9c40dc 100644 --- a/Pilot1/Attn1/attn_abstention_keras2.py +++ b/Pilot1/Attn1/attn_abstention_keras2.py @@ -404,9 +404,11 @@ def evaluate_abstention(params, root_fname, nb_classes, Y_test, _Y_test, Y_predi class_names=['Non-Response','Response', 'Abstain'] fname = params['save_path'] + root_fname + '.confusion_without_norm.pdf' attnviz.plot_confusion_matrix(cnf_matrix, fname, classes=class_names, title='Confusion matrix, without normalization') + print('NOTE: Confusion matrix above has zeros in the last row since the ground-truth does not include samples in the abstaining class.') # Plot normalized confusion matrix fname = params['save_path'] + root_fname + '.confusion_with_norm.pdf' attnviz.plot_confusion_matrix(cnf_matrix, fname, classes=class_names, normalize=True, title='Normalized confusion matrix') + print('NOTE: Normalized confusion matrix above has NaNs in the last row since the ground-truth does not include samples in the abstaining class.') print('Examples:\n Total: {}\n Positive: {} ({:.2f}% of total)\n'.format(total, pos, 100 * pos / total)) From 80fe1f8d7f0205a773a10af88101b7c25b297ae1 Mon Sep 17 00:00:00 2001 From: Cristina Garcia Cardona Date: Wed, 8 Apr 2020 10:03:13 -0600 Subject: [PATCH 197/331] added config_file keywords verification --- common/default_utils.py | 47 +++++++++++++++++++++++++++++++++++------ 1 file changed, 41 insertions(+), 6 deletions(-) diff --git a/common/default_utils.py b/common/default_utils.py index 9cdb4dca..ba3d5e96 100644 --- a/common/default_utils.py +++ b/common/default_utils.py @@ -6,6 +6,7 @@ import inspect import logging +import warnings import os import sys @@ -30,6 +31,8 @@ DEFAULT_DATATYPE = np.float32 +PARAMETERS_CANDLE = ['config_file', 'verbose', 'logfile', 'save_path', 'model_file', 'data_type', 'dense', 'rng_seed', 'epochs', 'batch_size', 'train_bool', 'eval_bool', 'timeout', 'home_dir', 'train_data', 'test_data', 'output_dir', 'data_url', 'experiment_id', 'run_id', 'conv', 'locally_connected', 'activation', 'out_activation', 'lstm_size', 'recurrent_dropout', 'drop', 'pool', 'batch_normalization', 'loss', 'optimizer', 'metrics', 'scaling', 'shuffle', 'feature_subsample', 'learning_rate', 'initialization', 'val_split', 'train_steps', 'val_steps', 'test_steps', 'train_samples', 'val_samples', 'gpus', 'profiling'] + #### IO UTILS def fetch_file(link, subdir, untar=False, md5_hash=None): @@ -317,6 +320,36 @@ def set_seed(seed): random.seed(seed) +def check_file_parameters_exists(params_parser, params_file): + """Functionality to verify that the parameters defined in the configuraion file are recognizable by the command line parser (i.e. no uknown keywords are used in the configuration file). 
+ + Parameters + ---------- + params_parser : python dictionary + Includes parameters set via the command line. + params_file : python dictionary + Includes parameters read from the configuration file. + + Global: + PARAMETERS_CANDLE : python list + Includes all the core keywords that are specified in CANDLE. + """ + # Get keywords from arguments coming via command line (and CANDLE supervisor) + args_dict = vars(params_parser) + args_set = set(args_dict.keys()) + # Get core CANDLE keywords + candle_set = set(PARAMETERS_CANDLE) + # Consolidate keywords from CANDLE core, command line and CANDLE supervisor + candle_set = candle_set.union(args_set) + # Get keywords used in config_file + file_set = set(params_file.keys()) + # Compute keywords that come from the config_file that are not in the CANDLE specs + diff_set = file_set.difference(candle_set) + + if ( len(diff_set) > 0 ): + message = 'These keywords used in the configuration file are not defined in CANDLE: ' + str(sorted(diff_set)) + warnings.warn(message, RuntimeWarning) + def finalize_parameters(bmk): """Utility to parse parameters in common as well as parameters @@ -352,11 +385,13 @@ def finalize_parameters(bmk): else: # a 'config_file' has been set --> use this file conffile = os.path.join(bmk.file_path, conffile_txt) - print("Configuration file: ", conffile) + #print("Configuration file: ", conffile) fileParameters = bmk.read_config_file(conffile)#aux.config_file)#args.config_file) # Get command-line parameters args = bmk.parser.parse_args() #print ('Params:', fileParameters) + # Check keywords from file against CANDLE common and module definitions + check_file_parameters_exists(args, fileParameters) # Consolidate parameter set. Command-line parameters overwrite file configuration gParameters = args_overwrite_config(args, fileParameters) # Check that required set of parameters has been defined @@ -599,11 +634,11 @@ def args_overwrite_config(args, config): params[key] = args_dict[key] - if 'datatype' not in params: - params['datatype'] = DEFAULT_DATATYPE + if 'data_type' not in params: + params['data_type'] = DEFAULT_DATATYPE else: - if params['datatype'] in set(['f16', 'f32', 'f64']): - params['datatype'] = get_choice(params['datatype']) + if params['data_type'] in set(['f16', 'f32', 'f64']): + params['data_type'] = get_choice(params['datatype']) if 'output_dir' not in params: params['output_dir'] = directory_from_parameters(params) @@ -826,7 +861,7 @@ def read_config_file(self, file): fileParams[k] = eval(v) fileParams = self.format_benchmark_config_arguments(fileParams) - pprint(fileParams) + #pprint(fileParams) return fileParams From d72821621d8f6074c2a1587f30b498eb299d6eec Mon Sep 17 00:00:00 2001 From: Cristina Garcia Cardona Date: Wed, 8 Apr 2020 12:27:13 -0600 Subject: [PATCH 198/331] updated datatype to data_type in benchmarks --- Pilot1/P1B1/p1b1.py | 4 ++-- Pilot1/P1B2/p1b2.py | 4 ++-- Pilot1/P1B2/p1b2_baseline_neon.py | 2 +- Pilot1/P1B3/p1b3_baseline_keras2.py | 2 +- Pilot1/TC1/tc1.py | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Pilot1/P1B1/p1b1.py b/Pilot1/P1B1/p1b1.py index 6eccb92c..ee4afb57 100644 --- a/Pilot1/P1B1/p1b1.py +++ b/Pilot1/P1B1/p1b1.py @@ -168,7 +168,7 @@ def load_data(params, seed): n_cols=params['feature_subsample'], shuffle=params['shuffle'], scaling=params['scaling'], - dtype=params['datatype'], + dtype=params['data_type'], validation_split=params['validation_split'], return_dataframe=False, return_header=True, @@ -200,7 +200,7 @@ def load_data_orig(params, seed): 
shuffle=params['shuffle'], scaling=params['scaling'], validation_split=params['validation_split'], - dtype=params['datatype'], + dtype=params['data_type'], seed=seed) diff --git a/Pilot1/P1B2/p1b2.py b/Pilot1/P1B2/p1b2.py index 206005d9..b1e3be49 100644 --- a/Pilot1/P1B2/p1b2.py +++ b/Pilot1/P1B2/p1b2.py @@ -94,7 +94,7 @@ def load_data_one_hot(params, seed): shuffle=params['shuffle'], scaling=params['scaling'], validation_split=params['validation_split'], - dtype=params['datatype'], + dtype=params['data_type'], seed=seed) @@ -109,7 +109,7 @@ def load_data(params, seed): shuffle=params['shuffle'], scaling=params['scaling'], validation_split=params['validation_split'], - dtype=params['datatype'], + dtype=params['data_type'], seed=seed) diff --git a/Pilot1/P1B2/p1b2_baseline_neon.py b/Pilot1/P1B2/p1b2_baseline_neon.py index e36ef67c..3628e249 100644 --- a/Pilot1/P1B2/p1b2_baseline_neon.py +++ b/Pilot1/P1B2/p1b2_baseline_neon.py @@ -101,7 +101,7 @@ def main(): rng_seed=seed, device_id=args.device_id, batch_size=gParameters['batch_size'], - datatype=gParameters['datatype'], + datatype=gParameters['data_type'], max_devices=args.max_devices, compat_mode=args.compat_mode) diff --git a/Pilot1/P1B3/p1b3_baseline_keras2.py b/Pilot1/P1B3/p1b3_baseline_keras2.py index f46e02cb..75986af6 100644 --- a/Pilot1/P1B3/p1b3_baseline_keras2.py +++ b/Pilot1/P1B3/p1b3_baseline_keras2.py @@ -259,7 +259,7 @@ def run(gParameters): seed = gParameters['rng_seed'] # Build dataset loader object - loader = benchmark.DataLoader(seed=seed, dtype=gParameters['datatype'], + loader = benchmark.DataLoader(seed=seed, dtype=gParameters['data_type'], val_split=gParameters['validation_split'], test_cell_split=gParameters['test_cell_split'], cell_features=gParameters['cell_features'], diff --git a/Pilot1/TC1/tc1.py b/Pilot1/TC1/tc1.py index b36c9663..631a89c4 100644 --- a/Pilot1/TC1/tc1.py +++ b/Pilot1/TC1/tc1.py @@ -69,4 +69,4 @@ def load_data(params): return candle.load_Xy_data_noheader(train_path, test_path, params['classes'], usecols, - scaling='maxabs',dtype=params['datatype']) + scaling='maxabs',dtype=params['data_type']) From 9ba7bd2610cb4a3b1a8c9e34d2b39837604b4c78 Mon Sep 17 00:00:00 2001 From: yngtodd Date: Wed, 8 Apr 2020 14:54:34 -0400 Subject: [PATCH 199/331] Add test This will be easier to show work on paths. 
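The configuration-file keyword check added to default_utils.py earlier in this series comes down to simple set arithmetic. The condensed sketch below (function and variable names are illustrative, not the CANDLE API, and the core keyword list is abridged) warns about any configuration-file key that is neither a command-line argument nor a known CANDLE keyword, for example the old 'datatype' spelling once the benchmarks use 'data_type'.

.. code-block:: python

    import argparse
    import warnings

    CORE_KEYWORDS = {'config_file', 'epochs', 'batch_size', 'data_type'}   # abridged

    def warn_unknown_config_keys(parsed_args, file_params, core_keywords=CORE_KEYWORDS):
        """Condensed sketch of the keyword verification described above."""
        known = set(vars(parsed_args).keys()) | set(core_keywords)
        unknown = set(file_params.keys()) - known
        if unknown:
            warnings.warn('These keywords used in the configuration file are not '
                          'defined in CANDLE: ' + str(sorted(unknown)), RuntimeWarning)

    parser = argparse.ArgumentParser()
    parser.add_argument('--learning_rate', type=float, default=0.001)
    args = parser.parse_args([])
    # Warns about 'datatype', which is no longer a recognized keyword.
    warn_unknown_config_keys(args, {'learning_rate': 0.01, 'datatype': 'f32'})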
--- Pilot3/P3B5/test.py | 16 ++++++++++++++++ common/darts/__init__.py | 5 ++++- common/darts/modules/conv/__init__.py | 4 ++++ common/darts/modules/conv/network.py | 14 +++++++------- 4 files changed, 31 insertions(+), 8 deletions(-) create mode 100644 Pilot3/P3B5/test.py diff --git a/Pilot3/P3B5/test.py b/Pilot3/P3B5/test.py new file mode 100644 index 00000000..abf8ff6d --- /dev/null +++ b/Pilot3/P3B5/test.py @@ -0,0 +1,16 @@ +import os +import sys + + +file_path = os.path.dirname(os.path.realpath(__file__)) +lib_path = os.path.abspath(os.path.join(file_path, '..')) +sys.path.append(lib_path) +lib_path2 = os.path.abspath(os.path.join(file_path, '..', '..', 'common',)) +sys.path.append(lib_path2) + + +import darts + +print(darts.architecture.Architecture) +print(darts.ConvNetwork) +print(darts.Genotype) diff --git a/common/darts/__init__.py b/common/darts/__init__.py index c775c273..05c0e81b 100644 --- a/common/darts/__init__.py +++ b/common/darts/__init__.py @@ -1,8 +1,11 @@ +from __future__ import absolute_import + __author__ = 'Todd Young' __email__ = 'youngmt1@ornl.gov' __version__ = '0.1.0' -from .architecture import Architecture + +from architecture import Architecture from .modules.conv.network import ConvNetwork from .modules.linear.network import LinearNetwork diff --git a/common/darts/modules/conv/__init__.py b/common/darts/modules/conv/__init__.py index e69de29b..0b2287c3 100644 --- a/common/darts/modules/conv/__init__.py +++ b/common/darts/modules/conv/__init__.py @@ -0,0 +1,4 @@ +from .cell import Cell +from .network import Network +from .mixed_layer import MixedLayer + diff --git a/common/darts/modules/conv/network.py b/common/darts/modules/conv/network.py index 13644a11..6cb95b4c 100644 --- a/common/darts/modules/conv/network.py +++ b/common/darts/modules/conv/network.py @@ -3,19 +3,19 @@ import torch.nn.functional as F from darts.api import Model -from darts.modules.conv.cell import Cell +from darts.modules.conv import Cell from darts.modules.classifier import MultitaskClassifier from darts.genotypes import PRIMITIVES, Genotype class Hyperparameters: - c = 8 - num_nodes = 2 - num_cells = 3 - channel_multiplier = 2 - stem_channel_multiplier = 2 + c = 8 + num_nodes = 2 + num_cells = 3 + channel_multiplier = 2 + stem_channel_multiplier = 2 num_embeddings = 35095 # vocab size - embedding_dim = 1500 + embedding_dim = 1500 class ConvNetwork(Model): From 588eeedabf63d7970c99c1a3cf40d55dd7cae940 Mon Sep 17 00:00:00 2001 From: Cristina Garcia Cardona Date: Wed, 8 Apr 2020 13:03:26 -0600 Subject: [PATCH 200/331] added benchmark definitions to keyword check --- common/default_utils.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/common/default_utils.py b/common/default_utils.py index ba3d5e96..6b802b4f 100644 --- a/common/default_utils.py +++ b/common/default_utils.py @@ -320,13 +320,15 @@ def set_seed(seed): random.seed(seed) -def check_file_parameters_exists(params_parser, params_file): +def check_file_parameters_exists(params_parser, params_benchmark, params_file): """Functionality to verify that the parameters defined in the configuraion file are recognizable by the command line parser (i.e. no uknown keywords are used in the configuration file). Parameters ---------- params_parser : python dictionary Includes parameters set via the command line. + params_benchmark : python list + Includes additional parameters defined in the benchmark. params_file : python dictionary Includes parameters read from the configuration file. 
@@ -337,10 +339,16 @@ def check_file_parameters_exists(params_parser, params_file): # Get keywords from arguments coming via command line (and CANDLE supervisor) args_dict = vars(params_parser) args_set = set(args_dict.keys()) + # Get keywords from benchmark definition + bmk_keys = [] + for item in params_benchmark: + bmk_keys.append( item['name'] ) + bmk_set = set(bmk_keys) # Get core CANDLE keywords candle_set = set(PARAMETERS_CANDLE) - # Consolidate keywords from CANDLE core, command line and CANDLE supervisor + # Consolidate keywords from CANDLE core, command line, CANDLE supervisor and benchmark candle_set = candle_set.union(args_set) + candle_set = candle_set.union(bmk_set) # Get keywords used in config_file file_set = set(params_file.keys()) # Compute keywords that come from the config_file that are not in the CANDLE specs @@ -391,7 +399,8 @@ def finalize_parameters(bmk): args = bmk.parser.parse_args() #print ('Params:', fileParameters) # Check keywords from file against CANDLE common and module definitions - check_file_parameters_exists(args, fileParameters) + bmk_dict = bmk.additional_definitions + check_file_parameters_exists(args, bmk_dict, fileParameters) # Consolidate parameter set. Command-line parameters overwrite file configuration gParameters = args_overwrite_config(args, fileParameters) # Check that required set of parameters has been defined From cb5a001c1c9220042cb5a1d4f908deddd8faeb89 Mon Sep 17 00:00:00 2001 From: yngtodd Date: Wed, 8 Apr 2020 16:28:00 -0400 Subject: [PATCH 201/331] Fix names There were some changes to the network names that were not updated in all files. --- Pilot3/P3B5/test.py | 4 +++- common/darts/__init__.py | 2 +- common/darts/modules/conv/__init__.py | 2 +- common/darts/modules/conv/cell.py | 2 +- 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/Pilot3/P3B5/test.py b/Pilot3/P3B5/test.py index abf8ff6d..bd4e0122 100644 --- a/Pilot3/P3B5/test.py +++ b/Pilot3/P3B5/test.py @@ -5,12 +5,14 @@ file_path = os.path.dirname(os.path.realpath(__file__)) lib_path = os.path.abspath(os.path.join(file_path, '..')) sys.path.append(lib_path) -lib_path2 = os.path.abspath(os.path.join(file_path, '..', '..', 'common',)) +lib_path2 = os.path.abspath(os.path.join(file_path, '..', '..', 'common', 'darts')) sys.path.append(lib_path2) import darts +#from darts.architecture import Architecture +print(Architecture) print(darts.architecture.Architecture) print(darts.ConvNetwork) print(darts.Genotype) diff --git a/common/darts/__init__.py b/common/darts/__init__.py index 05c0e81b..0a2b3f98 100644 --- a/common/darts/__init__.py +++ b/common/darts/__init__.py @@ -5,7 +5,7 @@ __version__ = '0.1.0' -from architecture import Architecture +from .architecture import Architecture from .modules.conv.network import ConvNetwork from .modules.linear.network import LinearNetwork diff --git a/common/darts/modules/conv/__init__.py b/common/darts/modules/conv/__init__.py index 0b2287c3..e8b50cc7 100644 --- a/common/darts/modules/conv/__init__.py +++ b/common/darts/modules/conv/__init__.py @@ -1,4 +1,4 @@ from .cell import Cell -from .network import Network +from .network import ConvNetwork from .mixed_layer import MixedLayer diff --git a/common/darts/modules/conv/cell.py b/common/darts/modules/conv/cell.py index 203b4fab..f902ce52 100644 --- a/common/darts/modules/conv/cell.py +++ b/common/darts/modules/conv/cell.py @@ -3,7 +3,7 @@ from darts.api import Model from darts.modules.conv.mixed_layer import MixedLayer -from darts.modules.operations.convolution import ConvBlock, 
FactorizedReduce +from darts.modules.operations.conv import ConvBlock, FactorizedReduce class Cell(Model): From 98ccbf5359363ce9f25d9c53324ba0ee130a6f75 Mon Sep 17 00:00:00 2001 From: yngtodd Date: Wed, 8 Apr 2020 17:25:21 -0400 Subject: [PATCH 202/331] Fix imports Relative imports had been failing. --- Pilot3/P3B5/p3b5_darts.py | 2 +- Pilot3/P3B5/test.py | 13 +++++++------ common/darts/__init__.py | 1 + 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/Pilot3/P3B5/p3b5_darts.py b/Pilot3/P3B5/p3b5_darts.py index 88207a73..cd095be0 100644 --- a/Pilot3/P3B5/p3b5_darts.py +++ b/Pilot3/P3B5/p3b5_darts.py @@ -44,7 +44,7 @@ def train(trainloader, validloader, model, architecture, criterion, optimizer, l x_search, target_search = next(valid_iter) x_search = x_search.to(device) - + for task, label in target_search.items(): target_search[task] = target_search[task].to(device) diff --git a/Pilot3/P3B5/test.py b/Pilot3/P3B5/test.py index bd4e0122..8715318a 100644 --- a/Pilot3/P3B5/test.py +++ b/Pilot3/P3B5/test.py @@ -5,14 +5,15 @@ file_path = os.path.dirname(os.path.realpath(__file__)) lib_path = os.path.abspath(os.path.join(file_path, '..')) sys.path.append(lib_path) -lib_path2 = os.path.abspath(os.path.join(file_path, '..', '..', 'common', 'darts')) +lib_path2 = os.path.abspath(os.path.join(file_path, '..', '..', 'common',)) sys.path.append(lib_path2) -import darts -#from darts.architecture import Architecture +from darts import Architecture +from darts import ConvNetwork +from darts.genotypes import Genotype + print(Architecture) -print(darts.architecture.Architecture) -print(darts.ConvNetwork) -print(darts.Genotype) +print(ConvNetwork) +print(Genotype) diff --git a/common/darts/__init__.py b/common/darts/__init__.py index 0a2b3f98..bccfd139 100644 --- a/common/darts/__init__.py +++ b/common/darts/__init__.py @@ -15,3 +15,4 @@ "ConvNetwork", "LinearNetwork", ] + From ca58e2ed52c743d0688efd70c2c489b5496116b6 Mon Sep 17 00:00:00 2001 From: Cristina Garcia Cardona Date: Wed, 8 Apr 2020 15:40:33 -0600 Subject: [PATCH 203/331] updated Uno_UQ scripts to finalize_parameters --- Pilot1/Uno_UQ/uno_holdoutUQ_data.py | 2 +- Pilot1/Uno_UQ/uno_inferUQ_keras2.py | 2 +- Pilot1/Uno_UQ/uno_trainUQ_keras2.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Pilot1/Uno_UQ/uno_holdoutUQ_data.py b/Pilot1/Uno_UQ/uno_holdoutUQ_data.py index 165f940f..5bcba4f1 100644 --- a/Pilot1/Uno_UQ/uno_holdoutUQ_data.py +++ b/Pilot1/Uno_UQ/uno_holdoutUQ_data.py @@ -25,7 +25,7 @@ def initialize_parameters(): prog='uno_holdoutUQ_data', desc='Build data split for UQ analysis in the problem of prediction of tumor response to drug pairs.') # Initialize parameters - gParameters = candle.initialize_parameters(unoBmk) + gParameters = candle.finalize_parameters(unoBmk) #benchmark.logger.info('Params: {}'.format(gParameters)) return gParameters diff --git a/Pilot1/Uno_UQ/uno_inferUQ_keras2.py b/Pilot1/Uno_UQ/uno_inferUQ_keras2.py index af1c7934..1155d3d3 100644 --- a/Pilot1/Uno_UQ/uno_inferUQ_keras2.py +++ b/Pilot1/Uno_UQ/uno_inferUQ_keras2.py @@ -68,7 +68,7 @@ def initialize_parameters(): unoBmk.required = unoBmk.required.union(required_local) # Initialize parameters - gParameters = candle.initialize_parameters(unoBmk) + gParameters = candle.finalize_parameters(unoBmk) #benchmark.logger.info('Params: {}'.format(gParameters)) return gParameters diff --git a/Pilot1/Uno_UQ/uno_trainUQ_keras2.py b/Pilot1/Uno_UQ/uno_trainUQ_keras2.py index 8a06da16..545eb962 100644 --- a/Pilot1/Uno_UQ/uno_trainUQ_keras2.py +++ 
b/Pilot1/Uno_UQ/uno_trainUQ_keras2.py @@ -76,7 +76,7 @@ def initialize_parameters(): prog='uno_trainUQ', desc='Build neural network based models to predict tumor response to single and paired drugs, including UQ analysis.') # Initialize parameters - gParameters = candle.initialize_parameters(unoUQBmk) + gParameters = candle.finalize_parameters(unoUQBmk) #benchmark.logger.info('Params: {}'.format(gParameters)) return gParameters From 6f84feb53bce2d0ecc5604f138f7e392e8a1d0ea Mon Sep 17 00:00:00 2001 From: Cristina Garcia Cardona Date: Wed, 8 Apr 2020 17:18:13 -0600 Subject: [PATCH 204/331] added dropout, momentum and early_stop keywords at the common level --- Pilot1/Attn1/attn_abs_default_model.txt | 2 +- common/default_utils.py | 12 +++++++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/Pilot1/Attn1/attn_abs_default_model.txt b/Pilot1/Attn1/attn_abs_default_model.txt index 608118cc..6ee4afed 100644 --- a/Pilot1/Attn1/attn_abs_default_model.txt +++ b/Pilot1/Attn1/attn_abs_default_model.txt @@ -8,7 +8,7 @@ epochs=2 activation=['relu', 'relu', 'softmax', 'relu', 'relu', 'relu', 'relu', 'relu', 'softmax'] loss='categorical_crossentropy' optimizer='sgd' -drop=0.2 +dropout=0.2 learning_rate=0.00001 momentum=0.9 validation_split=0.1 diff --git a/common/default_utils.py b/common/default_utils.py index 6b802b4f..07befe98 100644 --- a/common/default_utils.py +++ b/common/default_utils.py @@ -31,7 +31,7 @@ DEFAULT_DATATYPE = np.float32 -PARAMETERS_CANDLE = ['config_file', 'verbose', 'logfile', 'save_path', 'model_file', 'data_type', 'dense', 'rng_seed', 'epochs', 'batch_size', 'train_bool', 'eval_bool', 'timeout', 'home_dir', 'train_data', 'test_data', 'output_dir', 'data_url', 'experiment_id', 'run_id', 'conv', 'locally_connected', 'activation', 'out_activation', 'lstm_size', 'recurrent_dropout', 'drop', 'pool', 'batch_normalization', 'loss', 'optimizer', 'metrics', 'scaling', 'shuffle', 'feature_subsample', 'learning_rate', 'initialization', 'val_split', 'train_steps', 'val_steps', 'test_steps', 'train_samples', 'val_samples', 'gpus', 'profiling'] +PARAMETERS_CANDLE = ['config_file', 'verbose', 'logfile', 'save_path', 'model_file', 'data_type', 'dense', 'rng_seed', 'epochs', 'batch_size', 'train_bool', 'eval_bool', 'timeout', 'home_dir', 'train_data', 'test_data', 'output_dir', 'data_url', 'experiment_id', 'run_id', 'conv', 'locally_connected', 'activation', 'out_activation', 'lstm_size', 'recurrent_dropout', 'dropout', 'pool', 'batch_normalization', 'loss', 'optimizer', 'metrics', 'scaling', 'shuffle', 'feature_subsample', 'learning_rate', 'early_stop', 'momentum', 'initialization', 'val_split', 'train_steps', 'val_steps', 'test_steps', 'train_samples', 'val_samples', 'gpus', 'profiling'] #### IO UTILS @@ -544,7 +544,7 @@ def get_common_parser(parser): # Processing between layers - parser.add_argument("--drop", type=float, + parser.add_argument("--dropout", type=float, default=argparse.SUPPRESS, help="ratio of dropout used in fully connected layers") parser.add_argument("--pool", type=int, @@ -584,7 +584,13 @@ def get_common_parser(parser): parser.add_argument("--learning_rate", default= argparse.SUPPRESS, type=float, help="overrides the learning rate for training") - + parser.add_argument("--early_stop", type=str2bool, + default= argparse.SUPPRESS, + help="activates keras callback for early stopping of training in function of the monitored variable specified") + parser.add_argument("--momentum", + default= argparse.SUPPRESS, type=float, + help="overrides the momentum to use 
in the SGD optimizer when training") + parser.add_argument("--initialization", default=argparse.SUPPRESS, choices=['constant', 'uniform', 'normal', 'glorot_uniform', 'lecun_uniform', 'he_normal'], From bc7e644b218e3e14441a0e1d80f1844cd7a251b4 Mon Sep 17 00:00:00 2001 From: Cristina Garcia Cardona Date: Wed, 8 Apr 2020 17:31:20 -0600 Subject: [PATCH 205/331] updated keywords in Pilot1/Attn1 --- Pilot1/Attn1/attn.py | 12 ++++++------ Pilot1/Attn1/attn_abs_default_model.txt | 6 +++--- Pilot1/Attn1/attn_abstention_keras2.py | 8 ++++---- Pilot1/Attn1/attn_baseline_keras2.py | 2 +- Pilot1/Attn1/attn_default_model.txt | 8 ++++---- 5 files changed, 18 insertions(+), 18 deletions(-) diff --git a/Pilot1/Attn1/attn.py b/Pilot1/Attn1/attn.py index 32b91301..ddc66ff6 100644 --- a/Pilot1/Attn1/attn.py +++ b/Pilot1/Attn1/attn.py @@ -67,7 +67,7 @@ 'activation', 'batch_size', 'dense', - 'drop', + 'dropout', 'epochs', 'initialization', 'learning_rate', @@ -75,7 +75,7 @@ 'optimizer', 'rng_seed', 'scaling', - 'validation_split', + 'val_split', 'latent_dim', 'batch_normalization', 'epsilon_std', @@ -113,8 +113,8 @@ def extension_from_parameters(params, framework=''): if params['epsilon_std'] != 1.0: ext += '.EPS={}'.format(params['epsilon_std']) - if params['drop']: - ext += '.DR={}'.format(params['drop']) + if params['dropout']: + ext += '.DR={}'.format(params['dropout']) if params['batch_normalization']: ext += '.BN' if params['warmup_lr']: @@ -128,11 +128,11 @@ def extension_from_parameters(params, framework=''): def load_data(params, seed): # start change # - if params['in'].endswith('h5') or params['in'].endswith('hdf5'): + if params['train_data'].endswith('h5') or params['train_data'].endswith('hdf5'): print ('processing h5 in file {}'.format(params['in'])) url = params['data_url'] - file_train = params['in'] + file_train = params['train_data'] train_file = candle.get_file(file_train, url+file_train, cache_subdir='Pilot1') df_x_train_0 = pd.read_hdf(train_file, 'x_train_0').astype(np.float32) diff --git a/Pilot1/Attn1/attn_abs_default_model.txt b/Pilot1/Attn1/attn_abs_default_model.txt index 6ee4afed..b7a95c5a 100644 --- a/Pilot1/Attn1/attn_abs_default_model.txt +++ b/Pilot1/Attn1/attn_abs_default_model.txt @@ -1,7 +1,7 @@ [Global_Params] data_url = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/' -in='top_21_1fold_001.h5' -model_name='attn_abs' +train_data='top_21_1fold_001.h5' +model_file='attn_abs.model.h5' dense=[1000, 1000, 1000, 500, 250, 125, 60, 30, 2] batch_size=32 epochs=2 @@ -11,7 +11,7 @@ optimizer='sgd' dropout=0.2 learning_rate=0.00001 momentum=0.9 -validation_split=0.1 +val_split=0.1 rng_seed=2017 use_cp=False early_stop=True diff --git a/Pilot1/Attn1/attn_abstention_keras2.py b/Pilot1/Attn1/attn_abstention_keras2.py index 6a9c40dc..a321bfa3 100644 --- a/Pilot1/Attn1/attn_abstention_keras2.py +++ b/Pilot1/Attn1/attn_abstention_keras2.py @@ -51,13 +51,13 @@ 'activation', 'batch_size', 'dense', - 'drop', + 'dropout', 'epochs', 'learning_rate', 'loss', 'optimizer', 'rng_seed', - 'validation_split', + 'val_split', 'solr_root', 'timeout', 'target_abs_acc'] @@ -167,8 +167,8 @@ def extension_from_parameters(params, framework=''): ext += '.E={}'.format(params['epochs']) ext += '.LR={}'.format(params['learning_rate']) - if params['drop']: - ext += '.DR={}'.format(params['drop']) + if params['dropout']: + ext += '.DR={}'.format(params['dropout']) if params['warmup_lr']: ext += '.WU_LR' if params['reduce_lr']: diff --git a/Pilot1/Attn1/attn_baseline_keras2.py 
b/Pilot1/Attn1/attn_baseline_keras2.py index 478391a3..82302872 100644 --- a/Pilot1/Attn1/attn_baseline_keras2.py +++ b/Pilot1/Attn1/attn_baseline_keras2.py @@ -154,7 +154,7 @@ def build_attention_model(params, PS): assert (len(params['dense']) == len(params['activation'])) assert (len(params['dense']) > 3) - DR = params['drop'] + DR = params['dropout'] inputs = Input(shape=(PS,)) x = Dense(params['dense'][0], activation=params['activation'][0])(inputs) x = BatchNormalization()(x) diff --git a/Pilot1/Attn1/attn_default_model.txt b/Pilot1/Attn1/attn_default_model.txt index b20acf5a..654084a9 100644 --- a/Pilot1/Attn1/attn_default_model.txt +++ b/Pilot1/Attn1/attn_default_model.txt @@ -1,18 +1,18 @@ [Global_Params] data_url = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/' -in='top_21_1fold_001.h5' -model_name='attn' +train_data='top_21_1fold_001.h5' +model_file='attn.model.h5' dense=[1000, 1000, 1000, 500, 250, 125, 60, 30, 2] batch_size=32 epochs=1 activation=['relu', 'relu', 'softmax', 'relu', 'relu', 'relu', 'relu', 'relu', 'softmax'] loss='categorical_crossentropy' optimizer='sgd' -drop=0.2 +dropout=0.2 learning_rate=0.00001 momentum=0.9 scaling='minmax' -validation_split=0.1 +val_split=0.1 epsilon_std=1.0 rng_seed=2017 initialization='glorot_uniform' From 58c46b679575d98d859c86ad15f93ac66ba16b6f Mon Sep 17 00:00:00 2001 From: Cristina Garcia Cardona Date: Wed, 8 Apr 2020 17:46:10 -0600 Subject: [PATCH 206/331] updated keywords in Pilot1/Combo --- Pilot1/Combo/combo.py | 6 +++++- Pilot1/Combo/combo_baseline_keras2.py | 6 +++--- Pilot1/Combo/combo_default_model.txt | 4 ++-- Pilot1/Combo/combo_dose.py | 6 +++--- Pilot1/Combo/combo_perf_bench_model.txt | 4 ++-- 5 files changed, 15 insertions(+), 11 deletions(-) diff --git a/Pilot1/Combo/combo.py b/Pilot1/Combo/combo.py index da35f207..de0f31fc 100644 --- a/Pilot1/Combo/combo.py +++ b/Pilot1/Combo/combo.py @@ -32,6 +32,10 @@ 'type':candle.str2bool, 'default':True, #action="store_true", 'help':"use the 978 landmark genes from LINCS (L1000) as expression features"}, +{'name':'use_combo_score', + 'type':candle.str2bool, + 'default':False, + 'help':"use combination score in place of percent growth (stored in 'GROWTH' column)"}, {'name':'preprocess_rnaseq', 'default':'none', 'choices':['source_scale', 'combat', 'none'], @@ -86,7 +90,7 @@ ] -required = [ 'activation', 'batch_size', 'dense', 'dense_feature_layers', 'drop', +required = [ 'activation', 'batch_size', 'dense', 'dense_feature_layers', 'dropout', 'epochs', 'learning_rate', 'loss', 'optimizer', 'residual', 'rng_seed', 'save_path', 'scaling', 'feature_subsample', 'validation_split', 'solr_root', 'timeout' diff --git a/Pilot1/Combo/combo_baseline_keras2.py b/Pilot1/Combo/combo_baseline_keras2.py index 1a836d9e..5a262574 100644 --- a/Pilot1/Combo/combo_baseline_keras2.py +++ b/Pilot1/Combo/combo_baseline_keras2.py @@ -94,8 +94,8 @@ def extension_from_parameters(args): ext += '.DF={}'.format(''.join([x[0] for x in sorted(args.drug_features)])) if args.feature_subsample > 0: ext += '.FS={}'.format(args.feature_subsample) - if args.drop > 0: - ext += '.DR={}'.format(args.drop) + if args.dropout > 0: + ext += '.DR={}'.format(args.dropout) if args.warmup_lr: ext += '.wu_lr' if args.reduce_lr: @@ -604,7 +604,7 @@ def build_feature_model(input_shape, name='', dense_layers=[1000, 1000], def build_model(loader, args, verbose=False): input_models = {} - dropout_rate = args.drop + dropout_rate = args.dropout permanent_dropout = True for fea_type, shape in 
loader.feature_shapes.items(): box = build_feature_model(input_shape=shape, name=fea_type, diff --git a/Pilot1/Combo/combo_default_model.txt b/Pilot1/Combo/combo_default_model.txt index 3ab500e2..3b57a979 100644 --- a/Pilot1/Combo/combo_default_model.txt +++ b/Pilot1/Combo/combo_default_model.txt @@ -7,10 +7,10 @@ activation='relu' loss='mse' optimizer='adam' scaling='std' -drop=0 +dropout=0 epochs=10 batch_size=32 -validation_split=0.2 +val_split=0.2 cv=1 cv_partition='overlapping' max_val_loss=1.0 diff --git a/Pilot1/Combo/combo_dose.py b/Pilot1/Combo/combo_dose.py index f57ee7e5..dc6e9a57 100644 --- a/Pilot1/Combo/combo_dose.py +++ b/Pilot1/Combo/combo_dose.py @@ -95,8 +95,8 @@ def extension_from_parameters(args): ext += '.DF={}'.format(''.join([x[0] for x in sorted(args.drug_features)])) if args.feature_subsample > 0: ext += '.FS={}'.format(args.feature_subsample) - if args.drop > 0: - ext += '.DR={}'.format(args.drop) + if args.dropout > 0: + ext += '.DR={}'.format(args.dropout) if args.warmup_lr: ext += '.wu_lr' if args.reduce_lr: @@ -609,7 +609,7 @@ def build_feature_model(input_shape, name='', dense_layers=[1000, 1000], def build_model(loader, args, verbose=False): input_models = {} - dropout_rate = args.drop + dropout_rate = args.dropout permanent_dropout = True for fea_type, shape in loader.feature_shapes.items(): box = build_feature_model(input_shape=shape, name=fea_type, diff --git a/Pilot1/Combo/combo_perf_bench_model.txt b/Pilot1/Combo/combo_perf_bench_model.txt index d581aea7..0a5b1b32 100644 --- a/Pilot1/Combo/combo_perf_bench_model.txt +++ b/Pilot1/Combo/combo_perf_bench_model.txt @@ -7,10 +7,10 @@ activation='relu' loss='mse' optimizer='adam' scaling='std' -drop=0 +dropout=0 epochs=10 batch_size=32 -validation_split=0.2 +val_split=0.2 cv=1 cv_partition='overlapping' max_val_loss=1.0 From c59907a7f4ffd11fbc9f74783a11017f106efccf Mon Sep 17 00:00:00 2001 From: Cristina Garcia Cardona Date: Wed, 8 Apr 2020 17:58:48 -0600 Subject: [PATCH 207/331] updated keywords in Pilot1/NT3 --- Pilot1/NT3/nt3.py | 6 +++--- Pilot1/NT3/nt3_baseline_keras2.py | 8 ++++---- Pilot1/NT3/nt3_baseline_keras2_tensorrt.py | 16 ++++++++-------- Pilot1/NT3/nt3_default_model.txt | 6 +++--- Pilot1/NT3/nt3_perf_bench_model.txt | 6 +++--- 5 files changed, 21 insertions(+), 21 deletions(-) diff --git a/Pilot1/NT3/nt3.py b/Pilot1/NT3/nt3.py index 1863b3a3..367a11db 100644 --- a/Pilot1/NT3/nt3.py +++ b/Pilot1/NT3/nt3.py @@ -24,17 +24,17 @@ 'conv', 'dense', 'activation', - 'out_act', + 'out_activation', 'loss', 'optimizer', 'metrics', 'epochs', 'batch_size', 'learning_rate', - 'drop', + 'dropout', 'classes', 'pool', - 'save', + 'save_path', 'timeout' ] diff --git a/Pilot1/NT3/nt3_baseline_keras2.py b/Pilot1/NT3/nt3_baseline_keras2.py index 0877bd04..9282cb36 100644 --- a/Pilot1/NT3/nt3_baseline_keras2.py +++ b/Pilot1/NT3/nt3_baseline_keras2.py @@ -133,10 +133,10 @@ def run(gParameters): if layer: model.add(Dense(layer)) model.add(Activation(gParameters['activation'])) - if gParameters['drop']: - model.add(Dropout(gParameters['drop'])) + if gParameters['dropout']: + model.add(Dropout(gParameters['dropout'])) model.add(Dense(gParameters['classes'])) - model.add(Activation(gParameters['out_act'])) + model.add(Activation(gParameters['out_activation'])) #Reference case #model.add(Conv1D(filters=128, kernel_size=20, strides=1, padding='valid', input_shape=(P, 1))) @@ -167,7 +167,7 @@ def run(gParameters): optimizer=optimizer, metrics=[gParameters['metrics']]) - output_dir = gParameters['save'] + output_dir = 
gParameters['save_path'] if not os.path.exists(output_dir): os.makedirs(output_dir) diff --git a/Pilot1/NT3/nt3_baseline_keras2_tensorrt.py b/Pilot1/NT3/nt3_baseline_keras2_tensorrt.py index ce03d86b..8194a5f3 100644 --- a/Pilot1/NT3/nt3_baseline_keras2_tensorrt.py +++ b/Pilot1/NT3/nt3_baseline_keras2_tensorrt.py @@ -87,17 +87,17 @@ def read_config_file(file): fileParams['conv'] = eval(config.get(section[0],'conv')) fileParams['dense'] = eval(config.get(section[0],'dense')) fileParams['activation'] = eval(config.get(section[0],'activation')) - fileParams['out_act'] = eval(config.get(section[0],'out_act')) + fileParams['out_activation'] = eval(config.get(section[0],'out_activation')) fileParams['loss'] = eval(config.get(section[0],'loss')) fileParams['optimizer'] = eval(config.get(section[0],'optimizer')) fileParams['metrics'] = eval(config.get(section[0],'metrics')) fileParams['epochs'] = eval(config.get(section[0],'epochs')) fileParams['batch_size'] = eval(config.get(section[0],'batch_size')) fileParams['learning_rate'] = eval(config.get(section[0], 'learning_rate')) - fileParams['drop'] = eval(config.get(section[0],'drop')) + fileParams['dropout'] = eval(config.get(section[0],'dropout')) fileParams['classes'] = eval(config.get(section[0],'classes')) fileParams['pool'] = eval(config.get(section[0],'pool')) - fileParams['save'] = eval(config.get(section[0], 'save')) + fileParams['save_path'] = eval(config.get(section[0], 'save_path')) # parse the remaining values for k,v in config.items(section[0]): @@ -219,11 +219,11 @@ def run(gParameters): model.add(Dense(layer)) model.add(Activation(gParameters['activation'])) # This has to be disabled for tensorrt otherwise I am getting an error - if False and gParameters['drop']: - model.add(Dropout(gParameters['drop'])) + if False and gParameters['dropout']: + model.add(Dropout(gParameters['dropout'])) #model.add(Dense(gParameters['classes'])) - #model.add(Activation(gParameters['out_act']), name='activation_5') - model.add(Dense(gParameters['classes'], activation=gParameters['out_act'], name='activation_5')) + #model.add(Activation(gParameters['out_activation']), name='activation_5') + model.add(Dense(gParameters['classes'], activation=gParameters['out_activation'], name='activation_5')) #Reference case #model.add(Conv1D(filters=128, kernel_size=20, strides=1, padding='valid', input_shape=(P, 1))) #model.add(Activation('relu')) @@ -258,7 +258,7 @@ def run(gParameters): optimizer=optimizer, metrics=[gParameters['metrics']]) - output_dir = gParameters['save'] + output_dir = gParameters['save_path'] if not os.path.exists(output_dir): os.makedirs(output_dir) diff --git a/Pilot1/NT3/nt3_default_model.txt b/Pilot1/NT3/nt3_default_model.txt index d848df78..84488116 100644 --- a/Pilot1/NT3/nt3_default_model.txt +++ b/Pilot1/NT3/nt3_default_model.txt @@ -6,15 +6,15 @@ model_name = 'nt3' conv = [128, 20, 1, 128, 10, 1] dense = [200,20] activation = 'relu' -out_act = 'softmax' +out_activation = 'softmax' loss = 'categorical_crossentropy' optimizer = 'sgd' metrics = 'accuracy' epochs = 400 batch_size = 20 learning_rate = 0.001 -drop = 0.1 +dropout = 0.1 classes = 2 pool = [1, 10] -save = '.' +save_path = '.' 
timeout = 3600 diff --git a/Pilot1/NT3/nt3_perf_bench_model.txt b/Pilot1/NT3/nt3_perf_bench_model.txt index 86a1873a..aad8e0e6 100644 --- a/Pilot1/NT3/nt3_perf_bench_model.txt +++ b/Pilot1/NT3/nt3_perf_bench_model.txt @@ -6,15 +6,15 @@ model_name = 'nt3' conv = [128, 20, 1, 128, 10, 1] dense = [200,20] activation = 'relu' -out_act = 'softmax' +out_activation = 'softmax' loss = 'categorical_crossentropy' optimizer = 'sgd' metrics = 'accuracy' epochs = 50 batch_size = 5 learning_rate = 0.001 -drop = 0.1 +dropout = 0.1 classes = 2 pool = [1, 10] -save = '.' +save_path = '.' timeout = 7200 From 44e748f34eb90cc1c4e9a35a797129cdff71eb57 Mon Sep 17 00:00:00 2001 From: Cristina Garcia Cardona Date: Wed, 8 Apr 2020 18:09:08 -0600 Subject: [PATCH 208/331] partial update of keywords in Pilot1/P1B1 --- Pilot1/P1B1/p1b1.py | 10 +++++++--- Pilot1/P1B1/p1b1_baseline_keras2.py | 2 +- Pilot1/P1B1/p1b1_default_model.txt | 4 ++-- Pilot1/P1B1/p1b1_perf_bench_model.txt | 4 ++-- 4 files changed, 12 insertions(+), 8 deletions(-) diff --git a/Pilot1/P1B1/p1b1.py b/Pilot1/P1B1/p1b1.py index ee4afb57..d52a081a 100644 --- a/Pilot1/P1B1/p1b1.py +++ b/Pilot1/P1B1/p1b1.py @@ -27,6 +27,10 @@ 'action':'store', 'type': int, 'help':'latent dimensions'}, +{'name':'model_name', + 'default':'p1b1', + 'type':str, + 'help':'prefix for file to save model'}, {'name':'model', 'default':'ae', 'choices':['ae', 'vae', 'cvae'], @@ -75,7 +79,7 @@ 'activation', 'batch_size', 'dense', - 'drop', + 'dropout', 'epochs', 'initialization', 'learning_rate', @@ -125,8 +129,8 @@ def extension_from_parameters(params, framework=''): ext += '.EPS={}'.format(params['epsilon_std']) if params['feature_subsample'] > 0: ext += '.FS={}'.format(params['feature_subsample']) - if params['drop']: - ext += '.DR={}'.format(params['drop']) + if params['dropout']: + ext += '.DR={}'.format(params['dropout']) if params['alpha_dropout']: ext += '.AD' if params['batch_normalization']: diff --git a/Pilot1/P1B1/p1b1_baseline_keras2.py b/Pilot1/P1B1/p1b1_baseline_keras2.py index 34fca44f..7eae97e8 100644 --- a/Pilot1/P1B1/p1b1_baseline_keras2.py +++ b/Pilot1/P1B1/p1b1_baseline_keras2.py @@ -189,7 +189,7 @@ def run(params): latent_dim = params['latent_dim'] activation = params['activation'] - dropout = params['drop'] + dropout = params['dropout'] dense_layers = params['dense'] dropout_layer = keras.layers.noise.AlphaDropout if params['alpha_dropout'] else Dropout diff --git a/Pilot1/P1B1/p1b1_default_model.txt b/Pilot1/P1B1/p1b1_default_model.txt index 3b319dde..c28ea998 100644 --- a/Pilot1/P1B1/p1b1_default_model.txt +++ b/Pilot1/P1B1/p1b1_default_model.txt @@ -9,13 +9,13 @@ epochs=100 activation='relu' loss='mse' optimizer='adam' -drop=0 +dropout=0 learning_rate=None base_lr=None scaling='minmax' model='ae' noise_factor=0 -validation_split=0.1 +val_split=0.1 epsilon_std=1.0 rng_seed=2017 initialization='glorot_uniform' diff --git a/Pilot1/P1B1/p1b1_perf_bench_model.txt b/Pilot1/P1B1/p1b1_perf_bench_model.txt index 877c628b..0f348de5 100644 --- a/Pilot1/P1B1/p1b1_perf_bench_model.txt +++ b/Pilot1/P1B1/p1b1_perf_bench_model.txt @@ -9,13 +9,13 @@ epochs=500 activation='relu' loss='mse' optimizer='adam' -drop=0 +dropout=0 learning_rate=None base_lr=None scaling='minmax' model='ae' noise_factor=0 -validation_split=0.1 +val_split=0.1 epsilon_std=1.0 rng_seed=2017 initialization='glorot_uniform' From c7b4d4cbef69504a37dd231191ccb2d5b83f68cd Mon Sep 17 00:00:00 2001 From: Cristina Garcia Cardona Date: Wed, 8 Apr 2020 18:22:32 -0600 Subject: [PATCH 209/331] partial update of 
keywords in Pilot1/P1B1 --- Pilot1/P1B1/p1b1.py | 16 ++++++++-------- Pilot1/P1B1/p1b1_default_model.txt | 6 +++--- Pilot1/P1B1/p1b1_perf_bench_model.txt | 6 +++--- Pilot1/P1B2/p1b2.py | 10 +++++----- Pilot1/P1B2/p1b2_baseline_keras2.py | 4 ++-- Pilot1/P1B2/p1b2_default_model.txt | 4 ++-- 6 files changed, 23 insertions(+), 23 deletions(-) diff --git a/Pilot1/P1B1/p1b1.py b/Pilot1/P1B1/p1b1.py index d52a081a..93f6b039 100644 --- a/Pilot1/P1B1/p1b1.py +++ b/Pilot1/P1B1/p1b1.py @@ -89,7 +89,7 @@ 'rng_seed', 'model', 'scaling', - 'validation_split', + 'val_split', 'latent_dim', 'feature_subsample', 'batch_normalization', @@ -154,15 +154,15 @@ def load_data(params, seed): if params['use_landmark_genes']: lincs_file = 'lincs1000.tsv' - lincs_path = candle.fetch_file(params['url_p1b1'] + lincs_file, 'Pilot1') + lincs_path = candle.fetch_file(params['data_url'] + lincs_file, 'Pilot1') df_l1000 = pd.read_csv(lincs_path, sep='\t') x_cols = df_l1000['gdc'].tolist() drop_cols = None else: x_cols = None - train_path = candle.fetch_file(params['url_p1b1'] + params['file_train'], 'Pilot1') - test_path = candle.fetch_file(params['url_p1b1'] + params['file_test'], 'Pilot1') + train_path = candle.fetch_file(params['data_url'] + params['train_data'], 'Pilot1') + test_path = candle.fetch_file(params['data_url'] + params['test_data'], 'Pilot1') return candle.load_csv_data(train_path, test_path, x_cols=x_cols, @@ -173,7 +173,7 @@ def load_data(params, seed): shuffle=params['shuffle'], scaling=params['scaling'], dtype=params['data_type'], - validation_split=params['validation_split'], + validation_split=params['val_split'], return_dataframe=False, return_header=True, seed=seed) @@ -189,21 +189,21 @@ def load_data_orig(params, seed): if params['use_landmark_genes']: lincs_file = 'lincs1000.tsv' - lincs_path = candle.fetch_file(url_p1b1 + lincs_file) + lincs_path = candle.fetch_file(params['data_url'] + lincs_file) df_l1000 = pd.read_csv(lincs_path, sep='\t') usecols = df_l1000['gdc'] drop_cols = None else: usecols = None - return candle.load_X_data(params['url_p1b1'], params['file_train'], params['file_test'], + return candle.load_X_data(params['data_url'], params['train_data'], params['test_data'], drop_cols=drop_cols, onehot_cols=onehot_cols, usecols=usecols, n_cols=params['feature_subsample'], shuffle=params['shuffle'], scaling=params['scaling'], - validation_split=params['validation_split'], + validation_split=params['val_split'], dtype=params['data_type'], seed=seed) diff --git a/Pilot1/P1B1/p1b1_default_model.txt b/Pilot1/P1B1/p1b1_default_model.txt index c28ea998..b6c86423 100644 --- a/Pilot1/P1B1/p1b1_default_model.txt +++ b/Pilot1/P1B1/p1b1_default_model.txt @@ -1,7 +1,7 @@ [Global_Params] -url_p1b1 = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/P1B1/' -file_train = 'P1B1.dev.train.csv' -file_test = 'P1B1.dev.test.csv' +data_url = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/P1B1/' +train_data = 'P1B1.dev.train.csv' +test_data = 'P1B1.dev.test.csv' model_name='p1b1' dense=[2000, 600] batch_size=100 diff --git a/Pilot1/P1B1/p1b1_perf_bench_model.txt b/Pilot1/P1B1/p1b1_perf_bench_model.txt index 0f348de5..01ffc46c 100644 --- a/Pilot1/P1B1/p1b1_perf_bench_model.txt +++ b/Pilot1/P1B1/p1b1_perf_bench_model.txt @@ -1,7 +1,7 @@ [Global_Params] -url_p1b1 = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/P1B1/' -file_train = 'P1B1.dev.train.csv' -file_test = 'P1B1.dev.test.csv' +data_url = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/P1B1/' +train_data = 'P1B1.dev.train.csv' 
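For reference, the payoff of the generic data_url/train_data/test_data keys is that every Pilot1 benchmark can resolve its input files the same way. A hedged sketch using candle.fetch_file exactly as it is called in p1b1.py above, with values copied from p1b1_default_model.txt:

```
import candle

params = {
    'data_url': 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/P1B1/',
    'train_data': 'P1B1.dev.train.csv',
    'test_data': 'P1B1.dev.test.csv',
}
# fetch_file resolves the URL to a locally cached copy and returns its path.
train_path = candle.fetch_file(params['data_url'] + params['train_data'], 'Pilot1')
test_path = candle.fetch_file(params['data_url'] + params['test_data'], 'Pilot1')
```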
+test_data = 'P1B1.dev.test.csv' model_name='p1b1' dense=[2000, 1000, 500, 200, 100] batch_size=100 diff --git a/Pilot1/P1B2/p1b2.py b/Pilot1/P1B2/p1b2.py index b1e3be49..20606e95 100644 --- a/Pilot1/P1B2/p1b2.py +++ b/Pilot1/P1B2/p1b2.py @@ -38,7 +38,7 @@ 'activation', 'batch_size', 'dense', - 'drop', + 'dropout', 'epochs', 'feature_subsample', 'initialization', @@ -48,7 +48,7 @@ 'penalty', 'rng_seed', 'scaling', - 'validation_split', + 'val_split', 'shuffle' ] @@ -71,7 +71,7 @@ def extension_from_parameters(params, framework): ext = framework ext += '.A={}'.format(params['activation']) ext += '.B={}'.format(params['batch_size']) - ext += '.D={}'.format(params['drop']) + ext += '.D={}'.format(params['dropout']) ext += '.E={}'.format(params['epochs']) if params['feature_subsample']: ext += '.F={}'.format(params['feature_subsample']) @@ -93,7 +93,7 @@ def load_data_one_hot(params, seed): n_cols=params['feature_subsample'], shuffle=params['shuffle'], scaling=params['scaling'], - validation_split=params['validation_split'], + validation_split=params['val_split'], dtype=params['data_type'], seed=seed) @@ -108,7 +108,7 @@ def load_data(params, seed): n_cols=params['feature_subsample'], shuffle=params['shuffle'], scaling=params['scaling'], - validation_split=params['validation_split'], + validation_split=params['val_split'], dtype=params['data_type'], seed=seed) diff --git a/Pilot1/P1B2/p1b2_baseline_keras2.py b/Pilot1/P1B2/p1b2_baseline_keras2.py index 642d5f22..f8fa50af 100644 --- a/Pilot1/P1B2/p1b2_baseline_keras2.py +++ b/Pilot1/P1B2/p1b2_baseline_keras2.py @@ -88,8 +88,8 @@ def run(gParameters): bias_initializer=initializer_bias, kernel_regularizer=l2(gParameters['penalty']), activity_regularizer=l2(gParameters['penalty']))(x) - if gParameters['drop']: - x = Dropout(gParameters['drop'])(x) + if gParameters['dropout']: + x = Dropout(gParameters['dropout'])(x) output = Dense(output_dim, activation=activation, kernel_initializer=initializer_weights, bias_initializer=initializer_bias)(x) diff --git a/Pilot1/P1B2/p1b2_default_model.txt b/Pilot1/P1B2/p1b2_default_model.txt index 10802e7e..df55f400 100644 --- a/Pilot1/P1B2/p1b2_default_model.txt +++ b/Pilot1/P1B2/p1b2_default_model.txt @@ -10,10 +10,10 @@ loss='categorical_crossentropy' optimizer='rmsprop' learning_rate=0.001 scaling='minmax' -drop=0. +dropout=0. 
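One asymmetry worth noting while reading these hunks: the benchmark keyword becomes val_split, but the candle loader API itself still takes a validation_split argument, so each call site translates between the two. A minimal fragment showing that mapping, following the candle.load_X_data signature used in p1b1.py above (params is the benchmark parameter dictionary, seed its rng_seed):

```
import candle

# Fragment: params is the benchmark parameter dict, seed = params['rng_seed'].
data = candle.load_X_data(
    params['data_url'], params['train_data'], params['test_data'],
    shuffle=params['shuffle'],
    scaling=params['scaling'],
    validation_split=params['val_split'],   # loader argument keeps the old name
    dtype=params['data_type'],
    seed=seed)
```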
feature_subsample=0 penalty=0.00001 -validation_split=0.1 +val_split=0.1 rng_seed=2017 initialization='glorot_uniform' save_path='save' From 393f8781e9f7719dea288dcd9e36fd7c84a4b1c4 Mon Sep 17 00:00:00 2001 From: Cristina Garcia Cardona Date: Wed, 8 Apr 2020 18:29:32 -0600 Subject: [PATCH 210/331] partial update of keywords in Pilot1/P1B2 --- Pilot1/P1B2/p1b2.py | 9 +++++++-- Pilot1/P1B2/p1b2_baseline_keras2.py | 8 ++++---- Pilot1/P1B2/p1b2_baseline_mxnet.py | 4 ++-- Pilot1/P1B2/p1b2_baseline_neon.py | 4 ++-- 4 files changed, 15 insertions(+), 10 deletions(-) diff --git a/Pilot1/P1B2/p1b2.py b/Pilot1/P1B2/p1b2.py index 20606e95..70bb9b8f 100644 --- a/Pilot1/P1B2/p1b2.py +++ b/Pilot1/P1B2/p1b2.py @@ -29,7 +29,12 @@ logger = logging.getLogger(__name__) -additional_definitions = [] +additional_definitions = [ +{'name':'reg_l2', +'type': float, +'default': 0., +'help':'weight of regularization for l2 norm of nn weights'} +] required = [ 'data_url', @@ -45,7 +50,7 @@ 'learning_rate', 'loss', 'optimizer', - 'penalty', + 'reg_l2', 'rng_seed', 'scaling', 'val_split', diff --git a/Pilot1/P1B2/p1b2_baseline_keras2.py b/Pilot1/P1B2/p1b2_baseline_keras2.py index f8fa50af..d336167e 100644 --- a/Pilot1/P1B2/p1b2_baseline_keras2.py +++ b/Pilot1/P1B2/p1b2_baseline_keras2.py @@ -80,14 +80,14 @@ def run(gParameters): x = Dense(l, activation=activation, kernel_initializer=initializer_weights, bias_initializer=initializer_bias, - kernel_regularizer=l2(gParameters['penalty']), - activity_regularizer=l2(gParameters['penalty']))(input_vector) + kernel_regularizer=l2(gParameters['reg_l2']), + activity_regularizer=l2(gParameters['reg_l2']))(input_vector) else: x = Dense(l, activation=activation, kernel_initializer=initializer_weights, bias_initializer=initializer_bias, - kernel_regularizer=l2(gParameters['penalty']), - activity_regularizer=l2(gParameters['penalty']))(x) + kernel_regularizer=l2(gParameters['reg_l2']), + activity_regularizer=l2(gParameters['reg_l2']))(x) if gParameters['dropout']: x = Dropout(gParameters['dropout'])(x) output = Dense(output_dim, activation=activation, diff --git a/Pilot1/P1B2/p1b2_baseline_mxnet.py b/Pilot1/P1B2/p1b2_baseline_mxnet.py index 3c423256..999fde02 100644 --- a/Pilot1/P1B2/p1b2_baseline_mxnet.py +++ b/Pilot1/P1B2/p1b2_baseline_mxnet.py @@ -88,8 +88,8 @@ def main(): for i,l in enumerate(layers): net = mx.sym.FullyConnected(data=net, num_hidden=l) net = mx.sym.Activation(data=net, act_type=activation) - if gParameters['drop']: - net = mx.sym.Dropout(data=net, p=gParameters['drop']) + if gParameters['dropout']: + net = mx.sym.Dropout(data=net, p=gParameters['dropout']) net = mx.sym.FullyConnected(data=net, num_hidden=num_classes)# 1) net = mx.symbol.SoftmaxOutput(data=net, label=out) diff --git a/Pilot1/P1B2/p1b2_baseline_neon.py b/Pilot1/P1B2/p1b2_baseline_neon.py index 3628e249..caea982b 100644 --- a/Pilot1/P1B2/p1b2_baseline_neon.py +++ b/Pilot1/P1B2/p1b2_baseline_neon.py @@ -123,8 +123,8 @@ def main(): for layer in gParameters['dense']: if layer: layers.append(Affine(nout=layer, init=initializer_weights, bias=initializer_bias, activation=activation)) - if gParameters['drop']: - layers.append(Dropout(keep=(1-gParameters['drop']))) + if gParameters['dropout']: + layers.append(Dropout(keep=(1-gParameters['dropout']))) layers.append(Affine(nout=output_dim, init=initializer_weights, bias=initializer_bias, activation=activation)) From 5476f1a069d68625a0b10eba94b86e8ae41a8571 Mon Sep 17 00:00:00 2001 From: Cristina Garcia Cardona Date: Wed, 8 Apr 2020 18:34:03 -0600 Subject: 
[PATCH 211/331] partial update of keywords in Pilot1/P1B3 --- Pilot1/P1B3/p1b3.py | 10 +++++----- Pilot1/P1B3/p1b3_baseline_keras2.py | 6 +++--- Pilot1/P1B3/p1b3_conv_model.txt | 4 ++-- Pilot1/P1B3/p1b3_default_model.txt | 4 ++-- Pilot1/P1B3/p1b3_perf_bench_model.txt | 4 ++-- 5 files changed, 14 insertions(+), 14 deletions(-) diff --git a/Pilot1/P1B3/p1b3.py b/Pilot1/P1B3/p1b3.py index 4683c042..b0be7db7 100644 --- a/Pilot1/P1B3/p1b3.py +++ b/Pilot1/P1B3/p1b3.py @@ -104,7 +104,7 @@ def set_locals(self): 'batch_normalization', 'category_cutoffs', 'cell_features', - 'drop', + 'dropout', 'drug_features', 'epochs', 'feature_subsample', @@ -119,7 +119,7 @@ def set_locals(self): 'scaling', 'subsample', 'test_cell_split', - 'validation_split', + 'val_split', 'cell_noise_sigma' ] @@ -193,7 +193,7 @@ def set_locals(self): # fileParams['batch_normalization']=eval(config.get(section[0],'batch_normalization')) # fileParams['category_cutoffs']=eval(config.get(section[0],'category_cutoffs')) # fileParams['cell_features']=eval(config.get(section[0],'cell_features')) -# fileParams['drop']=eval(config.get(section[0],'drop')) +# fileParams['dropout']=eval(config.get(section[0],'dropout')) # fileParams['drug_features']=eval(config.get(section[0],'drug_features')) # fileParams['epochs']=eval(config.get(section[0],'epochs')) # fileParams['feature_subsample']=eval(config.get(section[0],'feature_subsample')) @@ -208,7 +208,7 @@ def set_locals(self): # fileParams['scaling']=eval(config.get(section[0],'scaling')) # fileParams['subsample']=eval(config.get(section[0],'subsample')) # fileParams['test_cell_split']=eval(config.get(section[0],'test_cell_split')) -# fileParams['validation_split']=eval(config.get(section[0],'validation_split')) +# fileParams['val_split']=eval(config.get(section[0],'val_split')) # fileParams['cell_noise_sigma']=eval(config.get(section[0],'cell_noise_sigma')) # # # parse the remaining values @@ -241,7 +241,7 @@ def extension_from_parameters(params, framework): ext = framework ext += '.A={}'.format(params['activation']) ext += '.B={}'.format(params['batch_size']) - ext += '.D={}'.format(params['drop']) + ext += '.D={}'.format(params['dropout']) ext += '.E={}'.format(params['epochs']) if params['feature_subsample']: ext += '.F={}'.format(params['feature_subsample']) diff --git a/Pilot1/P1B3/p1b3_baseline_keras2.py b/Pilot1/P1B3/p1b3_baseline_keras2.py index 75986af6..009096f6 100644 --- a/Pilot1/P1B3/p1b3_baseline_keras2.py +++ b/Pilot1/P1B3/p1b3_baseline_keras2.py @@ -260,7 +260,7 @@ def run(gParameters): # Build dataset loader object loader = benchmark.DataLoader(seed=seed, dtype=gParameters['data_type'], - val_split=gParameters['validation_split'], + val_split=gParameters['val_split'], test_cell_split=gParameters['test_cell_split'], cell_features=gParameters['cell_features'], drug_features=gParameters['drug_features'], @@ -292,8 +292,8 @@ def run(gParameters): if gParameters['batch_normalization']: model.add(BatchNormalization()) model.add(Activation(gParameters['activation'])) - if gParameters['drop']: - model.add(Dropout(gParameters['drop'])) + if gParameters['dropout']: + model.add(Dropout(gParameters['dropout'])) else: # Build convolutional layers gen_shape = 'add_1d' layer_list = list(range(0, len(gParameters['conv']))) diff --git a/Pilot1/P1B3/p1b3_conv_model.txt b/Pilot1/P1B3/p1b3_conv_model.txt index 67af6d18..7f38c676 100644 --- a/Pilot1/P1B3/p1b3_conv_model.txt +++ b/Pilot1/P1B3/p1b3_conv_model.txt @@ -8,9 +8,9 @@ loss = 'mse' optimizer = 'sgd' learning_rate = 0.001 scaling = 
'std' -drop = 0.1 +dropout = 0.1 feature_subsample = 0 -validation_split = 0.1 +val_split = 0.1 rng_seed = 2017 initialization = 'normal' min_logconc = -5. diff --git a/Pilot1/P1B3/p1b3_default_model.txt b/Pilot1/P1B3/p1b3_default_model.txt index 9d4645de..70beb20c 100644 --- a/Pilot1/P1B3/p1b3_default_model.txt +++ b/Pilot1/P1B3/p1b3_default_model.txt @@ -7,9 +7,9 @@ loss='mse' optimizer='sgd' learning_rate=0.001 scaling='std' -drop=0.1 +dropout=0.1 feature_subsample=0 -validation_split=0.1 +val_split=0.1 rng_seed=2017 initialization='normal' min_logconc=-5. diff --git a/Pilot1/P1B3/p1b3_perf_bench_model.txt b/Pilot1/P1B3/p1b3_perf_bench_model.txt index fc5aec7e..a288e817 100644 --- a/Pilot1/P1B3/p1b3_perf_bench_model.txt +++ b/Pilot1/P1B3/p1b3_perf_bench_model.txt @@ -7,9 +7,9 @@ loss='mse' optimizer='sgd' learning_rate=0.001 scaling='std' -drop=0.1 +dropout=0.1 feature_subsample=500 -validation_split=0.1 +val_split=0.1 rng_seed=2017 initialization='normal' min_logconc=-5. From 7718e3e353c4f7e8706360332ac0dd7e0a7d5b8c Mon Sep 17 00:00:00 2001 From: Cristina Garcia Cardona Date: Wed, 8 Apr 2020 18:44:58 -0600 Subject: [PATCH 212/331] partial update of keywords in Pilot1/TC1 --- Pilot1/TC1/tc1.py | 15 +++++++++++---- Pilot1/TC1/tc1_baseline_keras2.py | 26 +++++++++++++------------- Pilot1/TC1/tc1_default_model.txt | 8 ++++---- Pilot1/TC1/tc1_perf_bench_model.txt | 8 ++++---- 4 files changed, 32 insertions(+), 25 deletions(-) diff --git a/Pilot1/TC1/tc1.py b/Pilot1/TC1/tc1.py index 631a89c4..39ae16b0 100644 --- a/Pilot1/TC1/tc1.py +++ b/Pilot1/TC1/tc1.py @@ -18,27 +18,34 @@ 'nargs':'+', 'type': int, 'help':'network structure of shared layer'}, + {'name':'model_prefix', + 'default':'tc1', + 'type':str, + 'help':'prefix to build model name for saving'}, + {'name':'classes', + 'type':int, + 'default':36} ] required = [ 'data_url', 'train_data', 'test_data', - 'model_name', + 'model_prefix', 'conv', 'dense', 'activation', - 'out_act', + 'out_activation', 'loss', 'optimizer', 'feature_subsample', 'metrics', 'epochs', 'batch_size', - 'drop', + 'dropout', 'classes', 'pool', - 'save' + 'output_dir' ] diff --git a/Pilot1/TC1/tc1_baseline_keras2.py b/Pilot1/TC1/tc1_baseline_keras2.py index 6b7252cf..46e3b1f5 100644 --- a/Pilot1/TC1/tc1_baseline_keras2.py +++ b/Pilot1/TC1/tc1_baseline_keras2.py @@ -102,14 +102,14 @@ def run(gParameters): else: model.add(Dense(layer)) model.add(Activation(gParameters['activation'])) - if gParameters['drop']: - model.add(Dropout(gParameters['drop'])) + if gParameters['dropout']: + model.add(Dropout(gParameters['dropout'])) if dense_first: model.add(Flatten()) model.add(Dense(gParameters['classes'])) - model.add(Activation(gParameters['out_act'])) + model.add(Activation(gParameters['out_activation'])) model.summary() @@ -117,13 +117,13 @@ def run(gParameters): optimizer=gParameters['optimizer'], metrics=[gParameters['metrics']]) - output_dir = gParameters['save'] + output_dir = gParameters['output_dir'] if not os.path.exists(output_dir): os.makedirs(output_dir) # set up callbacks to do work during model training.. 
- model_name = gParameters['model_name'] - path = '{}/{}.autosave.model.h5'.format(output_dir, model_name) + model_prefix = gParameters['model_prefix'] + path = '{}/{}.autosave.model.h5'.format(output_dir, model_prefix) checkpointer = ModelCheckpoint(filepath=path, verbose=1, save_weights_only=False, save_best_only=True) csv_logger = CSVLogger('{}/training.log'.format(output_dir)) reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=10, verbose=1, mode='auto', epsilon=0.0001, cooldown=0, min_lr=0) @@ -142,35 +142,35 @@ def run(gParameters): # serialize model to JSON model_json = model.to_json() - with open("{}/{}.model.json".format(output_dir, model_name), "w") as json_file: + with open("{}/{}.model.json".format(output_dir, model_prefix), "w") as json_file: json_file.write(model_json) # serialize model to YAML model_yaml = model.to_yaml() - with open("{}/{}.model.yaml".format(output_dir, model_name), "w") as yaml_file: + with open("{}/{}.model.yaml".format(output_dir, model_prefix), "w") as yaml_file: yaml_file.write(model_yaml) # serialize weights to HDF5 - model.save_weights("{}/{}.model.h5".format(output_dir, model_name)) + model.save_weights("{}/{}.model.h5".format(output_dir, model_prefix)) print("Saved model to disk") # load json and create model - json_file = open('{}/{}.model.json'.format(output_dir, model_name), 'r') + json_file = open('{}/{}.model.json'.format(output_dir, model_prefix), 'r') loaded_model_json = json_file.read() json_file.close() loaded_model_json = model_from_json(loaded_model_json) # load yaml and create model - yaml_file = open('{}/{}.model.yaml'.format(output_dir, model_name), 'r') + yaml_file = open('{}/{}.model.yaml'.format(output_dir, model_prefix), 'r') loaded_model_yaml = yaml_file.read() yaml_file.close() loaded_model_yaml = model_from_yaml(loaded_model_yaml) # load weights into new model - loaded_model_json.load_weights('{}/{}.model.h5'.format(output_dir, model_name)) + loaded_model_json.load_weights('{}/{}.model.h5'.format(output_dir, model_prefix)) print("Loaded json model from disk") # evaluate json loaded model on test data @@ -187,7 +187,7 @@ def run(gParameters): # load weights into new model - loaded_model_yaml.load_weights('{}/{}.model.h5'.format(output_dir, model_name)) + loaded_model_yaml.load_weights('{}/{}.model.h5'.format(output_dir, model_prefix)) print("Loaded yaml model from disk") # evaluate loaded model on test data diff --git a/Pilot1/TC1/tc1_default_model.txt b/Pilot1/TC1/tc1_default_model.txt index b70487e5..673c9047 100644 --- a/Pilot1/TC1/tc1_default_model.txt +++ b/Pilot1/TC1/tc1_default_model.txt @@ -2,18 +2,18 @@ data_url = 'ftp://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/type-class/' train_data = 'type_18_300_train.csv' test_data = 'type_18_300_test.csv' -model_name = 'tc1' +model_prefix = 'tc1' conv=[128, 20, 1, 128, 10, 1] dense=[200,20] activation='relu' -out_act='softmax' +out_activation='softmax' loss='categorical_crossentropy' optimizer='sgd' metrics='accuracy' epochs=400 batch_size=20 -drop=0.1 +dropout=0.1 classes=36 feature_subsample=0 pool=[1, 10] -save='.' +output_dir='.' 
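With the renames in tc1_baseline_keras2.py above, everything a TC1 run writes is determined by output_dir and model_prefix. Purely as an illustration, these are the artifact paths implied by the format strings in that hunk, using the defaults from tc1_default_model.txt:

```
import os

output_dir = '.'        # output_dir in tc1_default_model.txt (formerly 'save')
model_prefix = 'tc1'    # model_prefix (formerly 'model_name')

artifacts = [
    '{}.autosave.model.h5'.format(model_prefix),  # best checkpoint (ModelCheckpoint)
    '{}.model.json'.format(model_prefix),         # serialized architecture, JSON
    '{}.model.yaml'.format(model_prefix),         # serialized architecture, YAML
    '{}.model.h5'.format(model_prefix),           # weights written by save_weights
    'training.log',                               # CSVLogger output
]
print([os.path.join(output_dir, name) for name in artifacts])
```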
diff --git a/Pilot1/TC1/tc1_perf_bench_model.txt b/Pilot1/TC1/tc1_perf_bench_model.txt index 6c8d8168..46040f8f 100644 --- a/Pilot1/TC1/tc1_perf_bench_model.txt +++ b/Pilot1/TC1/tc1_perf_bench_model.txt @@ -2,18 +2,18 @@ data_url = 'ftp://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/type-class/' train_data = 'type_18_300_train.csv' test_data = 'type_18_300_test.csv' -model_name = 'tc1' +model_prefix = 'tc1' conv=[128, 20, 1, 128, 10, 1] dense=[200,20] activation='relu' -out_act='softmax' +out_activation='softmax' loss='categorical_crossentropy' optimizer='sgd' metrics='accuracy' epochs=20 batch_size=5 -drop=0.1 +dropout=0.1 classes=36 feature_subsample=0 pool=[1, 10] -save='.' +output_dir='.' From 6923fad9a087c8a905d8cfbe88ab6205c252c720 Mon Sep 17 00:00:00 2001 From: Cristina Garcia Cardona Date: Wed, 8 Apr 2020 18:47:58 -0600 Subject: [PATCH 213/331] partial update of keywords in Pilot1/P1B1 --- Pilot1/P1B1/p1b1.py | 4 ++-- Pilot1/P1B1/p1b1_default_model.txt | 2 +- Pilot1/P1B1/p1b1_perf_bench_model.txt | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Pilot1/P1B1/p1b1.py b/Pilot1/P1B1/p1b1.py index 93f6b039..91fac8b5 100644 --- a/Pilot1/P1B1/p1b1.py +++ b/Pilot1/P1B1/p1b1.py @@ -27,10 +27,10 @@ 'action':'store', 'type': int, 'help':'latent dimensions'}, -{'name':'model_name', +{'name':'model_prefix', 'default':'p1b1', 'type':str, - 'help':'prefix for file to save model'}, + 'help':'prefix to build model name for saving'}, {'name':'model', 'default':'ae', 'choices':['ae', 'vae', 'cvae'], diff --git a/Pilot1/P1B1/p1b1_default_model.txt b/Pilot1/P1B1/p1b1_default_model.txt index b6c86423..27e69a2b 100644 --- a/Pilot1/P1B1/p1b1_default_model.txt +++ b/Pilot1/P1B1/p1b1_default_model.txt @@ -2,7 +2,7 @@ data_url = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/P1B1/' train_data = 'P1B1.dev.train.csv' test_data = 'P1B1.dev.test.csv' -model_name='p1b1' +model_prefix='p1b1' dense=[2000, 600] batch_size=100 epochs=100 diff --git a/Pilot1/P1B1/p1b1_perf_bench_model.txt b/Pilot1/P1B1/p1b1_perf_bench_model.txt index 01ffc46c..ca0854bf 100644 --- a/Pilot1/P1B1/p1b1_perf_bench_model.txt +++ b/Pilot1/P1B1/p1b1_perf_bench_model.txt @@ -2,7 +2,7 @@ data_url = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/P1B1/' train_data = 'P1B1.dev.train.csv' test_data = 'P1B1.dev.test.csv' -model_name='p1b1' +model_prefix='p1b1' dense=[2000, 1000, 500, 200, 100] batch_size=100 epochs=500 From 849f01d447a72bcf1054dc9845c030d72c87dec8 Mon Sep 17 00:00:00 2001 From: Cristina Garcia Cardona Date: Wed, 8 Apr 2020 18:54:47 -0600 Subject: [PATCH 214/331] updated keywords in Pilot1/NT3 --- Pilot1/NT3/nt3.py | 7 ++++--- Pilot1/NT3/nt3_baseline_keras2.py | 18 +++++++++--------- Pilot1/NT3/nt3_baseline_keras2_tensorrt.py | 20 ++++++++++---------- Pilot1/NT3/nt3_default_model.txt | 2 +- Pilot1/NT3/nt3_perf_bench_model.txt | 2 +- 5 files changed, 25 insertions(+), 24 deletions(-) diff --git a/Pilot1/NT3/nt3.py b/Pilot1/NT3/nt3.py index 367a11db..646a4044 100644 --- a/Pilot1/NT3/nt3.py +++ b/Pilot1/NT3/nt3.py @@ -8,9 +8,10 @@ import candle additional_definitions = [ -{'name':'model_name', +{'name':'model_prefix', 'default':'nt3', - 'type':str}, + 'type':str, + 'help':'prefix to build model name for saving'}, {'name':'classes', 'type':int, 'default':2} @@ -20,7 +21,7 @@ 'data_url', 'train_data', 'test_data', - 'model_name', + 'model_prefix', 'conv', 'dense', 'activation', diff --git a/Pilot1/NT3/nt3_baseline_keras2.py b/Pilot1/NT3/nt3_baseline_keras2.py index 9282cb36..6f3aacdc 100644 --- 
a/Pilot1/NT3/nt3_baseline_keras2.py +++ b/Pilot1/NT3/nt3_baseline_keras2.py @@ -176,8 +176,8 @@ def run(gParameters): gParameters.update(candle.compute_trainable_params(model)) # set up a bunch of callbacks to do work during model training.. - model_name = gParameters['model_name'] - path = '{}/{}.autosave.model.h5'.format(output_dir, model_name) + model_prefix = gParameters['model_prefix'] + path = '{}/{}.autosave.model.h5'.format(output_dir, model_prefix) # checkpointer = ModelCheckpoint(filepath=path, verbose=1, save_weights_only=False, save_best_only=True) csv_logger = CSVLogger('{}/training.log'.format(output_dir)) reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=10, verbose=1, mode='auto', epsilon=0.0001, cooldown=0, min_lr=0) @@ -197,34 +197,34 @@ def run(gParameters): print('Test accuracy:', score[1]) # serialize model to JSON model_json = model.to_json() - with open("{}/{}.model.json".format(output_dir, model_name), "w") as json_file: + with open("{}/{}.model.json".format(output_dir, model_prefix), "w") as json_file: json_file.write(model_json) # serialize model to YAML model_yaml = model.to_yaml() - with open("{}/{}.model.yaml".format(output_dir, model_name), "w") as yaml_file: + with open("{}/{}.model.yaml".format(output_dir, model_prefix), "w") as yaml_file: yaml_file.write(model_yaml) # serialize weights to HDF5 - model.save_weights("{}/{}.weights.h5".format(output_dir, model_name)) + model.save_weights("{}/{}.weights.h5".format(output_dir, model_prefix)) print("Saved model to disk") # load json and create model - json_file = open('{}/{}.model.json'.format(output_dir, model_name), 'r') + json_file = open('{}/{}.model.json'.format(output_dir, model_prefix), 'r') loaded_model_json = json_file.read() json_file.close() loaded_model_json = model_from_json(loaded_model_json) # load yaml and create model - yaml_file = open('{}/{}.model.yaml'.format(output_dir, model_name), 'r') + yaml_file = open('{}/{}.model.yaml'.format(output_dir, model_prefix), 'r') loaded_model_yaml = yaml_file.read() yaml_file.close() loaded_model_yaml = model_from_yaml(loaded_model_yaml) # load weights into new model - loaded_model_json.load_weights('{}/{}.weights.h5'.format(output_dir, model_name)) + loaded_model_json.load_weights('{}/{}.weights.h5'.format(output_dir, model_prefix)) print("Loaded json model from disk") # evaluate json loaded model on test data @@ -239,7 +239,7 @@ def run(gParameters): print("json %s: %.2f%%" % (loaded_model_json.metrics_names[1], score_json[1]*100)) # load weights into new model - loaded_model_yaml.load_weights('{}/{}.weights.h5'.format(output_dir, model_name)) + loaded_model_yaml.load_weights('{}/{}.weights.h5'.format(output_dir, model_prefix)) print("Loaded yaml model from disk") # evaluate loaded model on test data diff --git a/Pilot1/NT3/nt3_baseline_keras2_tensorrt.py b/Pilot1/NT3/nt3_baseline_keras2_tensorrt.py index 8194a5f3..a9c6e295 100644 --- a/Pilot1/NT3/nt3_baseline_keras2_tensorrt.py +++ b/Pilot1/NT3/nt3_baseline_keras2_tensorrt.py @@ -83,7 +83,7 @@ def read_config_file(file): fileParams['data_url'] = eval(config.get(section[0],'data_url')) fileParams['train_data'] = eval(config.get(section[0],'train_data')) fileParams['test_data'] = eval(config.get(section[0],'test_data')) - fileParams['model_name'] = eval(config.get(section[0],'model_name')) + fileParams['model_prefix'] = eval(config.get(section[0],'model_prefix')) fileParams['conv'] = eval(config.get(section[0],'conv')) fileParams['dense'] = eval(config.get(section[0],'dense')) 
fileParams['activation'] = eval(config.get(section[0],'activation')) @@ -267,8 +267,8 @@ def run(gParameters): gParameters.update(compute_trainable_params(model)) # set up a bunch of callbacks to do work during model training.. - model_name = gParameters['model_name'] - path = '{}/{}.autosave.model.h5'.format(output_dir, model_name) + model_prefix = gParameters['model_prefix'] + path = '{}/{}.autosave.model.h5'.format(output_dir, model_prefix) # checkpointer = ModelCheckpoint(filepath=path, verbose=1, save_weights_only=False, save_best_only=True) csv_logger = CSVLogger('{}/training.log'.format(output_dir)) reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=10, verbose=1, mode='auto', epsilon=0.0001, cooldown=0, min_lr=0) @@ -343,34 +343,34 @@ def run(gParameters): print('Test accuracy:', score[1]) # serialize model to JSON model_json = model.to_json() - with open("{}/{}.model.json".format(output_dir, model_name), "w") as json_file: + with open("{}/{}.model.json".format(output_dir, model_prefix), "w") as json_file: json_file.write(model_json) # serialize model to YAML model_yaml = model.to_yaml() - with open("{}/{}.model.yaml".format(output_dir, model_name), "w") as yaml_file: + with open("{}/{}.model.yaml".format(output_dir, model_prefix), "w") as yaml_file: yaml_file.write(model_yaml) # serialize weights to HDF5 - model.save_weights("{}/{}.weights.h5".format(output_dir, model_name)) + model.save_weights("{}/{}.weights.h5".format(output_dir, model_prefix)) print("Saved model to disk") # load json and create model - json_file = open('{}/{}.model.json'.format(output_dir, model_name), 'r') + json_file = open('{}/{}.model.json'.format(output_dir, model_prefix), 'r') loaded_model_json = json_file.read() json_file.close() loaded_model_json = model_from_json(loaded_model_json) # load yaml and create model - yaml_file = open('{}/{}.model.yaml'.format(output_dir, model_name), 'r') + yaml_file = open('{}/{}.model.yaml'.format(output_dir, model_prefix), 'r') loaded_model_yaml = yaml_file.read() yaml_file.close() loaded_model_yaml = model_from_yaml(loaded_model_yaml) # load weights into new model - loaded_model_json.load_weights('{}/{}.weights.h5'.format(output_dir, model_name)) + loaded_model_json.load_weights('{}/{}.weights.h5'.format(output_dir, model_prefix)) print("Loaded json model from disk") # evaluate json loaded model on test data @@ -385,7 +385,7 @@ def run(gParameters): print("json %s: %.2f%%" % (loaded_model_json.metrics_names[1], score_json[1]*100)) # load weights into new model - loaded_model_yaml.load_weights('{}/{}.weights.h5'.format(output_dir, model_name)) + loaded_model_yaml.load_weights('{}/{}.weights.h5'.format(output_dir, model_prefix)) print("Loaded yaml model from disk") # evaluate loaded model on test data diff --git a/Pilot1/NT3/nt3_default_model.txt b/Pilot1/NT3/nt3_default_model.txt index 84488116..3082c8bc 100644 --- a/Pilot1/NT3/nt3_default_model.txt +++ b/Pilot1/NT3/nt3_default_model.txt @@ -2,7 +2,7 @@ data_url = 'ftp://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/normal-tumor/' train_data = 'nt_train2.csv' test_data = 'nt_test2.csv' -model_name = 'nt3' +model_prefix = 'nt3' conv = [128, 20, 1, 128, 10, 1] dense = [200,20] activation = 'relu' diff --git a/Pilot1/NT3/nt3_perf_bench_model.txt b/Pilot1/NT3/nt3_perf_bench_model.txt index aad8e0e6..a4ea5d15 100644 --- a/Pilot1/NT3/nt3_perf_bench_model.txt +++ b/Pilot1/NT3/nt3_perf_bench_model.txt @@ -2,7 +2,7 @@ data_url = 'ftp://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/normal-tumor/' 
train_data = 'nt_train2.csv' test_data = 'nt_test2.csv' -model_name = 'nt3' +model_prefix = 'nt3' conv = [128, 20, 1, 128, 10, 1] dense = [200,20] activation = 'relu' From 5cbcc050b0bc62ddd22073998d968355f92d8738 Mon Sep 17 00:00:00 2001 From: Cristina Garcia Cardona Date: Wed, 8 Apr 2020 18:57:00 -0600 Subject: [PATCH 215/331] updated keywords in Pilot1/T29 --- Pilot1/T29/t29_default_model.txt | 2 +- Pilot1/T29/t29res.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Pilot1/T29/t29_default_model.txt b/Pilot1/T29/t29_default_model.txt index 8d143315..33b04760 100644 --- a/Pilot1/T29/t29_default_model.txt +++ b/Pilot1/T29/t29_default_model.txt @@ -3,7 +3,7 @@ train_path='./rip.it.train.csv' test_path='./rip.it.test.csv' batch_size=64 epochs=100 -drop=0.2 +dropout=0.2 classes=2 optimizer='sgd' learning_rate=0.002 diff --git a/Pilot1/T29/t29res.py b/Pilot1/T29/t29res.py index e5a2dfb0..e4fea01e 100644 --- a/Pilot1/T29/t29res.py +++ b/Pilot1/T29/t29res.py @@ -91,8 +91,8 @@ def load_data(nb_classes, PL, gParameters): def f(x, gParameters, distance=1): input = x for i in range(distance): - if 'drop' in gParameters: - x = Dropout(gParameters['drop'])(x) + if 'dropout' in gParameters: + x = Dropout(gParameters['dropout'])(x) x = Dense(1000, activation=gParameters['activation'])(x) y = ke.layers.add([input,x]) return y @@ -105,7 +105,7 @@ def run(gParameters): EPOCH = gParameters['epochs'] BATCH = gParameters['batch_size'] nb_classes = gParameters['classes'] - DR = gParameters['drop'] + DR = gParameters['dropout'] ACTIVATION = gParameters['activation'] kerasDefaults = candle.keras_default_config() kerasDefaults['momentum_sgd'] = gParameters['momentum'] From 0161047146673724f2992f7094229aa9e7285503 Mon Sep 17 00:00:00 2001 From: Cristina Garcia Cardona Date: Wed, 8 Apr 2020 19:05:34 -0600 Subject: [PATCH 216/331] updated keywords in Pilot1/Uno --- Pilot1/Uno/uno.py | 4 ++-- Pilot1/Uno/uno_auc_model.txt | 4 ++-- Pilot1/Uno/uno_baseline_keras2.py | 6 +++--- Pilot1/Uno/uno_by_drug_example.txt | 4 ++-- Pilot1/Uno/uno_default_model.txt | 4 ++-- Pilot1/Uno/uno_fom_model.txt | 4 ++-- Pilot1/Uno/uno_perf_bench_model.txt | 4 ++-- 7 files changed, 15 insertions(+), 15 deletions(-) diff --git a/Pilot1/Uno/uno.py b/Pilot1/Uno/uno.py index 2794e20f..74581ef6 100644 --- a/Pilot1/Uno/uno.py +++ b/Pilot1/Uno/uno.py @@ -221,7 +221,7 @@ def set_locals(self): 'batch_size', 'dense', 'dense_feature_layers', - 'drop', + 'dropout', 'epochs', 'feature_subsample', 'learning_rate', @@ -231,7 +231,7 @@ def set_locals(self): 'rng_seed', 'save_path', 'scaling', - 'validation_split', + 'val_split', 'solr_root', 'timeout' ] diff --git a/Pilot1/Uno/uno_auc_model.txt b/Pilot1/Uno/uno_auc_model.txt index 23b28522..c92a850e 100644 --- a/Pilot1/Uno/uno_auc_model.txt +++ b/Pilot1/Uno/uno_auc_model.txt @@ -10,10 +10,10 @@ activation='relu' loss='mse' optimizer='adamax' scaling='std' -drop=.1 +dropout=.1 epochs=50 batch_size=32 -validation_split=0.2 +val_split=0.2 cv=1 max_val_loss=1.0 learning_rate=0.0001 diff --git a/Pilot1/Uno/uno_baseline_keras2.py b/Pilot1/Uno/uno_baseline_keras2.py index 2a4f656e..976f3a4f 100644 --- a/Pilot1/Uno/uno_baseline_keras2.py +++ b/Pilot1/Uno/uno_baseline_keras2.py @@ -76,8 +76,8 @@ def extension_from_parameters(args): ext += '.DF={}'.format(''.join([x[0] for x in sorted(args.drug_features)])) if args.feature_subsample > 0: ext += '.FS={}'.format(args.feature_subsample) - if args.drop > 0: - ext += '.DR={}'.format(args.drop) + if args.dropout > 0: + ext += 
'.DR={}'.format(args.dropout) if args.warmup_lr: ext += '.wu_lr' if args.reduce_lr: @@ -310,7 +310,7 @@ def run(params): ) target = args.agg_dose or 'Growth' - val_split = args.validation_split + val_split = args.val_split train_split = 1 - val_split if args.export_csv: diff --git a/Pilot1/Uno/uno_by_drug_example.txt b/Pilot1/Uno/uno_by_drug_example.txt index daa028a1..851f23c9 100644 --- a/Pilot1/Uno/uno_by_drug_example.txt +++ b/Pilot1/Uno/uno_by_drug_example.txt @@ -10,10 +10,10 @@ activation='relu' loss='mse' optimizer='adam' scaling='std' -drop=0 +dropout=0 epochs=10 batch_size=128 -validation_split=0.2 +val_split=0.2 cv=1 max_val_loss=1.0 learning_rate=None diff --git a/Pilot1/Uno/uno_default_model.txt b/Pilot1/Uno/uno_default_model.txt index 64a88df6..9cf9cff3 100644 --- a/Pilot1/Uno/uno_default_model.txt +++ b/Pilot1/Uno/uno_default_model.txt @@ -10,10 +10,10 @@ activation='relu' loss='mse' optimizer='adam' scaling='std' -drop=0 +dropout=0 epochs=10 batch_size=32 -validation_split=0.2 +val_split=0.2 cv=1 max_val_loss=1.0 learning_rate=None diff --git a/Pilot1/Uno/uno_fom_model.txt b/Pilot1/Uno/uno_fom_model.txt index cf66baae..e84a9442 100644 --- a/Pilot1/Uno/uno_fom_model.txt +++ b/Pilot1/Uno/uno_fom_model.txt @@ -10,10 +10,10 @@ activation='relu' loss='mse' optimizer='adam' scaling='std' -drop=0 +dropout=0 epochs=50 batch_size=512 -validation_split=0.2 +val_split=0.2 cv=1 max_val_loss=1.0 learning_rate=None diff --git a/Pilot1/Uno/uno_perf_bench_model.txt b/Pilot1/Uno/uno_perf_bench_model.txt index 234334f9..b065fb8c 100644 --- a/Pilot1/Uno/uno_perf_bench_model.txt +++ b/Pilot1/Uno/uno_perf_bench_model.txt @@ -10,10 +10,10 @@ activation='relu' loss='mse' optimizer='adam' scaling='std' -drop=0 +dropout=0 epochs=3 batch_size=32 -validation_split=0.2 +val_split=0.2 cv=1 max_val_loss=1.0 learning_rate=None From 830c77f510d7be79af15225222bcc12638854b9b Mon Sep 17 00:00:00 2001 From: Cristina Garcia Cardona Date: Wed, 8 Apr 2020 19:10:16 -0600 Subject: [PATCH 217/331] updated keywords in Pilot1/UnoMT --- Pilot1/UnoMT/unoMT.py | 2 +- Pilot1/UnoMT/unoMT_default_model.txt | 2 +- Pilot1/UnoMT/unoMT_pytorch_model.py | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Pilot1/UnoMT/unoMT.py b/Pilot1/UnoMT/unoMT.py index 7ffa2497..3b4eaa0e 100644 --- a/Pilot1/UnoMT/unoMT.py +++ b/Pilot1/UnoMT/unoMT.py @@ -247,7 +247,7 @@ 'resp_num_layers_per_block', 'resp_num_blocks', 'resp_num_layers', - 'drop', + 'dropout', 'resp_activation', 'cl_clf_layer_dim', 'cl_clf_num_layers', diff --git a/Pilot1/UnoMT/unoMT_default_model.txt b/Pilot1/UnoMT/unoMT_default_model.txt index 8743f53a..ff0b569b 100644 --- a/Pilot1/UnoMT/unoMT_default_model.txt +++ b/Pilot1/UnoMT/unoMT_default_model.txt @@ -35,7 +35,7 @@ resp_layer_dim=2048 resp_num_layers_per_block=2 resp_num_blocks=4 resp_num_layers=2 -drop=0.1 +dropout=0.1 resp_activation='none' # Cell line classification network(s) diff --git a/Pilot1/UnoMT/unoMT_pytorch_model.py b/Pilot1/UnoMT/unoMT_pytorch_model.py index 2d94a0f0..b8477341 100644 --- a/Pilot1/UnoMT/unoMT_pytorch_model.py +++ b/Pilot1/UnoMT/unoMT_pytorch_model.py @@ -261,7 +261,7 @@ def build_nn(self): resp_num_layers_per_block=args.resp_num_layers_per_block, resp_num_blocks=args.resp_num_blocks, resp_num_layers=args.resp_num_layers, - resp_dropout=args.drop, + resp_dropout=args.dropout, resp_activation=args.resp_activation).to(device) @@ -372,7 +372,7 @@ def update_l2regularizer(self, reg): def update_dropout(self, dropout_rate): - self.args.drop = dropout_rate + self.args.dropout = 
dropout_rate # Regressor for drug response self.resp_net = RespNet( @@ -386,7 +386,7 @@ def update_dropout(self, dropout_rate): resp_num_layers_per_block=self.args.resp_num_layers_per_block, resp_num_blocks=self.args.resp_num_blocks, resp_num_layers=self.args.resp_num_layers, - resp_dropout=self.args.drop, + resp_dropout=self.args.dropout, resp_activation=self.args.resp_activation).to(self.device) From 4d6fa6eb61f573022f3e5041b95caeb1f9ff6ecc Mon Sep 17 00:00:00 2001 From: Cristina Garcia Cardona Date: Wed, 8 Apr 2020 19:16:08 -0600 Subject: [PATCH 218/331] updated keywords in Pilot1/Uno_UQ --- Pilot1/Uno_UQ/data_utils_/uno.py | 6 +++--- Pilot1/Uno_UQ/model_utils_/uno_model_utils.py | 6 +++--- Pilot1/Uno_UQ/uno_gCSI_modelUQ.txt | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Pilot1/Uno_UQ/data_utils_/uno.py b/Pilot1/Uno_UQ/data_utils_/uno.py index 4c1ddc56..30c96b69 100644 --- a/Pilot1/Uno_UQ/data_utils_/uno.py +++ b/Pilot1/Uno_UQ/data_utils_/uno.py @@ -62,8 +62,8 @@ def extension_from_parameters(args): ext += '.DF={}'.format(''.join([x[0] for x in sorted(args.drug_features)])) if args.feature_subsample > 0: ext += '.FS={}'.format(args.feature_subsample) - if args.drop > 0: - ext += '.DR={}'.format(args.drop) + if args.dropout > 0: + ext += '.DR={}'.format(args.dropout) if args.warmup_lr: ext += '.wu_lr' if args.reduce_lr: @@ -337,7 +337,7 @@ def set_locals(self): 'batch_size', 'dense', 'dense_feature_layers', - 'drop', + 'dropout', 'epochs', 'feature_subsample', 'learning_rate', diff --git a/Pilot1/Uno_UQ/model_utils_/uno_model_utils.py b/Pilot1/Uno_UQ/model_utils_/uno_model_utils.py index 244c1ba8..36bb666c 100644 --- a/Pilot1/Uno_UQ/model_utils_/uno_model_utils.py +++ b/Pilot1/Uno_UQ/model_utils_/uno_model_utils.py @@ -130,7 +130,7 @@ def build_feature_model(input_shape, name='', dense_layers=[1000, 1000], def build_homoscedastic_model(loader, args, logger=None, permanent_dropout=True, silent=False): input_models = {} - dropout_rate = args.drop + dropout_rate = args.dropout reg_l2 = args.reg_l2 for fea_type, shape in loader.feature_shapes.items(): base_type = fea_type.split('.')[0] @@ -183,7 +183,7 @@ def build_homoscedastic_model(loader, args, logger=None, permanent_dropout=True, def build_heteroscedastic_model(loader, args, logger=None, permanent_dropout=True, silent=False): input_models = {} - dropout_rate = args.drop + dropout_rate = args.dropout reg_l2 = args.reg_l2 for fea_type, shape in loader.feature_shapes.items(): base_type = fea_type.split('.')[0] @@ -235,7 +235,7 @@ def build_heteroscedastic_model(loader, args, logger=None, permanent_dropout=Tru def build_quantile_model(loader, args, logger=None, permanent_dropout=True, silent=False): input_models = {} - dropout_rate = args.drop + dropout_rate = args.dropout reg_l2 = args.reg_l2 for fea_type, shape in loader.feature_shapes.items(): base_type = fea_type.split('.')[0] diff --git a/Pilot1/Uno_UQ/uno_gCSI_modelUQ.txt b/Pilot1/Uno_UQ/uno_gCSI_modelUQ.txt index 71fec820..1108a006 100644 --- a/Pilot1/Uno_UQ/uno_gCSI_modelUQ.txt +++ b/Pilot1/Uno_UQ/uno_gCSI_modelUQ.txt @@ -10,7 +10,7 @@ activation='relu' loss='mse' optimizer='adam' scaling='std' -drop=0 +dropout=0 epochs=10 batch_size=32 val_split=0.2 From 7bb7dfb79f5aafd9b67b8b9d113aaedb49dde4ef Mon Sep 17 00:00:00 2001 From: Cristina Garcia Cardona Date: Wed, 8 Apr 2020 19:22:54 -0600 Subject: [PATCH 219/331] updated save_path to output_dir in Pilot1/NT3 --- Pilot1/NT3/nt3.py | 2 +- Pilot1/NT3/nt3_baseline_keras2.py | 2 +- 
Pilot1/NT3/nt3_baseline_keras2_tensorrt.py | 2 +- Pilot1/NT3/nt3_default_model.txt | 2 +- Pilot1/NT3/nt3_perf_bench_model.txt | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Pilot1/NT3/nt3.py b/Pilot1/NT3/nt3.py index 646a4044..41764611 100644 --- a/Pilot1/NT3/nt3.py +++ b/Pilot1/NT3/nt3.py @@ -35,7 +35,7 @@ 'dropout', 'classes', 'pool', - 'save_path', + 'output_dir', 'timeout' ] diff --git a/Pilot1/NT3/nt3_baseline_keras2.py b/Pilot1/NT3/nt3_baseline_keras2.py index 6f3aacdc..aa4dfead 100644 --- a/Pilot1/NT3/nt3_baseline_keras2.py +++ b/Pilot1/NT3/nt3_baseline_keras2.py @@ -167,7 +167,7 @@ def run(gParameters): optimizer=optimizer, metrics=[gParameters['metrics']]) - output_dir = gParameters['save_path'] + output_dir = gParameters['output_dir'] if not os.path.exists(output_dir): os.makedirs(output_dir) diff --git a/Pilot1/NT3/nt3_baseline_keras2_tensorrt.py b/Pilot1/NT3/nt3_baseline_keras2_tensorrt.py index a9c6e295..ba2c2283 100644 --- a/Pilot1/NT3/nt3_baseline_keras2_tensorrt.py +++ b/Pilot1/NT3/nt3_baseline_keras2_tensorrt.py @@ -258,7 +258,7 @@ def run(gParameters): optimizer=optimizer, metrics=[gParameters['metrics']]) - output_dir = gParameters['save_path'] + output_dir = gParameters['output_dir'] if not os.path.exists(output_dir): os.makedirs(output_dir) diff --git a/Pilot1/NT3/nt3_default_model.txt b/Pilot1/NT3/nt3_default_model.txt index 3082c8bc..b5566b04 100644 --- a/Pilot1/NT3/nt3_default_model.txt +++ b/Pilot1/NT3/nt3_default_model.txt @@ -16,5 +16,5 @@ learning_rate = 0.001 dropout = 0.1 classes = 2 pool = [1, 10] -save_path = '.' +output_dir = '.' timeout = 3600 diff --git a/Pilot1/NT3/nt3_perf_bench_model.txt b/Pilot1/NT3/nt3_perf_bench_model.txt index a4ea5d15..0823aadf 100644 --- a/Pilot1/NT3/nt3_perf_bench_model.txt +++ b/Pilot1/NT3/nt3_perf_bench_model.txt @@ -16,5 +16,5 @@ learning_rate = 0.001 dropout = 0.1 classes = 2 pool = [1, 10] -save_path = '.' +output_dir = '.' 
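Taken together with the earlier NT3 hunks, the keyword renames applied to this benchmark so far amount to the mapping below (a recap derived only from the diffs above, useful when migrating an old model file):

```
# Old NT3 keyword -> keyword expected after this series.
nt3_renames = {
    'model_name': 'model_prefix',
    'out_act':    'out_activation',
    'drop':       'dropout',
    'save':       'save_path',    # earlier NT3/tensorrt hunks
    'save_path':  'output_dir',   # this patch, superseding the rename above
}
```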
timeout = 7200 From d1106d7a00af4c001389ce15c3ec738580a4a60a Mon Sep 17 00:00:00 2001 From: Cristina Garcia Cardona Date: Wed, 8 Apr 2020 20:05:27 -0600 Subject: [PATCH 220/331] fixed keyword bugs in pilot1 benchmarks --- Pilot1/Attn1/attn.py | 2 +- Pilot1/Combo/combo.py | 2 +- Pilot1/NT3/nt3_default_model.txt | 2 +- Pilot1/NT3/nt3_perf_bench_model.txt | 2 +- Pilot1/P1B2/p1b2_default_model.txt | 2 +- Pilot1/TC1/tc1_default_model.txt | 2 +- Pilot1/TC1/tc1_perf_bench_model.txt | 2 +- Pilot1/Uno_UQ/README.md | 1 + .../{uno_gCSI_modelUQ.txt => uno_defaultUQ_model.txt} | 10 ++++++---- Pilot1/Uno_UQ/uno_holdoutUQ_data.py | 4 ++-- Pilot1/Uno_UQ/uno_inferUQ_keras2.py | 4 ++-- Pilot1/Uno_UQ/uno_trainUQ_keras2.py | 4 ++-- 12 files changed, 20 insertions(+), 17 deletions(-) create mode 100644 Pilot1/Uno_UQ/README.md rename Pilot1/Uno_UQ/{uno_gCSI_modelUQ.txt => uno_defaultUQ_model.txt} (79%) diff --git a/Pilot1/Attn1/attn.py b/Pilot1/Attn1/attn.py index ddc66ff6..9676dc52 100644 --- a/Pilot1/Attn1/attn.py +++ b/Pilot1/Attn1/attn.py @@ -129,7 +129,7 @@ def load_data(params, seed): # start change # if params['train_data'].endswith('h5') or params['train_data'].endswith('hdf5'): - print ('processing h5 in file {}'.format(params['in'])) + print ('processing h5 in file {}'.format(params['train_data'])) url = params['data_url'] file_train = params['train_data'] diff --git a/Pilot1/Combo/combo.py b/Pilot1/Combo/combo.py index de0f31fc..f12d0bcf 100644 --- a/Pilot1/Combo/combo.py +++ b/Pilot1/Combo/combo.py @@ -92,7 +92,7 @@ required = [ 'activation', 'batch_size', 'dense', 'dense_feature_layers', 'dropout', 'epochs', 'learning_rate', 'loss', 'optimizer', 'residual', 'rng_seed', - 'save_path', 'scaling', 'feature_subsample', 'validation_split', + 'save_path', 'scaling', 'feature_subsample', 'val_split', 'solr_root', 'timeout' ] diff --git a/Pilot1/NT3/nt3_default_model.txt b/Pilot1/NT3/nt3_default_model.txt index b5566b04..a23e4afa 100644 --- a/Pilot1/NT3/nt3_default_model.txt +++ b/Pilot1/NT3/nt3_default_model.txt @@ -1,5 +1,5 @@ [Global_Params] -data_url = 'ftp://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/normal-tumor/' +data_url = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/normal-tumor/' train_data = 'nt_train2.csv' test_data = 'nt_test2.csv' model_prefix = 'nt3' diff --git a/Pilot1/NT3/nt3_perf_bench_model.txt b/Pilot1/NT3/nt3_perf_bench_model.txt index 0823aadf..269d2bd0 100644 --- a/Pilot1/NT3/nt3_perf_bench_model.txt +++ b/Pilot1/NT3/nt3_perf_bench_model.txt @@ -1,5 +1,5 @@ [Global_Params] -data_url = 'ftp://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/normal-tumor/' +data_url = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/normal-tumor/' train_data = 'nt_train2.csv' test_data = 'nt_test2.csv' model_prefix = 'nt3' diff --git a/Pilot1/P1B2/p1b2_default_model.txt b/Pilot1/P1B2/p1b2_default_model.txt index df55f400..1b399a7d 100644 --- a/Pilot1/P1B2/p1b2_default_model.txt +++ b/Pilot1/P1B2/p1b2_default_model.txt @@ -12,7 +12,7 @@ learning_rate=0.001 scaling='minmax' dropout=0. 
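The Attn1 change in this patch is a genuine bug fix rather than a rename: 'in' is not a defined benchmark keyword, so the h5 branch presumably failed with a KeyError; the corrected line reuses the train_data keyword the rest of the loader already depends on. In short (the filename below is a hypothetical placeholder):

```
params = {'train_data': 'attn1_training_data.h5'}   # hypothetical value for illustration
# Before the fix in Pilot1/Attn1/attn.py:
#   print('processing h5 in file {}'.format(params['in']))   # KeyError: 'in'
# After:
print('processing h5 in file {}'.format(params['train_data']))
```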
feature_subsample=0 -penalty=0.00001 +reg_l2=0.00001 val_split=0.1 rng_seed=2017 initialization='glorot_uniform' diff --git a/Pilot1/TC1/tc1_default_model.txt b/Pilot1/TC1/tc1_default_model.txt index 673c9047..8850edd1 100644 --- a/Pilot1/TC1/tc1_default_model.txt +++ b/Pilot1/TC1/tc1_default_model.txt @@ -1,5 +1,5 @@ [Global_Params] -data_url = 'ftp://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/type-class/' +data_url = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/type-class/' train_data = 'type_18_300_train.csv' test_data = 'type_18_300_test.csv' model_prefix = 'tc1' diff --git a/Pilot1/TC1/tc1_perf_bench_model.txt b/Pilot1/TC1/tc1_perf_bench_model.txt index 46040f8f..167fabb1 100644 --- a/Pilot1/TC1/tc1_perf_bench_model.txt +++ b/Pilot1/TC1/tc1_perf_bench_model.txt @@ -1,5 +1,5 @@ [Global_Params] -data_url = 'ftp://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/type-class/' +data_url = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/type-class/' train_data = 'type_18_300_train.csv' test_data = 'type_18_300_test.csv' model_prefix = 'tc1' diff --git a/Pilot1/Uno_UQ/README.md b/Pilot1/Uno_UQ/README.md new file mode 100644 index 00000000..af389664 --- /dev/null +++ b/Pilot1/Uno_UQ/README.md @@ -0,0 +1 @@ +## Uno_UQ: Predicting Tumor Dose Response across Multiple Data Sources with added UQ functionality. ## Functionality Uno_UQ adds uncertainty quantification (UQ) functionality to the Uno model. For information about the underlaying model, please refer to the Uno benchmark. This page overviews the added UQ functionality provided, which includes: - Generation of holdout set. - Training excluding the holdout set. - Inference for the specified data. - Training for homoscedastic and heteroscedastic models. - Empirical calibration of UQ for the trained models. ## Holdout The holdout script generates a set of identifiers to holdout during training, depending on the --partition_by argument. If --partition_by is 'drug_pair' it generates a set of drug IDs. If --partition_by is 'cell' it generates a set of cell IDs. In any other case it generates a set of indices. The fraction to reserve in the holdout set is given by the --val_split argument. #### Example output ``` python uno_holdoutUQ_data.py Using TensorFlow backend. 
Importing candle utils for keras Configuration file: uno_defaultUQ_model.txt Params: {'activation': 'relu', 'agg_dose': 'AUC', 'base_lr': None, 'batch_normalization': False, 'batch_size': 32, 'by_cell': None, 'by_drug': None, 'cache': None, 'cell_feature_subset_path': '', 'cell_features': ['rnaseq'], 'cell_subset_path': '', 'cell_types': None, 'cp': False, 'cv': 1, 'data_type': , 'dense': [1000, 1000, 1000], 'dense_feature_layers': [1000, 1000, 1000], 'dropout': 0.1, 'drug_feature_subset_path': '', 'drug_features': ['descriptors', 'fingerprints'], 'drug_median_response_max': 1, 'drug_median_response_min': -1, 'drug_subset_path': '', 'epochs': 10, 'exclude_cells': [], 'exclude_drugs': [], 'experiment_id': 'EXP000', 'export_csv': None, 'export_data': None, 'feature_subsample': 0, 'feature_subset_path': '', 'gpus': [], 'growth_bins': 0, 'initial_weights': None, 'learning_rate': 0.01, 'logfile': None, 'loss': 'mse', 'max_val_loss': 1.0, 'no_feature_source': True, 'no_gen': False, 'no_response_source': True, 'optimizer': 'sgd', 'output_dir': './Output/EXP000/RUN000', 'partition_by': 'cell', 'preprocess_rnaseq': 'none', 'reduce_lr': False, 'residual': False, 'rng_seed': 2018, 'run_id': 'RUN000', 'sample_repetition': False, 'save_path': 'save_default/', 'save_weights': 'default.weights.h5', 'scaling': 'std', 'shuffle': False, 'single': True, 'solr_root': '', 'tb': False, 'tb_prefix': 'tb', 'test_sources': ['train'], 'timeout': 3600, 'train_bool': True, 'train_sources': ['gCSI'], 'uq_exclude_cells_file': 'save_default/infer_cell_ids', 'use_exported_data': None, 'use_filtered_genes': False, 'use_landmark_genes': True, 'val_split': 0.2, 'verbose': None, 'warmup_lr': False} partition_by: cell Cell IDs in holdout set written in file: save_default/infer_cell_ids ``` ## Train The train script trains the model, as in the underlying Uno benchmark, but excluding the IDs in the holdout file. The file with the holdout set should be provided via one of the following arguments - --uq_exclude_drugs_file='file' if the file contains a set of drug IDs. - --uq_exclude_cells_file='file' if the file contains a set of cell IDs. - --uq_exclude_indices_file='file' if the file contains a set of indices. An additional --loss heteroscedastic option is available. This will learn the input-dependent noise level as well as the main regression variable specified (i.e. growth or AUC). #### Example output ``` python uno_trainUQ_keras2.py --cp True Using TensorFlow backend. 
Importing candle utils for keras Configuration file: uno_defaultUQ_model.txt Params: {'activation': 'relu', 'agg_dose': 'AUC', 'base_lr': None, 'batch_normalization': False, 'batch_size': 32, 'by_cell': None, 'by_drug': None, 'cache': None, 'cell_feature_subset_path': '', 'cell_features': ['rnaseq'], 'cell_subset_path': '', 'cell_types': None, 'cp': True, 'cv': 1, 'datatype': , 'dense': [1000, 1000, 1000], 'dense_feature_layers': [1000, 1000, 1000], 'drop': 0.1, 'drug_feature_subset_path': '', 'drug_features': ['descriptors', 'fingerprints'], 'drug_median_response_max': 1, 'drug_median_response_min': -1, 'drug_subset_path': '', 'epochs': 10, 'exclude_cells': [], 'exclude_drugs': [], 'exclude_indices': [], 'experiment_id': 'EXP000', 'export_csv': None, 'export_data': None, 'feature_subsample': 0, 'feature_subset_path': '', 'gpus': [], 'growth_bins': 0, 'initial_weights': None, 'learning_rate': 0.01, 'logfile': None, 'loss': 'mse', 'max_val_loss': 1.0, 'no_feature_source': True, 'no_gen': False, 'no_response_source': True, 'optimizer': 'sgd', 'output_dir': './Output/EXP000/RUN000', 'partition_by': 'cell', 'preprocess_rnaseq': 'none', 'reduce_lr': False, 'reg_l2': 0.0, 'residual': False, 'rng_seed': 2018, 'run_id': 'RUN000', 'sample_repetition': False, 'save_path': 'save_default/', 'save_weights': 'default.weights.h5', 'scaling': 'std', 'shuffle': False, 'single': True, 'solr_root': '', 'tb': False, 'tb_prefix': 'tb', 'test_sources': ['train'], 'timeout': 3600, 'train_bool': True, 'train_sources': ['gCSI'], 'uq_exclude_cells_file': 'save_default/infer_cell_ids', 'use_exported_data': None, 'use_filtered_genes': False, 'use_landmark_genes': True, 'val_split': 0.2, 'verbose': None, 'warmup_lr': False} Read file: save_default/infer_cell_ids Number of elements read: 72 Cells to exclude: ['gCSI.NCI-H889', 'gCSI.MEWO', 'gCSI.PA-TU-8902', 'gCSI.BCPAP', 'gCSI.CAL-12T', 'gCSI.NCI-H727', 'gCSI.HUH-1', 'gCSI.NUGC-4', 'gCSI.MKN74', 'gCSI.PK-1', 'gCSI.A2058', 'gCSI.RAJI', 'gCSI.JHH-7', 'gCSI.SUIT-2', 'gCSI.OE21', 'gCSI.HCC1806', 'gCSI.PANC-10-05', 'gCSI.RMG-I', 'gCSI.NCI-H1703', 'gCSI.KMS-34', 'gCSI.G-361', 'gCSI.EPLC-272H', 'gCSI.HEP-G2', 'gCSI.RERF-LC-MS', 'gCSI.COLO-800', 'gCSI.KM12', 'gCSI.DOHH-2', 'gCSI.EFM-19', 'gCSI.MDA-MB-468', 'gCSI.MHH-ES-1', 'gCSI.IPC-298', 'gCSI.GRANTA-519', 'gCSI.8305C', 'gCSI.KYSE-140', 'gCSI.MALME-3M', 'gCSI.MIA-PACA-2', 'gCSI.NCI-H1666', 'gCSI.PC-3', 'gCSI.RT4', 'gCSI.HUP-T4', 'gCSI.NCI-H1869', 'gCSI.WM-266-4', 'gCSI.KMM-1', 'gCSI.OE33', 'gCSI.SU-DHL-6', 'gCSI.QGP-1', 'gCSI.IGR-37', 'gCSI.VMRC-RCW', 'gCSI.NCI-H1838', 'gCSI.SW948', 'gCSI.COLO-679', 'gCSI.CAL-51', 'gCSI.HUCCT1', 'gCSI.LP-1', 'gCSI.RPMI-7951', 'gCSI.HPAF-II', 'gCSI.OCUM-1', 'gCSI.HOP-92', 'gCSI.NCI-H661', 'gCSI.TOV-112D', 'gCSI.PANC-03-27', 'gCSI.AGS', 'gCSI.HEC-59', 'gCSI.LN-18', 'gCSI.U-87-MG', 'gCSI.U-2-OS', 'gCSI.ABC-1', 'gCSI.IGR-1', 'gCSI.SK-MEL-3', 'gCSI.A549', 'gCSI.HCC4006', 'gCSI.NCI-H1355'] Combined model: __________________________________________________________________________________________________ Layer (type) Output Shape Param # Connected to ================================================================================================== input.cell.rnaseq (InputLayer) (None, 942) 0 __________________________________________________________________________________________________ input.drug1.descriptors (InputL (None, 5270) 0 __________________________________________________________________________________________________ input.drug1.fingerprints (Input (None, 2048) 0 
__________________________________________________________________________________________________ cell.rnaseq (Model) (None, 1000) 2945000 input.cell.rnaseq[0][0] __________________________________________________________________________________________________ drug.descriptors (Model) (None, 1000) 7273000 input.drug1.descriptors[0][0] __________________________________________________________________________________________________ drug.fingerprints (Model) (None, 1000) 4051000 input.drug1.fingerprints[0][0] __________________________________________________________________________________________________ concatenate_1 (Concatenate) (None, 3000) 0 cell.rnaseq[1][0] drug.descriptors[1][0] drug.fingerprints[1][0] __________________________________________________________________________________________________ dense_10 (Dense) (None, 1000) 3001000 concatenate_1[0][0] __________________________________________________________________________________________________ permanent_dropout_10 (Permanent (None, 1000) 0 dense_10[0][0] __________________________________________________________________________________________________ dense_11 (Dense) (None, 1000) 1001000 permanent_dropout_10[0][0] __________________________________________________________________________________________________ permanent_dropout_11 (Permanent (None, 1000) 0 dense_11[0][0] __________________________________________________________________________________________________ dense_12 (Dense) (None, 1000) 1001000 permanent_dropout_11[0][0] __________________________________________________________________________________________________ permanent_dropout_12 (Permanent (None, 1000) 0 dense_12[0][0] __________________________________________________________________________________________________ dense_13 (Dense) (None, 1) 1001 permanent_dropout_12[0][0] ================================================================================================== Total params: 19,273,001 Trainable params: 19,273,001 Non-trainable params: 0 __________________________________________________________________________________________________ Training homoscedastic model: partition:train, rank:0, sharded index size:2784, batch_size:32, steps:87 partition:val, rank:0, sharded index size:704, batch_size:32, steps:22 Between random pairs in y_val: mse: 0.0604 mae: 0.1978 r2: -0.9105 corr: 0.0447 Data points per epoch: train = 2784, val = 704 Steps per epoch: train = 87, val = 22 Epoch 1/10 87/87 [==============================] - 15s 174ms/step - loss: 0.2165 - mae: 0.2144 - r2: -6.4761 - val_loss: 0.0247 - val_mae: 0.1244 - val_r2: 0.1916 Current time ....15.176 Epoch 2/10 87/87 [==============================] - 12s 142ms/step - loss: 0.0247 - mae: 0.1240 - r2: 0.1302 - val_loss: 0.0208 - val_mae: 0.1147 - val_r2: 0.3058 Current time ....28.323 Epoch 3/10 87/87 [==============================] - 12s 143ms/step - loss: 0.0219 - mae: 0.1157 - r2: 0.2278 - val_loss: 0.0197 - val_mae: 0.1112 - val_r2: 0.3565 Current time ....41.321 Epoch 4/10 87/87 [==============================] - 12s 143ms/step - loss: 0.0203 - mae: 0.1111 - r2: 0.2897 - val_loss: 0.0182 - val_mae: 0.1072 - val_r2: 0.3980 Current time ....54.330 Epoch 5/10 87/87 [==============================] - 13s 153ms/step - loss: 0.0187 - mae: 0.1066 - r2: 0.3388 - val_loss: 0.0189 - val_mae: 0.1090 - val_r2: 0.3804 Current time ....68.120 Epoch 6/10 87/87 [==============================] - 13s 148ms/step - loss: 0.0185 - mae: 0.1075 - r2: 0.3412 - val_loss: 0.0186 - val_mae: 0.1088 - 
val_r2: 0.3921 Current time ....80.967 Epoch 7/10 87/87 [==============================] - 13s 147ms/step - loss: 0.0185 - mae: 0.1069 - r2: 0.3468 - val_loss: 0.0177 - val_mae: 0.1043 - val_r2: 0.4259 Current time ....93.769 Epoch 8/10 87/87 [==============================] - 13s 150ms/step - loss: 0.0176 - mae: 0.1031 - r2: 0.3791 - val_loss: 0.0159 - val_mae: 0.0994 - val_r2: 0.4793 Current time ....107.421 Epoch 9/10 87/87 [==============================] - 13s 150ms/step - loss: 0.0177 - mae: 0.1034 - r2: 0.3745 - val_loss: 0.0161 - val_mae: 0.1000 - val_r2: 0.4696 Current time ....120.945 Epoch 10/10 87/87 [==============================] - 14s 159ms/step - loss: 0.0169 - mae: 0.1022 - r2: 0.4086 - val_loss: 0.0173 - val_mae: 0.1029 - val_r2: 0.4337 Current time ....134.744 Comparing y_true and y_pred: mse: 0.0165 mae: 0.1016 r2: 0.4782 corr: 0.7072 Testing predictions stored in file: save_default/uno.A=relu.B=32.E=10.O=sgd.LS=mse.LR=0.01.CF=r.DF=df.DR=0.1.L1000.D1=1000.D2=1000.D3=1000.predicted.tsv Model stored in file: save_default/uno.A=relu.B=32.E=10.O=sgd.LS=mse.LR=0.01.CF=r.DF=df.DR=0.1.L1000.D1=1000.D2=1000.D3=1000.model.json Model stored in file: save_default/uno.A=relu.B=32.E=10.O=sgd.LS=mse.LR=0.01.CF=r.DF=df.DR=0.1.L1000.D1=1000.D2=1000.D3=1000.model.h5 Model weights stored in file: save_default//default.weights.h5 partition:test, rank:0, sharded index size:0, batch_size:32, steps:0 ``` ## Infer The infer script runs inference with a trained model, as in the underlying Uno benchmark. The script can use a pre-generated data file, or it can construct the data for inference if a set of identifiers is provided. The argument --uq_infer_file must be used to specify the name of the file with the data (or the identifiers) to use for inference. Additionally, if the data needs to be constructed, then one of the following arguments should be used to specify what type of identifiers the file contains: - --uq_infer_given_drugs=True if the file contains a set of drug IDs. - --uq_infer_given_cells=True if the file contains a set of cell IDs. - --uq_infer_given_indices=True if the file contains a set of indices. Note that the latter works only if all the arguments for the data construction are set as well (usually those are taken from the model configuration file). Of course, this specification and the trained model must be consistent for the script to work. Likewise, if a pre-generated file is provided, the features it includes and the trained model must be consistent. Note also that the --loss heteroscedastic option should be specified if the model was trained to predict the heterogeneous noise as well. #### Example output This assumes that a trained model (files default.model.json and default.weights.h5) is available in the save_default folder. A sample json file compatible with the default model used in the training demo script is provided. After running the training script, a default.weights.h5 file should be generated. Together, these can be used for testing the inference demo script and should produce output similar to the one shown next. ``` python uno_inferUQ_keras2.py Using TensorFlow backend. 
Importing candle utils for keras Configuration file: uno_defaultUQinfer_model.txt Params: {'activation': 'relu', 'agg_dose': 'AUC', 'base_lr': None, 'batch_normalization': False, 'batch_size': 32, 'by_cell': None, 'by_drug': None, 'cache': None, 'cell_feature_subset_path': '', 'cell_features': ['rnaseq'], 'cell_subset_path': '', 'cell_types': None, 'cp': False, 'cv': 1, 'datatype': , 'dense': [1000, 1000, 1000], 'dense_feature_layers': [1000, 1000, 1000], 'drop': 0.1, 'drug_feature_subset_path': '', 'drug_features': ['descriptors', 'fingerprints'], 'drug_median_response_max': 1, 'drug_median_response_min': -1, 'drug_subset_path': '', 'epochs': 10, 'exclude_cells': [], 'exclude_drugs': [], 'experiment_id': 'EXP000', 'export_csv': None, 'export_data': None, 'feature_subsample': 0, 'feature_subset_path': '', 'gpus': [], 'growth_bins': 0, 'initial_weights': None, 'learning_rate': None, 'logfile': None, 'loss': 'mse', 'max_val_loss': 1.0, 'model_file': 'save_default/default.model.json', 'n_pred': 10, 'no_feature_source': True, 'no_gen': False, 'no_response_source': True, 'optimizer': 'sgd', 'output_dir': './Output/EXP000/RUN000', 'partition_by': 'cell', 'preprocess_rnaseq': 'none', 'reduce_lr': False, 'residual': False, 'rng_seed': 2018, 'run_id': 'RUN000', 'sample_repetition': False, 'save_path': 'save_default/', 'save_weights': None, 'scaling': 'std', 'shuffle': False, 'single': True, 'solr_root': '', 'tb': False, 'tb_prefix': 'tb', 'test_sources': ['train'], 'timeout': 3600, 'train_bool': True, 'train_sources': ['gCSI'], 'uq_infer_file': 'save_default/infer_cell_ids', 'uq_infer_given_cells': True, 'uq_infer_given_drugs': False, 'uq_infer_given_indices': False, 'use_exported_data': None, 'use_filtered_genes': False, 'use_landmark_genes': True, 'val_split': 0.2, 'verbose': None, 'warmup_lr': False, 'weights_file': 'save_default/default.weights.h5'} __________________________________________________________________________________________________ Layer (type) Output Shape Param # Connected to ================================================================================================== input.cell.rnaseq (InputLayer) (None, 942) 0 __________________________________________________________________________________________________ input.drug1.descriptors (InputL (None, 5270) 0 __________________________________________________________________________________________________ input.drug1.fingerprints (Input (None, 2048) 0 __________________________________________________________________________________________________ cell.rnaseq (Model) (None, 1000) 2945000 input.cell.rnaseq[0][0] __________________________________________________________________________________________________ drug.descriptors (Model) (None, 1000) 7273000 input.drug1.descriptors[0][0] __________________________________________________________________________________________________ drug.fingerprints (Model) (None, 1000) 4051000 input.drug1.fingerprints[0][0] __________________________________________________________________________________________________ concatenate_1 (Concatenate) (None, 3000) 0 cell.rnaseq[1][0] drug.descriptors[1][0] drug.fingerprints[1][0] __________________________________________________________________________________________________ dense_10 (Dense) (None, 1000) 3001000 concatenate_1[0][0] __________________________________________________________________________________________________ permanent_dropout_10 (Permanent (None, 1000) 0 dense_10[0][0] 
__________________________________________________________________________________________________ dense_11 (Dense) (None, 1000) 1001000 permanent_dropout_10[0][0] __________________________________________________________________________________________________ permanent_dropout_11 (Permanent (None, 1000) 0 dense_11[0][0] __________________________________________________________________________________________________ dense_12 (Dense) (None, 1000) 1001000 permanent_dropout_11[0][0] __________________________________________________________________________________________________ permanent_dropout_12 (Permanent (None, 1000) 0 dense_12[0][0] __________________________________________________________________________________________________ dense_13 (Dense) (None, 1) 1001 permanent_dropout_12[0][0] ================================================================================================== Total params: 19,273,001 Trainable params: 19,273,001 Non-trainable params: 0 __________________________________________________________________________________________________ partition:test, rank:0, sharded index size:0, batch_size:32, steps:0 Read file: save_default/infer_cell_ids Number of elements read: 72 Comparing y_true and y_pred: mse: 0.0173 mae: 0.1012 r2: 0.4687 corr: 0.7001 Comparing y_true and y_pred: mse: 0.0172 mae: 0.1005 r2: 0.4720 corr: 0.7010 Comparing y_true and y_pred: mse: 0.0171 mae: 0.1033 r2: 0.4751 corr: 0.7064 Comparing y_true and y_pred: mse: 0.0175 mae: 0.1045 r2: 0.4627 corr: 0.6945 Comparing y_true and y_pred: mse: 0.0162 mae: 0.1007 r2: 0.5017 corr: 0.7277 Comparing y_true and y_pred: mse: 0.0166 mae: 0.1008 r2: 0.4921 corr: 0.7141 Comparing y_true and y_pred: mse: 0.0181 mae: 0.1059 r2: 0.4443 corr: 0.6878 Comparing y_true and y_pred: mse: 0.0167 mae: 0.1015 r2: 0.4875 corr: 0.7087 Comparing y_true and y_pred: mse: 0.0169 mae: 0.1032 r2: 0.4805 corr: 0.7106 Comparing y_true and y_pred: mse: 0.0169 mae: 0.0999 r2: 0.4817 corr: 0.7075 Predictions stored in file: save_default/uno.A=relu.B=32.E=10.O=sgd.LS=mse.LR=None.CF=r.DF=df.DR=0.1.L1000.D1=1000.D2=1000.D3=1000.predicted_INFER.tsv ``` ## Empirical Calibration Scripts included in the calibration subfolder compute empirical calibration for the inference results. The scripts with suffix HOM compute empirical calibration for inference with a homoscedastic model, while the script with suffix HET computes empirical calibration for inference with a heteroscedastic model. To run the scripts, it is necessary to provide the path and the name of the file with the inference results. Note that the file with the inference results is assumed to include each realization of the inference (implicit in the 'all' suffix), but for the homoscedastic case a script is also provided to process an inference file with only the consolidated statistics (generally the average over all the realizations). Also, note that a specific format of the file with the inference results is assumed; a set of default values, reflecting the format of the current CANDLE infer scripts, is used. More arbitrary formats may be usable if they use similar column offsets, but this requires passing the right parameters to the function that reads the inference file. The scripts generate a series of plots and pickle (dill) files, displaying and encoding the computed empirical calibration. 
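As a rough illustration of what these calibration scripts measure, the sketch below is a generic, standalone example with synthetic data (it is not the repo's calibration code; the helper name, array shapes, and toy numbers are made up). It compares the nominal coverage of Gaussian intervals built from the spread of the inference realizations against the coverage actually observed on held-out values.

```
import numpy as np
from scipy.stats import norm

def empirical_coverage(y_true, realizations, nominal=np.linspace(0.05, 0.95, 19)):
    # Mean and std across inference realizations (e.g. the repeated MC-dropout passes above).
    mu = realizations.mean(axis=0)
    sigma = realizations.std(axis=0) + 1e-12
    observed = []
    for p in nominal:
        z = norm.ppf(0.5 + p / 2.0)  # half-width of the central p-interval in std units
        observed.append(np.mean(np.abs(y_true - mu) <= z * sigma))
    return nominal, np.array(observed)

# Synthetic example: 10 realizations for 704 points, constructed so the realization
# spread roughly matches the true error; a calibrated model gives observed ~ nominal.
rng = np.random.default_rng(2018)
center = rng.normal(0.5, 0.1, size=704)
realizations = center + rng.normal(0.0, 0.05, size=(10, 704))
y_true = center + rng.normal(0.0, 0.05, size=704)
for p, o in zip(*empirical_coverage(y_true, realizations)):
    print('nominal {:.2f} -> observed {:.2f}'.format(p, o))
```

The actual scripts additionally produce the plots and dill files mentioned above; this sketch only shows the coverage comparison at the core of the idea.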
\ No newline at end of file diff --git a/Pilot1/Uno_UQ/uno_gCSI_modelUQ.txt b/Pilot1/Uno_UQ/uno_defaultUQ_model.txt similarity index 79% rename from Pilot1/Uno_UQ/uno_gCSI_modelUQ.txt rename to Pilot1/Uno_UQ/uno_defaultUQ_model.txt index 1108a006..67d6b247 100644 --- a/Pilot1/Uno_UQ/uno_gCSI_modelUQ.txt +++ b/Pilot1/Uno_UQ/uno_defaultUQ_model.txt @@ -8,15 +8,15 @@ dense=[1000, 1000, 1000] dense_feature_layers=[1000, 1000, 1000] activation='relu' loss='mse' -optimizer='adam' +optimizer='sgd' scaling='std' -dropout=0 +dropout=0.1 epochs=10 batch_size=32 val_split=0.2 cv=1 max_val_loss=1.0 -learning_rate=None +learning_rate=0.01 base_lr=None residual=False reduce_lr=False @@ -24,7 +24,8 @@ warmup_lr=False batch_normalization=False feature_subsample=0 rng_seed=2018 -save_path='save_gCSI/' +save_path='save_default/' +save_weights='default.weights.h5' no_gen=False verbose = False single=True @@ -33,6 +34,7 @@ no_feature_source=True no_response_source=True use_landmark_genes=True partition_by='cell' +uq_exclude_cells_file='save_default/infer_cell_ids' [Monitor_Params] solr_root='' diff --git a/Pilot1/Uno_UQ/uno_holdoutUQ_data.py b/Pilot1/Uno_UQ/uno_holdoutUQ_data.py index 5bcba4f1..d163ca06 100644 --- a/Pilot1/Uno_UQ/uno_holdoutUQ_data.py +++ b/Pilot1/Uno_UQ/uno_holdoutUQ_data.py @@ -18,10 +18,10 @@ os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' -def initialize_parameters(): +def initialize_parameters(default_model='uno_defaultUQ_model.txt'): # Build benchmark object - unoBmk = uno.BenchmarkUno(uno.file_path, 'uno_default_model.txt', 'keras', + unoBmk = uno.BenchmarkUno(uno.file_path, default_model, 'keras', prog='uno_holdoutUQ_data', desc='Build data split for UQ analysis in the problem of prediction of tumor response to drug pairs.') # Initialize parameters diff --git a/Pilot1/Uno_UQ/uno_inferUQ_keras2.py b/Pilot1/Uno_UQ/uno_inferUQ_keras2.py index 1155d3d3..552ab2aa 100644 --- a/Pilot1/Uno_UQ/uno_inferUQ_keras2.py +++ b/Pilot1/Uno_UQ/uno_inferUQ_keras2.py @@ -58,10 +58,10 @@ 'agg_dose', 'batch_size') -def initialize_parameters(): +def initialize_parameters(default_model='uno_defaultUQ_model.txt'): # Build benchmark object - unoBmk = uno.BenchmarkUno(uno.file_path, 'uno_default_model.txt', 'keras', + unoBmk = uno.BenchmarkUno(uno.file_path, default_model, 'keras', prog='uno_inferUQ', desc='Read models to predict tumor response to single and paired drugs.') unoBmk.additional_definitions += additional_definitions_local diff --git a/Pilot1/Uno_UQ/uno_trainUQ_keras2.py b/Pilot1/Uno_UQ/uno_trainUQ_keras2.py index 545eb962..49f5cd2f 100644 --- a/Pilot1/Uno_UQ/uno_trainUQ_keras2.py +++ b/Pilot1/Uno_UQ/uno_trainUQ_keras2.py @@ -69,10 +69,10 @@ def set_locals(self): -def initialize_parameters(): +def initialize_parameters(default_model='uno_defaultUQ_model.txt'): # Build benchmark object - unoUQBmk = UQUno(uno.file_path, 'uno_defaultUQ_model.txt', 'keras', + unoUQBmk = UQUno(uno.file_path, default_model, 'keras', prog='uno_trainUQ', desc='Build neural network based models to predict tumor response to single and paired drugs, including UQ analysis.') # Initialize parameters From d725f4d4e0a4c43b66720fc36a537d2fe7ce0eec Mon Sep 17 00:00:00 2001 From: Cristina Garcia Cardona Date: Wed, 8 Apr 2020 22:23:59 -0600 Subject: [PATCH 221/331] fixed keyword bugs and some default configuration files in pilot1 --- Pilot1/Combo/combo_baseline_keras2.py | 2 +- Pilot1/P1B1/p1b1_default_model.txt | 2 +- Pilot1/P1B2/p1b2.py | 4 ++++ Pilot1/P1B2/p1b2_baseline_keras2.py | 5 ++++- Pilot1/P1B2/p1b2_default_model.txt | 1 + 
Pilot1/Uno_UQ/README.md | 2 +- Pilot1/Uno_UQ/uno_defaultUQ_model.txt | 3 +-- Pilot1/Uno_UQ/uno_holdoutUQ_data.py | 6 +++--- 8 files changed, 16 insertions(+), 9 deletions(-) diff --git a/Pilot1/Combo/combo_baseline_keras2.py b/Pilot1/Combo/combo_baseline_keras2.py index 5a262574..379aaea7 100644 --- a/Pilot1/Combo/combo_baseline_keras2.py +++ b/Pilot1/Combo/combo_baseline_keras2.py @@ -673,7 +673,7 @@ def run(params): logger.info('Params: {}'.format(params)) loader = ComboDataLoader(seed=args.rng_seed, - val_split=args.validation_split, + val_split=args.val_split, cell_features=args.cell_features, drug_features=args.drug_features, use_mean_growth=args.use_mean_growth, diff --git a/Pilot1/P1B1/p1b1_default_model.txt b/Pilot1/P1B1/p1b1_default_model.txt index 27e69a2b..4becc88f 100644 --- a/Pilot1/P1B1/p1b1_default_model.txt +++ b/Pilot1/P1B1/p1b1_default_model.txt @@ -23,7 +23,7 @@ latent_dim=2 feature_subsample=0 batch_normalization=False alpha_dropout=False -save_path='save' +save_path='save/' [Monitor_Params] solr_root='' diff --git a/Pilot1/P1B2/p1b2.py b/Pilot1/P1B2/p1b2.py index 70bb9b8f..82787c2e 100644 --- a/Pilot1/P1B2/p1b2.py +++ b/Pilot1/P1B2/p1b2.py @@ -30,6 +30,10 @@ logger = logging.getLogger(__name__) additional_definitions = [ +{'name':'model_prefix', + 'default':'p1b2', + 'type':str, + 'help':'prefix to build model name for saving'}, {'name':'reg_l2', 'type': float, 'default': 0., diff --git a/Pilot1/P1B2/p1b2_baseline_keras2.py b/Pilot1/P1B2/p1b2_baseline_keras2.py index d336167e..7122eac0 100644 --- a/Pilot1/P1B2/p1b2_baseline_keras2.py +++ b/Pilot1/P1B2/p1b2_baseline_keras2.py @@ -34,7 +34,10 @@ def run(gParameters): # Construct extension to save model ext = p1b2.extension_from_parameters(gParameters, '.keras') - logfile = gParameters['logfile'] if gParameters['logfile'] else gParameters['save_path']+ext+'.log' + candle.verify_path(params['save_path']) + prefix = '{}{}'.format(params['save_path'], ext) + logfile = gParameters['logfile'] if gParameters['logfile'] else prefix+'.log' + candle.set_up_logger(logfile, p1b2.logger, params['verbose']) p1b2.logger.info('Params: {}'.format(gParameters)) # Get default parameters for initialization and optimizer functions diff --git a/Pilot1/P1B2/p1b2_default_model.txt b/Pilot1/P1B2/p1b2_default_model.txt index 1b399a7d..5d0ddbaf 100644 --- a/Pilot1/P1B2/p1b2_default_model.txt +++ b/Pilot1/P1B2/p1b2_default_model.txt @@ -2,6 +2,7 @@ data_url = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/P1B2/' train_data = 'P1B2.train.csv' test_data = 'P1B2.test.csv' +model_prefix='p1b2' dense=[1024, 512, 256] batch_size=60 epochs=1 diff --git a/Pilot1/Uno_UQ/README.md b/Pilot1/Uno_UQ/README.md index af389664..3da30a33 100644 --- a/Pilot1/Uno_UQ/README.md +++ b/Pilot1/Uno_UQ/README.md @@ -1 +1 @@ -## Uno_UQ: Predicting Tumor Dose Response across Multiple Data Sources with added UQ functionality. ## Functionality Uno_UQ adds uncertainty quantification (UQ) functionality to the Uno model. For information about the underlaying model, please refer to the Uno benchmark. This page overviews the added UQ functionality provided, which includes: - Generation of holdout set. - Training excluding the holdout set. - Inference for the specified data. - Training for homoscedastic and heteroscedastic models. - Empirical calibration of UQ for the trained models. ## Holdout The holdout script generates a set of identifiers to holdout during training, depending on the --partition_by argument. If --partition_by is 'drug_pair' it generates a set of drug IDs. 
If --partition_by is 'cell' it generates a set of cell IDs. In any other case it generates a set of indices. The fraction to reserve in the holdout set is given by the --val_split argument. #### Example output ``` python uno_holdoutUQ_data.py Using TensorFlow backend. Importing candle utils for keras Configuration file: uno_defaultUQ_model.txt Params: {'activation': 'relu', 'agg_dose': 'AUC', 'base_lr': None, 'batch_normalization': False, 'batch_size': 32, 'by_cell': None, 'by_drug': None, 'cache': None, 'cell_feature_subset_path': '', 'cell_features': ['rnaseq'], 'cell_subset_path': '', 'cell_types': None, 'cp': False, 'cv': 1, 'data_type': , 'dense': [1000, 1000, 1000], 'dense_feature_layers': [1000, 1000, 1000], 'dropout': 0.1, 'drug_feature_subset_path': '', 'drug_features': ['descriptors', 'fingerprints'], 'drug_median_response_max': 1, 'drug_median_response_min': -1, 'drug_subset_path': '', 'epochs': 10, 'exclude_cells': [], 'exclude_drugs': [], 'experiment_id': 'EXP000', 'export_csv': None, 'export_data': None, 'feature_subsample': 0, 'feature_subset_path': '', 'gpus': [], 'growth_bins': 0, 'initial_weights': None, 'learning_rate': 0.01, 'logfile': None, 'loss': 'mse', 'max_val_loss': 1.0, 'no_feature_source': True, 'no_gen': False, 'no_response_source': True, 'optimizer': 'sgd', 'output_dir': './Output/EXP000/RUN000', 'partition_by': 'cell', 'preprocess_rnaseq': 'none', 'reduce_lr': False, 'residual': False, 'rng_seed': 2018, 'run_id': 'RUN000', 'sample_repetition': False, 'save_path': 'save_default/', 'save_weights': 'default.weights.h5', 'scaling': 'std', 'shuffle': False, 'single': True, 'solr_root': '', 'tb': False, 'tb_prefix': 'tb', 'test_sources': ['train'], 'timeout': 3600, 'train_bool': True, 'train_sources': ['gCSI'], 'uq_exclude_cells_file': 'save_default/infer_cell_ids', 'use_exported_data': None, 'use_filtered_genes': False, 'use_landmark_genes': True, 'val_split': 0.2, 'verbose': None, 'warmup_lr': False} partition_by: cell Cell IDs in holdout set written in file: save_default/infer_cell_ids ``` ## Train The train script trains the model, as in the underlying Uno benchmark, but excluding the IDs in the holdout file. The file with the holdout set should be provided via one of the following arguments - --uq_exclude_drugs_file='file' if the file contains a set of drug IDs. - --uq_exclude_cells_file='file' if the file contains a set of cell IDs. - --uq_exclude_indices_file='file' if the file contains a set of indices. An additional --loss heteroscedastic option is available. This will learn the input-dependent noise level as well as the main regression variable specified (i.e. growth or AUC). #### Example output ``` python uno_trainUQ_keras2.py --cp True Using TensorFlow backend. 
Importing candle utils for keras Configuration file: uno_defaultUQ_model.txt Params: {'activation': 'relu', 'agg_dose': 'AUC', 'base_lr': None, 'batch_normalization': False, 'batch_size': 32, 'by_cell': None, 'by_drug': None, 'cache': None, 'cell_feature_subset_path': '', 'cell_features': ['rnaseq'], 'cell_subset_path': '', 'cell_types': None, 'cp': True, 'cv': 1, 'datatype': , 'dense': [1000, 1000, 1000], 'dense_feature_layers': [1000, 1000, 1000], 'drop': 0.1, 'drug_feature_subset_path': '', 'drug_features': ['descriptors', 'fingerprints'], 'drug_median_response_max': 1, 'drug_median_response_min': -1, 'drug_subset_path': '', 'epochs': 10, 'exclude_cells': [], 'exclude_drugs': [], 'exclude_indices': [], 'experiment_id': 'EXP000', 'export_csv': None, 'export_data': None, 'feature_subsample': 0, 'feature_subset_path': '', 'gpus': [], 'growth_bins': 0, 'initial_weights': None, 'learning_rate': 0.01, 'logfile': None, 'loss': 'mse', 'max_val_loss': 1.0, 'no_feature_source': True, 'no_gen': False, 'no_response_source': True, 'optimizer': 'sgd', 'output_dir': './Output/EXP000/RUN000', 'partition_by': 'cell', 'preprocess_rnaseq': 'none', 'reduce_lr': False, 'reg_l2': 0.0, 'residual': False, 'rng_seed': 2018, 'run_id': 'RUN000', 'sample_repetition': False, 'save_path': 'save_default/', 'save_weights': 'default.weights.h5', 'scaling': 'std', 'shuffle': False, 'single': True, 'solr_root': '', 'tb': False, 'tb_prefix': 'tb', 'test_sources': ['train'], 'timeout': 3600, 'train_bool': True, 'train_sources': ['gCSI'], 'uq_exclude_cells_file': 'save_default/infer_cell_ids', 'use_exported_data': None, 'use_filtered_genes': False, 'use_landmark_genes': True, 'val_split': 0.2, 'verbose': None, 'warmup_lr': False} Read file: save_default/infer_cell_ids Number of elements read: 72 Cells to exclude: ['gCSI.NCI-H889', 'gCSI.MEWO', 'gCSI.PA-TU-8902', 'gCSI.BCPAP', 'gCSI.CAL-12T', 'gCSI.NCI-H727', 'gCSI.HUH-1', 'gCSI.NUGC-4', 'gCSI.MKN74', 'gCSI.PK-1', 'gCSI.A2058', 'gCSI.RAJI', 'gCSI.JHH-7', 'gCSI.SUIT-2', 'gCSI.OE21', 'gCSI.HCC1806', 'gCSI.PANC-10-05', 'gCSI.RMG-I', 'gCSI.NCI-H1703', 'gCSI.KMS-34', 'gCSI.G-361', 'gCSI.EPLC-272H', 'gCSI.HEP-G2', 'gCSI.RERF-LC-MS', 'gCSI.COLO-800', 'gCSI.KM12', 'gCSI.DOHH-2', 'gCSI.EFM-19', 'gCSI.MDA-MB-468', 'gCSI.MHH-ES-1', 'gCSI.IPC-298', 'gCSI.GRANTA-519', 'gCSI.8305C', 'gCSI.KYSE-140', 'gCSI.MALME-3M', 'gCSI.MIA-PACA-2', 'gCSI.NCI-H1666', 'gCSI.PC-3', 'gCSI.RT4', 'gCSI.HUP-T4', 'gCSI.NCI-H1869', 'gCSI.WM-266-4', 'gCSI.KMM-1', 'gCSI.OE33', 'gCSI.SU-DHL-6', 'gCSI.QGP-1', 'gCSI.IGR-37', 'gCSI.VMRC-RCW', 'gCSI.NCI-H1838', 'gCSI.SW948', 'gCSI.COLO-679', 'gCSI.CAL-51', 'gCSI.HUCCT1', 'gCSI.LP-1', 'gCSI.RPMI-7951', 'gCSI.HPAF-II', 'gCSI.OCUM-1', 'gCSI.HOP-92', 'gCSI.NCI-H661', 'gCSI.TOV-112D', 'gCSI.PANC-03-27', 'gCSI.AGS', 'gCSI.HEC-59', 'gCSI.LN-18', 'gCSI.U-87-MG', 'gCSI.U-2-OS', 'gCSI.ABC-1', 'gCSI.IGR-1', 'gCSI.SK-MEL-3', 'gCSI.A549', 'gCSI.HCC4006', 'gCSI.NCI-H1355'] Combined model: __________________________________________________________________________________________________ Layer (type) Output Shape Param # Connected to ================================================================================================== input.cell.rnaseq (InputLayer) (None, 942) 0 __________________________________________________________________________________________________ input.drug1.descriptors (InputL (None, 5270) 0 __________________________________________________________________________________________________ input.drug1.fingerprints (Input (None, 2048) 0 
__________________________________________________________________________________________________ cell.rnaseq (Model) (None, 1000) 2945000 input.cell.rnaseq[0][0] __________________________________________________________________________________________________ drug.descriptors (Model) (None, 1000) 7273000 input.drug1.descriptors[0][0] __________________________________________________________________________________________________ drug.fingerprints (Model) (None, 1000) 4051000 input.drug1.fingerprints[0][0] __________________________________________________________________________________________________ concatenate_1 (Concatenate) (None, 3000) 0 cell.rnaseq[1][0] drug.descriptors[1][0] drug.fingerprints[1][0] __________________________________________________________________________________________________ dense_10 (Dense) (None, 1000) 3001000 concatenate_1[0][0] __________________________________________________________________________________________________ permanent_dropout_10 (Permanent (None, 1000) 0 dense_10[0][0] __________________________________________________________________________________________________ dense_11 (Dense) (None, 1000) 1001000 permanent_dropout_10[0][0] __________________________________________________________________________________________________ permanent_dropout_11 (Permanent (None, 1000) 0 dense_11[0][0] __________________________________________________________________________________________________ dense_12 (Dense) (None, 1000) 1001000 permanent_dropout_11[0][0] __________________________________________________________________________________________________ permanent_dropout_12 (Permanent (None, 1000) 0 dense_12[0][0] __________________________________________________________________________________________________ dense_13 (Dense) (None, 1) 1001 permanent_dropout_12[0][0] ================================================================================================== Total params: 19,273,001 Trainable params: 19,273,001 Non-trainable params: 0 __________________________________________________________________________________________________ Training homoscedastic model: partition:train, rank:0, sharded index size:2784, batch_size:32, steps:87 partition:val, rank:0, sharded index size:704, batch_size:32, steps:22 Between random pairs in y_val: mse: 0.0604 mae: 0.1978 r2: -0.9105 corr: 0.0447 Data points per epoch: train = 2784, val = 704 Steps per epoch: train = 87, val = 22 Epoch 1/10 87/87 [==============================] - 15s 174ms/step - loss: 0.2165 - mae: 0.2144 - r2: -6.4761 - val_loss: 0.0247 - val_mae: 0.1244 - val_r2: 0.1916 Current time ....15.176 Epoch 2/10 87/87 [==============================] - 12s 142ms/step - loss: 0.0247 - mae: 0.1240 - r2: 0.1302 - val_loss: 0.0208 - val_mae: 0.1147 - val_r2: 0.3058 Current time ....28.323 Epoch 3/10 87/87 [==============================] - 12s 143ms/step - loss: 0.0219 - mae: 0.1157 - r2: 0.2278 - val_loss: 0.0197 - val_mae: 0.1112 - val_r2: 0.3565 Current time ....41.321 Epoch 4/10 87/87 [==============================] - 12s 143ms/step - loss: 0.0203 - mae: 0.1111 - r2: 0.2897 - val_loss: 0.0182 - val_mae: 0.1072 - val_r2: 0.3980 Current time ....54.330 Epoch 5/10 87/87 [==============================] - 13s 153ms/step - loss: 0.0187 - mae: 0.1066 - r2: 0.3388 - val_loss: 0.0189 - val_mae: 0.1090 - val_r2: 0.3804 Current time ....68.120 Epoch 6/10 87/87 [==============================] - 13s 148ms/step - loss: 0.0185 - mae: 0.1075 - r2: 0.3412 - val_loss: 0.0186 - val_mae: 0.1088 - 
val_r2: 0.3921 Current time ....80.967 Epoch 7/10 87/87 [==============================] - 13s 147ms/step - loss: 0.0185 - mae: 0.1069 - r2: 0.3468 - val_loss: 0.0177 - val_mae: 0.1043 - val_r2: 0.4259 Current time ....93.769 Epoch 8/10 87/87 [==============================] - 13s 150ms/step - loss: 0.0176 - mae: 0.1031 - r2: 0.3791 - val_loss: 0.0159 - val_mae: 0.0994 - val_r2: 0.4793 Current time ....107.421 Epoch 9/10 87/87 [==============================] - 13s 150ms/step - loss: 0.0177 - mae: 0.1034 - r2: 0.3745 - val_loss: 0.0161 - val_mae: 0.1000 - val_r2: 0.4696 Current time ....120.945 Epoch 10/10 87/87 [==============================] - 14s 159ms/step - loss: 0.0169 - mae: 0.1022 - r2: 0.4086 - val_loss: 0.0173 - val_mae: 0.1029 - val_r2: 0.4337 Current time ....134.744 Comparing y_true and y_pred: mse: 0.0165 mae: 0.1016 r2: 0.4782 corr: 0.7072 Testing predictions stored in file: save_default/uno.A=relu.B=32.E=10.O=sgd.LS=mse.LR=0.01.CF=r.DF=df.DR=0.1.L1000.D1=1000.D2=1000.D3=1000.predicted.tsv Model stored in file: save_default/uno.A=relu.B=32.E=10.O=sgd.LS=mse.LR=0.01.CF=r.DF=df.DR=0.1.L1000.D1=1000.D2=1000.D3=1000.model.json Model stored in file: save_default/uno.A=relu.B=32.E=10.O=sgd.LS=mse.LR=0.01.CF=r.DF=df.DR=0.1.L1000.D1=1000.D2=1000.D3=1000.model.h5 Model weights stored in file: save_default//default.weights.h5 partition:test, rank:0, sharded index size:0, batch_size:32, steps:0 ``` ## Infer The infer script runs inference with a trained model, as in the underlying Uno benchmark. The script can use a pre-generated data file, or it can construct the data for inference if a set of identifiers is provided. The argument --uq_infer_file must be used to specify the name of the file with the data (or the identifiers) to use for inference. Additionally, if the data needs to be constructed, then one of the following arguments should be used to specify what type of identifiers the file contains: - --uq_infer_given_drugs=True if the file contains a set of drug IDs. - --uq_infer_given_cells=True if the file contains a set of cell IDs. - --uq_infer_given_indices=True if the file contains a set of indices. Note that the latter works only if all the arguments for the data construction are set as well (usually those are taken from the model configuration file). Of course, this specification and the trained model must be consistent for the script to work. Likewise, if a pre-generated file is provided, the features it includes and the trained model must be consistent. Note also that the --loss heteroscedastic option should be specified if the model was trained to predict the heterogeneous noise as well. #### Example output This assumes that a trained model (files default.model.json and default.weights.h5) is available in the save_default folder. A sample json file compatible with the default model used in the training demo script is provided. After running the training script, a default.weights.h5 file should be generated. Together, these can be used for testing the inference demo script and should produce output similar to the one shown next. ``` python uno_inferUQ_keras2.py Using TensorFlow backend. 
Importing candle utils for keras Configuration file: uno_defaultUQinfer_model.txt Params: {'activation': 'relu', 'agg_dose': 'AUC', 'base_lr': None, 'batch_normalization': False, 'batch_size': 32, 'by_cell': None, 'by_drug': None, 'cache': None, 'cell_feature_subset_path': '', 'cell_features': ['rnaseq'], 'cell_subset_path': '', 'cell_types': None, 'cp': False, 'cv': 1, 'datatype': , 'dense': [1000, 1000, 1000], 'dense_feature_layers': [1000, 1000, 1000], 'drop': 0.1, 'drug_feature_subset_path': '', 'drug_features': ['descriptors', 'fingerprints'], 'drug_median_response_max': 1, 'drug_median_response_min': -1, 'drug_subset_path': '', 'epochs': 10, 'exclude_cells': [], 'exclude_drugs': [], 'experiment_id': 'EXP000', 'export_csv': None, 'export_data': None, 'feature_subsample': 0, 'feature_subset_path': '', 'gpus': [], 'growth_bins': 0, 'initial_weights': None, 'learning_rate': None, 'logfile': None, 'loss': 'mse', 'max_val_loss': 1.0, 'model_file': 'save_default/default.model.json', 'n_pred': 10, 'no_feature_source': True, 'no_gen': False, 'no_response_source': True, 'optimizer': 'sgd', 'output_dir': './Output/EXP000/RUN000', 'partition_by': 'cell', 'preprocess_rnaseq': 'none', 'reduce_lr': False, 'residual': False, 'rng_seed': 2018, 'run_id': 'RUN000', 'sample_repetition': False, 'save_path': 'save_default/', 'save_weights': None, 'scaling': 'std', 'shuffle': False, 'single': True, 'solr_root': '', 'tb': False, 'tb_prefix': 'tb', 'test_sources': ['train'], 'timeout': 3600, 'train_bool': True, 'train_sources': ['gCSI'], 'uq_infer_file': 'save_default/infer_cell_ids', 'uq_infer_given_cells': True, 'uq_infer_given_drugs': False, 'uq_infer_given_indices': False, 'use_exported_data': None, 'use_filtered_genes': False, 'use_landmark_genes': True, 'val_split': 0.2, 'verbose': None, 'warmup_lr': False, 'weights_file': 'save_default/default.weights.h5'} __________________________________________________________________________________________________ Layer (type) Output Shape Param # Connected to ================================================================================================== input.cell.rnaseq (InputLayer) (None, 942) 0 __________________________________________________________________________________________________ input.drug1.descriptors (InputL (None, 5270) 0 __________________________________________________________________________________________________ input.drug1.fingerprints (Input (None, 2048) 0 __________________________________________________________________________________________________ cell.rnaseq (Model) (None, 1000) 2945000 input.cell.rnaseq[0][0] __________________________________________________________________________________________________ drug.descriptors (Model) (None, 1000) 7273000 input.drug1.descriptors[0][0] __________________________________________________________________________________________________ drug.fingerprints (Model) (None, 1000) 4051000 input.drug1.fingerprints[0][0] __________________________________________________________________________________________________ concatenate_1 (Concatenate) (None, 3000) 0 cell.rnaseq[1][0] drug.descriptors[1][0] drug.fingerprints[1][0] __________________________________________________________________________________________________ dense_10 (Dense) (None, 1000) 3001000 concatenate_1[0][0] __________________________________________________________________________________________________ permanent_dropout_10 (Permanent (None, 1000) 0 dense_10[0][0] 
__________________________________________________________________________________________________ dense_11 (Dense) (None, 1000) 1001000 permanent_dropout_10[0][0] __________________________________________________________________________________________________ permanent_dropout_11 (Permanent (None, 1000) 0 dense_11[0][0] __________________________________________________________________________________________________ dense_12 (Dense) (None, 1000) 1001000 permanent_dropout_11[0][0] __________________________________________________________________________________________________ permanent_dropout_12 (Permanent (None, 1000) 0 dense_12[0][0] __________________________________________________________________________________________________ dense_13 (Dense) (None, 1) 1001 permanent_dropout_12[0][0] ================================================================================================== Total params: 19,273,001 Trainable params: 19,273,001 Non-trainable params: 0 __________________________________________________________________________________________________ partition:test, rank:0, sharded index size:0, batch_size:32, steps:0 Read file: save_default/infer_cell_ids Number of elements read: 72 Comparing y_true and y_pred: mse: 0.0173 mae: 0.1012 r2: 0.4687 corr: 0.7001 Comparing y_true and y_pred: mse: 0.0172 mae: 0.1005 r2: 0.4720 corr: 0.7010 Comparing y_true and y_pred: mse: 0.0171 mae: 0.1033 r2: 0.4751 corr: 0.7064 Comparing y_true and y_pred: mse: 0.0175 mae: 0.1045 r2: 0.4627 corr: 0.6945 Comparing y_true and y_pred: mse: 0.0162 mae: 0.1007 r2: 0.5017 corr: 0.7277 Comparing y_true and y_pred: mse: 0.0166 mae: 0.1008 r2: 0.4921 corr: 0.7141 Comparing y_true and y_pred: mse: 0.0181 mae: 0.1059 r2: 0.4443 corr: 0.6878 Comparing y_true and y_pred: mse: 0.0167 mae: 0.1015 r2: 0.4875 corr: 0.7087 Comparing y_true and y_pred: mse: 0.0169 mae: 0.1032 r2: 0.4805 corr: 0.7106 Comparing y_true and y_pred: mse: 0.0169 mae: 0.0999 r2: 0.4817 corr: 0.7075 Predictions stored in file: save_default/uno.A=relu.B=32.E=10.O=sgd.LS=mse.LR=None.CF=r.DF=df.DR=0.1.L1000.D1=1000.D2=1000.D3=1000.predicted_INFER.tsv ``` ## Empirical Calibration Scripts included in the calibration subfolder compute empirical calibration for the inference results. The scripts with suffix HOM compute empirical calibration for inference with a homoscedastic model, while the script with suffix HET computes empirical calibration for inference with a heteroscedastic model. To run the scripts, it is necessary to provide the path and the name of the file with the inference results. Note that the file with the inference results is assumed to include each realization of the inference (implicit in the 'all' suffix), but for the homoscedastic case a script is also provided to process an inference file with only the consolidated statistics (generally the average over all the realizations). Also, note that a specific format of the file with the inference results is assumed; a set of default values, reflecting the format of the current CANDLE infer scripts, is used. More arbitrary formats may be usable if they use similar column offsets, but this requires passing the right parameters to the function that reads the inference file. The scripts generate a series of plots and pickle (dill) files, displaying and encoding the computed empirical calibration. \ No newline at end of file +## Uno_UQ: Predicting Tumor Dose Response across Multiple Data Sources with added UQ functionality. 
## Functionality Uno_UQ adds uncertainty quantification (UQ) functionality to the Uno model. For information about the underlying model, please refer to the Uno benchmark. This page gives an overview of the added UQ functionality, which includes: - Generation of a holdout set. - Training excluding the holdout set. - Inference for the specified data. - Training for homoscedastic and heteroscedastic models. - Empirical calibration of UQ for the trained models. ## Holdout The holdout script generates a set of identifiers to hold out during training, depending on the --partition_by argument. If --partition_by is 'drug_pair' it generates a set of drug IDs. If --partition_by is 'cell' it generates a set of cell IDs. In any other case it generates a set of indices. The fraction to reserve in the holdout set is given by the --val_split argument. #### Example output ``` python uno_holdoutUQ_data.py Using TensorFlow backend. Importing candle utils for keras Params: {'activation': 'relu', 'agg_dose': 'AUC', 'base_lr': None, 'batch_normalization': False, 'batch_size': 32, 'by_cell': None, 'by_drug': None, 'cache': None, 'cell_feature_subset_path': '', 'cell_features': ['rnaseq'], 'cell_subset_path': '', 'cell_types': None, 'cp': False, 'cv': 1, 'data_type': , 'dense': [1000, 1000, 1000], 'dense_feature_layers': [1000, 1000, 1000], 'dropout': 0.1, 'drug_feature_subset_path': '', 'drug_features': ['descriptors', 'fingerprints'], 'drug_median_response_max': 1, 'drug_median_response_min': -1, 'drug_subset_path': '', 'epochs': 10, 'exclude_cells': [], 'exclude_drugs': [], 'experiment_id': 'EXP000', 'export_csv': None, 'export_data': None, 'feature_subsample': 0, 'feature_subset_path': '', 'gpus': [], 'growth_bins': 0, 'initial_weights': None, 'learning_rate': 0.01, 'logfile': None, 'loss': 'mse', 'max_val_loss': 1.0, 'no_feature_source': True, 'no_gen': False, 'no_response_source': True, 'optimizer': 'sgd', 'output_dir': './Output/EXP000/RUN000', 'partition_by': 'cell', 'preprocess_rnaseq': 'none', 'profiling': False, 'reduce_lr': False, 'residual': False, 'rng_seed': 2018, 'run_id': 'RUN000', 'sample_repetition': False, 'save_path': 'save_default/', 'save_weights': 'default.weights.h5', 'scaling': 'std', 'shuffle': False, 'single': True, 'solr_root': '', 'tb': False, 'tb_prefix': 'tb', 'test_sources': ['train'], 'timeout': 3600, 'train_bool': True, 'train_sources': ['gCSI'], 'use_exported_data': None, 'use_filtered_genes': False, 'use_landmark_genes': True, 'val_split': 0.2, 'verbose': None, 'warmup_lr': False} partition_by: cell Cell IDs in holdout set written in file: save_default/infer_cell_ids ``` ## Train The train script trains the model, as in the underlying Uno benchmark, but excluding the IDs in the holdout file. The file with the holdout set should be provided via one of the following arguments: - --uq_exclude_drugs_file='file' if the file contains a set of drug IDs. - --uq_exclude_cells_file='file' if the file contains a set of cell IDs. - --uq_exclude_indices_file='file' if the file contains a set of indices. An additional --loss heteroscedastic option is available. This will learn the input-dependent noise level as well as the main regression variable specified (i.e. growth or AUC). #### Example output ``` python uno_trainUQ_keras2.py --cp True --uq_exclude_cells_file 'save_default/infer_cell_ids' Using TensorFlow backend. 
Importing candle utils for keras Params: {'activation': 'relu', 'agg_dose': 'AUC', 'base_lr': None, 'batch_normalization': False, 'batch_size': 32, 'by_cell': None, 'by_drug': None, 'cache': None, 'cell_feature_subset_path': '', 'cell_features': ['rnaseq'], 'cell_subset_path': '', 'cell_types': None, 'cp': True, 'cv': 1, 'data_type': , 'dense': [1000, 1000, 1000], 'dense_feature_layers': [1000, 1000, 1000], 'dropout': 0.1, 'drug_feature_subset_path': '', 'drug_features': ['descriptors', 'fingerprints'], 'drug_median_response_max': 1, 'drug_median_response_min': -1, 'drug_subset_path': '', 'epochs': 10, 'exclude_cells': [], 'exclude_drugs': [], 'exclude_indices': [], 'experiment_id': 'EXP000', 'export_csv': None, 'export_data': None, 'feature_subsample': 0, 'feature_subset_path': '', 'gpus': [], 'growth_bins': 0, 'initial_weights': None, 'learning_rate': 0.01, 'logfile': None, 'loss': 'mse', 'max_val_loss': 1.0, 'no_feature_source': True, 'no_gen': False, 'no_response_source': True, 'optimizer': 'sgd', 'output_dir': './Output/EXP000/RUN000', 'partition_by': 'cell', 'preprocess_rnaseq': 'none', 'reduce_lr': False, 'reg_l2': 0.0, 'residual': False, 'rng_seed': 2018, 'run_id': 'RUN000', 'sample_repetition': False, 'save_path': 'save_default/', 'save_weights': 'saved.weights.h5', 'scaling': 'std', 'shuffle': False, 'single': True, 'solr_root': '', 'tb': False, 'tb_prefix': 'tb', 'test_sources': ['train'], 'timeout': 3600, 'train_bool': True, 'train_sources': ['gCSI'], 'uq_exclude_cells_file': 'save_default/infer_cell_ids', 'use_exported_data': None, 'use_filtered_genes': False, 'use_landmark_genes': True, 'val_split': 0.2, 'verbose': None, 'warmup_lr': False} Read file: save_default/infer_cell_ids Number of elements read: 72 Cells to exclude: ['gCSI.NCI-H889', 'gCSI.MEWO', 'gCSI.PA-TU-8902', 'gCSI.BCPAP', 'gCSI.CAL-12T', 'gCSI.NCI-H727', 'gCSI.HUH-1', 'gCSI.NUGC-4', 'gCSI.MKN74', 'gCSI.PK-1', 'gCSI.A2058', 'gCSI.RAJI', 'gCSI.JHH-7', 'gCSI.SUIT-2', 'gCSI.OE21', 'gCSI.HCC1806', 'gCSI.PANC-10-05', 'gCSI.RMG-I', 'gCSI.NCI-H1703', 'gCSI.KMS-34', 'gCSI.G-361', 'gCSI.EPLC-272H', 'gCSI.HEP-G2', 'gCSI.RERF-LC-MS', 'gCSI.COLO-800', 'gCSI.KM12', 'gCSI.DOHH-2', 'gCSI.EFM-19', 'gCSI.MDA-MB-468', 'gCSI.MHH-ES-1', 'gCSI.IPC-298', 'gCSI.GRANTA-519', 'gCSI.8305C', 'gCSI.KYSE-140', 'gCSI.MALME-3M', 'gCSI.MIA-PACA-2', 'gCSI.NCI-H1666', 'gCSI.PC-3', 'gCSI.RT4', 'gCSI.HUP-T4', 'gCSI.NCI-H1869', 'gCSI.WM-266-4', 'gCSI.KMM-1', 'gCSI.OE33', 'gCSI.SU-DHL-6', 'gCSI.QGP-1', 'gCSI.IGR-37', 'gCSI.VMRC-RCW', 'gCSI.NCI-H1838', 'gCSI.SW948', 'gCSI.COLO-679', 'gCSI.CAL-51', 'gCSI.HUCCT1', 'gCSI.LP-1', 'gCSI.RPMI-7951', 'gCSI.HPAF-II', 'gCSI.OCUM-1', 'gCSI.HOP-92', 'gCSI.NCI-H661', 'gCSI.TOV-112D', 'gCSI.PANC-03-27', 'gCSI.AGS', 'gCSI.HEC-59', 'gCSI.LN-18', 'gCSI.U-87-MG', 'gCSI.U-2-OS', 'gCSI.ABC-1', 'gCSI.IGR-1', 'gCSI.SK-MEL-3', 'gCSI.A549', 'gCSI.HCC4006', 'gCSI.NCI-H1355'] Combined model: __________________________________________________________________________________________________ Layer (type) Output Shape Param # Connected to ================================================================================================== input.cell.rnaseq (InputLayer) (None, 942) 0 __________________________________________________________________________________________________ input.drug1.descriptors (InputL (None, 5270) 0 __________________________________________________________________________________________________ input.drug1.fingerprints (Input (None, 2048) 0 
__________________________________________________________________________________________________ cell.rnaseq (Model) (None, 1000) 2945000 input.cell.rnaseq[0][0] __________________________________________________________________________________________________ drug.descriptors (Model) (None, 1000) 7273000 input.drug1.descriptors[0][0] __________________________________________________________________________________________________ drug.fingerprints (Model) (None, 1000) 4051000 input.drug1.fingerprints[0][0] __________________________________________________________________________________________________ concatenate_1 (Concatenate) (None, 3000) 0 cell.rnaseq[1][0] drug.descriptors[1][0] drug.fingerprints[1][0] __________________________________________________________________________________________________ dense_10 (Dense) (None, 1000) 3001000 concatenate_1[0][0] __________________________________________________________________________________________________ permanent_dropout_10 (Permanent (None, 1000) 0 dense_10[0][0] __________________________________________________________________________________________________ dense_11 (Dense) (None, 1000) 1001000 permanent_dropout_10[0][0] __________________________________________________________________________________________________ permanent_dropout_11 (Permanent (None, 1000) 0 dense_11[0][0] __________________________________________________________________________________________________ dense_12 (Dense) (None, 1000) 1001000 permanent_dropout_11[0][0] __________________________________________________________________________________________________ permanent_dropout_12 (Permanent (None, 1000) 0 dense_12[0][0] __________________________________________________________________________________________________ dense_13 (Dense) (None, 1) 1001 permanent_dropout_12[0][0] ================================================================================================== Total params: 19,273,001 Trainable params: 19,273,001 Non-trainable params: 0 __________________________________________________________________________________________________ Training homoscedastic model: partition:train, rank:0, sharded index size:2784, batch_size:32, steps:87 partition:val, rank:0, sharded index size:704, batch_size:32, steps:22 Between random pairs in y_val: mse: 0.0604 mae: 0.1978 r2: -0.9105 corr: 0.0447 Data points per epoch: train = 2784, val = 704 Steps per epoch: train = 87, val = 22 Epoch 1/10 87/87 [==============================] - 15s 174ms/step - loss: 0.2165 - mae: 0.2144 - r2: -6.4761 - val_loss: 0.0247 - val_mae: 0.1244 - val_r2: 0.1916 Current time ....15.176 Epoch 2/10 87/87 [==============================] - 12s 142ms/step - loss: 0.0247 - mae: 0.1240 - r2: 0.1302 - val_loss: 0.0208 - val_mae: 0.1147 - val_r2: 0.3058 Current time ....28.323 Epoch 3/10 87/87 [==============================] - 12s 143ms/step - loss: 0.0219 - mae: 0.1157 - r2: 0.2278 - val_loss: 0.0197 - val_mae: 0.1112 - val_r2: 0.3565 Current time ....41.321 Epoch 4/10 87/87 [==============================] - 12s 143ms/step - loss: 0.0203 - mae: 0.1111 - r2: 0.2897 - val_loss: 0.0182 - val_mae: 0.1072 - val_r2: 0.3980 Current time ....54.330 Epoch 5/10 87/87 [==============================] - 13s 153ms/step - loss: 0.0187 - mae: 0.1066 - r2: 0.3388 - val_loss: 0.0189 - val_mae: 0.1090 - val_r2: 0.3804 Current time ....68.120 Epoch 6/10 87/87 [==============================] - 13s 148ms/step - loss: 0.0185 - mae: 0.1075 - r2: 0.3412 - val_loss: 0.0186 - val_mae: 0.1088 - 
val_r2: 0.3921 Current time ....80.967 Epoch 7/10 87/87 [==============================] - 13s 147ms/step - loss: 0.0185 - mae: 0.1069 - r2: 0.3468 - val_loss: 0.0177 - val_mae: 0.1043 - val_r2: 0.4259 Current time ....93.769 Epoch 8/10 87/87 [==============================] - 13s 150ms/step - loss: 0.0176 - mae: 0.1031 - r2: 0.3791 - val_loss: 0.0159 - val_mae: 0.0994 - val_r2: 0.4793 Current time ....107.421 Epoch 9/10 87/87 [==============================] - 13s 150ms/step - loss: 0.0177 - mae: 0.1034 - r2: 0.3745 - val_loss: 0.0161 - val_mae: 0.1000 - val_r2: 0.4696 Current time ....120.945 Epoch 10/10 87/87 [==============================] - 14s 159ms/step - loss: 0.0169 - mae: 0.1022 - r2: 0.4086 - val_loss: 0.0173 - val_mae: 0.1029 - val_r2: 0.4337 Current time ....134.744 Comparing y_true and y_pred: mse: 0.0165 mae: 0.1016 r2: 0.4782 corr: 0.7072 Testing predictions stored in file: save_default/uno.A=relu.B=32.E=10.O=sgd.LS=mse.LR=0.01.CF=r.DF=df.DR=0.1.L1000.D1=1000.D2=1000.D3=1000.predicted.tsv Model stored in file: save_default/uno.A=relu.B=32.E=10.O=sgd.LS=mse.LR=0.01.CF=r.DF=df.DR=0.1.L1000.D1=1000.D2=1000.D3=1000.model.json Model stored in file: save_default/uno.A=relu.B=32.E=10.O=sgd.LS=mse.LR=0.01.CF=r.DF=df.DR=0.1.L1000.D1=1000.D2=1000.D3=1000.model.h5 Model weights stored in file: save_default//default.weights.h5 partition:test, rank:0, sharded index size:0, batch_size:32, steps:0 ``` ## Infer The infer script runs inference with a trained model, as in the underlying Uno benchmark. The script can use a pre-generated data file, or it can construct the data for inference if a set of identifiers is provided. The argument --uq_infer_file must be used to specify the name of the file with the data (or the identifiers) to use for inference. Additionally, if the data needs to be constructed, then one of the following arguments should be used to specify what type of identifiers the file contains: - --uq_infer_given_drugs=True if the file contains a set of drug IDs. - --uq_infer_given_cells=True if the file contains a set of cell IDs. - --uq_infer_given_indices=True if the file contains a set of indices. Note that the latter works only if all the arguments for the data construction are set as well (usually those are taken from the model configuration file). Of course, this specification and the trained model must be consistent for the script to work. Likewise, if a pre-generated file is provided, the features it includes and the trained model must be consistent. Note also that the --loss heteroscedastic option should be specified if the model was trained to predict the heterogeneous noise as well. #### Example output This assumes that a trained model (files default.model.json and default.weights.h5) is available in the save_default folder. A sample json file compatible with the default model used in the training demo script is provided. After running the training script, a default.weights.h5 file should be generated. Together, these can be used for testing the inference demo script and should produce output similar to the one shown next. ``` python uno_inferUQ_keras2.py --uq_infer_file save_default/infer_cell_ids --uq_infer_given_cells True --model_file save_default/uno.A\=relu.B\=32.E\=10.O\=sgd.LS\=mse.LR\=0.01.CF\=r.DF\=df.DR\=0.1.L1000.D1\=1000.D2\=1000.D3\=1000.model.h5 --weights_file save_default/saved.weights.h5 --n_pred 10 Using TensorFlow backend. 
Importing candle utils for keras Params: {'activation': 'relu', 'agg_dose': 'AUC', 'base_lr': None, 'batch_normalization': False, 'batch_size': 32, 'by_cell': None, 'by_drug': None, 'cache': None, 'cell_feature_subset_path': '', 'cell_features': ['rnaseq'], 'cell_subset_path': '', 'cell_types': None, 'cp': False, 'cv': 1, 'data_type': , 'dense': [1000, 1000, 1000], 'dense_feature_layers': [1000, 1000, 1000], 'dropout': 0.1, 'drug_feature_subset_path': '', 'drug_features': ['descriptors', 'fingerprints'], 'drug_median_response_max': 1, 'drug_median_response_min': -1, 'drug_subset_path': '', 'epochs': 10, 'exclude_cells': [], 'exclude_drugs': [], 'experiment_id': 'EXP000', 'export_csv': None, 'export_data': None, 'feature_subsample': 0, 'feature_subset_path': '', 'gpus': [], 'growth_bins': 0, 'initial_weights': None, 'learning_rate': 0.01, 'logfile': None, 'loss': 'mse', 'max_val_loss': 1.0, 'model_file': 'save_default/default.model.json', 'n_pred': 10, 'no_feature_source': True, 'no_gen': False, 'no_response_source': True, 'optimizer': 'sgd', 'output_dir': './Output/EXP000/RUN000', 'partition_by': 'cell', 'preprocess_rnaseq': 'none', 'profiling': False 'reduce_lr': False, 'residual': False, 'rng_seed': 2018, 'run_id': 'RUN000', 'sample_repetition': False, 'save_path': 'save_default/', 'save_weights': None, 'scaling': 'std', 'shuffle': False, 'single': True, 'solr_root': '', 'tb': False, 'tb_prefix': 'tb', 'test_sources': ['train'], 'timeout': 3600, 'train_bool': True, 'train_sources': ['gCSI'], 'uq_infer_file': 'save_default/infer_cell_ids', 'uq_infer_given_cells': True, 'uq_infer_given_drugs': False, 'uq_infer_given_indices': False, 'use_exported_data': None, 'use_filtered_genes': False, 'use_landmark_genes': True, 'val_split': 0.2, 'verbose': None, 'warmup_lr': False, 'weights_file': 'save_default/saved.weights.h5'} __________________________________________________________________________________________________ Layer (type) Output Shape Param # Connected to ================================================================================================== input.cell.rnaseq (InputLayer) (None, 942) 0 __________________________________________________________________________________________________ input.drug1.descriptors (InputL (None, 5270) 0 __________________________________________________________________________________________________ input.drug1.fingerprints (Input (None, 2048) 0 __________________________________________________________________________________________________ cell.rnaseq (Model) (None, 1000) 2945000 input.cell.rnaseq[0][0] __________________________________________________________________________________________________ drug.descriptors (Model) (None, 1000) 7273000 input.drug1.descriptors[0][0] __________________________________________________________________________________________________ drug.fingerprints (Model) (None, 1000) 4051000 input.drug1.fingerprints[0][0] __________________________________________________________________________________________________ concatenate_1 (Concatenate) (None, 3000) 0 cell.rnaseq[1][0] drug.descriptors[1][0] drug.fingerprints[1][0] __________________________________________________________________________________________________ dense_10 (Dense) (None, 1000) 3001000 concatenate_1[0][0] __________________________________________________________________________________________________ permanent_dropout_10 (Permanent (None, 1000) 0 dense_10[0][0] 
__________________________________________________________________________________________________ dense_11 (Dense) (None, 1000) 1001000 permanent_dropout_10[0][0] __________________________________________________________________________________________________ permanent_dropout_11 (Permanent (None, 1000) 0 dense_11[0][0] __________________________________________________________________________________________________ dense_12 (Dense) (None, 1000) 1001000 permanent_dropout_11[0][0] __________________________________________________________________________________________________ permanent_dropout_12 (Permanent (None, 1000) 0 dense_12[0][0] __________________________________________________________________________________________________ dense_13 (Dense) (None, 1) 1001 permanent_dropout_12[0][0] ================================================================================================== Total params: 19,273,001 Trainable params: 19,273,001 Non-trainable params: 0 __________________________________________________________________________________________________ partition:test, rank:0, sharded index size:0, batch_size:32, steps:0 Read file: save_default/infer_cell_ids Number of elements read: 72 Comparing y_true and y_pred: mse: 0.0173 mae: 0.1012 r2: 0.4687 corr: 0.7001 Comparing y_true and y_pred: mse: 0.0172 mae: 0.1005 r2: 0.4720 corr: 0.7010 Comparing y_true and y_pred: mse: 0.0171 mae: 0.1033 r2: 0.4751 corr: 0.7064 Comparing y_true and y_pred: mse: 0.0175 mae: 0.1045 r2: 0.4627 corr: 0.6945 Comparing y_true and y_pred: mse: 0.0162 mae: 0.1007 r2: 0.5017 corr: 0.7277 Comparing y_true and y_pred: mse: 0.0166 mae: 0.1008 r2: 0.4921 corr: 0.7141 Comparing y_true and y_pred: mse: 0.0181 mae: 0.1059 r2: 0.4443 corr: 0.6878 Comparing y_true and y_pred: mse: 0.0167 mae: 0.1015 r2: 0.4875 corr: 0.7087 Comparing y_true and y_pred: mse: 0.0169 mae: 0.1032 r2: 0.4805 corr: 0.7106 Comparing y_true and y_pred: mse: 0.0169 mae: 0.0999 r2: 0.4817 corr: 0.7075 Predictions stored in file: save_default/uno.A=relu.B=32.E=10.O=sgd.LS=mse.LR=None.CF=r.DF=df.DR=0.1.L1000.D1=1000.D2=1000.D3=1000.predicted_INFER.tsv ``` ## Empirical Calibration Scripts included in the calibration subfolder compute empirical calibration for the inference results. The scripts with suffix HOM compute empirical calibration for inference with a homoscedastic model, while the script with suffix HET computes empirical calibration for inference with a heteroscedastic model. To run the scripts, it is necessary to provide the path and the name of the file with the inference results. Note that the file with the inference results is assumed to include each realization of the inference (implicit in the 'all' suffix), but for the homoscedastic case a script is also provided to process an inference file with only the consolidated statistics (generally the average over all the realizations). Also, note that a specific format of the file with the inference results is assumed; a set of default values, reflecting the format of the current CANDLE infer scripts, is used. More arbitrary formats may be usable if they use similar column offsets, but this requires passing the right parameters to the function that reads the inference file. The scripts generate a series of plots and pickle (dill) files, displaying and encoding the computed empirical calibration. 
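For the --loss heteroscedastic option described in the Train and Infer sections above, the general idea is that the network predicts an input-dependent variance alongside the regression target and is trained with a Gaussian negative log-likelihood. The sketch below is only a generic TF/Keras illustration of that idea, not the loss implemented in uno_trainUQ_keras2.py; the layer sizes and random data are made up.

```
import numpy as np
import tensorflow as tf
from tensorflow import keras

def heteroscedastic_gaussian_nll(y_true, y_pred):
    # The model emits two values per sample: a predicted mean and a log-variance.
    mean = y_pred[:, 0:1]
    log_var = y_pred[:, 1:2]
    # Gaussian negative log-likelihood (up to a constant): large errors are tolerated
    # where the predicted variance is large, while log_var itself is penalized.
    return tf.reduce_mean(0.5 * tf.exp(-log_var) * tf.square(y_true - mean) + 0.5 * log_var)

# Tiny regressor with a 2-unit head standing in for the real Uno tower.
inputs = keras.Input(shape=(16,))
hidden = keras.layers.Dense(32, activation='relu')(inputs)
outputs = keras.layers.Dense(2)(hidden)  # [mean, log_variance]
model = keras.Model(inputs, outputs)
model.compile(optimizer='sgd', loss=heteroscedastic_gaussian_nll)

x = np.random.rand(256, 16).astype('float32')
y = np.random.rand(256, 1).astype('float32')
model.fit(x, y, batch_size=32, epochs=1, verbose=0)
```

In a setup like this, the per-sample variance predicted at inference time is the quantity that a heteroscedastic calibration step (the HET scripts above) can then examine.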
\ No newline at end of file diff --git a/Pilot1/Uno_UQ/uno_defaultUQ_model.txt b/Pilot1/Uno_UQ/uno_defaultUQ_model.txt index 67d6b247..fc119df8 100644 --- a/Pilot1/Uno_UQ/uno_defaultUQ_model.txt +++ b/Pilot1/Uno_UQ/uno_defaultUQ_model.txt @@ -25,7 +25,7 @@ batch_normalization=False feature_subsample=0 rng_seed=2018 save_path='save_default/' -save_weights='default.weights.h5' +save_weights='saved.weights.h5' no_gen=False verbose = False single=True @@ -34,7 +34,6 @@ no_feature_source=True no_response_source=True use_landmark_genes=True partition_by='cell' -uq_exclude_cells_file='save_default/infer_cell_ids' [Monitor_Params] solr_root='' diff --git a/Pilot1/Uno_UQ/uno_holdoutUQ_data.py b/Pilot1/Uno_UQ/uno_holdoutUQ_data.py index d163ca06..feb6ee49 100644 --- a/Pilot1/Uno_UQ/uno_holdoutUQ_data.py +++ b/Pilot1/Uno_UQ/uno_holdoutUQ_data.py @@ -76,21 +76,21 @@ def run(params): print('partition_by: ', args.partition_by) if args.partition_by == 'drug_pair': - fname_drugs = 'infer_drug_ids' + fname_drugs = args.save_path + 'infer_drug_ids' pds = loader.get_drugs_in_val() with open(fname_drugs, 'w') as f: for item in pds: f.write('%s\n' % item) logger.info('Drug IDs in holdout set written in file: {}'.format(fname_drugs)) elif args.partition_by == 'cell': - fname_cells = 'infer_cell_ids' + fname_cells = args.save_path + 'infer_cell_ids' pcs = loader.get_cells_in_val() with open(fname_cells, 'w') as f: for item in pcs: f.write('%s\n' % item) logger.info('Cell IDs in holdout set written in file: {}'.format(fname_cells)) else : # - fname_index = 'infer_index_ids' + fname_index = args.save_path + 'infer_index_ids' pins = loader.get_index_in_val() with open(fname_index, 'w') as f: for item in pins: From b8e1a4dc4806292e6671ce90a7da16b7593b51b5 Mon Sep 17 00:00:00 2001 From: Cristina Garcia Cardona Date: Wed, 8 Apr 2020 22:51:19 -0600 Subject: [PATCH 222/331] fixed params bug in p1b2 --- Pilot1/P1B2/p1b2_baseline_keras2.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Pilot1/P1B2/p1b2_baseline_keras2.py b/Pilot1/P1B2/p1b2_baseline_keras2.py index 7122eac0..a8e85dff 100644 --- a/Pilot1/P1B2/p1b2_baseline_keras2.py +++ b/Pilot1/P1B2/p1b2_baseline_keras2.py @@ -34,10 +34,10 @@ def run(gParameters): # Construct extension to save model ext = p1b2.extension_from_parameters(gParameters, '.keras') - candle.verify_path(params['save_path']) - prefix = '{}{}'.format(params['save_path'], ext) + candle.verify_path(gParameters['save_path']) + prefix = '{}{}'.format(gParameters['save_path'], ext) logfile = gParameters['logfile'] if gParameters['logfile'] else prefix+'.log' - candle.set_up_logger(logfile, p1b2.logger, params['verbose']) + candle.set_up_logger(logfile, p1b2.logger, gParameters['verbose']) p1b2.logger.info('Params: {}'.format(gParameters)) # Get default parameters for initialization and optimizer functions From 866585a83f708500c2e7311dd68c90a95d054292 Mon Sep 17 00:00:00 2001 From: yngtodd Date: Thu, 9 Apr 2020 10:53:50 -0400 Subject: [PATCH 223/331] Use refactor for benchmark The P3B5 benchmark can now use the refactored code in common. 
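For reference, the effect of this refactor on user code is that the deep module imports removed in the diffs that follow collapse into attribute access on the `darts` package. A minimal sketch, assuming the `common/` directory is on `sys.path` so that `darts` is importable:

```python
import darts

# These names are re-exported from common/darts/__init__.py in this patch.
print(darts.Architecture)      # was: from darts.architecture import Architecture
print(darts.ConvNetwork)       # was: from darts.modules.conv.network import ConvNetwork
print(darts.P3B3)              # was: from darts.data.p3b3 import P3B3
print(darts.GenotypeStorage)   # was: from darts.storage.genotype import GenotypeStorage
print(darts.multitask_loss)    # was: from darts.functional import multitask_loss
print(darts.log_accuracy)      # was: from darts.utils.logging import log_accuracy
```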
--- Pilot3/P3B5/p3b5_baseline_pytorch.py | 54 +++++++++++--------------- Pilot3/P3B5/p3b5_darts.py | 31 +++++++-------- Pilot3/P3B5/test.py | 9 ++--- common/darts/__init__.py | 10 +++++ common/darts/functional.py | 8 ++-- common/darts/modules/conv/network.py | 23 +++++------ common/darts/modules/linear/network.py | 4 +- 7 files changed, 64 insertions(+), 75 deletions(-) diff --git a/Pilot3/P3B5/p3b5_baseline_pytorch.py b/Pilot3/P3B5/p3b5_baseline_pytorch.py index 134e769a..0f5e33f8 100644 --- a/Pilot3/P3B5/p3b5_baseline_pytorch.py +++ b/Pilot3/P3B5/p3b5_baseline_pytorch.py @@ -6,15 +6,7 @@ import p3b5 as bmk import candle - -from darts.api.config import banner -from darts.data.p3b3 import P3B3 -from darts.modules.network import Network -from darts.architecture import Architecture -from darts.storage.genotype import GenotypeStorage -from darts.functional import multitask_loss -from darts.meters.accuracy import MultitaskAccuracyMeter -from darts.utils.logging import log_accuracy +import darts from p3b5_darts import train, infer @@ -55,11 +47,11 @@ def run(params): args.cuda = torch.cuda.is_available() device = torch.device(f'cuda' if args.cuda else "cpu") - banner(device=device) + darts.banner(device=device) datapath = fetch_data(params) - train_data = P3B3(datapath, 'train') - valid_data = P3B3(datapath, 'test') + train_data = darts.P3B3(datapath, 'train') + valid_data = darts.P3B3(datapath, 'test') trainloader = DataLoader(train_data, batch_size=args.batch_size) validloader = DataLoader(valid_data, batch_size=args.batch_size) @@ -73,23 +65,23 @@ def run(params): 'grade': 3, } - model = Network(tasks=tasks, criterion=criterion, device=device).to(device) - architecture = Architecture(model, args, device=device) + model = darts.ConvNetwork(tasks=tasks, criterion=criterion, device=device).to(device) + architecture = darts.Architecture(model, args, device=device) optimizer = optim.SGD( - model.parameters(), - args.learning_rate, - momentum=args.momentum, + model.parameters(), + args.learning_rate, + momentum=args.momentum, weight_decay=args.weight_decay, ) scheduler = optim.lr_scheduler.CosineAnnealingLR( - optimizer, - float(args.epochs), + optimizer, + float(args.epochs), eta_min=args.learning_rate_min, ) - genotype_store = GenotypeStorage(root=args.savepath) + genotype_store = darts.GenotypeStorage(root=args.savepath) min_loss = 9999 for epoch in range(args.epochs): @@ -103,18 +95,18 @@ def run(params): # training train_acc, train_loss = train( - trainloader, - validloader, - model, - architecture, - criterion, - optimizer, - lr, - args, + trainloader, + validloader, + model, + architecture, + criterion, + optimizer, + lr, + args, tasks, device ) - + # validation valid_acc, valid_loss = infer(validloader, model, criterion, args, tasks, device) @@ -123,8 +115,8 @@ def run(params): min_loss = valid_loss print(f'\nEpoch {epoch} stats:') - log_accuracy(train_acc, 'train') - log_accuracy(valid_acc, 'valid') + darts.log_accuracy(train_acc, 'train') + darts.log_accuracy(valid_acc, 'valid') def main(): diff --git a/Pilot3/P3B5/p3b5_darts.py b/Pilot3/P3B5/p3b5_darts.py index cd095be0..567bfd02 100644 --- a/Pilot3/P3B5/p3b5_darts.py +++ b/Pilot3/P3B5/p3b5_darts.py @@ -10,14 +10,6 @@ import torch.nn.functional as F from torch.utils.data import DataLoader -from darts.api.config import banner -from darts.modules.network import Network -from darts.architecture import Architecture -from darts.meters.average import AverageMeter -from darts.functional import multitask_loss, multitask_accuracy -from 
darts.meters.accuracy import MultitaskAccuracyMeter -from darts.utils.logging import log_accuracy - file_path = os.path.dirname(os.path.realpath(__file__)) lib_path = os.path.abspath(os.path.join(file_path, '..')) @@ -26,9 +18,12 @@ sys.path.append(lib_path2) +import darts + + def train(trainloader, validloader, model, architecture, criterion, optimizer, lr, args, tasks, device): - losses = AverageMeter('LossMeter') - top1 = MultitaskAccuracyMeter(tasks) + losses = darts.AverageMeter('LossMeter') + top1 = darts.MultitaskAccuracyMeter(tasks) valid_iter = iter(trainloader) @@ -60,7 +55,7 @@ def train(trainloader, validloader, model, architecture, criterion, optimizer, l ) logits = model(data) - loss = multitask_loss(target, logits, criterion, reduce='mean') + loss = darts.multitask_loss(target, logits, criterion, reduce='mean') # 2. update weight optimizer.zero_grad() @@ -68,20 +63,20 @@ def train(trainloader, validloader, model, architecture, criterion, optimizer, l nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip) optimizer.step() - prec1 = multitask_accuracy(target, logits) + prec1 = darts.multitask_accuracy(target, logits) losses.update(loss.item(), batch_size) top1.update(prec1, batch_size) if step % args.log_interval == 0: print(f'Step: {step} loss: {losses.avg:.4}') - log_accuracy(top1) + darts.log_accuracy(top1) return top1, losses.avg def infer(validloader, model, criterion, args, tasks, device): - losses = AverageMeter('LossMeter') - top1 = MultitaskAccuracyMeter(tasks) + losses = darts.AverageMeter('LossMeter') + top1 = darts.MultitaskAccuracyMeter(tasks) model.eval() @@ -95,15 +90,15 @@ def infer(validloader, model, criterion, args, tasks, device): batch_size = data.size(0) logits = model(data) - loss = multitask_loss(target, logits, criterion, reduce='mean') + loss = darts.multitask_loss(target, logits, criterion, reduce='mean') - prec1 = multitask_accuracy(target, logits) + prec1 = darts.multitask_accuracy(target, logits) losses.update(loss.item(), batch_size) top1.update(prec1, batch_size) if step % args.log_interval == 0: print(f'>> Validation: {step} loss: {losses.avg:.4}') - log_accuracy(top1, 'valid') + darts.log_accuracy(top1, 'valid') return top1, losses.avg diff --git a/Pilot3/P3B5/test.py b/Pilot3/P3B5/test.py index 8715318a..49934f7b 100644 --- a/Pilot3/P3B5/test.py +++ b/Pilot3/P3B5/test.py @@ -9,11 +9,8 @@ sys.path.append(lib_path2) -from darts import Architecture -from darts import ConvNetwork -from darts.genotypes import Genotype +import darts -print(Architecture) -print(ConvNetwork) -print(Genotype) +print(darts.Architecture) +print(darts.ConvNetwork) diff --git a/common/darts/__init__.py b/common/darts/__init__.py index bccfd139..e6733adb 100644 --- a/common/darts/__init__.py +++ b/common/darts/__init__.py @@ -8,7 +8,17 @@ from .architecture import Architecture from .modules.conv.network import ConvNetwork from .modules.linear.network import LinearNetwork +from .storage.genotype import GenotypeStorage +from .data.p3b3 import P3B3 +from .api.config import banner +from .meters.average import AverageMeter +from .meters.accuracy import MultitaskAccuracyMeter +from .utils.logging import log_accuracy + +from .functional import ( + multitask_loss, multitask_loss, multitask_accuracy +) __all__ = [ "Architecture", diff --git a/common/darts/functional.py b/common/darts/functional.py index 902ec79c..fcc65806 100644 --- a/common/darts/functional.py +++ b/common/darts/functional.py @@ -47,12 +47,12 @@ def multitask_loss(target, logits, criterion, reduce='mean'): 
def accuracy(target: torch.tensor, output: torch.tensor,): - """ Computes accuracy - + """ Computes accuracy + Args: output: logits of the model target: true labels - + Returns: accuracy of the predictions """ @@ -91,4 +91,4 @@ def multitask_accuracy_topk(target, output, topk=(1,)): for key, value in target.items(): topk_accuracies[key] = accuracy_topk(output[key], target[key], topk) - return topk_accuracies \ No newline at end of file + return topk_accuracies diff --git a/common/darts/modules/conv/network.py b/common/darts/modules/conv/network.py index 6cb95b4c..f4d9d538 100644 --- a/common/darts/modules/conv/network.py +++ b/common/darts/modules/conv/network.py @@ -22,7 +22,7 @@ class ConvNetwork(Model): """ Collection of cells """ def __init__(self, tasks, criterion, device='cpu', hyperparams=Hyperparameters()): - super(Network, self).__init__() + super(ConvNetwork, self).__init__() self.tasks = tasks self.criterion = criterion self.device = device @@ -37,7 +37,7 @@ def __init__(self, tasks, criterion, device='cpu', hyperparams=Hyperparameters() # stem network, convert 3 channel to c_curr self.stem = nn.Sequential( nn.Embedding( - num_embeddings=hyperparams.num_embeddings, + num_embeddings=hyperparams.num_embeddings, embedding_dim=hyperparams.embedding_dim ), nn.Conv1d(hyperparams.embedding_dim, c_curr, 3, padding=1, bias=False), @@ -61,12 +61,12 @@ def __init__(self, tasks, criterion, device='cpu', hyperparams=Hyperparameters() # [cp, h, h] => [multiplier*c_curr, h/h//2, h/h//2] # the output channels = multiplier * c_curr cell = Cell( - hyperparams.num_nodes, - hyperparams.channel_multiplier, - cpp, - cp, - c_curr, - reduction, + hyperparams.num_nodes, + hyperparams.channel_multiplier, + cpp, + cp, + c_curr, + reduction, reduction_prev ).to(self.device) # update reduction_prev @@ -78,17 +78,12 @@ def __init__(self, tasks, criterion, device='cpu', hyperparams=Hyperparameters() self.global_pooling = nn.AdaptiveAvgPool1d(1) # since cp records last cell's output channels # it indicates the input channel number - # self.classifier = self.fc_layers(cp, tasks) self.classifier = MultitaskClassifier(cp, tasks) # k is the total number of edges inside single cell, 14 k = sum(1 for i in range(self.num_nodes) for j in range(2 + i)) num_ops = len(PRIMITIVES) # 8 - # TODO - # this kind of implementation will add alpha into self.parameters() - # it has num k of alpha parameters, and each alpha shape: [num_ops] - # it requires grad and can be converted to cpu/gpu automatically self.alpha_normal = nn.Parameter(torch.randn(k, num_ops)) self.alpha_reduce = nn.Parameter(torch.randn(k, num_ops)) @@ -119,7 +114,7 @@ def new(self): model : Network New model initialized with current alpha. """ - model = Network( + model = ConvNetwork( self.tasks, self.criterion ).to(self.device) diff --git a/common/darts/modules/linear/network.py b/common/darts/modules/linear/network.py index 061ca46e..470d309a 100644 --- a/common/darts/modules/linear/network.py +++ b/common/darts/modules/linear/network.py @@ -21,7 +21,7 @@ class LinearNetwork(Model): """ Collection of cells """ def __init__(self, input_dim, tasks, criterion, device='cpu', hyperparams=Hyperparameters()): - super(Network, self).__init__() + super(LinearNetwork, self).__init__() self.tasks = tasks self.criterion = criterion self.device = device @@ -105,7 +105,7 @@ def new(self): model : Network New model initialized with current alpha. 
""" - model = Network( + model = LinearNetwork( self.tasks, self.criterion ).to(self.device) From 9e0b8d9f884818551576aa6926d5a26eca126ef0 Mon Sep 17 00:00:00 2001 From: yngtodd Date: Thu, 9 Apr 2020 11:38:55 -0400 Subject: [PATCH 224/331] Add dataset abstractions This will give us utilities to easily manage data for DARTS. --- common/darts/__init__.py | 5 +- common/darts/datasets/__init__.py | 0 common/darts/datasets/p3b3.py | 102 ++++++++++++++++++++++++++++++ 3 files changed, 105 insertions(+), 2 deletions(-) create mode 100644 common/darts/datasets/__init__.py create mode 100644 common/darts/datasets/p3b3.py diff --git a/common/darts/__init__.py b/common/darts/__init__.py index e6733adb..c98ae304 100644 --- a/common/darts/__init__.py +++ b/common/darts/__init__.py @@ -4,13 +4,14 @@ __email__ = 'youngmt1@ornl.gov' __version__ = '0.1.0' - +# Essential pieces from .architecture import Architecture from .modules.conv.network import ConvNetwork from .modules.linear.network import LinearNetwork from .storage.genotype import GenotypeStorage -from .data.p3b3 import P3B3 +# Utilities that are not neccessary +from .datasets.p3b3 import P3B3 from .api.config import banner from .meters.average import AverageMeter from .meters.accuracy import MultitaskAccuracyMeter diff --git a/common/darts/datasets/__init__.py b/common/darts/datasets/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/common/darts/datasets/p3b3.py b/common/darts/datasets/p3b3.py new file mode 100644 index 00000000..1285a158 --- /dev/null +++ b/common/darts/datasets/p3b3.py @@ -0,0 +1,102 @@ +import os +import numpy as np +from torch.utils.data import Dataset + + +class P3B3(Dataset): + """P3B3 Synthetic Dataset. + + Args: + root: str + Root directory of dataset where CANDLE loads P3B3 data. + + partition: str + dataset partition to be loaded. + Must be either 'train' or 'test'. 
+ """ + training_data_file = 'train_X.npy' + training_label_file = 'train_Y.npy' + test_data_file = 'test_X.npy' + test_label_file = 'test_Y.npy' + + def __init__(self, root, partition, subsite=True, + laterality=True, behavior=True, grade=True, + transform=None, target_transform=None): + self.root = root + self.partition = partition + self.transform = transform + self.target_transform = target_transform + self.subsite = subsite + self.laterality = laterality + self.behavior = behavior + self.grade = grade + + if self.partition == 'train': + data_file = self.training_data_file + label_file = self.training_label_file + elif self.partition == 'test': + data_file = self.test_data_file + label_file = self.test_label_file + else: + raise ValueError("Partition must either be 'train' or 'test'.") + + self.data = np.load(os.path.join(self.root, data_file)) + self.targets = self.get_targets(label_file) + + def __repr__(self): + fmt_str = 'Dataset ' + self.__class__.__name__ + '\n' + fmt_str += ' Number of datapoints: {}\n'.format(self.__len__()) + tmp = self.partition + fmt_str += ' Split: {}\n'.format(tmp) + fmt_str += ' Root Location: {}\n'.format(self.root) + return fmt_str + + def __len__(self): + return len(self.data) + + def load_data(self): + return self.data, self.targets + + def get_targets(self, label_file): + """Get dictionary of targets specified by user.""" + targets = np.load(os.path.join(self.root, label_file)) + + tasks = {} + if self.subsite: + tasks['subsite'] = targets[:, 0] + if self.laterality: + tasks['laterality'] = targets[:, 1] + if self.behavior: + tasks['behavior'] = targets[:, 2] + if self.grade: + tasks['grade'] = targets[:, 3] + + return tasks + + def __getitem__(self, idx): + """ + Parameters + ---------- + index : int + Index of the data to be loaded. + + Returns + ------- + (document, target) : tuple + where target is index of the target class. 
+ """ + document = self.data[idx] + + if self.transform is not None: + document = self.transform(document) + + targets = {} + for key, value in self.targets.items(): + subset = value[idx] + + if self.target_transform is not None: + subset = self.target_transform(subset) + + targets[key] = subset + + return document, targets \ No newline at end of file From 07d45a6b7cbafad313f3b0ed26901b58aeddd40e Mon Sep 17 00:00:00 2001 From: Cristina Garcia Cardona Date: Thu, 9 Apr 2020 10:45:02 -0600 Subject: [PATCH 225/331] promoted model_name to CANDLE common keyword --- Pilot1/NT3/nt3.py | 6 +----- Pilot1/NT3/nt3_baseline_keras2.py | 18 +++++++++--------- Pilot1/NT3/nt3_baseline_keras2_tensorrt.py | 20 ++++++++++---------- Pilot1/NT3/nt3_default_model.txt | 2 +- Pilot1/NT3/nt3_perf_bench_model.txt | 2 +- Pilot1/P1B1/p1b1.py | 4 ---- Pilot1/P1B1/p1b1_default_model.txt | 2 +- Pilot1/P1B1/p1b1_perf_bench_model.txt | 2 +- Pilot1/P1B2/p1b2.py | 4 ---- Pilot1/P1B2/p1b2_default_model.txt | 2 +- Pilot1/TC1/tc1.py | 6 +----- Pilot1/TC1/tc1_baseline_keras2.py | 18 +++++++++--------- Pilot1/TC1/tc1_default_model.txt | 2 +- Pilot1/TC1/tc1_perf_bench_model.txt | 2 +- common/default_utils.py | 6 +++--- 15 files changed, 40 insertions(+), 56 deletions(-) diff --git a/Pilot1/NT3/nt3.py b/Pilot1/NT3/nt3.py index 41764611..c828b3d9 100644 --- a/Pilot1/NT3/nt3.py +++ b/Pilot1/NT3/nt3.py @@ -8,10 +8,6 @@ import candle additional_definitions = [ -{'name':'model_prefix', - 'default':'nt3', - 'type':str, - 'help':'prefix to build model name for saving'}, {'name':'classes', 'type':int, 'default':2} @@ -21,7 +17,7 @@ 'data_url', 'train_data', 'test_data', - 'model_prefix', + 'model_name', 'conv', 'dense', 'activation', diff --git a/Pilot1/NT3/nt3_baseline_keras2.py b/Pilot1/NT3/nt3_baseline_keras2.py index aa4dfead..582d4584 100644 --- a/Pilot1/NT3/nt3_baseline_keras2.py +++ b/Pilot1/NT3/nt3_baseline_keras2.py @@ -176,8 +176,8 @@ def run(gParameters): gParameters.update(candle.compute_trainable_params(model)) # set up a bunch of callbacks to do work during model training.. 
- model_prefix = gParameters['model_prefix'] - path = '{}/{}.autosave.model.h5'.format(output_dir, model_prefix) + model_name = gParameters['model_name'] + path = '{}/{}.autosave.model.h5'.format(output_dir, model_name) # checkpointer = ModelCheckpoint(filepath=path, verbose=1, save_weights_only=False, save_best_only=True) csv_logger = CSVLogger('{}/training.log'.format(output_dir)) reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=10, verbose=1, mode='auto', epsilon=0.0001, cooldown=0, min_lr=0) @@ -197,34 +197,34 @@ def run(gParameters): print('Test accuracy:', score[1]) # serialize model to JSON model_json = model.to_json() - with open("{}/{}.model.json".format(output_dir, model_prefix), "w") as json_file: + with open("{}/{}.model.json".format(output_dir, model_name), "w") as json_file: json_file.write(model_json) # serialize model to YAML model_yaml = model.to_yaml() - with open("{}/{}.model.yaml".format(output_dir, model_prefix), "w") as yaml_file: + with open("{}/{}.model.yaml".format(output_dir, model_name), "w") as yaml_file: yaml_file.write(model_yaml) # serialize weights to HDF5 - model.save_weights("{}/{}.weights.h5".format(output_dir, model_prefix)) + model.save_weights("{}/{}.weights.h5".format(output_dir, model_name)) print("Saved model to disk") # load json and create model - json_file = open('{}/{}.model.json'.format(output_dir, model_prefix), 'r') + json_file = open('{}/{}.model.json'.format(output_dir, model_name), 'r') loaded_model_json = json_file.read() json_file.close() loaded_model_json = model_from_json(loaded_model_json) # load yaml and create model - yaml_file = open('{}/{}.model.yaml'.format(output_dir, model_prefix), 'r') + yaml_file = open('{}/{}.model.yaml'.format(output_dir, model_name), 'r') loaded_model_yaml = yaml_file.read() yaml_file.close() loaded_model_yaml = model_from_yaml(loaded_model_yaml) # load weights into new model - loaded_model_json.load_weights('{}/{}.weights.h5'.format(output_dir, model_prefix)) + loaded_model_json.load_weights('{}/{}.weights.h5'.format(output_dir, model_name)) print("Loaded json model from disk") # evaluate json loaded model on test data @@ -239,7 +239,7 @@ def run(gParameters): print("json %s: %.2f%%" % (loaded_model_json.metrics_names[1], score_json[1]*100)) # load weights into new model - loaded_model_yaml.load_weights('{}/{}.weights.h5'.format(output_dir, model_prefix)) + loaded_model_yaml.load_weights('{}/{}.weights.h5'.format(output_dir, model_name)) print("Loaded yaml model from disk") # evaluate loaded model on test data diff --git a/Pilot1/NT3/nt3_baseline_keras2_tensorrt.py b/Pilot1/NT3/nt3_baseline_keras2_tensorrt.py index ba2c2283..429a4d2f 100644 --- a/Pilot1/NT3/nt3_baseline_keras2_tensorrt.py +++ b/Pilot1/NT3/nt3_baseline_keras2_tensorrt.py @@ -83,7 +83,7 @@ def read_config_file(file): fileParams['data_url'] = eval(config.get(section[0],'data_url')) fileParams['train_data'] = eval(config.get(section[0],'train_data')) fileParams['test_data'] = eval(config.get(section[0],'test_data')) - fileParams['model_prefix'] = eval(config.get(section[0],'model_prefix')) + fileParams['model_name'] = eval(config.get(section[0],'model_name')) fileParams['conv'] = eval(config.get(section[0],'conv')) fileParams['dense'] = eval(config.get(section[0],'dense')) fileParams['activation'] = eval(config.get(section[0],'activation')) @@ -267,8 +267,8 @@ def run(gParameters): gParameters.update(compute_trainable_params(model)) # set up a bunch of callbacks to do work during model training.. 
- model_prefix = gParameters['model_prefix'] - path = '{}/{}.autosave.model.h5'.format(output_dir, model_prefix) + model_name = gParameters['model_name'] + path = '{}/{}.autosave.model.h5'.format(output_dir, model_name) # checkpointer = ModelCheckpoint(filepath=path, verbose=1, save_weights_only=False, save_best_only=True) csv_logger = CSVLogger('{}/training.log'.format(output_dir)) reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=10, verbose=1, mode='auto', epsilon=0.0001, cooldown=0, min_lr=0) @@ -343,34 +343,34 @@ def run(gParameters): print('Test accuracy:', score[1]) # serialize model to JSON model_json = model.to_json() - with open("{}/{}.model.json".format(output_dir, model_prefix), "w") as json_file: + with open("{}/{}.model.json".format(output_dir, model_name), "w") as json_file: json_file.write(model_json) # serialize model to YAML model_yaml = model.to_yaml() - with open("{}/{}.model.yaml".format(output_dir, model_prefix), "w") as yaml_file: + with open("{}/{}.model.yaml".format(output_dir, model_name), "w") as yaml_file: yaml_file.write(model_yaml) # serialize weights to HDF5 - model.save_weights("{}/{}.weights.h5".format(output_dir, model_prefix)) + model.save_weights("{}/{}.weights.h5".format(output_dir, model_name)) print("Saved model to disk") # load json and create model - json_file = open('{}/{}.model.json'.format(output_dir, model_prefix), 'r') + json_file = open('{}/{}.model.json'.format(output_dir, model_name), 'r') loaded_model_json = json_file.read() json_file.close() loaded_model_json = model_from_json(loaded_model_json) # load yaml and create model - yaml_file = open('{}/{}.model.yaml'.format(output_dir, model_prefix), 'r') + yaml_file = open('{}/{}.model.yaml'.format(output_dir, model_name), 'r') loaded_model_yaml = yaml_file.read() yaml_file.close() loaded_model_yaml = model_from_yaml(loaded_model_yaml) # load weights into new model - loaded_model_json.load_weights('{}/{}.weights.h5'.format(output_dir, model_prefix)) + loaded_model_json.load_weights('{}/{}.weights.h5'.format(output_dir, model_name)) print("Loaded json model from disk") # evaluate json loaded model on test data @@ -385,7 +385,7 @@ def run(gParameters): print("json %s: %.2f%%" % (loaded_model_json.metrics_names[1], score_json[1]*100)) # load weights into new model - loaded_model_yaml.load_weights('{}/{}.weights.h5'.format(output_dir, model_prefix)) + loaded_model_yaml.load_weights('{}/{}.weights.h5'.format(output_dir, model_name)) print("Loaded yaml model from disk") # evaluate loaded model on test data diff --git a/Pilot1/NT3/nt3_default_model.txt b/Pilot1/NT3/nt3_default_model.txt index a23e4afa..e763f259 100644 --- a/Pilot1/NT3/nt3_default_model.txt +++ b/Pilot1/NT3/nt3_default_model.txt @@ -2,7 +2,7 @@ data_url = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/normal-tumor/' train_data = 'nt_train2.csv' test_data = 'nt_test2.csv' -model_prefix = 'nt3' +model_name = 'nt3' conv = [128, 20, 1, 128, 10, 1] dense = [200,20] activation = 'relu' diff --git a/Pilot1/NT3/nt3_perf_bench_model.txt b/Pilot1/NT3/nt3_perf_bench_model.txt index 269d2bd0..41af2c32 100644 --- a/Pilot1/NT3/nt3_perf_bench_model.txt +++ b/Pilot1/NT3/nt3_perf_bench_model.txt @@ -2,7 +2,7 @@ data_url = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/normal-tumor/' train_data = 'nt_train2.csv' test_data = 'nt_test2.csv' -model_prefix = 'nt3' +model_name = 'nt3' conv = [128, 20, 1, 128, 10, 1] dense = [200,20] activation = 'relu' diff --git a/Pilot1/P1B1/p1b1.py b/Pilot1/P1B1/p1b1.py index 
91fac8b5..1699a140 100644 --- a/Pilot1/P1B1/p1b1.py +++ b/Pilot1/P1B1/p1b1.py @@ -27,10 +27,6 @@ 'action':'store', 'type': int, 'help':'latent dimensions'}, -{'name':'model_prefix', - 'default':'p1b1', - 'type':str, - 'help':'prefix to build model name for saving'}, {'name':'model', 'default':'ae', 'choices':['ae', 'vae', 'cvae'], diff --git a/Pilot1/P1B1/p1b1_default_model.txt b/Pilot1/P1B1/p1b1_default_model.txt index 4becc88f..486f0371 100644 --- a/Pilot1/P1B1/p1b1_default_model.txt +++ b/Pilot1/P1B1/p1b1_default_model.txt @@ -2,7 +2,7 @@ data_url = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/P1B1/' train_data = 'P1B1.dev.train.csv' test_data = 'P1B1.dev.test.csv' -model_prefix='p1b1' +model_name='p1b1' dense=[2000, 600] batch_size=100 epochs=100 diff --git a/Pilot1/P1B1/p1b1_perf_bench_model.txt b/Pilot1/P1B1/p1b1_perf_bench_model.txt index ca0854bf..01ffc46c 100644 --- a/Pilot1/P1B1/p1b1_perf_bench_model.txt +++ b/Pilot1/P1B1/p1b1_perf_bench_model.txt @@ -2,7 +2,7 @@ data_url = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/P1B1/' train_data = 'P1B1.dev.train.csv' test_data = 'P1B1.dev.test.csv' -model_prefix='p1b1' +model_name='p1b1' dense=[2000, 1000, 500, 200, 100] batch_size=100 epochs=500 diff --git a/Pilot1/P1B2/p1b2.py b/Pilot1/P1B2/p1b2.py index 82787c2e..70bb9b8f 100644 --- a/Pilot1/P1B2/p1b2.py +++ b/Pilot1/P1B2/p1b2.py @@ -30,10 +30,6 @@ logger = logging.getLogger(__name__) additional_definitions = [ -{'name':'model_prefix', - 'default':'p1b2', - 'type':str, - 'help':'prefix to build model name for saving'}, {'name':'reg_l2', 'type': float, 'default': 0., diff --git a/Pilot1/P1B2/p1b2_default_model.txt b/Pilot1/P1B2/p1b2_default_model.txt index 5d0ddbaf..33e37f87 100644 --- a/Pilot1/P1B2/p1b2_default_model.txt +++ b/Pilot1/P1B2/p1b2_default_model.txt @@ -2,7 +2,7 @@ data_url = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/P1B2/' train_data = 'P1B2.train.csv' test_data = 'P1B2.test.csv' -model_prefix='p1b2' +model_name='p1b2' dense=[1024, 512, 256] batch_size=60 epochs=1 diff --git a/Pilot1/TC1/tc1.py b/Pilot1/TC1/tc1.py index 39ae16b0..5be856d1 100644 --- a/Pilot1/TC1/tc1.py +++ b/Pilot1/TC1/tc1.py @@ -18,10 +18,6 @@ 'nargs':'+', 'type': int, 'help':'network structure of shared layer'}, - {'name':'model_prefix', - 'default':'tc1', - 'type':str, - 'help':'prefix to build model name for saving'}, {'name':'classes', 'type':int, 'default':36} @@ -31,7 +27,7 @@ 'data_url', 'train_data', 'test_data', - 'model_prefix', + 'model_name', 'conv', 'dense', 'activation', diff --git a/Pilot1/TC1/tc1_baseline_keras2.py b/Pilot1/TC1/tc1_baseline_keras2.py index 46e3b1f5..bb916b09 100644 --- a/Pilot1/TC1/tc1_baseline_keras2.py +++ b/Pilot1/TC1/tc1_baseline_keras2.py @@ -122,8 +122,8 @@ def run(gParameters): os.makedirs(output_dir) # set up callbacks to do work during model training.. 
- model_prefix = gParameters['model_prefix'] - path = '{}/{}.autosave.model.h5'.format(output_dir, model_prefix) + model_name = gParameters['model_name'] + path = '{}/{}.autosave.model.h5'.format(output_dir, model_name) checkpointer = ModelCheckpoint(filepath=path, verbose=1, save_weights_only=False, save_best_only=True) csv_logger = CSVLogger('{}/training.log'.format(output_dir)) reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=10, verbose=1, mode='auto', epsilon=0.0001, cooldown=0, min_lr=0) @@ -142,35 +142,35 @@ def run(gParameters): # serialize model to JSON model_json = model.to_json() - with open("{}/{}.model.json".format(output_dir, model_prefix), "w") as json_file: + with open("{}/{}.model.json".format(output_dir, model_name), "w") as json_file: json_file.write(model_json) # serialize model to YAML model_yaml = model.to_yaml() - with open("{}/{}.model.yaml".format(output_dir, model_prefix), "w") as yaml_file: + with open("{}/{}.model.yaml".format(output_dir, model_name), "w") as yaml_file: yaml_file.write(model_yaml) # serialize weights to HDF5 - model.save_weights("{}/{}.model.h5".format(output_dir, model_prefix)) + model.save_weights("{}/{}.model.h5".format(output_dir, model_name)) print("Saved model to disk") # load json and create model - json_file = open('{}/{}.model.json'.format(output_dir, model_prefix), 'r') + json_file = open('{}/{}.model.json'.format(output_dir, model_name), 'r') loaded_model_json = json_file.read() json_file.close() loaded_model_json = model_from_json(loaded_model_json) # load yaml and create model - yaml_file = open('{}/{}.model.yaml'.format(output_dir, model_prefix), 'r') + yaml_file = open('{}/{}.model.yaml'.format(output_dir, model_name), 'r') loaded_model_yaml = yaml_file.read() yaml_file.close() loaded_model_yaml = model_from_yaml(loaded_model_yaml) # load weights into new model - loaded_model_json.load_weights('{}/{}.model.h5'.format(output_dir, model_prefix)) + loaded_model_json.load_weights('{}/{}.model.h5'.format(output_dir, model_name)) print("Loaded json model from disk") # evaluate json loaded model on test data @@ -187,7 +187,7 @@ def run(gParameters): # load weights into new model - loaded_model_yaml.load_weights('{}/{}.model.h5'.format(output_dir, model_prefix)) + loaded_model_yaml.load_weights('{}/{}.model.h5'.format(output_dir, model_name)) print("Loaded yaml model from disk") # evaluate loaded model on test data diff --git a/Pilot1/TC1/tc1_default_model.txt b/Pilot1/TC1/tc1_default_model.txt index 8850edd1..a22aed28 100644 --- a/Pilot1/TC1/tc1_default_model.txt +++ b/Pilot1/TC1/tc1_default_model.txt @@ -2,7 +2,7 @@ data_url = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/type-class/' train_data = 'type_18_300_train.csv' test_data = 'type_18_300_test.csv' -model_prefix = 'tc1' +model_name = 'tc1' conv=[128, 20, 1, 128, 10, 1] dense=[200,20] activation='relu' diff --git a/Pilot1/TC1/tc1_perf_bench_model.txt b/Pilot1/TC1/tc1_perf_bench_model.txt index 167fabb1..dbf7fb04 100644 --- a/Pilot1/TC1/tc1_perf_bench_model.txt +++ b/Pilot1/TC1/tc1_perf_bench_model.txt @@ -2,7 +2,7 @@ data_url = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/type-class/' train_data = 'type_18_300_train.csv' test_data = 'type_18_300_test.csv' -model_prefix = 'tc1' +model_name = 'tc1' conv=[128, 20, 1, 128, 10, 1] dense=[200,20] activation='relu' diff --git a/common/default_utils.py b/common/default_utils.py index 07befe98..7fd12d99 100644 --- a/common/default_utils.py +++ b/common/default_utils.py @@ -31,7 +31,7 @@ 
DEFAULT_DATATYPE = np.float32 -PARAMETERS_CANDLE = ['config_file', 'verbose', 'logfile', 'save_path', 'model_file', 'data_type', 'dense', 'rng_seed', 'epochs', 'batch_size', 'train_bool', 'eval_bool', 'timeout', 'home_dir', 'train_data', 'test_data', 'output_dir', 'data_url', 'experiment_id', 'run_id', 'conv', 'locally_connected', 'activation', 'out_activation', 'lstm_size', 'recurrent_dropout', 'dropout', 'pool', 'batch_normalization', 'loss', 'optimizer', 'metrics', 'scaling', 'shuffle', 'feature_subsample', 'learning_rate', 'early_stop', 'momentum', 'initialization', 'val_split', 'train_steps', 'val_steps', 'test_steps', 'train_samples', 'val_samples', 'gpus', 'profiling'] +PARAMETERS_CANDLE = ['config_file', 'verbose', 'logfile', 'save_path', 'model_name', 'data_type', 'dense', 'rng_seed', 'epochs', 'batch_size', 'train_bool', 'eval_bool', 'timeout', 'home_dir', 'train_data', 'test_data', 'output_dir', 'data_url', 'experiment_id', 'run_id', 'conv', 'locally_connected', 'activation', 'out_activation', 'lstm_size', 'recurrent_dropout', 'dropout', 'pool', 'batch_normalization', 'loss', 'optimizer', 'metrics', 'scaling', 'shuffle', 'feature_subsample', 'learning_rate', 'early_stop', 'momentum', 'initialization', 'val_split', 'train_steps', 'val_steps', 'test_steps', 'train_samples', 'val_samples', 'gpus', 'profiling'] #### IO UTILS @@ -433,9 +433,9 @@ def get_default_neon_parser(parser): help="file path to save model snapshots") # General behavior - parser.add_argument("--model_file", dest='model_file', type=str, + parser.add_argument("--model_name", dest='model_name', type=str, default=argparse.SUPPRESS, - help="specify trained model Pickle file") + help="specify model name to use when building filenames for saving") parser.add_argument("-d", "--data_type", dest='data_type', default=argparse.SUPPRESS, choices=['f16', 'f32', 'f64'], From 852d1b0f5decba34ce1a5a49b2cfe6043bc13e2e Mon Sep 17 00:00:00 2001 From: Cristina Garcia Cardona Date: Thu, 9 Apr 2020 11:04:41 -0600 Subject: [PATCH 226/331] reserved model_file keyword for infer scripts --- Pilot1/Attn1/attn_abs_default_model.txt | 2 +- Pilot1/Attn1/attn_default_model.txt | 2 +- Pilot1/Uno_UQ/uno_inferUQ_keras2.py | 5 +++++ 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/Pilot1/Attn1/attn_abs_default_model.txt b/Pilot1/Attn1/attn_abs_default_model.txt index b7a95c5a..86d22b50 100644 --- a/Pilot1/Attn1/attn_abs_default_model.txt +++ b/Pilot1/Attn1/attn_abs_default_model.txt @@ -1,7 +1,7 @@ [Global_Params] data_url = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/' train_data='top_21_1fold_001.h5' -model_file='attn_abs.model.h5' +model_name='attn_abs' dense=[1000, 1000, 1000, 500, 250, 125, 60, 30, 2] batch_size=32 epochs=2 diff --git a/Pilot1/Attn1/attn_default_model.txt b/Pilot1/Attn1/attn_default_model.txt index 654084a9..a0e03982 100644 --- a/Pilot1/Attn1/attn_default_model.txt +++ b/Pilot1/Attn1/attn_default_model.txt @@ -1,7 +1,7 @@ [Global_Params] data_url = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/' train_data='top_21_1fold_001.h5' -model_file='attn.model.h5' +model_name='attn' dense=[1000, 1000, 1000, 500, 250, 125, 60, 30, 2] batch_size=32 epochs=1 diff --git a/Pilot1/Uno_UQ/uno_inferUQ_keras2.py b/Pilot1/Uno_UQ/uno_inferUQ_keras2.py index 552ab2aa..505ea1cd 100644 --- a/Pilot1/Uno_UQ/uno_inferUQ_keras2.py +++ b/Pilot1/Uno_UQ/uno_inferUQ_keras2.py @@ -45,7 +45,12 @@ 'type': candle.str2bool, 'default': False, 'help':'Use given inference file to obtain indices to do inference'}, 
+{'name':'model_file', + 'type':str, + 'default':'saved.model.h5', + 'help':'trained model file'}, {'name':'weights_file', + 'type':str, 'default':'saved.weights.h5', 'help':'trained weights file (loading model file alone sometimes does not work in keras)'}, {'name':'n_pred', From 13db6835f060772880d26ded9c5979d3de44f903 Mon Sep 17 00:00:00 2001 From: Cristina Garcia Cardona Date: Thu, 9 Apr 2020 12:03:19 -0600 Subject: [PATCH 227/331] added definition of alpha_dropout keyword in p1b1 benchmark --- Pilot1/P1B1/p1b1.py | 6 +++++- Pilot1/P1B1/p1b1_baseline_keras2.py | 4 ++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/Pilot1/P1B1/p1b1.py b/Pilot1/P1B1/p1b1.py index 1699a140..ff85605a 100644 --- a/Pilot1/P1B1/p1b1.py +++ b/Pilot1/P1B1/p1b1.py @@ -68,7 +68,11 @@ {'name':'tsne', 'type': candle.str2bool, 'default': False, - 'help':'generate tsne plot of the latent representation'} + 'help':'generate tsne plot of the latent representation'}, +{'name':'alpha_dropout', + 'type': candle.str2bool, + 'default': False, + 'help':'use the AlphaDropout layer from keras instead of regular Dropout'} ] required = [ diff --git a/Pilot1/P1B1/p1b1_baseline_keras2.py b/Pilot1/P1B1/p1b1_baseline_keras2.py index 7eae97e8..05177230 100644 --- a/Pilot1/P1B1/p1b1_baseline_keras2.py +++ b/Pilot1/P1B1/p1b1_baseline_keras2.py @@ -6,7 +6,7 @@ from keras import backend as K from keras import optimizers from keras.models import Model -from keras.layers import BatchNormalization, Dense, Dropout, Input, Lambda +from keras.layers import BatchNormalization, Dense, Dropout, Input, Lambda, AlphaDropout from keras.callbacks import Callback, ModelCheckpoint, ReduceLROnPlateau, LearningRateScheduler, TensorBoard from keras.metrics import binary_crossentropy, mean_squared_error from scipy.stats.stats import pearsonr @@ -191,7 +191,7 @@ def run(params): activation = params['activation'] dropout = params['dropout'] dense_layers = params['dense'] - dropout_layer = keras.layers.noise.AlphaDropout if params['alpha_dropout'] else Dropout + dropout_layer = AlphaDropout if params['alpha_dropout'] else Dropout # Initialize weights and learning rule initializer_weights = candle.build_initializer(params['initialization'], keras_defaults, seed) From 1ce06d361d707ac77ccb5ac069c25c0cfe592142 Mon Sep 17 00:00:00 2001 From: yngtodd Date: Thu, 9 Apr 2020 14:36:42 -0400 Subject: [PATCH 228/331] Add expert level entry point This will let users fully customize how they would like to use DARTS with their own primitives. This is in contrast to the net configurations tailored for convolutional and linear models. --- Pilot3/P3B5/p3b5.py | 8 +- common/darts/modules/cell.py | 70 +++++++++++ common/darts/modules/conv/network.py | 7 -- common/darts/modules/mixed_layer.py | 38 ++++++ common/darts/modules/network.py | 178 +++++++++++++++++++++++++++ 5 files changed, 290 insertions(+), 11 deletions(-) create mode 100644 common/darts/modules/cell.py create mode 100644 common/darts/modules/mixed_layer.py create mode 100644 common/darts/modules/network.py diff --git a/Pilot3/P3B5/p3b5.py b/Pilot3/P3B5/p3b5.py index 4740638f..d69ac6ec 100644 --- a/Pilot3/P3B5/p3b5.py +++ b/Pilot3/P3B5/p3b5.py @@ -8,15 +8,15 @@ import candle REQUIRED = [ - 'learning_rate', + 'learning_rate', 'learning_rate_min', 'momentum', 'weight_decay', 'grad_clip', 'seed', 'unrolled', - 'batch_size', - 'epochs', + 'batch_size', + 'epochs', ] @@ -26,7 +26,7 @@ class BenchmarkP3B5(candle.Benchmark): def set_locals(self): """ Set parameters for the benchmark. 
- Args: + Args: required: set of required parameters for the benchmark. """ if REQUIRED is not None: diff --git a/common/darts/modules/cell.py b/common/darts/modules/cell.py new file mode 100644 index 00000000..cafa698f --- /dev/null +++ b/common/darts/modules/cell.py @@ -0,0 +1,70 @@ +import torch +import torch.nn as nn + +from darts.api import Model +from darts.modules.mixed_layer import MixedLayer +from darts.modules.operations.conv import ConvBlock + + +class Cell(Model): + + def __init__(self, num_nodes, multiplier, cpp, cp, c, primitives, ops): + """ + :param steps: 4, number of layers inside a cell + :param multiplier: 4 + :param cpp: 48 + :param cp: 48 + :param c: 16 + :param reduction: indicates whether to reduce the output maps width + :param reduction_prev: when previous cell reduced width, s1_d = s0_d//2 + in order to keep same shape between s1 and s0, we adopt prep0 layer to + reduce the s0 width by half. + """ + super(Cell, self).__init__() + self.preprocess0 = ConvBlock(cpp, c, 1, 1, 0, affine=False) + # preprocess1 deal with output from prev cell + self.preprocess1 = ConvBlock(cp, c, 1, 1, 0, affine=False) + + # steps inside a cell + self.num_nodes = num_nodes + self.multiplier = multiplier + + self.layers = nn.ModuleList() + + for i in range(self.num_nodes): + # for each i inside cell, it connects with all previous output + # plus previous two cells' output + for j in range(2 + i): + # for reduction cell, it will reduce the heading 2 inputs only + stride = 1 + layer = MixedLayer(c, stride, primitives, ops) + self.layers.append(layer) + + def forward(self, s0, s1, weights): + """ + :param s0: + :param s1: + :param weights: [14, 8] + :return: + """ + #print('s0:', s0.shape,end='=>') + s0 = self.preprocess0(s0) # [40, 48, 32, 32], [40, 16, 32, 32] + #print(s0.shape, self.reduction_prev) + #print('s1:', s1.shape,end='=>') + s1 = self.preprocess1(s1) # [40, 48, 32, 32], [40, 16, 32, 32] + #print(s1.shape) + + states = [s0, s1] + offset = 0 + # for each node, receive input from all previous intermediate nodes and s0, s1 + for i in range(self.num_nodes): # 4 + # [40, 16, 32, 32] + s = sum(self.layers[offset + j](h, weights[offset + j]) for j, h in enumerate(states)) + offset += len(states) + # append one state since s is the elem-wise addition of all output + states.append(s) + #print('node:',i, s.shape, self.reduction) + + # concat along dim=channel + return torch.cat(states[-self.multiplier:], dim=1) # 6 of [40, 16, 32, 32] + diff --git a/common/darts/modules/conv/network.py b/common/darts/modules/conv/network.py index f4d9d538..5b49222a 100644 --- a/common/darts/modules/conv/network.py +++ b/common/darts/modules/conv/network.py @@ -97,13 +97,6 @@ def __init__(self, tasks, criterion, device='cpu', hyperparams=Hyperparameters() self.alpha_reduce, ] - def fc_layers(self, cp, tasks): - """ Create fully connnected layers for each task """ - fc_layers = {} - for task, dim in tasks.items(): - fc_layers[task] = nn.Linear(cp, dim).to(self.device) - return fc_layers - def new(self): """ Create a new model initialzed with current alpha parameters. diff --git a/common/darts/modules/mixed_layer.py b/common/darts/modules/mixed_layer.py new file mode 100644 index 00000000..b4e4ca39 --- /dev/null +++ b/common/darts/modules/mixed_layer.py @@ -0,0 +1,38 @@ +import torch +import torch.nn as nn +from darts.api import Model + + +class MixedLayer(Model): + """ A mixture of 8 unit types + + We use weights to aggregate these outputs while training. 
+ and softmax to select the strongest edges while inference. + """ + def __init__(self, c, stride, primitives, ops): + super(MixedLayer, self).__init__() + self.reset(c, stride, primitives, ops) + + def reset(self, c, stride): + self.layers = nn.ModuleList() + + for primitive in primitives: + layer = ops[primitive](c, stride, False) + + if 'pool' in primitive: + layer = nn.Sequential(layer, nn.BatchNorm1d(c, affine=False)) + + self.layers.append(layer) + + def forward(self, x, weights): + """ + Parameters + ---------- + x : torch.tensor + Data + + Weights : torch.tensor + alpha, [op_num:8], the output = sum of alpha * op(x) + """ + x = [w * layer(x) for w, layer in zip(weights, self.layers)] + return sum(x) diff --git a/common/darts/modules/network.py b/common/darts/modules/network.py new file mode 100644 index 00000000..26d4c490 --- /dev/null +++ b/common/darts/modules/network.py @@ -0,0 +1,178 @@ +from typing import Dict, List + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from darts.api import Model +from darts.modules import Cell +from darts.modules.classifier import MultitaskClassifier +from darts.genotypes import Genotype + + +class Hyperparameters: + c = 1 + num_nodes = 2 + num_cells = 3 + channel_multiplier = 1 + + +class Network(Model): + """ Collection of cells """ + + def __init__(self, + stem: nn.Module, + cell_dim: int, + primitives: List[str], + ops: Dict[], + tasks: Dict[str, int], + criterion, + device: str = 'cpu', + hyperparams=Hyperparameters()): + super(Network, self).__init__() + self.primitives = primitives + self.ops = ops + self.cell_dim = cell_dim + self.tasks = tasks + self.criterion = criterion + self.device = device + self.num_cells = hyperparams.num_cells + self.num_nodes = hyperparams.num_nodes + + self.stem = stem + + # c_curr means a factor of the output channels of current cell + c_curr = cell_dim * hyperparams.channel_multiplier * hyperparams.c + cpp, cp, c_curr = c_curr, c_curr, self.c + self.cells = nn.ModuleList() + for i in range(hyperparams.num_cells): + + cell = Cell( + hyperparams.num_nodes, + hyperparams.channel_multiplier, + cpp, + cp, + c_curr + ).to(self.device) + + self.cells += [cell] + + self.classifier = MultitaskClassifier(cell_dim, tasks) + + # k is the total number of edges inside single cell, 14 + k = sum(1 for i in range(self.num_nodes) for j in range(2 + i)) + num_ops = len(self.primitives) + + self.alpha_normal = nn.Parameter(torch.randn(k, num_ops)) + + with torch.no_grad(): + # initialize to smaller value + self.alpha_normal.mul_(1e-3) + + self._arch_parameters = [ + self.alpha_normal, + ] + + def new(self): + """ Create a new model initialzed with current alpha parameters. + + Weights are left untouched. + + Returns + ------- + model : Network + New model initialized with current alpha. 
+ """ + model = Network( + self.stem, + self.cell_dim, + self.primitives, + self.ops, + self.tasks, + self.criterion + ).to(self.device) + + for x, y in zip(model.arch_parameters(), self.arch_parameters()): + x.data.copy_(y.data) + + return model + + def forward(self, x): + # s0 & s1 means the last cells' output + s0 = s1 = self.stem(x) # [b, 3, 32, 32] => [b, 48, 32, 32] + + for i, cell in enumerate(self.cells): + weights = F.softmax(self.alpha_normal, dim=-1) # [14, 8] + # execute cell() firstly and then assign s0=s1, s1=result + s0, out = s1, cell(s0, s1, weights) # [40, 64, 32, 32] + + logits = self.classifier(out.view(out.size(0), -1)) + + return logits + + def loss(self, data, target, reduce='mean'): + """ Calculate a value of loss function """ + logits = self(data) + + for task, logit in logits.items(): + logits[task] = logit.to(self.device) + + losses = {} + for task, label in target.items(): + label = label.to(self.device) + losses[task] = self.criterion(logits[task], label) + + if reduce: + total = 0 + for _, value in losses.items(): + total += value + + if reduce == "mean": + losses = total / len(losses) + elif reduce == "sum": + losses = total + else: + raise ValueError('Reduced loss must use either `mean` or `sum`!') + + return losses + + def arch_parameters(self): + return self._arch_parameters + + def genotype(self): + """ + :return: + """ + def _parse(weights): + gene = [] + n = 2 + start = 0 + for i in range(self.num_nodes): # for each node + end = start + n + W = weights[start:end].copy() + edges = sorted(range(i + 2), # i+2 is the number of connection for node i + key=lambda x: -max(W[x][k] # by descending order + for k in range(len(W[x])) # get strongest ops + if k != self.primitives.index('none')) + )[:2] # only has two inputs + for j in edges: # for every input nodes j of current node i + k_best = None + for k in range(len(W[j])): # get strongest ops for current input j->i + if k != self.primitives.index('none'): + if k_best is None or W[j][k] > W[j][k_best]: + k_best = k + gene.append((self.primitives[k_best], j)) # save ops and input node + start = end + n += 1 + return gene + + gene_normal = _parse(F.softmax(self.alpha_normal, dim=-1).data.cpu().numpy()) + concat = range(2 + self.num_nodes - self.channel_multiplier, self.num_nodes + 2) + + genotype = Genotype( + normal=gene_normal, normal_concat=concat, + reduce=gene_normal, reduce_concat=concat + ) + + return genotype + From c84ba7379c09a7042fe4a74dc418b5f9cadcf9b9 Mon Sep 17 00:00:00 2001 From: Cristina Garcia Cardona Date: Thu, 9 Apr 2020 12:47:22 -0600 Subject: [PATCH 229/331] fixed bug on uno baseline related to dropout keyword --- Pilot1/Uno/uno_baseline_keras2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Pilot1/Uno/uno_baseline_keras2.py b/Pilot1/Uno/uno_baseline_keras2.py index 976f3a4f..f6308104 100644 --- a/Pilot1/Uno/uno_baseline_keras2.py +++ b/Pilot1/Uno/uno_baseline_keras2.py @@ -200,7 +200,7 @@ def on_train_end(self, logs={}): def build_model(loader, args, permanent_dropout=True, silent=False): input_models = {} - dropout_rate = args.drop + dropout_rate = args.dropout for fea_type, shape in loader.feature_shapes.items(): base_type = fea_type.split('.')[0] if base_type in ['cell', 'drug']: From e2faa91cc185f075205e116a7467361e4de0f89c Mon Sep 17 00:00:00 2001 From: yngtodd Date: Thu, 9 Apr 2020 15:22:24 -0400 Subject: [PATCH 230/331] Add random and uno datasets This will give us some easily accessible data. 
--- common/darts/api/__init__.py | 1 + common/darts/api/dataset.py | 73 ++++++++++++ common/darts/datasets/random.py | 39 +++++++ common/darts/datasets/uno.py | 199 ++++++++++++++++++++++++++++++++ 4 files changed, 312 insertions(+) create mode 100644 common/darts/api/dataset.py create mode 100644 common/darts/datasets/random.py create mode 100644 common/darts/datasets/uno.py diff --git a/common/darts/api/__init__.py b/common/darts/api/__init__.py index 3b4d86eb..e9cb61d9 100644 --- a/common/darts/api/__init__.py +++ b/common/darts/api/__init__.py @@ -1 +1,2 @@ from .model import Model +from .dataset import InMemoryDataset diff --git a/common/darts/api/dataset.py b/common/darts/api/dataset.py new file mode 100644 index 00000000..157835b5 --- /dev/null +++ b/common/darts/api/dataset.py @@ -0,0 +1,73 @@ +from abc import abstractmethod +import pandas as pd + + +class Dataset: + """ Abstract dataset - Used for both Keras and Pytorch""" + + @abstractmethod + def __getitem__(self, idx): + """Gets batch at position `index`. + Parameters + ---------- + idx: index position of the batch in the data. + Returns + ------- + A batch + """ + raise NotImplementedError + + @abstractmethod + def __len__(self): + """Length of the dataset. + Returns + ------- + The number of samples in the data. + """ + raise NotImplementedError + + def on_epoch_end(self): + """ Keras method called at the end of every epoch. """ + pass + + def __iter__(self): + """Create a generator that iterates over the data.""" + for item in (self[i] for i in range(len(self))): + yield item + + +class InMemoryDataset(Dataset): + """ Abstract class for in memory data """ + + def load_data(self): + """ Load data and labels """ + raise NotImplementedError + + def dataframe(self): + """ Load the data as a pd.DataFrame """ + data, labels = self.load_data() + + if isinstance(labels, dict): + # We are in the multitask case + data_dict = {'data': data} + for key, value in labels.items(): + data_dict[key] = value + else: + data_dict = {'data': data, 'labels': labels} + + return pd.DataFrame(data_dict) + + def to_csv(self, path): + """ Save the data to disk """ + self.dataframe().to_csv(path, index=False) + + def load_cached(self, path): + """ Load the data from disk """ + frame = pd.read_csv(path) + + self.data = frame.pop('data') + + if len(frame.columns) > 1: + self.labels = frame.to_dict() + else: + self.labels = frame['labels'] diff --git a/common/darts/datasets/random.py b/common/darts/datasets/random.py new file mode 100644 index 00000000..e153a525 --- /dev/null +++ b/common/darts/datasets/random.py @@ -0,0 +1,39 @@ +import numpy as np +from typing import Dict +from torch.utils.data import Dataset + + +class RandomData(Dataset): + """ Random dataset - Useful for quick iterating """ + + def __init__(self, x_dim: int, num_samples: int, tasks: Dict[str, int], seed: int=13): + np.random.seed(seed) + self.data = self.create_data(x_dim, num_samples) + self.labels = self.create_labels(tasks, num_samples) + + def create_data(self, x_dim, num_samples): + data = [np.random.randn(x_dim).astype('f') for _ in range(num_samples)] + return np.stack(data) + + def create_labels(self, tasks, num_samples): + labels = {} + for task, num_classes in tasks.items(): + labels[task] = np.random.randint(num_classes, size=num_samples) + + return labels + + def index_labels(self, idx): + """ Index into the labels """ + return {key: value[idx] for key, value in self.labels.items()} + + def load_data(self): + return self.data, self.labels + + def __repr__(self): + return 
f'Random supervised dataset' + + def __len__(self): + return len(self.data) + + def __getitem__(self, idx): + return self.data[idx], self.index_labels(idx) diff --git a/common/darts/datasets/uno.py b/common/darts/datasets/uno.py new file mode 100644 index 00000000..4109f979 --- /dev/null +++ b/common/darts/datasets/uno.py @@ -0,0 +1,199 @@ +import os +import torch + +import numpy as np +import pandas as pd + +from darts.api import InMemoryDataset +from darts.datasets.utils import ( + download_url, makedir_exist_ok +) + + +class Uno(InMemoryDataset): + """Uno Dataset + + Parameters + ---------- + root str : + Root directory of dataset where ``processed/training.npy`` + ``processed/validation.npy and ``processed/test.npy`` exist. + + partition : str + dataset partition to be loaded. + Either 'train', 'validation', or 'test'. + + download : bool, optional + If true, downloads the dataset from the internet and + puts it in root directory. If dataset is already downloaded, it is not + downloaded again. + """ + urls = [ + 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/top_21_auc_1fold.uno.h5', + ] + + training_data_file = 'train_data.pt' + training_label_file = 'train_labels.pt' + test_data_file = 'test_data.pt' + test_label_file = 'test_labels.pt' + + def __init__(self, root, partition, transform=None, + target_transform=None, download=False): + self.root = os.path.expanduser(root) + self.transform = transform + self.target_transform = target_transform + + if download: + self.download() + + if not self._check_exists(): + raise RuntimeError('Dataset not found.' + + ' You can use download=True to download it') + + self.partition = partition + if self.partition == 'train': + data_file = self.training_data_file + label_file = self.training_label_file + elif self.partition == 'test': + data_file = self.test_data_file + label_file = self.test_label_file + else: + raise ValueError("Partition must either be 'train' or 'test'.") + + self.data = torch.load(os.path.join(self.processed_folder, data_file)) + self.targets = torch.load(os.path.join(self.processed_folder, label_file)) + + def __len__(self): + return len(self.data['gene_data']) + + def load_data(self): + return self.data, self.targets + + def read_data(self, data_file, partition): + """ Read in the H5 data """ + if partition == 'train': + gene_data = 'x_train_0' + drug_data = 'x_train_1' + else: + gene_data = 'x_val_0' + drug_data = 'x_val_1' + + gene_data = torch.tensor(pd.read_hdf(data_file, gene_data).values) + drug_data = torch.tensor(pd.read_hdf(data_file, drug_data).values) + data = {'gene_data': gene_data, 'drug_data': drug_data} + + return data + + def read_targets(self, data_file, partition): + """Get dictionary of targets specified by user.""" + if partition == 'train': + label = 'y_train' + else: + label = 'y_val' + + tasks = { + 'response': torch.tensor( + pd.read_hdf(data_file, label)['AUC'].apply(lambda x: 1 if x < 0.5 else 0) + ) + } + + return tasks + + def __getitem__(self, idx): + """ + Parameters + ---------- + index : int + Index of the data to be loaded. + + Returns + ------- + (document, target) : tuple + where target is index of the target class. 
+ """ + data = self.data['gene_data'][idx] + + if self.transform is not None: + data = self.transform(data) + + targets = {} + for key, value in self.targets.items(): + subset = value[idx] + + if self.target_transform is not None: + subset = self.target_transform(subset) + + targets[key] = subset + + return data, targets + + @property + def raw_folder(self): + return os.path.join(self.root, self.__class__.__name__, 'raw') + + @property + def processed_folder(self): + return os.path.join(self.root, self.__class__.__name__, 'processed') + + def _check_exists(self): + return os.path.exists(os.path.join(self.processed_folder, self.training_data_file)) and \ + os.path.exists(os.path.join(self.processed_folder, self.training_label_file)) and \ + os.path.exists(os.path.join(self.processed_folder, self.test_data_file)) and \ + os.path.exists(os.path.join(self.processed_folder, self.test_label_file)) + + @staticmethod + def extract_array(path, remove_finished=False): + print('Extracting {}'.format(path)) + arry = np.load(path) + if remove_finished: + os.unlink(path) + + def download(self): + """Download the Synthetic data if it doesn't exist in processed_folder already.""" + + if self._check_exists(): + return + + makedir_exist_ok(self.raw_folder) + makedir_exist_ok(self.processed_folder) + + # download files + for url in self.urls: + filename = url.rpartition('/')[2] + file_path = os.path.join(self.raw_folder, filename) + download_url(url, root=self.raw_folder, filename=filename, md5=None) + #self.extract_array(path=file_path, remove_finished=False) + + # process and save as numpy files + print('Processing...') + + training_set = ( + self.read_data(os.path.join(self.raw_folder, 'top_21_auc_1fold.uno.h5'), 'train'), + self.read_targets(os.path.join(self.raw_folder, 'top_21_auc_1fold.uno.h5'), 'train') + ) + test_set = ( + self.read_data(os.path.join(self.raw_folder, 'top_21_auc_1fold.uno.h5'), 'test'), + self.read_targets(os.path.join(self.raw_folder, 'top_21_auc_1fold.uno.h5'), 'test') + ) + + # Save processed training data + train_data_path = os.path.join(self.processed_folder, self.training_data_file) + torch.save(training_set[0], train_data_path) + train_label_path = os.path.join(self.processed_folder, self.training_label_file) + torch.save(training_set[1], train_label_path) + + # Save processed test data + test_data_path = os.path.join(self.processed_folder, self.test_data_file) + torch.save(test_set[0], test_data_path) + test_label_path = os.path.join(self.processed_folder, self.test_label_file) + torch.save(test_set[1], test_label_path) + + print('Done!') + + def __repr__(self): + fmt_str = 'Dataset ' + self.__class__.__name__ + '\n' + fmt_str += ' Number of datapoints: {}\n'.format(self.__len__()) + tmp = self.partition + fmt_str += ' Split: {}\n'.format(tmp) + fmt_str += ' Root Location: {}\n'.format(self.root) + return fmt_str From 78715bd30567be63a6e877d1bda843d4b55087a5 Mon Sep 17 00:00:00 2001 From: yngtodd Date: Thu, 9 Apr 2020 15:26:32 -0400 Subject: [PATCH 231/331] Add dataset utilities. This is helpful in particular for the Uno data that Harry shared. 
--- common/darts/datasets/utils.py | 119 +++++++++++++++++++++++++++++++++ 1 file changed, 119 insertions(+) create mode 100644 common/darts/datasets/utils.py diff --git a/common/darts/datasets/utils.py b/common/darts/datasets/utils.py new file mode 100644 index 00000000..39b5a417 --- /dev/null +++ b/common/darts/datasets/utils.py @@ -0,0 +1,119 @@ +import os +import os.path +import hashlib +import errno +from tqdm import tqdm + + +def gen_bar_updater(pbar): + def bar_update(count, block_size, total_size): + if pbar.total is None and total_size: + pbar.total = total_size + progress_bytes = count * block_size + pbar.update(progress_bytes - pbar.n) + + return bar_update + + +def check_integrity(fpath, md5=None): + if md5 is None: + return True + if not os.path.isfile(fpath): + return False + md5o = hashlib.md5() + with open(fpath, 'rb') as f: + # read in 1MB chunks + for chunk in iter(lambda: f.read(1024 * 1024), b''): + md5o.update(chunk) + md5c = md5o.hexdigest() + if md5c != md5: + return False + return True + + +def makedir_exist_ok(dirpath): + """ + Python2 support for os.makedirs(.., exist_ok=True) + """ + try: + os.makedirs(dirpath) + except OSError as e: + if e.errno == errno.EEXIST: + pass + else: + raise + + +def download_url(url, root, filename, md5): + from six.moves import urllib + + root = os.path.expanduser(root) + fpath = os.path.join(root, filename) + + makedir_exist_ok(root) + + # downloads file + if os.path.isfile(fpath) and check_integrity(fpath, md5): + print('Using downloaded and verified file: ' + fpath) + else: + try: + print('Downloading ' + url + ' to ' + fpath) + urllib.request.urlretrieve( + url, fpath, + reporthook=gen_bar_updater(tqdm(unit='B', unit_scale=True)) + ) + except: + if url[:5] == 'https': + url = url.replace('https:', 'http:') + print('Failed download. Trying https -> http instead.' + ' Downloading ' + url + ' to ' + fpath) + urllib.request.urlretrieve( + url, fpath, + reporthook=gen_bar_updater(tqdm(unit='B', unit_scale=True)) + ) + + +def list_dir(root, prefix=False): + """List all directories at a given root + + Args: + root (str): Path to directory whose folders need to be listed + prefix (bool, optional): If true, prepends the path to each result, otherwise + only returns the name of the directories found + """ + root = os.path.expanduser(root) + directories = list( + filter( + lambda p: os.path.isdir(os.path.join(root, p)), + os.listdir(root) + ) + ) + + if prefix is True: + directories = [os.path.join(root, d) for d in directories] + + return directories + + +def list_files(root, suffix, prefix=False): + """List all files ending with a suffix at a given root + + Args: + root (str): Path to directory whose folders need to be listed + suffix (str or tuple): Suffix of the files to match, e.g. '.png' or ('.jpg', '.png'). + It uses the Python "str.endswith" method and is passed directly + prefix (bool, optional): If true, prepends the path to each result, otherwise + only returns the name of the files found + """ + root = os.path.expanduser(root) + files = list( + filter( + lambda p: os.path.isfile(os.path.join(root, p)) and p.endswith(suffix), + os.listdir(root) + ) + ) + + if prefix is True: + files = [os.path.join(root, d) for d in files] + + return files From 8e32f5bc8dc4928a5b845fd4acd34724dc558a39 Mon Sep 17 00:00:00 2001 From: yngtodd Date: Thu, 9 Apr 2020 15:42:29 -0400 Subject: [PATCH 232/331] Add sampling for datasets. This gives a convenient way to subset datasets. 
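
For completeness, a sketch of how the download helpers just added compose, reusing the Uno
URL already declared in `darts/datasets/uno.py`; the local root is illustrative:

    # Assumed usage of the helpers in darts/datasets/utils.py.
    import os
    from darts.datasets.utils import download_url, check_integrity, list_files, makedir_exist_ok

    url = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/top_21_auc_1fold.uno.h5'
    root = './data/raw'                  # illustrative location
    filename = url.rpartition('/')[2]

    makedir_exist_ok(root)               # mkdir -p with Python 2 support
    download_url(url, root=root, filename=filename, md5=None)

    # with md5=None the integrity check skips hashing and simply returns True
    print(check_integrity(os.path.join(root, filename), md5=None))
    print(list_files(root, suffix='.h5', prefix=True))
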
--- common/darts/api/dataset.py | 22 ++++++++++++++++++++++ common/darts/datasets/sample.py | 15 +++++++++++++++ 2 files changed, 37 insertions(+) create mode 100644 common/darts/datasets/sample.py diff --git a/common/darts/api/dataset.py b/common/darts/api/dataset.py index 157835b5..488b7ce2 100644 --- a/common/darts/api/dataset.py +++ b/common/darts/api/dataset.py @@ -71,3 +71,25 @@ def load_cached(self, path): self.labels = frame.to_dict() else: self.labels = frame['labels'] + + +class Subset(InMemoryDataset): + """Subset of a dataset at specified indices. + + Args: + dataset (Dataset): The dataset to be subsetted + indices (sequence): Indices in the whole set selected for subset + """ + def __init__(self, dataset, indices): + self.dataset = dataset + self.indices = indices + + def __getitem__(self, idx): + return self.dataset[self.indices[idx]] + + def __len__(self): + return len(self.indices) + + def load_data(self): + return self.dataset[self.indices] + diff --git a/common/darts/datasets/sample.py b/common/darts/datasets/sample.py new file mode 100644 index 00000000..70880405 --- /dev/null +++ b/common/darts/datasets/sample.py @@ -0,0 +1,15 @@ +from sklearn.utils import resample +from darts.api.dataset import Subset + + +def dummy_indices(dataset): + """ Get indexes for the dataset """ + return [x for x in range(len(dataset))] + + +def sample(dataset, num_samples, replace=True): + """ Sample the dataset """ + data_idx = dummy_indices(dataset) + sample_idx = resample(data_idx, n_samples=num_samples, replace=replace) + return Subset(dataset, sample_idx) + From fd2c1021d087f480c80f8ca29dfbfe223502a0eb Mon Sep 17 00:00:00 2001 From: yngtodd Date: Thu, 9 Apr 2020 22:27:36 -0400 Subject: [PATCH 233/331] Start example This gives us the first example using Darts for the Uno model. 
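
The example below leans on the pieces from the last few patches; in isolation they compose
like this (data root illustrative). Bootstrapping the training split down to the validation
split's size keeps the two loaders comparable in length per epoch:

    # Assumed usage of darts/datasets/sample.py together with the Uno dataset.
    from torch.utils.data import DataLoader
    from darts.datasets.uno import Uno
    from darts.datasets.sample import sample

    train_data = Uno('./data', 'train', download=True)
    valid_data = Uno('./data', 'test')

    # bootstrap-resample the training set to match the validation set's size
    small_train = sample(train_data, num_samples=len(valid_data), replace=True)
    assert len(small_train) == len(valid_data)

    trainloader = DataLoader(small_train, batch_size=32)
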
--- common/darts/__init__.py | 7 ++ common/darts/meters/accuracy.py | 15 ++- common/darts/meters/epoch.py | 40 +++++++ examples/darts/README.rst | 3 + examples/darts/uno/README.rst | 3 + examples/darts/uno/uno.py | 186 ++++++++++++++++++++++++++++++++ 6 files changed, 253 insertions(+), 1 deletion(-) create mode 100644 common/darts/meters/epoch.py create mode 100644 examples/darts/README.rst create mode 100644 examples/darts/uno/README.rst create mode 100644 examples/darts/uno/uno.py diff --git a/common/darts/__init__.py b/common/darts/__init__.py index c98ae304..069609d6 100644 --- a/common/darts/__init__.py +++ b/common/darts/__init__.py @@ -17,6 +17,13 @@ from .meters.accuracy import MultitaskAccuracyMeter from .utils.logging import log_accuracy +from darts.meters.epoch import EpochMeter +from darts.meters.accuracy import MultitaskAccuracyMeter +from darts.utils.tensor import to_device +from darts.utils.random import SeedControl +from darts.utils.logging import log_accuracy +from darts.utils.tensor import to_device + from .functional import ( multitask_loss, multitask_loss, multitask_accuracy ) diff --git a/common/darts/meters/accuracy.py b/common/darts/meters/accuracy.py index d3089454..7e2670e2 100644 --- a/common/darts/meters/accuracy.py +++ b/common/darts/meters/accuracy.py @@ -1,3 +1,6 @@ +import os +import pandas as pd + from darts.meters.average import AverageMeter @@ -25,5 +28,15 @@ def get_accuracy(self, task): def update(self, accuracies, batch_size): for task, acc in accuracies.items(): - self.meters[task].update(acc, batch_size) + self.meters[task].update(acc[0].item(), batch_size) + + def dataframe(self): + """ Get a dataframe of all task accuracies """ + avg_accuracy = {k: v.avgs for (k, v) in self.meters.items()} + return pd.DataFrame(avg_accuracy) + + def save(self, path, filename): + """ Save the task accuracies as a csv """ + path = os.path.join(path, f'{filename}_accuracy.csv') + self.dataframe().to_csv(path, index=False) diff --git a/common/darts/meters/epoch.py b/common/darts/meters/epoch.py new file mode 100644 index 00000000..b20ee4ac --- /dev/null +++ b/common/darts/meters/epoch.py @@ -0,0 +1,40 @@ +import os +import pandas as pd + +from darts.meters.average import AverageMeter +from darts.meters.accuracy import MultitaskAccuracyMeter + + +class EpochMeter: + """ Track epoch loss and accuracy """ + + def __init__(self, tasks, name='train'): + self.name = name + self.loss_meter = AverageMeter(name) + self.acc_meter = MultitaskAccuracyMeter(tasks) + self.reset() + + def reset(self): + self.loss = [] + self.acc = { task: [] for task, _ in self.acc_meter.meters.items() } + + def update_batch_loss(self, loss, batch_size): + self.loss_meter.update(loss, batch_size) + + def update_batch_accuracy(self, acc, batch_size): + self.acc_meter.update(acc, batch_size) + + def update_epoch(self): + self.loss.append(self.loss_meter.avg) + for task, acc in self.acc_meter.meters.items(): + self.acc[task].append(acc.avg) + + def dataframe(self): + results = self.acc + results['loss'] = self.loss + return pd.DataFrame(results) + + def save(self, path): + os.makedirs(path, exist_ok=True) + path = os.path.join(path, f'{self.name}_epoch_results') + self.dataframe().to_csv(path, index=False) diff --git a/examples/darts/README.rst b/examples/darts/README.rst new file mode 100644 index 00000000..ef49aead --- /dev/null +++ b/examples/darts/README.rst @@ -0,0 +1,3 @@ +============== +DARTS Examples +============== diff --git a/examples/darts/uno/README.rst b/examples/darts/uno/README.rst new 
file mode 100644 index 00000000..8ee51b54 --- /dev/null +++ b/examples/darts/uno/README.rst @@ -0,0 +1,3 @@ +========= +DARTS UNO +========= diff --git a/examples/darts/uno/uno.py b/examples/darts/uno/uno.py new file mode 100644 index 00000000..54b0c172 --- /dev/null +++ b/examples/darts/uno/uno.py @@ -0,0 +1,186 @@ +import torch +import torch.nn as nn +from torch import optim +import torch.nn.functional as F +from torch.utils.data import DataLoader + +import uno as bmk +import candle +import darts + +from uno_darts import train, infer + + +def initialize_parameters(): + """ Initialize the parameters for the Uno example """ + + uno_example = bmk.UnoExample( + bmk.file_path, + 'uno_default_model.txt', + 'pytorch', + prog='uno_example', + desc='Differentiable Architecture Search - Uno example', + ) + + # Initialize parameters + gParameters = candle.finalize_parameters(p3b5_bench) + return gParameters + + +def run(params): + args = candle.ArgumentStruct(**params) + + args.cuda = torch.cuda.is_available() + device = torch.device(f"cuda" if args.cuda else "cpu") + darts.banner(device=device) + + train_data = Uno(args.datapath, 'train', download=True) + valid_data = Uno(args.datapath, 'test') + + train_data = sample(train_data, len(valid_data)) + + trainloader = DataLoader(train_data, batch_size=args.batch_size) + validloader = DataLoader(valid_data, batch_size=args.batch_size) + + criterion = nn.CrossEntropyLoss().to(device) + + tasks = { + 'response': 2, + } + + model = darts.LinearNetwork( + tasks=tasks, criterion=criterion, device=device + ).to(device) + + architecture = darts.Architecture(model, args, device=device) + + optimizer = optim.SGD( + model.parameters(), + args.lr, + momentum=args.momentum, + weight_decay=args.wd + ) + + scheduler = optim.lr_scheduler.CosineAnnealingLR( + optimizer, + float(args.epochs), + eta_min=args.lr_min + ) + + train_meter = EpochMeter(tasks, 'train') + valid_meter = EpochMeter(tasks, 'valid') + + for epoch in range(args.epochs): + + scheduler.step() + lr = scheduler.get_lr()[0] + logger.info(f'\nEpoch: {epoch} lr: {lr}') + + genotype = model.genotype() + logger.info(f'Genotype: {genotype}\n') + + train( + trainloader, + validloader, + model, + architecture, + criterion, + optimizer, + lr, + args, + tasks, + train_meter, + device + ) + + validate(validloader, model, criterion, args, tasks, valid_meter, device) + + +def train(trainloader, + validloader, + model, + architecture, + criterion, + optimizer, + lr, + args, + tasks, + meter, + device): + + valid_iter = iter(trainloader) + + for step, (data, target) in enumerate(trainloader): + + batch_size = data.size(0) + model.train() + + data = to_device(data, device) + target = to_device(target, device) + + x_search, target_search = next(valid_iter) + x_search = to_device(x_search, device) + target_search = to_device(target_search, device) + + # 1. update alpha + architecture.step( + data, + target, + x_search, + target_search, + lr, + optimizer, + unrolled=args.unrolled + ) + + logits = model(data) + loss = multitask_loss(target, logits, criterion, reduce='mean') + + # 2. 
update weight + optimizer.zero_grad() + loss.backward() + nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip) + optimizer.step() + + prec1 = multitask_accuracy_topk(logits, target, topk=(1,)) + meters.update_batch_loss(loss.item(), batch_size) + meters.update_batch_accuracy(prec1, batch_size) + + if step % args.log_interval == 0: + logger.info(f'Step: {step} loss: {meters.loss_meter.avg:.4}') + + meters.update_epoch() + meters.save(args.experimentpath) + + +def validate(validloader, model, criterion, args, tasks, meters, device): + model.eval() + with torch.no_grad(): + for step, (data, target) in enumerate(validloader): + + data = to_device(data, device) + target = to_device(target, device) + + batch_size = data.size(0) + + logits = model(data) + loss = multitask_loss(target, logits, criterion, reduce='mean') + + prec1 = multitask_accuracy_topk(logits, target, topk=(1,)) + meters.update_batch_loss(loss.item(), batch_size) + meters.update_batch_accuracy(prec1, batch_size) + + if step % args.log_interval == 0: + logger.info(f'>> Validation: {step} loss: {meters.loss_meter.avg:.4}') + + meters.update_epoch() + meters.save(args.experimentpath) + + +def main(): + params = initialize_parameters() + run(params) + + +if __name__=='__main__': + main() From d6dd1df31867abcd75e71924f6738858ae061f73 Mon Sep 17 00:00:00 2001 From: yngtodd Date: Thu, 9 Apr 2020 23:45:04 -0400 Subject: [PATCH 234/331] Refactor example Just tidying up a bit. --- examples/darts/uno/example_setup.py | 37 +++++++++++++++++++ examples/darts/uno/{uno.py => uno_example.py} | 20 +++++----- 2 files changed, 46 insertions(+), 11 deletions(-) create mode 100644 examples/darts/uno/example_setup.py rename examples/darts/uno/{uno.py => uno_example.py} (91%) diff --git a/examples/darts/uno/example_setup.py b/examples/darts/uno/example_setup.py new file mode 100644 index 00000000..5c7a8c62 --- /dev/null +++ b/examples/darts/uno/example_setup.py @@ -0,0 +1,37 @@ +import os +import sys + + +file_path = os.path.dirname(os.path.realpath(__file__)) +lib_path2 = os.path.abspath(os.path.join(file_path, '..', '..', 'common')) +sys.path.append(lib_path2) + + +import candle + + +REQUIRED = [ + 'learning_rate', + 'learning_rate_min', + 'momentum', + 'weight_decay', + 'grad_clip', + 'seed', + 'unrolled', + 'batch_size', + 'epochs', +] + + +class UnoExample(candle.Benchmark): + """ Example for Uno """ + + def set_locals(self): + """ Set parameters for the benchmark. + + Args: + required: set of required parameters for the benchmark. 
+ """ + if REQUIRED is not None: + self.required = set(REQUIRED) + diff --git a/examples/darts/uno/uno.py b/examples/darts/uno/uno_example.py similarity index 91% rename from examples/darts/uno/uno.py rename to examples/darts/uno/uno_example.py index 54b0c172..d31d345d 100644 --- a/examples/darts/uno/uno.py +++ b/examples/darts/uno/uno_example.py @@ -1,14 +1,13 @@ import torch import torch.nn as nn from torch import optim -import torch.nn.functional as F from torch.utils.data import DataLoader -import uno as bmk -import candle import darts +import candle +import example_setup as bmk -from uno_darts import train, infer +from uno_darts import train, validate def initialize_parameters(): @@ -34,8 +33,8 @@ def run(params): device = torch.device(f"cuda" if args.cuda else "cpu") darts.banner(device=device) - train_data = Uno(args.datapath, 'train', download=True) - valid_data = Uno(args.datapath, 'test') + train_data = darts.Uno(args.datapath, 'train', download=True) + valid_data = darts.Uno(args.datapath, 'test') train_data = sample(train_data, len(valid_data)) @@ -67,8 +66,8 @@ def run(params): eta_min=args.lr_min ) - train_meter = EpochMeter(tasks, 'train') - valid_meter = EpochMeter(tasks, 'valid') + train_meter = darts.EpochMeter(tasks, 'train') + valid_meter = darts.EpochMeter(tasks, 'valid') for epoch in range(args.epochs): @@ -81,7 +80,6 @@ def run(params): train( trainloader, - validloader, model, architecture, criterion, @@ -150,7 +148,7 @@ def train(trainloader, logger.info(f'Step: {step} loss: {meters.loss_meter.avg:.4}') meters.update_epoch() - meters.save(args.experimentpath) + meters.save(args.results_path) def validate(validloader, model, criterion, args, tasks, meters, device): @@ -174,7 +172,7 @@ def validate(validloader, model, criterion, args, tasks, meters, device): logger.info(f'>> Validation: {step} loss: {meters.loss_meter.avg:.4}') meters.update_epoch() - meters.save(args.experimentpath) + meters.save(args.results_path) def main(): From d4c9c111b1c8894194113a867d1405c2dfdab6b7 Mon Sep 17 00:00:00 2001 From: yngtodd Date: Fri, 10 Apr 2020 02:14:52 -0400 Subject: [PATCH 235/331] Add default model parameters This follows along the benchmark examples. 
--- common/darts/__init__.py | 15 ++++-------- examples/darts/uno/default_model.txt | 15 ++++++++++++ examples/darts/uno/example_setup.py | 2 +- examples/darts/uno/uno_example.py | 34 ++++++++++++++++++++-------- 4 files changed, 45 insertions(+), 21 deletions(-) create mode 100644 examples/darts/uno/default_model.txt diff --git a/common/darts/__init__.py b/common/darts/__init__.py index 069609d6..e6e43756 100644 --- a/common/darts/__init__.py +++ b/common/darts/__init__.py @@ -12,17 +12,12 @@ # Utilities that are not neccessary from .datasets.p3b3 import P3B3 +from .datasets.uno import Uno +from .datasets.sample import sample from .api.config import banner -from .meters.average import AverageMeter -from .meters.accuracy import MultitaskAccuracyMeter -from .utils.logging import log_accuracy - -from darts.meters.epoch import EpochMeter -from darts.meters.accuracy import MultitaskAccuracyMeter -from darts.utils.tensor import to_device -from darts.utils.random import SeedControl -from darts.utils.logging import log_accuracy -from darts.utils.tensor import to_device +from .meters.epoch import EpochMeter +from .utils.tensor import to_device +from .utils.random import SeedControl from .functional import ( multitask_loss, multitask_loss, multitask_accuracy diff --git a/examples/darts/uno/default_model.txt b/examples/darts/uno/default_model.txt new file mode 100644 index 00000000..2332dc39 --- /dev/null +++ b/examples/darts/uno/default_model.txt @@ -0,0 +1,15 @@ +[Global_Params] +model_name = 'darts_uno' +unrolled = False +data_url = 'ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/' +savepath = '.' +log_interval = 10 +train_data = 'top_21_auc_1fold.uno.h5' +learning_rate = 0.01 +learning_rate_min = 0.001 +momentum = 0.9 +weight_decay = 3e-4 +grad_clip = 5 +batch_size = 100 +epochs = 10 +seed = 13 diff --git a/examples/darts/uno/example_setup.py b/examples/darts/uno/example_setup.py index 5c7a8c62..7d634b08 100644 --- a/examples/darts/uno/example_setup.py +++ b/examples/darts/uno/example_setup.py @@ -3,7 +3,7 @@ file_path = os.path.dirname(os.path.realpath(__file__)) -lib_path2 = os.path.abspath(os.path.join(file_path, '..', '..', 'common')) +lib_path2 = os.path.abspath(os.path.join(file_path, '..', '..', '..', 'common')) sys.path.append(lib_path2) diff --git a/examples/darts/uno/uno_example.py b/examples/darts/uno/uno_example.py index d31d345d..189eda92 100644 --- a/examples/darts/uno/uno_example.py +++ b/examples/darts/uno/uno_example.py @@ -3,11 +3,9 @@ from torch import optim from torch.utils.data import DataLoader +import example_setup as bmk import darts import candle -import example_setup as bmk - -from uno_darts import train, validate def initialize_parameters(): @@ -15,17 +13,31 @@ def initialize_parameters(): uno_example = bmk.UnoExample( bmk.file_path, - 'uno_default_model.txt', + 'default_model.txt', 'pytorch', prog='uno_example', desc='Differentiable Architecture Search - Uno example', ) # Initialize parameters - gParameters = candle.finalize_parameters(p3b5_bench) + gParameters = candle.finalize_parameters(uno_example) return gParameters +def fetch_data(gParameters): + """ Download and untar data + + Args: + gParameters: parameters from candle + + Returns: + path to where the data is located + """ + path = gParameters['data_url'] + fpath = candle.fetch_file(path + gParameters['train_data'], 'UnoExample') + return fpath + + def run(params): args = candle.ArgumentStruct(**params) @@ -33,10 +45,12 @@ def run(params): device = torch.device(f"cuda" if args.cuda else "cpu") 
darts.banner(device=device) - train_data = darts.Uno(args.datapath, 'train', download=True) - valid_data = darts.Uno(args.datapath, 'test') + #datapath = fetch_data(params) + datapath = params['data_url'] + params['train_data'] + train_data = darts.Uno(datapath, 'train', download=True) + valid_data = darts.Uno(datapath, 'test') - train_data = sample(train_data, len(valid_data)) + train_data = darts.sample(train_data, len(valid_data)) trainloader = DataLoader(train_data, batch_size=args.batch_size) validloader = DataLoader(valid_data, batch_size=args.batch_size) @@ -148,7 +162,7 @@ def train(trainloader, logger.info(f'Step: {step} loss: {meters.loss_meter.avg:.4}') meters.update_epoch() - meters.save(args.results_path) + meters.save(args.savepath) def validate(validloader, model, criterion, args, tasks, meters, device): @@ -172,7 +186,7 @@ def validate(validloader, model, criterion, args, tasks, meters, device): logger.info(f'>> Validation: {step} loss: {meters.loss_meter.avg:.4}') meters.update_epoch() - meters.save(args.results_path) + meters.save(args.savepath) def main(): From 10279a71fb01415826122d10667a0ff0898bc2fc Mon Sep 17 00:00:00 2001 From: Jamal Date: Fri, 10 Apr 2020 12:24:12 -0600 Subject: [PATCH 236/331] Fixed missing keyword definitions in P3 benchmarks. --- Pilot3/P3B1/p3b1.py | 2 +- Pilot3/P3B1/p3b1_baseline_keras2.py | 20 ++++++++++---------- Pilot3/P3B1/p3b1_default_model.txt | 2 +- Pilot3/P3B2/p3b2.py | 2 +- Pilot3/P3B2/p3b2_baseline_keras2.py | 2 +- Pilot3/P3B2/p3b2_default_model.txt | 2 +- Pilot3/P3B3/p3b3.py | 26 +++++++++++++++++++++----- Pilot3/P3B3/p3b3_baseline_keras2.py | 13 +++++-------- Pilot3/P3B4/p3b4.py | 16 ++++++++++++++-- 9 files changed, 55 insertions(+), 30 deletions(-) diff --git a/Pilot3/P3B1/p3b1.py b/Pilot3/P3B1/p3b1.py index e082497d..c67b5a3d 100644 --- a/Pilot3/P3B1/p3b1.py +++ b/Pilot3/P3B1/p3b1.py @@ -59,7 +59,7 @@ ] -required = ['learning_rate', 'batch_size', 'epochs', 'drop', \ +required = ['learning_rate', 'batch_size', 'epochs', 'dropout', \ 'activation', 'out_activation', 'loss', 'optimizer', 'metrics', \ 'n_fold', 'scaling', 'initialization', 'shared_nnet_spec', \ 'ind_nnet_spec', 'feature_names'] diff --git a/Pilot3/P3B1/p3b1_baseline_keras2.py b/Pilot3/P3B1/p3b1_baseline_keras2.py index d1d469ca..53df1868 100644 --- a/Pilot3/P3B1/p3b1_baseline_keras2.py +++ b/Pilot3/P3B1/p3b1_baseline_keras2.py @@ -19,7 +19,7 @@ def initialize_parameters(default_model = 'p3b1_default_model.txt'): # Build benchmark object p3b1Bmk = bmk.BenchmarkP3B1(bmk.file_path, default_model, 'keras', prog='p3b1_baseline', desc='Multi-task (DNN) for data extraction from clinical reports - Pilot 3 Benchmark 1') - + # Initialize parameters gParameters = candle.finalize_parameters(p3b1Bmk) #bmk.logger.info('Params: {}'.format(gParameters)) @@ -35,7 +35,7 @@ def fetch_data(gParameters): path = gParameters['data_url'] fpath = candle.fetch_file(path + gParameters['train_data'], 'Pilot3', untar=True) - + return fpath @@ -43,7 +43,7 @@ def build_model(gParameters, kerasDefaults, shared_nnet_spec, individual_nnet_spec, input_dim, Y_train, Y_test, verbose=False): - + labels_train = [] labels_test = [] @@ -52,13 +52,13 @@ def build_model(gParameters, kerasDefaults, for l in range( len( Y_train ) ): truth_train = np.array( Y_train[l], dtype='int32' ) truth_test = np.array( Y_test[l], dtype='int32' ) - + mv = int( np.max( truth_train ) ) - + label_train = np.zeros( ( len( truth_train ), mv + 1 ) ) for i in range( len( truth_train ) ): label_train[ i, truth_train[ i ] ] = 1 - + 
label_test = np.zeros( ( len( truth_test ), mv + 1 ) ) for i in range( len(truth_test) ): label_test[ i, truth_test[ i ] ] = 1 @@ -81,8 +81,8 @@ def build_model(gParameters, kerasDefaults, for k in range( len( shared_nnet_spec ) ): layer = Dense( shared_nnet_spec[ k ], activation=gParameters['activation'], name= 'shared_layer_' + str( k ) )( shared_layers[ -1 ] ) - if gParameters['drop'] > 0: - layer = Dropout( gParameters['drop'] )( shared_layers[ -1 ] ) + if gParameters['dropout'] > 0: + layer = Dropout( gParameters['dropout'] )( shared_layers[ -1 ] ) shared_layers.append( layer ) @@ -100,8 +100,8 @@ def build_model(gParameters, kerasDefaults, layer = Dense( individual_nnet_spec[l][k], activation=gParameters['activation'], name= 'indiv_layer_' + str( l ) + '_' + str( k ) )( indiv_layers[-1] ) indiv_layers.append( layer ) - if gParameters['drop'] > 0: - layer = Dropout( gParameters['drop'] )( indiv_layers[-1] ) + if gParameters['dropout'] > 0: + layer = Dropout( gParameters['dropout'] )( indiv_layers[-1] ) indiv_layers.append( layer ) else: layer = Dense( n_out_nodes[l], activation=gParameters['out_activation'], diff --git a/Pilot3/P3B1/p3b1_default_model.txt b/Pilot3/P3B1/p3b1_default_model.txt index 9d93f422..41cb0c60 100644 --- a/Pilot3/P3B1/p3b1_default_model.txt +++ b/Pilot3/P3B1/p3b1_default_model.txt @@ -5,7 +5,7 @@ model_name = 'p3b1' learning_rate = 0.01 batch_size = 10 epochs = 10 -drop = 0.0 +dropout = 0.0 activation = 'relu' out_activation = 'softmax' loss = 'categorical_crossentropy' diff --git a/Pilot3/P3B2/p3b2.py b/Pilot3/P3B2/p3b2.py index 55484632..169b81b3 100644 --- a/Pilot3/P3B2/p3b2.py +++ b/Pilot3/P3B2/p3b2.py @@ -35,7 +35,7 @@ ] required = ['train_data', 'rnn_size', 'epochs', 'n_layers', \ - 'learning_rate', 'drop', 'recurrent_dropout', \ + 'learning_rate', 'dropout', 'recurrent_dropout', \ 'temperature','primetext', 'length'] class BenchmarkP3B2(candle.Benchmark): diff --git a/Pilot3/P3B2/p3b2_baseline_keras2.py b/Pilot3/P3B2/p3b2_baseline_keras2.py index 83be99ef..67e90e1e 100644 --- a/Pilot3/P3B2/p3b2_baseline_keras2.py +++ b/Pilot3/P3B2/p3b2_baseline_keras2.py @@ -62,7 +62,7 @@ def run(gParameters): rnn_size = gParameters['rnn_size'] n_layers = gParameters['n_layers'] learning_rate = gParameters['learning_rate'] - dropout = gParameters['drop'] + dropout = gParameters['dropout'] recurrent_dropout = gParameters['recurrent_dropout'] n_epochs = gParameters['epochs'] data_train = data_path+'/data.pkl' diff --git a/Pilot3/P3B2/p3b2_default_model.txt b/Pilot3/P3B2/p3b2_default_model.txt index 73f77487..51089215 100644 --- a/Pilot3/P3B2/p3b2_default_model.txt +++ b/Pilot3/P3B2/p3b2_default_model.txt @@ -6,7 +6,7 @@ rnn_size = 64 epochs = 2 n_layers = 1 learning_rate = 0.01 -drop = 0.0 +dropout = 0.0 recurrent_dropout = 0.0 loss = 'categorical_crossentropy' activation = 'softmax' diff --git a/Pilot3/P3B3/p3b3.py b/Pilot3/P3B3/p3b3.py index 497cf633..6a7589fd 100644 --- a/Pilot3/P3B3/p3b3.py +++ b/Pilot3/P3B3/p3b3.py @@ -14,7 +14,6 @@ import candle -''' additional_definitions = [ {'name':'train_features', 'action':'store', @@ -56,9 +55,26 @@ 'type': str}, {'name':'n_fold', 'action':'store', - 'type':int} + 'type':int}, +{'name':'emb_l2', + 'action':'store', + 'type':float}, +{'name':'w_l2', + 'action':'store', + 'type':float}, +{'name':'wv_len', + 'action':'store', + 'type':int}, +{'name':'filter_sets', + 'nargs':'+', + 'type': int}, +{'name':'filter_sizes', + 'nargs':'+', + 'type': int}, +{'name':'num_filters', + 'nargs':'+', + 'type': int} ] -''' required = [ @@ -79,7 
+95,7 @@ def set_locals(self): if required is not None: self.required = set(required) - # if additional_definitions is not None: - # self.additional_definitions = additional_definitions + if additional_definitions is not None: + self.additional_definitions = additional_definitions diff --git a/Pilot3/P3B3/p3b3_baseline_keras2.py b/Pilot3/P3B3/p3b3_baseline_keras2.py index c8227adb..89407399 100644 --- a/Pilot3/P3B3/p3b3_baseline_keras2.py +++ b/Pilot3/P3B3/p3b3_baseline_keras2.py @@ -13,10 +13,6 @@ from sklearn.metrics import f1_score ''' -import p3b3 as bmk -import candle - - import os, sys, gzip import keras @@ -32,6 +28,9 @@ import argparse +import p3b3 as bmk +import candle + def initialize_parameters(default_model = 'p3b3_default_model.txt'): @@ -39,14 +38,13 @@ def initialize_parameters(default_model = 'p3b3_default_model.txt'): # Build benchmark object p3b3Bmk = bmk.BenchmarkP3B3(bmk.file_path, default_model, 'keras', prog='p3b3_baseline', desc='Multi-task CNN for data extraction from clinical reports - Pilot 3 Benchmark 3') - + # Initialize parameters gParameters = candle.finalize_parameters(p3b3Bmk) #bmk.logger.info('Params: {}'.format(gParameters)) return gParameters - def fetch_data(gParameters): """ Downloads and decompresses the data if not locally available. Since the training data depends on the model definition it is not loaded, @@ -55,7 +53,7 @@ def fetch_data(gParameters): path = gParameters['data_url'] fpath = candle.fetch_file(path + gParameters['train_data'], 'Pilot3', untar=True) - + return fpath @@ -157,7 +155,6 @@ def run(gParameters): test_x = np.load( fpath + '/test_X.npy' ) test_y = np.load( fpath + '/test_Y.npy' ) - for task in range( len( train_y[ 0, : ] ) ): cat = np.unique( train_y[ :, task ] ) train_y[ :, task ] = [ np.where( cat == x )[ 0 ][ 0 ] for x in train_y[ :, task ] ] diff --git a/Pilot3/P3B4/p3b4.py b/Pilot3/P3B4/p3b4.py index 8098104d..ba502f77 100644 --- a/Pilot3/P3B4/p3b4.py +++ b/Pilot3/P3B4/p3b4.py @@ -14,6 +14,18 @@ import candle +additional_definitions=[ +{'name':'attention_size', + 'action':'store', + 'type':int}, +{'name':'embed_train', + 'action':'store', + 'type':candle.str2bool}, +{'name':'wv_len', + 'action':'store', + 'type':int} +] + required = [ 'learning_rate', 'batch_size', 'epochs', 'dropout', \ 'optimizer', 'wv_len', \ @@ -32,8 +44,8 @@ def set_locals(self): if required is not None: self.required = set(required) - # if additional_definitions is not None: - # self.additional_definitions = additional_definitions + if additional_definitions is not None: + self.additional_definitions = additional_definitions From 9d797e244fe97d8b12c83df5bc221ebd01b4044b Mon Sep 17 00:00:00 2001 From: Jamal Date: Fri, 10 Apr 2020 13:22:10 -0600 Subject: [PATCH 237/331] Fixed undefined keywords in P3B5 --- Pilot3/P3B5/p3b5.py | 38 ++++++++++++++++++++++------ Pilot3/P3B5/p3b5_baseline_pytorch.py | 2 +- Pilot3/P3B5/p3b5_default_model.txt | 4 +-- 3 files changed, 33 insertions(+), 11 deletions(-) diff --git a/Pilot3/P3B5/p3b5.py b/Pilot3/P3B5/p3b5.py index 4740638f..d7c41377 100644 --- a/Pilot3/P3B5/p3b5.py +++ b/Pilot3/P3B5/p3b5.py @@ -7,16 +7,34 @@ import candle -REQUIRED = [ - 'learning_rate', +additional_definitions = [ +{'name':'learning_rate_min', + 'action':'store', + 'type':float}, +{'name':'log_interval', + 'action':'store', + 'type':int}, +{'name':'weight_decay', + 'action':'store', + 'type':float}, +{'name':'grad_clip', + 'action':'store', + 'type':int}, +{'name':'unrolled', + 'action':'store', + 'type':candle.str2bool}, +] + +required = [ + 
'learning_rate', 'learning_rate_min', 'momentum', 'weight_decay', 'grad_clip', - 'seed', + 'rng_seed', 'unrolled', - 'batch_size', - 'epochs', + 'batch_size', + 'epochs', ] @@ -26,8 +44,12 @@ class BenchmarkP3B5(candle.Benchmark): def set_locals(self): """ Set parameters for the benchmark. - Args: + Args: required: set of required parameters for the benchmark. + additional_definitions: list of dictionaries describing the additional parameters for the + benchmark. """ - if REQUIRED is not None: - self.required = set(REQUIRED) + if required is not None: + self.required = set(required) + if additional_definitions is not None: + self.additional_definitions = additional_definitions diff --git a/Pilot3/P3B5/p3b5_baseline_pytorch.py b/Pilot3/P3B5/p3b5_baseline_pytorch.py index 134e769a..9fdb72db 100644 --- a/Pilot3/P3B5/p3b5_baseline_pytorch.py +++ b/Pilot3/P3B5/p3b5_baseline_pytorch.py @@ -89,7 +89,7 @@ def run(params): eta_min=args.learning_rate_min, ) - genotype_store = GenotypeStorage(root=args.savepath) + genotype_store = GenotypeStorage(root=args.save_path) min_loss = 9999 for epoch in range(args.epochs): diff --git a/Pilot3/P3B5/p3b5_default_model.txt b/Pilot3/P3B5/p3b5_default_model.txt index 786092bb..f2813163 100644 --- a/Pilot3/P3B5/p3b5_default_model.txt +++ b/Pilot3/P3B5/p3b5_default_model.txt @@ -2,7 +2,7 @@ model_name = 'p3b5' unrolled = True data_url = 'ftp://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot3/' -savepath = '.' +save_path = '.' log_interval = 10 train_data = 'P3B3_data.tar.gz' learning_rate = 0.01 @@ -12,4 +12,4 @@ weight_decay = 3e-4 grad_clip = 5 batch_size = 100 epochs = 10 -seed = 13 +rng_seed = 13 From cd7faee3fa4a0aba665a5f46702b2e64a2ba6d0e Mon Sep 17 00:00:00 2001 From: yngtodd Date: Fri, 10 Apr 2020 16:26:29 -0400 Subject: [PATCH 238/331] Update with Uno example Everything is up and running now. Need to work out how to best fetch the data here. --- common/darts/__init__.py | 2 +- common/darts/api/model.py | 15 ++++++++- common/darts/architecture.py | 8 ++--- examples/darts/uno/default_model.txt | 3 ++ examples/darts/uno/uno_example.py | 48 ++++++++++++++-------------- 5 files changed, 46 insertions(+), 30 deletions(-) diff --git a/common/darts/__init__.py b/common/darts/__init__.py index e6e43756..7ca69f9f 100644 --- a/common/darts/__init__.py +++ b/common/darts/__init__.py @@ -20,7 +20,7 @@ from .utils.random import SeedControl from .functional import ( - multitask_loss, multitask_loss, multitask_accuracy + multitask_loss, multitask_accuracy, multitask_accuracy_topk ) __all__ = [ diff --git a/common/darts/api/model.py b/common/darts/api/model.py index b1b701c1..4663f75d 100644 --- a/common/darts/api/model.py +++ b/common/darts/api/model.py @@ -6,7 +6,7 @@ class Model(nn.Module): - """ Abstract class for Pytorch models """ + """ Class representing sampleable neural network model """ def num_params(self): """ Get the number of model parameters. """ @@ -33,3 +33,16 @@ def hashsummary(self): result.extend(hashlib.sha256(x.detach().cpu().numpy().tobytes()).hexdigest() for x in child.parameters()) return result + + def loss(self, x_data, y_true, reduce='mean'): + """ Forward propagate network and return a value of loss function """ + # TODO: This may need to be moved to the model. 
+ if reduce not in (None, 'sum', 'mean'): + raise ValueError("`reduce` must be either None, `sum`, or `mean`!") + + y_pred = self(x_data) + return y_pred, self.loss_value(x_data, y_true, y_pred, reduce=reduce) + + def loss_value(self, x_data, y_true, y_pred, reduce=None): + """ Calculate a value of loss function """ + raise NotImplementedError diff --git a/common/darts/architecture.py b/common/darts/architecture.py index 57c89c1d..235dc057 100644 --- a/common/darts/architecture.py +++ b/common/darts/architecture.py @@ -27,18 +27,18 @@ def __init__(self, model, args, hyperparams=Hyperparameters(), device='cpu'): def comp_unrolled_model(self, data, target, eta, optimizer): """ Loss on train set and then update w_pi, not-in-place - + Parameters ---------- data : torch.tensor - + target : torch.tensor eta : float optimizer : torch.optim.optimizer optimizer of theta, not optimizer of alpha - + Returns ------- model_unrolled @@ -98,7 +98,7 @@ def backward_step(self, x_valid, target_valid): :param target_valid: :return: """ - loss = self.model.loss(x_valid, target_valid) + _, loss = self.model.loss(x_valid, target_valid, reduce='mean') # both alpha and theta require grad but only alpha optimizer will # step in current phase. loss.backward() diff --git a/examples/darts/uno/default_model.txt b/examples/darts/uno/default_model.txt index 2332dc39..1e8badd6 100644 --- a/examples/darts/uno/default_model.txt +++ b/examples/darts/uno/default_model.txt @@ -13,3 +13,6 @@ grad_clip = 5 batch_size = 100 epochs = 10 seed = 13 +lr = 0.025 +lr_min = 0.001 + diff --git a/examples/darts/uno/uno_example.py b/examples/darts/uno/uno_example.py index 189eda92..0d960c4e 100644 --- a/examples/darts/uno/uno_example.py +++ b/examples/darts/uno/uno_example.py @@ -2,6 +2,7 @@ import torch.nn as nn from torch import optim from torch.utils.data import DataLoader +from loguru import logger import example_setup as bmk import darts @@ -62,7 +63,7 @@ def run(params): } model = darts.LinearNetwork( - tasks=tasks, criterion=criterion, device=device + input_dim=942, tasks=tasks, criterion=criterion, device=device ).to(device) architecture = darts.Architecture(model, args, device=device) @@ -71,7 +72,7 @@ def run(params): model.parameters(), args.lr, momentum=args.momentum, - weight_decay=args.wd + weight_decay=args.weight_decay ) scheduler = optim.lr_scheduler.CosineAnnealingLR( @@ -109,7 +110,6 @@ def run(params): def train(trainloader, - validloader, model, architecture, criterion, @@ -127,12 +127,12 @@ def train(trainloader, batch_size = data.size(0) model.train() - data = to_device(data, device) - target = to_device(target, device) + data = darts.to_device(data, device) + target = darts.to_device(target, device) x_search, target_search = next(valid_iter) - x_search = to_device(x_search, device) - target_search = to_device(target_search, device) + x_search = darts.to_device(x_search, device) + target_search = darts.to_device(target_search, device) # 1. update alpha architecture.step( @@ -146,7 +146,7 @@ def train(trainloader, ) logits = model(data) - loss = multitask_loss(target, logits, criterion, reduce='mean') + loss = darts.multitask_loss(target, logits, criterion, reduce='mean') # 2. 
update weight optimizer.zero_grad() @@ -154,39 +154,39 @@ def train(trainloader, nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip) optimizer.step() - prec1 = multitask_accuracy_topk(logits, target, topk=(1,)) - meters.update_batch_loss(loss.item(), batch_size) - meters.update_batch_accuracy(prec1, batch_size) + prec1 = darts.multitask_accuracy_topk(logits, target, topk=(1,)) + meter.update_batch_loss(loss.item(), batch_size) + meter.update_batch_accuracy(prec1, batch_size) if step % args.log_interval == 0: - logger.info(f'Step: {step} loss: {meters.loss_meter.avg:.4}') + logger.info(f'Step: {step} loss: {meter.loss_meter.avg:.4}') - meters.update_epoch() - meters.save(args.savepath) + meter.update_epoch() + meter.save(args.savepath) -def validate(validloader, model, criterion, args, tasks, meters, device): +def validate(validloader, model, criterion, args, tasks, meter, device): model.eval() with torch.no_grad(): for step, (data, target) in enumerate(validloader): - data = to_device(data, device) - target = to_device(target, device) + data = darts.to_device(data, device) + target = darts.to_device(target, device) batch_size = data.size(0) logits = model(data) - loss = multitask_loss(target, logits, criterion, reduce='mean') + loss = darts.multitask_loss(target, logits, criterion, reduce='mean') - prec1 = multitask_accuracy_topk(logits, target, topk=(1,)) - meters.update_batch_loss(loss.item(), batch_size) - meters.update_batch_accuracy(prec1, batch_size) + prec1 = darts.multitask_accuracy_topk(logits, target, topk=(1,)) + meter.update_batch_loss(loss.item(), batch_size) + meter.update_batch_accuracy(prec1, batch_size) if step % args.log_interval == 0: - logger.info(f'>> Validation: {step} loss: {meters.loss_meter.avg:.4}') + logger.info(f'>> Validation: {step} loss: {meter.loss_meter.avg:.4}') - meters.update_epoch() - meters.save(args.savepath) + meter.update_epoch() + meter.save(args.savepath) def main(): From 3f8826f19b3ab058b184de1274d032806d1c1daa Mon Sep 17 00:00:00 2001 From: yngtodd Date: Fri, 10 Apr 2020 21:07:09 -0400 Subject: [PATCH 239/331] Switch to epoch meter This is easier to keep track of state that having four instances of meters running around. 
--- Pilot3/P3B5/p3b5_baseline_pytorch.py | 12 +++++--- Pilot3/P3B5/p3b5_darts.py | 41 ++++++++++++++-------------- common/darts/__init__.py | 2 ++ 3 files changed, 30 insertions(+), 25 deletions(-) diff --git a/Pilot3/P3B5/p3b5_baseline_pytorch.py b/Pilot3/P3B5/p3b5_baseline_pytorch.py index 0f5e33f8..a0ee7c72 100644 --- a/Pilot3/P3B5/p3b5_baseline_pytorch.py +++ b/Pilot3/P3B5/p3b5_baseline_pytorch.py @@ -65,6 +65,9 @@ def run(params): 'grade': 3, } + train_meter = darts.EpochMeter(tasks, 'train') + valid_meter = darts.EpochMeter(tasks, 'valid') + model = darts.ConvNetwork(tasks=tasks, criterion=criterion, device=device).to(device) architecture = darts.Architecture(model, args, device=device) @@ -104,19 +107,20 @@ def run(params): lr, args, tasks, - device + device, + train_meter ) # validation - valid_acc, valid_loss = infer(validloader, model, criterion, args, tasks, device) + valid_acc, valid_loss = infer(validloader, model, criterion, args, tasks, device, valid_meter) if valid_loss < min_loss: genotype_store.save_genotype(genotype) min_loss = valid_loss print(f'\nEpoch {epoch} stats:') - darts.log_accuracy(train_acc, 'train') - darts.log_accuracy(valid_acc, 'valid') + # darts.log_accuracy(train_acc, 'train') + # darts.log_accuracy(valid_acc, 'valid') def main(): diff --git a/Pilot3/P3B5/p3b5_darts.py b/Pilot3/P3B5/p3b5_darts.py index 567bfd02..63a9e3ce 100644 --- a/Pilot3/P3B5/p3b5_darts.py +++ b/Pilot3/P3B5/p3b5_darts.py @@ -21,9 +21,7 @@ import darts -def train(trainloader, validloader, model, architecture, criterion, optimizer, lr, args, tasks, device): - losses = darts.AverageMeter('LossMeter') - top1 = darts.MultitaskAccuracyMeter(tasks) +def train(trainloader, validloader, model, architecture, criterion, optimizer, lr, args, tasks, device, meter): valid_iter = iter(trainloader) @@ -45,12 +43,12 @@ def train(trainloader, validloader, model, architecture, criterion, optimizer, l # 1. 
update alpha architecture.step( - data, - target, - x_search, - target_search, - lr, - optimizer, + data, + target, + x_search, + target_search, + lr, + optimizer, unrolled=args.unrolled ) @@ -63,21 +61,20 @@ def train(trainloader, validloader, model, architecture, criterion, optimizer, l nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip) optimizer.step() - prec1 = darts.multitask_accuracy(target, logits) - losses.update(loss.item(), batch_size) - top1.update(prec1, batch_size) + prec1 = darts.multitask_accuracy_topk(logits, target) + meter.update_batch_loss(loss.item(), batch_size) + meter.update_batch_accuracy(prec1, batch_size) if step % args.log_interval == 0: print(f'Step: {step} loss: {losses.avg:.4}') - darts.log_accuracy(top1) + #darts.log_accuracy(top1) + meter.update_epoch() + meter.save(args.savepath) return top1, losses.avg -def infer(validloader, model, criterion, args, tasks, device): - losses = darts.AverageMeter('LossMeter') - top1 = darts.MultitaskAccuracyMeter(tasks) - +def infer(validloader, model, criterion, args, tasks, device, meter): model.eval() with torch.no_grad(): @@ -92,14 +89,16 @@ def infer(validloader, model, criterion, args, tasks, device): logits = model(data) loss = darts.multitask_loss(target, logits, criterion, reduce='mean') - prec1 = darts.multitask_accuracy(target, logits) - losses.update(loss.item(), batch_size) - top1.update(prec1, batch_size) + prec1 = darts.multitask_accuracy_topk(logits, target) + meter.update_batch_loss(loss.item(), batch_size) + meter.update_batch_accuracy(prec1, batch_size) if step % args.log_interval == 0: print(f'>> Validation: {step} loss: {losses.avg:.4}') - darts.log_accuracy(top1, 'valid') + #darts.log_accuracy(top1, 'valid') + meter.update_epoch() + meter.save(args.savepath) return top1, losses.avg diff --git a/common/darts/__init__.py b/common/darts/__init__.py index 7ca69f9f..839e8d70 100644 --- a/common/darts/__init__.py +++ b/common/darts/__init__.py @@ -15,6 +15,8 @@ from .datasets.uno import Uno from .datasets.sample import sample from .api.config import banner +from .meters.average import AverageMeter +from .meters.accuracy import MultitaskAccuracyMeter from .meters.epoch import EpochMeter from .utils.tensor import to_device from .utils.random import SeedControl From 2076631426bffc56e039bcf1ad40acf01304452c Mon Sep 17 00:00:00 2001 From: yngtodd Date: Sat, 11 Apr 2020 00:39:57 -0400 Subject: [PATCH 240/331] Add random dataset In case people want to try DARTS with random data. --- common/darts/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/common/darts/__init__.py b/common/darts/__init__.py index 839e8d70..a5e14604 100644 --- a/common/darts/__init__.py +++ b/common/darts/__init__.py @@ -13,6 +13,7 @@ # Utilities that are not neccessary from .datasets.p3b3 import P3B3 from .datasets.uno import Uno +from .datasets.random RandomData from .datasets.sample import sample from .api.config import banner from .meters.average import AverageMeter From 4a000e4e6d527d62d57e06ff4441adef89c83551 Mon Sep 17 00:00:00 2001 From: yngtodd Date: Sat, 11 Apr 2020 00:41:53 -0400 Subject: [PATCH 241/331] Begin advanced example This is the start of an advanced DARTS example, where the user specifies the stem network and all of the composable operations for DARTS, rather than relying on the given options within DARTS. 
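
The random dataset from the previous patch pairs naturally with this kind of experimentation.
A smoke-test sketch, assuming the package import reads `from .datasets.random import RandomData`;
the constructor arguments mirror the advanced example below, everything else is made up:

    # Assumed quick check of DARTS components against random data.
    from torch.utils.data import DataLoader
    from darts.datasets.random import RandomData

    tasks = {'task0': 5, 'task1': 2}
    data = RandomData(x_dim=250, num_samples=50, tasks=tasks)
    loader = DataLoader(data, batch_size=10)

    x, y = next(iter(loader))
    print(x.shape)   # random feature batch, expected (10, 250)
    print(y)         # per-task labels, one entry per key in `tasks`
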
--- .../Untitled-checkpoint.ipynb | 319 ++++++++++++++++++ examples/darts/advanced/README.rst | 3 + examples/darts/advanced/Untitled.ipynb | 319 ++++++++++++++++++ examples/darts/advanced/default_model.txt | 18 + examples/darts/advanced/example.py | 201 +++++++++++ examples/darts/advanced/example_setup.py | 37 ++ examples/darts/advanced/operations.py | 61 ++++ examples/darts/advanced/results/.gitkeep | 0 examples/darts/uno/.gitignore | 1 + 9 files changed, 959 insertions(+) create mode 100644 examples/darts/advanced/.ipynb_checkpoints/Untitled-checkpoint.ipynb create mode 100644 examples/darts/advanced/README.rst create mode 100644 examples/darts/advanced/Untitled.ipynb create mode 100644 examples/darts/advanced/default_model.txt create mode 100644 examples/darts/advanced/example.py create mode 100644 examples/darts/advanced/example_setup.py create mode 100644 examples/darts/advanced/operations.py create mode 100644 examples/darts/advanced/results/.gitkeep create mode 100644 examples/darts/uno/.gitignore diff --git a/examples/darts/advanced/.ipynb_checkpoints/Untitled-checkpoint.ipynb b/examples/darts/advanced/.ipynb_checkpoints/Untitled-checkpoint.ipynb new file mode 100644 index 00000000..610f5ab7 --- /dev/null +++ b/examples/darts/advanced/.ipynb_checkpoints/Untitled-checkpoint.ipynb @@ -0,0 +1,319 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "import torch.nn as nn\n", + "import torch.nn.functional as F\n", + "from torch.utils.data import DataLoader\n", + "from torchvision import datasets\n", + "from torchvision import transforms\n", + "from operations import OPS" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "train = datasets.MNIST('./data', train=True, transform=transforms.ToTensor(), download=True)\n", + "valid = datasets.MNIST('./data', train=False)\n", + "\n", + "train_loader = DataLoader(train, batch_size=10)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "x, y = next(iter(train_loader))" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "torch.Size([10, 1, 28, 28])" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "x.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "mlp = OPS['mlp'](1, 100, False)\n", + "conv = OPS['conv'](1, 32, True)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "ConvBlock(\n", + " (op): Sequential(\n", + " (0): Conv2d(1, 1, kernel_size=(3, 3), stride=(100, 100), padding=(1, 1), bias=False)\n", + " (1): BatchNorm1d(1, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " )\n", + ")" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "conv" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "dataset = RandomData(100, 2, {'task': 2})" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "torch.Size([10, 1, 28, 28])" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "x.shape" + ] + }, 
+ { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "train_loader = torch.utils.data.DataLoader(\n", + " datasets.MNIST('../data', train=True, download=True,\n", + " transform=transforms.Compose([\n", + " transforms.ToTensor(),\n", + " transforms.Normalize((0.1307,), (0.3081,))\n", + " ])),\n", + " batch_size=10, shuffle=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "x, y = next(iter(train_loader))" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "torch.Size([10, 1, 28, 28])" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "x.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [], + "source": [ + "class StemNet(nn.Module):\n", + " \"\"\" Network stem\n", + "\n", + " This will always be the beginning of the network.\n", + " DARTS will only recompose modules after the stem.\n", + " For this reason, we define this separate from the\n", + " other modules in the network.\n", + "\n", + " Args:\n", + " input_dim: the input dimension for your data\n", + "\n", + " cell_dim: the intermediate dimension size for\n", + " the remaining modules of the network.\n", + " \"\"\"\n", + " def __init__(self, in_channels: int=1, cell_dim: int=100, kernel_size=3):\n", + " super(StemNet, self).__init__()\n", + " self.stem = nn.Conv2d(in_channels, cell_dim, kernel_size)\n", + "\n", + " def forward(self, x):\n", + " return self.stem(x)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [], + "source": [ + "class ConvBlock(nn.Module):\n", + " \"\"\" ReLu -> Conv1d -> BatchNorm \"\"\"\n", + "\n", + " def __init__(self, c_in, c_out, kernel_size, stride, affine=True):\n", + " super(ConvBlock, self).__init__()\n", + " self.conv = nn.Conv2d(c_in, c_out, kernel_size=kernel_size, stride=stride)\n", + "\n", + " def forward(self, x):\n", + " return self.conv(x)" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [], + "source": [ + "stem = StemNet()" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [], + "source": [ + "OPS = {\n", + " 'conv' : lambda c, stride, affine: ConvBlock(c, c, 3, stride, affine=affine),\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [], + "source": [ + "conv = OPS['conv'](100, 32, True)" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": {}, + "outputs": [], + "source": [ + "out = stem(x)" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "torch.Size([10, 100, 26, 26])" + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "out.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "torch.Size([10, 100, 1, 1])" + ] + }, + "execution_count": 66, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "conv(out).shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + 
"language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.7" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/darts/advanced/README.rst b/examples/darts/advanced/README.rst new file mode 100644 index 00000000..8ee51b54 --- /dev/null +++ b/examples/darts/advanced/README.rst @@ -0,0 +1,3 @@ +========= +DARTS UNO +========= diff --git a/examples/darts/advanced/Untitled.ipynb b/examples/darts/advanced/Untitled.ipynb new file mode 100644 index 00000000..610f5ab7 --- /dev/null +++ b/examples/darts/advanced/Untitled.ipynb @@ -0,0 +1,319 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "import torch.nn as nn\n", + "import torch.nn.functional as F\n", + "from torch.utils.data import DataLoader\n", + "from torchvision import datasets\n", + "from torchvision import transforms\n", + "from operations import OPS" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "train = datasets.MNIST('./data', train=True, transform=transforms.ToTensor(), download=True)\n", + "valid = datasets.MNIST('./data', train=False)\n", + "\n", + "train_loader = DataLoader(train, batch_size=10)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "x, y = next(iter(train_loader))" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "torch.Size([10, 1, 28, 28])" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "x.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "mlp = OPS['mlp'](1, 100, False)\n", + "conv = OPS['conv'](1, 32, True)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "ConvBlock(\n", + " (op): Sequential(\n", + " (0): Conv2d(1, 1, kernel_size=(3, 3), stride=(100, 100), padding=(1, 1), bias=False)\n", + " (1): BatchNorm1d(1, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " )\n", + ")" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "conv" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "dataset = RandomData(100, 2, {'task': 2})" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "torch.Size([10, 1, 28, 28])" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "x.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "train_loader = torch.utils.data.DataLoader(\n", + " datasets.MNIST('../data', train=True, download=True,\n", + " transform=transforms.Compose([\n", + " transforms.ToTensor(),\n", + " transforms.Normalize((0.1307,), (0.3081,))\n", + " ])),\n", + " batch_size=10, shuffle=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "x, y = next(iter(train_loader))" + ] + }, + { + "cell_type": "code", + 
"execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "torch.Size([10, 1, 28, 28])" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "x.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [], + "source": [ + "class StemNet(nn.Module):\n", + " \"\"\" Network stem\n", + "\n", + " This will always be the beginning of the network.\n", + " DARTS will only recompose modules after the stem.\n", + " For this reason, we define this separate from the\n", + " other modules in the network.\n", + "\n", + " Args:\n", + " input_dim: the input dimension for your data\n", + "\n", + " cell_dim: the intermediate dimension size for\n", + " the remaining modules of the network.\n", + " \"\"\"\n", + " def __init__(self, in_channels: int=1, cell_dim: int=100, kernel_size=3):\n", + " super(StemNet, self).__init__()\n", + " self.stem = nn.Conv2d(in_channels, cell_dim, kernel_size)\n", + "\n", + " def forward(self, x):\n", + " return self.stem(x)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [], + "source": [ + "class ConvBlock(nn.Module):\n", + " \"\"\" ReLu -> Conv1d -> BatchNorm \"\"\"\n", + "\n", + " def __init__(self, c_in, c_out, kernel_size, stride, affine=True):\n", + " super(ConvBlock, self).__init__()\n", + " self.conv = nn.Conv2d(c_in, c_out, kernel_size=kernel_size, stride=stride)\n", + "\n", + " def forward(self, x):\n", + " return self.conv(x)" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [], + "source": [ + "stem = StemNet()" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [], + "source": [ + "OPS = {\n", + " 'conv' : lambda c, stride, affine: ConvBlock(c, c, 3, stride, affine=affine),\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [], + "source": [ + "conv = OPS['conv'](100, 32, True)" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": {}, + "outputs": [], + "source": [ + "out = stem(x)" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "torch.Size([10, 100, 26, 26])" + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "out.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "torch.Size([10, 100, 1, 1])" + ] + }, + "execution_count": 66, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "conv(out).shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.7" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/darts/advanced/default_model.txt b/examples/darts/advanced/default_model.txt new file mode 100644 index 00000000..1e8badd6 --- /dev/null +++ b/examples/darts/advanced/default_model.txt @@ -0,0 +1,18 @@ +[Global_Params] +model_name = 'darts_uno' 
+unrolled = False +data_url = 'ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/' +savepath = '.' +log_interval = 10 +train_data = 'top_21_auc_1fold.uno.h5' +learning_rate = 0.01 +learning_rate_min = 0.001 +momentum = 0.9 +weight_decay = 3e-4 +grad_clip = 5 +batch_size = 100 +epochs = 10 +seed = 13 +lr = 0.025 +lr_min = 0.001 + diff --git a/examples/darts/advanced/example.py b/examples/darts/advanced/example.py new file mode 100644 index 00000000..bf3bb45b --- /dev/null +++ b/examples/darts/advanced/example.py @@ -0,0 +1,201 @@ +import torch +import torch.nn as nn +from torch import optim +from torch.utils.data import DataLoader +from loguru import logger + +import example_setup as bmk +import darts +import candle + + +def initialize_parameters(): + """ Initialize the parameters for the Uno example """ + + uno_example = bmk.UnoExample( + bmk.file_path, + 'default_model.txt', + 'pytorch', + prog='uno_example', + desc='Differentiable Architecture Search - Uno example', + ) + + # Initialize parameters + gParameters = candle.finalize_parameters(uno_example) + return gParameters + + +class StemNet(nn.Module): + """ Network stem + + This will always be the beginning of the network. + DARTS will only recompose modules after the stem. + For this reason, we define this separate from the + other modules in the network. + + Args: + input_dim: the input dimension for your data + cell_dim: the intermediate dimension size for + the remaining modules of the network. + """ + def __init__(self, input_dim: int=250, cell_dim: int=100): + super(StemNet, self).__init__() + self.fc = nn.Linear(input_dim, cell_dim) + + def forward(self, x): + return self.fc(x) + + +def run(params): + args = candle.ArgumentStruct(**params) + + args.cuda = torch.cuda.is_available() + device = torch.device(f"cuda" if args.cuda else "cpu") + darts.banner(device=device) + + tasks = { + 'task0': 5, + 'task1': 2 + } + + train_data = darts.RandomData(x_dim=250, num_samples=50, tasks=tasks) + valid_data = darts.RandomData(x_dim=250, num_samples=50, tasks=tasks) + + trainloader = DataLoader(train_data, batch_size=args.batch_size) + validloader = DataLoader(valid_data, batch_size=args.batch_size) + + criterion = nn.CrossEntropyLoss().to(device) + + model = darts.Network( + ).to(device) + + architecture = darts.Architecture(model, args, device=device) + + optimizer = optim.SGD( + model.parameters(), + args.lr, + momentum=args.momentum, + weight_decay=args.weight_decay + ) + + scheduler = optim.lr_scheduler.CosineAnnealingLR( + optimizer, + float(args.epochs), + eta_min=args.lr_min + ) + + train_meter = darts.EpochMeter(tasks, 'train') + valid_meter = darts.EpochMeter(tasks, 'valid') + + for epoch in range(args.epochs): + + scheduler.step() + lr = scheduler.get_lr()[0] + logger.info(f'\nEpoch: {epoch} lr: {lr}') + + genotype = model.genotype() + logger.info(f'Genotype: {genotype}\n') + + train( + trainloader, + model, + architecture, + criterion, + optimizer, + lr, + args, + tasks, + train_meter, + device + ) + + validate(validloader, model, criterion, args, tasks, valid_meter, device) + + +def train(trainloader, + model, + architecture, + criterion, + optimizer, + lr, + args, + tasks, + meter, + device): + + valid_iter = iter(trainloader) + + for step, (data, target) in enumerate(trainloader): + + batch_size = data.size(0) + model.train() + + data = darts.to_device(data, device) + target = darts.to_device(target, device) + + x_search, target_search = next(valid_iter) + x_search = darts.to_device(x_search, device) + target_search = 
darts.to_device(target_search, device) + + # 1. update alpha + architecture.step( + data, + target, + x_search, + target_search, + lr, + optimizer, + unrolled=args.unrolled + ) + + logits = model(data) + loss = darts.multitask_loss(target, logits, criterion, reduce='mean') + + # 2. update weight + optimizer.zero_grad() + loss.backward() + nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip) + optimizer.step() + + prec1 = darts.multitask_accuracy_topk(logits, target, topk=(1,)) + meter.update_batch_loss(loss.item(), batch_size) + meter.update_batch_accuracy(prec1, batch_size) + + if step % args.log_interval == 0: + logger.info(f'Step: {step} loss: {meter.loss_meter.avg:.4}') + + meter.update_epoch() + meter.save(args.savepath) + + +def validate(validloader, model, criterion, args, tasks, meter, device): + model.eval() + with torch.no_grad(): + for step, (data, target) in enumerate(validloader): + + data = darts.to_device(data, device) + target = darts.to_device(target, device) + + batch_size = data.size(0) + + logits = model(data) + loss = darts.multitask_loss(target, logits, criterion, reduce='mean') + + prec1 = darts.multitask_accuracy_topk(logits, target, topk=(1,)) + meter.update_batch_loss(loss.item(), batch_size) + meter.update_batch_accuracy(prec1, batch_size) + + if step % args.log_interval == 0: + logger.info(f'>> Validation: {step} loss: {meter.loss_meter.avg:.4}') + + meter.update_epoch() + meter.save(args.savepath) + + +def main(): + params = initialize_parameters() + run(params) + + +if __name__=='__main__': + main() diff --git a/examples/darts/advanced/example_setup.py b/examples/darts/advanced/example_setup.py new file mode 100644 index 00000000..7d634b08 --- /dev/null +++ b/examples/darts/advanced/example_setup.py @@ -0,0 +1,37 @@ +import os +import sys + + +file_path = os.path.dirname(os.path.realpath(__file__)) +lib_path2 = os.path.abspath(os.path.join(file_path, '..', '..', '..', 'common')) +sys.path.append(lib_path2) + + +import candle + + +REQUIRED = [ + 'learning_rate', + 'learning_rate_min', + 'momentum', + 'weight_decay', + 'grad_clip', + 'seed', + 'unrolled', + 'batch_size', + 'epochs', +] + + +class UnoExample(candle.Benchmark): + """ Example for Uno """ + + def set_locals(self): + """ Set parameters for the benchmark. + + Args: + required: set of required parameters for the benchmark. + """ + if REQUIRED is not None: + self.required = set(REQUIRED) + diff --git a/examples/darts/advanced/operations.py b/examples/darts/advanced/operations.py new file mode 100644 index 00000000..c0c196e9 --- /dev/null +++ b/examples/darts/advanced/operations.py @@ -0,0 +1,61 @@ +import torch.nn as nn +import torch.nn.functional as F + +""" DARTS operations contstructor """ +OPS = { + 'mlp' : lambda c, stride, affine: MLP(c, c), + 'conv' : lambda c, stride, affine: ConvBlock(c, c, 3, stride, 1, affine=affine), +} + + +class StemNet(nn.Module): + """ Network stem + + This will always be the beginning of the network. + DARTS will only recompose modules after the stem. + For this reason, we define this separate from the + other modules in the network. + + Args: + input_dim: the input dimension for your data + + cell_dim: the intermediate dimension size for + the remaining modules of the network. 
+ """ + def __init__(self, input_dim: int=250, cell_dim: int=100): + super(StemNet, self).__init__() + self.fc = nn.Linear(input_dim, cell_dim) + + def forward(self, x): + return self.fc(x) + + +class MLP(nn.Module): + """ Multi-layer perceptron """ + + def __init__(self, cell_dim, hidden_dim): + super(MLP, self).__init__() + self.fc1 = nn.Linear(cell_dim, hidden_dim) + self.fc2 = nn.Linear(hidden_dim, cell_dim) + + def forward(self, x): + return self.fc2(self.fc1(F.relu(x))) + + +class ConvBlock(nn.Module): + """ ReLu -> Conv1d -> BatchNorm """ + + def __init__(self, c_in, c_out, kernel_size, + stride, padding, affine=True): + super(ConvBlock, self).__init__() + + self.op = nn.Sequential( + #nn.ReLU(inplace=False), + nn.Conv2d( + c_in, c_out, kernel_size, + stride=stride, padding=padding, bias=False), + nn.BatchNorm1d(c_out, affine=affine) + ) + + def forward(self, x): + return self.op(x) diff --git a/examples/darts/advanced/results/.gitkeep b/examples/darts/advanced/results/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/examples/darts/uno/.gitignore b/examples/darts/uno/.gitignore new file mode 100644 index 00000000..8d5ef26b --- /dev/null +++ b/examples/darts/uno/.gitignore @@ -0,0 +1 @@ +ftp.mcs.anl.gov/ From f9cad8530eb72b6afcba4b3132c301780b24c5eb Mon Sep 17 00:00:00 2001 From: yngtodd Date: Sat, 11 Apr 2020 02:27:43 -0400 Subject: [PATCH 242/331] Use mnist and 2D convolutions Updating the example to make things a bit more real looking. --- .../Untitled-checkpoint.ipynb | 92 +++++++++++++++++-- examples/darts/advanced/Untitled.ipynb | 92 +++++++++++++++++-- examples/darts/advanced/example.py | 56 ++++++----- examples/darts/advanced/operations.py | 66 ++++++++----- 4 files changed, 235 insertions(+), 71 deletions(-) diff --git a/examples/darts/advanced/.ipynb_checkpoints/Untitled-checkpoint.ipynb b/examples/darts/advanced/.ipynb_checkpoints/Untitled-checkpoint.ipynb index 610f5ab7..dc823be3 100644 --- a/examples/darts/advanced/.ipynb_checkpoints/Untitled-checkpoint.ipynb +++ b/examples/darts/advanced/.ipynb_checkpoints/Untitled-checkpoint.ipynb @@ -211,7 +211,49 @@ }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 100, + "metadata": {}, + "outputs": [], + "source": [ + "class DilConv(nn.Module):\n", + " \"\"\" ReLU Dilated Convolution \"\"\"\n", + "\n", + " def __init__(self, c_in, c_out, kernel_size, \n", + " stride, padding, dilation, affine=True):\n", + " super(DilConv, self).__init__()\n", + "\n", + " self.op = nn.Sequential(\n", + " nn.ReLU(inplace=False),\n", + "\n", + " nn.Conv2d(\n", + " c_in,\n", + " c_in,\n", + " kernel_size=kernel_size,\n", + " stride=stride,\n", + " padding=padding,\n", + " dilation=dilation,\n", + " groups=c_in,\n", + " bias=False\n", + " ),\n", + "\n", + " nn.Conv2d(\n", + " c_in,\n", + " c_out,\n", + " kernel_size=1,\n", + " padding=0,\n", + " bias=False\n", + " ),\n", + "\n", + " nn.BatchNorm2d(c_out, affine=affine),\n", + " )\n", + "\n", + " def forward(self, x):\n", + " return self.op(x)" + ] + }, + { + "cell_type": "code", + "execution_count": 101, "metadata": {}, "outputs": [], "source": [ @@ -220,27 +262,37 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": 102, "metadata": {}, "outputs": [], "source": [ "OPS = {\n", + " 'dil_conv': lambda c, stride, affine: DilConv(c, c, 3, stride, 2, 2, affine=affine),\n", " 'conv' : lambda c, stride, affine: ConvBlock(c, c, 3, stride, affine=affine),\n", "}" ] }, { "cell_type": "code", - "execution_count": 65, + "execution_count": 103, + 
"metadata": {}, + "outputs": [], + "source": [ + "conv = OPS['conv'](100, 1, True)" + ] + }, + { + "cell_type": "code", + "execution_count": 104, "metadata": {}, "outputs": [], "source": [ - "conv = OPS['conv'](100, 32, True)" + "dill = OPS['dil_conv'](100, 1, True)" ] }, { "cell_type": "code", - "execution_count": 63, + "execution_count": 105, "metadata": {}, "outputs": [], "source": [ @@ -249,7 +301,7 @@ }, { "cell_type": "code", - "execution_count": 64, + "execution_count": 106, "metadata": {}, "outputs": [ { @@ -258,7 +310,7 @@ "torch.Size([10, 100, 26, 26])" ] }, - "execution_count": 64, + "execution_count": 106, "metadata": {}, "output_type": "execute_result" } @@ -269,16 +321,16 @@ }, { "cell_type": "code", - "execution_count": 66, + "execution_count": 107, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "torch.Size([10, 100, 1, 1])" + "torch.Size([10, 100, 24, 24])" ] }, - "execution_count": 66, + "execution_count": 107, "metadata": {}, "output_type": "execute_result" } @@ -287,6 +339,26 @@ "conv(out).shape" ] }, + { + "cell_type": "code", + "execution_count": 108, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "torch.Size([10, 100, 26, 26])" + ] + }, + "execution_count": 108, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dill(out).shape" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/examples/darts/advanced/Untitled.ipynb b/examples/darts/advanced/Untitled.ipynb index 610f5ab7..dc823be3 100644 --- a/examples/darts/advanced/Untitled.ipynb +++ b/examples/darts/advanced/Untitled.ipynb @@ -211,7 +211,49 @@ }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 100, + "metadata": {}, + "outputs": [], + "source": [ + "class DilConv(nn.Module):\n", + " \"\"\" ReLU Dilated Convolution \"\"\"\n", + "\n", + " def __init__(self, c_in, c_out, kernel_size, \n", + " stride, padding, dilation, affine=True):\n", + " super(DilConv, self).__init__()\n", + "\n", + " self.op = nn.Sequential(\n", + " nn.ReLU(inplace=False),\n", + "\n", + " nn.Conv2d(\n", + " c_in,\n", + " c_in,\n", + " kernel_size=kernel_size,\n", + " stride=stride,\n", + " padding=padding,\n", + " dilation=dilation,\n", + " groups=c_in,\n", + " bias=False\n", + " ),\n", + "\n", + " nn.Conv2d(\n", + " c_in,\n", + " c_out,\n", + " kernel_size=1,\n", + " padding=0,\n", + " bias=False\n", + " ),\n", + "\n", + " nn.BatchNorm2d(c_out, affine=affine),\n", + " )\n", + "\n", + " def forward(self, x):\n", + " return self.op(x)" + ] + }, + { + "cell_type": "code", + "execution_count": 101, "metadata": {}, "outputs": [], "source": [ @@ -220,27 +262,37 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": 102, "metadata": {}, "outputs": [], "source": [ "OPS = {\n", + " 'dil_conv': lambda c, stride, affine: DilConv(c, c, 3, stride, 2, 2, affine=affine),\n", " 'conv' : lambda c, stride, affine: ConvBlock(c, c, 3, stride, affine=affine),\n", "}" ] }, { "cell_type": "code", - "execution_count": 65, + "execution_count": 103, + "metadata": {}, + "outputs": [], + "source": [ + "conv = OPS['conv'](100, 1, True)" + ] + }, + { + "cell_type": "code", + "execution_count": 104, "metadata": {}, "outputs": [], "source": [ - "conv = OPS['conv'](100, 32, True)" + "dill = OPS['dil_conv'](100, 1, True)" ] }, { "cell_type": "code", - "execution_count": 63, + "execution_count": 105, "metadata": {}, "outputs": [], "source": [ @@ -249,7 +301,7 @@ }, { "cell_type": "code", - "execution_count": 64, + "execution_count": 106, "metadata": {}, 
"outputs": [ { @@ -258,7 +310,7 @@ "torch.Size([10, 100, 26, 26])" ] }, - "execution_count": 64, + "execution_count": 106, "metadata": {}, "output_type": "execute_result" } @@ -269,16 +321,16 @@ }, { "cell_type": "code", - "execution_count": 66, + "execution_count": 107, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "torch.Size([10, 100, 1, 1])" + "torch.Size([10, 100, 24, 24])" ] }, - "execution_count": 66, + "execution_count": 107, "metadata": {}, "output_type": "execute_result" } @@ -287,6 +339,26 @@ "conv(out).shape" ] }, + { + "cell_type": "code", + "execution_count": 108, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "torch.Size([10, 100, 26, 26])" + ] + }, + "execution_count": 108, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dill(out).shape" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/examples/darts/advanced/example.py b/examples/darts/advanced/example.py index bf3bb45b..05341f43 100644 --- a/examples/darts/advanced/example.py +++ b/examples/darts/advanced/example.py @@ -8,6 +8,10 @@ import darts import candle +from operations import ( + Stem, OPS, PRIMITIVES +) + def initialize_parameters(): """ Initialize the parameters for the Uno example """ @@ -25,27 +29,6 @@ def initialize_parameters(): return gParameters -class StemNet(nn.Module): - """ Network stem - - This will always be the beginning of the network. - DARTS will only recompose modules after the stem. - For this reason, we define this separate from the - other modules in the network. - - Args: - input_dim: the input dimension for your data - cell_dim: the intermediate dimension size for - the remaining modules of the network. - """ - def __init__(self, input_dim: int=250, cell_dim: int=100): - super(StemNet, self).__init__() - self.fc = nn.Linear(input_dim, cell_dim) - - def forward(self, x): - return self.fc(x) - - def run(params): args = candle.ArgumentStruct(**params) @@ -53,20 +36,35 @@ def run(params): device = torch.device(f"cuda" if args.cuda else "cpu") darts.banner(device=device) + train_loader = torch.utils.data.DataLoader( + datasets.MNIST( + '../data', train=True, download=True, + transform=transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.1307,), (0.3081,)) + ])), + batch_size=args.batch_size, shuffle=True) + + valid_loader = torch.utils.data.DataLoader( + datasets.MNIST( + '../data', train=Fale, download=True, + transform=transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.1307,), (0.3081,)) + ])), + batch_size=args.batch_size, shuffle=True) + tasks = { - 'task0': 5, - 'task1': 2 + 'digits': 10, } - train_data = darts.RandomData(x_dim=250, num_samples=50, tasks=tasks) - valid_data = darts.RandomData(x_dim=250, num_samples=50, tasks=tasks) - - trainloader = DataLoader(train_data, batch_size=args.batch_size) - validloader = DataLoader(valid_data, batch_size=args.batch_size) - criterion = nn.CrossEntropyLoss().to(device) + stem = Stem(cell_dim=100) + model = darts.Network( + stem, cell_dim=100, primitives=PRIMITIVES, ops=OPS, + tasks=tasks, criterion=criterion, device=device ).to(device) architecture = darts.Architecture(model, args, device=device) diff --git a/examples/darts/advanced/operations.py b/examples/darts/advanced/operations.py index c0c196e9..32e9a844 100644 --- a/examples/darts/advanced/operations.py +++ b/examples/darts/advanced/operations.py @@ -1,14 +1,18 @@ import torch.nn as nn import torch.nn.functional as F + """ DARTS operations contstructor """ OPS = { - 
'mlp' : lambda c, stride, affine: MLP(c, c), - 'conv' : lambda c, stride, affine: ConvBlock(c, c, 3, stride, 1, affine=affine), + 'conv_3' : lambda c, stride, affine: ConvBlock(c, c, 3, stride), + 'dil_conv': lambda c, stride, affine: DilConv(c, c, 3, stride, 2, 2, affine=affine) } -class StemNet(nn.Module): +PRIMITIVES = ['conv_3', 'dil_conv'] + + +class Stem(nn.Module): """ Network stem This will always be the beginning of the network. @@ -22,39 +26,57 @@ class StemNet(nn.Module): cell_dim: the intermediate dimension size for the remaining modules of the network. """ - def __init__(self, input_dim: int=250, cell_dim: int=100): - super(StemNet, self).__init__() - self.fc = nn.Linear(input_dim, cell_dim) + def __init__(self, in_channels: int=1, cell_dim: int=100, kernel_size=3): + super(Stem, self).__init__() + self.stem = nn.Conv2d(in_channels, cell_dim, kernel_size) def forward(self, x): - return self.fc(x) + return self.stem(x) -class MLP(nn.Module): - """ Multi-layer perceptron """ +class ConvBlock(nn.Module): + """ ReLu -> Conv1d """ - def __init__(self, cell_dim, hidden_dim): - super(MLP, self).__init__() - self.fc1 = nn.Linear(cell_dim, hidden_dim) - self.fc2 = nn.Linear(hidden_dim, cell_dim) + def __init__(self, c_in, c_out, kernel_size, stride, affine=True): + super(ConvBlock, self).__init__() + self.conv = nn.Conv2d( + c_in, c_out, kernel_size=kernel_size, stride=stride + ) def forward(self, x): - return self.fc2(self.fc1(F.relu(x))) + return self.conv(F.relu(x)) -class ConvBlock(nn.Module): - """ ReLu -> Conv1d -> BatchNorm """ +class DilConv(nn.Module): + """ ReLU Dilated Convolution """ def __init__(self, c_in, c_out, kernel_size, - stride, padding, affine=True): - super(ConvBlock, self).__init__() + stride, padding, dilation, affine=True): + super(DilConv, self).__init__() self.op = nn.Sequential( - #nn.ReLU(inplace=False), + nn.ReLU(inplace=False), + + nn.Conv2d( + c_in, + c_in, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=c_in, + bias=False + ), + nn.Conv2d( - c_in, c_out, kernel_size, - stride=stride, padding=padding, bias=False), - nn.BatchNorm1d(c_out, affine=affine) + c_in, + c_out, + kernel_size=1, + padding=0, + bias=False + ), + + nn.BatchNorm2d(c_out, affine=affine), ) def forward(self, x): From fa26ddee6748d35264a3cdc63d87207bb7151f39 Mon Sep 17 00:00:00 2001 From: yngtodd Date: Sat, 11 Apr 2020 18:32:37 -0400 Subject: [PATCH 243/331] Fix advanced example Still need to reconfigure the classifer. 
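
For reference, a rough way to derive that classifier input size instead of
hard-coding 676 is to push a dummy MNIST-sized batch through the stem and read
off the spatial size. This is only a sketch (not part of this patch), using the
Stem defined in examples/darts/advanced/operations.py and assuming the mixed
layers keep the stem's spatial size, which the padding below is meant to ensure:

    import torch
    from operations import Stem  # examples/darts/advanced/operations.py

    stem = Stem(cell_dim=100)                # Conv2d(1, 100, kernel_size=3)
    out = stem(torch.zeros(1, 1, 28, 28))    # MNIST-shaped input -> [1, 100, 26, 26]
    classifier_dim = out.shape[-2] * out.shape[-1]  # 26 * 26 = 676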
--- common/darts/__init__.py | 4 +- common/darts/modules/__init__.py | 3 + common/darts/modules/cell.py | 14 +- common/darts/modules/mixed_layer.py | 25 +- common/darts/modules/network.py | 38 +- .../Untitled-checkpoint.ipynb | 363 ++++++++++++++---- examples/darts/advanced/Untitled.ipynb | 363 ++++++++++++++---- examples/darts/advanced/example.py | 34 +- examples/darts/advanced/example_setup.py | 4 +- examples/darts/advanced/operations.py | 17 +- 10 files changed, 690 insertions(+), 175 deletions(-) diff --git a/common/darts/__init__.py b/common/darts/__init__.py index a5e14604..02bbbc53 100644 --- a/common/darts/__init__.py +++ b/common/darts/__init__.py @@ -6,6 +6,7 @@ # Essential pieces from .architecture import Architecture +from .modules.network import Network from .modules.conv.network import ConvNetwork from .modules.linear.network import LinearNetwork from .storage.genotype import GenotypeStorage @@ -13,7 +14,7 @@ # Utilities that are not neccessary from .datasets.p3b3 import P3B3 from .datasets.uno import Uno -from .datasets.random RandomData +from .datasets.random import RandomData from .datasets.sample import sample from .api.config import banner from .meters.average import AverageMeter @@ -28,6 +29,7 @@ __all__ = [ "Architecture", + "Network", "ConvNetwork", "LinearNetwork", ] diff --git a/common/darts/modules/__init__.py b/common/darts/modules/__init__.py index e69de29b..e8086a41 100644 --- a/common/darts/modules/__init__.py +++ b/common/darts/modules/__init__.py @@ -0,0 +1,3 @@ +from .cell import Cell +from .mixed_layer import MixedLayer +from .network import Network diff --git a/common/darts/modules/cell.py b/common/darts/modules/cell.py index cafa698f..6ee82d82 100644 --- a/common/darts/modules/cell.py +++ b/common/darts/modules/cell.py @@ -3,7 +3,19 @@ from darts.api import Model from darts.modules.mixed_layer import MixedLayer -from darts.modules.operations.conv import ConvBlock + + +class ConvBlock(Model): + """ ReLu -> Conv2d """ + + def __init__(self, c_in, c_out, kernel_size, stride, padding, affine=True): + super(ConvBlock, self).__init__() + self.conv = nn.Conv2d( + c_in, c_out, kernel_size=kernel_size, stride=stride, padding=padding + ) + + def forward(self, x): + return self.conv(x) class Cell(Model): diff --git a/common/darts/modules/mixed_layer.py b/common/darts/modules/mixed_layer.py index b4e4ca39..10bfd6b2 100644 --- a/common/darts/modules/mixed_layer.py +++ b/common/darts/modules/mixed_layer.py @@ -1,5 +1,7 @@ import torch import torch.nn as nn +import torch.nn.functional as F + from darts.api import Model @@ -13,7 +15,7 @@ def __init__(self, c, stride, primitives, ops): super(MixedLayer, self).__init__() self.reset(c, stride, primitives, ops) - def reset(self, c, stride): + def reset(self, c, stride, primitives, ops): self.layers = nn.ModuleList() for primitive in primitives: @@ -24,15 +26,32 @@ def reset(self, c, stride): self.layers.append(layer) + def pad(self, tensors): + """ Pad with zeros for mixed layers """ + prev = tensors[0] + padded = [] + for tensor in tensors: + if tensor.shape < prev.shape: + tensor_pad = F.pad( + input=tensor, pad=(1, 1, 1, 1), mode='constant', value=0 + ) + padded.append(tensor_pad) + else: + padded.append(tensor) + prev = tensor + + return padded + def forward(self, x, weights): """ Parameters ---------- - x : torch.tensor + x : torch.tensor Data Weights : torch.tensor alpha, [op_num:8], the output = sum of alpha * op(x) """ x = [w * layer(x) for w, layer in zip(weights, self.layers)] - return sum(x) + x = self.pad(x) + 
return sum(x) diff --git a/common/darts/modules/network.py b/common/darts/modules/network.py index 26d4c490..8285f606 100644 --- a/common/darts/modules/network.py +++ b/common/darts/modules/network.py @@ -1,4 +1,4 @@ -from typing import Dict, List +from typing import Dict, List, Callable import torch import torch.nn as nn @@ -23,14 +23,12 @@ class Network(Model): def __init__(self, stem: nn.Module, cell_dim: int, - primitives: List[str], - ops: Dict[], + ops: Dict[str, Callable[[int, int, bool], nn.Module]], tasks: Dict[str, int], criterion, - device: str = 'cpu', + device="cpu", hyperparams=Hyperparameters()): super(Network, self).__init__() - self.primitives = primitives self.ops = ops self.cell_dim = cell_dim self.tasks = tasks @@ -38,12 +36,14 @@ def __init__(self, self.device = device self.num_cells = hyperparams.num_cells self.num_nodes = hyperparams.num_nodes - + self.primitives = list(ops.keys()) self.stem = stem + self.channel_multiplier = hyperparams.channel_multiplier + self.c = hyperparams.c # c_curr means a factor of the output channels of current cell - c_curr = cell_dim * hyperparams.channel_multiplier * hyperparams.c - cpp, cp, c_curr = c_curr, c_curr, self.c + c_curr = cell_dim * self.channel_multiplier * hyperparams.c + cpp, cp, c_curr = c_curr, c_curr, hyperparams.c self.cells = nn.ModuleList() for i in range(hyperparams.num_cells): @@ -52,12 +52,15 @@ def __init__(self, hyperparams.channel_multiplier, cpp, cp, - c_curr + c_curr, + self.primitives, + self.ops ).to(self.device) self.cells += [cell] - self.classifier = MultitaskClassifier(cell_dim, tasks) + # TODO(Todd): Find a way to calculate the output of the ops + self.classifier = MultitaskClassifier(676, tasks) # k is the total number of edges inside single cell, 14 k = sum(1 for i in range(self.num_nodes) for j in range(2 + i)) @@ -86,7 +89,6 @@ def new(self): model = Network( self.stem, self.cell_dim, - self.primitives, self.ops, self.tasks, self.criterion @@ -110,17 +112,13 @@ def forward(self, x): return logits - def loss(self, data, target, reduce='mean'): + def loss_value(self, x_data, y_true, y_pred, reduce='mean'): """ Calculate a value of loss function """ - logits = self(data) - - for task, logit in logits.items(): - logits[task] = logit.to(self.device) + y_pred = self(x_data) losses = {} - for task, label in target.items(): - label = label.to(self.device) - losses[task] = self.criterion(logits[task], label) + for key, value in y_true.items(): + losses[key] = F.nll_loss(F.log_softmax(y_pred[key], dim=1), y_true[key]) if reduce: total = 0 @@ -131,8 +129,6 @@ def loss(self, data, target, reduce='mean'): losses = total / len(losses) elif reduce == "sum": losses = total - else: - raise ValueError('Reduced loss must use either `mean` or `sum`!') return losses diff --git a/examples/darts/advanced/.ipynb_checkpoints/Untitled-checkpoint.ipynb b/examples/darts/advanced/.ipynb_checkpoints/Untitled-checkpoint.ipynb index dc823be3..fdb1165b 100644 --- a/examples/darts/advanced/.ipynb_checkpoints/Untitled-checkpoint.ipynb +++ b/examples/darts/advanced/.ipynb_checkpoints/Untitled-checkpoint.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 20, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -56,70 +56,6 @@ "x.shape" ] }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [], - "source": [ - "mlp = OPS['mlp'](1, 100, False)\n", - "conv = OPS['conv'](1, 32, True)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - 
"outputs": [ - { - "data": { - "text/plain": [ - "ConvBlock(\n", - " (op): Sequential(\n", - " (0): Conv2d(1, 1, kernel_size=(3, 3), stride=(100, 100), padding=(1, 1), bias=False)\n", - " (1): BatchNorm1d(1, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", - " )\n", - ")" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "conv" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "dataset = RandomData(100, 2, {'task': 2})" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "torch.Size([10, 1, 28, 28])" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "x.shape" - ] - }, { "cell_type": "code", "execution_count": 10, @@ -359,6 +295,303 @@ "dill(out).shape" ] }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "torch.Size([100, 1, 26, 26])" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "torch.Size([100, 1, 26, 26])" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [], + "source": [ + "x = torch.randn(torch.Size([100, 1, 26, 26]))\n", + "y = torch.randn(torch.Size([100, 1, 24, 24]))" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "torch.Size([100, 1, 26, 26])" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "x.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "torch.Size([100, 1, 24, 24])" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [], + "source": [ + "new = torch.zeros_like(x)" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [], + "source": [ + "new[:, :, :24, :24] = y" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([[[[ 0.5583, -0.4697, -1.0664, ..., 0.7064, 0.0000, 0.0000],\n", + " [ 0.6258, 0.0675, 0.6688, ..., 0.6203, 0.0000, 0.0000],\n", + " [ 1.1937, 0.2701, 0.2217, ..., 0.0806, 0.0000, 0.0000],\n", + " ...,\n", + " [-0.0659, -0.1802, 0.3372, ..., -0.3461, 0.0000, 0.0000],\n", + " [ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000],\n", + " [ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000]]],\n", + "\n", + "\n", + " [[[-1.2648, -0.5133, -0.4088, ..., -0.4855, 0.0000, 0.0000],\n", + " [-1.3394, 0.4108, 0.6637, ..., 1.4993, 0.0000, 0.0000],\n", + " [ 0.7185, -0.2766, -0.4765, ..., -1.1961, 0.0000, 0.0000],\n", + " ...,\n", + " [-0.1028, -0.8230, -0.9398, ..., 0.7469, 0.0000, 0.0000],\n", + " [ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000],\n", + " [ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000]]],\n", + "\n", + "\n", + " [[[ 0.7054, 0.6086, -1.7438, ..., 0.1894, 0.0000, 0.0000],\n", + " [ 0.8151, 0.1001, 1.1741, ..., -1.1449, 0.0000, 0.0000],\n", + " [-0.5580, 1.6470, 0.7271, ..., -0.4478, 0.0000, 0.0000],\n", + " ...,\n", + " [-0.1456, 0.5271, -0.5290, ..., 1.8301, 0.0000, 0.0000],\n", + " [ 
0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000],\n", + " [ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000]]],\n", + "\n", + "\n", + " ...,\n", + "\n", + "\n", + " [[[ 0.6753, -1.2837, -0.6700, ..., -1.3338, 0.0000, 0.0000],\n", + " [-0.6663, 0.0904, -0.5048, ..., 0.9331, 0.0000, 0.0000],\n", + " [-1.1993, -0.6174, -0.9150, ..., -0.2840, 0.0000, 0.0000],\n", + " ...,\n", + " [-0.4903, 0.0908, 0.9174, ..., 0.5471, 0.0000, 0.0000],\n", + " [ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000],\n", + " [ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000]]],\n", + "\n", + "\n", + " [[[-0.4726, -0.0753, -0.5620, ..., 1.0391, 0.0000, 0.0000],\n", + " [-0.9754, -0.9809, -0.2429, ..., -0.4416, 0.0000, 0.0000],\n", + " [-0.0702, 0.5949, 0.8080, ..., 1.8536, 0.0000, 0.0000],\n", + " ...,\n", + " [-1.6046, 0.5544, 0.0236, ..., 0.3422, 0.0000, 0.0000],\n", + " [ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000],\n", + " [ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000]]],\n", + "\n", + "\n", + " [[[-0.2572, 0.1227, -0.0109, ..., 0.3023, 0.0000, 0.0000],\n", + " [-0.9565, -0.1810, 0.5157, ..., 0.8777, 0.0000, 0.0000],\n", + " [ 0.0078, 2.0418, 2.2871, ..., -0.0903, 0.0000, 0.0000],\n", + " ...,\n", + " [ 0.1430, -0.5325, 0.4984, ..., 1.0231, 0.0000, 0.0000],\n", + " [ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000],\n", + " [ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000]]]])" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "new" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "torch.Size([100, 1, 24, 24])" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "x.shape > y.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "from torch.nn.utils.rnn import pad_sequence" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [], + "source": [ + "result = F.pad(input=y, pad=(1, 1, 1, 1), mode='constant', value=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "torch.Size([100, 1, 26, 26])" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [], + "source": [ + "seq = [x, y, x, y]" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [], + "source": [ + "prev = seq[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [], + "source": [ + "prev = seq[0]\n", + "padded = [prev]\n", + "for tensor in seq:\n", + " if tensor.shape < prev.shape:\n", + " tensor_pad = F.pad(\n", + " input=tensor, pad=(1, 1, 1, 1), mode='constant', value=0\n", + " )\n", + " padded.append(tensor_pad)\n", + " else:\n", + " padded.append(tensor)\n", + " prev = tensor" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "torch.Size([100, 1, 26, 26])" + 
] + }, + "execution_count": 68, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sum(padded).shape" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/examples/darts/advanced/Untitled.ipynb b/examples/darts/advanced/Untitled.ipynb index dc823be3..fdb1165b 100644 --- a/examples/darts/advanced/Untitled.ipynb +++ b/examples/darts/advanced/Untitled.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 20, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -56,70 +56,6 @@ "x.shape" ] }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [], - "source": [ - "mlp = OPS['mlp'](1, 100, False)\n", - "conv = OPS['conv'](1, 32, True)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "ConvBlock(\n", - " (op): Sequential(\n", - " (0): Conv2d(1, 1, kernel_size=(3, 3), stride=(100, 100), padding=(1, 1), bias=False)\n", - " (1): BatchNorm1d(1, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", - " )\n", - ")" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "conv" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "dataset = RandomData(100, 2, {'task': 2})" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "torch.Size([10, 1, 28, 28])" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "x.shape" - ] - }, { "cell_type": "code", "execution_count": 10, @@ -359,6 +295,303 @@ "dill(out).shape" ] }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "torch.Size([100, 1, 26, 26])" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "torch.Size([100, 1, 26, 26])" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [], + "source": [ + "x = torch.randn(torch.Size([100, 1, 26, 26]))\n", + "y = torch.randn(torch.Size([100, 1, 24, 24]))" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "torch.Size([100, 1, 26, 26])" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "x.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "torch.Size([100, 1, 24, 24])" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [], + "source": [ + "new = torch.zeros_like(x)" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [], + "source": [ + "new[:, :, :24, :24] = y" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([[[[ 0.5583, -0.4697, -1.0664, ..., 0.7064, 0.0000, 0.0000],\n", + " [ 0.6258, 0.0675, 0.6688, ..., 0.6203, 0.0000, 0.0000],\n", + " [ 1.1937, 0.2701, 0.2217, ..., 0.0806, 0.0000, 0.0000],\n", + " ...,\n", + " [-0.0659, -0.1802, 0.3372, ..., -0.3461, 0.0000, 0.0000],\n", + " [ 0.0000, 0.0000, 
0.0000, ..., 0.0000, 0.0000, 0.0000],\n", + " [ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000]]],\n", + "\n", + "\n", + " [[[-1.2648, -0.5133, -0.4088, ..., -0.4855, 0.0000, 0.0000],\n", + " [-1.3394, 0.4108, 0.6637, ..., 1.4993, 0.0000, 0.0000],\n", + " [ 0.7185, -0.2766, -0.4765, ..., -1.1961, 0.0000, 0.0000],\n", + " ...,\n", + " [-0.1028, -0.8230, -0.9398, ..., 0.7469, 0.0000, 0.0000],\n", + " [ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000],\n", + " [ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000]]],\n", + "\n", + "\n", + " [[[ 0.7054, 0.6086, -1.7438, ..., 0.1894, 0.0000, 0.0000],\n", + " [ 0.8151, 0.1001, 1.1741, ..., -1.1449, 0.0000, 0.0000],\n", + " [-0.5580, 1.6470, 0.7271, ..., -0.4478, 0.0000, 0.0000],\n", + " ...,\n", + " [-0.1456, 0.5271, -0.5290, ..., 1.8301, 0.0000, 0.0000],\n", + " [ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000],\n", + " [ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000]]],\n", + "\n", + "\n", + " ...,\n", + "\n", + "\n", + " [[[ 0.6753, -1.2837, -0.6700, ..., -1.3338, 0.0000, 0.0000],\n", + " [-0.6663, 0.0904, -0.5048, ..., 0.9331, 0.0000, 0.0000],\n", + " [-1.1993, -0.6174, -0.9150, ..., -0.2840, 0.0000, 0.0000],\n", + " ...,\n", + " [-0.4903, 0.0908, 0.9174, ..., 0.5471, 0.0000, 0.0000],\n", + " [ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000],\n", + " [ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000]]],\n", + "\n", + "\n", + " [[[-0.4726, -0.0753, -0.5620, ..., 1.0391, 0.0000, 0.0000],\n", + " [-0.9754, -0.9809, -0.2429, ..., -0.4416, 0.0000, 0.0000],\n", + " [-0.0702, 0.5949, 0.8080, ..., 1.8536, 0.0000, 0.0000],\n", + " ...,\n", + " [-1.6046, 0.5544, 0.0236, ..., 0.3422, 0.0000, 0.0000],\n", + " [ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000],\n", + " [ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000]]],\n", + "\n", + "\n", + " [[[-0.2572, 0.1227, -0.0109, ..., 0.3023, 0.0000, 0.0000],\n", + " [-0.9565, -0.1810, 0.5157, ..., 0.8777, 0.0000, 0.0000],\n", + " [ 0.0078, 2.0418, 2.2871, ..., -0.0903, 0.0000, 0.0000],\n", + " ...,\n", + " [ 0.1430, -0.5325, 0.4984, ..., 1.0231, 0.0000, 0.0000],\n", + " [ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000],\n", + " [ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000]]]])" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "new" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "torch.Size([100, 1, 24, 24])" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "x.shape > y.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "from torch.nn.utils.rnn import pad_sequence" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [], + "source": [ + "result = F.pad(input=y, pad=(1, 1, 1, 1), mode='constant', value=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "torch.Size([100, 1, 26, 26])" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result.shape" + ] + }, + { 
+ "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [], + "source": [ + "seq = [x, y, x, y]" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [], + "source": [ + "prev = seq[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [], + "source": [ + "prev = seq[0]\n", + "padded = [prev]\n", + "for tensor in seq:\n", + " if tensor.shape < prev.shape:\n", + " tensor_pad = F.pad(\n", + " input=tensor, pad=(1, 1, 1, 1), mode='constant', value=0\n", + " )\n", + " padded.append(tensor_pad)\n", + " else:\n", + " padded.append(tensor)\n", + " prev = tensor" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "torch.Size([100, 1, 26, 26])" + ] + }, + "execution_count": 68, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sum(padded).shape" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/examples/darts/advanced/example.py b/examples/darts/advanced/example.py index 05341f43..b4f2f2ff 100644 --- a/examples/darts/advanced/example.py +++ b/examples/darts/advanced/example.py @@ -2,6 +2,8 @@ import torch.nn as nn from torch import optim from torch.utils.data import DataLoader +from torchvision import datasets, transforms + from loguru import logger import example_setup as bmk @@ -9,19 +11,19 @@ import candle from operations import ( - Stem, OPS, PRIMITIVES + Stem, OPS ) def initialize_parameters(): - """ Initialize the parameters for the Uno example """ + """ Initialize the parameters for the Advanced example """ - uno_example = bmk.UnoExample( + uno_example = bmk.AdvancedExample( bmk.file_path, 'default_model.txt', 'pytorch', - prog='uno_example', - desc='Differentiable Architecture Search - Uno example', + prog='advanced_example', + desc='Differentiable Architecture Search - Advanced example', ) # Initialize parameters @@ -36,18 +38,18 @@ def run(params): device = torch.device(f"cuda" if args.cuda else "cpu") darts.banner(device=device) - train_loader = torch.utils.data.DataLoader( + trainloader = torch.utils.data.DataLoader( datasets.MNIST( - '../data', train=True, download=True, + './data', train=True, download=True, transform=transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,)) ])), batch_size=args.batch_size, shuffle=True) - valid_loader = torch.utils.data.DataLoader( + validloader = torch.utils.data.DataLoader( datasets.MNIST( - '../data', train=Fale, download=True, + './data', train=False, download=True, transform=transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,)) @@ -63,8 +65,7 @@ def run(params): stem = Stem(cell_dim=100) model = darts.Network( - stem, cell_dim=100, primitives=PRIMITIVES, ops=OPS, - tasks=tasks, criterion=criterion, device=device + stem, cell_dim=100, ops=OPS, tasks=tasks, criterion=criterion ).to(device) architecture = darts.Architecture(model, args, device=device) @@ -124,14 +125,14 @@ def train(trainloader, valid_iter = iter(trainloader) for step, (data, target) in enumerate(trainloader): - batch_size = data.size(0) model.train() - + target = _wrap_target(target) data = darts.to_device(data, device) target = darts.to_device(target, device) x_search, target_search = next(valid_iter) + target_search = _wrap_target(target_search) x_search = darts.to_device(x_search, device) target_search = darts.to_device(target_search, device) @@ -170,6 +171,7 @@ def validate(validloader, 
model, criterion, args, tasks, meter, device): model.eval() with torch.no_grad(): for step, (data, target) in enumerate(validloader): + target = _wrap_target(target) data = darts.to_device(data, device) target = darts.to_device(target, device) @@ -190,6 +192,12 @@ def validate(validloader, model, criterion, args, tasks, meter, device): meter.save(args.savepath) +def _wrap_target(target): + """ Wrap the MNIST target in a dictionary """ + return {'digits': target} + + + def main(): params = initialize_parameters() run(params) diff --git a/examples/darts/advanced/example_setup.py b/examples/darts/advanced/example_setup.py index 7d634b08..12d19b73 100644 --- a/examples/darts/advanced/example_setup.py +++ b/examples/darts/advanced/example_setup.py @@ -23,8 +23,8 @@ ] -class UnoExample(candle.Benchmark): - """ Example for Uno """ +class AdvancedExample(candle.Benchmark): + """ Example for Advanced use of DARTS """ def set_locals(self): """ Set parameters for the benchmark. diff --git a/examples/darts/advanced/operations.py b/examples/darts/advanced/operations.py index 32e9a844..23cd4e87 100644 --- a/examples/darts/advanced/operations.py +++ b/examples/darts/advanced/operations.py @@ -4,14 +4,12 @@ """ DARTS operations contstructor """ OPS = { + 'none' : lambda c, stride, affine: Identity(), 'conv_3' : lambda c, stride, affine: ConvBlock(c, c, 3, stride), 'dil_conv': lambda c, stride, affine: DilConv(c, c, 3, stride, 2, 2, affine=affine) } -PRIMITIVES = ['conv_3', 'dil_conv'] - - class Stem(nn.Module): """ Network stem @@ -35,7 +33,7 @@ def forward(self, x): class ConvBlock(nn.Module): - """ ReLu -> Conv1d """ + """ ReLu -> Conv2d """ def __init__(self, c_in, c_out, kernel_size, stride, affine=True): super(ConvBlock, self).__init__() @@ -81,3 +79,14 @@ def __init__(self, c_in, c_out, kernel_size, def forward(self, x): return self.op(x) + + +class Identity(nn.Module): + """ Identity module """ + + def __init__(self): + super(Identity, self).__init__() + + def forward(self, x): + return x + From c57beb4b3eb6e80226ad6c0c4ba0b3490c52b571 Mon Sep 17 00:00:00 2001 From: yngtodd Date: Sun, 12 Apr 2020 13:42:44 -0400 Subject: [PATCH 244/331] Add docstrings This should make the Network a little less opaque. --- common/darts/modules/network.py | 32 ++++++++++++++++++++++++--- examples/darts/advanced/example.py | 11 ++++++--- examples/darts/advanced/operations.py | 12 +++++++--- 3 files changed, 46 insertions(+), 9 deletions(-) diff --git a/common/darts/modules/network.py b/common/darts/modules/network.py index 8285f606..1947f837 100644 --- a/common/darts/modules/network.py +++ b/common/darts/modules/network.py @@ -18,11 +18,38 @@ class Hyperparameters: class Network(Model): - """ Collection of cells """ + """ Collection of cells + + Args: + stem: nn.Module that takes the input data + and outputs `cell_dim` number of features + + classifier_dim: number of features from + Darts.modules.mixed_layer.MixedLayer. This + depends upon the choice of primitives specified + by `ops`. + + ops: Constructor for all of the primitive nn.Modules. This + should be a dictionary of lambda function used to construct + your nn.Modules. The parameters of the lamdas must be `c`, the + number of input channels of each primitive, `stride`, the stride for + convolution blocks, and `affine`, whether to use `affine` in + batch norm. + + tasks: a dictionary whose keys are the names of the classification + tasks, and whose keys are the number of classes in each task. 
+ + criterion: Pytorch loss criterion + + device: Either "cpu" or "gpu + + hyperparams: instance of Hyperparameters. This hyperparamters for DARTS. + """ def __init__(self, stem: nn.Module, cell_dim: int, + classifier_dim: int, ops: Dict[str, Callable[[int, int, bool], nn.Module]], tasks: Dict[str, int], criterion, @@ -59,8 +86,7 @@ def __init__(self, self.cells += [cell] - # TODO(Todd): Find a way to calculate the output of the ops - self.classifier = MultitaskClassifier(676, tasks) + self.classifier = MultitaskClassifier(classifier_dim, tasks) # k is the total number of edges inside single cell, 14 k = sum(1 for i in range(self.num_nodes) for j in range(2 + i)) diff --git a/examples/darts/advanced/example.py b/examples/darts/advanced/example.py index b4f2f2ff..22d800bc 100644 --- a/examples/darts/advanced/example.py +++ b/examples/darts/advanced/example.py @@ -65,7 +65,8 @@ def run(params): stem = Stem(cell_dim=100) model = darts.Network( - stem, cell_dim=100, ops=OPS, tasks=tasks, criterion=criterion + stem, cell_dim=100, classifier_dim=676, + ops=OPS, tasks=tasks, criterion=criterion, device=device ).to(device) architecture = darts.Architecture(model, args, device=device) @@ -193,9 +194,13 @@ def validate(validloader, model, criterion, args, tasks, meter, device): def _wrap_target(target): - """ Wrap the MNIST target in a dictionary """ - return {'digits': target} + """ Wrap the MNIST target in a dictionary + The multitask classifier of DARTS expects a + dictionary of target tasks. Here we simply wrap + MNIST's target in a dictionary. + """ + return {'digits': target} def main(): diff --git a/examples/darts/advanced/operations.py b/examples/darts/advanced/operations.py index 23cd4e87..6a25bbf8 100644 --- a/examples/darts/advanced/operations.py +++ b/examples/darts/advanced/operations.py @@ -29,7 +29,9 @@ def __init__(self, in_channels: int=1, cell_dim: int=100, kernel_size=3): self.stem = nn.Conv2d(in_channels, cell_dim, kernel_size) def forward(self, x): - return self.stem(x) + x = self.stem(x) +# print(f'stem: {x.shape}') + return x class ConvBlock(nn.Module): @@ -42,7 +44,9 @@ def __init__(self, c_in, c_out, kernel_size, stride, affine=True): ) def forward(self, x): - return self.conv(F.relu(x)) + x = self.conv(F.relu(x)) +# print(f'convblock: {x.shape}') + return x class DilConv(nn.Module): @@ -78,7 +82,9 @@ def __init__(self, c_in, c_out, kernel_size, ) def forward(self, x): - return self.op(x) + x = self.op(x) +# print(f'dilconv: {x.shape}') + return x class Identity(nn.Module): From e9047b3335ae9febcee9fa24e461b37936d92e94 Mon Sep 17 00:00:00 2001 From: yngtodd Date: Sun, 12 Apr 2020 17:15:30 -0400 Subject: [PATCH 245/331] Start example docs This will give people a sense our how DARTS works. --- examples/darts/uno/README.rst | 47 +++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/examples/darts/uno/README.rst b/examples/darts/uno/README.rst index 8ee51b54..0c00ff64 100644 --- a/examples/darts/uno/README.rst +++ b/examples/darts/uno/README.rst @@ -1,3 +1,50 @@ ========= DARTS UNO ========= + + +Differentiable architecture search + +This is an adaptation of Hanxiao Liu et al's DARTS algorithm, extending +the work to handle convolutional neural networks for NLP problems and more. +Details of the original authors' approach can be found in their 2019 ICLR paper_. + + +UNO Example +----------- + +Let's take a look at a look at using DARTS for the Pilot 1 Uno example. 
In the Uno +problem the task is to classify tumor dose response with respect to a few different +data sources. For simplicity, we will use one source, Uno's gene data, to be used +for this classification. + +DARTS works by composing various neural net primitives, defined as Pytorch `nn.Modules`, +to create a larger directed acyclic graph (DAG) that is to be your model. This +composition is differentiable as we take the softmax of the choice of primitive types +at each layer of the network. To make this more clear, let's first define a few abstractions +in the algorithm: + +1. Primitve: this is the fundamental block of computation, defined as an `nn.Module`. + At each layer of your network, one of these primitves will be chosen by taking the + softmax of all possible primitives at that layer. Examples could be a convolution block, + a linear layer, a skip connect, or anything that you can come up with (subject to a few + constraints). + +2. Cell: this is an abstraction that holds each of the primitive types for level of your + network. This is where we perform the softmax over the possible primitive types. + +3. Nodes: this is the level of abstraction that would normally be considered a `layer` in + your network. It can contain one or more `Cells`. + +In the DARTS algorithm, we define a number of primitives that we would like to compose together +to form our neural network. The original paper started with 8 primitive types. These types +were originally designed for a vision task, and largely consist of convolution type operations. +We have since adapted these types for the `P3B5` benchmark, creating 1D convolution types for +our NLP tasks. If you would like to see how these primitives are defined, along with their +necessary constructors used by DARTS, you can find them in +:doc:`Darts.modules.operations.conv.py<../../../common/darts/modules/operations/conv.py`. + + +.. References +.. ---------- +.. _paper: https://openreview.net/forum?id=S1eYHoC5FX From 95e80a0023a8c5053791b2990fabcdf7d8798d78 Mon Sep 17 00:00:00 2001 From: yngtodd Date: Sun, 12 Apr 2020 17:25:59 -0400 Subject: [PATCH 246/331] Fix link Test relative link within the repository. --- examples/darts/uno/README.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/darts/uno/README.rst b/examples/darts/uno/README.rst index 0c00ff64..23a98640 100644 --- a/examples/darts/uno/README.rst +++ b/examples/darts/uno/README.rst @@ -42,9 +42,10 @@ were originally designed for a vision task, and largely consist of convolution t We have since adapted these types for the `P3B5` benchmark, creating 1D convolution types for our NLP tasks. If you would like to see how these primitives are defined, along with their necessary constructors used by DARTS, you can find them in -:doc:`Darts.modules.operations.conv.py<../../../common/darts/modules/operations/conv.py`. +`Darts.modules.operations.conv.py`_ .. References .. ---------- .. _paper: https://openreview.net/forum?id=S1eYHoC5FX +.. _Darts.modules.operations.conv.py: ../../../common/darts/modules/operations/conv.py From e71bab0b79ed3c86a972cd83a38be4c5d0782bc7 Mon Sep 17 00:00:00 2001 From: yngtodd Date: Sun, 12 Apr 2020 17:37:48 -0400 Subject: [PATCH 247/331] Update README This gives a good start as a high level overview of the algorithm. 
--- examples/darts/uno/README.rst | 43 +++++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/examples/darts/uno/README.rst b/examples/darts/uno/README.rst index 23a98640..32588140 100644 --- a/examples/darts/uno/README.rst +++ b/examples/darts/uno/README.rst @@ -9,43 +9,52 @@ This is an adaptation of Hanxiao Liu et al's DARTS algorithm, extending the work to handle convolutional neural networks for NLP problems and more. Details of the original authors' approach can be found in their 2019 ICLR paper_. - -UNO Example ------------ - -Let's take a look at a look at using DARTS for the Pilot 1 Uno example. In the Uno -problem the task is to classify tumor dose response with respect to a few different -data sources. For simplicity, we will use one source, Uno's gene data, to be used -for this classification. - -DARTS works by composing various neural net primitives, defined as Pytorch `nn.Modules`, +DARTS works by composing various neural net primitives, defined as Pytorch *nn.Modules*, to create a larger directed acyclic graph (DAG) that is to be your model. This composition is differentiable as we take the softmax of the choice of primitive types at each layer of the network. To make this more clear, let's first define a few abstractions in the algorithm: -1. Primitve: this is the fundamental block of computation, defined as an `nn.Module`. +1. **Primitve**: this is the fundamental block of computation, defined as an *nn.Module*. At each layer of your network, one of these primitves will be chosen by taking the softmax of all possible primitives at that layer. Examples could be a convolution block, a linear layer, a skip connect, or anything that you can come up with (subject to a few constraints). -2. Cell: this is an abstraction that holds each of the primitive types for level of your +2. **Cell**: this is an abstraction that holds each of the primitive types for level of your network. This is where we perform the softmax over the possible primitive types. -3. Nodes: this is the level of abstraction that would normally be considered a `layer` in - your network. It can contain one or more `Cells`. +3. **Nodes**: this is the level of abstraction that would normally be considered a layer in + your network. It can contain one or more *Cells*. + +4. **Architecture**: The abstraction that contains all nodes in the graph. This computes a + Hessian product with respect to the *alpha* parameters as defined in the paper. In the DARTS algorithm, we define a number of primitives that we would like to compose together to form our neural network. The original paper started with 8 primitive types. These types were originally designed for a vision task, and largely consist of convolution type operations. -We have since adapted these types for the `P3B5` benchmark, creating 1D convolution types for +We have since adapted these types for the *P3B5* benchmark, creating 1D convolution types for our NLP tasks. If you would like to see how these primitives are defined, along with their necessary constructors used by DARTS, you can find them in -`Darts.modules.operations.conv.py`_ +`darts.modules.operations.conv.py`_. +These primitives are then contained within a cell, and one or more cells are contained within a +node in the graph. DARTS then works by composing these nodes together and taking the softmax over +their primitives in each cell. 
Finally, the *Architecture* abstraction contains all nodes, and is +responsible for differentiating the composition of the nodes with respect to two *alpha* parameters +as defined in the paper. The end result is that we have a differentiable model that composes its +components as the model is training. + + +UNO Example +----------- + +Let's take a look at a look at using DARTS for the Pilot 1 Uno example. In the Uno +problem the task is to classify tumor dose response with respect to a few different +data sources. For simplicity, we will use one source, Uno's gene data, to be used +for this classification. .. References .. ---------- .. _paper: https://openreview.net/forum?id=S1eYHoC5FX -.. _Darts.modules.operations.conv.py: ../../../common/darts/modules/operations/conv.py +.. _darts.modules.operations.conv.py: ../../../common/darts/modules/operations/conv.py From bf8aaa7f657b12784411f1c17fdfd872fb078f24 Mon Sep 17 00:00:00 2001 From: yngtodd Date: Sun, 12 Apr 2020 18:29:27 -0400 Subject: [PATCH 248/331] Wrap up README This should give a good place for the tutorial. --- examples/darts/uno/README.rst | 71 ++++++++++++++++++++++++++++++++++- 1 file changed, 70 insertions(+), 1 deletion(-) diff --git a/examples/darts/uno/README.rst b/examples/darts/uno/README.rst index 32588140..ac869b0c 100644 --- a/examples/darts/uno/README.rst +++ b/examples/darts/uno/README.rst @@ -30,6 +30,10 @@ in the algorithm: 4. **Architecture**: The abstraction that contains all nodes in the graph. This computes a Hessian product with respect to the *alpha* parameters as defined in the paper. +5. **Genotype**: genotypes are instances of a particular configuration of the graph. As the + optimization runs, and each cell computes the softmax over their primitive types, the final + configuration of all nodes with their resulting primitive is a genotype. + In the DARTS algorithm, we define a number of primitives that we would like to compose together to form our neural network. The original paper started with 8 primitive types. These types were originally designed for a vision task, and largely consist of convolution type operations. @@ -45,6 +49,8 @@ responsible for differentiating the composition of the nodes with respect to two as defined in the paper. The end result is that we have a differentiable model that composes its components as the model is training. +As the optimization runs, the model will print the resulting loss with respect to a given *Genotype*. +The final model will be the *Genotype* with corresponding to the lowest loss. UNO Example ----------- @@ -52,9 +58,72 @@ UNO Example Let's take a look at a look at using DARTS for the Pilot 1 Uno example. In the Uno problem the task is to classify tumor dose response with respect to a few different data sources. For simplicity, we will use one source, Uno's gene data, to be used -for this classification. +for this classification. + +The Uno models are typically fully connected deep networks. DARTS provides some basic linear network +primitives which can be found in `darts.modules.operations.linear.py`_. For simplicity, we will make +use of those primitives for this example. To see how we can define new primitives, see the `advanced`_ +example. + +There are two main abstractions that we need to instantiate in order to get up and running: + +* **LinearNetwork**: + +.. code-block:: python + + LinearNetwork(input_dim, tasks, criterion, device) + +The *LinearNetwork* takes a few parameters: + +1. *input_dim* (int): the data input dimension +2. 
*tasks* (Dict[str, int]): a dictionary of classification tasks where the keys are the task names + and the values are the number of classes for that task. +3. *criterion*: a Pytorch loss function +4. *device* (str): either "cpu" or "gpu" + +* **Architecture**: + +.. code-block:: python + + Architecture(model, args, device) + +The *Architecture* expects the following arguments: + +1. *model*: and instance of the *LinearNetwork* +2. *args*: an instance of argparse args containing the weight decay and momentum parameters for the + *Architecture*'s optimizer controlling the Hessian optimization. +3. *device* (str): "cpu" or "gpu" + +Model training should familiar to those that are accustomed to using Pytorch with one small difference: + +.. code-block:: python + + # ... + for step, (data, target) in enumerate(trainloader): + #... + architecture.step( + data, target, x_search, target_search, lr, optimizer, unrolled + ) + # ... + # ... + +To understand what is going on here, recall that DARTS is a bi-level optimization procedure, +where there are two Pytorch optimizers, one for the normal gradient step for our model weights, +and another to for our *Architecture* to step in the composition of our neural net's nodes. The +*architecture.step* function is then taking that composition step. It expects that we pass it our +data and labels of the training set, but also the data and labels of our validation set. For +simplicity of this tutorial, *x_search* and *target_search* are from our training set, but these +would normally use a separate validation set. + +Finally, to run this example: + +.. code-block:: + + python uno_example.py .. References .. ---------- .. _paper: https://openreview.net/forum?id=S1eYHoC5FX .. _darts.modules.operations.conv.py: ../../../common/darts/modules/operations/conv.py +.. _darts.modules.operations.linear.py: ../../../common/darts/modules.operations.linear.py +.. _advanced: ../advanced From ce3eb758f97667622d6830f3cffb1668573d5a96 Mon Sep 17 00:00:00 2001 From: yngtodd Date: Sun, 12 Apr 2020 20:18:13 -0400 Subject: [PATCH 249/331] Update advanced README This gives us a starting point for this example. --- examples/darts/advanced/README.rst | 203 +++++++++++++++++++++++++- examples/darts/advanced/operations.py | 8 +- 2 files changed, 202 insertions(+), 9 deletions(-) diff --git a/examples/darts/advanced/README.rst b/examples/darts/advanced/README.rst index 8ee51b54..4b1d503a 100644 --- a/examples/darts/advanced/README.rst +++ b/examples/darts/advanced/README.rst @@ -1,3 +1,200 @@ -========= -DARTS UNO -========= +============== +DARTS Advanced +============== + + +Differentiable architecture search + +This is an adaptation of Hanxiao Liu et al's DARTS algorithm, extending +the work to handle convolutional neural networks for NLP problems and more. +Details of the original authors' approach can be found in their 2019 ICLR paper_. + +DARTS works by composing various neural net primitives, defined as Pytorch *nn.Modules*, +to create a larger directed acyclic graph (DAG) that is to be your model. This +composition is differentiable as we take the softmax of the choice of primitive types +at each layer of the network. To make this more clear, let's first define a few abstractions +in the algorithm: + +1. **Primitve**: this is the fundamental block of computation, defined as an *nn.Module*. + At each layer of your network, one of these primitves will be chosen by taking the + softmax of all possible primitives at that layer. 
Examples could be a convolution block, + a linear layer, a skip connect, or anything that you can come up with (subject to a few + constraints). + +2. **Cell**: this is an abstraction that holds each of the primitive types for level of your + network. This is where we perform the softmax over the possible primitive types. + +3. **Nodes**: this is the level of abstraction that would normally be considered a layer in + your network. It can contain one or more *Cells*. + +4. **Architecture**: The abstraction that contains all nodes in the graph. This computes a + Hessian product with respect to the *alpha* parameters as defined in the paper. + +5. **Genotype**: genotypes are instances of a particular configuration of the graph. As the + optimization runs, and each cell computes the softmax over their primitive types, the final + configuration of all nodes with their resulting primitive is a genotype. + +In the DARTS algorithm, we define a number of primitives that we would like to compose together +to form our neural network. The original paper started with 8 primitive types. These types +were originally designed for a vision task, and largely consist of convolution type operations. +We have since adapted these types for the *P3B5* benchmark, creating 1D convolution types for +our NLP tasks. If you would like to see how these primitives are defined, along with their +necessary constructors used by DARTS, you can find them in +`darts.modules.operations.conv.py`_. + +These primitives are then contained within a cell, and one or more cells are contained within a +node in the graph. DARTS then works by composing these nodes together and taking the softmax over +their primitives in each cell. Finally, the *Architecture* abstraction contains all nodes, and is +responsible for differentiating the composition of the nodes with respect to two *alpha* parameters +as defined in the paper. The end result is that we have a differentiable model that composes its +components as the model is training. + +As the optimization runs, the model will print the resulting loss with respect to a given *Genotype*. +The final model will be the *Genotype* with corresponding to the lowest loss. + +Adnvanced Example +----------------- + +In this example we will take a look at how to define our own primitives to be handled by DARTS. If +you have not read the `Uno example`_, I would recommend taking a look at that first. There we showed +how we can use the built in primitives to DARTS. As reference, you can also look to see how those +built it primitives are defined in `darts.modules.operations.linear.py`_ and +`darts.modules.operations.conv.py`_. + +In order to define custom networks to be handled by DARTS, you need to define a few things: + +1. **Network Stem**: This is an *nn.Module* that takes in your input data, processes it in some way, + and feeds its features of size *cell_dim* to your remaining network primitives. The parameter + *cell_dim* must be the input size for all of your primitives. Since DARTS can compose your primitives + in *any* order, the input and output dimension of all of your primitives must be of size *cell_dim*. + +2. **Primitives**: These *nn.Modules* are the basic building blocks for your network. They can be anything + that you dream of, so long as their input and output dimensions are of size *cell_dim*. + +3. **A constructor for your primitives**: This is a dictionary of lambda functions used to construct your + network primitives. By convention, this is a dictionary called *OPS*. 
We will look at this a bit closer + below. + +Defining our Components +----------------------- + +Let's take a look at the various pieces that we need to define. All of these components can be found in +`operations.py`_. + +Network Stem +------------ + +As we mentioned above, this is the module that is defined at the beginning of your network, mapping your +input data to *cell_dim*. + +.. code-block:: python + + class Stem(nn.Module): + """ Network stem + + This will always be the beginning of the network. + DARTS will only recompose modules after the stem. + For this reason, we define this separate from the + other modules in the network. + + Args: + input_dim: the input dimension for your data + + cell_dim: the intermediate dimension size for + the remaining modules of the network. + """ + def __init__(self, in_channels: int=1, cell_dim: int=100, kernel_size=3): + super(Stem, self).__init__() + self.stem = nn.Conv2d(in_channels, cell_dim, kernel_size) + + def forward(self, x): + return self.stem(x) + +Primitives +---------- + +DARTS primitives are Pytorch *nn.Modules*. For this example, we have defined three primitives: *ConvBlock*, +*DilConv*, and the *Identity* (a skip layer). It is important to remember DARTS will try many different +orderings of these primitives between *nodes*. Therefore, the imput and output dimensions of each of these +primitives must be of size *cell_dim*. + +It is also important to know that DARTS expects the *Identity* function to be included in the primitives. +This is so that DARTS can account for varying depths of neural networks. Since at each node, DARTS must choose +one primitive (choosing meaning taking the softmax over the primitives), having the no-op *Identity* means +that we can optimize over the depth of the network. It would be possible to define a 100 layer network and +have the output *Genotype* be only a few layers deep. If we were to not include the *Identity*, every layer +would be some transformation of the previous layer's features, and we could run the risk of overparameterizing +our network. + +A Constructor for our Primitives +-------------------------------- + +Since DARTS does not control what primitives you define, we need to provide it with a constructor for those +primitives. By convention, this is handled by a dictionary of lambda functions called *OPS*. The keys of this +dictionary are the names of our primitives, and the values of the dictionary are lambda functions that +construct those primitives. Let's take a look at the example's *OPS* + +**************** + +There are two main abstractions that we need to instantiate in order to get up and running: + +* **LinearNetwork**: + +.. code-block:: python + + LinearNetwork(input_dim, tasks, criterion, device) + +The *LinearNetwork* takes a few parameters: + +1. *input_dim* (int): the data input dimension +2. *tasks* (Dict[str, int]): a dictionary of classification tasks where the keys are the task names + and the values are the number of classes for that task. +3. *criterion*: a Pytorch loss function +4. *device* (str): either "cpu" or "gpu" + +* **Architecture**: + +.. code-block:: python + + Architecture(model, args, device) + +The *Architecture* expects the following arguments: + +1. *model*: and instance of the *LinearNetwork* +2. *args*: an instance of argparse args containing the weight decay and momentum parameters for the + *Architecture*'s optimizer controlling the Hessian optimization. +3. 
*device* (str): "cpu" or "gpu" + +Model training should familiar to those that are accustomed to using Pytorch with one small difference: + +.. code-block:: python + + # ... + for step, (data, target) in enumerate(trainloader): + #... + architecture.step( + data, target, x_search, target_search, lr, optimizer, unrolled + ) + # ... + # ... + +To understand what is going on here, recall that DARTS is a bi-level optimization procedure, +where there are two Pytorch optimizers, one for the normal gradient step for our model weights, +and another to for our *Architecture* to step in the composition of our neural net's nodes. The +*architecture.step* function is then taking that composition step. It expects that we pass it our +data and labels of the training set, but also the data and labels of our validation set. For +simplicity of this tutorial, *x_search* and *target_search* are from our training set, but these +would normally use a separate validation set. + +Finally, to run this example: + +.. code-block:: + + python uno_example.py + +.. References +.. ---------- +.. _paper: https://openreview.net/forum?id=S1eYHoC5FX +.. _darts.modules.operations.conv.py: ../../../common/darts/modules/operations/conv.py +.. _darts.modules.operations.linear.py: ../../../common/darts/modules.operations.linear.py +.. _operations.py: ./operations.py diff --git a/examples/darts/advanced/operations.py b/examples/darts/advanced/operations.py index 6a25bbf8..d07c878d 100644 --- a/examples/darts/advanced/operations.py +++ b/examples/darts/advanced/operations.py @@ -44,9 +44,7 @@ def __init__(self, c_in, c_out, kernel_size, stride, affine=True): ) def forward(self, x): - x = self.conv(F.relu(x)) -# print(f'convblock: {x.shape}') - return x + return self.conv(F.relu(x)) class DilConv(nn.Module): @@ -82,9 +80,7 @@ def __init__(self, c_in, c_out, kernel_size, ) def forward(self, x): - x = self.op(x) -# print(f'dilconv: {x.shape}') - return x + return self.op(x) class Identity(nn.Module): From 52ed7d55fa566d839b55eb08f059d84b17a4e9e4 Mon Sep 17 00:00:00 2001 From: yngtodd Date: Sun, 12 Apr 2020 20:42:27 -0400 Subject: [PATCH 250/331] Finalize README This get us set for the tutorial! --- examples/darts/advanced/README.rst | 76 ++++++++++++------------------ 1 file changed, 31 insertions(+), 45 deletions(-) diff --git a/examples/darts/advanced/README.rst b/examples/darts/advanced/README.rst index 4b1d503a..ceb41a86 100644 --- a/examples/darts/advanced/README.rst +++ b/examples/darts/advanced/README.rst @@ -132,65 +132,51 @@ A Constructor for our Primitives Since DARTS does not control what primitives you define, we need to provide it with a constructor for those primitives. By convention, this is handled by a dictionary of lambda functions called *OPS*. The keys of this dictionary are the names of our primitives, and the values of the dictionary are lambda functions that -construct those primitives. Let's take a look at the example's *OPS* - -**************** - -There are two main abstractions that we need to instantiate in order to get up and running: - -* **LinearNetwork**: +construct those primitives. Let's take a look at the example's *OPS*: .. code-block:: python - LinearNetwork(input_dim, tasks, criterion, device) - -The *LinearNetwork* takes a few parameters: - -1. *input_dim* (int): the data input dimension -2. *tasks* (Dict[str, int]): a dictionary of classification tasks where the keys are the task names - and the values are the number of classes for that task. -3. *criterion*: a Pytorch loss function -4. 
*device* (str): either "cpu" or "gpu" + """ DARTS operations contstructor """ + OPS = { + 'none' : lambda c, stride, affine: Identity(), + 'conv_3' : lambda c, stride, affine: ConvBlock(c, c, 3, stride), + 'dil_conv': lambda c, stride, affine: DilConv(c, c, 3, stride, 2, 2, affine=affine) + } + +As mentioned, the keys of *OPS* are the names we give to each of our primitives. These keys will be +what DARTS uses when defining *Genotypes*. Note that the the lambda functions take three parameters: +1. *c*, the number of channels (or features) of the layer; 2. *stride*, the stride for convolutions; and +3. *affine* whether to use affine transforms in batch normalization. These parameters are the default +implementation of DARTS, and must be present. Any other hyperparameters of our custom primitives must be +given default values. One last thing to note: in order to keep things consistent, DARTS reserves the keyword +*none* for the *Identity* primitive. Again, this primitive must be included in any custom primitive set, and +it's key must be *none*. This method of constructing our primitives could be changed in future versions of +DARTS to better acccommodate fancier primitives. As always, pull requests are welcome! + +Putting it all Together +----------------------- -* **Architecture**: +Once we have defined our stem, primitives, and our *OPS* constructor, we can that hand them over to DARTS: .. code-block:: python - Architecture(model, args, device) - -The *Architecture* expects the following arguments: - -1. *model*: and instance of the *LinearNetwork* -2. *args*: an instance of argparse args containing the weight decay and momentum parameters for the - *Architecture*'s optimizer controlling the Hessian optimization. -3. *device* (str): "cpu" or "gpu" + model = darts.Network( + stem, cell_dim=100, classifier_dim=676, + ops=OPS, tasks=tasks, criterion=criterion, device=device + ).to(device) -Model training should familiar to those that are accustomed to using Pytorch with one small difference: - -.. code-block:: python + architecture = darts.Architecture(model, args, device=device) - # ... - for step, (data, target) in enumerate(trainloader): - #... - architecture.step( - data, target, x_search, target_search, lr, optimizer, unrolled - ) - # ... - # ... - -To understand what is going on here, recall that DARTS is a bi-level optimization procedure, -where there are two Pytorch optimizers, one for the normal gradient step for our model weights, -and another to for our *Architecture* to step in the composition of our neural net's nodes. The -*architecture.step* function is then taking that composition step. It expects that we pass it our -data and labels of the training set, but also the data and labels of our validation set. For -simplicity of this tutorial, *x_search* and *target_search* are from our training set, but these -would normally use a separate validation set. +Note that we must specify the *classifier_dim* the number of input features from our primitives. Since each +of the primitives must have the same number of input and output features, this will be the flattned number +of features from any of your primitives. Since DARTS cannot know ahead of time what your primitives will be, +we must specify how many features will go into our final fully connected layer of the network. Finally, to run this example: .. code-block:: - python uno_example.py + python example.py .. References .. 
---------- From 6955bb2d9a8286a18de9202346f5a86300daf6cd Mon Sep 17 00:00:00 2001 From: yngtodd Date: Sun, 12 Apr 2020 20:45:30 -0400 Subject: [PATCH 251/331] Update README table of contents This will be a guide to the ordering of examples. --- examples/darts/README.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/examples/darts/README.rst b/examples/darts/README.rst index ef49aead..15322c13 100644 --- a/examples/darts/README.rst +++ b/examples/darts/README.rst @@ -1,3 +1,11 @@ ============== DARTS Examples ============== + +Our recommended ordering of examples: + +1. **Uno**: learn how to use the neural network building blocks in DARTS to + define a fully connected model using DARTS. + +2. **Advanced**: how to define our own neural network primitives to be optimized + by DARTS. From 8237b6bf3fedf37da4a627705936df59edd31102 Mon Sep 17 00:00:00 2001 From: yngtodd Date: Sun, 12 Apr 2020 20:48:10 -0400 Subject: [PATCH 252/331] Add link to Uno example This was missing. --- examples/darts/advanced/README.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/darts/advanced/README.rst b/examples/darts/advanced/README.rst index ceb41a86..d9197271 100644 --- a/examples/darts/advanced/README.rst +++ b/examples/darts/advanced/README.rst @@ -184,3 +184,4 @@ Finally, to run this example: .. _darts.modules.operations.conv.py: ../../../common/darts/modules/operations/conv.py .. _darts.modules.operations.linear.py: ../../../common/darts/modules.operations.linear.py .. _operations.py: ./operations.py +.. _Uno example: ../uno From af2c23ea3976acc6928838bda5e23e421da241f4 Mon Sep 17 00:00:00 2001 From: yngtodd Date: Sun, 12 Apr 2020 21:13:39 -0400 Subject: [PATCH 253/331] Remove placeholders We no longer need the outline for the pull request. Replacing that with an overview of the algorithm. --- common/darts/README.rst | 72 ++++++++++++++++++++++++----------------- 1 file changed, 42 insertions(+), 30 deletions(-) diff --git a/common/darts/README.rst b/common/darts/README.rst index 048eeb01..7dce2fcd 100644 --- a/common/darts/README.rst +++ b/common/darts/README.rst @@ -1,6 +1,7 @@ -===== -DARTS -===== +============== +DARTS Advanced +============== + Differentiable architecture search @@ -8,43 +9,54 @@ This is an adaptation of Hanxiao Liu et al's DARTS algorithm, extending the work to handle convolutional neural networks for NLP problems and more. Details of the original authors' approach can be found in their 2019 ICLR paper_. -Notes ------ - -The following steps should be finished before merging the PR: +DARTS works by composing various neural net primitives, defined as Pytorch *nn.Modules*, +to create a larger directed acyclic graph (DAG) that is to be your model. This +composition is differentiable as we take the softmax of the choice of primitive types +at each layer of the network. To make this more clear, let's first define a few abstractions +in the algorithm: -- [ ] Expert level `Network` with user defined primitives and stem -- [ ] Examples -- [ ] README overview of the library +1. **Primitve**: this is the fundamental block of computation, defined as an *nn.Module*. + At each layer of your network, one of these primitves will be chosen by taking the + softmax of all possible primitives at that layer. Examples could be a convolution block, + a linear layer, a skip connect, or anything that you can come up with (subject to a few + constraints). -Expert Level Network --------------------- +2. 
**Cell**: this is an abstraction that holds each of the primitive types for level of your + network. This is where we perform the softmax over the possible primitive types. -The user must define: +3. **Nodes**: this is the level of abstraction that would normally be considered a layer in + your network. It can contain one or more *Cells*. -1. Fundamental operations -2. Ops constructor for fundamental operations -3. Primitives list +4. **Architecture**: The abstraction that contains all nodes in the graph. This computes a + Hessian product with respect to the *alpha* parameters as defined in the paper. -Draft ------ +5. **Genotype**: genotypes are instances of a particular configuration of the graph. As the + optimization runs, and each cell computes the softmax over their primitive types, the final + configuration of all nodes with their resulting primitive is a genotype. -.. code-block:: python +In the DARTS algorithm, we define a number of primitives that we would like to compose together +to form our neural network. The original paper started with 8 primitive types. These types +were originally designed for a vision task, and largely consist of convolution type operations. +We have since adapted these types for the *P3B5* benchmark, creating 1D convolution types for +our NLP tasks. If you would like to see how these primitives are defined, along with their +necessary constructors used by DARTS, you can find them in +`darts.modules.operations.conv.py`_. - class Network: - """ Expert mode network """ +These primitives are then contained within a cell, and one or more cells are contained within a +node in the graph. DARTS then works by composing these nodes together and taking the softmax over +their primitives in each cell. Finally, the *Architecture* abstraction contains all nodes, and is +responsible for differentiating the composition of the nodes with respect to two *alpha* parameters +as defined in the paper. The end result is that we have a differentiable model that composes its +components as the model is training. - def __init__(self, stem, primitives, ops): - self.stem = stem - self.primitives = primitives - self ops = ops - - def _helper_init(self, ...): - """ Helper to construct the private member variables """ - raise NotImplementedError +As the optimization runs, the model will print the resulting loss with respect to a given *Genotype*. +The final model will be the *Genotype* with corresponding to the lowest loss. .. References .. ---------- .. _paper: https://openreview.net/forum?id=S1eYHoC5FX - +.. _darts.modules.operations.conv.py: ../../../common/darts/modules/operations/conv.py +.. _darts.modules.operations.linear.py: ../../../common/darts/modules.operations.linear.py +.. _operations.py: ./operations.py +.. _Uno example: ../uno From 3b455d0a5f5b8077955c0baf3e6a8685c08a662c Mon Sep 17 00:00:00 2001 From: yngtodd Date: Sun, 12 Apr 2020 21:23:09 -0400 Subject: [PATCH 254/331] Fix title This was a carryover from the advanced tutorial. 
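A *Genotype*, as described in the overview above, is the discrete architecture read off from the trained alphas. The short sketch below shows that read-out step under simplifying assumptions: a single *alphas* tensor of shape (num_nodes, num_primitives) and the primitive names used by the advanced example; the repository's own genotype handling in the darts package is more involved.

.. code-block:: python

    import torch

    # primitive names as used in the advanced example; 'none' is the identity/skip op
    PRIMITIVES = ['none', 'conv_3', 'dil_conv']

    def derive_genotype(alphas: torch.Tensor) -> list:
        """Pick the highest-weighted primitive for every node."""
        weights = torch.softmax(alphas, dim=-1)   # (num_nodes, num_primitives)
        choices = weights.argmax(dim=-1)          # index of the winning primitive per node
        return [PRIMITIVES[i] for i in choices.tolist()]

    # toy example: 4 nodes, 3 candidate primitives each
    alphas = torch.randn(4, len(PRIMITIVES))
    print(derive_genotype(alphas))  # e.g. ['conv_3', 'none', 'dil_conv', 'conv_3']

The lowest-loss genotype observed during the search is what the later patches in this series persist with *GenotypeStorage*.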
--- common/darts/README.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/common/darts/README.rst b/common/darts/README.rst index 7dce2fcd..f6bc557d 100644 --- a/common/darts/README.rst +++ b/common/darts/README.rst @@ -1,6 +1,6 @@ -============== -DARTS Advanced -============== +===== +DARTS +===== Differentiable architecture search From 45d9564208443fd03800728951f6ce52f2b1b9b0 Mon Sep 17 00:00:00 2001 From: yngtodd Date: Sun, 12 Apr 2020 21:26:08 -0400 Subject: [PATCH 255/331] Tidy up example This makes sure that results are saved to the results dir. --- examples/darts/advanced/Untitled.ipynb | 624 ---------------------- examples/darts/advanced/default_model.txt | 2 +- examples/darts/advanced/results/.gitkeep | 0 3 files changed, 1 insertion(+), 625 deletions(-) delete mode 100644 examples/darts/advanced/Untitled.ipynb delete mode 100644 examples/darts/advanced/results/.gitkeep diff --git a/examples/darts/advanced/Untitled.ipynb b/examples/darts/advanced/Untitled.ipynb deleted file mode 100644 index fdb1165b..00000000 --- a/examples/darts/advanced/Untitled.ipynb +++ /dev/null @@ -1,624 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "import torch\n", - "import torch.nn as nn\n", - "import torch.nn.functional as F\n", - "from torch.utils.data import DataLoader\n", - "from torchvision import datasets\n", - "from torchvision import transforms\n", - "from operations import OPS" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "train = datasets.MNIST('./data', train=True, transform=transforms.ToTensor(), download=True)\n", - "valid = datasets.MNIST('./data', train=False)\n", - "\n", - "train_loader = DataLoader(train, batch_size=10)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "x, y = next(iter(train_loader))" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "torch.Size([10, 1, 28, 28])" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "x.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "train_loader = torch.utils.data.DataLoader(\n", - " datasets.MNIST('../data', train=True, download=True,\n", - " transform=transforms.Compose([\n", - " transforms.ToTensor(),\n", - " transforms.Normalize((0.1307,), (0.3081,))\n", - " ])),\n", - " batch_size=10, shuffle=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "x, y = next(iter(train_loader))" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "torch.Size([10, 1, 28, 28])" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "x.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 57, - "metadata": {}, - "outputs": [], - "source": [ - "class StemNet(nn.Module):\n", - " \"\"\" Network stem\n", - "\n", - " This will always be the beginning of the network.\n", - " DARTS will only recompose modules after the stem.\n", - " For this reason, we define this separate from the\n", - " other modules in the network.\n", - "\n", - " Args:\n", - " input_dim: the input dimension for your data\n", - "\n", - " cell_dim: 
the intermediate dimension size for\n", - " the remaining modules of the network.\n", - " \"\"\"\n", - " def __init__(self, in_channels: int=1, cell_dim: int=100, kernel_size=3):\n", - " super(StemNet, self).__init__()\n", - " self.stem = nn.Conv2d(in_channels, cell_dim, kernel_size)\n", - "\n", - " def forward(self, x):\n", - " return self.stem(x)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 58, - "metadata": {}, - "outputs": [], - "source": [ - "class ConvBlock(nn.Module):\n", - " \"\"\" ReLu -> Conv1d -> BatchNorm \"\"\"\n", - "\n", - " def __init__(self, c_in, c_out, kernel_size, stride, affine=True):\n", - " super(ConvBlock, self).__init__()\n", - " self.conv = nn.Conv2d(c_in, c_out, kernel_size=kernel_size, stride=stride)\n", - "\n", - " def forward(self, x):\n", - " return self.conv(x)" - ] - }, - { - "cell_type": "code", - "execution_count": 100, - "metadata": {}, - "outputs": [], - "source": [ - "class DilConv(nn.Module):\n", - " \"\"\" ReLU Dilated Convolution \"\"\"\n", - "\n", - " def __init__(self, c_in, c_out, kernel_size, \n", - " stride, padding, dilation, affine=True):\n", - " super(DilConv, self).__init__()\n", - "\n", - " self.op = nn.Sequential(\n", - " nn.ReLU(inplace=False),\n", - "\n", - " nn.Conv2d(\n", - " c_in,\n", - " c_in,\n", - " kernel_size=kernel_size,\n", - " stride=stride,\n", - " padding=padding,\n", - " dilation=dilation,\n", - " groups=c_in,\n", - " bias=False\n", - " ),\n", - "\n", - " nn.Conv2d(\n", - " c_in,\n", - " c_out,\n", - " kernel_size=1,\n", - " padding=0,\n", - " bias=False\n", - " ),\n", - "\n", - " nn.BatchNorm2d(c_out, affine=affine),\n", - " )\n", - "\n", - " def forward(self, x):\n", - " return self.op(x)" - ] - }, - { - "cell_type": "code", - "execution_count": 101, - "metadata": {}, - "outputs": [], - "source": [ - "stem = StemNet()" - ] - }, - { - "cell_type": "code", - "execution_count": 102, - "metadata": {}, - "outputs": [], - "source": [ - "OPS = {\n", - " 'dil_conv': lambda c, stride, affine: DilConv(c, c, 3, stride, 2, 2, affine=affine),\n", - " 'conv' : lambda c, stride, affine: ConvBlock(c, c, 3, stride, affine=affine),\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": 103, - "metadata": {}, - "outputs": [], - "source": [ - "conv = OPS['conv'](100, 1, True)" - ] - }, - { - "cell_type": "code", - "execution_count": 104, - "metadata": {}, - "outputs": [], - "source": [ - "dill = OPS['dil_conv'](100, 1, True)" - ] - }, - { - "cell_type": "code", - "execution_count": 105, - "metadata": {}, - "outputs": [], - "source": [ - "out = stem(x)" - ] - }, - { - "cell_type": "code", - "execution_count": 106, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "torch.Size([10, 100, 26, 26])" - ] - }, - "execution_count": 106, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "out.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 107, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "torch.Size([10, 100, 24, 24])" - ] - }, - "execution_count": 107, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "conv(out).shape" - ] - }, - { - "cell_type": "code", - "execution_count": 108, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "torch.Size([10, 100, 26, 26])" - ] - }, - "execution_count": 108, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dill(out).shape" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": 
{ - "text/plain": [ - "torch.Size([100, 1, 26, 26])" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "torch.Size([100, 1, 26, 26])" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [], - "source": [ - "x = torch.randn(torch.Size([100, 1, 26, 26]))\n", - "y = torch.randn(torch.Size([100, 1, 24, 24]))" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "torch.Size([100, 1, 26, 26])" - ] - }, - "execution_count": 39, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "x.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "torch.Size([100, 1, 24, 24])" - ] - }, - "execution_count": 40, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "y.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": {}, - "outputs": [], - "source": [ - "new = torch.zeros_like(x)" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "metadata": {}, - "outputs": [], - "source": [ - "new[:, :, :24, :24] = y" - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "tensor([[[[ 0.5583, -0.4697, -1.0664, ..., 0.7064, 0.0000, 0.0000],\n", - " [ 0.6258, 0.0675, 0.6688, ..., 0.6203, 0.0000, 0.0000],\n", - " [ 1.1937, 0.2701, 0.2217, ..., 0.0806, 0.0000, 0.0000],\n", - " ...,\n", - " [-0.0659, -0.1802, 0.3372, ..., -0.3461, 0.0000, 0.0000],\n", - " [ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000],\n", - " [ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000]]],\n", - "\n", - "\n", - " [[[-1.2648, -0.5133, -0.4088, ..., -0.4855, 0.0000, 0.0000],\n", - " [-1.3394, 0.4108, 0.6637, ..., 1.4993, 0.0000, 0.0000],\n", - " [ 0.7185, -0.2766, -0.4765, ..., -1.1961, 0.0000, 0.0000],\n", - " ...,\n", - " [-0.1028, -0.8230, -0.9398, ..., 0.7469, 0.0000, 0.0000],\n", - " [ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000],\n", - " [ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000]]],\n", - "\n", - "\n", - " [[[ 0.7054, 0.6086, -1.7438, ..., 0.1894, 0.0000, 0.0000],\n", - " [ 0.8151, 0.1001, 1.1741, ..., -1.1449, 0.0000, 0.0000],\n", - " [-0.5580, 1.6470, 0.7271, ..., -0.4478, 0.0000, 0.0000],\n", - " ...,\n", - " [-0.1456, 0.5271, -0.5290, ..., 1.8301, 0.0000, 0.0000],\n", - " [ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000],\n", - " [ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000]]],\n", - "\n", - "\n", - " ...,\n", - "\n", - "\n", - " [[[ 0.6753, -1.2837, -0.6700, ..., -1.3338, 0.0000, 0.0000],\n", - " [-0.6663, 0.0904, -0.5048, ..., 0.9331, 0.0000, 0.0000],\n", - " [-1.1993, -0.6174, -0.9150, ..., -0.2840, 0.0000, 0.0000],\n", - " ...,\n", - " [-0.4903, 0.0908, 0.9174, ..., 0.5471, 0.0000, 0.0000],\n", - " [ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000],\n", - " [ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000]]],\n", - "\n", - "\n", - " [[[-0.4726, -0.0753, -0.5620, ..., 1.0391, 0.0000, 0.0000],\n", - " [-0.9754, -0.9809, -0.2429, ..., -0.4416, 0.0000, 0.0000],\n", - " [-0.0702, 0.5949, 0.8080, ..., 1.8536, 0.0000, 0.0000],\n", - " ...,\n", - " [-1.6046, 0.5544, 0.0236, ..., 0.3422, 0.0000, 0.0000],\n", - " [ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000],\n", - " [ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000]]],\n", - "\n", - "\n", - " [[[-0.2572, 0.1227, 
-0.0109, ..., 0.3023, 0.0000, 0.0000],\n", - " [-0.9565, -0.1810, 0.5157, ..., 0.8777, 0.0000, 0.0000],\n", - " [ 0.0078, 2.0418, 2.2871, ..., -0.0903, 0.0000, 0.0000],\n", - " ...,\n", - " [ 0.1430, -0.5325, 0.4984, ..., 1.0231, 0.0000, 0.0000],\n", - " [ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000],\n", - " [ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000]]]])" - ] - }, - "execution_count": 51, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "new" - ] - }, - { - "cell_type": "code", - "execution_count": 48, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "torch.Size([100, 1, 24, 24])" - ] - }, - "execution_count": 48, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "y.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 59, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 59, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "x.shape > y.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [], - "source": [ - "from torch.nn.utils.rnn import pad_sequence" - ] - }, - { - "cell_type": "code", - "execution_count": 55, - "metadata": {}, - "outputs": [], - "source": [ - "result = F.pad(input=y, pad=(1, 1, 1, 1), mode='constant', value=0)" - ] - }, - { - "cell_type": "code", - "execution_count": 56, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "torch.Size([100, 1, 26, 26])" - ] - }, - "execution_count": 56, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "result.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 60, - "metadata": {}, - "outputs": [], - "source": [ - "seq = [x, y, x, y]" - ] - }, - { - "cell_type": "code", - "execution_count": 64, - "metadata": {}, - "outputs": [], - "source": [ - "prev = seq[0]" - ] - }, - { - "cell_type": "code", - "execution_count": 67, - "metadata": {}, - "outputs": [], - "source": [ - "prev = seq[0]\n", - "padded = [prev]\n", - "for tensor in seq:\n", - " if tensor.shape < prev.shape:\n", - " tensor_pad = F.pad(\n", - " input=tensor, pad=(1, 1, 1, 1), mode='constant', value=0\n", - " )\n", - " padded.append(tensor_pad)\n", - " else:\n", - " padded.append(tensor)\n", - " prev = tensor" - ] - }, - { - "cell_type": "code", - "execution_count": 68, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "torch.Size([100, 1, 26, 26])" - ] - }, - "execution_count": 68, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sum(padded).shape" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.7" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/examples/darts/advanced/default_model.txt b/examples/darts/advanced/default_model.txt index 1e8badd6..79d9c880 100644 --- a/examples/darts/advanced/default_model.txt +++ b/examples/darts/advanced/default_model.txt @@ -2,7 +2,7 @@ model_name = 'darts_uno' unrolled = False data_url = 'ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/' -savepath = '.' 
+savepath = './results' log_interval = 10 train_data = 'top_21_auc_1fold.uno.h5' learning_rate = 0.01 diff --git a/examples/darts/advanced/results/.gitkeep b/examples/darts/advanced/results/.gitkeep deleted file mode 100644 index e69de29b..00000000 From 3ca5242211e6a75252711cdeaa7a51f8f094c8e4 Mon Sep 17 00:00:00 2001 From: yngtodd Date: Sun, 12 Apr 2020 21:30:42 -0400 Subject: [PATCH 256/331] Fix import The benchmark wasn't using darts to get access to the GenotypeStorage. --- Pilot3/P3B5/p3b5_baseline_pytorch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Pilot3/P3B5/p3b5_baseline_pytorch.py b/Pilot3/P3B5/p3b5_baseline_pytorch.py index 348c779a..63c5a8a3 100644 --- a/Pilot3/P3B5/p3b5_baseline_pytorch.py +++ b/Pilot3/P3B5/p3b5_baseline_pytorch.py @@ -84,7 +84,7 @@ def run(params): eta_min=args.learning_rate_min, ) - genotype_store = GenotypeStorage(root=args.save_path) + genotype_store = darts.GenotypeStorage(root=args.save_path) min_loss = 9999 for epoch in range(args.epochs): From 736cb65a204bdb453fed1c032609b33d0a5ed410 Mon Sep 17 00:00:00 2001 From: yngtodd Date: Mon, 13 Apr 2020 15:06:58 -0400 Subject: [PATCH 257/331] Remove data fetching `candle.fetch_data` was having some trouble when not being called from a benchmark directory. Switching to downloading the data through the `Uno` dataset class makes sure that users can download the data on any machine, saving it to the examples /data directory. --- examples/darts/uno/default_model.txt | 2 +- examples/darts/uno/uno_example.py | 20 ++------------------ 2 files changed, 3 insertions(+), 19 deletions(-) diff --git a/examples/darts/uno/default_model.txt b/examples/darts/uno/default_model.txt index 1e8badd6..b5a538b7 100644 --- a/examples/darts/uno/default_model.txt +++ b/examples/darts/uno/default_model.txt @@ -1,7 +1,7 @@ [Global_Params] model_name = 'darts_uno' unrolled = False -data_url = 'ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/' +data_url = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/' savepath = '.' log_interval = 10 train_data = 'top_21_auc_1fold.uno.h5' diff --git a/examples/darts/uno/uno_example.py b/examples/darts/uno/uno_example.py index 0d960c4e..a7b402cd 100644 --- a/examples/darts/uno/uno_example.py +++ b/examples/darts/uno/uno_example.py @@ -25,20 +25,6 @@ def initialize_parameters(): return gParameters -def fetch_data(gParameters): - """ Download and untar data - - Args: - gParameters: parameters from candle - - Returns: - path to where the data is located - """ - path = gParameters['data_url'] - fpath = candle.fetch_file(path + gParameters['train_data'], 'UnoExample') - return fpath - - def run(params): args = candle.ArgumentStruct(**params) @@ -46,10 +32,8 @@ def run(params): device = torch.device(f"cuda" if args.cuda else "cpu") darts.banner(device=device) - #datapath = fetch_data(params) - datapath = params['data_url'] + params['train_data'] - train_data = darts.Uno(datapath, 'train', download=True) - valid_data = darts.Uno(datapath, 'test') + train_data = darts.Uno('./data', 'train', download=True) + valid_data = darts.Uno('./data', 'test') train_data = darts.sample(train_data, len(valid_data)) From c88454e4299653f9529167aa451b6d80c9b59318 Mon Sep 17 00:00:00 2001 From: yngtodd Date: Mon, 13 Apr 2020 21:29:01 -0400 Subject: [PATCH 258/331] Remove notebook checkpoints Just tidying up for the PR. 
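Taken together, the pieces touched by the last few patches wire up roughly as follows. This is a condensed sketch of the Uno example script, not a drop-in replacement: the batch size, input dimension, task dictionary, and optimizer settings are illustrative placeholders.

.. code-block:: python

    from types import SimpleNamespace

    import torch
    from torch.utils.data import DataLoader

    import darts  # the common/darts package from this repository

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    criterion = torch.nn.CrossEntropyLoss()

    # dataset wrapper used by the example; downloads the HDF5 file on first use
    train_data = darts.Uno('./data', 'train', download=True)
    valid_data = darts.Uno('./data', 'test')
    trainloader = DataLoader(train_data, batch_size=32)
    validloader = DataLoader(valid_data, batch_size=32)

    # placeholder task spec: one classification head named 'response'
    tasks = {'response': 2}

    # args only needs to carry the settings the architecture optimizer reads;
    # the values here are placeholders
    args = SimpleNamespace(weight_decay=3e-4, momentum=0.9)

    model = darts.LinearNetwork(942, tasks, criterion, device).to(device)  # 942 is a placeholder input_dim
    architecture = darts.Architecture(model, args, device=device)
    genotype_store = darts.GenotypeStorage(root='./results')

From here the training loop alternates ordinary weight updates with *architecture.step(...)* and saves the best genotype as it goes, as the surrounding patches show.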
--- .../Untitled-checkpoint.ipynb | 624 ------------------ 1 file changed, 624 deletions(-) delete mode 100644 examples/darts/advanced/.ipynb_checkpoints/Untitled-checkpoint.ipynb diff --git a/examples/darts/advanced/.ipynb_checkpoints/Untitled-checkpoint.ipynb b/examples/darts/advanced/.ipynb_checkpoints/Untitled-checkpoint.ipynb deleted file mode 100644 index fdb1165b..00000000 --- a/examples/darts/advanced/.ipynb_checkpoints/Untitled-checkpoint.ipynb +++ /dev/null @@ -1,624 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "import torch\n", - "import torch.nn as nn\n", - "import torch.nn.functional as F\n", - "from torch.utils.data import DataLoader\n", - "from torchvision import datasets\n", - "from torchvision import transforms\n", - "from operations import OPS" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "train = datasets.MNIST('./data', train=True, transform=transforms.ToTensor(), download=True)\n", - "valid = datasets.MNIST('./data', train=False)\n", - "\n", - "train_loader = DataLoader(train, batch_size=10)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "x, y = next(iter(train_loader))" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "torch.Size([10, 1, 28, 28])" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "x.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "train_loader = torch.utils.data.DataLoader(\n", - " datasets.MNIST('../data', train=True, download=True,\n", - " transform=transforms.Compose([\n", - " transforms.ToTensor(),\n", - " transforms.Normalize((0.1307,), (0.3081,))\n", - " ])),\n", - " batch_size=10, shuffle=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "x, y = next(iter(train_loader))" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "torch.Size([10, 1, 28, 28])" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "x.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 57, - "metadata": {}, - "outputs": [], - "source": [ - "class StemNet(nn.Module):\n", - " \"\"\" Network stem\n", - "\n", - " This will always be the beginning of the network.\n", - " DARTS will only recompose modules after the stem.\n", - " For this reason, we define this separate from the\n", - " other modules in the network.\n", - "\n", - " Args:\n", - " input_dim: the input dimension for your data\n", - "\n", - " cell_dim: the intermediate dimension size for\n", - " the remaining modules of the network.\n", - " \"\"\"\n", - " def __init__(self, in_channels: int=1, cell_dim: int=100, kernel_size=3):\n", - " super(StemNet, self).__init__()\n", - " self.stem = nn.Conv2d(in_channels, cell_dim, kernel_size)\n", - "\n", - " def forward(self, x):\n", - " return self.stem(x)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 58, - "metadata": {}, - "outputs": [], - "source": [ - "class ConvBlock(nn.Module):\n", - " \"\"\" ReLu -> Conv1d -> BatchNorm \"\"\"\n", - "\n", - " def __init__(self, c_in, c_out, kernel_size, stride, affine=True):\n", - " 
super(ConvBlock, self).__init__()\n", - " self.conv = nn.Conv2d(c_in, c_out, kernel_size=kernel_size, stride=stride)\n", - "\n", - " def forward(self, x):\n", - " return self.conv(x)" - ] - }, - { - "cell_type": "code", - "execution_count": 100, - "metadata": {}, - "outputs": [], - "source": [ - "class DilConv(nn.Module):\n", - " \"\"\" ReLU Dilated Convolution \"\"\"\n", - "\n", - " def __init__(self, c_in, c_out, kernel_size, \n", - " stride, padding, dilation, affine=True):\n", - " super(DilConv, self).__init__()\n", - "\n", - " self.op = nn.Sequential(\n", - " nn.ReLU(inplace=False),\n", - "\n", - " nn.Conv2d(\n", - " c_in,\n", - " c_in,\n", - " kernel_size=kernel_size,\n", - " stride=stride,\n", - " padding=padding,\n", - " dilation=dilation,\n", - " groups=c_in,\n", - " bias=False\n", - " ),\n", - "\n", - " nn.Conv2d(\n", - " c_in,\n", - " c_out,\n", - " kernel_size=1,\n", - " padding=0,\n", - " bias=False\n", - " ),\n", - "\n", - " nn.BatchNorm2d(c_out, affine=affine),\n", - " )\n", - "\n", - " def forward(self, x):\n", - " return self.op(x)" - ] - }, - { - "cell_type": "code", - "execution_count": 101, - "metadata": {}, - "outputs": [], - "source": [ - "stem = StemNet()" - ] - }, - { - "cell_type": "code", - "execution_count": 102, - "metadata": {}, - "outputs": [], - "source": [ - "OPS = {\n", - " 'dil_conv': lambda c, stride, affine: DilConv(c, c, 3, stride, 2, 2, affine=affine),\n", - " 'conv' : lambda c, stride, affine: ConvBlock(c, c, 3, stride, affine=affine),\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": 103, - "metadata": {}, - "outputs": [], - "source": [ - "conv = OPS['conv'](100, 1, True)" - ] - }, - { - "cell_type": "code", - "execution_count": 104, - "metadata": {}, - "outputs": [], - "source": [ - "dill = OPS['dil_conv'](100, 1, True)" - ] - }, - { - "cell_type": "code", - "execution_count": 105, - "metadata": {}, - "outputs": [], - "source": [ - "out = stem(x)" - ] - }, - { - "cell_type": "code", - "execution_count": 106, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "torch.Size([10, 100, 26, 26])" - ] - }, - "execution_count": 106, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "out.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 107, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "torch.Size([10, 100, 24, 24])" - ] - }, - "execution_count": 107, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "conv(out).shape" - ] - }, - { - "cell_type": "code", - "execution_count": 108, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "torch.Size([10, 100, 26, 26])" - ] - }, - "execution_count": 108, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dill(out).shape" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "torch.Size([100, 1, 26, 26])" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "torch.Size([100, 1, 26, 26])" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [], - "source": [ - "x = torch.randn(torch.Size([100, 1, 26, 26]))\n", - "y = torch.randn(torch.Size([100, 1, 24, 24]))" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "torch.Size([100, 1, 26, 26])" - ] - }, - "execution_count": 39, - "metadata": {}, - "output_type": 
"execute_result" - } - ], - "source": [ - "x.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "torch.Size([100, 1, 24, 24])" - ] - }, - "execution_count": 40, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "y.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": {}, - "outputs": [], - "source": [ - "new = torch.zeros_like(x)" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "metadata": {}, - "outputs": [], - "source": [ - "new[:, :, :24, :24] = y" - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "tensor([[[[ 0.5583, -0.4697, -1.0664, ..., 0.7064, 0.0000, 0.0000],\n", - " [ 0.6258, 0.0675, 0.6688, ..., 0.6203, 0.0000, 0.0000],\n", - " [ 1.1937, 0.2701, 0.2217, ..., 0.0806, 0.0000, 0.0000],\n", - " ...,\n", - " [-0.0659, -0.1802, 0.3372, ..., -0.3461, 0.0000, 0.0000],\n", - " [ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000],\n", - " [ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000]]],\n", - "\n", - "\n", - " [[[-1.2648, -0.5133, -0.4088, ..., -0.4855, 0.0000, 0.0000],\n", - " [-1.3394, 0.4108, 0.6637, ..., 1.4993, 0.0000, 0.0000],\n", - " [ 0.7185, -0.2766, -0.4765, ..., -1.1961, 0.0000, 0.0000],\n", - " ...,\n", - " [-0.1028, -0.8230, -0.9398, ..., 0.7469, 0.0000, 0.0000],\n", - " [ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000],\n", - " [ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000]]],\n", - "\n", - "\n", - " [[[ 0.7054, 0.6086, -1.7438, ..., 0.1894, 0.0000, 0.0000],\n", - " [ 0.8151, 0.1001, 1.1741, ..., -1.1449, 0.0000, 0.0000],\n", - " [-0.5580, 1.6470, 0.7271, ..., -0.4478, 0.0000, 0.0000],\n", - " ...,\n", - " [-0.1456, 0.5271, -0.5290, ..., 1.8301, 0.0000, 0.0000],\n", - " [ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000],\n", - " [ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000]]],\n", - "\n", - "\n", - " ...,\n", - "\n", - "\n", - " [[[ 0.6753, -1.2837, -0.6700, ..., -1.3338, 0.0000, 0.0000],\n", - " [-0.6663, 0.0904, -0.5048, ..., 0.9331, 0.0000, 0.0000],\n", - " [-1.1993, -0.6174, -0.9150, ..., -0.2840, 0.0000, 0.0000],\n", - " ...,\n", - " [-0.4903, 0.0908, 0.9174, ..., 0.5471, 0.0000, 0.0000],\n", - " [ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000],\n", - " [ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000]]],\n", - "\n", - "\n", - " [[[-0.4726, -0.0753, -0.5620, ..., 1.0391, 0.0000, 0.0000],\n", - " [-0.9754, -0.9809, -0.2429, ..., -0.4416, 0.0000, 0.0000],\n", - " [-0.0702, 0.5949, 0.8080, ..., 1.8536, 0.0000, 0.0000],\n", - " ...,\n", - " [-1.6046, 0.5544, 0.0236, ..., 0.3422, 0.0000, 0.0000],\n", - " [ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000],\n", - " [ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000]]],\n", - "\n", - "\n", - " [[[-0.2572, 0.1227, -0.0109, ..., 0.3023, 0.0000, 0.0000],\n", - " [-0.9565, -0.1810, 0.5157, ..., 0.8777, 0.0000, 0.0000],\n", - " [ 0.0078, 2.0418, 2.2871, ..., -0.0903, 0.0000, 0.0000],\n", - " ...,\n", - " [ 0.1430, -0.5325, 0.4984, ..., 1.0231, 0.0000, 0.0000],\n", - " [ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000],\n", - " [ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000]]]])" - ] - }, - "execution_count": 51, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "new" - ] - }, - { - "cell_type": "code", - "execution_count": 48, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - 
"torch.Size([100, 1, 24, 24])" - ] - }, - "execution_count": 48, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "y.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 59, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 59, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "x.shape > y.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [], - "source": [ - "from torch.nn.utils.rnn import pad_sequence" - ] - }, - { - "cell_type": "code", - "execution_count": 55, - "metadata": {}, - "outputs": [], - "source": [ - "result = F.pad(input=y, pad=(1, 1, 1, 1), mode='constant', value=0)" - ] - }, - { - "cell_type": "code", - "execution_count": 56, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "torch.Size([100, 1, 26, 26])" - ] - }, - "execution_count": 56, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "result.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 60, - "metadata": {}, - "outputs": [], - "source": [ - "seq = [x, y, x, y]" - ] - }, - { - "cell_type": "code", - "execution_count": 64, - "metadata": {}, - "outputs": [], - "source": [ - "prev = seq[0]" - ] - }, - { - "cell_type": "code", - "execution_count": 67, - "metadata": {}, - "outputs": [], - "source": [ - "prev = seq[0]\n", - "padded = [prev]\n", - "for tensor in seq:\n", - " if tensor.shape < prev.shape:\n", - " tensor_pad = F.pad(\n", - " input=tensor, pad=(1, 1, 1, 1), mode='constant', value=0\n", - " )\n", - " padded.append(tensor_pad)\n", - " else:\n", - " padded.append(tensor)\n", - " prev = tensor" - ] - }, - { - "cell_type": "code", - "execution_count": 68, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "torch.Size([100, 1, 26, 26])" - ] - }, - "execution_count": 68, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sum(padded).shape" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.7" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} From 8fec536c77448d7df14158e758d0f610065f0dc0 Mon Sep 17 00:00:00 2001 From: yngtodd Date: Tue, 14 Apr 2020 01:10:59 -0400 Subject: [PATCH 259/331] Add genotype storage This will save the best genotype as the training goes on. 
--- examples/darts/uno/uno_example.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/examples/darts/uno/uno_example.py b/examples/darts/uno/uno_example.py index a7b402cd..1b01ca81 100644 --- a/examples/darts/uno/uno_example.py +++ b/examples/darts/uno/uno_example.py @@ -35,7 +35,7 @@ def run(params): train_data = darts.Uno('./data', 'train', download=True) valid_data = darts.Uno('./data', 'test') - train_data = darts.sample(train_data, len(valid_data)) + #train_data = darts.sample(train_data, len(valid_data)) trainloader = DataLoader(train_data, batch_size=args.batch_size) validloader = DataLoader(valid_data, batch_size=args.batch_size) @@ -68,6 +68,8 @@ def run(params): train_meter = darts.EpochMeter(tasks, 'train') valid_meter = darts.EpochMeter(tasks, 'valid') + genotype_store = darts.GenotypeStorage(root=args.savepath) + for epoch in range(args.epochs): scheduler.step() @@ -87,6 +89,8 @@ def run(params): args, tasks, train_meter, + genotype, + genotype_store, device ) @@ -102,10 +106,12 @@ def train(trainloader, args, tasks, meter, + genotype, + genotype_store, device): valid_iter = iter(trainloader) - + min_accuracy = 0.0 for step, (data, target) in enumerate(trainloader): batch_size = data.size(0) @@ -142,6 +148,11 @@ def train(trainloader, meter.update_batch_loss(loss.item(), batch_size) meter.update_batch_accuracy(prec1, batch_size) + accuracy_avg = meter.acc_meter.get_avg_accuracy('response') + if accuracy_avg > min_accuracy: + genotype_store.save_genotype(genotype) + min_accuracy = accuracy_avg + if step % args.log_interval == 0: logger.info(f'Step: {step} loss: {meter.loss_meter.avg:.4}') @@ -149,6 +160,7 @@ def train(trainloader, meter.save(args.savepath) + def validate(validloader, model, criterion, args, tasks, meter, device): model.eval() with torch.no_grad(): From b3eb2b4e23a850552ead499801ea115b1e2f0272 Mon Sep 17 00:00:00 2001 From: yngtodd Date: Wed, 15 Apr 2020 09:58:22 -0400 Subject: [PATCH 260/331] Call loss consistently When unrolling models, we were not unpacking the tuple returned by model.loss like we were in other methods. This keeps things consisent. 
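The inconsistency described above is easiest to see with a toy module (hypothetical, not the darts model; the real call site is self.model.loss(data, target) in architecture.py, shown in the diff below). A loss method that returns only a scalar and one that returns an (output, loss) tuple must be unpacked differently by callers, and mixing the two hands a tuple to code that expects a scalar tensor.

```python
import torch
import torch.nn as nn

class TinyModel(nn.Module):
    """Hypothetical module illustrating the two calling conventions."""
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(4, 1)
        self.criterion = nn.MSELoss()

    def loss_only(self, data, target):
        # Convention A: return just the scalar loss.
        return self.criterion(self.linear(data), target)

    def loss_with_output(self, data, target):
        # Convention B: return (output, loss); callers must unpack the tuple.
        out = self.linear(data)
        return out, self.criterion(out, target)

model = TinyModel()
data, target = torch.randn(8, 4), torch.randn(8, 1)

loss_a = model.loss_only(data, target)            # scalar tensor, no unpacking
_, loss_b = model.loss_with_output(data, target)  # tuple, must be unpacked

# Forgetting the unpacking under convention B leaves a tuple behind, and calls
# that expect a scalar tensor, such as the one below, fail.
grads = torch.autograd.grad(loss_a, model.parameters())
```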
--- common/darts/architecture.py | 4 ++-- examples/darts/uno/default_model.txt | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/common/darts/architecture.py b/common/darts/architecture.py index 235dc057..2ce0fa62 100644 --- a/common/darts/architecture.py +++ b/common/darts/architecture.py @@ -44,11 +44,10 @@ def comp_unrolled_model(self, data, target, eta, optimizer): model_unrolled """ # forward to get loss - loss = self.model.loss(data, target) + _, loss = self.model.loss(data, target, reduce='mean') # flatten current weights theta = F.flatten(self.model.parameters()).detach() # theta: torch.Size([1930618]) - # print('theta:', theta.shape) try: # fetch momentum data from theta optimizer moment = F.flatten(optimizer.state[v]['momentum_buffer'] for v in self.model.parameters()) @@ -57,6 +56,7 @@ def comp_unrolled_model(self, data, target, eta, optimizer): moment = torch.zeros_like(theta) # flatten all gradients + gradient= autograd.grad(loss, self.model.parameters(), allow_unused=True) dtheta = F.flatten(autograd.grad(loss, self.model.parameters())).data # indeed, here we implement a simple SGD with momentum and weight decay # theta = theta - eta * (moment + weight decay + dtheta) diff --git a/examples/darts/uno/default_model.txt b/examples/darts/uno/default_model.txt index b5a538b7..3446cc3e 100644 --- a/examples/darts/uno/default_model.txt +++ b/examples/darts/uno/default_model.txt @@ -1,6 +1,6 @@ [Global_Params] model_name = 'darts_uno' -unrolled = False +unrolled = True data_url = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/' savepath = '.' log_interval = 10 From c32ac4c6a0588ff29c062298a756ea0a7702b255 Mon Sep 17 00:00:00 2001 From: yngtodd Date: Wed, 15 Apr 2020 10:05:21 -0400 Subject: [PATCH 261/331] Remove extra gradient checking This was redundant. --- common/darts/architecture.py | 1 - examples/darts/advanced/default_model.txt | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/common/darts/architecture.py b/common/darts/architecture.py index 2ce0fa62..fe4980c4 100644 --- a/common/darts/architecture.py +++ b/common/darts/architecture.py @@ -56,7 +56,6 @@ def comp_unrolled_model(self, data, target, eta, optimizer): moment = torch.zeros_like(theta) # flatten all gradients - gradient= autograd.grad(loss, self.model.parameters(), allow_unused=True) dtheta = F.flatten(autograd.grad(loss, self.model.parameters())).data # indeed, here we implement a simple SGD with momentum and weight decay # theta = theta - eta * (moment + weight decay + dtheta) diff --git a/examples/darts/advanced/default_model.txt b/examples/darts/advanced/default_model.txt index 79d9c880..8c4a934c 100644 --- a/examples/darts/advanced/default_model.txt +++ b/examples/darts/advanced/default_model.txt @@ -1,6 +1,6 @@ [Global_Params] model_name = 'darts_uno' -unrolled = False +unrolled = True data_url = 'ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/' savepath = './results' log_interval = 10 From 533e2816f90353b2a64040233f438092f05f170d Mon Sep 17 00:00:00 2001 From: yngtodd Date: Wed, 15 Apr 2020 10:05:49 -0400 Subject: [PATCH 262/331] Make default parameters consistent This will make things easier to reason about. 
--- examples/darts/advanced/default_model.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/darts/advanced/default_model.txt b/examples/darts/advanced/default_model.txt index 8c4a934c..c0130a69 100644 --- a/examples/darts/advanced/default_model.txt +++ b/examples/darts/advanced/default_model.txt @@ -1,6 +1,6 @@ [Global_Params] model_name = 'darts_uno' -unrolled = True +unrolled = False data_url = 'ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/' savepath = './results' log_interval = 10 From 49cc6e5db9df101470176d6244845b7b6f905a3c Mon Sep 17 00:00:00 2001 From: yngtodd Date: Wed, 15 Apr 2020 10:16:39 -0400 Subject: [PATCH 263/331] Remove unpacking This is required when calling grad. --- common/darts/architecture.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/common/darts/architecture.py b/common/darts/architecture.py index fe4980c4..4bc829de 100644 --- a/common/darts/architecture.py +++ b/common/darts/architecture.py @@ -33,9 +33,7 @@ def comp_unrolled_model(self, data, target, eta, optimizer): data : torch.tensor target : torch.tensor - eta : float - optimizer : torch.optim.optimizer optimizer of theta, not optimizer of alpha @@ -44,10 +42,11 @@ def comp_unrolled_model(self, data, target, eta, optimizer): model_unrolled """ # forward to get loss - _, loss = self.model.loss(data, target, reduce='mean') + loss = self.model.loss(data, target) # flatten current weights theta = F.flatten(self.model.parameters()).detach() # theta: torch.Size([1930618]) + # print('theta:', theta.shape) try: # fetch momentum data from theta optimizer moment = F.flatten(optimizer.state[v]['momentum_buffer'] for v in self.model.parameters()) From 694807d14a5aff5794138c8727133ba9824fbd10 Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Mon, 20 Apr 2020 13:52:43 -0500 Subject: [PATCH 264/331] Update adrp.py Removing subtraction, as not needed. 
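A small worked check of the column arithmetic behind this change. The toy frame and the 1:PL feature slice are assumptions here, mirroring the reg_go2.py script added later in this series; they are not the ADRP data.

```python
import numpy as np
import pandas as pd

# Column 0 is the regression target, the rest are features.
df = pd.DataFrame(np.random.rand(5, 4), columns=['target', 'f1', 'f2', 'f3'])

PL = df.shape[1]   # 4: total number of columns (target + features)
PS = PL - 1        # 3: number of feature columns

assert df.iloc[:, 1:PL].shape[1] == PS
# With the removed `PL -= 1` still in place, PL would be 3 and a 1:PL slice
# would silently drop the last feature column.
```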
--- examples/ADRP/adrp.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/ADRP/adrp.py b/examples/ADRP/adrp.py index 73054c78..1c353848 100644 --- a/examples/ADRP/adrp.py +++ b/examples/ADRP/adrp.py @@ -155,7 +155,6 @@ def load_data(params, seed): df = (pd.read_csv(train_file, skiprows=1).values).astype("float32") PL = df.shape[1] - PL -= 1 print("PL=", PL) PS = PL - 1 From 0357787f74f7bd287592c26110571b9c82036fd9 Mon Sep 17 00:00:00 2001 From: Jamal Date: Tue, 21 Apr 2020 09:49:09 -0600 Subject: [PATCH 265/331] Add CLR keywords and conflict checking utility --- common/candle/__init__.py | 9 +- common/default_utils.py | 183 +++++++++++++++++++++++++++----------- 2 files changed, 136 insertions(+), 56 deletions(-) diff --git a/common/candle/__init__.py b/common/candle/__init__.py index 6ca22a6e..fc90942a 100644 --- a/common/candle/__init__.py +++ b/common/candle/__init__.py @@ -23,6 +23,7 @@ from default_utils import verify_path from default_utils import keras_default_config from default_utils import set_up_logger +from default_utils import check_flag_conflicts from generic_utils import Progbar @@ -87,13 +88,13 @@ from keras_utils import r2 from keras_utils import mae from keras_utils import mse - + from viz_utils import plot_metrics from solr_keras import CandleRemoteMonitor from solr_keras import compute_trainable_params from solr_keras import TerminateOnTimeOut - + from uq_keras_utils import abstention_variable_initialization from uq_keras_utils import abstention_loss from uq_keras_utils import abs_acc @@ -103,6 +104,10 @@ from uq_keras_utils import add_model_output from uq_keras_utils import AbstentionAdapt_Callback + from clr_keras_utils import CyclicLR + from clr_keras_utils import clr_set_args + from clr_keras_utils import clr_callback + elif 'torch' in sys.modules: print ('Importing candle utils for pytorch') from pytorch_utils import set_seed diff --git a/common/default_utils.py b/common/default_utils.py index 7fd12d99..1a741ace 100644 --- a/common/default_utils.py +++ b/common/default_utils.py @@ -31,7 +31,56 @@ DEFAULT_DATATYPE = np.float32 -PARAMETERS_CANDLE = ['config_file', 'verbose', 'logfile', 'save_path', 'model_name', 'data_type', 'dense', 'rng_seed', 'epochs', 'batch_size', 'train_bool', 'eval_bool', 'timeout', 'home_dir', 'train_data', 'test_data', 'output_dir', 'data_url', 'experiment_id', 'run_id', 'conv', 'locally_connected', 'activation', 'out_activation', 'lstm_size', 'recurrent_dropout', 'dropout', 'pool', 'batch_normalization', 'loss', 'optimizer', 'metrics', 'scaling', 'shuffle', 'feature_subsample', 'learning_rate', 'early_stop', 'momentum', 'initialization', 'val_split', 'train_steps', 'val_steps', 'test_steps', 'train_samples', 'val_samples', 'gpus', 'profiling'] +PARAMETERS_CANDLE = [ + 'config_file', + # neon parser + 'verbose', 'logfile', 'save_path', 'model_name', 'data_type', 'dense', 'rng_seed', 'epochs', 'batch_size', + # general behavior + 'train_bool', 'eval_bool', 'timeout', + # logging + 'home_dir', 'train_data', 'test_data', 'output_dir', 'data_url', 'experiment_id', 'run_id', + # model architecture + 'conv', 'locally_connected', 'activation', 'out_activation', 'lstm_size', 'recurrent_dropout', + # processing between layers + 'dropout', 'pool', 'batch_normalization', + # model evaluation + 'loss', 'optimizer', 'metrics', + # data preprocessing + 'scaling', 'shuffle', 'feature_subsample', + # training + 'learning_rate', 'early_stop', 'momentum', 'initialization', + 'val_split', 'train_steps', 'val_steps', 'test_steps', 'train_samples', 
'val_samples', + # backend + 'gpus', + # profiling + 'profiling', + # cyclic learning rate + 'clr_flag', 'clr_mode', 'clr_base_lr', 'clr_max_lr', 'clr_gamma' + ] + +CONFLICT_LIST = [ + ['clr_flag','warmup_lr'], + ['clr_flag','reduce_lr'] +] + +def check_flag_conflicts(params): + key_set = set(params.keys()) + # check for conflicts + #conflict_flag = False + # loop over each set of mutually exclusive flags + # if any set conflicts exit program + for flag_list in CONFLICT_LIST: + flag_count = 0 + for i in flag_list: + if i in key_set: + if params[i] is True: + flag_count +=1 + if flag_count > 1 : + raise Exception('ERROR ! Conflict in flag specification. ' \ + 'These flags should not be used together: ' + str(sorted(flag_list)) + \ + '... Exiting') + #print("Warning: conflicting flags in ", flag_list) + #exit() #### IO UTILS @@ -116,7 +165,7 @@ def set_up_logger(logfile, logger, verbose): def eval_string_as_list(str_read, separator, dtype): """ Parse a string and convert it into a list of lists. - + Parameters ---------- str_read : string @@ -125,7 +174,7 @@ def eval_string_as_list(str_read, separator, dtype): Character that specifies the separation between the lists dtype : data type Data type to decode the elements of the list - + Return ---------- decoded_list : list @@ -152,7 +201,7 @@ def eval_string_as_list(str_read, separator, dtype): def eval_string_as_list_of_lists(str_read, separator_out, separator_in, dtype): """ Parse a string and convert it into a list of lists. - + Parameters ---------- str_read : string @@ -163,7 +212,7 @@ def eval_string_as_list_of_lists(str_read, separator_out, separator_in, dtype): Character that specifies the separation between the inner level lists dtype : data type Data type to decode the elements of the lists - + Return ---------- decoded_list : list @@ -241,7 +290,7 @@ def __init__(self, option_strings, dest, type, **kwargs): """Initialize a ListOfListsAction object. If no type is specified, an integer is assumed by default as the type for the elements of the list-of-lists. - + Parameters ---------- option_strings : string @@ -260,13 +309,13 @@ def __init__(self, option_strings, dest, type, **kwargs): self.dtype = type if self.dtype is None: self.dtype = np.int32 - + def __call__(self, parser, namespace, values, option_string=None): """This function overrides the __call__ method of the base argparse.Action class. - + This function implements the action of the ListOfListAction class by parsing an input string (command-line option or argument) and maping it into a list-of-lists. The resulting list-of-lists is @@ -331,7 +380,7 @@ def check_file_parameters_exists(params_parser, params_benchmark, params_file): Includes additional parameters defined in the benchmark. params_file : python dictionary Includes parameters read from the configuration file. - + Global: PARAMETERS_CANDLE : python list Includes all the core keywords that are specified in CANDLE. @@ -407,6 +456,8 @@ def finalize_parameters(bmk): bmk.check_required_exists(gParameters) print ('Params:') pprint(gParameters) + # Check that no keywords conflict + check_flag_conflicts(gParameters) return gParameters @@ -414,7 +465,7 @@ def finalize_parameters(bmk): def get_default_neon_parser(parser): """Parse command-line arguments that are default in neon parser (and are common to all frameworks). Ignore if not present. 
- + Parameters ---------- parser : ArgumentParser object @@ -426,7 +477,7 @@ def get_default_neon_parser(parser): parser.add_argument("-l", "--log", dest='logfile', default=None, help="log file") - + # Logging utilities parser.add_argument("-s", "--save_path", dest='save_path', default=argparse.SUPPRESS, type=str, @@ -468,17 +519,17 @@ def get_default_neon_parser(parser): def get_common_parser(parser): """Parse command-line arguments. Ignore if not present. - + Parameters ---------- parser : ArgumentParser object Parser for command-line options """ - + # Configuration file parser.add_argument("--config_file", dest='config_file', default=argparse.SUPPRESS, help="specify model configuration file") - + # General behavior parser.add_argument("--train_bool", dest='train_bool', type=str2bool, default=True, @@ -496,7 +547,7 @@ def get_common_parser(parser): parser.add_argument("--home_dir", dest='home_dir', default=argparse.SUPPRESS, type=str, help="set home directory") - + parser.add_argument("--train_data", action="store", default=argparse.SUPPRESS, help="training data filename") @@ -508,7 +559,7 @@ def get_common_parser(parser): parser.add_argument("--output_dir", dest='output_dir', default=argparse.SUPPRESS, type=str, help="output directory") - + parser.add_argument("--data_url", dest='data_url', default=argparse.SUPPRESS, type=str, help="set data source url") @@ -516,7 +567,7 @@ def get_common_parser(parser): parser.add_argument("--experiment_id", default="EXP000", type=str, help="set the experiment unique identifier") parser.add_argument("--run_id", default="RUN000", type=str, help="set the run unique identifier") - + # Model definition @@ -533,16 +584,16 @@ def get_common_parser(parser): parser.add_argument("--out_activation", default=argparse.SUPPRESS, help="keras activation function to use in out layer: softmax, linear, ...") - - + + parser.add_argument("--lstm_size", nargs='+', type=int, default= argparse.SUPPRESS, help="integer array describing size of LSTM internal state per layer") parser.add_argument("--recurrent_dropout", action="store", default=argparse.SUPPRESS, type=float, help="ratio of recurrent dropout") - - + + # Processing between layers parser.add_argument("--dropout", type=float, default=argparse.SUPPRESS, @@ -553,7 +604,7 @@ def get_common_parser(parser): parser.add_argument("--batch_normalization", type=str2bool, default=argparse.SUPPRESS, help="use batch normalization") - + # Model Evaluation parser.add_argument("--loss", default=argparse.SUPPRESS, @@ -565,13 +616,13 @@ def get_common_parser(parser): parser.add_argument("--metrics", default=argparse.SUPPRESS, help="metrics to evaluate performance: accuracy, ...") - + # Data preprocessing parser.add_argument("--scaling", default=argparse.SUPPRESS, choices=['minabs', 'minmax', 'std', 'none'], help="type of feature scaling; 'minabs': to [-1,1]; 'minmax': to [0,1], 'std': standard unit normalization; 'none': no normalization") - + parser.add_argument("--shuffle", type=str2bool, default=False, help="randomly shuffle data set (produces different training and testing partitions each run depending on the seed)") @@ -613,8 +664,8 @@ def get_common_parser(parser): parser.add_argument("--val_samples", action="store", default=argparse.SUPPRESS, type=int, help="overrides the number of validation samples if set to nonzero") - - + + # Backend configuration parser.add_argument("--gpus", action="store", nargs='*', default=[], type=int, @@ -625,6 +676,30 @@ def get_common_parser(parser): default = 'false', help="Turn profiling on or 
off") + # cyclic learning rate + parser.add_argument("--clr_flag", + default=argparse.SUPPRESS, + #default=None, + type=str2bool, + help="CLR flag (boolean)") + parser.add_argument("--clr_mode", + default=argparse.SUPPRESS, + #default=None, + type=str, choices=['trng1', 'trng2', 'exp'], + help="CLR mode (default: trng1)") + parser.add_argument("--clr_base_lr", type=float, + default=argparse.SUPPRESS, + #default=1e-4, + help="Base lr for cycle lr.") + parser.add_argument("--clr_max_lr", type=float, + default=argparse.SUPPRESS, + #default=1e-3, + help="Max lr for cycle lr.") + parser.add_argument("--clr_gamma", type=float, + default=argparse.SUPPRESS, + #default=0.999994, + help="Gamma parameter for learning cycle LR.") + return parser @@ -632,7 +707,7 @@ def get_common_parser(parser): def args_overwrite_config(args, config): """Overwrite configuration parameters with parameters specified via command-line. - + Parameters ---------- args : ArgumentParser object @@ -640,15 +715,15 @@ def args_overwrite_config(args, config): config : python dictionary Parameters read from configuration file """ - + params = config - + args_dict = vars(args) - + for key in args_dict.keys(): params[key] = args_dict[key] - - + + if 'data_type' not in params: params['data_type'] = DEFAULT_DATATYPE else: @@ -675,16 +750,16 @@ def get_choice(name): """ Maps name string to the right type of argument """ mapping = {} - + # dtype mapping['f16'] = np.float16 mapping['f32'] = np.float32 mapping['f64'] = np.float64 - + mapped = mapping.get(name) if not mapped: raise Exception('No mapping found for "{}"'.format(name)) - + return mapped @@ -699,7 +774,7 @@ def directory_from_parameters(params, commonroot='Output'): String to specify the common folder to store results. """ - + if commonroot in set(['.', './']): # Same directory --> convert to absolute path outdir = os.path.abspath('.') else: # Create path specified @@ -750,7 +825,7 @@ def __init__(self, filepath, defmodel, framework, prog=None, desc=None, parser=N parser : argparser (default None) if 'neon' framework a NeonArgparser is passed. Otherwise an argparser is constructed. """ - + if parser is None: parser = argparse.ArgumentParser(prog=prog, formatter_class=argparse.ArgumentDefaultsHelpFormatter, description=desc, conflict_handler='resolve') @@ -758,11 +833,11 @@ def __init__(self, filepath, defmodel, framework, prog=None, desc=None, parser=N self.file_path = filepath self.default_model = defmodel self.framework = framework - + self.required = set([]) self.additional_definitions = [] self.set_locals() - + def parse_from_common(self): @@ -772,15 +847,15 @@ def parse_from_common(self): 'get_common_parser' which are defined previously(above). If the order changes or they are moved, the calling has to be updated. """ - - + + # Parse has been split between arguments that are common with the default neon parser # and all the other options parser = self.parser if self.framework is not 'neon': parser = get_default_neon_parser(parser) parser = get_common_parser(parser) - + self.parser = parser # Set default configuration file @@ -791,7 +866,7 @@ def parse_from_benchmark(self): """Functionality to parse options specific specific for each benchmark. 
""" - + for d in self.additional_definitions: if 'type' not in d: d['type'] = None @@ -817,13 +892,13 @@ def parse_from_benchmark(self): self.parser.add_argument('--' + d['name'], choices=d['choices'], default=d['default'], help=d['help']) else: # Non an action, one parameter, no choices self.parser.add_argument('--' + d['name'], type=d['type'], default=d['default'], help=d['help']) - + def format_benchmark_config_arguments(self, dictfileparam): """ Functionality to format the particular parameters of the benchmark. - + Parameters ---------- dictfileparam : python dictionary @@ -833,7 +908,7 @@ def format_benchmark_config_arguments(self, dictfileparam): Most of the time command-line overwrites configuration file except when the command-line is using default values and config file defines those values - + """ configOut = dictfileparam.copy() @@ -844,7 +919,7 @@ def format_benchmark_config_arguments(self, dictfileparam): dtype = d['type'] else: dtype = None - + if 'action' in d: if inspect.isclass(d['action']): str_read = dictfileparam[d['name']] @@ -866,7 +941,7 @@ def read_config_file(self, file): config.read(file) section=config.sections() fileParams={} - + # parse specified arguments (minimal validation: if arguments # are written several times in the file, just the first time # will be used) @@ -874,7 +949,7 @@ def read_config_file(self, file): for k,v in config.items(sec): if not k in fileParams: fileParams[k] = eval(v) - + fileParams = self.format_benchmark_config_arguments(fileParams) #pprint(fileParams) @@ -888,11 +963,11 @@ def set_locals(self): - additional_definitions: list of dictionaries describing \ the additional parameters for the benchmark. """ - + pass - + def check_required_exists(self, gparam): """Functionality to verify that the required model parameters have been specified. @@ -901,7 +976,7 @@ def check_required_exists(self, gparam): key_set = set(gparam.keys()) intersect_set = key_set.intersection(self.required) diff_set = self.required.difference(intersect_set) - + if ( len(diff_set) > 0 ): raise Exception('ERROR ! Required parameters are not specified. ' \ 'These required parameters have not been initialized: ' + str(sorted(diff_set)) + \ @@ -913,9 +988,9 @@ def keras_default_config(): """Defines parameters that intervine in different functions using the keras defaults. This helps to keep consistency in parameters between frameworks. """ - + kerasDefaults = {} - + # Optimizers #kerasDefaults['clipnorm']=? # Maximum norm to clip all parameter gradients #kerasDefaults['clipvalue']=? # Maximum (minimum=-max) value to clip all parameter gradients From fe7c939a7fae096a67ec5d4d90e668f5e5daa76a Mon Sep 17 00:00:00 2001 From: Jamal Date: Tue, 21 Apr 2020 09:50:43 -0600 Subject: [PATCH 266/331] Added CLR functions to library. 
--- common/clr_keras_utils.py | 200 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 200 insertions(+) create mode 100644 common/clr_keras_utils.py diff --git a/common/clr_keras_utils.py b/common/clr_keras_utils.py new file mode 100644 index 00000000..9d542505 --- /dev/null +++ b/common/clr_keras_utils.py @@ -0,0 +1,200 @@ +from keras.callbacks import Callback +from keras import backend as K +import numpy as np + +def clr_check_args(args): + req_keys = ['clr_mode','clr_base_lr','clr_max_lr','clr_gamma'] + keys_present = True + for key in req_keys: + if key not in args.keys(): + keys_present = False + return keys_present + +def clr_set_args(args): + req_keys = ['clr_mode','clr_base_lr','clr_max_lr','clr_gamma'] + exclusive_keys = ['warmup_lr', 'reduce_lr'] + keys_present = True + for key in req_keys: + if key not in args.keys(): + keys_present = False + if keys_present and args['clr_mode'] is not None: + clr_keras_kwargs = {'mode': args['clr_mode'], 'base_lr': args['clr_base_lr'], + 'max_lr': args['clr_max_lr'], 'gamma': args['clr_gamma']} + for ex_key in exclusive_keys: + if ex_key in args.keys(): + if args[ex_key] == True: + print("Key ", ex_key, " conflicts, setting to False") + args[ex_key] = False + else: + print("Incomplete CLR specification: will run without") + clr_keras_kwargs = {'mode': None, 'base_lr': 0.1, + 'max_lr': 0.1, 'gamma': 0.1} + return clr_keras_kwargs + +def clr_callback(mode=None, base_lr=1e-4, max_lr=1e-3, gamma=0.999994): + """ Creates keras callback for cyclical learning rate. """ + # keras_contrib = './keras_contrib/callbacks' + # sys.path.append(keras_contrib) + + if mode == 'trng1': + clr = CyclicLR(base_lr=base_lr, max_lr=max_lr, mode='triangular') + elif mode == 'trng2': + clr = CyclicLR(base_lr=base_lr, max_lr=max_lr, mode='triangular2') + elif mode == 'exp': + clr = CyclicLR(base_lr=base_lr, max_lr=max_lr, mode='exp_range', gamma=gamma) # 0.99994; 0.99999994; 0.999994 + return clr + +class CyclicLR(Callback): + """This callback implements a cyclical learning rate policy (CLR). + The method cycles the learning rate between two boundaries with + some constant frequency. + # Arguments + base_lr: initial learning rate which is the + lower boundary in the cycle. + max_lr: upper boundary in the cycle. Functionally, + it defines the cycle amplitude (max_lr - base_lr). + The lr at any cycle is the sum of base_lr + and some scaling of the amplitude; therefore + max_lr may not actually be reached depending on + scaling function. + step_size: number of training iterations per + half cycle. Authors suggest setting step_size + 2-8 x training iterations in epoch. + mode: one of {triangular, triangular2, exp_range}. + Default 'triangular'. + Values correspond to policies detailed above. + If scale_fn is not None, this argument is ignored. + gamma: constant in 'exp_range' scaling function: + gamma**(cycle iterations) + scale_fn: Custom scaling policy defined by a single + argument lambda function, where + 0 <= scale_fn(x) <= 1 for all x >= 0. + mode paramater is ignored + scale_mode: {'cycle', 'iterations'}. + Defines whether scale_fn is evaluated on + cycle number or cycle iterations (training + iterations since start of cycle). Default is 'cycle'. + + The amplitude of the cycle can be scaled on a per-iteration or + per-cycle basis. + This class has three built-in policies, as put forth in the paper. + "triangular": + A basic triangular cycle w/ no amplitude scaling. + "triangular2": + A basic triangular cycle that scales initial amplitude by half each cycle. 
+ "exp_range": + A cycle that scales initial amplitude by gamma**(cycle iterations) at each + cycle iteration. + For more detail, please see paper. + + # Example for CIFAR-10 w/ batch size 100: + ```python + clr = CyclicLR(base_lr=0.001, max_lr=0.006, + step_size=2000., mode='triangular') + model.fit(X_train, Y_train, callbacks=[clr]) + ``` + + Class also supports custom scaling functions: + ```python + clr_fn = lambda x: 0.5*(1+np.sin(x*np.pi/2.)) + clr = CyclicLR(base_lr=0.001, max_lr=0.006, + step_size=2000., scale_fn=clr_fn, + scale_mode='cycle') + model.fit(X_train, Y_train, callbacks=[clr]) + ``` + + # References + + - [Cyclical Learning Rates for Training Neural Networks]( + https://arxiv.org/abs/1506.01186) + """ + + def __init__( + self, + base_lr=0.001, + max_lr=0.006, + step_size=2000., + mode='triangular', + gamma=1., + scale_fn=None, + scale_mode='cycle'): + super(CyclicLR, self).__init__() + + if mode not in ['triangular', 'triangular2', + 'exp_range']: + raise KeyError("mode must be one of 'triangular', " + "'triangular2', or 'exp_range'") + self.base_lr = base_lr + self.max_lr = max_lr + self.step_size = step_size + self.mode = mode + self.gamma = gamma + if scale_fn is None: + if self.mode == 'triangular': + self.scale_fn = lambda x: 1. + self.scale_mode = 'cycle' + elif self.mode == 'triangular2': + self.scale_fn = lambda x: 1 / (2.**(x - 1)) + self.scale_mode = 'cycle' + elif self.mode == 'exp_range': + self.scale_fn = lambda x: gamma ** x + self.scale_mode = 'iterations' + else: + self.scale_fn = scale_fn + self.scale_mode = scale_mode + self.clr_iterations = 0. + self.trn_iterations = 0. + self.history = {} + + self._reset() + + def _reset(self, new_base_lr=None, new_max_lr=None, + new_step_size=None): + """Resets cycle iterations. + Optional boundary/step size adjustment. + """ + if new_base_lr is not None: + self.base_lr = new_base_lr + if new_max_lr is not None: + self.max_lr = new_max_lr + if new_step_size is not None: + self.step_size = new_step_size + self.clr_iterations = 0. + + def clr(self): + cycle = np.floor(1 + self.clr_iterations / (2 * self.step_size)) + x = np.abs(self.clr_iterations / self.step_size - 2 * cycle + 1) + if self.scale_mode == 'cycle': + return self.base_lr + (self.max_lr - self.base_lr) * \ + np.maximum(0, (1 - x)) * self.scale_fn(cycle) + else: + return self.base_lr + (self.max_lr - self.base_lr) * \ + np.maximum(0, (1 - x)) * self.scale_fn(self.clr_iterations) + + def on_train_begin(self, logs={}): + logs = logs or {} + + if self.clr_iterations == 0: + K.set_value(self.model.optimizer.lr, self.base_lr) + else: + K.set_value(self.model.optimizer.lr, self.clr()) + + def on_batch_end(self, epoch, logs=None): + + logs = logs or {} + self.trn_iterations += 1 + self.clr_iterations += 1 + K.set_value(self.model.optimizer.lr, self.clr()) + + self.history.setdefault( + 'lr', []).append( + K.get_value( + self.model.optimizer.lr)) + self.history.setdefault('iterations', []).append(self.trn_iterations) + + for k, v in logs.items(): + self.history.setdefault(k, []).append(v) + + def on_epoch_end(self, epoch, logs=None): + logs = logs or {} + logs['lr'] = K.get_value(self.model.optimizer.lr) From 8af79eebf89f7990b6594d93de339f713548512f Mon Sep 17 00:00:00 2001 From: Jamal Date: Wed, 22 Apr 2020 14:28:20 -0600 Subject: [PATCH 267/331] Added CLR version of Uno with associated AUC model file. 
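Before the full script, a standalone sketch of the cyclical learning rate driving a toy Keras model. The CyclicLR callback and its triangular mode come from the previous patch, the base_lr/max_lr values mirror the [CLR_Params] block in the model file below, and the tiny random-data model is purely illustrative.

```python
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from clr_keras_utils import CyclicLR   # added in the previous patch

x = np.random.rand(1024, 8)
y = np.random.rand(1024, 1)

model = Sequential([Dense(16, activation='relu', input_shape=(8,)), Dense(1)])
model.compile(loss='mse', optimizer='sgd')

# clr_mode = 'trng1' in the model file corresponds to mode='triangular' here.
clr = CyclicLR(base_lr=0.001, max_lr=0.01, step_size=50., mode='triangular')
model.fit(x, y, batch_size=32, epochs=3, callbacks=[clr])

# clr.history['lr'] now holds the per-batch learning rates that cycled between
# base_lr and max_lr.
```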
--- Pilot1/Uno/uno_auc_clr_model.txt | 51 +++ Pilot1/Uno/uno_clr_keras2.py | 566 +++++++++++++++++++++++++++++++ 2 files changed, 617 insertions(+) create mode 100644 Pilot1/Uno/uno_auc_clr_model.txt create mode 100644 Pilot1/Uno/uno_clr_keras2.py diff --git a/Pilot1/Uno/uno_auc_clr_model.txt b/Pilot1/Uno/uno_auc_clr_model.txt new file mode 100644 index 00000000..3cc0ea68 --- /dev/null +++ b/Pilot1/Uno/uno_auc_clr_model.txt @@ -0,0 +1,51 @@ +[Global_Params] +train_sources=['CCLE'] +test_sources=['train'] +cell_types=None +cell_features=['rnaseq'] +drug_features=['descriptors'] +dense=[1000, 1000, 1000, 1000, 1000] +dense_feature_layers=[1000, 1000, 1000] +activation='relu' +loss='mse' +optimizer='adamax' +scaling='std' +dropout=.1 +epochs=50 +batch_size=32 +val_split=0.2 +cv=1 +max_val_loss=1.0 +learning_rate=0.0001 +base_lr=None +agg_dose='AUC' +residual=False +reduce_lr=False +warmup_lr=False +batch_normalization=False +feature_subsample=0 +rng_seed=2018 +no_gen=False +verbose=False + + +preprocess_rnaseq='source_scale' +gpus=1 +use_landmark_genes=True +no_feature_source=True +no_response_source=True +cp=True +save_path='save/uno' + +single=True +timeout=-1 + +[Monitor_Params] +solr_root='' + +[CLR_Params] +clr_flag = True +clr_mode = 'trng1' +clr_base_lr = 0.001 +clr_max_lr = 0.01 +clr_gamma = 0.999 diff --git a/Pilot1/Uno/uno_clr_keras2.py b/Pilot1/Uno/uno_clr_keras2.py new file mode 100644 index 00000000..7041cdbe --- /dev/null +++ b/Pilot1/Uno/uno_clr_keras2.py @@ -0,0 +1,566 @@ +#! /usr/bin/env python + +from __future__ import division, print_function + +import logging +import os +import random + +import numpy as np +import pandas as pd + +import keras +from keras import backend as K +from keras import optimizers +from keras.models import Model +from keras.layers import Input, Dense, Dropout +from keras.callbacks import Callback, ModelCheckpoint, ReduceLROnPlateau, LearningRateScheduler, TensorBoard +from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error +from scipy.stats.stats import pearsonr + +import uno as benchmark +import candle + +import uno_data +from uno_data import CombinedDataLoader, CombinedDataGenerator, DataFeeder + + +logger = logging.getLogger(__name__) +os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' + + +def set_seed(seed): + os.environ['PYTHONHASHSEED'] = '0' + np.random.seed(seed) + + random.seed(seed) + + if K.backend() == 'tensorflow': + import tensorflow as tf + tf.set_random_seed(seed) + candle.set_parallelism_threads() + + +def verify_path(path): + folder = os.path.dirname(path) + if folder and not os.path.exists(folder): + os.makedirs(folder) + + +def set_up_logger(logfile, verbose): + verify_path(logfile) + fh = logging.FileHandler(logfile) + fh.setFormatter(logging.Formatter("[%(asctime)s %(process)d] %(message)s", datefmt="%Y-%m-%d %H:%M:%S")) + fh.setLevel(logging.DEBUG) + + sh = logging.StreamHandler() + sh.setFormatter(logging.Formatter('')) + sh.setLevel(logging.DEBUG if verbose else logging.INFO) + + for log in [logger, uno_data.logger]: + log.setLevel(logging.DEBUG) + log.addHandler(fh) + log.addHandler(sh) + + +def extension_from_parameters(args): + """Construct string for saving model with annotation of parameters""" + ext = '' + ext += '.A={}'.format(args.activation) + ext += '.B={}'.format(args.batch_size) + ext += '.E={}'.format(args.epochs) + ext += '.O={}'.format(args.optimizer) + # ext += '.LEN={}'.format(args.maxlen) + ext += '.LR={}'.format(args.learning_rate) + ext += '.CF={}'.format(''.join([x[0] for x in 
sorted(args.cell_features)])) + ext += '.DF={}'.format(''.join([x[0] for x in sorted(args.drug_features)])) + if args.feature_subsample > 0: + ext += '.FS={}'.format(args.feature_subsample) + if args.dropout > 0: + ext += '.DR={}'.format(args.dropout) + if args.warmup_lr: + ext += '.wu_lr' + if args.reduce_lr: + ext += '.re_lr' + if args.residual: + ext += '.res' + if args.use_landmark_genes: + ext += '.L1000' + if args.no_gen: + ext += '.ng' + for i, n in enumerate(args.dense): + if n > 0: + ext += '.D{}={}'.format(i + 1, n) + if args.dense_feature_layers != args.dense: + for i, n in enumerate(args.dense): + if n > 0: + ext += '.FD{}={}'.format(i + 1, n) + + return ext + + +def discretize(y, bins=5): + percentiles = [100 / bins * (i + 1) for i in range(bins - 1)] + thresholds = [np.percentile(y, x) for x in percentiles] + classes = np.digitize(y, thresholds) + return classes + + +def r2(y_true, y_pred): + SS_res = K.sum(K.square(y_true - y_pred)) + SS_tot = K.sum(K.square(y_true - K.mean(y_true))) + return (1 - SS_res / (SS_tot + K.epsilon())) + + +def mae(y_true, y_pred): + return keras.metrics.mean_absolute_error(y_true, y_pred) + + +def evaluate_prediction(y_true, y_pred): + mse = mean_squared_error(y_true, y_pred) + mae = mean_absolute_error(y_true, y_pred) + r2 = r2_score(y_true, y_pred) + corr, _ = pearsonr(y_true, y_pred) + return {'mse': mse, 'mae': mae, 'r2': r2, 'corr': corr} + + +def log_evaluation(metric_outputs, description='Comparing y_true and y_pred:'): + logger.info(description) + for metric, value in metric_outputs.items(): + logger.info(' {}: {:.4f}'.format(metric, value)) + + +class LoggingCallback(Callback): + def __init__(self, print_fcn=print): + Callback.__init__(self) + self.print_fcn = print_fcn + + def on_epoch_end(self, epoch, logs={}): + msg = "[Epoch: %i] %s" % (epoch, ", ".join("%s: %f" % (k, v) for k, v in sorted(logs.items()))) + self.print_fcn(msg) + + +class PermanentDropout(Dropout): + def __init__(self, rate, **kwargs): + super(PermanentDropout, self).__init__(rate, **kwargs) + self.uses_learning_phase = False + + def call(self, x, mask=None): + if 0. 
< self.rate < 1.: + noise_shape = self._get_noise_shape(x) + x = K.dropout(x, self.rate, noise_shape) + return x + + +class MultiGPUCheckpoint(ModelCheckpoint): + + def set_model(self, model): + if isinstance(model.layers[-2], Model): + self.model = model.layers[-2] + else: + self.model = model + + +def build_feature_model(input_shape, name='', dense_layers=[1000, 1000], + activation='relu', residual=False, + dropout_rate=0, permanent_dropout=True): + x_input = Input(shape=input_shape) + h = x_input + for i, layer in enumerate(dense_layers): + x = h + h = Dense(layer, activation=activation)(h) + if dropout_rate > 0: + if permanent_dropout: + h = PermanentDropout(dropout_rate)(h) + else: + h = Dropout(dropout_rate)(h) + if residual: + try: + h = keras.layers.add([h, x]) + except ValueError: + pass + model = Model(x_input, h, name=name) + return model + + +class SimpleWeightSaver(Callback): + + def __init__(self, fname): + self.fname = fname + + def set_model(self, model): + if isinstance(model.layers[-2], Model): + self.model = model.layers[-2] + else: + self.model = model + + def on_train_end(self, logs={}): + self.model.save_weights(self.fname) + + +def build_model(loader, args, permanent_dropout=True, silent=False): + input_models = {} + dropout_rate = args.dropout + for fea_type, shape in loader.feature_shapes.items(): + base_type = fea_type.split('.')[0] + if base_type in ['cell', 'drug']: + if args.dense_cell_feature_layers is not None and base_type == 'cell': + dense_feature_layers = args.dense_cell_feature_layers + elif args.dense_drug_feature_layers is not None and base_type == 'drug': + dense_feature_layers = args.dense_drug_feature_layers + else: + dense_feature_layers = args.dense_feature_layers + + box = build_feature_model(input_shape=shape, name=fea_type, + dense_layers=dense_feature_layers, + dropout_rate=dropout_rate, permanent_dropout=permanent_dropout) + if not silent: + logger.debug('Feature encoding submodel for %s:', fea_type) + box.summary(print_fn=logger.debug) + input_models[fea_type] = box + + inputs = [] + encoded_inputs = [] + for fea_name, fea_type in loader.input_features.items(): + shape = loader.feature_shapes[fea_type] + fea_input = Input(shape, name='input.' 
+ fea_name) + inputs.append(fea_input) + if fea_type in input_models: + input_model = input_models[fea_type] + encoded = input_model(fea_input) + else: + encoded = fea_input + encoded_inputs.append(encoded) + + merged = keras.layers.concatenate(encoded_inputs) + + h = merged + for i, layer in enumerate(args.dense): + x = h + h = Dense(layer, activation=args.activation)(h) + if dropout_rate > 0: + if permanent_dropout: + h = PermanentDropout(dropout_rate)(h) + else: + h = Dropout(dropout_rate)(h) + if args.residual: + try: + h = keras.layers.add([h, x]) + except ValueError: + pass + output = Dense(1)(h) + + return Model(inputs, output) + + +def initialize_parameters(default_model='uno_clr_model.txt'): + + # Build benchmark object + unoBmk = benchmark.BenchmarkUno(benchmark.file_path, default_model, 'keras', + prog='uno_clr', desc='Build neural network based models to predict tumor response to single and paired drugs.') + + # Initialize parameters + gParameters = candle.finalize_parameters(unoBmk) + # benchmark.logger.info('Params: {}'.format(gParameters)) + + return gParameters + + +class Struct: + def __init__(self, **entries): + self.__dict__.update(entries) + + +def run(params): + + candle.check_flag_conflicts(params) + args = Struct(**params) + set_seed(args.rng_seed) + ext = extension_from_parameters(args) + verify_path(args.save_path) + prefix = args.save_path + ext + logfile = args.logfile if args.logfile else prefix + '.log' + set_up_logger(logfile, args.verbose) + logger.info('Params: {}'.format(params)) + + if (len(args.gpus) > 0): + import tensorflow as tf + config = tf.ConfigProto() + config.gpu_options.allow_growth = True + config.gpu_options.visible_device_list = ",".join(map(str, args.gpus)) + K.set_session(tf.Session(config=config)) + + loader = CombinedDataLoader(seed=args.rng_seed) + loader.load(cache=args.cache, + ncols=args.feature_subsample, + agg_dose=args.agg_dose, + cell_features=args.cell_features, + drug_features=args.drug_features, + drug_median_response_min=args.drug_median_response_min, + drug_median_response_max=args.drug_median_response_max, + use_landmark_genes=args.use_landmark_genes, + use_filtered_genes=args.use_filtered_genes, + cell_feature_subset_path=args.cell_feature_subset_path or args.feature_subset_path, + drug_feature_subset_path=args.drug_feature_subset_path or args.feature_subset_path, + preprocess_rnaseq=args.preprocess_rnaseq, + single=args.single, + train_sources=args.train_sources, + test_sources=args.test_sources, + embed_feature_source=not args.no_feature_source, + encode_response_source=not args.no_response_source, + use_exported_data=args.use_exported_data, + ) + + target = args.agg_dose or 'Growth' + val_split = args.val_split + train_split = 1 - val_split + + if args.export_csv: + fname = args.export_csv + loader.partition_data(cv_folds=args.cv, train_split=train_split, val_split=val_split, + cell_types=args.cell_types, by_cell=args.by_cell, by_drug=args.by_drug, + cell_subset_path=args.cell_subset_path, drug_subset_path=args.drug_subset_path) + train_gen = CombinedDataGenerator(loader, batch_size=args.batch_size, shuffle=args.shuffle) + val_gen = CombinedDataGenerator(loader, partition='val', batch_size=args.batch_size, shuffle=args.shuffle) + + x_train_list, y_train = train_gen.get_slice(size=train_gen.size, dataframe=True, single=args.single) + x_val_list, y_val = val_gen.get_slice(size=val_gen.size, dataframe=True, single=args.single) + df_train = pd.concat([y_train] + x_train_list, axis=1) + df_val = pd.concat([y_val] + 
x_val_list, axis=1) + df = pd.concat([df_train, df_val]).reset_index(drop=True) + if args.growth_bins > 1: + df = uno_data.discretize(df, 'Growth', bins=args.growth_bins) + df.to_csv(fname, sep='\t', index=False, float_format="%.3g") + return + + if args.export_data: + fname = args.export_data + loader.partition_data(cv_folds=args.cv, train_split=train_split, val_split=val_split, + cell_types=args.cell_types, by_cell=args.by_cell, by_drug=args.by_drug, + cell_subset_path=args.cell_subset_path, drug_subset_path=args.drug_subset_path) + train_gen = CombinedDataGenerator(loader, batch_size=args.batch_size, shuffle=args.shuffle) + val_gen = CombinedDataGenerator(loader, partition='val', batch_size=args.batch_size, shuffle=args.shuffle) + store = pd.HDFStore(fname, complevel=9, complib='blosc:snappy') + + config_min_itemsize = {'Sample': 30, 'Drug1': 10} + if not args.single: + config_min_itemsize['Drug2'] = 10 + + for partition in ['train', 'val']: + gen = train_gen if partition == 'train' else val_gen + for i in range(gen.steps): + x_list, y = gen.get_slice(size=args.batch_size, dataframe=True, single=args.single) + + for j, input_feature in enumerate(x_list): + input_feature.columns = [''] * len(input_feature.columns) + store.append('x_{}_{}'.format(partition, j), input_feature.astype('float32'), format='table', data_column=True) + store.append('y_{}'.format(partition), y.astype({target: 'float32'}), format='table', data_column=True, + min_itemsize=config_min_itemsize) + logger.info('Generating {} dataset. {} / {}'.format(partition, i, gen.steps)) + + # save input_features and feature_shapes from loader + store.put('model', pd.DataFrame()) + store.get_storer('model').attrs.input_features = loader.input_features + store.get_storer('model').attrs.feature_shapes = loader.feature_shapes + + store.close() + logger.info('Completed generating {}'.format(fname)) + return + + if args.use_exported_data is None: + loader.partition_data(cv_folds=args.cv, train_split=train_split, val_split=val_split, + cell_types=args.cell_types, by_cell=args.by_cell, by_drug=args.by_drug, + cell_subset_path=args.cell_subset_path, drug_subset_path=args.drug_subset_path) + + model = build_model(loader, args) + logger.info('Combined model:') + model.summary(print_fn=logger.info) + # plot_model(model, to_file=prefix+'.model.png', show_shapes=True) + + if args.cp: + model_json = model.to_json() + with open(prefix + '.model.json', 'w') as f: + print(model_json, file=f) + + def warmup_scheduler(epoch): + lr = args.learning_rate or base_lr * args.batch_size / 100 + if epoch <= 5: + K.set_value(model.optimizer.lr, (base_lr * (5 - epoch) + lr * epoch) / 5) + logger.debug('Epoch {}: lr={:.5g}'.format(epoch, K.get_value(model.optimizer.lr))) + return K.get_value(model.optimizer.lr) + + df_pred_list = [] + + cv_ext = '' + cv = args.cv if args.cv > 1 else 1 + + for fold in range(cv): + if args.cv > 1: + logger.info('Cross validation fold {}/{}:'.format(fold + 1, cv)) + cv_ext = '.cv{}'.format(fold + 1) + + template_model = build_model(loader, args, silent=True) + if args.initial_weights: + logger.info("Loading initial weights from {}".format(args.initial_weights)) + template_model.load_weights(args.initial_weights) + + if len(args.gpus) > 1: + from keras.utils import multi_gpu_model + gpu_count = len(args.gpus) + logger.info("Multi GPU with {} gpus".format(gpu_count)) + model = multi_gpu_model(template_model, cpu_merge=False, gpus=gpu_count) + else: + model = template_model + + optimizer = optimizers.deserialize({'class_name': 
args.optimizer, 'config': {}}) + base_lr = args.base_lr or K.get_value(optimizer.lr) + if args.learning_rate: + K.set_value(optimizer.lr, args.learning_rate) + + model.compile(loss=args.loss, optimizer=optimizer, metrics=[mae, r2]) + + # calculate trainable and non-trainable params + params.update(candle.compute_trainable_params(model)) + + # Here is where we set a bunch of callback + # Set the CLR first so it will invalidate the warmup_lr, reduce_lr flags if needed + clr_args = candle.clr_set_args(params) + if clr_args['mode'] is not None: + clrCallback = candle.clr_callback(**clr_args) + + candle_monitor = candle.CandleRemoteMonitor(params=params) + timeout_monitor = candle.TerminateOnTimeOut(params['timeout']) + es_monitor = keras.callbacks.EarlyStopping(patience=10, verbose=1) + + reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=0.00001) + warmup_lr = LearningRateScheduler(warmup_scheduler) + checkpointer = MultiGPUCheckpoint(prefix + cv_ext + '.model.h5', save_best_only=True) + tensorboard = TensorBoard(log_dir="tb/{}{}{}".format(args.tb_prefix, ext, cv_ext)) + history_logger = LoggingCallback(logger.debug) + + callbacks = [candle_monitor, timeout_monitor, history_logger] + if args.es: + callbacks.append(es_monitor) + if args.reduce_lr: + callbacks.append(reduce_lr) + if args.warmup_lr: + callbacks.append(warmup_lr) + if args.cp: + callbacks.append(checkpointer) + if args.tb: + callbacks.append(tensorboard) + if args.save_weights: + logger.info("Will save weights to: " + args.save_weights) + callbacks.append(MultiGPUCheckpoint(args.save_weights)) + if clr_args['mode'] is not None: + callbacks.append(clrCallback) + + if args.use_exported_data is not None: + train_gen = DataFeeder(filename=args.use_exported_data, batch_size=args.batch_size, shuffle=args.shuffle, single=args.single, agg_dose=args.agg_dose) + val_gen = DataFeeder(partition='val', filename=args.use_exported_data, batch_size=args.batch_size, shuffle=args.shuffle, single=args.single, agg_dose=args.agg_dose) + test_gen = DataFeeder(partition='test', filename=args.use_exported_data, batch_size=args.batch_size, shuffle=args.shuffle, single=args.single, agg_dose=args.agg_dose) + else: + train_gen = CombinedDataGenerator(loader, fold=fold, batch_size=args.batch_size, shuffle=args.shuffle, single=args.single) + val_gen = CombinedDataGenerator(loader, partition='val', fold=fold, batch_size=args.batch_size, shuffle=args.shuffle, single=args.single) + + df_val = val_gen.get_response(copy=True) + y_val = df_val[target].values + y_shuf = np.random.permutation(y_val) + log_evaluation(evaluate_prediction(y_val, y_shuf), + description='Between random pairs in y_val:') + + if args.no_gen: + x_train_list, y_train = train_gen.get_slice(size=train_gen.size, single=args.single) + x_val_list, y_val = val_gen.get_slice(size=val_gen.size, single=args.single) + history = model.fit(x_train_list, y_train, + batch_size=args.batch_size, + epochs=args.epochs, + callbacks=callbacks, + validation_data=(x_val_list, y_val)) + else: + logger.info('Data points per epoch: train = %d, val = %d, test = %d', train_gen.size, val_gen.size, test_gen.size) + logger.info('Steps per epoch: train = %d, val = %d, test = %d', train_gen.steps, val_gen.steps, test_gen.steps) + history = model.fit_generator(train_gen, train_gen.steps, + epochs=args.epochs, + callbacks=callbacks, + validation_data=val_gen, + validation_steps=val_gen.steps) + + # prediction on holdout(test) when exists or use validation set + if test_gen.size > 0: + df_val = 
test_gen.get_response(copy=True) + y_val = df_val[target].values + y_val_pred = model.predict_generator(test_gen, test_gen.steps + 1) + y_val_pred = y_val_pred[:test_gen.size] + else: + if args.no_gen: + y_val_pred = model.predict(x_val_list, batch_size=args.batch_size) + else: + val_gen.reset() + y_val_pred = model.predict_generator(val_gen, val_gen.steps + 1) + y_val_pred = y_val_pred[:val_gen.size] + + y_val_pred = y_val_pred.flatten() + + scores = evaluate_prediction(y_val, y_val_pred) + log_evaluation(scores) + + # df_val = df_val.assign(PredictedGrowth=y_val_pred, GrowthError=y_val_pred - y_val) + df_val['Predicted' + target] = y_val_pred + df_val[target + 'Error'] = y_val_pred - y_val + df_pred_list.append(df_val) + + candle.plot_metrics(history, title=None, skip_ep=0, outdir='./save/', add_lr=True) + + pred_fname = prefix + '.predicted.tsv' + df_pred = pd.concat(df_pred_list) + if args.agg_dose: + if args.single: + df_pred.sort_values(['Sample', 'Drug1', target], inplace=True) + else: + df_pred.sort_values(['Source', 'Sample', 'Drug1', 'Drug2', target], inplace=True) + else: + if args.single: + df_pred.sort_values(['Sample', 'Drug1', 'Dose1', 'Growth'], inplace=True) + else: + df_pred.sort_values(['Sample', 'Drug1', 'Drug2', 'Dose1', 'Dose2', 'Growth'], inplace=True) + df_pred.to_csv(pred_fname, sep='\t', index=False, float_format='%.4g') + + if args.cv > 1: + scores = evaluate_prediction(df_pred[target], df_pred['Predicted' + target]) + log_evaluation(scores, description='Combining cross validation folds:') + + for test_source in loader.test_sep_sources: + test_gen = CombinedDataGenerator(loader, partition='test', batch_size=args.batch_size, source=test_source) + df_test = test_gen.get_response(copy=True) + y_test = df_test[target].values + n_test = len(y_test) + if n_test == 0: + continue + if args.no_gen: + x_test_list, y_test = test_gen.get_slice(size=test_gen.size, single=args.single) + y_test_pred = model.predict(x_test_list, batch_size=args.batch_size) + else: + y_test_pred = model.predict_generator(test_gen.flow(single=args.single), test_gen.steps) + y_test_pred = y_test_pred[:test_gen.size] + y_test_pred = y_test_pred.flatten() + scores = evaluate_prediction(y_test, y_test_pred) + log_evaluation(scores, description='Testing on data from {} ({})'.format(test_source, n_test)) + + if K.backend() == 'tensorflow': + K.clear_session() + + logger.handlers = [] + + return history + + +def main(): + params = initialize_parameters() + run(params) + + +if __name__ == '__main__': + main() + if K.backend() == 'tensorflow': + K.clear_session() From 1826fde9e5e682a7917818492ac0654ce37bde5d Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Wed, 22 Apr 2020 19:26:35 -0500 Subject: [PATCH 268/331] Update adrp.py fix drop to dropout --- examples/ADRP/adrp.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/ADRP/adrp.py b/examples/ADRP/adrp.py index 1c353848..fc85e335 100644 --- a/examples/ADRP/adrp.py +++ b/examples/ADRP/adrp.py @@ -81,7 +81,7 @@ "activation", "batch_size", "dense", - "drop", + "dropout", "epochs", "initialization", "learning_rate", @@ -127,8 +127,8 @@ def extension_from_parameters(params, framework=""): if params["epsilon_std"] != 1.0: ext += ".EPS={}".format(params["epsilon_std"]) - if params["drop"]: - ext += ".DR={}".format(params["drop"]) + if params["dropout"]: + ext += ".DR={}".format(params["dropout"]) if params["batch_normalization"]: ext += ".BN" if params["warmup_lr"]: From 1be2944ae6b75001424444c68142c3bb04663cec Mon Sep 17 00:00:00 
2001 From: Rajeev Jain Date: Wed, 22 Apr 2020 19:49:22 -0500 Subject: [PATCH 269/331] Update adrp_baseline_keras2.py --- examples/ADRP/adrp_baseline_keras2.py | 20 ++------------------ 1 file changed, 2 insertions(+), 18 deletions(-) diff --git a/examples/ADRP/adrp_baseline_keras2.py b/examples/ADRP/adrp_baseline_keras2.py index dc060a38..02a3ea41 100644 --- a/examples/ADRP/adrp_baseline_keras2.py +++ b/examples/ADRP/adrp_baseline_keras2.py @@ -263,8 +263,8 @@ def run(params): kernel_initializer=initializer_weights, bias_initializer=initializer_bias, )(x) - if params["drop"]: - x = Dropout(params["drop"])(x) + if params["dropout"]: + x = Dropout(params["dropout"])(x) output = Dense( output_dim, activation=activation, @@ -279,17 +279,6 @@ def run(params): bias_initializer=initializer_bias, )(inputs) - # x = Dense(250, activation=ac)(inputs) - - # x = Dropout(DR)(x) - # x = Dense(125, activation=ac)(x) - # x = Dropout(DR)(x) - # x = Dense(60, activation=ac)(x) - # x = Dropout(DR)(x) - # x = Dense(30, activation=ac)(x) - # x = Dropout(DR)(x) - # outputs = Dense(1, activation=ac)(x) - model = Model(inputs=inputs, outputs=output) model.summary() @@ -319,7 +308,6 @@ def run(params): monitor="val_loss", factor=0.75, patience=20, - verbose=1, mode="auto", epsilon=0.0001, cooldown=3, @@ -389,8 +377,6 @@ def post_process(params, X_train, X_test, Y_test, score, history, model): print("Test val_loss:", score[0]) print("Test val_mae:", score[1]) - exit() - # serialize model to JSON model_json = model.to_json() with open(save_path + "agg_adrp.model.json", "w") as json_file: @@ -405,8 +391,6 @@ def post_process(params, X_train, X_test, Y_test, score, history, model): model.save_weights(save_path + "agg_adrp.model.h5") print("Saved model to disk") - exit() - # load json and create model json_file = open(save_path + "agg_adrp.model.json", "r") loaded_model_json = json_file.read() From 6d8c693b839c779737fb2936c31fc613b1a47d8e Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Wed, 22 Apr 2020 19:50:10 -0500 Subject: [PATCH 270/331] Update adrp_default_model.txt drop to dropout, remove exit's from python file --- examples/ADRP/adrp_default_model.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/ADRP/adrp_default_model.txt b/examples/ADRP/adrp_default_model.txt index 39f23f70..260269f3 100644 --- a/examples/ADRP/adrp_default_model.txt +++ b/examples/ADRP/adrp_default_model.txt @@ -8,7 +8,7 @@ epochs=1 activation='relu' loss='mean_squared_error' optimizer='sgd' -drop=0.1 +dropout=0.1 learning_rate=0.0001 momentum=0.9 scaling='minmax' From 218596bb76e5bb5fe12b6b80ffbde1e5ba9d9647 Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Thu, 23 Apr 2020 14:09:27 -0500 Subject: [PATCH 271/331] o Get rid of a few warnings --- examples/ADRP/adrp.py | 2 -- examples/ADRP/adrp_default_model.txt | 3 --- 2 files changed, 5 deletions(-) diff --git a/examples/ADRP/adrp.py b/examples/ADRP/adrp.py index fc85e335..0e3e26fc 100644 --- a/examples/ADRP/adrp.py +++ b/examples/ADRP/adrp.py @@ -89,11 +89,9 @@ "optimizer", "rng_seed", "scaling", - "validation_split", "latent_dim", "batch_normalization", "epsilon_std", - "solr_root", "timeout", ] diff --git a/examples/ADRP/adrp_default_model.txt b/examples/ADRP/adrp_default_model.txt index 260269f3..5b1a84ae 100644 --- a/examples/ADRP/adrp_default_model.txt +++ b/examples/ADRP/adrp_default_model.txt @@ -12,7 +12,6 @@ dropout=0.1 learning_rate=0.0001 momentum=0.9 scaling='minmax' -validation_split=0.1 epsilon_std=1.0 rng_seed=2017 initialization='glorot_uniform' @@ 
-23,8 +22,6 @@ use_cp=False early_stop=True reduce_lr=True feature_subsample=0 -nb_classes=2 [Monitor_Params] -solr_root='' timeout=3600 From 809789901e1c5463002d98d86f39eb658c61b7a2 Mon Sep 17 00:00:00 2001 From: Rajeev Jain Date: Thu, 23 Apr 2020 14:16:22 -0500 Subject: [PATCH 272/331] o Use train_data instead of in o Fixed all warnings --- examples/ADRP/adrp.py | 6 +++--- examples/ADRP/adrp_default_model.txt | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/ADRP/adrp.py b/examples/ADRP/adrp.py index 0e3e26fc..1eabb26b 100644 --- a/examples/ADRP/adrp.py +++ b/examples/ADRP/adrp.py @@ -142,11 +142,11 @@ def extension_from_parameters(params, framework=""): def load_data(params, seed): # start change # - if params["in"].endswith("csv") or params["in"].endswith("csv"): - print("processing csv in file {}".format(params["in"])) + if params["train_data"].endswith("csv") or params["train_data"].endswith("csv"): + print("processing csv in file {}".format(params["train_data"])) url = params["data_url"] - file_train = params["in"] + file_train = params["train_data"] train_file = candle.get_file( file_train, url + file_train, cache_subdir="Pilot1" ) diff --git a/examples/ADRP/adrp_default_model.txt b/examples/ADRP/adrp_default_model.txt index 5b1a84ae..6912a821 100644 --- a/examples/ADRP/adrp_default_model.txt +++ b/examples/ADRP/adrp_default_model.txt @@ -1,6 +1,6 @@ [Global_Params] data_url = 'ftp://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/' -in='adrp-p1.csv' +train_data='adrp-p1.csv' model_name='adrp' dense=[250, 125, 60, 30] batch_size=32 From 478d8aafc49354998340e0cabf2a334f11620962 Mon Sep 17 00:00:00 2001 From: brettin Date: Wed, 29 Apr 2020 12:47:04 -0400 Subject: [PATCH 273/331] new file --- examples/ADRP/reg_go2.py | 281 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 281 insertions(+) create mode 100644 examples/ADRP/reg_go2.py diff --git a/examples/ADRP/reg_go2.py b/examples/ADRP/reg_go2.py new file mode 100644 index 00000000..80be28a5 --- /dev/null +++ b/examples/ADRP/reg_go2.py @@ -0,0 +1,281 @@ +import pandas as pd +import numpy as np +import os +import sys +import gzip +import argparse + +import math +import matplotlib +matplotlib.use('Agg') + +import matplotlib.pyplot as plt + +import tensorflow as tf + +import keras as ke +from keras import backend as K + +from keras.layers import Input, Dense, Dropout, Activation +from keras.optimizers import SGD, Adam, RMSprop +from keras.models import Sequential, Model, model_from_json, model_from_yaml +from keras.utils import np_utils, multi_gpu_model + +from keras.callbacks import ModelCheckpoint, CSVLogger, ReduceLROnPlateau, EarlyStopping + + +from sklearn.model_selection import train_test_split +from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error +from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler + +file_path = os.path.dirname(os.path.realpath(__file__)) +lib_path = os.path.abspath(os.path.join(file_path, '..', '..', 'common')) +sys.path.append(lib_path) + +psr = argparse.ArgumentParser(description='input csv file') +psr.add_argument('--in', default='in_file') +psr.add_argument('--ep', type=int, default=400) +args=vars(psr.parse_args()) +print(args) + +EPOCH = args['ep'] +BATCH = 32 +#nb_classes = 2 + +data_path = args['in'] + +df_toss = (pd.read_csv(data_path,nrows=1).values) + +print('df_toss:', df_toss.shape) + +PL = df_toss.size +PS = PL - 1 + +print('PL=',PL) + +#PL = 6213 # 38 + 60483 +#PS = 6212 # 60483 +DR = 0.1 # Dropout rate + 
+def r2(y_true, y_pred): + SS_res = K.sum(K.square(y_true - y_pred)) + SS_tot = K.sum(K.square(y_true - K.mean(y_true))) + return (1 - SS_res/(SS_tot + K.epsilon())) + +class Attention(ke.layers.Layer): + def __init__(self, output_dim, **kwargs): + self.output_dim = output_dim + super(Attention, self).__init__(**kwargs) + + def build(self, input_shape): + self.kernel = self.add_weight(name='kernel', + shape=(input_shape[1], self.output_dim), + initializer='uniform', + trainable=True) + super(Attention, self).build(input_shape) + + def call(self, V): + Q = ke.backend.dot(V, self.kernel) + Q = Q * V + Q = Q / math.sqrt(self.output_dim) + Q = ke.activations.softmax(Q) + return Q + + def compute_output_shape(self, input_shape): + return input_shape + + + +def load_data(): + + data_path = args['in'] + + df = (pd.read_csv(data_path,skiprows=1).values).astype('float32') + + df_y = df[:,0].astype('float32') + df_x = df[:, 1:PL].astype(np.float32) + + +# scaler = MaxAbsScaler() + + scaler = StandardScaler() + df_x = scaler.fit_transform(df_x) + + X_train, X_test, Y_train, Y_test = train_test_split(df_x, df_y, test_size= 0.20, random_state=42) + + print('x_train shape:', X_train.shape) + print('x_test shape:', X_test.shape) + + + return X_train, Y_train, X_test, Y_test + +X_train, Y_train, X_test, Y_test = load_data() + +print('X_train shape:', X_train.shape) +print('X_test shape:', X_test.shape) + +print('Y_train shape:', Y_train.shape) +print('Y_test shape:', Y_test.shape) + + +inputs = Input(shape=(PS,)) +x = Dense(250, activation='relu')(inputs) +#b = Attention(1000)(a) +#x = ke.layers.multiply([b, a]) + +#b = Dense(1000, activation='softmax')(inputs) +#x = ke.layers.multiply([a,b]) + +#x = Dense(1000, activation='relu')(x) +#x = Dropout(DR)(x) +#x = Dense(500, activation='relu')(x) +#x = Dropout(DR)(x) +#x = Dense(250, activation='relu')(x) +x = Dropout(DR)(x) +x = Dense(125, activation='relu')(x) +x = Dropout(DR)(x) +x = Dense(60, activation='relu')(x) +x = Dropout(DR)(x) +x = Dense(30, activation='relu')(x) +x = Dropout(DR)(x) +outputs = Dense(1, activation='relu')(x) + +model = Model(inputs=inputs, outputs=outputs) + +model.summary() + +#parallel_model = multi_gpu_model(model, gpus=4) +#parallel_model.compile(loss='mean_squared_error', +# optimizer=SGD(lr=0.0001, momentum=0.9), +# metrics=['mae',r2]) + +model.compile(loss='mean_squared_error', + optimizer=SGD(lr=0.0001, momentum=0.9), + metrics=['mae',r2]) + +# set up a bunch of callbacks to do work during model training.. 
+ +checkpointer = ModelCheckpoint(filepath='reg_go.autosave.model.h5', verbose=1, save_weights_only=False, save_best_only=True) +csv_logger = CSVLogger('reg_go.training.log') +reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.75, patience=20, verbose=1, mode='auto', epsilon=0.0001, cooldown=3, min_lr=0.000000001) +early_stop = EarlyStopping(monitor='val_loss', patience=100, verbose=1, mode='auto') + + +#history = parallel_model.fit(X_train, Y_train, + +history = model.fit(X_train, Y_train, + batch_size=BATCH, + epochs=EPOCH, + verbose=1, + validation_data=(X_test, Y_test), + callbacks = [checkpointer, csv_logger, reduce_lr, early_stop]) + +score = model.evaluate(X_test, Y_test, verbose=0) + +print(score) + +print(history.history.keys()) +# dict_keys(['val_loss', 'val_mae', 'val_r2', 'loss', 'mae', 'r2', 'lr']) + +# summarize history for MAE +#plt.plot(history.history['mean_absolute_error']) +plt.plot(history.history['mae']) +#plt.plot(history.history['val_mean_absolute_error']) +plt.plot(history.history['val_mae']) + +plt.title('Model Mean Absolute Error') +plt.ylabel('mae') +plt.xlabel('epoch') +plt.legend(['train', 'test'], loc='upper left') + +plt.savefig('reg_go.mae.png', bbox_inches='tight') +plt.savefig('reg_go.mae.pdf', bbox_inches='tight') + +plt.close() + +# summarize history for loss +plt.plot(history.history['loss']) +plt.plot(history.history['val_loss']) +plt.title('Model Loss') +plt.ylabel('loss') +plt.xlabel('epoch') +plt.legend(['train', 'test'], loc='upper left') + +plt.savefig('reg_go.loss.png', bbox_inches='tight') +plt.savefig('reg_go.loss.pdf', bbox_inches='tight') + +plt.close() + +print('Test val_loss:', score[0]) +print('Test val_mae:', score[1]) + +#exit() + +# serialize model to JSON +model_json = model.to_json() +with open("reg_go.model.json", "w") as json_file: + json_file.write(model_json) + +# serialize model to YAML +model_yaml = model.to_yaml() +with open("reg_go.model.yaml", "w") as yaml_file: + yaml_file.write(model_yaml) + + +# serialize weights to HDF5 +model.save_weights("reg_go.model.h5") +print("Saved model to disk") + +#exit() + +# load json and create model +json_file = open('reg_go.model.json', 'r') +loaded_model_json = json_file.read() +json_file.close() +loaded_model_json = model_from_json(loaded_model_json) + + +# load yaml and create model +yaml_file = open('reg_go.model.yaml', 'r') +loaded_model_yaml = yaml_file.read() +yaml_file.close() +loaded_model_yaml = model_from_yaml(loaded_model_yaml) + + +# load weights into new model +loaded_model_json.load_weights("reg_go.model.h5") +print("Loaded json model from disk") + +# evaluate json loaded model on test data +loaded_model_json.compile(loss='mean_squared_error', optimizer='SGD', metrics=['mean_absolute_error']) +score_json = loaded_model_json.evaluate(X_test, Y_test, verbose=0) + +print('json Validation loss:', score_json[0]) +print('json Validation mae:', score_json[1]) + +# load weights into new model +loaded_model_yaml.load_weights("reg_go.model.h5") +print("Loaded yaml model from disk") + +# evaluate loaded model on test data +loaded_model_yaml.compile(loss='mean_squared_error', optimizer='SGD', metrics=['mean_absolute_error']) +score_yaml = loaded_model_yaml.evaluate(X_test, Y_test, verbose=0) + +print('yaml Validation loss:', score_yaml[0]) +print('yaml Validation mae:', score_yaml[1]) + +# predict using loaded yaml model on test and training data + +predict_yaml_train = loaded_model_yaml.predict(X_train) + +predict_yaml_test = loaded_model_yaml.predict(X_test) + +pred_train = 
predict_yaml_train[:,0] +pred_test = predict_yaml_test[:,0] + +np.savetxt("pred_train.csv", pred_train, delimiter=".", newline='\n', fmt="%.3f") +np.savetxt("pred_test.csv", pred_test, delimiter=",", newline='\n',fmt="%.3f") + +print('Correlation prediction on test and Y_test:', np.corrcoef( pred_test, Y_test)) +print('Correlation prediction on train and Y_train:', np.corrcoef( pred_train, Y_train)) + From 85bae31a37e6583f29487bdb36808e4065f21167 Mon Sep 17 00:00:00 2001 From: Jamal Date: Tue, 5 May 2020 12:09:12 -0600 Subject: [PATCH 274/331] Fixed finalize_parameters calls in examples. --- examples/mnist/mnist_cnn_candle.py | 2 +- examples/mnist/mnist_mlp_candle.py | 2 +- examples/unet/unet_candle.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/mnist/mnist_cnn_candle.py b/examples/mnist/mnist_cnn_candle.py index bfade296..126869db 100644 --- a/examples/mnist/mnist_cnn_candle.py +++ b/examples/mnist/mnist_cnn_candle.py @@ -13,7 +13,7 @@ def initialize_parameters(): ) # Initialize parameters - gParameters = candle.initialize_parameters(mnist_common) + gParameters = candle.finalize_parameters(mnist_common) csv_logger = CSVLogger('{}/params.log'.format(gParameters)) return gParameters diff --git a/examples/mnist/mnist_mlp_candle.py b/examples/mnist/mnist_mlp_candle.py index e6288100..d3a0cd34 100644 --- a/examples/mnist/mnist_mlp_candle.py +++ b/examples/mnist/mnist_mlp_candle.py @@ -13,7 +13,7 @@ def initialize_parameters(): ) # Initialize parameters - gParameters = candle.initialize_parameters(mnist_common) + gParameters = candle.finalize_parameters(mnist_common) csv_logger = CSVLogger('{}/params.log'.format(gParameters)) return gParameters diff --git a/examples/unet/unet_candle.py b/examples/unet/unet_candle.py index 3c7a6c11..cb15f156 100644 --- a/examples/unet/unet_candle.py +++ b/examples/unet/unet_candle.py @@ -13,7 +13,7 @@ def initialize_parameters(): ) # Initialize parameters - gParameters = candle.initialize_parameters(unet_common) + gParameters = candle.finalize_parameters(unet_common) return gParameters def run(gParameters): From b0db3109e13d29abc57c8d39f8d224c607aed5ba Mon Sep 17 00:00:00 2001 From: Thomas Brettin Date: Tue, 5 May 2020 14:53:21 -0400 Subject: [PATCH 275/331] Update for release Update to 400 epochs, no timeout, and running instructions --- examples/ADRP/README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/ADRP/README.md b/examples/ADRP/README.md index 9d687e4d..7ebcd0c3 100644 --- a/examples/ADRP/README.md +++ b/examples/ADRP/README.md @@ -6,6 +6,7 @@ http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/ (~500MB) ## Sample run: ``` +$ export CUDA_VISIBLE_DEVICES=1 $ python adrp_baseline_keras2.py Using TensorFlow backend. 
Importing candle utils for keras @@ -46,7 +47,7 @@ Params: 'dense': [250, 125, 60, 30], 'drop': 0.1, 'early_stop': True, - 'epochs': 1, + 'epochs': 400, 'epsilon_std': 1.0, 'experiment_id': 'EXP000', 'feature_subsample': 0, @@ -71,7 +72,7 @@ Params: 'scaling': 'minmax', 'shuffle': False, 'solr_root': '', - 'timeout': 3600, + 'timeout': 0, 'train_bool': True, 'tsne': False, 'use_cp': False, From e5324e478bc1741a4fbbfc317a09eb7ab7ee432f Mon Sep 17 00:00:00 2001 From: brettin Date: Tue, 5 May 2020 14:04:40 -0500 Subject: [PATCH 276/331] update for release --- examples/ADRP/adrp_default_model.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/ADRP/adrp_default_model.txt b/examples/ADRP/adrp_default_model.txt index 6912a821..5f5452de 100644 --- a/examples/ADRP/adrp_default_model.txt +++ b/examples/ADRP/adrp_default_model.txt @@ -4,7 +4,7 @@ train_data='adrp-p1.csv' model_name='adrp' dense=[250, 125, 60, 30] batch_size=32 -epochs=1 +epochs=400 activation='relu' loss='mean_squared_error' optimizer='sgd' @@ -24,4 +24,4 @@ reduce_lr=True feature_subsample=0 [Monitor_Params] -timeout=3600 +timeout=0 From dc83b953b1e7a97133f0430f515915fca0c5fc55 Mon Sep 17 00:00:00 2001 From: brettin Date: Tue, 5 May 2020 14:07:21 -0500 Subject: [PATCH 277/331] update for release --- README.setup.linux | 29 ++++++++++------------------- 1 file changed, 10 insertions(+), 19 deletions(-) diff --git a/README.setup.linux b/README.setup.linux index b12446d5..b17ceb11 100644 --- a/README.setup.linux +++ b/README.setup.linux @@ -2,34 +2,25 @@ # ------------------------------ # Download the Anaconda installer -curl -o Anaconda3-2018.12-Linux-x86_64.sh https://repo.continuum.io/archive/Anaconda3-2018.12-Linux-x86_64.sh +curl -o Anaconda3-2020.02-Linux-x86_64.sh https://repo.anaconda.com/archive/Anaconda3-2020.02-Linux-x86_64.sh # Make the installer executable -chmod u+x ./Anaconda3-2018.12-Linux-x86_64.sh +chmod u+x ./Anaconda3-5.1.0-Linux-x86_64.sh # Run the installer, accepting the defaults. 
-./Anaconda3-2018.12-Linux-x86_64.sh +./Anaconda3-5.1.0-Linux-x86_64.sh # Add anaconda2/bin to your path (assumes default install location) export PATH=$HOME/anaconda3/bin:$PATH -# Create a new conda environment -conda create --name py37_candle -source activate py37_candle - # Install additonal modules not shipped with Anaconda -conda install -c conda-forge tensorflow -conda install -c anaconda hdf5=1.8.17 -conda install -c anaconda theano -conda install -c conda-forge keras=2 -conda install -c anaconda pandas -conda install -c anaconda scikit-learn -conda install -c anaconda matplotlib ----conda install -c conda-forge pygpu - -# Install additional modules for Pilot2 benchmarks -conda install -c conda-forge opencv -conda install -c conda-forge tqdm +conda install -y -c conda-forge tensorflow-gpu=1 +conda install -y -c anaconda hdf5 +conda install -y -c conda-forge keras=2 +conda install -y -c anaconda pandas +conda install -y -c anaconda scikit-learn +conda install -y -c anaconda matplotlib +conda install -y -c conda-forge pygpu conda update -c conda-forge numpy # Download the source files for the benchmarks From 484ef92b2a8f32c088f306a518f3a9e2a7791f26 Mon Sep 17 00:00:00 2001 From: Thomas Brettin Date: Tue, 5 May 2020 15:15:56 -0400 Subject: [PATCH 278/331] Added pytorch, loguru --- README.setup.linux | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/README.setup.linux b/README.setup.linux index b17ceb11..fcffb5f2 100644 --- a/README.setup.linux +++ b/README.setup.linux @@ -5,10 +5,10 @@ curl -o Anaconda3-2020.02-Linux-x86_64.sh https://repo.anaconda.com/archive/Anaconda3-2020.02-Linux-x86_64.sh # Make the installer executable -chmod u+x ./Anaconda3-5.1.0-Linux-x86_64.sh +chmod u+x ./Anaconda3-2020.02-Linux-x86_64.sh # Run the installer, accepting the defaults. 
-./Anaconda3-5.1.0-Linux-x86_64.sh +./Anaconda3-2020.02-Linux-x86_64.sh # Add anaconda2/bin to your path (assumes default install location) export PATH=$HOME/anaconda3/bin:$PATH @@ -21,6 +21,8 @@ conda install -y -c anaconda pandas conda install -y -c anaconda scikit-learn conda install -y -c anaconda matplotlib conda install -y -c conda-forge pygpu +conda install -y -c anaconda pytorch +pip install loguru conda update -c conda-forge numpy # Download the source files for the benchmarks From a76736e3be04f930a179104e4d87d768fe127c1d Mon Sep 17 00:00:00 2001 From: Jamal Date: Tue, 5 May 2020 13:40:59 -0600 Subject: [PATCH 279/331] Fix Imputer in p1b3.py --- Pilot1/P1B3/p1b3.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/Pilot1/P1B3/p1b3.py b/Pilot1/P1B3/p1b3.py index b0be7db7..3b29ad32 100644 --- a/Pilot1/P1B3/p1b3.py +++ b/Pilot1/P1B3/p1b3.py @@ -18,7 +18,10 @@ from itertools import cycle, islice -from sklearn.preprocessing import Imputer +try: + from sklearn.impute import SimpleImputer as Imputer +except ImportError: + from sklearn.preprocessing import Imputer from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler file_path = os.path.dirname(os.path.realpath(__file__)) @@ -318,7 +321,8 @@ def impute_and_scale(df, scaling='std'): df = df.dropna(axis=1, how='all') - imputer = Imputer(strategy='mean', axis=0) + #imputer = Imputer(strategy='mean', axis=0) + imputer = Imputer(strategy='mean') mat = imputer.fit_transform(df) if scaling is None or scaling.lower() == 'none': From fd0ffc559239bc5a281b94a110eb5e9b0a7d9bfa Mon Sep 17 00:00:00 2001 From: Jamal Date: Tue, 5 May 2020 13:46:49 -0600 Subject: [PATCH 280/331] Fix Combo benchmark --- Pilot1/Combo/NCI60.py | 7 +++++-- Pilot1/Combo/combo_dose.py | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/Pilot1/Combo/NCI60.py b/Pilot1/Combo/NCI60.py index 702c66ad..b061316e 100644 --- a/Pilot1/Combo/NCI60.py +++ b/Pilot1/Combo/NCI60.py @@ -7,7 +7,10 @@ import numpy as np import pandas as pd -from sklearn.preprocessing import Imputer +try: + from sklearn.impute import SimpleImputer as Imputer +except ImportError: + from sklearn.preprocessing import Imputer from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler file_path = os.path.dirname(os.path.realpath(__file__)) @@ -40,7 +43,7 @@ def impute_and_scale(df, scaling='std'): df = df.dropna(axis=1, how='all') - imputer = Imputer(strategy='mean', axis=0) + imputer = Imputer(strategy='mean') mat = imputer.fit_transform(df) if scaling is None or scaling.lower() == 'none': diff --git a/Pilot1/Combo/combo_dose.py b/Pilot1/Combo/combo_dose.py index dc6e9a57..f720c8e3 100644 --- a/Pilot1/Combo/combo_dose.py +++ b/Pilot1/Combo/combo_dose.py @@ -678,7 +678,7 @@ def initialize_parameters(): desc = 'Build neural network based models to predict tumor response to drug pairs.') # Initialize parameters - gParameters = candle.initialize_parameters(comboBmk) + gParameters = candle.finalize_parameters(comboBmk) #combo.logger.info('Params: {}'.format(gParameters)) return gParameters From 709257b25a31e9c3bda9e5cd195a362f3cf69e40 Mon Sep 17 00:00:00 2001 From: Jamal Date: Tue, 5 May 2020 14:26:40 -0600 Subject: [PATCH 281/331] Fixed missing test_data, fixed Imputer. 
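Background for the Imputer part of this fix: sklearn.preprocessing.Imputer was deprecated in scikit-learn 0.20 and removed in 0.22, and its replacement, sklearn.impute.SimpleImputer, always imputes column-wise, so the old axis=0 argument goes away. A minimal compatibility sketch, illustrative only and assuming either scikit-learn generation is installed:

    import numpy as np

    try:
        from sklearn.impute import SimpleImputer as Imputer   # scikit-learn >= 0.20
    except ImportError:
        from sklearn.preprocessing import Imputer              # older releases

    # SimpleImputer has no axis argument; it fills NaNs per column with the column mean.
    mat = Imputer(strategy='mean').fit_transform(np.array([[1.0, np.nan], [3.0, 4.0]]))
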
--- Pilot1/Uno/uno_baseline_keras2.py | 1 + Pilot1/Uno/uno_clr_keras2.py | 1 + Pilot1/Uno/uno_data.py | 8 ++++++-- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/Pilot1/Uno/uno_baseline_keras2.py b/Pilot1/Uno/uno_baseline_keras2.py index f6308104..b7926e12 100644 --- a/Pilot1/Uno/uno_baseline_keras2.py +++ b/Pilot1/Uno/uno_baseline_keras2.py @@ -452,6 +452,7 @@ def warmup_scheduler(epoch): else: train_gen = CombinedDataGenerator(loader, fold=fold, batch_size=args.batch_size, shuffle=args.shuffle, single=args.single) val_gen = CombinedDataGenerator(loader, partition='val', fold=fold, batch_size=args.batch_size, shuffle=args.shuffle, single=args.single) + test_gen = CombinedDataGenerator(loader, partition='test', fold=fold, batch_size=args.batch_size, shuffle=args.shuffle, single=args.single) df_val = val_gen.get_response(copy=True) y_val = df_val[target].values diff --git a/Pilot1/Uno/uno_clr_keras2.py b/Pilot1/Uno/uno_clr_keras2.py index 7041cdbe..aa14f04c 100644 --- a/Pilot1/Uno/uno_clr_keras2.py +++ b/Pilot1/Uno/uno_clr_keras2.py @@ -462,6 +462,7 @@ def warmup_scheduler(epoch): else: train_gen = CombinedDataGenerator(loader, fold=fold, batch_size=args.batch_size, shuffle=args.shuffle, single=args.single) val_gen = CombinedDataGenerator(loader, partition='val', fold=fold, batch_size=args.batch_size, shuffle=args.shuffle, single=args.single) + test_gen = CombinedDataGenerator(loader, partition='test', fold=fold, batch_size=args.batch_size, shuffle=args.shuffle, single=args.single) df_val = val_gen.get_response(copy=True) y_val = df_val[target].values diff --git a/Pilot1/Uno/uno_data.py b/Pilot1/Uno/uno_data.py index 46d02187..274f5ece 100644 --- a/Pilot1/Uno/uno_data.py +++ b/Pilot1/Uno/uno_data.py @@ -13,7 +13,11 @@ from itertools import cycle, islice -from sklearn.preprocessing import Imputer +try: + from sklearn.impute import SimpleImputer as Imputer +except ImportError: + from sklearn.preprocessing import Imputer + from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler from sklearn.model_selection import ShuffleSplit, KFold @@ -76,7 +80,7 @@ def impute_and_scale(df, scaling='std', imputing='mean', dropna='all'): if imputing is None or imputing.lower() == 'none': mat = df.values else: - imputer = Imputer(strategy=imputing, axis=0) + imputer = Imputer(strategy=imputing) mat = imputer.fit_transform(df) if scaling is None or scaling.lower() == 'none': From bfac442d4357e9bd3e8f8e44a4f9e0a9d2435cd5 Mon Sep 17 00:00:00 2001 From: Jamal Date: Tue, 5 May 2020 14:30:13 -0600 Subject: [PATCH 282/331] Partial fix to T29. 
--- Pilot1/T29/infer.py | 2 +- Pilot1/T29/t29res.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/Pilot1/T29/infer.py b/Pilot1/T29/infer.py index 471bcde7..e0d7608f 100644 --- a/Pilot1/T29/infer.py +++ b/Pilot1/T29/infer.py @@ -48,7 +48,7 @@ def initialize_parameters(): 'help':'Number of predictions to do on each sample.'} ] t29_common.additional_definitions = additional_definitions - gParameters = candle.initialize_parameters(t29_common) + gParameters = candle.finalize_parameters(t29_common) return gParameters diff --git a/Pilot1/T29/t29res.py b/Pilot1/T29/t29res.py index e4fea01e..3e0054f9 100644 --- a/Pilot1/T29/t29res.py +++ b/Pilot1/T29/t29res.py @@ -20,7 +20,9 @@ file_path = os.path.dirname(os.path.realpath(__file__)) # candle -sys.path.append('/raid/brettin/Benchmarks/common') +file_path = os.path.dirname(os.path.realpath(__file__)) +lib_path2 = os.path.abspath(os.path.join(file_path, '..', '..', 'common')) +sys.path.append(lib_path2) import candle # candle From ec5fbf5373623d9b4dc6d0bf8b2f66fe6be5c552 Mon Sep 17 00:00:00 2001 From: brettin Date: Tue, 5 May 2020 17:56:13 -0500 Subject: [PATCH 283/331] fixed unsafe thread runtime error --- Pilot1/P1B3/p1b3_baseline_keras2.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Pilot1/P1B3/p1b3_baseline_keras2.py b/Pilot1/P1B3/p1b3_baseline_keras2.py index 009096f6..c9c858a4 100644 --- a/Pilot1/P1B3/p1b3_baseline_keras2.py +++ b/Pilot1/P1B3/p1b3_baseline_keras2.py @@ -359,8 +359,7 @@ def run(gParameters): validation_steps=val_steps, verbose=0, callbacks=[checkpointer, loss_history, progbar, candleRemoteMonitor], - pickle_safe=True, - workers=gParameters['workers']) + ) benchmark.logger.removeHandler(fh) benchmark.logger.removeHandler(sh) From f7187ab605eee1d450ccb81c55995bdb20debfb8 Mon Sep 17 00:00:00 2001 From: brettin Date: Tue, 5 May 2020 18:07:03 -0500 Subject: [PATCH 284/331] remane --- Pilot1/{Attn1 => Attn}/0 | 0 Pilot1/{Attn1 => Attn}/README.md | 0 Pilot1/{Attn1 => Attn}/attn.py | 0 Pilot1/{Attn1 => Attn}/attn_abs_default_model.txt | 0 Pilot1/{Attn1 => Attn}/attn_abstention_keras2.py | 0 Pilot1/{Attn1 => Attn}/attn_baseline_keras2.py | 0 Pilot1/{Attn1 => Attn}/attn_bin_working_jan7_h5.py | 0 Pilot1/{Attn1 => Attn}/attn_bin_working_jan7_h5.sh | 0 Pilot1/{Attn1 => Attn}/attn_bsub.sh | 0 Pilot1/{Attn1 => Attn}/attn_default_model.txt | 0 Pilot1/{Attn1 => Attn}/attn_viz_utils.py | 0 Pilot1/{Attn1 => Attn}/cmd1.sh | 0 Pilot1/{Attn1 => Attn}/cmd2.sh | 0 13 files changed, 0 insertions(+), 0 deletions(-) rename Pilot1/{Attn1 => Attn}/0 (100%) rename Pilot1/{Attn1 => Attn}/README.md (100%) rename Pilot1/{Attn1 => Attn}/attn.py (100%) rename Pilot1/{Attn1 => Attn}/attn_abs_default_model.txt (100%) rename Pilot1/{Attn1 => Attn}/attn_abstention_keras2.py (100%) rename Pilot1/{Attn1 => Attn}/attn_baseline_keras2.py (100%) rename Pilot1/{Attn1 => Attn}/attn_bin_working_jan7_h5.py (100%) rename Pilot1/{Attn1 => Attn}/attn_bin_working_jan7_h5.sh (100%) rename Pilot1/{Attn1 => Attn}/attn_bsub.sh (100%) rename Pilot1/{Attn1 => Attn}/attn_default_model.txt (100%) rename Pilot1/{Attn1 => Attn}/attn_viz_utils.py (100%) rename Pilot1/{Attn1 => Attn}/cmd1.sh (100%) rename Pilot1/{Attn1 => Attn}/cmd2.sh (100%) diff --git a/Pilot1/Attn1/0 b/Pilot1/Attn/0 similarity index 100% rename from Pilot1/Attn1/0 rename to Pilot1/Attn/0 diff --git a/Pilot1/Attn1/README.md b/Pilot1/Attn/README.md similarity index 100% rename from Pilot1/Attn1/README.md rename to Pilot1/Attn/README.md diff --git a/Pilot1/Attn1/attn.py 
b/Pilot1/Attn/attn.py similarity index 100% rename from Pilot1/Attn1/attn.py rename to Pilot1/Attn/attn.py diff --git a/Pilot1/Attn1/attn_abs_default_model.txt b/Pilot1/Attn/attn_abs_default_model.txt similarity index 100% rename from Pilot1/Attn1/attn_abs_default_model.txt rename to Pilot1/Attn/attn_abs_default_model.txt diff --git a/Pilot1/Attn1/attn_abstention_keras2.py b/Pilot1/Attn/attn_abstention_keras2.py similarity index 100% rename from Pilot1/Attn1/attn_abstention_keras2.py rename to Pilot1/Attn/attn_abstention_keras2.py diff --git a/Pilot1/Attn1/attn_baseline_keras2.py b/Pilot1/Attn/attn_baseline_keras2.py similarity index 100% rename from Pilot1/Attn1/attn_baseline_keras2.py rename to Pilot1/Attn/attn_baseline_keras2.py diff --git a/Pilot1/Attn1/attn_bin_working_jan7_h5.py b/Pilot1/Attn/attn_bin_working_jan7_h5.py similarity index 100% rename from Pilot1/Attn1/attn_bin_working_jan7_h5.py rename to Pilot1/Attn/attn_bin_working_jan7_h5.py diff --git a/Pilot1/Attn1/attn_bin_working_jan7_h5.sh b/Pilot1/Attn/attn_bin_working_jan7_h5.sh similarity index 100% rename from Pilot1/Attn1/attn_bin_working_jan7_h5.sh rename to Pilot1/Attn/attn_bin_working_jan7_h5.sh diff --git a/Pilot1/Attn1/attn_bsub.sh b/Pilot1/Attn/attn_bsub.sh similarity index 100% rename from Pilot1/Attn1/attn_bsub.sh rename to Pilot1/Attn/attn_bsub.sh diff --git a/Pilot1/Attn1/attn_default_model.txt b/Pilot1/Attn/attn_default_model.txt similarity index 100% rename from Pilot1/Attn1/attn_default_model.txt rename to Pilot1/Attn/attn_default_model.txt diff --git a/Pilot1/Attn1/attn_viz_utils.py b/Pilot1/Attn/attn_viz_utils.py similarity index 100% rename from Pilot1/Attn1/attn_viz_utils.py rename to Pilot1/Attn/attn_viz_utils.py diff --git a/Pilot1/Attn1/cmd1.sh b/Pilot1/Attn/cmd1.sh similarity index 100% rename from Pilot1/Attn1/cmd1.sh rename to Pilot1/Attn/cmd1.sh diff --git a/Pilot1/Attn1/cmd2.sh b/Pilot1/Attn/cmd2.sh similarity index 100% rename from Pilot1/Attn1/cmd2.sh rename to Pilot1/Attn/cmd2.sh From 9866cb43e5fb72f851a800fb88098e2a0fca1178 Mon Sep 17 00:00:00 2001 From: Jamal Date: Wed, 6 May 2020 13:36:18 -0600 Subject: [PATCH 285/331] Bug fix: p3b1 not adding shared layers correctly, --- Pilot3/P3B1/p3b1_baseline_keras2.py | 1 + 1 file changed, 1 insertion(+) diff --git a/Pilot3/P3B1/p3b1_baseline_keras2.py b/Pilot3/P3B1/p3b1_baseline_keras2.py index 53df1868..23dc4905 100644 --- a/Pilot3/P3B1/p3b1_baseline_keras2.py +++ b/Pilot3/P3B1/p3b1_baseline_keras2.py @@ -81,6 +81,7 @@ def build_model(gParameters, kerasDefaults, for k in range( len( shared_nnet_spec ) ): layer = Dense( shared_nnet_spec[ k ], activation=gParameters['activation'], name= 'shared_layer_' + str( k ) )( shared_layers[ -1 ] ) + shared_layers.append( layer ) if gParameters['dropout'] > 0: layer = Dropout( gParameters['dropout'] )( shared_layers[ -1 ] ) shared_layers.append( layer ) From d032a9cc54d8c7e1d06537e817ee3be8d6f4168a Mon Sep 17 00:00:00 2001 From: yngtodd Date: Wed, 6 May 2020 16:17:25 -0400 Subject: [PATCH 286/331] Remove loguru We don't need to add this dependency just to print things. 
--- examples/darts/advanced/example.py | 10 ++++------ examples/darts/uno/uno_example.py | 9 ++++----- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/examples/darts/advanced/example.py b/examples/darts/advanced/example.py index 22d800bc..c700a934 100644 --- a/examples/darts/advanced/example.py +++ b/examples/darts/advanced/example.py @@ -4,8 +4,6 @@ from torch.utils.data import DataLoader from torchvision import datasets, transforms -from loguru import logger - import example_setup as bmk import darts import candle @@ -91,10 +89,10 @@ def run(params): scheduler.step() lr = scheduler.get_lr()[0] - logger.info(f'\nEpoch: {epoch} lr: {lr}') + print(f'\nEpoch: {epoch} lr: {lr}') genotype = model.genotype() - logger.info(f'Genotype: {genotype}\n') + print(f'Genotype: {genotype}\n') train( trainloader, @@ -162,7 +160,7 @@ def train(trainloader, meter.update_batch_accuracy(prec1, batch_size) if step % args.log_interval == 0: - logger.info(f'Step: {step} loss: {meter.loss_meter.avg:.4}') + print(f'Step: {step} loss: {meter.loss_meter.avg:.4}') meter.update_epoch() meter.save(args.savepath) @@ -187,7 +185,7 @@ def validate(validloader, model, criterion, args, tasks, meter, device): meter.update_batch_accuracy(prec1, batch_size) if step % args.log_interval == 0: - logger.info(f'>> Validation: {step} loss: {meter.loss_meter.avg:.4}') + print(f'>> Validation: {step} loss: {meter.loss_meter.avg:.4}') meter.update_epoch() meter.save(args.savepath) diff --git a/examples/darts/uno/uno_example.py b/examples/darts/uno/uno_example.py index 1b01ca81..c6e037c3 100644 --- a/examples/darts/uno/uno_example.py +++ b/examples/darts/uno/uno_example.py @@ -2,7 +2,6 @@ import torch.nn as nn from torch import optim from torch.utils.data import DataLoader -from loguru import logger import example_setup as bmk import darts @@ -74,10 +73,10 @@ def run(params): scheduler.step() lr = scheduler.get_lr()[0] - logger.info(f'\nEpoch: {epoch} lr: {lr}') + print(f'\nEpoch: {epoch} lr: {lr}') genotype = model.genotype() - logger.info(f'Genotype: {genotype}\n') + print(f'Genotype: {genotype}\n') train( trainloader, @@ -154,7 +153,7 @@ def train(trainloader, min_accuracy = accuracy_avg if step % args.log_interval == 0: - logger.info(f'Step: {step} loss: {meter.loss_meter.avg:.4}') + print(f'Step: {step} loss: {meter.loss_meter.avg:.4}') meter.update_epoch() meter.save(args.savepath) @@ -179,7 +178,7 @@ def validate(validloader, model, criterion, args, tasks, meter, device): meter.update_batch_accuracy(prec1, batch_size) if step % args.log_interval == 0: - logger.info(f'>> Validation: {step} loss: {meter.loss_meter.avg:.4}') + print(f'>> Validation: {step} loss: {meter.loss_meter.avg:.4}') meter.update_epoch() meter.save(args.savepath) From 2b10631f80475d7fca77f05e185ab71f0df8a000 Mon Sep 17 00:00:00 2001 From: yngtodd Date: Thu, 7 May 2020 13:22:32 -0400 Subject: [PATCH 287/331] Return logging This moves us back to logging instead of print, but uses the std library logging instead of loguru. 
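A minimal sketch of the standard-library pattern adopted here; the logger name and the logged message are illustrative, not taken from the examples:

    import logging

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("darts_example")

    lr = 0.025
    logger.info(f"Epoch: 0 lr: {lr}")
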
--- examples/darts/advanced/example.py | 14 ++++++++++---- examples/darts/uno/uno_example.py | 13 +++++++++---- 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/examples/darts/advanced/example.py b/examples/darts/advanced/example.py index c700a934..d8a91ce3 100644 --- a/examples/darts/advanced/example.py +++ b/examples/darts/advanced/example.py @@ -4,6 +4,8 @@ from torch.utils.data import DataLoader from torchvision import datasets, transforms +import logging + import example_setup as bmk import darts import candle @@ -13,6 +15,10 @@ ) +logging.basicConfig(level = logging.INFO) +logger = logging.getLogger("darts_advanced") + + def initialize_parameters(): """ Initialize the parameters for the Advanced example """ @@ -89,10 +95,10 @@ def run(params): scheduler.step() lr = scheduler.get_lr()[0] - print(f'\nEpoch: {epoch} lr: {lr}') + logger.info(f'\nEpoch: {epoch} lr: {lr}') genotype = model.genotype() - print(f'Genotype: {genotype}\n') + logger.info(f'Genotype: {genotype}\n') train( trainloader, @@ -160,7 +166,7 @@ def train(trainloader, meter.update_batch_accuracy(prec1, batch_size) if step % args.log_interval == 0: - print(f'Step: {step} loss: {meter.loss_meter.avg:.4}') + logger.info(f'Step: {step} loss: {meter.loss_meter.avg:.4}') meter.update_epoch() meter.save(args.savepath) @@ -185,7 +191,7 @@ def validate(validloader, model, criterion, args, tasks, meter, device): meter.update_batch_accuracy(prec1, batch_size) if step % args.log_interval == 0: - print(f'>> Validation: {step} loss: {meter.loss_meter.avg:.4}') + logger.info(f'>> Validation: {step} loss: {meter.loss_meter.avg:.4}') meter.update_epoch() meter.save(args.savepath) diff --git a/examples/darts/uno/uno_example.py b/examples/darts/uno/uno_example.py index c6e037c3..45043174 100644 --- a/examples/darts/uno/uno_example.py +++ b/examples/darts/uno/uno_example.py @@ -1,3 +1,5 @@ +import logging + import torch import torch.nn as nn from torch import optim @@ -7,6 +9,9 @@ import darts import candle +logging.basicConfig(level = logging.INFO) +logger = logging.getLogger("darts_uno") + def initialize_parameters(): """ Initialize the parameters for the Uno example """ @@ -73,10 +78,10 @@ def run(params): scheduler.step() lr = scheduler.get_lr()[0] - print(f'\nEpoch: {epoch} lr: {lr}') + logger.info(f'\nEpoch: {epoch} lr: {lr}') genotype = model.genotype() - print(f'Genotype: {genotype}\n') + logger.info(f'Genotype: {genotype}\n') train( trainloader, @@ -153,7 +158,7 @@ def train(trainloader, min_accuracy = accuracy_avg if step % args.log_interval == 0: - print(f'Step: {step} loss: {meter.loss_meter.avg:.4}') + logger.info(f'Step: {step} loss: {meter.loss_meter.avg:.4}') meter.update_epoch() meter.save(args.savepath) @@ -178,7 +183,7 @@ def validate(validloader, model, criterion, args, tasks, meter, device): meter.update_batch_accuracy(prec1, batch_size) if step % args.log_interval == 0: - print(f'>> Validation: {step} loss: {meter.loss_meter.avg:.4}') + logger.info(f'>> Validation: {step} loss: {meter.loss_meter.avg:.4}') meter.update_epoch() meter.save(args.savepath) From 6e7cb5b90c905f88bc8a48cb774bc1777d2cc69f Mon Sep 17 00:00:00 2001 From: yngtodd Date: Fri, 8 May 2020 10:14:10 -0400 Subject: [PATCH 288/331] Move lr scheduler This removes the Pytorch warning about calling a learning rate scheduler.step() before optimizer.step(). 
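Since PyTorch 1.1.0 the expected order is optimizer.step() first and scheduler.step() afterwards; the old order triggers a UserWarning and makes PyTorch skip the first value of the learning-rate schedule. A minimal sketch of the ordering, with a placeholder model and illustrative hyperparameters rather than the benchmark code itself:

    import torch
    from torch import nn, optim

    model = nn.Linear(8, 1)
    optimizer = optim.SGD(model.parameters(), lr=0.025, momentum=0.9)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10, eta_min=0.001)

    for epoch in range(10):
        optimizer.zero_grad()
        loss = model(torch.randn(4, 8)).sum()
        loss.backward()
        optimizer.step()    # update the weights first
        scheduler.step()    # then advance the learning-rate schedule
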
--- examples/darts/uno/default_model.txt | 1 - examples/darts/uno/example_setup.py | 1 - examples/darts/uno/uno_example.py | 12 ++++++------ 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/examples/darts/uno/default_model.txt b/examples/darts/uno/default_model.txt index 3446cc3e..40f1cc41 100644 --- a/examples/darts/uno/default_model.txt +++ b/examples/darts/uno/default_model.txt @@ -1,6 +1,5 @@ [Global_Params] model_name = 'darts_uno' -unrolled = True data_url = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/' savepath = '.' log_interval = 10 diff --git a/examples/darts/uno/example_setup.py b/examples/darts/uno/example_setup.py index 7d634b08..0a129157 100644 --- a/examples/darts/uno/example_setup.py +++ b/examples/darts/uno/example_setup.py @@ -17,7 +17,6 @@ 'weight_decay', 'grad_clip', 'seed', - 'unrolled', 'batch_size', 'epochs', ] diff --git a/examples/darts/uno/uno_example.py b/examples/darts/uno/uno_example.py index 45043174..e7e3b17f 100644 --- a/examples/darts/uno/uno_example.py +++ b/examples/darts/uno/uno_example.py @@ -39,8 +39,6 @@ def run(params): train_data = darts.Uno('./data', 'train', download=True) valid_data = darts.Uno('./data', 'test') - #train_data = darts.sample(train_data, len(valid_data)) - trainloader = DataLoader(train_data, batch_size=args.batch_size) validloader = DataLoader(valid_data, batch_size=args.batch_size) @@ -76,7 +74,6 @@ def run(params): for epoch in range(args.epochs): - scheduler.step() lr = scheduler.get_lr()[0] logger.info(f'\nEpoch: {epoch} lr: {lr}') @@ -89,7 +86,7 @@ def run(params): architecture, criterion, optimizer, - lr, + scheduler, args, tasks, train_meter, @@ -106,7 +103,7 @@ def train(trainloader, architecture, criterion, optimizer, - lr, + scheduler, args, tasks, meter, @@ -128,6 +125,8 @@ def train(trainloader, x_search = darts.to_device(x_search, device) target_search = darts.to_device(target_search, device) + lr = scheduler.get_lr()[0] + # 1. update alpha architecture.step( data, @@ -136,7 +135,7 @@ def train(trainloader, target_search, lr, optimizer, - unrolled=args.unrolled + unrolled=False ) logits = model(data) @@ -147,6 +146,7 @@ def train(trainloader, loss.backward() nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip) optimizer.step() + scheduler.step() prec1 = darts.multitask_accuracy_topk(logits, target, topk=(1,)) meter.update_batch_loss(loss.item(), batch_size) From d1a0f756638ef85d0dc2e56dc482eb0f836e8c0f Mon Sep 17 00:00:00 2001 From: yngtodd Date: Fri, 8 May 2020 10:26:54 -0400 Subject: [PATCH 289/331] Move scheduler This corrects the lr scheduler step being called before the optimizer step warning. 
--- examples/darts/advanced/default_model.txt | 1 - examples/darts/advanced/example.py | 10 ++++++---- examples/darts/advanced/example_setup.py | 1 - 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/darts/advanced/default_model.txt b/examples/darts/advanced/default_model.txt index c0130a69..2df0df62 100644 --- a/examples/darts/advanced/default_model.txt +++ b/examples/darts/advanced/default_model.txt @@ -1,6 +1,5 @@ [Global_Params] model_name = 'darts_uno' -unrolled = False data_url = 'ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/' savepath = './results' log_interval = 10 diff --git a/examples/darts/advanced/example.py b/examples/darts/advanced/example.py index d8a91ce3..77003f64 100644 --- a/examples/darts/advanced/example.py +++ b/examples/darts/advanced/example.py @@ -93,7 +93,6 @@ def run(params): for epoch in range(args.epochs): - scheduler.step() lr = scheduler.get_lr()[0] logger.info(f'\nEpoch: {epoch} lr: {lr}') @@ -106,7 +105,7 @@ def run(params): architecture, criterion, optimizer, - lr, + scheduler, args, tasks, train_meter, @@ -121,7 +120,7 @@ def train(trainloader, architecture, criterion, optimizer, - lr, + scheduler, args, tasks, meter, @@ -141,6 +140,8 @@ def train(trainloader, x_search = darts.to_device(x_search, device) target_search = darts.to_device(target_search, device) + lr = scheduler.get_lr()[0] + # 1. update alpha architecture.step( data, @@ -149,7 +150,7 @@ def train(trainloader, target_search, lr, optimizer, - unrolled=args.unrolled + unrolled=False ) logits = model(data) @@ -160,6 +161,7 @@ def train(trainloader, loss.backward() nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip) optimizer.step() + scheduler.step() prec1 = darts.multitask_accuracy_topk(logits, target, topk=(1,)) meter.update_batch_loss(loss.item(), batch_size) diff --git a/examples/darts/advanced/example_setup.py b/examples/darts/advanced/example_setup.py index 12d19b73..c47a6682 100644 --- a/examples/darts/advanced/example_setup.py +++ b/examples/darts/advanced/example_setup.py @@ -17,7 +17,6 @@ 'weight_decay', 'grad_clip', 'seed', - 'unrolled', 'batch_size', 'epochs', ] From 55384a91e4f414261cbbfa77ed2c422372951c33 Mon Sep 17 00:00:00 2001 From: yngtodd Date: Fri, 8 May 2020 10:35:53 -0400 Subject: [PATCH 290/331] Refactor READMEs We don't need to repeat information about the algorithm in each README. This makes things less redundant and easier to navigate. --- examples/darts/README.rst | 54 ++++++++++++++++++++++++++ examples/darts/advanced/README.rst | 61 ++++-------------------------- examples/darts/uno/README.rst | 53 -------------------------- 3 files changed, 62 insertions(+), 106 deletions(-) diff --git a/examples/darts/README.rst b/examples/darts/README.rst index 15322c13..b8869eda 100644 --- a/examples/darts/README.rst +++ b/examples/darts/README.rst @@ -9,3 +9,57 @@ Our recommended ordering of examples: 2. **Advanced**: how to define our own neural network primitives to be optimized by DARTS. + +Differentiable architecture search + +This is an adaptation of Hanxiao Liu et al's DARTS algorithm, extending +the work to handle convolutional neural networks for NLP problems and more. +Details of the original authors' approach can be found in their 2019 ICLR paper_. + +DARTS works by composing various neural net primitives, defined as Pytorch *nn.Modules*, +to create a larger directed acyclic graph (DAG) that is to be your model. 
This +composition is differentiable as we take the softmax of the choice of primitive types +at each layer of the network. To make this more clear, let's first define a few abstractions +in the algorithm: + +1. **Primitve**: this is the fundamental block of computation, defined as an *nn.Module*. + At each layer of your network, one of these primitves will be chosen by taking the + softmax of all possible primitives at that layer. Examples could be a convolution block, + a linear layer, a skip connect, or anything that you can come up with (subject to a few + constraints). + +2. **Cell**: this is an abstraction that holds each of the primitive types for level of your + network. This is where we perform the softmax over the possible primitive types. + +3. **Nodes**: this is the level of abstraction that would normally be considered a layer in + your network. It can contain one or more *Cells*. + +4. **Architecture**: The abstraction that contains all nodes in the graph. This computes a + Hessian product with respect to the *alpha* parameters as defined in the paper. + +5. **Genotype**: genotypes are instances of a particular configuration of the graph. As the + optimization runs, and each cell computes the softmax over their primitive types, the final + configuration of all nodes with their resulting primitive is a genotype. + +In the DARTS algorithm, we define a number of primitives that we would like to compose together +to form our neural network. The original paper started with 8 primitive types. These types +were originally designed for a vision task, and largely consist of convolution type operations. +We have since adapted these types for the *P3B5* benchmark, creating 1D convolution types for +our NLP tasks. If you would like to see how these primitives are defined, along with their +necessary constructors used by DARTS, you can find them in +`darts.modules.operations.conv.py`_. + +These primitives are then contained within a cell, and one or more cells are contained within a +node in the graph. DARTS then works by composing these nodes together and taking the softmax over +their primitives in each cell. Finally, the *Architecture* abstraction contains all nodes, and is +responsible for differentiating the composition of the nodes with respect to two *alpha* parameters +as defined in the paper. The end result is that we have a differentiable model that composes its +components as the model is training. + +As the optimization runs, the model will print the resulting loss with respect to a given *Genotype*. +The final model will be the *Genotype* with corresponding to the lowest loss. + +.. References +.. ---------- +.. _paper: https://openreview.net/forum?id=S1eYHoC5FX +.. _darts.modules.operations.conv.py: ../../../common/darts/modules/operations/conv.py diff --git a/examples/darts/advanced/README.rst b/examples/darts/advanced/README.rst index d9197271..c41e63c1 100644 --- a/examples/darts/advanced/README.rst +++ b/examples/darts/advanced/README.rst @@ -2,59 +2,6 @@ DARTS Advanced ============== - -Differentiable architecture search - -This is an adaptation of Hanxiao Liu et al's DARTS algorithm, extending -the work to handle convolutional neural networks for NLP problems and more. -Details of the original authors' approach can be found in their 2019 ICLR paper_. - -DARTS works by composing various neural net primitives, defined as Pytorch *nn.Modules*, -to create a larger directed acyclic graph (DAG) that is to be your model. 
This -composition is differentiable as we take the softmax of the choice of primitive types -at each layer of the network. To make this more clear, let's first define a few abstractions -in the algorithm: - -1. **Primitve**: this is the fundamental block of computation, defined as an *nn.Module*. - At each layer of your network, one of these primitves will be chosen by taking the - softmax of all possible primitives at that layer. Examples could be a convolution block, - a linear layer, a skip connect, or anything that you can come up with (subject to a few - constraints). - -2. **Cell**: this is an abstraction that holds each of the primitive types for level of your - network. This is where we perform the softmax over the possible primitive types. - -3. **Nodes**: this is the level of abstraction that would normally be considered a layer in - your network. It can contain one or more *Cells*. - -4. **Architecture**: The abstraction that contains all nodes in the graph. This computes a - Hessian product with respect to the *alpha* parameters as defined in the paper. - -5. **Genotype**: genotypes are instances of a particular configuration of the graph. As the - optimization runs, and each cell computes the softmax over their primitive types, the final - configuration of all nodes with their resulting primitive is a genotype. - -In the DARTS algorithm, we define a number of primitives that we would like to compose together -to form our neural network. The original paper started with 8 primitive types. These types -were originally designed for a vision task, and largely consist of convolution type operations. -We have since adapted these types for the *P3B5* benchmark, creating 1D convolution types for -our NLP tasks. If you would like to see how these primitives are defined, along with their -necessary constructors used by DARTS, you can find them in -`darts.modules.operations.conv.py`_. - -These primitives are then contained within a cell, and one or more cells are contained within a -node in the graph. DARTS then works by composing these nodes together and taking the softmax over -their primitives in each cell. Finally, the *Architecture* abstraction contains all nodes, and is -responsible for differentiating the composition of the nodes with respect to two *alpha* parameters -as defined in the paper. The end result is that we have a differentiable model that composes its -components as the model is training. - -As the optimization runs, the model will print the resulting loss with respect to a given *Genotype*. -The final model will be the *Genotype* with corresponding to the lowest loss. - -Adnvanced Example ------------------ - In this example we will take a look at how to define our own primitives to be handled by DARTS. If you have not read the `Uno example`_, I would recommend taking a look at that first. There we showed how we can use the built in primitives to DARTS. As reference, you can also look to see how those @@ -174,6 +121,14 @@ we must specify how many features will go into our final fully connected layer o Finally, to run this example: +First, make sure that you can get the example data by installing `torchvision`: + +.. code-block:: + + pip install torchvision + +Then run the example with + .. 
code-block:: python example.py diff --git a/examples/darts/uno/README.rst b/examples/darts/uno/README.rst index ac869b0c..f45fffc4 100644 --- a/examples/darts/uno/README.rst +++ b/examples/darts/uno/README.rst @@ -2,59 +2,6 @@ DARTS UNO ========= - -Differentiable architecture search - -This is an adaptation of Hanxiao Liu et al's DARTS algorithm, extending -the work to handle convolutional neural networks for NLP problems and more. -Details of the original authors' approach can be found in their 2019 ICLR paper_. - -DARTS works by composing various neural net primitives, defined as Pytorch *nn.Modules*, -to create a larger directed acyclic graph (DAG) that is to be your model. This -composition is differentiable as we take the softmax of the choice of primitive types -at each layer of the network. To make this more clear, let's first define a few abstractions -in the algorithm: - -1. **Primitve**: this is the fundamental block of computation, defined as an *nn.Module*. - At each layer of your network, one of these primitves will be chosen by taking the - softmax of all possible primitives at that layer. Examples could be a convolution block, - a linear layer, a skip connect, or anything that you can come up with (subject to a few - constraints). - -2. **Cell**: this is an abstraction that holds each of the primitive types for level of your - network. This is where we perform the softmax over the possible primitive types. - -3. **Nodes**: this is the level of abstraction that would normally be considered a layer in - your network. It can contain one or more *Cells*. - -4. **Architecture**: The abstraction that contains all nodes in the graph. This computes a - Hessian product with respect to the *alpha* parameters as defined in the paper. - -5. **Genotype**: genotypes are instances of a particular configuration of the graph. As the - optimization runs, and each cell computes the softmax over their primitive types, the final - configuration of all nodes with their resulting primitive is a genotype. - -In the DARTS algorithm, we define a number of primitives that we would like to compose together -to form our neural network. The original paper started with 8 primitive types. These types -were originally designed for a vision task, and largely consist of convolution type operations. -We have since adapted these types for the *P3B5* benchmark, creating 1D convolution types for -our NLP tasks. If you would like to see how these primitives are defined, along with their -necessary constructors used by DARTS, you can find them in -`darts.modules.operations.conv.py`_. - -These primitives are then contained within a cell, and one or more cells are contained within a -node in the graph. DARTS then works by composing these nodes together and taking the softmax over -their primitives in each cell. Finally, the *Architecture* abstraction contains all nodes, and is -responsible for differentiating the composition of the nodes with respect to two *alpha* parameters -as defined in the paper. The end result is that we have a differentiable model that composes its -components as the model is training. - -As the optimization runs, the model will print the resulting loss with respect to a given *Genotype*. -The final model will be the *Genotype* with corresponding to the lowest loss. - -UNO Example ------------ - Let's take a look at a look at using DARTS for the Pilot 1 Uno example. In the Uno problem the task is to classify tumor dose response with respect to a few different data sources. 
For simplicity, we will use one source, Uno's gene data, to be used From d594fb25cc1b441e9c8747aa33d8419a8e8541f3 Mon Sep 17 00:00:00 2001 From: yngtodd Date: Fri, 8 May 2020 10:39:14 -0400 Subject: [PATCH 291/331] Clean up formatting This should make the README a bit more palatable. --- examples/darts/README.rst | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/examples/darts/README.rst b/examples/darts/README.rst index b8869eda..8b71b1e9 100644 --- a/examples/darts/README.rst +++ b/examples/darts/README.rst @@ -2,6 +2,11 @@ DARTS Examples ============== +Differentiable architecture search + +TLDR +---- + Our recommended ordering of examples: 1. **Uno**: learn how to use the neural network building blocks in DARTS to @@ -10,7 +15,8 @@ Our recommended ordering of examples: 2. **Advanced**: how to define our own neural network primitives to be optimized by DARTS. -Differentiable architecture search +The Algorithm +------------- This is an adaptation of Hanxiao Liu et al's DARTS algorithm, extending the work to handle convolutional neural networks for NLP problems and more. From b986926d06b381903eed5c33e5bd03971ad41253 Mon Sep 17 00:00:00 2001 From: yngtodd Date: Fri, 8 May 2020 10:41:54 -0400 Subject: [PATCH 292/331] Create run section THis makes it easier to see the instructions to run the example. --- examples/darts/advanced/README.rst | 3 ++- examples/darts/uno/README.rst | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/darts/advanced/README.rst b/examples/darts/advanced/README.rst index c41e63c1..0da11fdf 100644 --- a/examples/darts/advanced/README.rst +++ b/examples/darts/advanced/README.rst @@ -119,7 +119,8 @@ of the primitives must have the same number of input and output features, this w of features from any of your primitives. Since DARTS cannot know ahead of time what your primitives will be, we must specify how many features will go into our final fully connected layer of the network. -Finally, to run this example: +Run the Example +--------------- First, make sure that you can get the example data by installing `torchvision`: diff --git a/examples/darts/uno/README.rst b/examples/darts/uno/README.rst index f45fffc4..1e421e98 100644 --- a/examples/darts/uno/README.rst +++ b/examples/darts/uno/README.rst @@ -62,7 +62,8 @@ data and labels of the training set, but also the data and labels of our validat simplicity of this tutorial, *x_search* and *target_search* are from our training set, but these would normally use a separate validation set. -Finally, to run this example: +Run the Example +--------------- .. code-block:: From 7f1e99b0cc6bdcfb826695d7d151ec526a8e10f7 Mon Sep 17 00:00:00 2001 From: yngtodd Date: Fri, 8 May 2020 10:55:44 -0400 Subject: [PATCH 293/331] Use learning_rate and learning_rate_min This keeps things consisten with CANDLE conventions, and removes potentially confusing parameters. 
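For reference, a rough sketch of how the renamed keys are consumed once the config file is parsed; the Args stand-in and the values below are illustrative placeholders, not CANDLE code:

    from torch import nn, optim

    class Args:  # stand-in for the parsed benchmark parameters
        learning_rate = 0.025
        learning_rate_min = 0.001
        momentum = 0.9
        weight_decay = 3e-4
        epochs = 10

    args = Args()
    model = nn.Linear(8, 1)
    optimizer = optim.SGD(model.parameters(), args.learning_rate,
                          momentum=args.momentum, weight_decay=args.weight_decay)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(
        optimizer, float(args.epochs), eta_min=args.learning_rate_min)
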
--- examples/darts/advanced/default_model.txt | 4 +--- examples/darts/advanced/example.py | 4 ++-- examples/darts/uno/default_model.txt | 4 +--- examples/darts/uno/uno_example.py | 4 ++-- 4 files changed, 6 insertions(+), 10 deletions(-) diff --git a/examples/darts/advanced/default_model.txt b/examples/darts/advanced/default_model.txt index 2df0df62..239a569e 100644 --- a/examples/darts/advanced/default_model.txt +++ b/examples/darts/advanced/default_model.txt @@ -4,7 +4,7 @@ data_url = 'ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/' savepath = './results' log_interval = 10 train_data = 'top_21_auc_1fold.uno.h5' -learning_rate = 0.01 +learning_rate = 0.025 learning_rate_min = 0.001 momentum = 0.9 weight_decay = 3e-4 @@ -12,6 +12,4 @@ grad_clip = 5 batch_size = 100 epochs = 10 seed = 13 -lr = 0.025 -lr_min = 0.001 diff --git a/examples/darts/advanced/example.py b/examples/darts/advanced/example.py index 77003f64..f5e5a2ce 100644 --- a/examples/darts/advanced/example.py +++ b/examples/darts/advanced/example.py @@ -77,7 +77,7 @@ def run(params): optimizer = optim.SGD( model.parameters(), - args.lr, + args.learning_rate, momentum=args.momentum, weight_decay=args.weight_decay ) @@ -85,7 +85,7 @@ def run(params): scheduler = optim.lr_scheduler.CosineAnnealingLR( optimizer, float(args.epochs), - eta_min=args.lr_min + eta_min=args.learning_rate_min ) train_meter = darts.EpochMeter(tasks, 'train') diff --git a/examples/darts/uno/default_model.txt b/examples/darts/uno/default_model.txt index 40f1cc41..78d7c325 100644 --- a/examples/darts/uno/default_model.txt +++ b/examples/darts/uno/default_model.txt @@ -4,7 +4,7 @@ data_url = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/' savepath = '.' log_interval = 10 train_data = 'top_21_auc_1fold.uno.h5' -learning_rate = 0.01 +learning_rate = 0.025 learning_rate_min = 0.001 momentum = 0.9 weight_decay = 3e-4 @@ -12,6 +12,4 @@ grad_clip = 5 batch_size = 100 epochs = 10 seed = 13 -lr = 0.025 -lr_min = 0.001 diff --git a/examples/darts/uno/uno_example.py b/examples/darts/uno/uno_example.py index e7e3b17f..c8643805 100644 --- a/examples/darts/uno/uno_example.py +++ b/examples/darts/uno/uno_example.py @@ -56,7 +56,7 @@ def run(params): optimizer = optim.SGD( model.parameters(), - args.lr, + args.learning_rate, momentum=args.momentum, weight_decay=args.weight_decay ) @@ -64,7 +64,7 @@ def run(params): scheduler = optim.lr_scheduler.CosineAnnealingLR( optimizer, float(args.epochs), - eta_min=args.lr_min + eta_min=args.learning_rate_min ) train_meter = darts.EpochMeter(tasks, 'train') From 287b024ba02fc2cedf2f577df8b4a97e4219fa61 Mon Sep 17 00:00:00 2001 From: Jamal Date: Fri, 8 May 2020 09:08:56 -0600 Subject: [PATCH 294/331] Removed deprecated solr_root keyword. 
--- Pilot1/Attn/attn.py | 1 - Pilot1/Attn/attn_abs_default_model.txt | 1 - Pilot1/Attn/attn_abstention_keras2.py | 1 - Pilot1/Attn/attn_default_model.txt | 2 -- Pilot1/Combo/combo.py | 2 +- Pilot1/Combo/combo_default_model.txt | 1 - Pilot1/Combo/combo_perf_bench_model.txt | 1 - Pilot1/P1B1/p1b1.py | 1 - Pilot1/P1B1/p1b1_default_model.txt | 1 - Pilot1/P1B1/p1b1_perf_bench_model.txt | 1 - Pilot1/Uno/uno.py | 1 - Pilot1/Uno/uno_auc_clr_model.txt | 1 - Pilot1/UnoMT/unoMT.py | 1 - 13 files changed, 1 insertion(+), 14 deletions(-) diff --git a/Pilot1/Attn/attn.py b/Pilot1/Attn/attn.py index 9676dc52..aaeb9526 100644 --- a/Pilot1/Attn/attn.py +++ b/Pilot1/Attn/attn.py @@ -79,7 +79,6 @@ 'latent_dim', 'batch_normalization', 'epsilon_std', - 'solr_root', 'timeout' ] diff --git a/Pilot1/Attn/attn_abs_default_model.txt b/Pilot1/Attn/attn_abs_default_model.txt index 86d22b50..442c5c7d 100644 --- a/Pilot1/Attn/attn_abs_default_model.txt +++ b/Pilot1/Attn/attn_abs_default_model.txt @@ -24,5 +24,4 @@ save_path='save_abs/EXP01/' target_abs_acc=0.85 [Monitor_Params] -solr_root='' timeout=3600 diff --git a/Pilot1/Attn/attn_abstention_keras2.py b/Pilot1/Attn/attn_abstention_keras2.py index a321bfa3..3d492141 100644 --- a/Pilot1/Attn/attn_abstention_keras2.py +++ b/Pilot1/Attn/attn_abstention_keras2.py @@ -58,7 +58,6 @@ 'optimizer', 'rng_seed', 'val_split', - 'solr_root', 'timeout', 'target_abs_acc'] diff --git a/Pilot1/Attn/attn_default_model.txt b/Pilot1/Attn/attn_default_model.txt index a0e03982..0fc2b03d 100644 --- a/Pilot1/Attn/attn_default_model.txt +++ b/Pilot1/Attn/attn_default_model.txt @@ -22,9 +22,7 @@ use_cp=False early_stop=True reduce_lr=True feature_subsample=0 -output_dir='./save/001/' save_path='./save/001/' [Monitor_Params] -solr_root='' timeout=3600 diff --git a/Pilot1/Combo/combo.py b/Pilot1/Combo/combo.py index f12d0bcf..72e2b37f 100644 --- a/Pilot1/Combo/combo.py +++ b/Pilot1/Combo/combo.py @@ -93,7 +93,7 @@ required = [ 'activation', 'batch_size', 'dense', 'dense_feature_layers', 'dropout', 'epochs', 'learning_rate', 'loss', 'optimizer', 'residual', 'rng_seed', 'save_path', 'scaling', 'feature_subsample', 'val_split', - 'solr_root', 'timeout' + 'timeout' ] class BenchmarkCombo(candle.Benchmark): diff --git a/Pilot1/Combo/combo_default_model.txt b/Pilot1/Combo/combo_default_model.txt index 3b57a979..d25d2d5d 100644 --- a/Pilot1/Combo/combo_default_model.txt +++ b/Pilot1/Combo/combo_default_model.txt @@ -28,5 +28,4 @@ use_combo_score=False verbose = False [Monitor_Params] -solr_root='' timeout=3600 diff --git a/Pilot1/Combo/combo_perf_bench_model.txt b/Pilot1/Combo/combo_perf_bench_model.txt index 0a5b1b32..db404931 100644 --- a/Pilot1/Combo/combo_perf_bench_model.txt +++ b/Pilot1/Combo/combo_perf_bench_model.txt @@ -29,5 +29,4 @@ verbose=False use_landmark_genes=True [Monitor_Params] -solr_root='' timeout=3600 diff --git a/Pilot1/P1B1/p1b1.py b/Pilot1/P1B1/p1b1.py index ff85605a..bb6f52a4 100644 --- a/Pilot1/P1B1/p1b1.py +++ b/Pilot1/P1B1/p1b1.py @@ -94,7 +94,6 @@ 'feature_subsample', 'batch_normalization', 'epsilon_std', - 'solr_root', 'timeout' ] diff --git a/Pilot1/P1B1/p1b1_default_model.txt b/Pilot1/P1B1/p1b1_default_model.txt index 486f0371..8f915d2e 100644 --- a/Pilot1/P1B1/p1b1_default_model.txt +++ b/Pilot1/P1B1/p1b1_default_model.txt @@ -26,5 +26,4 @@ alpha_dropout=False save_path='save/' [Monitor_Params] -solr_root='' timeout=3600 diff --git a/Pilot1/P1B1/p1b1_perf_bench_model.txt b/Pilot1/P1B1/p1b1_perf_bench_model.txt index 01ffc46c..04c791b7 100644 --- 
a/Pilot1/P1B1/p1b1_perf_bench_model.txt +++ b/Pilot1/P1B1/p1b1_perf_bench_model.txt @@ -27,5 +27,4 @@ save='save' use_landmark_genes=True [Monitor_Params] -solr_root='' timeout=3600 diff --git a/Pilot1/Uno/uno.py b/Pilot1/Uno/uno.py index 74581ef6..bebea29c 100644 --- a/Pilot1/Uno/uno.py +++ b/Pilot1/Uno/uno.py @@ -232,6 +232,5 @@ def set_locals(self): 'save_path', 'scaling', 'val_split', - 'solr_root', 'timeout' ] diff --git a/Pilot1/Uno/uno_auc_clr_model.txt b/Pilot1/Uno/uno_auc_clr_model.txt index 3cc0ea68..30893584 100644 --- a/Pilot1/Uno/uno_auc_clr_model.txt +++ b/Pilot1/Uno/uno_auc_clr_model.txt @@ -41,7 +41,6 @@ single=True timeout=-1 [Monitor_Params] -solr_root='' [CLR_Params] clr_flag = True diff --git a/Pilot1/UnoMT/unoMT.py b/Pilot1/UnoMT/unoMT.py index 3b4eaa0e..b18a4bdb 100644 --- a/Pilot1/UnoMT/unoMT.py +++ b/Pilot1/UnoMT/unoMT.py @@ -271,7 +271,6 @@ 'epochs', 'rng_seed', 'val_split', - 'solr_root', 'timeout', ] From 2fccd855d73bd86d0edad97dc93117bfd2d144bd Mon Sep 17 00:00:00 2001 From: yngtodd Date: Fri, 8 May 2020 11:11:15 -0400 Subject: [PATCH 295/331] Add link to Pytorch This gives everyone a heads up that they will need Pytorch to run this example. --- examples/darts/README.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/examples/darts/README.rst b/examples/darts/README.rst index 8b71b1e9..0da65854 100644 --- a/examples/darts/README.rst +++ b/examples/darts/README.rst @@ -65,7 +65,14 @@ components as the model is training. As the optimization runs, the model will print the resulting loss with respect to a given *Genotype*. The final model will be the *Genotype* with corresponding to the lowest loss. +Setup +----- + +Darts makes use of Pytorch. You can find binaries for both Pytorch and Torchvision (used in the advanced +example) at the `pytorch website`_. + .. References .. ---------- .. _paper: https://openreview.net/forum?id=S1eYHoC5FX .. _darts.modules.operations.conv.py: ../../../common/darts/modules/operations/conv.py +.. _pytorch website: https://pytorch.org/ From 9733e3cd14ff935ebf4522a13e3a37a01be4d733 Mon Sep 17 00:00:00 2001 From: yngtodd Date: Fri, 8 May 2020 11:13:05 -0400 Subject: [PATCH 296/331] Move setup to TLDR People may not see it at the bottom. --- examples/darts/README.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/darts/README.rst b/examples/darts/README.rst index 0da65854..3a03eac0 100644 --- a/examples/darts/README.rst +++ b/examples/darts/README.rst @@ -15,6 +15,12 @@ Our recommended ordering of examples: 2. **Advanced**: how to define our own neural network primitives to be optimized by DARTS. +Setup +----- + +Darts makes use of Pytorch. You can find binaries for both Pytorch and Torchvision (used in the advanced +example) at the `pytorch website`_. + The Algorithm ------------- @@ -65,12 +71,6 @@ components as the model is training. As the optimization runs, the model will print the resulting loss with respect to a given *Genotype*. The final model will be the *Genotype* with corresponding to the lowest loss. -Setup ------ - -Darts makes use of Pytorch. You can find binaries for both Pytorch and Torchvision (used in the advanced -example) at the `pytorch website`_. - .. References .. ---------- .. 
_paper: https://openreview.net/forum?id=S1eYHoC5FX From 7e104e1ab6fbeb168d4e63749b6931301456eda9 Mon Sep 17 00:00:00 2001 From: Jamal Mohd-Yusof Date: Fri, 8 May 2020 10:19:07 -0600 Subject: [PATCH 297/331] Update README.setup.linux --- README.setup.linux | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/README.setup.linux b/README.setup.linux index fcffb5f2..daaf82a5 100644 --- a/README.setup.linux +++ b/README.setup.linux @@ -22,7 +22,14 @@ conda install -y -c anaconda scikit-learn conda install -y -c anaconda matplotlib conda install -y -c conda-forge pygpu conda install -y -c anaconda pytorch -pip install loguru +conda install numba +conda install astropy +conda install patsy +conda install statsmodels +conda install requests +conda install torch +conda install pytorch +conda install -c conda-forge tqdm conda update -c conda-forge numpy # Download the source files for the benchmarks From be43691f2a87b4b8d737e8998a1e917bbd1d6f45 Mon Sep 17 00:00:00 2001 From: Jamal Date: Tue, 12 May 2020 12:22:05 -0600 Subject: [PATCH 298/331] Fixed up various benchmarks to conform to CANDLE spec. --- Pilot1/NT3/nt3_baseline_keras2.py | 2 +- Pilot1/NT3/nt3_baseline_keras2_tensorrt.py | 49 ++++++++++++++-------- Pilot1/NT3/training.log | 9 +--- Pilot1/UnoMT/unoMT_baseline_pytorch.py | 4 ++ Pilot1/UnoMT/unoMT_pytorch_model.py | 18 ++++---- Pilot3/P3B4/tf_mthcan.py | 48 ++++++++++----------- common/file_utils.py | 6 +++ common/solr_keras.py | 26 ------------ examples/ADRP/adrp_baseline_keras2.py | 18 ++++---- examples/darts/advanced/default_model.txt | 7 ++-- examples/darts/advanced/example.py | 4 +- examples/darts/advanced/example_setup.py | 12 +++++- examples/darts/uno/default_model.txt | 7 ++-- examples/darts/uno/example_setup.py | 16 ++++++- examples/darts/uno/uno_example.py | 7 ++-- 15 files changed, 124 insertions(+), 109 deletions(-) diff --git a/Pilot1/NT3/nt3_baseline_keras2.py b/Pilot1/NT3/nt3_baseline_keras2.py index 582d4584..36aa65df 100644 --- a/Pilot1/NT3/nt3_baseline_keras2.py +++ b/Pilot1/NT3/nt3_baseline_keras2.py @@ -26,7 +26,7 @@ def initialize_parameters(default_model = 'nt3_default_model.txt'): # Build benchmark object nt3Bmk = bmk.BenchmarkNT3(bmk.file_path, default_model, 'keras', - prog='nt3_baseline', desc='Multi-task (DNN) for data extraction from clinical reports - Pilot 3 Benchmark 1') + prog='nt3_baseline', desc='1D CNN to classify RNA sequence data in normal or tumor classes') # Initialize parameters gParameters = candle.finalize_parameters(nt3Bmk) diff --git a/Pilot1/NT3/nt3_baseline_keras2_tensorrt.py b/Pilot1/NT3/nt3_baseline_keras2_tensorrt.py index 429a4d2f..64303e28 100644 --- a/Pilot1/NT3/nt3_baseline_keras2_tensorrt.py +++ b/Pilot1/NT3/nt3_baseline_keras2_tensorrt.py @@ -28,11 +28,8 @@ lib_path2 = os.path.abspath(os.path.join(file_path, '..', '..', 'common')) sys.path.append(lib_path2) -import data_utils -import p1_common, p1_common_keras -from solr_keras import CandleRemoteMonitor, compute_trainable_params, TerminateOnTimeOut - - +import nt3 as bmk +import candle ''' Import Tensorflow Modules ''' import tensorflow as tf @@ -54,6 +51,7 @@ #P = 60483 # 60483 #DR = 0.1 # Dropout rate +''' def common_parser(parser): parser.add_argument("--config_file", dest='config_file', type=str, @@ -117,7 +115,19 @@ def initialize_parameters(): # Consolidate parameter set. 
Command-line parameters overwrite file configuration gParameters = p1_common.args_overwrite_config(args, fileParameters) return gParameters + ''' + +def initialize_parameters(default_model = 'nt3_default_model.txt'): + + # Build benchmark object + nt3Bmk = bmk.BenchmarkNT3(bmk.file_path, default_model, 'keras', + prog='nt3_baseline_tensorrt', desc='1D CNN to classify RNA sequence data in normal or tumor classes') + # Initialize parameters + gParameters = candle.finalize_parameters(nt3Bmk) + #benchmark.logger.info('Params: {}'.format(gParameters)) + + return gParameters def load_data(train_path, test_path, gParameters): @@ -164,8 +174,8 @@ def run(gParameters): file_test = gParameters['test_data'] url = gParameters['data_url'] - train_file = data_utils.get_file(file_train, url+file_train, cache_subdir='Pilot1') - test_file = data_utils.get_file(file_test, url+file_test, cache_subdir='Pilot1') + train_file = candle.get_file(file_train, url+file_train, cache_subdir='Pilot1') + test_file = candle.get_file(file_test, url+file_test, cache_subdir='Pilot1') X_train, Y_train, X_test, Y_test = load_data(train_file, test_file, gParameters) @@ -224,6 +234,7 @@ def run(gParameters): #model.add(Dense(gParameters['classes'])) #model.add(Activation(gParameters['out_activation']), name='activation_5') model.add(Dense(gParameters['classes'], activation=gParameters['out_activation'], name='activation_5')) + #Reference case #model.add(Conv1D(filters=128, kernel_size=20, strides=1, padding='valid', input_shape=(P, 1))) #model.add(Activation('relu')) @@ -241,10 +252,10 @@ def run(gParameters): #model.add(Dense(CLASSES)) #model.add(Activation('softmax')) - kerasDefaults = p1_common.keras_default_config() + kerasDefaults = candle.keras_default_config() # Define optimizer - optimizer = p1_common_keras.build_optimizer(gParameters['optimizer'], + optimizer = candle.build_optimizer(gParameters['optimizer'], gParameters['learning_rate'], kerasDefaults) @@ -264,7 +275,7 @@ def run(gParameters): os.makedirs(output_dir) # calculate trainable and non-trainable params - gParameters.update(compute_trainable_params(model)) + gParameters.update(candle.compute_trainable_params(model)) # set up a bunch of callbacks to do work during model training.. 
model_name = gParameters['model_name'] @@ -272,11 +283,11 @@ def run(gParameters): # checkpointer = ModelCheckpoint(filepath=path, verbose=1, save_weights_only=False, save_best_only=True) csv_logger = CSVLogger('{}/training.log'.format(output_dir)) reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=10, verbose=1, mode='auto', epsilon=0.0001, cooldown=0, min_lr=0) - candleRemoteMonitor = CandleRemoteMonitor(params=gParameters) - timeoutMonitor = TerminateOnTimeOut(TIMEOUT) + candleRemoteMonitor = candle.CandleRemoteMonitor(params=gParameters) + timeoutMonitor = candle.TerminateOnTimeOut(TIMEOUT) history = model.fit(X_train, Y_train, batch_size=gParameters['batch_size'], - epochs=2, #gParameters['epochs'], + epochs=gParameters['epochs'], verbose=1, validation_data=(X_test, Y_test), callbacks = [csv_logger, reduce_lr, candleRemoteMonitor, timeoutMonitor]) @@ -286,10 +297,14 @@ def run(gParameters): #Begin tensorrt code config = { # Where to save models (Tensorflow + TensorRT) - "graphdef_file": "/gpfs/jlse-fs0/users/pbalapra/tensorrt/Benchmarks/Pilot1/NT3/nt3.pb", - "frozen_model_file": "/gpfs/jlse-fs0/users/pbalapra/tensorrt/Benchmarks/Pilot1/NT3/nt3_frozen_model.pb", - "snapshot_dir": "/gpfs/jlse-fs0/users/pbalapra/tensorrt/Benchmarks/Pilot1/NT3/snapshot", - "engine_save_dir": "/gpfs/jlse-fs0/users/pbalapra/tensorrt/Benchmarks/Pilot1/NT3", + #"graphdef_file": "/gpfs/jlse-fs0/users/pbalapra/tensorrt/Benchmarks/Pilot1/NT3/nt3.pb", + #"frozen_model_file": "/gpfs/jlse-fs0/users/pbalapra/tensorrt/Benchmarks/Pilot1/NT3/nt3_frozen_model.pb", + #"snapshot_dir": "/gpfs/jlse-fs0/users/pbalapra/tensorrt/Benchmarks/Pilot1/NT3/snapshot", + #"engine_save_dir": "/gpfs/jlse-fs0/users/pbalapra/tensorrt/Benchmarks/Pilot1/NT3", + "graphdef_file": "nt3.pb", + "frozen_model_file": "nt3_frozen_model.pb", + "snapshot_dir": "snapshot", + "engine_save_dir": ".", # Needed for TensorRT "inference_batch_size": 1, # inference batch size diff --git a/Pilot1/NT3/training.log b/Pilot1/NT3/training.log index c5767189..5395ad58 100644 --- a/Pilot1/NT3/training.log +++ b/Pilot1/NT3/training.log @@ -1,7 +1,2 @@ -epoch,acc,loss,val_acc,val_loss -0,0.5607142976618239,0.6856411993503571,0.8500000084085124,0.6599466066275325 -1,0.7125000102844622,0.6328544015330928,0.8642857213105474,0.5925303740160806 -2,0.8196428661633816,0.5563539402293307,0.9000000059604645,0.5126077091055257 -3,0.8553571506802525,0.4710269028853093,0.9250000039381641,0.4243324079683849 -4,0.8982142917811871,0.3808819293709738,0.8607142934841769,0.4064289022769247 -5,0.9151785764843225,0.3095611079729029,0.932142861187458,0.2992870360612869 +epoch,accuracy,loss,val_accuracy,val_loss +0,0.5705357,0.6902586913534573,0.6178571581840515,0.6843732084546771 diff --git a/Pilot1/UnoMT/unoMT_baseline_pytorch.py b/Pilot1/UnoMT/unoMT_baseline_pytorch.py index abdbf5ef..95842f41 100644 --- a/Pilot1/UnoMT/unoMT_baseline_pytorch.py +++ b/Pilot1/UnoMT/unoMT_baseline_pytorch.py @@ -45,6 +45,10 @@ def run(params): # Setting up random seed for reproducible and deterministic results seed_random_state(args.rng_seed) + # check for sufficient number of epochs to start validation + if params['epochs'] < params['resp_val_start_epoch']: + raise Exception('Number of epochs is less than validation threshold (resp_val_start_epoch)') + # Construct extension to save validation results now = datetime.datetime.now() ext = '%02d%02d_%02d%02d_pytorch' \ diff --git a/Pilot1/UnoMT/unoMT_pytorch_model.py b/Pilot1/UnoMT/unoMT_pytorch_model.py index b8477341..59513161 100644 
--- a/Pilot1/UnoMT/unoMT_pytorch_model.py +++ b/Pilot1/UnoMT/unoMT_pytorch_model.py @@ -407,7 +407,7 @@ def pre_train_config(self): def train(self): - + args = self.args device = self.device @@ -429,7 +429,7 @@ def train(self): print('=' * 80 + '\nTraining Epoch %3i:' % (epoch + 1)) epoch_start_time = time.time() - + self.resp_lr_decay.step(epoch) self.cl_clf_lr_decay.step(epoch) self.drug_target_lr_decay.step(epoch) @@ -469,9 +469,9 @@ def train(self): if epoch >= args.resp_val_start_epoch: - + resp_r2 = self.validation(epoch) - + #print('\nValidation Results:') # Record the best R2 score (same data source) @@ -492,7 +492,7 @@ def train(self): def validation(self, epoch): - + args = self.args device = self.device @@ -504,7 +504,7 @@ def validation(self, epoch): site_clf_net=self.site_clf_net, type_clf_net=self.type_clf_net, data_loader=self.cl_clf_val_loader, ) - + self.val_cl_clf_acc.append([cl_category_acc, cl_site_acc, cl_type_acc]) # Validating drug target classifier @@ -519,7 +519,7 @@ def validation(self, epoch): valid_drug_qed(device=device, drug_qed_net=self.drug_qed_net, data_loader=self.drug_qed_val_loader) - + self.val_drug_qed_mse.append(drug_qed_mse) self.val_drug_qed_mae.append(drug_qed_mae) self.val_drug_qed_r2.append(drug_qed_r2) @@ -536,10 +536,10 @@ def validation(self, epoch): self.val_resp_r2.append(resp_r2) return resp_r2 - + def print_final_stats(self): - + args = self.args val_cl_clf_acc = np.array(self.val_cl_clf_acc).reshape(-1, 3) diff --git a/Pilot3/P3B4/tf_mthcan.py b/Pilot3/P3B4/tf_mthcan.py index 2d6155ba..c0506717 100644 --- a/Pilot3/P3B4/tf_mthcan.py +++ b/Pilot3/P3B4/tf_mthcan.py @@ -16,12 +16,12 @@ def __init__(self,embedding_matrix,num_classes,max_sents,max_words, attention_size=512,dropout_rate=0.9,activation=tf.nn.elu,lr=0.0001, optimizer= 'adam', embed_train = True): - tf.reset_default_graph() + tf.compat.v1.reset_default_graph() dropout_keep = dropout_rate self.dropout_keep = dropout_keep - self.dropout = tf.placeholder(tf.float32) + self.dropout = tf.compat.v1.placeholder(tf.float32) self.ms = max_sents self.mw = max_words self.embedding_matrix = embedding_matrix.astype(np.float32) @@ -31,19 +31,19 @@ def __init__(self,embedding_matrix,num_classes,max_sents,max_words, self.embed_train = embed_train #doc input - self.doc_input = tf.placeholder(tf.int32, shape=[None,max_sents,max_words]) # batch x sents x words + self.doc_input = tf.compat.v1.placeholder(tf.int32, shape=[None,max_sents,max_words]) # batch x sents x words batch_size = tf.shape(self.doc_input)[0] - + words_per_sent = tf.reduce_sum(tf.sign(self.doc_input),2) # batch X sents max_words_ = tf.reduce_max(words_per_sent) sents_per_doc = tf.reduce_sum(tf.sign(words_per_sent),1) # batch max_sents_ = tf.reduce_max(sents_per_doc) doc_input_reduced = self.doc_input[:,:max_sents_,:max_words_] #clip - + doc_input_reshape = tf.reshape(doc_input_reduced,(-1,max_words_)) # batch*sents x words #word embeddings - word_embeds = tf.gather(tf.get_variable('embeddings',initializer=self.embedding_matrix, + word_embeds = tf.gather(tf.compat.v1.get_variable('embeddings',initializer=self.embedding_matrix, dtype=tf.float32, trainable=self.embed_train),doc_input_reshape) word_embeds = tf.nn.dropout(word_embeds,self.dropout) # batch*sents x words x attention_size @@ -62,7 +62,7 @@ def __init__(self,embedding_matrix,num_classes,max_sents,max_words, outputs = tf.matmul(outputs,V) # batch*sents x words x attention_size #word target attention - Q = tf.get_variable('word_Q',(1,1,self.attention_size), + Q = 
tf.compat.v1.get_variable('word_Q',(1,1,self.attention_size), tf.float32,tf.orthogonal_initializer()) Q = tf.tile(Q,[batch_size*max_sents_,1,1]) V = outputs @@ -72,7 +72,7 @@ def __init__(self,embedding_matrix,num_classes,max_sents,max_words, outputs = tf.where(tf.equal(outputs,0),tf.ones_like(outputs)*-1000,outputs) outputs = tf.nn.dropout(tf.nn.softmax(outputs),self.dropout) outputs = tf.matmul(outputs,V) # batch*sents x 1 x attention_size - + sent_embeds = tf.reshape(outputs,(-1,max_sents_,self.attention_size)) sent_embeds = tf.nn.dropout(sent_embeds,self.dropout) # batch x sents x attention_size @@ -91,7 +91,7 @@ def __init__(self,embedding_matrix,num_classes,max_sents,max_words, outputs = tf.matmul(outputs,V) # batch x sents x attention_size #sent target attention - Q = tf.get_variable('sent_Q',(1,1,self.attention_size), + Q = tf.compat.v1.get_variable('sent_Q',(1,1,self.attention_size), tf.float32,tf.orthogonal_initializer()) Q = tf.tile(Q,[batch_size,1,1]) V = outputs @@ -116,27 +116,27 @@ def __init__(self,embedding_matrix,num_classes,max_sents,max_words, self.labels = [] self.loss = 0 for i in range(self.num_tasks): - label = tf.placeholder(tf.int32,shape=[None]) + label = tf.compat.v1.placeholder(tf.int32,shape=[None]) self.labels.append(label) loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits[i],labels=label)) self.loss += loss/self.num_tasks - # self.optimizer = tf.train.AdamOptimizer(lr,0.9,0.99).minimize(self.loss) + # self.optimizer = tf.compat.v1.train.AdamOptimizer(lr,0.9,0.99).minimize(self.loss) if optimizer == 'adam': - self.optimizer = tf.train.AdamOptimizer(lr,0.9,0.99).minimize(self.loss) + self.optimizer = tf.compat.v1.train.AdamOptimizer(lr,0.9,0.99).minimize(self.loss) elif optimizer == 'sgd': - self.optimizer = tf.train.GradientDescentOptimizer( lr ).minimize( self.loss ) + self.optimizer = tf.compat.v1.train.GradientDescentOptimizer( lr ).minimize( self.loss ) elif optimizer == 'adadelta': - self.optimizer = tf.train.AdadeltaOptimizer( learning_rate= lr ).minimize( self.loss ) + self.optimizer = tf.compat.v1.train.AdadeltaOptimizer( learning_rate= lr ).minimize( self.loss ) else: - self.optimizer = tf.train.RMSPropOptimizer( lr ).minimize( self.loss ) + self.optimizer = tf.compat.v1.train.RMSPropOptimizer( lr ).minimize( self.loss ) #init op - config = tf.ConfigProto() + config = tf.compat.v1.ConfigProto() config.gpu_options.allow_growth = True - self.saver = tf.train.Saver() - self.sess = tf.Session(config=config) + self.saver = tf.compat.v1.train.Saver() + self.sess = tf.compat.v1.Session(config=config) self.sess.run(tf.global_variables_initializer()) - + def train(self,data,labels,batch_size=100,epochs=50,validation_data=None): if validation_data: @@ -189,7 +189,7 @@ def train(self,data,labels,batch_size=100,epochs=50,validation_data=None): #checkpoint after every epoch print("\ntraining time: %.2f" % (time.time()-start_time)) - + for i in range(self.num_tasks): micro = f1_score(y_trues[i],y_preds[i],average='micro') macro = f1_score(y_trues[i],y_preds[i],average='macro') @@ -202,11 +202,11 @@ def train(self,data,labels,batch_size=100,epochs=50,validation_data=None): #reset timer start_time = time.time() - + return history def predict(self,data,batch_size=100): - + y_preds = [[] for i in range(self.num_tasks)] for start in range(0,len(data),batch_size): @@ -230,7 +230,7 @@ def predict(self,data,batch_size=100): return y_preds def score(self,data,labels,batch_size=16): - + loss = [] y_preds = [[] for i in range(self.num_tasks)] 
for start in range(0,len(data),batch_size): @@ -246,7 +246,7 @@ def score(self,data,labels,batch_size=16): feed_dict[self.labels[i]] = labels[i][start:stop] retvals = self.sess.run(self.predictions+[self.loss],feed_dict=feed_dict) loss.append(retvals[-1]) - + for i in range(self.num_tasks): y_preds[i].append(np.argmax(retvals[i],1)) diff --git a/common/file_utils.py b/common/file_utils.py index 526a4373..4a3aac42 100644 --- a/common/file_utils.py +++ b/common/file_utils.py @@ -100,6 +100,12 @@ def get_file(fname, origin, untar=False, else: download = True + # fix ftp protocol if needed + if origin.startswith('ftp://'): + new_url = origin.replace('ftp://','http://') + origin = new_url + print('Origin = ', origin) + if download: print('Downloading data from', origin) global progbar diff --git a/common/solr_keras.py b/common/solr_keras.py index f3d0fdfd..eb33c009 100644 --- a/common/solr_keras.py +++ b/common/solr_keras.py @@ -45,12 +45,6 @@ def __init__(self, super(CandleRemoteMonitor, self).__init__() self.global_params = params - self.has_solr_config = False - if 'solr_root' in params and params['solr_root'] != '': - self.has_solr_config = True - self.root = params['solr_root'] - self.path = '/run/update?commit=true' - self.headers = {'Content-Type': 'application/json'} # init self.experiment_id = None @@ -77,8 +71,6 @@ def on_train_begin(self, logs=None): } # print("on_train_begin", send) self.log_messages.append(send) - if self.has_solr_config: - self.submit(send) def on_epoch_begin(self, epoch, logs=None): self.epoch_timestamp = datetime.now() @@ -101,8 +93,6 @@ def on_epoch_end(self, epoch, logs=None): } # print("on_epoch_end", send) self.log_messages.append(send) - if self.has_solr_config: - self.submit(send) def on_train_end(self, logs=None): logs = logs or {} @@ -118,26 +108,10 @@ def on_train_end(self, logs=None): } # print("on_train_end", send) self.log_messages.append(send) - if self.has_solr_config: - self.submit(send) # save to file when finished self.save() - def submit(self, send): - """Send json to solr - - Arguments: - send -- json object - """ - try: - requests.post(self.root + self.path, - json=[send], - headers=self.headers) - except requests.exceptions.RequestException: - warnings.warn( - 'Warning: could not reach RemoteMonitor root server at ' + str(self.root)) - def save(self): """Save log_messages to file """ diff --git a/examples/ADRP/adrp_baseline_keras2.py b/examples/ADRP/adrp_baseline_keras2.py index 02a3ea41..c22d016e 100644 --- a/examples/ADRP/adrp_baseline_keras2.py +++ b/examples/ADRP/adrp_baseline_keras2.py @@ -89,23 +89,23 @@ def auroc(y_true, y_pred): return score -def covariance(x, y): - return K.mean(x * y) - K.mean(x) * K.mean(y) +#def covariance(x, y): +# return K.mean(x * y) - K.mean(x) * K.mean(y) def corr(y_true, y_pred): - cov = covariance(y_true, y_pred) - var1 = covariance(y_true, y_true) - var2 = covariance(y_pred, y_pred) + cov = candle.covariance(y_true, y_pred) + var1 = candle.covariance(y_true, y_true) + var2 = candle.covariance(y_pred, y_pred) return cov / (K.sqrt(var1 * var2) + K.epsilon()) -def xent(y_true, y_pred): - return binary_crossentropy(y_true, y_pred) +#def xent(y_true, y_pred): +# return binary_crossentropy(y_true, y_pred) -def mse(y_true, y_pred): - return mean_squared_error(y_true, y_pred) +#def mse(y_true, y_pred): +# return mean_squared_error(y_true, y_pred) class MetricHistory(Callback): diff --git a/examples/darts/advanced/default_model.txt b/examples/darts/advanced/default_model.txt index 239a569e..13a35b75 100644 --- 
a/examples/darts/advanced/default_model.txt +++ b/examples/darts/advanced/default_model.txt @@ -1,8 +1,8 @@ [Global_Params] model_name = 'darts_uno' data_url = 'ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/' -savepath = './results' -log_interval = 10 +save_path = './results' +log_interval = 100 train_data = 'top_21_auc_1fold.uno.h5' learning_rate = 0.025 learning_rate_min = 0.001 @@ -11,5 +11,4 @@ weight_decay = 3e-4 grad_clip = 5 batch_size = 100 epochs = 10 -seed = 13 - +rng_seed = 13 diff --git a/examples/darts/advanced/example.py b/examples/darts/advanced/example.py index f5e5a2ce..dbadfed2 100644 --- a/examples/darts/advanced/example.py +++ b/examples/darts/advanced/example.py @@ -171,7 +171,7 @@ def train(trainloader, logger.info(f'Step: {step} loss: {meter.loss_meter.avg:.4}') meter.update_epoch() - meter.save(args.savepath) + meter.save(args.save_path) def validate(validloader, model, criterion, args, tasks, meter, device): @@ -196,7 +196,7 @@ def validate(validloader, model, criterion, args, tasks, meter, device): logger.info(f'>> Validation: {step} loss: {meter.loss_meter.avg:.4}') meter.update_epoch() - meter.save(args.savepath) + meter.save(args.save_path) def _wrap_target(target): diff --git a/examples/darts/advanced/example_setup.py b/examples/darts/advanced/example_setup.py index c47a6682..53f46fac 100644 --- a/examples/darts/advanced/example_setup.py +++ b/examples/darts/advanced/example_setup.py @@ -9,6 +9,14 @@ import candle +additional_definitions = [ + {'name':'grad_clip','type':int}, + {'name':'learning_rate_min','type':float, 'help':'Minimum learning rate'}, + {'name':'log_interval','type':int, 'help':'Logging interval'}, + {'name':'unrolled','type':candle.str2bool}, + {'name':'weight_decay','type':float}, + {'name':'grad_clip','type':int} +] REQUIRED = [ 'learning_rate', @@ -16,7 +24,7 @@ 'momentum', 'weight_decay', 'grad_clip', - 'seed', + 'rng_seed', 'batch_size', 'epochs', ] @@ -33,4 +41,6 @@ def set_locals(self): """ if REQUIRED is not None: self.required = set(REQUIRED) + if additional_definitions is not None: + self.additional_definitions = additional_definitions diff --git a/examples/darts/uno/default_model.txt b/examples/darts/uno/default_model.txt index 78d7c325..cb69d184 100644 --- a/examples/darts/uno/default_model.txt +++ b/examples/darts/uno/default_model.txt @@ -1,8 +1,8 @@ [Global_Params] model_name = 'darts_uno' data_url = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/' -savepath = '.' -log_interval = 10 +save_path = './results' +log_interval = 100 train_data = 'top_21_auc_1fold.uno.h5' learning_rate = 0.025 learning_rate_min = 0.001 @@ -11,5 +11,4 @@ weight_decay = 3e-4 grad_clip = 5 batch_size = 100 epochs = 10 -seed = 13 - +rng_seed = 13 diff --git a/examples/darts/uno/example_setup.py b/examples/darts/uno/example_setup.py index 0a129157..75c36fbe 100644 --- a/examples/darts/uno/example_setup.py +++ b/examples/darts/uno/example_setup.py @@ -9,6 +9,14 @@ import candle +additional_definitions = [ +{'name':'grad_clip','type':int}, +{'name':'learning_rate_min','type':float, 'help':'Minimum learning rate'}, +{'name':'log_interval','type':int, 'help':'Logging interval'}, +{'name':'unrolled','type':candle.str2bool}, +{'name':'weight_decay','type':float}, +{'name':'grad_clip','type':int} +] REQUIRED = [ 'learning_rate', @@ -16,7 +24,7 @@ 'momentum', 'weight_decay', 'grad_clip', - 'seed', + 'rng_seed', 'batch_size', 'epochs', ] @@ -29,8 +37,12 @@ def set_locals(self): """ Set parameters for the benchmark. 
Args: - required: set of required parameters for the benchmark. + - required: set of required parameters for the benchmark. + - additional_definitions: list of dictionaries describing the additional parameters for the + benchmark. """ if REQUIRED is not None: self.required = set(REQUIRED) + if additional_definitions is not None: + self.additional_definitions = additional_definitions diff --git a/examples/darts/uno/uno_example.py b/examples/darts/uno/uno_example.py index c8643805..30f616d1 100644 --- a/examples/darts/uno/uno_example.py +++ b/examples/darts/uno/uno_example.py @@ -4,6 +4,7 @@ import torch.nn as nn from torch import optim from torch.utils.data import DataLoader +import logging import example_setup as bmk import darts @@ -70,7 +71,7 @@ def run(params): train_meter = darts.EpochMeter(tasks, 'train') valid_meter = darts.EpochMeter(tasks, 'valid') - genotype_store = darts.GenotypeStorage(root=args.savepath) + genotype_store = darts.GenotypeStorage(root=args.save_path) for epoch in range(args.epochs): @@ -161,7 +162,7 @@ def train(trainloader, logger.info(f'Step: {step} loss: {meter.loss_meter.avg:.4}') meter.update_epoch() - meter.save(args.savepath) + meter.save(args.save_path) @@ -186,7 +187,7 @@ def validate(validloader, model, criterion, args, tasks, meter, device): logger.info(f'>> Validation: {step} loss: {meter.loss_meter.avg:.4}') meter.update_epoch() - meter.save(args.savepath) + meter.save(args.save_path) def main(): From a5d44c9645e11296a387d91d6d0a1203e0d38ca6 Mon Sep 17 00:00:00 2001 From: yngtodd Date: Tue, 12 May 2020 17:05:34 -0400 Subject: [PATCH 299/331] Fix meters The meters in DARTS were updated when they were moved to common, but the logging in P3B5 was not updated. --- Pilot3/P3B5/p3b5_baseline_pytorch.py | 16 ++++++++++------ Pilot3/P3B5/p3b5_darts.py | 10 ++++------ 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/Pilot3/P3B5/p3b5_baseline_pytorch.py b/Pilot3/P3B5/p3b5_baseline_pytorch.py index 63c5a8a3..75829b63 100644 --- a/Pilot3/P3B5/p3b5_baseline_pytorch.py +++ b/Pilot3/P3B5/p3b5_baseline_pytorch.py @@ -97,7 +97,7 @@ def run(params): print(f'Genotype: {genotype}') # training - train_acc, train_loss = train( + train( trainloader, validloader, model, @@ -112,16 +112,20 @@ def run(params): ) # validation - valid_acc, valid_loss = infer(validloader, model, criterion, args, tasks, device, valid_meter) + valid_loss = infer( + validloader, + model, + criterion, + args, + tasks, + device, + valid_meter + ) if valid_loss < min_loss: genotype_store.save_genotype(genotype) min_loss = valid_loss - print(f'\nEpoch {epoch} stats:') - # darts.log_accuracy(train_acc, 'train') - # darts.log_accuracy(valid_acc, 'valid') - def main(): params = initialize_parameters() diff --git a/Pilot3/P3B5/p3b5_darts.py b/Pilot3/P3B5/p3b5_darts.py index 63a9e3ce..af252658 100644 --- a/Pilot3/P3B5/p3b5_darts.py +++ b/Pilot3/P3B5/p3b5_darts.py @@ -66,12 +66,10 @@ def train(trainloader, validloader, model, architecture, criterion, optimizer, l meter.update_batch_accuracy(prec1, batch_size) if step % args.log_interval == 0: - print(f'Step: {step} loss: {losses.avg:.4}') - #darts.log_accuracy(top1) + print(f'Step: {step} loss: {meter.loss_meter.avg:.4}') meter.update_epoch() meter.save(args.savepath) - return top1, losses.avg def infer(validloader, model, criterion, args, tasks, device, meter): @@ -94,12 +92,12 @@ def infer(validloader, model, criterion, args, tasks, device, meter): meter.update_batch_accuracy(prec1, batch_size) if step % args.log_interval == 0: - print(f'>> 
Validation: {step} loss: {losses.avg:.4}') - #darts.log_accuracy(top1, 'valid') + print(f'>> Validation: {step} loss: {meter.loss_meter.avg:.4}') meter.update_epoch() meter.save(args.savepath) - return top1, losses.avg + + return meter.loss_meter.avg if __name__=='__main__': From 24c88cf87e5eefc8adbc1f125d1480d60ba26634 Mon Sep 17 00:00:00 2001 From: Jamal Date: Tue, 12 May 2020 21:39:55 -0600 Subject: [PATCH 300/331] Fixed save_path keyword. --- Pilot3/P3B5/p3b5_darts.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Pilot3/P3B5/p3b5_darts.py b/Pilot3/P3B5/p3b5_darts.py index af252658..c5d77fb4 100644 --- a/Pilot3/P3B5/p3b5_darts.py +++ b/Pilot3/P3B5/p3b5_darts.py @@ -69,7 +69,7 @@ def train(trainloader, validloader, model, architecture, criterion, optimizer, l print(f'Step: {step} loss: {meter.loss_meter.avg:.4}') meter.update_epoch() - meter.save(args.savepath) + meter.save(args.save_path) def infer(validloader, model, criterion, args, tasks, device, meter): @@ -95,7 +95,7 @@ def infer(validloader, model, criterion, args, tasks, device, meter): print(f'>> Validation: {step} loss: {meter.loss_meter.avg:.4}') meter.update_epoch() - meter.save(args.savepath) + meter.save(args.save_path) return meter.loss_meter.avg From c1e69b7b01f8c321228a5543614aa22bdf5a31c5 Mon Sep 17 00:00:00 2001 From: Jamal Date: Tue, 12 May 2020 22:05:51 -0600 Subject: [PATCH 301/331] Added missing keyword definitions, fixed broken model files, made args consistent. --- Pilot2/P2B1/p2b1.py | 17 +++++++++++++---- Pilot2/P2B1/p2b1_baseline_keras2.py | 18 +++++++++--------- Pilot2/P2B1/p2b1_default_model.txt | 2 +- Pilot2/P2B1/p2b1_medium_model.txt | 11 ++++++++++- Pilot2/P2B1/p2b1_small_model.txt | 8 +++++--- 5 files changed, 38 insertions(+), 18 deletions(-) diff --git a/Pilot2/P2B1/p2b1.py b/Pilot2/P2B1/p2b1.py index ad11088b..faa0c860 100644 --- a/Pilot2/P2B1/p2b1.py +++ b/Pilot2/P2B1/p2b1.py @@ -31,10 +31,10 @@ {'name':'train_bool', 'type':candle.str2bool,'default':True,'help':'Invoke training'}, {'name':'eval_bool', 'type':candle.str2bool,'default':False,'help':'Use model for inference'}, {'name':'home_dir','help':'Home Directory','type':str,'default':'.'}, -{'name':'config_file','help':'Config File','type':str,'default':os.path.join(file_path, 'p2b1_default_model.txt')}, +#{'name':'config_file','help':'Config File','type':str,'default':os.path.join(file_path, 'p2b1_default_model.txt')}, {'name':'weight_path','help':'Trained Model Pickle File','type':str,'default':None}, {'name':'base_memo','help':'Memo','type':str,'default':None}, -{'name':'seed', 'type':candle.str2bool,'default':False,'help':'Random Seed'}, +#{'name':'seed_bool', 'type':candle.str2bool,'default':False,'help':'Random Seed'}, {'name':'case','help':'[Full, Center, CenterZ]','type':str,'default':'Full'}, {'name':'fig_bool', 'type':candle.str2bool,'default':False,'help':'Generate Prediction Figure'}, {'name':'set_sel','help':'[3k_Disordered, 3k_Ordered, 3k_Ordered_and_gel, 6k_Disordered, 6k_Ordered, 6k_Ordered_and_gel]','type':str,'default':'3k_Disordered'}, @@ -42,7 +42,16 @@ {'name':'full_conv_bool', 'type':candle.str2bool, 'default':False, 'help':'Invoke training using fully convolutional NN for inner AE'}, {'name':'type_bool', 'type':candle.str2bool, 'default':True, 'help':'Include molecule type information in desining AE'}, {'name':'nbr_type', 'type':str, 'default':'relative', 'help':'Defines the type of neighborhood data to use. 
[relative, invariant]'}, -{'name':'backend', 'help':'Keras Backend', 'type':str, 'default':'tensorflow'} +{'name':'backend', 'help':'Keras Backend', 'type':str, 'default':'tensorflow'}, +{'name':'cool', 'help':'Boolean: cool learning rate', 'type':candle.str2bool, 'default':False}, +{'name':'data_set', 'help':'Data set for training', 'type':str, 'default':None}, +{'name':'l2_reg', 'help':'Regularization parameter', 'type':float, 'default':None}, +{'name':'molecular_nbrs', 'help':'Data dimension for molecular autoencoder', 'type':int, 'default':None}, +{'name':'molecular_nonlinearity', 'help':'Activation for molecular netowrk', 'type':str, 'default':None}, +{'name':'molecular_num_hidden', 'nargs':'+', 'help':'Layer sizes for molecular network', 'type':int, 'default':None}, +{'name':'noise_factor', 'help':'Noise factor', 'type':float, 'default':None}, +{'name':'num_hidden', 'nargs':'+', 'help':'Dense layer specification', 'type':int, 'default':None}, +{'name':'sampling_density', 'help':'Sampling density', 'type':float, 'default':None} ] required = [ @@ -61,7 +70,7 @@ 'molecular_num_hidden', 'molecular_nonlinearity', 'molecular_nbrs', - 'drop_prob', + 'dropout', 'l2_reg', 'sampling_density', 'save_path' diff --git a/Pilot2/P2B1/p2b1_baseline_keras2.py b/Pilot2/P2B1/p2b1_baseline_keras2.py index 524c2cf7..f7f9eeaf 100644 --- a/Pilot2/P2B1/p2b1_baseline_keras2.py +++ b/Pilot2/P2B1/p2b1_baseline_keras2.py @@ -12,7 +12,7 @@ from importlib import reload # Python 3.4+ except ImportError: from imp import reload # Python 3.0 - 3.3 - + TIMEOUT=3600 # in sec; set this to -1 for no timeout file_path = os.path.dirname(os.path.realpath(__file__)) #lib_path = os.path.abspath(os.path.join(file_path, '..', 'common')) @@ -22,7 +22,7 @@ from keras import backend as K -import p2b1 +import p2b1 import candle import p2b1_AE_models as AE_models @@ -53,7 +53,7 @@ def initialize_parameters(default_model = 'p2b1_default_model.txt'): print ('\nTraining parameters:') for key in sorted(GP): print ("\t%s: %s" % (key, GP[key])) - + # print json.dumps(GP, indent=4, skipkeys=True, sort_keys=True) if GP['backend'] != 'theano' and GP['backend'] != 'tensorflow': @@ -79,8 +79,8 @@ def initialize_parameters(default_model = 'p2b1_default_model.txt'): def run(GP): # set the seed - if GP['seed']: - np.random.seed(GP['seed']) + if GP['rng_seed']: + np.random.seed(GP['rng_seed']) else: np.random.seed(np.random.randint(10000)) @@ -211,7 +211,7 @@ def run(GP): nonlinearity=molecular_nonlinearity, hidden_layers=molecular_hidden_layers, l2_reg=GP['l2_reg'], - drop=float(GP['drop_prob'])) + drop=float(GP['dropout'])) elif full_conv_bool: molecular_model, molecular_encoder = AE_models.full_conv_mol_auto(bead_k_size=bead_kernel_size, mol_k_size=mol_kernel_size, @@ -220,14 +220,14 @@ def run(GP): nonlinearity=molecular_nonlinearity, hidden_layers=molecular_hidden_layers, l2_reg=GP['l2_reg'], - drop=float(GP['drop_prob'])) + drop=float(GP['dropout'])) else: molecular_model, molecular_encoder = AE_models.dense_auto(weights_path=None, input_shape=(molecular_input_dim,), nonlinearity=molecular_nonlinearity, hidden_layers=molecular_hidden_layers, l2_reg=GP['l2_reg'], - drop=float(GP['drop_prob'])) + drop=float(GP['dropout'])) if GP['loss'] == 'mse': loss_func = 'mse' @@ -238,7 +238,7 @@ def run(GP): print ('\nModel Summary: \n') molecular_model.summary() ##### set up callbacks and cooling for the molecular_model ########## - drop = 0.5 + drop = GP['dropout'] mb_epochs = GP['epochs'] initial_lrate = GP['learning_rate'] epochs_drop = 
1+int(np.floor(mb_epochs/3)) diff --git a/Pilot2/P2B1/p2b1_default_model.txt b/Pilot2/P2B1/p2b1_default_model.txt index 62e4e30c..52c876ca 100644 --- a/Pilot2/P2B1/p2b1_default_model.txt +++ b/Pilot2/P2B1/p2b1_default_model.txt @@ -13,7 +13,7 @@ molecular_nonlinearity='elu' molecular_num_hidden=[256, 128, 64, 32, 16, 8] molecular_nbrs = 200 base_memo='p2b1' -drop_prob = 0.5 +dropout = 0.5 data_set='3k_Ordered' sampling_density = 0.15 save_path='.' diff --git a/Pilot2/P2B1/p2b1_medium_model.txt b/Pilot2/P2B1/p2b1_medium_model.txt index bb178ef8..3912d0ee 100644 --- a/Pilot2/P2B1/p2b1_medium_model.txt +++ b/Pilot2/P2B1/p2b1_medium_model.txt @@ -4,6 +4,15 @@ batch_size=32 learning_rate=0.01 epochs=10 cool=True -weight_decay=0.0005 +optimizer='adam' +loss='custom' +activation='relu' +molecular_nonlinearity='elu' +molecular_num_hidden=[256, 128, 64, 32, 16, 8] +molecular_nbrs = 200 noise_factor=0 base_memo='p2b1' +dropout = 0.5 +l2_reg=0.01 +sampling_density = 0.15 +save_path='.' diff --git a/Pilot2/P2B1/p2b1_small_model.txt b/Pilot2/P2B1/p2b1_small_model.txt index 9f53dd02..196fcb15 100644 --- a/Pilot2/P2B1/p2b1_small_model.txt +++ b/Pilot2/P2B1/p2b1_small_model.txt @@ -4,13 +4,15 @@ batch_size=32 learning_rate=0.01 epochs=10 cool=False -weight_decay=0.0005 noise_factor=0.0 optimizer='adam' loss='mse' activation='relu' -molecular_nonlinearity=elu +molecular_nonlinearity='elu' molecular_num_hidden=[54,12] -molecular_epochs=1 molecular_nbrs=10 base_memo='p2b1' +dropout = 0.5 +l2_reg=0.01 +sampling_density = 0.15 +save_path='.' From 8e02669c78639ec0094bae238d6958db3eca9bc8 Mon Sep 17 00:00:00 2001 From: Jamal Date: Tue, 12 May 2020 22:22:26 -0600 Subject: [PATCH 302/331] Removed deprecated solr_root keyword. --- Pilot1/UnoMT/README.md | 2 -- Pilot1/UnoMT/unoMT_default_model.txt | 1 - 2 files changed, 3 deletions(-) diff --git a/Pilot1/UnoMT/README.md b/Pilot1/UnoMT/README.md index e9f76b90..491e2fdb 100644 --- a/Pilot1/UnoMT/README.md +++ b/Pilot1/UnoMT/README.md @@ -78,7 +78,6 @@ Configuration file: ./unoMT_default_model.txt 'rnaseq_scaling': 'std', 'rng_seed': 0, 'save_path': 'save/unoMT', - 'solr_root': '', 'timeout': 3600, 'train_sources': 'NCI60', 'trn_batch_size': 32, @@ -142,7 +141,6 @@ Params: 'run_id': 'RUN000', 'save_path': 'save/unoMT', 'shuffle': False, - 'solr_root': '', 'timeout': 3600, 'train_bool': True, 'train_sources': 'NCI60', diff --git a/Pilot1/UnoMT/unoMT_default_model.txt b/Pilot1/UnoMT/unoMT_default_model.txt index ff0b569b..df64d076 100644 --- a/Pilot1/UnoMT/unoMT_default_model.txt +++ b/Pilot1/UnoMT/unoMT_default_model.txt @@ -91,6 +91,5 @@ rng_seed=0 save_path='save/unoMT' [Monitor_Params] -solr_root='' timeout=3600 From 67db5641996eb8fe08869710e1cf48178ad74b6a Mon Sep 17 00:00:00 2001 From: Jamal Date: Wed, 13 May 2020 07:40:28 -0600 Subject: [PATCH 303/331] Fixed Windows formatted README. --- Pilot1/Uno_UQ/README.md | 541 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 540 insertions(+), 1 deletion(-) diff --git a/Pilot1/Uno_UQ/README.md b/Pilot1/Uno_UQ/README.md index 3da30a33..2157778e 100644 --- a/Pilot1/Uno_UQ/README.md +++ b/Pilot1/Uno_UQ/README.md @@ -1 +1,540 @@ -## Uno_UQ: Predicting Tumor Dose Response across Multiple Data Sources with added UQ functionality. ## Functionality Uno_UQ adds uncertainty quantification (UQ) functionality to the Uno model. For information about the underlaying model, please refer to the Uno benchmark. This page overviews the added UQ functionality provided, which includes: - Generation of holdout set. 
- Training excluding the holdout set. - Inference for the specified data. - Training for homoscedastic and heteroscedastic models. - Empirical calibration of UQ for the trained models. ## Holdout The holdout script generates a set of identifiers to holdout during training, depending on the --partition_by argument. If --partition_by is 'drug_pair' it generates a set of drug IDs. If --partition_by is 'cell' it generates a set of cell IDs. In any other case it generates a set of indices. The fraction to reserve in the holdout set is given by the --val_split argument. #### Example output ``` python uno_holdoutUQ_data.py Using TensorFlow backend. Importing candle utils for keras Params: {'activation': 'relu', 'agg_dose': 'AUC', 'base_lr': None, 'batch_normalization': False, 'batch_size': 32, 'by_cell': None, 'by_drug': None, 'cache': None, 'cell_feature_subset_path': '', 'cell_features': ['rnaseq'], 'cell_subset_path': '', 'cell_types': None, 'cp': False, 'cv': 1, 'data_type': , 'dense': [1000, 1000, 1000], 'dense_feature_layers': [1000, 1000, 1000], 'dropout': 0.1, 'drug_feature_subset_path': '', 'drug_features': ['descriptors', 'fingerprints'], 'drug_median_response_max': 1, 'drug_median_response_min': -1, 'drug_subset_path': '', 'epochs': 10, 'exclude_cells': [], 'exclude_drugs': [], 'experiment_id': 'EXP000', 'export_csv': None, 'export_data': None, 'feature_subsample': 0, 'feature_subset_path': '', 'gpus': [], 'growth_bins': 0, 'initial_weights': None, 'learning_rate': 0.01, 'logfile': None, 'loss': 'mse', 'max_val_loss': 1.0, 'no_feature_source': True, 'no_gen': False, 'no_response_source': True, 'optimizer': 'sgd', 'output_dir': './Output/EXP000/RUN000', 'partition_by': 'cell', 'preprocess_rnaseq': 'none', 'profiling': False, 'reduce_lr': False, 'residual': False, 'rng_seed': 2018, 'run_id': 'RUN000', 'sample_repetition': False, 'save_path': 'save_default/', 'save_weights': 'default.weights.h5', 'scaling': 'std', 'shuffle': False, 'single': True, 'solr_root': '', 'tb': False, 'tb_prefix': 'tb', 'test_sources': ['train'], 'timeout': 3600, 'train_bool': True, 'train_sources': ['gCSI'], 'use_exported_data': None, 'use_filtered_genes': False, 'use_landmark_genes': True, 'val_split': 0.2, 'verbose': None, 'warmup_lr': False} partition_by: cell Cell IDs in holdout set written in file: save_default/infer_cell_ids ``` ## Train The train script trains the model, as in the underlying Uno benchmark, but excluding the IDs in the holdout file. The file with the holdout set should be provided via one of the following arguments - --uq_exclude_drugs_file='file' if the file contains a set of drug IDs. - --uq_exclude_cells_file='file' if the file contains a set of cell IDs. - --uq_exclude_indices_file='file' if the file contains a set of indices. An additional --loss heteroscedastic option is available. This will learn the input-dependent noise level as well as the main regression variable specified (i.e. growth or AUC). #### Example output ``` python uno_trainUQ_keras2.py --cp True --uq_exclude_cells_file 'save_default/infer_cell_ids' Using TensorFlow backend. 
Importing candle utils for keras Params: {'activation': 'relu', 'agg_dose': 'AUC', 'base_lr': None, 'batch_normalization': False, 'batch_size': 32, 'by_cell': None, 'by_drug': None, 'cache': None, 'cell_feature_subset_path': '', 'cell_features': ['rnaseq'], 'cell_subset_path': '', 'cell_types': None, 'cp': True, 'cv': 1, 'data_type': , 'dense': [1000, 1000, 1000], 'dense_feature_layers': [1000, 1000, 1000], 'dropout': 0.1, 'drug_feature_subset_path': '', 'drug_features': ['descriptors', 'fingerprints'], 'drug_median_response_max': 1, 'drug_median_response_min': -1, 'drug_subset_path': '', 'epochs': 10, 'exclude_cells': [], 'exclude_drugs': [], 'exclude_indices': [], 'experiment_id': 'EXP000', 'export_csv': None, 'export_data': None, 'feature_subsample': 0, 'feature_subset_path': '', 'gpus': [], 'growth_bins': 0, 'initial_weights': None, 'learning_rate': 0.01, 'logfile': None, 'loss': 'mse', 'max_val_loss': 1.0, 'no_feature_source': True, 'no_gen': False, 'no_response_source': True, 'optimizer': 'sgd', 'output_dir': './Output/EXP000/RUN000', 'partition_by': 'cell', 'preprocess_rnaseq': 'none', 'reduce_lr': False, 'reg_l2': 0.0, 'residual': False, 'rng_seed': 2018, 'run_id': 'RUN000', 'sample_repetition': False, 'save_path': 'save_default/', 'save_weights': 'saved.weights.h5', 'scaling': 'std', 'shuffle': False, 'single': True, 'solr_root': '', 'tb': False, 'tb_prefix': 'tb', 'test_sources': ['train'], 'timeout': 3600, 'train_bool': True, 'train_sources': ['gCSI'], 'uq_exclude_cells_file': 'save_default/infer_cell_ids', 'use_exported_data': None, 'use_filtered_genes': False, 'use_landmark_genes': True, 'val_split': 0.2, 'verbose': None, 'warmup_lr': False} Read file: save_default/infer_cell_ids Number of elements read: 72 Cells to exclude: ['gCSI.NCI-H889', 'gCSI.MEWO', 'gCSI.PA-TU-8902', 'gCSI.BCPAP', 'gCSI.CAL-12T', 'gCSI.NCI-H727', 'gCSI.HUH-1', 'gCSI.NUGC-4', 'gCSI.MKN74', 'gCSI.PK-1', 'gCSI.A2058', 'gCSI.RAJI', 'gCSI.JHH-7', 'gCSI.SUIT-2', 'gCSI.OE21', 'gCSI.HCC1806', 'gCSI.PANC-10-05', 'gCSI.RMG-I', 'gCSI.NCI-H1703', 'gCSI.KMS-34', 'gCSI.G-361', 'gCSI.EPLC-272H', 'gCSI.HEP-G2', 'gCSI.RERF-LC-MS', 'gCSI.COLO-800', 'gCSI.KM12', 'gCSI.DOHH-2', 'gCSI.EFM-19', 'gCSI.MDA-MB-468', 'gCSI.MHH-ES-1', 'gCSI.IPC-298', 'gCSI.GRANTA-519', 'gCSI.8305C', 'gCSI.KYSE-140', 'gCSI.MALME-3M', 'gCSI.MIA-PACA-2', 'gCSI.NCI-H1666', 'gCSI.PC-3', 'gCSI.RT4', 'gCSI.HUP-T4', 'gCSI.NCI-H1869', 'gCSI.WM-266-4', 'gCSI.KMM-1', 'gCSI.OE33', 'gCSI.SU-DHL-6', 'gCSI.QGP-1', 'gCSI.IGR-37', 'gCSI.VMRC-RCW', 'gCSI.NCI-H1838', 'gCSI.SW948', 'gCSI.COLO-679', 'gCSI.CAL-51', 'gCSI.HUCCT1', 'gCSI.LP-1', 'gCSI.RPMI-7951', 'gCSI.HPAF-II', 'gCSI.OCUM-1', 'gCSI.HOP-92', 'gCSI.NCI-H661', 'gCSI.TOV-112D', 'gCSI.PANC-03-27', 'gCSI.AGS', 'gCSI.HEC-59', 'gCSI.LN-18', 'gCSI.U-87-MG', 'gCSI.U-2-OS', 'gCSI.ABC-1', 'gCSI.IGR-1', 'gCSI.SK-MEL-3', 'gCSI.A549', 'gCSI.HCC4006', 'gCSI.NCI-H1355'] Combined model: __________________________________________________________________________________________________ Layer (type) Output Shape Param # Connected to ================================================================================================== input.cell.rnaseq (InputLayer) (None, 942) 0 __________________________________________________________________________________________________ input.drug1.descriptors (InputL (None, 5270) 0 __________________________________________________________________________________________________ input.drug1.fingerprints (Input (None, 2048) 0 
__________________________________________________________________________________________________ cell.rnaseq (Model) (None, 1000) 2945000 input.cell.rnaseq[0][0] __________________________________________________________________________________________________ drug.descriptors (Model) (None, 1000) 7273000 input.drug1.descriptors[0][0] __________________________________________________________________________________________________ drug.fingerprints (Model) (None, 1000) 4051000 input.drug1.fingerprints[0][0] __________________________________________________________________________________________________ concatenate_1 (Concatenate) (None, 3000) 0 cell.rnaseq[1][0] drug.descriptors[1][0] drug.fingerprints[1][0] __________________________________________________________________________________________________ dense_10 (Dense) (None, 1000) 3001000 concatenate_1[0][0] __________________________________________________________________________________________________ permanent_dropout_10 (Permanent (None, 1000) 0 dense_10[0][0] __________________________________________________________________________________________________ dense_11 (Dense) (None, 1000) 1001000 permanent_dropout_10[0][0] __________________________________________________________________________________________________ permanent_dropout_11 (Permanent (None, 1000) 0 dense_11[0][0] __________________________________________________________________________________________________ dense_12 (Dense) (None, 1000) 1001000 permanent_dropout_11[0][0] __________________________________________________________________________________________________ permanent_dropout_12 (Permanent (None, 1000) 0 dense_12[0][0] __________________________________________________________________________________________________ dense_13 (Dense) (None, 1) 1001 permanent_dropout_12[0][0] ================================================================================================== Total params: 19,273,001 Trainable params: 19,273,001 Non-trainable params: 0 __________________________________________________________________________________________________ Training homoscedastic model: partition:train, rank:0, sharded index size:2784, batch_size:32, steps:87 partition:val, rank:0, sharded index size:704, batch_size:32, steps:22 Between random pairs in y_val: mse: 0.0604 mae: 0.1978 r2: -0.9105 corr: 0.0447 Data points per epoch: train = 2784, val = 704 Steps per epoch: train = 87, val = 22 Epoch 1/10 87/87 [==============================] - 15s 174ms/step - loss: 0.2165 - mae: 0.2144 - r2: -6.4761 - val_loss: 0.0247 - val_mae: 0.1244 - val_r2: 0.1916 Current time ....15.176 Epoch 2/10 87/87 [==============================] - 12s 142ms/step - loss: 0.0247 - mae: 0.1240 - r2: 0.1302 - val_loss: 0.0208 - val_mae: 0.1147 - val_r2: 0.3058 Current time ....28.323 Epoch 3/10 87/87 [==============================] - 12s 143ms/step - loss: 0.0219 - mae: 0.1157 - r2: 0.2278 - val_loss: 0.0197 - val_mae: 0.1112 - val_r2: 0.3565 Current time ....41.321 Epoch 4/10 87/87 [==============================] - 12s 143ms/step - loss: 0.0203 - mae: 0.1111 - r2: 0.2897 - val_loss: 0.0182 - val_mae: 0.1072 - val_r2: 0.3980 Current time ....54.330 Epoch 5/10 87/87 [==============================] - 13s 153ms/step - loss: 0.0187 - mae: 0.1066 - r2: 0.3388 - val_loss: 0.0189 - val_mae: 0.1090 - val_r2: 0.3804 Current time ....68.120 Epoch 6/10 87/87 [==============================] - 13s 148ms/step - loss: 0.0185 - mae: 0.1075 - r2: 0.3412 - val_loss: 0.0186 - val_mae: 0.1088 - 
val_r2: 0.3921 Current time ....80.967 Epoch 7/10 87/87 [==============================] - 13s 147ms/step - loss: 0.0185 - mae: 0.1069 - r2: 0.3468 - val_loss: 0.0177 - val_mae: 0.1043 - val_r2: 0.4259 Current time ....93.769 Epoch 8/10 87/87 [==============================] - 13s 150ms/step - loss: 0.0176 - mae: 0.1031 - r2: 0.3791 - val_loss: 0.0159 - val_mae: 0.0994 - val_r2: 0.4793 Current time ....107.421 Epoch 9/10 87/87 [==============================] - 13s 150ms/step - loss: 0.0177 - mae: 0.1034 - r2: 0.3745 - val_loss: 0.0161 - val_mae: 0.1000 - val_r2: 0.4696 Current time ....120.945 Epoch 10/10 87/87 [==============================] - 14s 159ms/step - loss: 0.0169 - mae: 0.1022 - r2: 0.4086 - val_loss: 0.0173 - val_mae: 0.1029 - val_r2: 0.4337 Current time ....134.744 Comparing y_true and y_pred: mse: 0.0165 mae: 0.1016 r2: 0.4782 corr: 0.7072 Testing predictions stored in file: save_default/uno.A=relu.B=32.E=10.O=sgd.LS=mse.LR=0.01.CF=r.DF=df.DR=0.1.L1000.D1=1000.D2=1000.D3=1000.predicted.tsv Model stored in file: save_default/uno.A=relu.B=32.E=10.O=sgd.LS=mse.LR=0.01.CF=r.DF=df.DR=0.1.L1000.D1=1000.D2=1000.D3=1000.model.json Model stored in file: save_default/uno.A=relu.B=32.E=10.O=sgd.LS=mse.LR=0.01.CF=r.DF=df.DR=0.1.L1000.D1=1000.D2=1000.D3=1000.model.h5 Model weights stored in file: save_default//default.weights.h5 partition:test, rank:0, sharded index size:0, batch_size:32, steps:0 ``` ## Infer The infer script does inference on a trained model, as in the underlying Uno benchmark. This script is able to use a pre-generated file or it can construct the data to do inference if a set of identifiers are provided. The argument --uq_infer_file must be used to specify the name of the file with the data (or the identifiers) to do inference. Additionally, if the data needs to be constructed, then one of the following arguments should be used to specify what type of identifiers are provided - --uq_infer_given_drugs=True if the file contains a set of drug IDs. - --uq_infer_given_cells=True if the file contains a set of cell IDs. - --uq_infer_given_indices=True if the file contains a set of indices. Note that the latter works if all the arguments for the data construction are set as well (usually those are taken from the model configuration file). Of course this specification and the trained model should be consistent for the script to work. Likewise, in the case that a pre-generated file is provided, the features included and the trained model should be consistent for the script to work. Note also that the --loss heteroscedastic option should be specified if the model was trained to predict the heterogeneous noise as well. #### Example output This assumes that a trained model (files default.model.json and default.weights.h5) is available at save_default folder. A sample json file compatible with the default model used in the training demo script is provided. After running the training script a default.weights.h5 file should be generated. Both, in combination, can be used for testing the inference demo script and would produce a similar output to the one shown next. ``` python uno_inferUQ_keras2.py --uq_infer_file save_default/infer_cell_ids --uq_infer_given_cells True --model_file save_default/uno.A\=relu.B\=32.E\=10.O\=sgd.LS\=mse.LR\=0.01.CF\=r.DF\=df.DR\=0.1.L1000.D1\=1000.D2\=1000.D3\=1000.model.h5 --weights_file save_default/saved.weights.h5 --n_pred 10 Using TensorFlow backend. 
Importing candle utils for keras Params: {'activation': 'relu', 'agg_dose': 'AUC', 'base_lr': None, 'batch_normalization': False, 'batch_size': 32, 'by_cell': None, 'by_drug': None, 'cache': None, 'cell_feature_subset_path': '', 'cell_features': ['rnaseq'], 'cell_subset_path': '', 'cell_types': None, 'cp': False, 'cv': 1, 'data_type': , 'dense': [1000, 1000, 1000], 'dense_feature_layers': [1000, 1000, 1000], 'dropout': 0.1, 'drug_feature_subset_path': '', 'drug_features': ['descriptors', 'fingerprints'], 'drug_median_response_max': 1, 'drug_median_response_min': -1, 'drug_subset_path': '', 'epochs': 10, 'exclude_cells': [], 'exclude_drugs': [], 'experiment_id': 'EXP000', 'export_csv': None, 'export_data': None, 'feature_subsample': 0, 'feature_subset_path': '', 'gpus': [], 'growth_bins': 0, 'initial_weights': None, 'learning_rate': 0.01, 'logfile': None, 'loss': 'mse', 'max_val_loss': 1.0, 'model_file': 'save_default/default.model.json', 'n_pred': 10, 'no_feature_source': True, 'no_gen': False, 'no_response_source': True, 'optimizer': 'sgd', 'output_dir': './Output/EXP000/RUN000', 'partition_by': 'cell', 'preprocess_rnaseq': 'none', 'profiling': False 'reduce_lr': False, 'residual': False, 'rng_seed': 2018, 'run_id': 'RUN000', 'sample_repetition': False, 'save_path': 'save_default/', 'save_weights': None, 'scaling': 'std', 'shuffle': False, 'single': True, 'solr_root': '', 'tb': False, 'tb_prefix': 'tb', 'test_sources': ['train'], 'timeout': 3600, 'train_bool': True, 'train_sources': ['gCSI'], 'uq_infer_file': 'save_default/infer_cell_ids', 'uq_infer_given_cells': True, 'uq_infer_given_drugs': False, 'uq_infer_given_indices': False, 'use_exported_data': None, 'use_filtered_genes': False, 'use_landmark_genes': True, 'val_split': 0.2, 'verbose': None, 'warmup_lr': False, 'weights_file': 'save_default/saved.weights.h5'} __________________________________________________________________________________________________ Layer (type) Output Shape Param # Connected to ================================================================================================== input.cell.rnaseq (InputLayer) (None, 942) 0 __________________________________________________________________________________________________ input.drug1.descriptors (InputL (None, 5270) 0 __________________________________________________________________________________________________ input.drug1.fingerprints (Input (None, 2048) 0 __________________________________________________________________________________________________ cell.rnaseq (Model) (None, 1000) 2945000 input.cell.rnaseq[0][0] __________________________________________________________________________________________________ drug.descriptors (Model) (None, 1000) 7273000 input.drug1.descriptors[0][0] __________________________________________________________________________________________________ drug.fingerprints (Model) (None, 1000) 4051000 input.drug1.fingerprints[0][0] __________________________________________________________________________________________________ concatenate_1 (Concatenate) (None, 3000) 0 cell.rnaseq[1][0] drug.descriptors[1][0] drug.fingerprints[1][0] __________________________________________________________________________________________________ dense_10 (Dense) (None, 1000) 3001000 concatenate_1[0][0] __________________________________________________________________________________________________ permanent_dropout_10 (Permanent (None, 1000) 0 dense_10[0][0] 
__________________________________________________________________________________________________ dense_11 (Dense) (None, 1000) 1001000 permanent_dropout_10[0][0] __________________________________________________________________________________________________ permanent_dropout_11 (Permanent (None, 1000) 0 dense_11[0][0] __________________________________________________________________________________________________ dense_12 (Dense) (None, 1000) 1001000 permanent_dropout_11[0][0] __________________________________________________________________________________________________ permanent_dropout_12 (Permanent (None, 1000) 0 dense_12[0][0] __________________________________________________________________________________________________ dense_13 (Dense) (None, 1) 1001 permanent_dropout_12[0][0] ================================================================================================== Total params: 19,273,001 Trainable params: 19,273,001 Non-trainable params: 0 __________________________________________________________________________________________________ partition:test, rank:0, sharded index size:0, batch_size:32, steps:0 Read file: save_default/infer_cell_ids Number of elements read: 72 Comparing y_true and y_pred: mse: 0.0173 mae: 0.1012 r2: 0.4687 corr: 0.7001 Comparing y_true and y_pred: mse: 0.0172 mae: 0.1005 r2: 0.4720 corr: 0.7010 Comparing y_true and y_pred: mse: 0.0171 mae: 0.1033 r2: 0.4751 corr: 0.7064 Comparing y_true and y_pred: mse: 0.0175 mae: 0.1045 r2: 0.4627 corr: 0.6945 Comparing y_true and y_pred: mse: 0.0162 mae: 0.1007 r2: 0.5017 corr: 0.7277 Comparing y_true and y_pred: mse: 0.0166 mae: 0.1008 r2: 0.4921 corr: 0.7141 Comparing y_true and y_pred: mse: 0.0181 mae: 0.1059 r2: 0.4443 corr: 0.6878 Comparing y_true and y_pred: mse: 0.0167 mae: 0.1015 r2: 0.4875 corr: 0.7087 Comparing y_true and y_pred: mse: 0.0169 mae: 0.1032 r2: 0.4805 corr: 0.7106 Comparing y_true and y_pred: mse: 0.0169 mae: 0.0999 r2: 0.4817 corr: 0.7075 Predictions stored in file: save_default/uno.A=relu.B=32.E=10.O=sgd.LS=mse.LR=None.CF=r.DF=df.DR=0.1.L1000.D1=1000.D2=1000.D3=1000.predicted_INFER.tsv ``` ## Empirical Calibration Scripts included in the calibration subfolder compute empirical calibration for the inference results. The scripts with suffix HOM compute empirical calibration for inference with homoscedastic model, while the script with suffix HET computes empirical calibration for inference with a heteroscedastic model. To run the scripts it is necessary to provide the path to the file and the file with the inference results. Note that it is assumed that the file with the inference results includes each realization of the inference (implicit in the 'all' suffix), but for the homoscedastic case a script is provided to process an inference file with only the consolidated statistics (generally the average over all the realizations). Also, note that a specific format of the file with the inference results is assumed. Thus, a set of default values, reflecting the format of current CANDLE infer scripts, is used. More arbitrary formats may be usable, if they incurr in similar column offsets, but it would require passing the right parameters to the function reading the inference file. The script generates a series of plots and pickle (dill) files, displaying and encoding the empirical calibration computed. \ No newline at end of file +## Uno_UQ: Predicting Tumor Dose Response across Multiple Data Sources with added UQ functionality. 
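The repeated "Comparing y_true and y_pred" blocks above come from running inference several times on the same samples (--n_pred), and the empirical calibration step relates the spread of those realizations to the observed errors. As a rough, self-contained sketch of that idea only — not the CANDLE implementation, and with assumed array names and shapes rather than the actual format written by the infer script — the per-sample mean, spread, and empirical coverage of a ±k·std band could be computed like this:

```python
# Illustrative sketch: aggregate repeated inference realizations into a mean and
# spread per sample, then measure how often the truth falls inside a +/- k*std band.
# Shapes and names here are assumptions, not the CANDLE inference file format.
import numpy as np

def empirical_coverage(y_true, realizations, k=2.0):
    """realizations: shape (n_pred, n_samples); y_true: shape (n_samples,)."""
    mu = realizations.mean(axis=0)
    sigma = realizations.std(axis=0)
    covered = np.abs(y_true - mu) <= k * sigma
    return mu, sigma, covered.mean()

# toy example with synthetic numbers
rng = np.random.default_rng(0)
y = rng.normal(size=100)
preds = y + rng.normal(scale=0.1, size=(10, 100))
mu, sigma, coverage = empirical_coverage(y, preds)
print('empirical coverage of the 2-sigma band:', coverage)
```

A well-calibrated model yields an empirical coverage close to the nominal value of the chosen band (roughly 95% for k=2 under a Gaussian assumption).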
+ + + +## Functionality + +Uno_UQ adds uncertainty quantification (UQ) functionality to the Uno model. For information about the underlaying model, please refer to the Uno benchmark. + + + +This page overviews the added UQ functionality provided, which includes: + +- Generation of holdout set. + +- Training excluding the holdout set. + +- Inference for the specified data. + +- Training for homoscedastic and heteroscedastic models. + +- Empirical calibration of UQ for the trained models. + + + +## Holdout + +The holdout script generates a set of identifiers to holdout during training, depending on the --partition_by argument. + +If --partition_by is 'drug_pair' it generates a set of drug IDs. + +If --partition_by is 'cell' it generates a set of cell IDs. + +In any other case it generates a set of indices. + + + +The fraction to reserve in the holdout set is given by the --val_split argument. + + + +#### Example output + +``` +python uno_holdoutUQ_data.py +Using TensorFlow backend. +Importing candle utils for keras +Params: +{'activation': 'relu', + 'agg_dose': 'AUC', + 'base_lr': None, + 'batch_normalization': False, + 'batch_size': 32, + 'by_cell': None, + 'by_drug': None, + 'cache': None, + 'cell_feature_subset_path': '', + 'cell_features': ['rnaseq'], + 'cell_subset_path': '', + 'cell_types': None, + 'cp': False, + 'cv': 1, + 'data_type': , + 'dense': [1000, 1000, 1000], + 'dense_feature_layers': [1000, 1000, 1000], + 'dropout': 0.1, + 'drug_feature_subset_path': '', + 'drug_features': ['descriptors', 'fingerprints'], + 'drug_median_response_max': 1, + 'drug_median_response_min': -1, + 'drug_subset_path': '', + 'epochs': 10, + 'exclude_cells': [], + 'exclude_drugs': [], + 'experiment_id': 'EXP000', + 'export_csv': None, + 'export_data': None, + 'feature_subsample': 0, + 'feature_subset_path': '', + 'gpus': [], + 'growth_bins': 0, + 'initial_weights': None, + 'learning_rate': 0.01, + 'logfile': None, + 'loss': 'mse', + 'max_val_loss': 1.0, + 'no_feature_source': True, + 'no_gen': False, + 'no_response_source': True, + 'optimizer': 'sgd', + 'output_dir': './Output/EXP000/RUN000', + 'partition_by': 'cell', + 'preprocess_rnaseq': 'none', + 'profiling': False, + 'reduce_lr': False, + 'residual': False, + 'rng_seed': 2018, + 'run_id': 'RUN000', + 'sample_repetition': False, + 'save_path': 'save_default/', + 'save_weights': 'default.weights.h5', + 'scaling': 'std', + 'shuffle': False, + 'single': True, + 'solr_root': '', + 'tb': False, + 'tb_prefix': 'tb', + 'test_sources': ['train'], + 'timeout': 3600, + 'train_bool': True, + 'train_sources': ['gCSI'], + 'use_exported_data': None, + 'use_filtered_genes': False, + 'use_landmark_genes': True, + 'val_split': 0.2, + 'verbose': None, + 'warmup_lr': False} +partition_by: cell +Cell IDs in holdout set written in file: save_default/infer_cell_ids + +``` + + + +## Train + +The train script trains the model, as in the underlying Uno benchmark, but excluding the IDs in the holdout file. The file with the holdout set should be provided via one of the following arguments + +- --uq_exclude_drugs_file='file' if the file contains a set of drug IDs. + +- --uq_exclude_cells_file='file' if the file contains a set of cell IDs. + +- --uq_exclude_indices_file='file' if the file contains a set of indices. + + + +An additional --loss heteroscedastic option is available. This will learn the input-dependent noise level as well as the main regression variable specified (i.e. growth or AUC). 
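For intuition, a heteroscedastic loss of this kind is typically a Gaussian negative log-likelihood in which the network emits both a predicted mean and a predicted log-variance per sample. The sketch below shows the general form only, under the assumption of a two-column output layer; it is not the exact loss defined in uno_trainUQ_keras2.py or the CANDLE library.

```python
# Generic heteroscedastic Gaussian NLL sketch (illustrative; the actual CANDLE loss
# may differ in shape conventions and scaling). The model's last layer is assumed
# to output two columns per sample: predicted mean and predicted log-variance.
from tensorflow.keras import backend as K  # or: from keras import backend as K

def heteroscedastic_nll(y_true, y_pred):
    mean = y_pred[:, 0:1]
    log_var = y_pred[:, 1:2]
    # 0.5 * [ (y - mu)^2 / sigma^2 + log(sigma^2) ], averaged over the batch
    return K.mean(0.5 * K.exp(-log_var) * K.square(y_true - mean) + 0.5 * log_var)
```

Minimizing a term like this lets the network inflate the predicted variance where the response (growth or AUC) is intrinsically noisy, instead of paying the full squared-error penalty there.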
+ + + +#### Example output + +``` + +python uno_trainUQ_keras2.py --cp True --uq_exclude_cells_file 'save_default/infer_cell_ids' + +Using TensorFlow backend. +Importing candle utils for keras +Params: +{'activation': 'relu', + 'agg_dose': 'AUC', + 'base_lr': None, + 'batch_normalization': False, + 'batch_size': 32, + 'by_cell': None, + 'by_drug': None, + 'cache': None, + 'cell_feature_subset_path': '', + 'cell_features': ['rnaseq'], + 'cell_subset_path': '', + 'cell_types': None, + 'cp': True, + 'cv': 1, + 'data_type': , + 'dense': [1000, 1000, 1000], + 'dense_feature_layers': [1000, 1000, 1000], + 'dropout': 0.1, + 'drug_feature_subset_path': '', + 'drug_features': ['descriptors', 'fingerprints'], + 'drug_median_response_max': 1, + 'drug_median_response_min': -1, + 'drug_subset_path': '', + 'epochs': 10, + 'exclude_cells': [], + 'exclude_drugs': [], + 'exclude_indices': [], + 'experiment_id': 'EXP000', + 'export_csv': None, + 'export_data': None, + 'feature_subsample': 0, + 'feature_subset_path': '', + 'gpus': [], + 'growth_bins': 0, + 'initial_weights': None, + 'learning_rate': 0.01, + 'logfile': None, + 'loss': 'mse', + 'max_val_loss': 1.0, + 'no_feature_source': True, + 'no_gen': False, + 'no_response_source': True, + 'optimizer': 'sgd', + 'output_dir': './Output/EXP000/RUN000', + 'partition_by': 'cell', + 'preprocess_rnaseq': 'none', + 'reduce_lr': False, + 'reg_l2': 0.0, + 'residual': False, + 'rng_seed': 2018, + 'run_id': 'RUN000', + 'sample_repetition': False, + 'save_path': 'save_default/', + 'save_weights': 'saved.weights.h5', + 'scaling': 'std', + 'shuffle': False, + 'single': True, + 'solr_root': '', + 'tb': False, + 'tb_prefix': 'tb', + 'test_sources': ['train'], + 'timeout': 3600, + 'train_bool': True, + 'train_sources': ['gCSI'], + 'uq_exclude_cells_file': 'save_default/infer_cell_ids', + 'use_exported_data': None, + 'use_filtered_genes': False, + 'use_landmark_genes': True, + 'val_split': 0.2, + 'verbose': None, + 'warmup_lr': False} +Read file: save_default/infer_cell_ids +Number of elements read: 72 +Cells to exclude: ['gCSI.NCI-H889', 'gCSI.MEWO', 'gCSI.PA-TU-8902', 'gCSI.BCPAP', 'gCSI.CAL-12T', 'gCSI.NCI-H727', 'gCSI.HUH-1', 'gCSI.NUGC-4', 'gCSI.MKN74', 'gCSI.PK-1', 'gCSI.A2058', 'gCSI.RAJI', 'gCSI.JHH-7', 'gCSI.SUIT-2', 'gCSI.OE21', 'gCSI.HCC1806', 'gCSI.PANC-10-05', 'gCSI.RMG-I', 'gCSI.NCI-H1703', 'gCSI.KMS-34', 'gCSI.G-361', 'gCSI.EPLC-272H', 'gCSI.HEP-G2', 'gCSI.RERF-LC-MS', 'gCSI.COLO-800', 'gCSI.KM12', 'gCSI.DOHH-2', 'gCSI.EFM-19', 'gCSI.MDA-MB-468', 'gCSI.MHH-ES-1', 'gCSI.IPC-298', 'gCSI.GRANTA-519', 'gCSI.8305C', 'gCSI.KYSE-140', 'gCSI.MALME-3M', 'gCSI.MIA-PACA-2', 'gCSI.NCI-H1666', 'gCSI.PC-3', 'gCSI.RT4', 'gCSI.HUP-T4', 'gCSI.NCI-H1869', 'gCSI.WM-266-4', 'gCSI.KMM-1', 'gCSI.OE33', 'gCSI.SU-DHL-6', 'gCSI.QGP-1', 'gCSI.IGR-37', 'gCSI.VMRC-RCW', 'gCSI.NCI-H1838', 'gCSI.SW948', 'gCSI.COLO-679', 'gCSI.CAL-51', 'gCSI.HUCCT1', 'gCSI.LP-1', 'gCSI.RPMI-7951', 'gCSI.HPAF-II', 'gCSI.OCUM-1', 'gCSI.HOP-92', 'gCSI.NCI-H661', 'gCSI.TOV-112D', 'gCSI.PANC-03-27', 'gCSI.AGS', 'gCSI.HEC-59', 'gCSI.LN-18', 'gCSI.U-87-MG', 'gCSI.U-2-OS', 'gCSI.ABC-1', 'gCSI.IGR-1', 'gCSI.SK-MEL-3', 'gCSI.A549', 'gCSI.HCC4006', 'gCSI.NCI-H1355'] +Combined model: +__________________________________________________________________________________________________ +Layer (type) Output Shape Param # Connected to +================================================================================================== +input.cell.rnaseq (InputLayer) (None, 942) 0 
+__________________________________________________________________________________________________ +input.drug1.descriptors (InputL (None, 5270) 0 +__________________________________________________________________________________________________ +input.drug1.fingerprints (Input (None, 2048) 0 +__________________________________________________________________________________________________ +cell.rnaseq (Model) (None, 1000) 2945000 input.cell.rnaseq[0][0] +__________________________________________________________________________________________________ +drug.descriptors (Model) (None, 1000) 7273000 input.drug1.descriptors[0][0] +__________________________________________________________________________________________________ +drug.fingerprints (Model) (None, 1000) 4051000 input.drug1.fingerprints[0][0] +__________________________________________________________________________________________________ +concatenate_1 (Concatenate) (None, 3000) 0 cell.rnaseq[1][0] + drug.descriptors[1][0] + drug.fingerprints[1][0] +__________________________________________________________________________________________________ +dense_10 (Dense) (None, 1000) 3001000 concatenate_1[0][0] +__________________________________________________________________________________________________ +permanent_dropout_10 (Permanent (None, 1000) 0 dense_10[0][0] +__________________________________________________________________________________________________ +dense_11 (Dense) (None, 1000) 1001000 permanent_dropout_10[0][0] +__________________________________________________________________________________________________ +permanent_dropout_11 (Permanent (None, 1000) 0 dense_11[0][0] +__________________________________________________________________________________________________ +dense_12 (Dense) (None, 1000) 1001000 permanent_dropout_11[0][0] +__________________________________________________________________________________________________ +permanent_dropout_12 (Permanent (None, 1000) 0 dense_12[0][0] +__________________________________________________________________________________________________ +dense_13 (Dense) (None, 1) 1001 permanent_dropout_12[0][0] +================================================================================================== +Total params: 19,273,001 +Trainable params: 19,273,001 +Non-trainable params: 0 +__________________________________________________________________________________________________ +Training homoscedastic model: +partition:train, rank:0, sharded index size:2784, batch_size:32, steps:87 +partition:val, rank:0, sharded index size:704, batch_size:32, steps:22 +Between random pairs in y_val: + mse: 0.0604 + mae: 0.1978 + r2: -0.9105 + corr: 0.0447 +Data points per epoch: train = 2784, val = 704 +Steps per epoch: train = 87, val = 22 +Epoch 1/10 +87/87 [==============================] - 15s 174ms/step - loss: 0.2165 - mae: 0.2144 - r2: -6.4761 - val_loss: 0.0247 - val_mae: 0.1244 - val_r2: 0.1916 +Current time ....15.176 +Epoch 2/10 +87/87 [==============================] - 12s 142ms/step - loss: 0.0247 - mae: 0.1240 - r2: 0.1302 - val_loss: 0.0208 - val_mae: 0.1147 - val_r2: 0.3058 +Current time ....28.323 +Epoch 3/10 +87/87 [==============================] - 12s 143ms/step - loss: 0.0219 - mae: 0.1157 - r2: 0.2278 - val_loss: 0.0197 - val_mae: 0.1112 - val_r2: 0.3565 +Current time ....41.321 +Epoch 4/10 +87/87 [==============================] - 12s 143ms/step - loss: 0.0203 - mae: 0.1111 - r2: 0.2897 - val_loss: 0.0182 - val_mae: 0.1072 - val_r2: 0.3980 +Current 
time ....54.330 +Epoch 5/10 +87/87 [==============================] - 13s 153ms/step - loss: 0.0187 - mae: 0.1066 - r2: 0.3388 - val_loss: 0.0189 - val_mae: 0.1090 - val_r2: 0.3804 +Current time ....68.120 +Epoch 6/10 +87/87 [==============================] - 13s 148ms/step - loss: 0.0185 - mae: 0.1075 - r2: 0.3412 - val_loss: 0.0186 - val_mae: 0.1088 - val_r2: 0.3921 +Current time ....80.967 +Epoch 7/10 +87/87 [==============================] - 13s 147ms/step - loss: 0.0185 - mae: 0.1069 - r2: 0.3468 - val_loss: 0.0177 - val_mae: 0.1043 - val_r2: 0.4259 +Current time ....93.769 +Epoch 8/10 +87/87 [==============================] - 13s 150ms/step - loss: 0.0176 - mae: 0.1031 - r2: 0.3791 - val_loss: 0.0159 - val_mae: 0.0994 - val_r2: 0.4793 +Current time ....107.421 +Epoch 9/10 +87/87 [==============================] - 13s 150ms/step - loss: 0.0177 - mae: 0.1034 - r2: 0.3745 - val_loss: 0.0161 - val_mae: 0.1000 - val_r2: 0.4696 +Current time ....120.945 +Epoch 10/10 +87/87 [==============================] - 14s 159ms/step - loss: 0.0169 - mae: 0.1022 - r2: 0.4086 - val_loss: 0.0173 - val_mae: 0.1029 - val_r2: 0.4337 +Current time ....134.744 +Comparing y_true and y_pred: + mse: 0.0165 + mae: 0.1016 + r2: 0.4782 + corr: 0.7072 +Testing predictions stored in file: save_default/uno.A=relu.B=32.E=10.O=sgd.LS=mse.LR=0.01.CF=r.DF=df.DR=0.1.L1000.D1=1000.D2=1000.D3=1000.predicted.tsv +Model stored in file: save_default/uno.A=relu.B=32.E=10.O=sgd.LS=mse.LR=0.01.CF=r.DF=df.DR=0.1.L1000.D1=1000.D2=1000.D3=1000.model.json +Model stored in file: save_default/uno.A=relu.B=32.E=10.O=sgd.LS=mse.LR=0.01.CF=r.DF=df.DR=0.1.L1000.D1=1000.D2=1000.D3=1000.model.h5 +Model weights stored in file: save_default//default.weights.h5 +partition:test, rank:0, sharded index size:0, batch_size:32, steps:0 + +``` + + + +## Infer + +The infer script does inference on a trained model, as in the underlying Uno benchmark. This script is able to use a pre-generated file or it can construct the data to do inference if a set of identifiers are provided. + + + +The argument --uq_infer_file must be used to specify the name of the file with the data (or the identifiers) to do inference. + + + +Additionally, if the data needs to be constructed, then one of the following arguments should be used to specify what type of identifiers are provided + +- --uq_infer_given_drugs=True if the file contains a set of drug IDs. + +- --uq_infer_given_cells=True if the file contains a set of cell IDs. + +- --uq_infer_given_indices=True if the file contains a set of indices. + + + +Note that the latter works if all the arguments for the data construction are set as well (usually those are taken from the model configuration file). Of course this specification and the trained model should be consistent for the script to work. + + + +Likewise, in the case that a pre-generated file is provided, the features included and the trained model should be consistent for the script to work. + + + +Note also that the --loss heteroscedastic option should be specified if the model was trained to predict the heterogeneous noise as well. + + + +#### Example output + +This assumes that a trained model (files default.model.json and default.weights.h5) is available at save_default folder. A sample json file compatible with the default model used in the training demo script is provided. After running the training script a default.weights.h5 file should be generated. 
Both, in combination, can be used for testing the inference demo script and would produce a similar output to the one shown next. + +``` + +python uno_inferUQ_keras2.py --uq_infer_file save_default/infer_cell_ids --uq_infer_given_cells True --model_file save_default/uno.A\=relu.B\=32.E\=10.O\=sgd.LS\=mse.LR\=0.01.CF\=r.DF\=df.DR\=0.1.L1000.D1\=1000.D2\=1000.D3\=1000.model.h5 --weights_file save_default/saved.weights.h5 --n_pred 10 +Using TensorFlow backend. +Importing candle utils for keras +Params: +{'activation': 'relu', + 'agg_dose': 'AUC', + 'base_lr': None, + 'batch_normalization': False, + 'batch_size': 32, + 'by_cell': None, + 'by_drug': None, + 'cache': None, + 'cell_feature_subset_path': '', + 'cell_features': ['rnaseq'], + 'cell_subset_path': '', + 'cell_types': None, + 'cp': False, + 'cv': 1, + 'data_type': , + 'dense': [1000, 1000, 1000], + 'dense_feature_layers': [1000, 1000, 1000], + 'dropout': 0.1, + 'drug_feature_subset_path': '', + 'drug_features': ['descriptors', 'fingerprints'], + 'drug_median_response_max': 1, + 'drug_median_response_min': -1, + 'drug_subset_path': '', + 'epochs': 10, + 'exclude_cells': [], + 'exclude_drugs': [], + 'experiment_id': 'EXP000', + 'export_csv': None, + 'export_data': None, + 'feature_subsample': 0, + 'feature_subset_path': '', + 'gpus': [], + 'growth_bins': 0, + 'initial_weights': None, + 'learning_rate': 0.01, + 'logfile': None, + 'loss': 'mse', + 'max_val_loss': 1.0, + 'model_file': 'save_default/default.model.json', + 'n_pred': 10, + 'no_feature_source': True, + 'no_gen': False, + 'no_response_source': True, + 'optimizer': 'sgd', + 'output_dir': './Output/EXP000/RUN000', + 'partition_by': 'cell', + 'preprocess_rnaseq': 'none', + 'profiling': False + 'reduce_lr': False, + 'residual': False, + 'rng_seed': 2018, + 'run_id': 'RUN000', + 'sample_repetition': False, + 'save_path': 'save_default/', + 'save_weights': None, + 'scaling': 'std', + 'shuffle': False, + 'single': True, + 'solr_root': '', + 'tb': False, + 'tb_prefix': 'tb', + 'test_sources': ['train'], + 'timeout': 3600, + 'train_bool': True, + 'train_sources': ['gCSI'], + 'uq_infer_file': 'save_default/infer_cell_ids', + 'uq_infer_given_cells': True, + 'uq_infer_given_drugs': False, + 'uq_infer_given_indices': False, + 'use_exported_data': None, + 'use_filtered_genes': False, + 'use_landmark_genes': True, + 'val_split': 0.2, + 'verbose': None, + 'warmup_lr': False, + 'weights_file': 'save_default/saved.weights.h5'} +__________________________________________________________________________________________________ +Layer (type) Output Shape Param # Connected to +================================================================================================== +input.cell.rnaseq (InputLayer) (None, 942) 0 +__________________________________________________________________________________________________ +input.drug1.descriptors (InputL (None, 5270) 0 +__________________________________________________________________________________________________ +input.drug1.fingerprints (Input (None, 2048) 0 +__________________________________________________________________________________________________ +cell.rnaseq (Model) (None, 1000) 2945000 input.cell.rnaseq[0][0] +__________________________________________________________________________________________________ +drug.descriptors (Model) (None, 1000) 7273000 input.drug1.descriptors[0][0] +__________________________________________________________________________________________________ +drug.fingerprints (Model) (None, 1000) 4051000 
input.drug1.fingerprints[0][0] +__________________________________________________________________________________________________ +concatenate_1 (Concatenate) (None, 3000) 0 cell.rnaseq[1][0] + drug.descriptors[1][0] + drug.fingerprints[1][0] +__________________________________________________________________________________________________ +dense_10 (Dense) (None, 1000) 3001000 concatenate_1[0][0] +__________________________________________________________________________________________________ +permanent_dropout_10 (Permanent (None, 1000) 0 dense_10[0][0] +__________________________________________________________________________________________________ +dense_11 (Dense) (None, 1000) 1001000 permanent_dropout_10[0][0] +__________________________________________________________________________________________________ +permanent_dropout_11 (Permanent (None, 1000) 0 dense_11[0][0] +__________________________________________________________________________________________________ +dense_12 (Dense) (None, 1000) 1001000 permanent_dropout_11[0][0] +__________________________________________________________________________________________________ +permanent_dropout_12 (Permanent (None, 1000) 0 dense_12[0][0] +__________________________________________________________________________________________________ +dense_13 (Dense) (None, 1) 1001 permanent_dropout_12[0][0] +================================================================================================== +Total params: 19,273,001 +Trainable params: 19,273,001 +Non-trainable params: 0 +__________________________________________________________________________________________________ +partition:test, rank:0, sharded index size:0, batch_size:32, steps:0 +Read file: save_default/infer_cell_ids +Number of elements read: 72 +Comparing y_true and y_pred: + mse: 0.0173 + mae: 0.1012 + r2: 0.4687 + corr: 0.7001 +Comparing y_true and y_pred: + mse: 0.0172 + mae: 0.1005 + r2: 0.4720 + corr: 0.7010 +Comparing y_true and y_pred: + mse: 0.0171 + mae: 0.1033 + r2: 0.4751 + corr: 0.7064 +Comparing y_true and y_pred: + mse: 0.0175 + mae: 0.1045 + r2: 0.4627 + corr: 0.6945 +Comparing y_true and y_pred: + mse: 0.0162 + mae: 0.1007 + r2: 0.5017 + corr: 0.7277 +Comparing y_true and y_pred: + mse: 0.0166 + mae: 0.1008 + r2: 0.4921 + corr: 0.7141 +Comparing y_true and y_pred: + mse: 0.0181 + mae: 0.1059 + r2: 0.4443 + corr: 0.6878 +Comparing y_true and y_pred: + mse: 0.0167 + mae: 0.1015 + r2: 0.4875 + corr: 0.7087 +Comparing y_true and y_pred: + mse: 0.0169 + mae: 0.1032 + r2: 0.4805 + corr: 0.7106 +Comparing y_true and y_pred: + mse: 0.0169 + mae: 0.0999 + r2: 0.4817 + corr: 0.7075 +Predictions stored in file: save_default/uno.A=relu.B=32.E=10.O=sgd.LS=mse.LR=None.CF=r.DF=df.DR=0.1.L1000.D1=1000.D2=1000.D3=1000.predicted_INFER.tsv +``` + + + +## Empirical Calibration + +Scripts included in the calibration subfolder compute empirical calibration for the inference results. The scripts with suffix HOM compute empirical calibration for inference with homoscedastic model, while the script with suffix HET computes empirical calibration for inference with a heteroscedastic model. + + + +To run the scripts it is necessary to provide the path to the file and the file with the inference results. 
Note that it is assumed that the file with the inference results includes each realization of the inference (implicit in the 'all' suffix), but for the homoscedastic case a script is provided to process an inference file with only the consolidated statistics (generally the average over all the realizations). Also, note that a specific format of the file with the inference results is assumed. Thus, a set of default values, reflecting the format of current CANDLE infer scripts, is used. More arbitrary formats may be usable, if they incurr in similar column offsets, but it would require passing the right parameters to the function reading the inference file. + + + +The script generates a series of plots and pickle (dill) files, displaying and encoding the empirical calibration computed. + From da82634991e875003d315afc33c1372e1419282a Mon Sep 17 00:00:00 2001 From: Jamal Date: Wed, 13 May 2020 07:41:32 -0600 Subject: [PATCH 304/331] Removed unused keywords --- Pilot1/P1B1/p1b1.py | 2 +- Pilot1/P1B1/p1b1_default_model.txt | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/Pilot1/P1B1/p1b1.py b/Pilot1/P1B1/p1b1.py index bb6f52a4..d40a9865 100644 --- a/Pilot1/P1B1/p1b1.py +++ b/Pilot1/P1B1/p1b1.py @@ -84,7 +84,7 @@ 'initialization', 'learning_rate', 'loss', - 'noise_factor', + #'noise_factor', 'optimizer', 'rng_seed', 'model', diff --git a/Pilot1/P1B1/p1b1_default_model.txt b/Pilot1/P1B1/p1b1_default_model.txt index 8f915d2e..a2a9051a 100644 --- a/Pilot1/P1B1/p1b1_default_model.txt +++ b/Pilot1/P1B1/p1b1_default_model.txt @@ -14,7 +14,6 @@ learning_rate=None base_lr=None scaling='minmax' model='ae' -noise_factor=0 val_split=0.1 epsilon_std=1.0 rng_seed=2017 From f5a3af1971389fc57f29135649948e60a340f53b Mon Sep 17 00:00:00 2001 From: Jamal Date: Wed, 13 May 2020 08:14:17 -0600 Subject: [PATCH 305/331] Remove deprecated solr_root keywords --- Pilot1/Uno/README.AUC.md | 1 - Pilot1/Uno/README.md | 2 +- Pilot1/Uno/uno_auc_clr_model.txt | 2 +- Pilot1/Uno/uno_auc_model.txt | 3 +-- Pilot1/Uno/uno_by_drug_example.txt | 1 - Pilot1/Uno/uno_clr_model.txt | 39 +++++++++++++++++++++++++++++ Pilot1/Uno/uno_default_model.txt | 1 - Pilot1/Uno/uno_fom_model.txt | 1 - Pilot1/Uno/uno_perf_bench_model.txt | 1 - 9 files changed, 42 insertions(+), 9 deletions(-) create mode 100644 Pilot1/Uno/uno_clr_model.txt diff --git a/Pilot1/Uno/README.AUC.md b/Pilot1/Uno/README.AUC.md index 3b308ece..902adb93 100644 --- a/Pilot1/Uno/README.AUC.md +++ b/Pilot1/Uno/README.AUC.md @@ -72,7 +72,6 @@ Params: 'scaling': 'std', 'shuffle': False, 'single': True, - 'solr_root': '', 'tb': False, 'tb_prefix': 'tb', 'test_sources': ['train'], diff --git a/Pilot1/Uno/README.md b/Pilot1/Uno/README.md index 96f46c45..c2c86f4e 100644 --- a/Pilot1/Uno/README.md +++ b/Pilot1/Uno/README.md @@ -7,7 +7,7 @@ Uno can be trained with a subset of dose response data sources. Here is an comma uno_baseline_keras2.py --train_sources all --cache cache/all --use_landmark_genes True --preprocess_rnaseq source_scale --no_feature_source True --no_response_source True Using TensorFlow backend. 
Params: {'activation': 'relu', 'batch_size': 32, 'dense': [1000, 1000, 1000], 'dense_feature_layers': [1000, 1000, 1000], 'drop': 0, 'epochs': 10, 'learning_rate': None, 'loss': -'mse', 'optimizer': 'adam', 'residual': False, 'rng_seed': 2018, 'save': 'save/uno', 'scaling': 'std', 'feature_subsample': 0, 'validation_split': 0.2, 'solr_root': '', 'timeout' +'mse', 'optimizer': 'adam', 'residual': False, 'rng_seed': 2018, 'save': 'save/uno', 'scaling': 'std', 'feature_subsample': 0, 'validation_split': 0.2, 'timeout' : -1, 'train_sources': ['all'], 'test_sources': ['train'], 'cell_types': None, 'cell_features': ['rnaseq'], 'drug_features': ['descriptors', 'fingerprints'], 'cv': 1, 'max_val_lo ss': 1.0, 'base_lr': None, 'reduce_lr': False, 'warmup_lr': False, 'batch_normalization': False, 'no_gen': False, 'config_file': '/raid/fangfang/Benchmarks/Pilot1/Uno/uno_default _model.txt', 'verbose': False, 'logfile': None, 'train_bool': True, 'shuffle': True, 'alpha_dropout': False, 'gpus': [], 'experiment_id': 'EXP.000', 'run_id': 'RUN.000', 'by_cell diff --git a/Pilot1/Uno/uno_auc_clr_model.txt b/Pilot1/Uno/uno_auc_clr_model.txt index 30893584..363b8467 100644 --- a/Pilot1/Uno/uno_auc_clr_model.txt +++ b/Pilot1/Uno/uno_auc_clr_model.txt @@ -38,9 +38,9 @@ cp=True save_path='save/uno' single=True -timeout=-1 [Monitor_Params] +timeout=-1 [CLR_Params] clr_flag = True diff --git a/Pilot1/Uno/uno_auc_model.txt b/Pilot1/Uno/uno_auc_model.txt index c92a850e..3b1f6e0d 100644 --- a/Pilot1/Uno/uno_auc_model.txt +++ b/Pilot1/Uno/uno_auc_model.txt @@ -38,7 +38,6 @@ cp=True save_path='save/uno' single=True -timeout=-1 [Monitor_Params] -solr_root='' +timeout=-1 diff --git a/Pilot1/Uno/uno_by_drug_example.txt b/Pilot1/Uno/uno_by_drug_example.txt index 851f23c9..81dc30a1 100644 --- a/Pilot1/Uno/uno_by_drug_example.txt +++ b/Pilot1/Uno/uno_by_drug_example.txt @@ -36,5 +36,4 @@ by_drug='paclitaxel' cache='cache.pac' [Monitor_Params] -solr_root='' timeout=3600 diff --git a/Pilot1/Uno/uno_clr_model.txt b/Pilot1/Uno/uno_clr_model.txt new file mode 100644 index 00000000..0ef55e80 --- /dev/null +++ b/Pilot1/Uno/uno_clr_model.txt @@ -0,0 +1,39 @@ +[Global_Params] +train_sources=['GDSC', 'CTRP', 'ALMANAC'] +test_sources=['train'] +cell_types=None +cell_features=['rnaseq'] +drug_features=['descriptors', 'fingerprints'] +dense=[1000, 1000, 1000] +dense_feature_layers=[1000, 1000, 1000] +activation='relu' +loss='mse' +optimizer='adam' +scaling='std' +dropout=0 +epochs=10 +batch_size=32 +val_split=0.2 +cv=1 +max_val_loss=1.0 +learning_rate=None +base_lr=None +residual=False +reduce_lr=True +warmup_lr=True +batch_normalization=False +feature_subsample=0 +rng_seed=2018 +save_path='save/uno' +no_gen=False +verbose = False + +[Monitor_Params] +timeout=3600 + +[CLR_Params] +clr_flag = False +clr_mode = 'trng1' +clr_base_lr = 0.00001 +clr_max_lr = 0.001 +clr_gamma = 0.999 diff --git a/Pilot1/Uno/uno_default_model.txt b/Pilot1/Uno/uno_default_model.txt index 9cf9cff3..8f406d35 100644 --- a/Pilot1/Uno/uno_default_model.txt +++ b/Pilot1/Uno/uno_default_model.txt @@ -29,5 +29,4 @@ no_gen=False verbose = False [Monitor_Params] -solr_root='' timeout=3600 diff --git a/Pilot1/Uno/uno_fom_model.txt b/Pilot1/Uno/uno_fom_model.txt index e84a9442..2c9eb14d 100644 --- a/Pilot1/Uno/uno_fom_model.txt +++ b/Pilot1/Uno/uno_fom_model.txt @@ -34,5 +34,4 @@ no_response_source=True single=True [Monitor_Params] -solr_root='' timeout=-1 diff --git a/Pilot1/Uno/uno_perf_bench_model.txt b/Pilot1/Uno/uno_perf_bench_model.txt index b065fb8c..a35d2b55 100644 
--- a/Pilot1/Uno/uno_perf_bench_model.txt +++ b/Pilot1/Uno/uno_perf_bench_model.txt @@ -30,5 +30,4 @@ verbose = False use_landmark_genes=True [Monitor_Params] -solr_root='' timeout=3600 From 1163b4df8e801b59a3ff9e46d5d3ea9e7cf45f05 Mon Sep 17 00:00:00 2001 From: Harry Yoo Date: Wed, 13 May 2020 12:04:00 -0500 Subject: [PATCH 306/331] add readme --- examples/mnist/README.md | 59 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 examples/mnist/README.md diff --git a/examples/mnist/README.md b/examples/mnist/README.md new file mode 100644 index 00000000..b9f5ab31 --- /dev/null +++ b/examples/mnist/README.md @@ -0,0 +1,59 @@ +# MNIST Example + +This example demonstrate how to convert keras code into CANDLE compliant. +Please refer [tutorial](https://ecp-candle.github.io/Candle/html/tutorials/writing_candle_code.html) for more detail. + +Here is the list of files, + +- mnist.py: CANDLE class +- mnist_cnn.py and mnist_mlp.py: original mnist implementation from keras project +- mnist_cnn_candle.py: mnist_cnn.py converted in CANDLE compliant mode +- mnist_mlp_candle.py: mnist_mlp.py converted in CANDLE compliant mode +- mnist_params.txt: model parameters are stored in a file for reproduciblity + + +``` +$ python mnist_cnn_candle.py -e 3 +Using TensorFlow backend. + +Importing candle utils for keras +Params: +{'activation': 'relu', +'batch_size': 128, +'data_type': , +'epochs': 3, +'experiment_id': 'EXP000', +'gpus': [], +'logfile': None, +'optimizer': 'rmsprop', +'output_dir': '/Users/hsyoo/projects/CANDLE/Benchmarks/examples/mnist/Output/EXP000/RUN000', +'profiling': False, +'rng_seed': 7102, +'run_id': 'RUN000', +'shuffle': False, +'timeout': -1, +'train_bool': True, +'verbose': None} +Downloading data from https://s3.amazonaws.com/img-datasets/mnist.npz +11493376/11490434 [==============================] - 2s 0us/step +x_train shape: (60000, 28, 28, 1) +60000 train samples +10000 test samples +Instructions for updating: +Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`. + +Instructions for updating: +Use tf.where in 2.0, which has the same broadcast rule as np.where +Train on 60000 samples, validate on 10000 samples +Epoch 1/3 +2020-05-13 11:53:17.373979: I tensorflow/core/platform/cpu_feature_guard.cc:145] This TensorFlow binary is optimized with Intel(R) MKL-DNN to use the following CPU instructions in performance critical operations: SSE4.1 SSE4.2 AVX AVX2 FMA +To enable them in non-MKL-DNN operations, rebuild TensorFlow with the appropriate compiler flags. +2020-05-13 11:53:17.374474: I tensorflow/core/common_runtime/process_util.cc:115] Creating new thread pool with default inter op setting: 16. Tune using inter_op_parallelism_threads for best performance. 
+60000/60000 [==============================] - 56s 932us/step - loss: 0.2719 - acc: 0.9157 - val_loss: 0.0683 - val_acc: 0.9774 +Epoch 2/3 +60000/60000 [==============================] - 55s 909us/step - loss: 0.0904 - acc: 0.9733 - val_loss: 0.0411 - val_acc: 0.9872 +Epoch 3/3 +60000/60000 [==============================] - 55s 909us/step - loss: 0.0666 - acc: 0.9808 - val_loss: 0.0339 - val_acc: 0.9893 +Test loss: 0.03386178284487105 +Test accuracy: 0.9893 +``` From 73793046d7a4f2799cd873a36cbed1603723b13d Mon Sep 17 00:00:00 2001 From: Harry Yoo Date: Wed, 13 May 2020 12:13:18 -0500 Subject: [PATCH 307/331] add README --- examples/M16/README.md | 193 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 193 insertions(+) create mode 100644 examples/M16/README.md diff --git a/examples/M16/README.md b/examples/M16/README.md new file mode 100644 index 00000000..f06b590c --- /dev/null +++ b/examples/M16/README.md @@ -0,0 +1,193 @@ +# Feature Selection examples + +The code is for demonstrate feature selection methods that CANDLE provides. + + +``` +$ python M16_test.py + +Importing candle utils for keras +Origin = http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Data_For_Testing/small_drug_descriptor_data_unique_samples.txt +Origin = http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Data_For_Testing/small_drug_response_data.txt +Origin = http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Data_For_Testing/Gene_Expression_Full_Data_Unique_Samples.txt +Origin = http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Data_For_Testing/CCLE_NCI60_Gene_Expression_Full_Data.txt +Origin = http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Gene_Sets/MSigDB.v7.0/c2.cgp.v7.0.entrez.gmt +Origin = http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Gene_Sets/MSigDB.v7.0/c2.cgp.v7.0.symbols.gmt +Origin = http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Gene_Sets/MSigDB.v7.0/c2.cp.biocarta.v7.0.entrez.gmt +Origin = http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Gene_Sets/MSigDB.v7.0/c2.cp.biocarta.v7.0.symbols.gmt +Origin = http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Gene_Sets/MSigDB.v7.0/c2.cp.kegg.v7.0.entrez.gmt +Origin = http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Gene_Sets/MSigDB.v7.0/c2.cp.kegg.v7.0.symbols.gmt +Origin = http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Gene_Sets/MSigDB.v7.0/c2.cp.pid.v7.0.entrez.gmt +Origin = http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Gene_Sets/MSigDB.v7.0/c2.cp.pid.v7.0.symbols.gmt +Origin = http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Gene_Sets/MSigDB.v7.0/c2.cp.reactome.v7.0.entrez.gmt +Origin = http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Gene_Sets/MSigDB.v7.0/c2.cp.reactome.v7.0.symbols.gmt +Origin = 
http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Gene_Sets/MSigDB.v7.0/c5.bp.v7.0.entrez.gmt +Origin = http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Gene_Sets/MSigDB.v7.0/c5.bp.v7.0.symbols.gmt +Origin = http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Gene_Sets/MSigDB.v7.0/c5.cc.v7.0.entrez.gmt +Origin = http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Gene_Sets/MSigDB.v7.0/c5.cc.v7.0.symbols.gmt +Origin = http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Gene_Sets/MSigDB.v7.0/c5.mf.v7.0.entrez.gmt +Origin = http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Gene_Sets/MSigDB.v7.0/c5.mf.v7.0.symbols.gmt +Origin = http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Gene_Sets/MSigDB.v7.0/c6.all.v7.0.entrez.gmt +Origin = http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Gene_Sets/MSigDB.v7.0/c6.all.v7.0.symbols.gmt +Gene Set data is locally stored at /Users/hsyoo/projects/CANDLE/Benchmarks/common/../Data/examples/Gene_Sets/MSigDB.v7.0/ + + +Testing select_features_by_missing_values +Drug descriptor dataframe includes 10 drugs (rows) and 10 drug descriptor features (columns) + MW AMW Sv Se ... Mv Psi_e_1d Psi_e_1s VE3sign_X +Drug_1 475.40 8.804 34.718 54.523 ... 0.643 NaN NaN NaN +Drug_10 457.71 10.898 29.154 43.640 ... 0.694 NaN NaN -2.752 +Drug_100 561.80 6.688 49.975 83.607 ... 0.595 NaN NaN -4.335 +Drug_1000 362.51 6.840 32.794 52.461 ... 0.619 NaN NaN -9.968 +Drug_1001 628.83 7.763 51.593 81.570 ... 0.637 NaN NaN -2.166 +Drug_1002 377.19 10.777 26.191 36.578 ... 0.748 NaN NaN -1.526 +Drug_1003 371.42 8.254 30.896 45.473 ... 0.687 NaN NaN -4.983 +Drug_1004 453.60 8.100 37.949 55.872 ... 0.678 NaN NaN -4.100 +Drug_1005 277.35 7.704 23.940 35.934 ... 0.665 NaN NaN -5.234 +Drug_1006 409.47 8.189 34.423 50.356 ... 0.688 NaN NaN -2.513 + +[10 rows x 10 columns] +Select features with missing rates smaller than 0.1 +Feature IDs [0 1 2 3 4 5 6] +Select features with missing rates smaller than 0.3 +Feature IDs [0 1 2 3 4 5 6 9] + + +Testing select_features_by_variation +Select features with a variance larger than 100 +Feature IDs [0 3 5] +Select the top 2 features with the largest standard deviation +Feature IDs [0 5] + + +Testing select_decorrelated_features +Select features that are not identical to each other and are not all missing. +Feature IDs [0 1 2 3 4 5 6 9] +Select features whose absolute mutual Spearman correlation coefficient is smaller than 0.8 +Feature IDs [0 2 6 9] + + +Testing generate_cross_validation_partition +Generate 5-fold cross-validation partition of 10 samples twice +[[[0, 5], [1, 2, 3, 4, 6, 7, 8, 9]], [[1, 6], [0, 2, 3, 4, 5, 7, 8, 9]], [[2, 7], [0, 1, 3, 4, 5, 6, 8, 9]], [[3, 8], [0, 1, 2, 4, 5, 6, 7, 9]], [[4, 9], [0, 1, 2, 3, 5, 6, 7, 8]], [[5, 8], [0, 1, 2, 3, 4, 6, 7, 9]], [[3, 9], [0, 1, 2, 4, 5, 6, 7, 8]], [[2, 4], [0, 1, 3, 5, 6, 7, 8, 9]], [[1, 7], [0, 2, 3, 4, 5, 6, 8, 9]], [[0, 6], [1, 2, 3, 4, 5, 7, 8, 9]]] +Drug response data of 5 cell lines treated by various drugs. 
+ SOURCE CELL DRUG AUC EC50 EC50se R2fit HS +0 CCLE CCLE.22RV1 CCLE.1 0.7153 5.660 0.6867 0.9533 0.6669 +1 CCLE CCLE.22RV1 CCLE.10 0.9579 7.023 0.7111 0.4332 4.0000 +2 CCLE CCLE.22RV1 CCLE.11 0.4130 7.551 0.0385 0.9948 1.3380 +3 CCLE CCLE.22RV1 CCLE.12 0.8004 5.198 11.7100 0.9944 4.0000 +4 CCLE CCLE.22RV1 CCLE.13 0.5071 7.149 0.3175 0.8069 1.0150 +.. ... ... ... ... ... ... ... ... +95 CCLE CCLE.697 CCLE.12 0.7869 5.278 20.1200 0.8856 4.0000 +96 CCLE CCLE.697 CCLE.13 0.4433 7.474 0.0265 0.9978 3.7080 +97 CCLE CCLE.697 CCLE.14 0.4337 7.466 0.0106 0.9996 3.4330 +98 CCLE CCLE.697 CCLE.15 0.8721 3.097 29.1300 0.4884 0.2528 +99 CCLE CCLE.697 CCLE.16 0.7955 7.496 0.1195 0.9396 1.9560 + +[100 rows x 8 columns] +Generate partition indices to divide the data into 4 sets without shared cell lines for 5 times. +[[[68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91], [44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67], [92, 93, 94, 95, 96, 97, 98, 99], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43]], [[44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67], [92, 93, 94, 95, 96, 97, 98, 99], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], [24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91]], [[92, 93, 94, 95, 96, 97, 98, 99], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], [24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43], [44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91]], [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], [24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43], [68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91], [44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 92, 93, 94, 95, 96, 97, 98, 99]], [[24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43], [68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91], [44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 92, 93, 94, 95, 96, 97, 98, 99]]] +Using TensorFlow backend. +... +found 2 batches +found 0 numerical covariates... +found 0 categorical variables: +Standardizing Data across genes. +Fitting L/S model and finding priors +Finding parametric adjustments + + + +Testing quantile_normalization +Gene expression data of 897 cell lines (columns) and 17741 genes (rows). + CCL_61 CCL_62 CCL_63 ... CCL_1076 CCL_1077 CCL_1078 +entrezID geneSymbol ... +1 A1BG 0.99 0.03 0.36 ... 2.56 3.55 3.04 +29974 A1CF 4.03 3.03 0.00 ... 0.00 0.03 0.00 +2 A2M 2.68 0.03 0.16 ... 0.77 0.31 1.20 +144568 A2ML1 0.07 0.07 0.01 ... 0.01 0.00 1.09 +127550 A3GALT2 0.15 0.00 0.06 ... 2.34 0.00 0.03 +... ... ... ... ... ... ... ... +440590 ZYG11A 0.41 0.06 1.70 ... 
0.75 3.44 2.44 +79699 ZYG11B 4.45 4.23 3.08 ... 4.25 3.61 3.68 +7791 ZYX 4.65 5.72 6.67 ... 7.78 4.12 5.97 +23140 ZZEF1 4.14 3.98 3.90 ... 4.62 3.76 3.54 +26009 ZZZ3 4.77 5.01 3.90 ... 4.38 3.46 3.60 + +[17741 rows x 897 columns] +Before normalization +Max difference of third quartile between cell lines is 1.86 +Max difference of median between cell lines is 2.25 +Max difference of first quartile between cell lines is 0.5 +After normalization +Max difference of third quartile between cell lines is 0.01 +Max difference of median between cell lines is 0.02 +Max difference of first quartile between cell lines is 0.06 + + +Testing generate_gene_set_data +Generate gene-set-level data of 897 cell lines and 189 oncogenic signature gene sets + GLI1_UP.V1_DN GLI1_UP.V1_UP ... LEF1_UP.V1_DN LEF1_UP.V1_UP +CCL_61 -0.031096 0.283946 ... 0.096461 -0.329343 +CCL_62 0.362855 -0.101684 ... 0.426951 -0.477634 +CCL_63 -0.304989 -0.165160 ... 0.036932 -0.201916 +CCL_64 -0.037737 -0.043124 ... 0.154256 -0.210188 +CCL_65 0.102477 0.438871 ... -0.166487 0.287382 +... ... ... ... ... ... +CCL_1074 0.508978 0.137934 ... 0.148213 0.166717 +CCL_1075 -0.145029 0.216169 ... -0.067391 0.258455 +CCL_1076 -0.357758 0.337235 ... 0.008950 0.186134 +CCL_1077 0.086597 -0.266070 ... 0.217244 -0.276022 +CCL_1078 0.374237 -0.428383 ... 0.312984 -0.303721 + +[897 rows x 189 columns] +Generate gene-set-level data of 897 cell lines and 186 KEGG pathways + KEGG_GLYCOLYSIS_GLUCONEOGENESIS ... KEGG_VIRAL_MYOCARDITIS +CCL_61 6.495365 ... -30.504868 +CCL_62 30.679006 ... -7.205641 +CCL_63 10.534238 ... -5.414998 +CCL_64 6.142140 ... -10.555601 +CCL_65 -0.303868 ... -9.784998 +... ... ... ... +CCL_1074 -1.945281 ... 6.891960 +CCL_1075 -21.373730 ... 0.612092 +CCL_1076 -11.711818 ... -10.353794 +CCL_1077 -11.576702 ... -31.679962 +CCL_1078 -10.355489 ... -26.232325 + +[897 rows x 186 columns] + + +Testing combat_batch_effect_removal +Gene expression data of 60 NCI60 cell lines and 1018 CCLE cell lines with 17741 genes. + NCI60.786-0|CCL_1 ... CCLE.ZR7530|CCL_1078 +entrezID geneSymbol ... +1 A1BG 0.00 ... 3.04 +29974 A1CF 0.00 ... 0.00 +2 A2M 0.00 ... 1.20 +144568 A2ML1 0.00 ... 1.09 +127550 A3GALT2 0.00 ... 0.03 +... ... ... ... +440590 ZYG11A 0.01 ... 2.44 +79699 ZYG11B 3.37 ... 3.68 +7791 ZYX 7.05 ... 5.97 +23140 ZZEF1 4.05 ... 3.54 +26009 ZZZ3 4.10 ... 
3.60 + +[17741 rows x 1078 columns] +Before removal of batch effect between NCI60 and CCLE datasets +Average third quartile of NCI60 cell lines is 4.0 +Average median of NCI60 cell lines is 1.71 +Average first quartile of NCI60 cell lines is 0.01 +Average third quartile of CCLE cell lines is 4.88 +Average median of CCLE cell lines is 2.75 +Average first quartile of CCLE cell lines is 0.14 +Adjusting data +After removal of batch effect between NCI60 and CCLE datasets +Average third quartile of NCI60 cell lines is 4.81 +Average median of NCI60 cell lines is 2.65 +Average first quartile of NCI60 cell lines is 0.23 +Average third quartile of CCLE cell lines is 4.83 +Average median of CCLE cell lines is 2.72 +Average first quartile of CCLE cell lines is 0.13 +``` \ No newline at end of file From 383459ababa13ecec2817545e522f017504fa75f Mon Sep 17 00:00:00 2001 From: Harry Yoo Date: Wed, 13 May 2020 12:26:38 -0500 Subject: [PATCH 308/331] add code and output side-by-side --- examples/M16/README.md | 195 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 187 insertions(+), 8 deletions(-) diff --git a/examples/M16/README.md b/examples/M16/README.md index f06b590c..1cf68c40 100644 --- a/examples/M16/README.md +++ b/examples/M16/README.md @@ -2,15 +2,47 @@ The code is for demonstrate feature selection methods that CANDLE provides. +## Download data +Code +```python +# download all the data if needed from the repo +data_url = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Data_For_Testing/' +file_name = 'small_drug_descriptor_data_unique_samples.txt' +drug_descriptor = candle.get_file(file_name, data_url+file_name, cache_subdir='examples') -``` -$ python M16_test.py +file_name = 'small_drug_response_data.txt' +response_data = candle.get_file(file_name, data_url+file_name, cache_subdir='examples') + +file_name = 'Gene_Expression_Full_Data_Unique_Samples.txt' +gene_expression = candle.get_file(file_name, data_url+file_name, cache_subdir='examples') +file_name = 'CCLE_NCI60_Gene_Expression_Full_Data.txt' +ccle_nci60 = candle.get_file(file_name, data_url+file_name, cache_subdir='examples') +``` +Output +```bash Importing candle utils for keras Origin = http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Data_For_Testing/small_drug_descriptor_data_unique_samples.txt Origin = http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Data_For_Testing/small_drug_response_data.txt Origin = http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Data_For_Testing/Gene_Expression_Full_Data_Unique_Samples.txt Origin = http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Data_For_Testing/CCLE_NCI60_Gene_Expression_Full_Data.txt +``` + +## Download gene set +Code +```python +# download all the gene_set files needed +data_url = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Gene_Sets/MSigDB.v7.0/' +for gene_set_category in ['c2.cgp','c2.cp.biocarta','c2.cp.kegg','c2.cp.pid','c2.cp.reactome','c5.bp','c5.cc','c5.mf','c6.all']: + for gene_name_type in ['entrez', 'symbols']: + file_name = gene_set_category+'.v7.0.'+gene_name_type+'.gmt' + local_file = candle.get_file(file_name, data_url+file_name, cache_subdir='examples/Gene_Sets/MSigDB.v7.0') +# extract base directory for gene_set data 
files +data_dir = local_file.split(file_name)[0] +print('Gene Set data is locally stored at ', data_dir) +``` +Output +``` Origin = http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Gene_Sets/MSigDB.v7.0/c2.cgp.v7.0.entrez.gmt Origin = http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Gene_Sets/MSigDB.v7.0/c2.cgp.v7.0.symbols.gmt Origin = http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Gene_Sets/MSigDB.v7.0/c2.cp.biocarta.v7.0.entrez.gmt @@ -30,8 +62,24 @@ Origin = http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_ Origin = http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Gene_Sets/MSigDB.v7.0/c6.all.v7.0.entrez.gmt Origin = http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Gene_Sets/MSigDB.v7.0/c6.all.v7.0.symbols.gmt Gene Set data is locally stored at /Users/hsyoo/projects/CANDLE/Benchmarks/common/../Data/examples/Gene_Sets/MSigDB.v7.0/ +``` - +# Select features based on missing values +Code +```python +print('Testing select_features_by_missing_values') +print('Drug descriptor dataframe includes 10 drugs (rows) and 10 drug descriptor features (columns)') +data = pd.read_csv(drug_descriptor, sep='\t', engine='c', na_values=['na', '-', ''], header=0, index_col=0, low_memory=False) +print(data) +print('Select features with missing rates smaller than 0.1') +id = candle.select_features_by_missing_values(data, threshold=0.1) +print('Feature IDs', id) +print('Select features with missing rates smaller than 0.3') +id = candle.select_features_by_missing_values(data.values, threshold=0.3) +print('Feature IDs', id) +``` +Output +```bash Testing select_features_by_missing_values Drug descriptor dataframe includes 10 drugs (rows) and 10 drug descriptor features (columns) MW AMW Sv Se ... 
Mv Psi_e_1d Psi_e_1s VE3sign_X @@ -51,22 +99,65 @@ Select features with missing rates smaller than 0.1 Feature IDs [0 1 2 3 4 5 6] Select features with missing rates smaller than 0.3 Feature IDs [0 1 2 3 4 5 6 9] +``` +# Select features based on variation +Code +```python +print('Testing select_features_by_variation') +print('Select features with a variance larger than 100') +id = candle.select_features_by_variation(data, variation_measure='var', threshold=100, portion=None, draw_histogram=False) +print('Feature IDs', id) +print('Select the top 2 features with the largest standard deviation') +id = candle.select_features_by_variation(data, variation_measure='std', portion=0.2) +print('Feature IDs', id) +``` +Output +``` Testing select_features_by_variation Select features with a variance larger than 100 Feature IDs [0 3 5] Select the top 2 features with the largest standard deviation Feature IDs [0 5] +``` - +# Select decorrelated features +Code +```python +print('Testing select_decorrelated_features') +print('Select features that are not identical to each other and are not all missing.') +id = candle.select_decorrelated_features(data, threshold=None, random_seed=None) +print('Feature IDs', id) +print('Select features whose absolute mutual Spearman correlation coefficient is smaller than 0.8') +id = candle.select_decorrelated_features(data, method='spearman', threshold=0.8, random_seed=10) +print('Feature IDs', id) +``` +Output +``` Testing select_decorrelated_features Select features that are not identical to each other and are not all missing. Feature IDs [0 1 2 3 4 5 6 9] Select features whose absolute mutual Spearman correlation coefficient is smaller than 0.8 Feature IDs [0 2 6 9] +``` - +# Generate cross-validation partitions of data +Code +```python +print('Testing generate_cross_validation_partition') +print('Generate 5-fold cross-validation partition of 10 samples twice') +p = candle.generate_cross_validation_partition(range(10), n_folds=5, n_repeats=2, portions=None, random_seed=None) +print(p) +print('Drug response data of 5 cell lines treated by various drugs.') +data = pd.read_csv(response_data, sep='\t', engine='c', na_values=['na', '-', ''], header=0, index_col=None, low_memory=False) +print(data) +print('Generate partition indices to divide the data into 4 sets without shared cell lines for 5 times.') +p = candle.generate_cross_validation_partition(data.CELL, n_folds=5, n_repeats=1, portions=[1, 1, 1, 2], random_seed=1) +print(p) +``` +Output +``` Testing generate_cross_validation_partition Generate 5-fold cross-validation partition of 10 samples twice [[[0, 5], [1, 2, 3, 4, 6, 7, 8, 9]], [[1, 6], [0, 2, 3, 4, 5, 7, 8, 9]], [[2, 7], [0, 1, 3, 4, 5, 6, 8, 9]], [[3, 8], [0, 1, 2, 4, 5, 6, 7, 9]], [[4, 9], [0, 1, 2, 3, 5, 6, 7, 8]], [[5, 8], [0, 1, 2, 3, 4, 6, 7, 9]], [[3, 9], [0, 1, 2, 4, 5, 6, 7, 8]], [[2, 4], [0, 1, 3, 5, 6, 7, 8, 9]], [[1, 7], [0, 2, 3, 4, 5, 6, 8, 9]], [[0, 6], [1, 2, 3, 4, 5, 7, 8, 9]]] @@ -95,9 +186,34 @@ found 0 categorical variables: Standardizing Data across genes. 
Fitting L/S model and finding priors Finding parametric adjustments +``` - - +# Quantile normalization of gene expression data +Code +```python +print('Testing quantile_normalization') +print('Gene expression data of 897 cell lines (columns) and 17741 genes (rows).') +data = pd.read_csv(gene_expression, sep='\t', engine='c', na_values=['na', '-', ''], header=0, index_col=[0, 1], low_memory=False) +print(data) +print('Before normalization') +third_quartile = data.quantile(0.75, axis=0) +print('Max difference of third quartile between cell lines is ' + str(np.round(a=np.max(third_quartile) - np.min(third_quartile), decimals=2))) +second_quartile = data.quantile(0.5, axis=0) +print('Max difference of median between cell lines is ' + str(np.round(a=np.max(second_quartile) - np.min(second_quartile), decimals=2))) +first_quartile = data.quantile(0.25, axis=0) +print('Max difference of first quartile between cell lines is ' + str(np.round(a=np.max(first_quartile) - np.min(first_quartile), decimals=2))) +norm_data = candle.quantile_normalization(np.transpose(data)) +norm_data = np.transpose(norm_data) +print('After normalization') +third_quartile = norm_data.quantile(0.75, axis=0) +print('Max difference of third quartile between cell lines is ' + str(np.round(a=np.max(third_quartile) - np.min(third_quartile), decimals=2))) +second_quartile = norm_data.quantile(0.5, axis=0) +print('Max difference of median between cell lines is ' + str(np.round(a=np.max(second_quartile) - np.min(second_quartile), decimals=2))) +first_quartile = norm_data.quantile(0.25, axis=0) +print('Max difference of first quartile between cell lines is ' + str(np.round(a=np.max(first_quartile) - np.min(first_quartile), decimals=2))) +``` +Output +``` Testing quantile_normalization Gene expression data of 897 cell lines (columns) and 17741 genes (rows). CCL_61 CCL_62 CCL_63 ... CCL_1076 CCL_1077 CCL_1078 @@ -123,8 +239,22 @@ After normalization Max difference of third quartile between cell lines is 0.01 Max difference of median between cell lines is 0.02 Max difference of first quartile between cell lines is 0.06 +``` - +# Generate gene-set-level data +```python +print('Testing generate_gene_set_data') +gene_set_data = candle.generate_gene_set_data(np.transpose(norm_data), [i[0] for i in norm_data.index], gene_name_type='entrez', + gene_set_category='c6.all', metric='mean', standardize=True, data_dir=data_dir) +print('Generate gene-set-level data of 897 cell lines and 189 oncogenic signature gene sets') +print(gene_set_data) +gene_set_data = candle.generate_gene_set_data(np.transpose(norm_data), [i[1] for i in norm_data.index], gene_name_type='symbols', + gene_set_category='c2.cp.kegg', metric='sum', standardize=True, data_dir=data_dir) +print('Generate gene-set-level data of 897 cell lines and 186 KEGG pathways') +print(gene_set_data) +``` +Output +``` Testing generate_gene_set_data Generate gene-set-level data of 897 cell lines and 189 oncogenic signature gene sets GLI1_UP.V1_DN GLI1_UP.V1_UP ... LEF1_UP.V1_DN LEF1_UP.V1_UP @@ -156,8 +286,57 @@ CCL_1077 -11.576702 ... -31.679962 CCL_1078 -10.355489 ... 
-26.232325 [897 rows x 186 columns] +``` + +# Combat batch normalization on gene expression data +Code +```python +print('Testing combat_batch_effect_removal') +print('Gene expression data of 60 NCI60 cell lines and 1018 CCLE cell lines with 17741 genes.') +data = pd.read_csv(ccle_nci60, sep='\t', engine='c', na_values=['na', '-', ''], header=0, index_col=[0, 1], low_memory=False) +print(data) +resource = np.array([i.split('.')[0] for i in data.columns]) + +print('Before removal of batch effect between NCI60 and CCLE datasets') +# Identify NCI60 cell lines and quantile normalize their gene expression data +id = np.where(resource == 'NCI60')[0] +norm_data_NCI60 = candle.quantile_normalization(np.transpose(data.iloc[:, id])) +print('Average third quartile of NCI60 cell lines is ' + str(np.round(a=np.mean(norm_data_NCI60.quantile(0.75, axis=1)), decimals=2))) +print('Average median of NCI60 cell lines is ' + str(np.round(a=np.mean(norm_data_NCI60.quantile(0.5, axis=1)), decimals=2))) +print('Average first quartile of NCI60 cell lines is ' + str(np.round(a=np.mean(norm_data_NCI60.quantile(0.25, axis=1)), decimals=2))) + +# Identify CCLE cell lines and quantile normalize their gene expression data +id = np.where(resource == 'CCLE')[0] +norm_data_CCLE = candle.quantile_normalization(np.transpose(data.iloc[:, id])) +print('Average third quartile of CCLE cell lines is ' + str(np.round(a=np.mean(norm_data_CCLE.quantile(0.75, axis=1)), decimals=2))) +print('Average median of CCLE cell lines is ' + str(np.round(a=np.mean(norm_data_CCLE.quantile(0.5, axis=1)), decimals=2))) +print('Average first quartile of CCLE cell lines is ' + str(np.round(a=np.mean(norm_data_CCLE.quantile(0.25, axis=1)), decimals=2))) + +# Combine normalized data of NCI60 cell lines and CCLE cell lines +norm_data = pd.concat((norm_data_NCI60, norm_data_CCLE), axis=0) +norm_data = np.transpose(norm_data) +# Apply ComBat algorithm to remove the batch effect between NCI60 and CCLE +corrected_data = candle.combat_batch_effect_removal(norm_data, pd.Series([i.split('.')[0] for i in norm_data.columns], index=norm_data.columns)) +print('After removal of batch effect between NCI60 and CCLE datasets') + +resource = np.array([i.split('.')[0] for i in corrected_data.columns]) +id = np.where(resource == 'NCI60')[0] +corrected_data_NCI60 = np.transpose(corrected_data.iloc[:, id]) +print('Average third quartile of NCI60 cell lines is ' + str(np.round(a=np.mean(corrected_data_NCI60.quantile(0.75, axis=1)), decimals=2))) +print('Average median of NCI60 cell lines is ' + str(np.round(a=np.mean(corrected_data_NCI60.quantile(0.5, axis=1)), decimals=2))) +print('Average first quartile of NCI60 cell lines is ' + str(np.round(a=np.mean(corrected_data_NCI60.quantile(0.25, axis=1)), decimals=2))) + +# Identify CCLE cell lines and quantile normalize their gene expression data +id = np.where(resource == 'CCLE')[0] +corrected_data_CCLE = np.transpose(corrected_data.iloc[:, id]) +print('Average third quartile of CCLE cell lines is ' + str(np.round(a=np.mean(corrected_data_CCLE.quantile(0.75, axis=1)), decimals=2))) +print('Average median of CCLE cell lines is ' + str(np.round(a=np.mean(corrected_data_CCLE.quantile(0.5, axis=1)), decimals=2))) +print('Average first quartile of CCLE cell lines is ' + str(np.round(a=np.mean(corrected_data_CCLE.quantile(0.25, axis=1)), decimals=2))) +``` +Output +``` Testing combat_batch_effect_removal Gene expression data of 60 NCI60 cell lines and 1018 CCLE cell lines with 17741 genes. NCI60.786-0|CCL_1 ... 
CCLE.ZR7530|CCL_1078 From 3cda625b9d3a99ae801d41504cac60f49a8cd134 Mon Sep 17 00:00:00 2001 From: Jamal Date: Wed, 13 May 2020 12:24:31 -0600 Subject: [PATCH 309/331] Minor README edit. --- examples/M16/README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/M16/README.md b/examples/M16/README.md index 1cf68c40..702c130e 100644 --- a/examples/M16/README.md +++ b/examples/M16/README.md @@ -1,6 +1,8 @@ # Feature Selection examples -The code is for demonstrate feature selection methods that CANDLE provides. +The code demonstrates feature selection methods that CANDLE provides. + +It can be run by executing '''python M16_test.py''' ## Download data Code @@ -369,4 +371,4 @@ Average first quartile of NCI60 cell lines is 0.23 Average third quartile of CCLE cell lines is 4.83 Average median of CCLE cell lines is 2.72 Average first quartile of CCLE cell lines is 0.13 -``` \ No newline at end of file +``` From 0f803287d4f5ec5c7f65f19e5e27293bb50a9c54 Mon Sep 17 00:00:00 2001 From: Jamal Date: Wed, 13 May 2020 12:26:03 -0600 Subject: [PATCH 310/331] More edits --- examples/M16/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/M16/README.md b/examples/M16/README.md index 702c130e..78ea9996 100644 --- a/examples/M16/README.md +++ b/examples/M16/README.md @@ -2,7 +2,7 @@ The code demonstrates feature selection methods that CANDLE provides. -It can be run by executing '''python M16_test.py''' +It can be run by executing ``` python M16_test.py ``` ## Download data Code From abdd4312cad7fad4a87e614fe8fc8983e9ddbf15 Mon Sep 17 00:00:00 2001 From: yngtodd Date: Wed, 13 May 2020 16:27:08 -0400 Subject: [PATCH 311/331] Fix results save location This addresses the issue of failing to write the genotype.json when the directory does not exist. This is one of the simpler solutions, and keeps things consistent with the defaul_model.txt of the advanced example. --- examples/darts/uno/default_model.txt | 4 ++-- examples/darts/uno/results/.gitkeep | 0 examples/darts/uno/uno_example.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) create mode 100644 examples/darts/uno/results/.gitkeep diff --git a/examples/darts/uno/default_model.txt b/examples/darts/uno/default_model.txt index 78d7c325..27013a88 100644 --- a/examples/darts/uno/default_model.txt +++ b/examples/darts/uno/default_model.txt @@ -1,7 +1,7 @@ [Global_Params] model_name = 'darts_uno' data_url = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/' -savepath = '.' 
+savepath = './results' log_interval = 10 train_data = 'top_21_auc_1fold.uno.h5' learning_rate = 0.025 @@ -10,6 +10,6 @@ momentum = 0.9 weight_decay = 3e-4 grad_clip = 5 batch_size = 100 -epochs = 10 +epochs = 1 seed = 13 diff --git a/examples/darts/uno/results/.gitkeep b/examples/darts/uno/results/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/examples/darts/uno/uno_example.py b/examples/darts/uno/uno_example.py index c8643805..703990b0 100644 --- a/examples/darts/uno/uno_example.py +++ b/examples/darts/uno/uno_example.py @@ -33,7 +33,7 @@ def run(params): args = candle.ArgumentStruct(**params) args.cuda = torch.cuda.is_available() - device = torch.device(f"cuda" if args.cuda else "cpu") + device = torch.device(f"cuda" if args.cuda else f"cpu") darts.banner(device=device) train_data = darts.Uno('./data', 'train', download=True) From 56fef59accbf8c184802bba44129ebefbaba60e0 Mon Sep 17 00:00:00 2001 From: yngtodd Date: Wed, 13 May 2020 16:30:02 -0400 Subject: [PATCH 312/331] Reset default number of epochs This puts us back to the default of 10, instead of my testing number of 1 epoch. --- examples/darts/uno/default_model.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/darts/uno/default_model.txt b/examples/darts/uno/default_model.txt index 27013a88..f510ae44 100644 --- a/examples/darts/uno/default_model.txt +++ b/examples/darts/uno/default_model.txt @@ -10,6 +10,6 @@ momentum = 0.9 weight_decay = 3e-4 grad_clip = 5 batch_size = 100 -epochs = 1 +epochs = 10 seed = 13 From 09e05b2ad73eb20569f8cc32e5323ba4c3107849 Mon Sep 17 00:00:00 2001 From: yngtodd Date: Wed, 13 May 2020 16:31:14 -0400 Subject: [PATCH 313/331] Increase log level This puts us back to where Jamal had set the logging. --- examples/darts/uno/default_model.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/darts/uno/default_model.txt b/examples/darts/uno/default_model.txt index f510ae44..09a8fb64 100644 --- a/examples/darts/uno/default_model.txt +++ b/examples/darts/uno/default_model.txt @@ -2,7 +2,7 @@ model_name = 'darts_uno' data_url = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/' savepath = './results' -log_interval = 10 +log_interval = 100 train_data = 'top_21_auc_1fold.uno.h5' learning_rate = 0.025 learning_rate_min = 0.001 From ab0ca0397e9aa5fb0a624cf586e4c34a0932a7b1 Mon Sep 17 00:00:00 2001 From: Jamal Date: Wed, 13 May 2020 14:32:02 -0600 Subject: [PATCH 314/331] Add more detail --- examples/M16/README.md | 37 ++++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/examples/M16/README.md b/examples/M16/README.md index 78ea9996..4f42bbaa 100644 --- a/examples/M16/README.md +++ b/examples/M16/README.md @@ -1,3 +1,38 @@ +# Background + +Data preprocessing is an important front-end step in data analysis that prepares data for subsequent analysis. +It not only enables the subsequent analysis by processing and transforming data, but also influences the quality of subsequent analysis sometimes significantly. +Several common examples of data preprocessing are data standardization and normalization to remove/suppress noise, removal of batch effect to combine datasets for larger studies, and generation of new representations of data to enable new analyses. +Feature selection can be viewed as a kind of data preprocessing for prediction analysis. +Its goal is to select a (minimum) subset of available features, based on which prediction models with a good performance can be constructed. 
+And the performance can be evaluated from multiple aspects, such as the prediction accuracy and the speed of constructing the prediction model. + +The data preprocessing methods can generate data partitions to enable flexible cross-validation analysis, normalize and remove batch effects from gene expression data of cancer cells, and generate genomic representations at the gene set level for cancer cells. +The feature selection methods can filter features based on missing values and variations, and perform feature decorrelation. +Features without much variation might not be useful for prediction and highly-correlated features are not necessary to be all included in the prediction model. +We also implement and extend the co-expression extrapolation (COXEN) gene selection method for Pilot 1 project [10], which can select predictive and generalizable genes for predicting drug response in the precision oncology applications. + +# General Data Preprocessing Functions + +```generate_cross_validation_partition``` + +To flexibly generate data partitions for cross-validation analysis, such as partitioning of grouped samples into sets that do not share groups. + +# Data Preprocessing Functions Specific to Pilot 1 Applications + +```quantile_normalizationa``` + +To perform quantile normalization of genomic data [8] with tolerance of missing values. + +```combat_batch_effect_removal``` + +To perform ComBat analysis [9] on gene expression data to remove batch effects. + +```generate_gene_set_data``` + +To calculate genomic representations at gene set level, such as the average expression values of genes in a pathway and the total number of SNP mutations in a genetic pathway. + + # Feature Selection examples The code demonstrates feature selection methods that CANDLE provides. @@ -184,7 +219,7 @@ Using TensorFlow backend. ... found 2 batches found 0 numerical covariates... -found 0 categorical variables: +found 0 categorical variables: Standardizing Data across genes. Fitting L/S model and finding priors Finding parametric adjustments From ef276e3e74825579eb15c42a6dc1be5f0d81c17d Mon Sep 17 00:00:00 2001 From: yngtodd Date: Wed, 13 May 2020 16:32:06 -0400 Subject: [PATCH 315/331] Fix save path argument This makes this consistent with the Benchmarks. 
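The renamed key matches the `save_path` spelling used by the other Benchmarks configurations. As a rough illustration of how an INI-style `[Global_Params]` file in this layout can be inspected outside of CANDLE, here is a minimal standard-library sketch; the local filename and the quote stripping are assumptions for the sketch, not how the CANDLE benchmark classes actually parse their parameters.

```python
import configparser

# Inspect an INI-style default-model file such as examples/darts/uno/default_model.txt
# (assumed to be available locally for this sketch).
config = configparser.ConfigParser()
config.read('default_model.txt')

params = config['Global_Params']
save_path = params['save_path'].strip("'\"")  # values are stored as quoted strings, e.g. './results'
epochs = params.getint('epochs')

print('saving to', save_path, 'for', epochs, 'epochs')
```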
--- examples/darts/uno/default_model.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/darts/uno/default_model.txt b/examples/darts/uno/default_model.txt index 09a8fb64..5907d74a 100644 --- a/examples/darts/uno/default_model.txt +++ b/examples/darts/uno/default_model.txt @@ -1,7 +1,7 @@ [Global_Params] model_name = 'darts_uno' data_url = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/' -savepath = './results' +save_path = './results' log_interval = 100 train_data = 'top_21_auc_1fold.uno.h5' learning_rate = 0.025 From 5d88d54c536edfdf456f8df58d91be5b64a7e573 Mon Sep 17 00:00:00 2001 From: Jamal Date: Wed, 13 May 2020 14:36:54 -0600 Subject: [PATCH 316/331] More detail on functions --- examples/M16/README.md | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/examples/M16/README.md b/examples/M16/README.md index 4f42bbaa..33cf618c 100644 --- a/examples/M16/README.md +++ b/examples/M16/README.md @@ -10,15 +10,15 @@ And the performance can be evaluated from multiple aspects, such as the predicti The data preprocessing methods can generate data partitions to enable flexible cross-validation analysis, normalize and remove batch effects from gene expression data of cancer cells, and generate genomic representations at the gene set level for cancer cells. The feature selection methods can filter features based on missing values and variations, and perform feature decorrelation. Features without much variation might not be useful for prediction and highly-correlated features are not necessary to be all included in the prediction model. -We also implement and extend the co-expression extrapolation (COXEN) gene selection method for Pilot 1 project [10], which can select predictive and generalizable genes for predicting drug response in the precision oncology applications. +We also implement and extend the co-expression extrapolation (COXEN) gene selection method for Pilot 1 project, which can select predictive and generalizable genes for predicting drug response in the precision oncology applications. -# General Data Preprocessing Functions +## General Data Preprocessing Functions ```generate_cross_validation_partition``` To flexibly generate data partitions for cross-validation analysis, such as partitioning of grouped samples into sets that do not share groups. -# Data Preprocessing Functions Specific to Pilot 1 Applications +## Data Preprocessing Functions Specific to Pilot 1 Applications ```quantile_normalizationa``` @@ -32,8 +32,31 @@ To perform ComBat analysis [9] on gene expression data to remove batch effects. To calculate genomic representations at gene set level, such as the average expression values of genes in a pathway and the total number of SNP mutations in a genetic pathway. +## General Feature Selection Functions -# Feature Selection examples +```select_features_by_missing_values``` + +To remove features with (many) missing values. + +```select_features_by_variation``` + +To remove features with no or small variations. + +```select_decorrelated_features``` + +To select a subset of features that are not identical or highly correlated with each other. + +## Feature (Gene) Selection Functions Specific to Pilot 1 Applications + +```coxen_single_drug_gene_selection``` + +To perform co-expression extrapolation (COXEN) analysis that selects predictive and generalizable genes for predicting the response of tumor cells to a specific drug. 
+ +```coxen_multi_drug_gene_selection``` + +To extend the COXEN approach for selecting genes to predict the response of tumor cells to multiple drugs in precision oncology applications. + +# Running the example The code demonstrates feature selection methods that CANDLE provides. From f29df88112f68d6e7a2e177de16cd12bf22fa1b2 Mon Sep 17 00:00:00 2001 From: Harry Yoo Date: Wed, 13 May 2020 15:47:11 -0500 Subject: [PATCH 317/331] add cross refs --- examples/M16/README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/M16/README.md b/examples/M16/README.md index 33cf618c..0bdb3411 100644 --- a/examples/M16/README.md +++ b/examples/M16/README.md @@ -22,29 +22,29 @@ To flexibly generate data partitions for cross-validation analysis, such as part ```quantile_normalizationa``` -To perform quantile normalization of genomic data [8] with tolerance of missing values. +To perform quantile normalization of genomic data [8] with tolerance of missing values. [[see example code]](#quantile-normalization-of-gene-expression-data) ```combat_batch_effect_removal``` -To perform ComBat analysis [9] on gene expression data to remove batch effects. +To perform ComBat analysis [9] on gene expression data to remove batch effects. [[see example code]](#combat-batch-normalization-on-gene-expression-data) ```generate_gene_set_data``` -To calculate genomic representations at gene set level, such as the average expression values of genes in a pathway and the total number of SNP mutations in a genetic pathway. +To calculate genomic representations at gene set level, such as the average expression values of genes in a pathway and the total number of SNP mutations in a genetic pathway. [[see example code]](#generate-gene-set-level-data) ## General Feature Selection Functions ```select_features_by_missing_values``` -To remove features with (many) missing values. +To remove features with (many) missing values. [[see example code]](#select-features-based-on-missing-values) ```select_features_by_variation``` -To remove features with no or small variations. +To remove features with no or small variations. [[see example code]](#select-features-based-on-variation) ```select_decorrelated_features``` -To select a subset of features that are not identical or highly correlated with each other. +To select a subset of features that are not identical or highly correlated with each other. [[see example code]](#select-decorrelated-features) ## Feature (Gene) Selection Functions Specific to Pilot 1 Applications From d17d7ae9b20a7bc54222ee7bc4fc66ed46a960f4 Mon Sep 17 00:00:00 2001 From: yngtodd Date: Wed, 13 May 2020 17:12:28 -0400 Subject: [PATCH 318/331] Handle save paths that do not exist This gives the genotype storage a way to create paths that do not exist before trying to save the genotype.json there. 
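The pattern applied below is simply to create the target directory before writing, so the JSON dump can no longer fail on a missing path. A minimal stand-alone sketch of that pattern, in which the `save_json` helper, the output path, and the example payload are placeholders for illustration rather than identifiers from this repository:

```python
import json
import os


def save_json(record, path):
    # Create the parent directory if it does not already exist;
    # exist_ok=True makes the call safe to repeat.
    os.makedirs(os.path.dirname(path) or '.', exist_ok=True)
    with open(path, 'w') as outfile:
        json.dump(record, outfile)


save_json({'normal': ['skip_connect', 'dil_conv_3x3']}, 'results/genotype.json')
```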
--- common/darts/storage/genotype.py | 1 + examples/darts/uno/results/.gitkeep | 0 examples/darts/uno/uno_example.py | 6 +++--- 3 files changed, 4 insertions(+), 3 deletions(-) delete mode 100644 examples/darts/uno/results/.gitkeep diff --git a/common/darts/storage/genotype.py b/common/darts/storage/genotype.py index 87596b35..a5c72ad6 100644 --- a/common/darts/storage/genotype.py +++ b/common/darts/storage/genotype.py @@ -23,6 +23,7 @@ def save_genotype(self, genotype: Genotype, filename='genotype.json') -> None: filename: name of the save file """ genotype = self._replace_range(genotype) + os.makedirs(self.root, exist_ok=True) path = os.path.join(self.root, filename) with open(path, 'w') as outfile: json.dump(genotype, outfile) diff --git a/examples/darts/uno/results/.gitkeep b/examples/darts/uno/results/.gitkeep deleted file mode 100644 index e69de29b..00000000 diff --git a/examples/darts/uno/uno_example.py b/examples/darts/uno/uno_example.py index 703990b0..9893be6e 100644 --- a/examples/darts/uno/uno_example.py +++ b/examples/darts/uno/uno_example.py @@ -70,7 +70,7 @@ def run(params): train_meter = darts.EpochMeter(tasks, 'train') valid_meter = darts.EpochMeter(tasks, 'valid') - genotype_store = darts.GenotypeStorage(root=args.savepath) + genotype_store = darts.GenotypeStorage(root=args.save_path) for epoch in range(args.epochs): @@ -161,7 +161,7 @@ def train(trainloader, logger.info(f'Step: {step} loss: {meter.loss_meter.avg:.4}') meter.update_epoch() - meter.save(args.savepath) + meter.save(args.save_path) @@ -186,7 +186,7 @@ def validate(validloader, model, criterion, args, tasks, meter, device): logger.info(f'>> Validation: {step} loss: {meter.loss_meter.avg:.4}') meter.update_epoch() - meter.save(args.savepath) + meter.save(args.save_path) def main(): From 80360136f12df85d33c3f7b42dfdbc5650712202 Mon Sep 17 00:00:00 2001 From: yngtodd Date: Wed, 13 May 2020 17:14:35 -0400 Subject: [PATCH 319/331] Rename example This cleans things up a bit. --- examples/darts/uno/{uno_example.py => example.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename examples/darts/uno/{uno_example.py => example.py} (100%) diff --git a/examples/darts/uno/uno_example.py b/examples/darts/uno/example.py similarity index 100% rename from examples/darts/uno/uno_example.py rename to examples/darts/uno/example.py From fc8913d2ff3b4c1b0f1340074e1b09870e070ea0 Mon Sep 17 00:00:00 2001 From: yngtodd Date: Wed, 13 May 2020 17:23:01 -0400 Subject: [PATCH 320/331] Rename examples This corrects my previous misread of Jamal's request, and keeps things more specific. --- examples/darts/advanced/{example.py => advanced_example.py} | 0 examples/darts/uno/{example.py => uno_example.py} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename examples/darts/advanced/{example.py => advanced_example.py} (100%) rename examples/darts/uno/{example.py => uno_example.py} (100%) diff --git a/examples/darts/advanced/example.py b/examples/darts/advanced/advanced_example.py similarity index 100% rename from examples/darts/advanced/example.py rename to examples/darts/advanced/advanced_example.py diff --git a/examples/darts/uno/example.py b/examples/darts/uno/uno_example.py similarity index 100% rename from examples/darts/uno/example.py rename to examples/darts/uno/uno_example.py From e7950ccd87ddf2e24bac43209d714979fe8bebcc Mon Sep 17 00:00:00 2001 From: Jamal Date: Wed, 13 May 2020 15:25:43 -0600 Subject: [PATCH 321/331] Add references to readme. 
--- examples/M16/README.md | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/examples/M16/README.md b/examples/M16/README.md index 33cf618c..ad4c842c 100644 --- a/examples/M16/README.md +++ b/examples/M16/README.md @@ -10,7 +10,7 @@ And the performance can be evaluated from multiple aspects, such as the predicti The data preprocessing methods can generate data partitions to enable flexible cross-validation analysis, normalize and remove batch effects from gene expression data of cancer cells, and generate genomic representations at the gene set level for cancer cells. The feature selection methods can filter features based on missing values and variations, and perform feature decorrelation. Features without much variation might not be useful for prediction and highly-correlated features are not necessary to be all included in the prediction model. -We also implement and extend the co-expression extrapolation (COXEN) gene selection method for Pilot 1 project, which can select predictive and generalizable genes for predicting drug response in the precision oncology applications. +We also implement and extend the co-expression extrapolation (COXEN) gene selection method for Pilot 1 project [3], which can select predictive and generalizable genes for predicting drug response in the precision oncology applications. ## General Data Preprocessing Functions @@ -22,11 +22,11 @@ To flexibly generate data partitions for cross-validation analysis, such as part ```quantile_normalizationa``` -To perform quantile normalization of genomic data [8] with tolerance of missing values. +To perform quantile normalization of genomic data [1] with tolerance of missing values. ```combat_batch_effect_removal``` -To perform ComBat analysis [9] on gene expression data to remove batch effects. +To perform ComBat analysis [2] on gene expression data to remove batch effects. ```generate_gene_set_data``` @@ -50,7 +50,7 @@ To select a subset of features that are not identical or highly correlated with ```coxen_single_drug_gene_selection``` -To perform co-expression extrapolation (COXEN) analysis that selects predictive and generalizable genes for predicting the response of tumor cells to a specific drug. +To perform co-expression extrapolation (COXEN) analysis [3] that selects predictive and generalizable genes for predicting the response of tumor cells to a specific drug. ```coxen_multi_drug_gene_selection``` @@ -430,3 +430,12 @@ Average third quartile of CCLE cell lines is 4.83 Average median of CCLE cell lines is 2.72 Average first quartile of CCLE cell lines is 0.13 ``` + +# References + +1. Bolstad BM, Irizarry RA, Astrand M, et al. \(2003\) A comparison of normalization methods for high density oligonucleotide array data based on variance and bias. Bioinformatics. 2003 Jan 22;19\(2\):185-93. + +2. Johnson WE, Rabinovic A, and Li C \(2007\) Adjusting batch effects in microarray expression data using Empirical Bayes methods. Biostatistics 8\(1\):118-127. + +3. Lee JK, Havaleshko DM, Cho H, et al. \(2007\) A strategy for predicting the chemosensitivity of human cancers and its application to drug discovery. Proc Natl Acad Sci USA, 2007 Aug 7; 104\(32\):13086-91. 
Epub 2007 Jul 31 + From 7a826e1b76491d364d2bbf31db7d8314073acc86 Mon Sep 17 00:00:00 2001 From: Jamal Date: Wed, 13 May 2020 15:32:32 -0600 Subject: [PATCH 322/331] Formatting tweaks --- examples/M16/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/M16/README.md b/examples/M16/README.md index 15e2b8ac..024baa2e 100644 --- a/examples/M16/README.md +++ b/examples/M16/README.md @@ -433,9 +433,9 @@ Average first quartile of CCLE cell lines is 0.13 # References -1. Bolstad BM, Irizarry RA, Astrand M, et al. \(2003\) A comparison of normalization methods for high density oligonucleotide array data based on variance and bias. Bioinformatics. 2003 Jan 22;19\(2\):185-93. +1. Bolstad BM, Irizarry RA, Astrand M, et al. \(2003\) *A comparison of normalization methods for high density oligonucleotide array data based on variance and bias* Bioinformatics. 2003 Jan 22;19\(2\):185-93. -2. Johnson WE, Rabinovic A, and Li C \(2007\) Adjusting batch effects in microarray expression data using Empirical Bayes methods. Biostatistics 8\(1\):118-127. +2. Johnson WE, Rabinovic A, and Li C \(2007\) *Adjusting batch effects in microarray expression data using Empirical Bayes methods* Biostatistics 8\(1\):118-127. -3. Lee JK, Havaleshko DM, Cho H, et al. \(2007\) A strategy for predicting the chemosensitivity of human cancers and its application to drug discovery. Proc Natl Acad Sci USA, 2007 Aug 7; 104\(32\):13086-91. Epub 2007 Jul 31 +3. Lee JK, Havaleshko DM, Cho H, et al. \(2007\) *A strategy for predicting the chemosensitivity of human cancers and its application to drug discovery* Proc Natl Acad Sci USA, 2007 Aug 7; 104\(32\):13086-91. Epub 2007 Jul 31 From 4813236c0b2661132e0c62d7a429fef6703b1475 Mon Sep 17 00:00:00 2001 From: Jamal Date: Mon, 18 May 2020 08:50:51 -0600 Subject: [PATCH 323/331] Removed reference to deprecated solr_root --- examples/ADRP/README.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/examples/ADRP/README.md b/examples/ADRP/README.md index 7ebcd0c3..412c224e 100644 --- a/examples/ADRP/README.md +++ b/examples/ADRP/README.md @@ -34,7 +34,6 @@ Configuration file: /home/jain/CANDLE/fork/Benchmarks/examples/ADRP/adrp_defaul 'rng_seed': 2017, 'save_path': './001/', 'scaling': 'minmax', - 'solr_root': '', 'timeout': 3600, 'use_cp': False, 'validation_split': 0.1} @@ -71,7 +70,6 @@ Params: 'save_path': './001/', 'scaling': 'minmax', 'shuffle': False, - 'solr_root': '', 'timeout': 0, 'train_bool': True, 'tsne': False, @@ -82,7 +80,7 @@ Params: 'warmup_lr': False} WARNING:tensorflow:From /home/jain/CANDLE/fork/Benchmarks/common/keras_utils.py:51: The name tf.set_random_seed is deprecated. Please use tf.compat.v1.set_random_seed instead. 
-Params: {'data_url': 'ftp://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/', 'in': 'adrp-p1.csv', 'model_name': 'adrp', 'dense': [250, 125, 60, 30], 'batch_size': 32, 'epochs': 1, 'activation': 'relu', 'loss': 'mean_squared_error', 'optimizer': 'sgd', 'drop': 0.1, 'learning_rate': 0.0001, 'momentum': 0.9, 'scaling': 'minmax', 'validation_split': 0.1, 'epsilon_std': 1.0, 'rng_seed': 2017, 'initialization': 'glorot_uniform', 'latent_dim': 2, 'batch_normalization': False, 'save_path': './001/', 'use_cp': False, 'early_stop': True, 'reduce_lr': True, 'feature_subsample': 0, 'nb_classes': 2, 'solr_root': '', 'timeout': 3600, 'verbose': None, 'logfile': None, 'train_bool': True, 'experiment_id': 'EXP000', 'run_id': 'RUN000', 'shuffle': False, 'gpus': [], 'profiling': False, 'residual': False, 'warmup_lr': False, 'use_tb': False, 'tsne': False, 'datatype': , 'output_dir': '/home/jain/CANDLE/fork/Benchmarks/examples/ADRP/Output/EXP000/RUN000'} +Params: {'data_url': 'ftp://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/', 'in': 'adrp-p1.csv', 'model_name': 'adrp', 'dense': [250, 125, 60, 30], 'batch_size': 32, 'epochs': 1, 'activation': 'relu', 'loss': 'mean_squared_error', 'optimizer': 'sgd', 'drop': 0.1, 'learning_rate': 0.0001, 'momentum': 0.9, 'scaling': 'minmax', 'validation_split': 0.1, 'epsilon_std': 1.0, 'rng_seed': 2017, 'initialization': 'glorot_uniform', 'latent_dim': 2, 'batch_normalization': False, 'save_path': './001/', 'use_cp': False, 'early_stop': True, 'reduce_lr': True, 'feature_subsample': 0, 'nb_classes': 2, 'timeout': 3600, 'verbose': None, 'logfile': None, 'train_bool': True, 'experiment_id': 'EXP000', 'run_id': 'RUN000', 'shuffle': False, 'gpus': [], 'profiling': False, 'residual': False, 'warmup_lr': False, 'use_tb': False, 'tsne': False, 'datatype': , 'output_dir': '/home/jain/CANDLE/fork/Benchmarks/examples/ADRP/Output/EXP000/RUN000'} processing csv in file adrp-p1.csv PL= 1614 X_train shape: (27447, 1613) From 715b26d596e0dbe58fb71759811ebb9da6e843cc Mon Sep 17 00:00:00 2001 From: Jamal Date: Mon, 18 May 2020 09:01:47 -0600 Subject: [PATCH 324/331] Removed more solr_root references. 
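Beyond the two files touched below, a quick way to check for leftover occurrences of the deprecated key in the default-model configurations is a short scan like the following; the `*_model.txt` glob is an assumption based on the naming used in this repository, and `.py` and `.md` files would still need a separate search.

```python
from pathlib import Path

# Report any default-model configuration files that still set the deprecated key.
for cfg in Path('.').rglob('*_model.txt'):
    if 'solr_root' in cfg.read_text():
        print(cfg)
```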
--- Pilot1/Uno_UQ/data_utils_/uno.py | 1 - Pilot1/Uno_UQ/uno_defaultUQ_model.txt | 1 - 2 files changed, 2 deletions(-) diff --git a/Pilot1/Uno_UQ/data_utils_/uno.py b/Pilot1/Uno_UQ/data_utils_/uno.py index 30c96b69..26e80d4e 100644 --- a/Pilot1/Uno_UQ/data_utils_/uno.py +++ b/Pilot1/Uno_UQ/data_utils_/uno.py @@ -348,6 +348,5 @@ def set_locals(self): 'save_path', 'scaling', 'val_split', - 'solr_root', 'timeout' ] diff --git a/Pilot1/Uno_UQ/uno_defaultUQ_model.txt b/Pilot1/Uno_UQ/uno_defaultUQ_model.txt index fc119df8..1230114b 100644 --- a/Pilot1/Uno_UQ/uno_defaultUQ_model.txt +++ b/Pilot1/Uno_UQ/uno_defaultUQ_model.txt @@ -36,5 +36,4 @@ use_landmark_genes=True partition_by='cell' [Monitor_Params] -solr_root='' timeout=3600 From 22b0e6bd647c333980aeee853cfa6f0dd8eb23b6 Mon Sep 17 00:00:00 2001 From: Jamal Date: Mon, 18 May 2020 09:16:01 -0600 Subject: [PATCH 325/331] Remove solr_root from README files --- Pilot1/Attn/README.md | 2 +- Pilot1/Combo/README.md | 4 ++-- Pilot1/Uno_UQ/README.md | 3 --- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/Pilot1/Attn/README.md b/Pilot1/Attn/README.md index a9fa4ced..95cc7970 100644 --- a/Pilot1/Attn/README.md +++ b/Pilot1/Attn/README.md @@ -9,7 +9,7 @@ Any file of the form top*21_1fold*"ijk".h5 can be used as input ``` python attn_baseline_keras2.py -Params: {'model_name': 'attn', 'dense': [2000, 600], 'batch_size': 32, 'epochs': 1, 'activation': 'relu', 'loss': 'categorical_crossentropy', 'optimizer': 'sgd', 'drop': 0.2, 'learning_rate': 1e-05, 'momentum': 0.7, 'scaling': 'minmax', 'validation_split': 0.1, 'epsilon_std': 1.0, 'rng_seed': 2017, 'initialization': 'glorot_uniform', 'latent_dim': 2, 'batch_normalization': False, 'in': 'top_21_1fold_001.h5', 'save_path': 'candle_save', 'save_dir': './save/001/', 'use_cp': False, 'early_stop': True, 'reduce_lr': True, 'feature_subsample': 0, 'nb_classes': 2, 'solr_root': '', 'timeout': 3600, 'verbose': None, 'logfile': None, 'train_bool': True, 'experiment_id': 'EXP000', 'run_id': 'RUN000', 'shuffle': False, 'gpus': [], 'profiling': False, 'residual': False, 'warmup_lr': False, 'use_tb': False, 'tsne': False, 'datatype': , 'output_dir': '/nfs2/jain/Benchmarks/Pilot1/Attn/Output/EXP000/RUN000'} +Params: {'model_name': 'attn', 'dense': [2000, 600], 'batch_size': 32, 'epochs': 1, 'activation': 'relu', 'loss': 'categorical_crossentropy', 'optimizer': 'sgd', 'drop': 0.2, 'learning_rate': 1e-05, 'momentum': 0.7, 'scaling': 'minmax', 'validation_split': 0.1, 'epsilon_std': 1.0, 'rng_seed': 2017, 'initialization': 'glorot_uniform', 'latent_dim': 2, 'batch_normalization': False, 'in': 'top_21_1fold_001.h5', 'save_path': 'candle_save', 'save_dir': './save/001/', 'use_cp': False, 'early_stop': True, 'reduce_lr': True, 'feature_subsample': 0, 'nb_classes': 2, 'timeout': 3600, 'verbose': None, 'logfile': None, 'train_bool': True, 'experiment_id': 'EXP000', 'run_id': 'RUN000', 'shuffle': False, 'gpus': [], 'profiling': False, 'residual': False, 'warmup_lr': False, 'use_tb': False, 'tsne': False, 'datatype': , 'output_dir': '/nfs2/jain/Benchmarks/Pilot1/Attn/Output/EXP000/RUN000'} ... ... processing h5 in file top_21_1fold_001.h5 diff --git a/Pilot1/Combo/README.md b/Pilot1/Combo/README.md index 82cdfe56..fbd034a2 100644 --- a/Pilot1/Combo/README.md +++ b/Pilot1/Combo/README.md @@ -39,7 +39,7 @@ $ python combo_baseline_keras2.py python combo_baseline_keras2.py --use_landmark_genes --warmup_lr --reduce_lr -z 256 Using TensorFlow backend. 
-Params: {'activation': 'relu', 'batch_size': 256, 'dense': [1000, 1000, 1000], 'dense_feature_layers': [1000, 1000, 1000], 'drop': 0, 'epochs': 10, 'learning_rate': None, 'loss': 'mse', 'optimizer': 'adam', 'residual': False, 'rng_seed': 2017, 'save': 'save/combo', 'scaling': 'std', 'feature_subsample': 0, 'validation_split': 0.2, 'solr_root': '', 'timeout': -1, 'cell_features': ['expression'], 'drug_features': ['descriptors'], 'cv': 1, 'max_val_loss': 1.0, 'base_lr': None, 'reduce_lr': True, 'warmup_lr': True, 'batch_normalization': False, 'gen': False, 'use_combo_score': False, 'config_file': '/home/fangfang/work/Benchmarks.combo/Pilot1/Combo/combo_default_model.txt', 'verbose': False, 'logfile': None, 'train_bool': True, 'shuffle': True, 'alpha_dropout': False, 'gpus': [], 'experiment_id': 'EXP.000', 'run_id': 'RUN.000', 'use_landmark_genes': True, 'cp': False, 'tb': False, 'datatype': } +Params: {'activation': 'relu', 'batch_size': 256, 'dense': [1000, 1000, 1000], 'dense_feature_layers': [1000, 1000, 1000], 'drop': 0, 'epochs': 10, 'learning_rate': None, 'loss': 'mse', 'optimizer': 'adam', 'residual': False, 'rng_seed': 2017, 'save': 'save/combo', 'scaling': 'std', 'feature_subsample': 0, 'validation_split': 0.2, 'timeout': -1, 'cell_features': ['expression'], 'drug_features': ['descriptors'], 'cv': 1, 'max_val_loss': 1.0, 'base_lr': None, 'reduce_lr': True, 'warmup_lr': True, 'batch_normalization': False, 'gen': False, 'use_combo_score': False, 'config_file': '/home/fangfang/work/Benchmarks.combo/Pilot1/Combo/combo_default_model.txt', 'verbose': False, 'logfile': None, 'train_bool': True, 'shuffle': True, 'alpha_dropout': False, 'gpus': [], 'experiment_id': 'EXP.000', 'run_id': 'RUN.000', 'use_landmark_genes': True, 'cp': False, 'tb': False, 'datatype': } Loaded 311737 unique (CL, D1, D2) response sets. Filtered down to 85303 rows with matching information. Unique cell lines: 59 @@ -196,4 +196,4 @@ python uno_baseline_keras2.py --conf combo_perf_benchmark.txt | Nucleus | 0:14:13 | 72 | 3.47 | 3.8 | 9.3 | 21.9 | 63.4 | 91.9 | | Tesla (K20) | 0:44:17 | 250 | 1.00 | 3.9 | 42.3 | 12.9 | 73.8 | 53.3 | | Titan | | | | | | | | | keras version 2.0.3 does not supprot model.clone_model() which is introduced in 2.0.7 | -* Time per epoch on the machine divided by time per epoch of Titan (or Tesla) \ No newline at end of file +* Time per epoch on the machine divided by time per epoch of Titan (or Tesla) diff --git a/Pilot1/Uno_UQ/README.md b/Pilot1/Uno_UQ/README.md index 2157778e..05bdeb74 100644 --- a/Pilot1/Uno_UQ/README.md +++ b/Pilot1/Uno_UQ/README.md @@ -101,7 +101,6 @@ Params: 'scaling': 'std', 'shuffle': False, 'single': True, - 'solr_root': '', 'tb': False, 'tb_prefix': 'tb', 'test_sources': ['train'], @@ -203,7 +202,6 @@ Params: 'scaling': 'std', 'shuffle': False, 'single': True, - 'solr_root': '', 'tb': False, 'tb_prefix': 'tb', 'test_sources': ['train'], @@ -413,7 +411,6 @@ Params: 'scaling': 'std', 'shuffle': False, 'single': True, - 'solr_root': '', 'tb': False, 'tb_prefix': 'tb', 'test_sources': ['train'], From 14d4a0ca6749b64719c204680da983ee29003237 Mon Sep 17 00:00:00 2001 From: Jamal Date: Fri, 22 May 2020 17:01:32 -0600 Subject: [PATCH 326/331] Fix error not detecting untarred local file. 
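The fix amounts to also treating an already-extracted copy of the archive as a cache hit, so the file is not re-downloaded after it has been untarred. A condensed sketch of that check with placeholder names (`fetch`, the cache directory); the real `get_file` in `common/file_utils.py` additionally handles hashing, progress reporting, and extraction:

```python
import os
from urllib.request import urlretrieve


def fetch(origin, fname, cache_dir='./data'):
    """Download fname from origin unless the archive or its untarred copy is cached."""
    fpath = os.path.join(cache_dir, fname)
    # For foo.tar.gz the extracted copy is expected at ./data/foo
    untar_fpath = fpath[:-len('.tar.gz')] if fname.endswith('.tar.gz') else fpath
    if os.path.exists(fpath) or os.path.exists(untar_fpath):
        return fpath  # reuse the cached copy instead of downloading again
    os.makedirs(cache_dir, exist_ok=True)
    urlretrieve(origin, fpath)
    return fpath
```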
--- common/file_utils.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/common/file_utils.py b/common/file_utils.py index 4a3aac42..c7d637d9 100644 --- a/common/file_utils.py +++ b/common/file_utils.py @@ -9,6 +9,8 @@ from six.moves.urllib.request import urlopen from six.moves.urllib.error import URLError, HTTPError +import wget +import requests from generic_utils import Progbar @@ -90,7 +92,7 @@ def get_file(fname, origin, untar=False, fpath = os.path.join(datadir, fname) download = False - if os.path.exists(fpath): + if os.path.exists(fpath) or os.path.exists(untar_fpath): # file found; verify integrity if a hash was provided if md5_hash is not None: if not validate_file(fpath, md5_hash): @@ -101,10 +103,12 @@ def get_file(fname, origin, untar=False, download = True # fix ftp protocol if needed + ''' if origin.startswith('ftp://'): new_url = origin.replace('ftp://','http://') origin = new_url print('Origin = ', origin) + ''' if download: print('Downloading data from', origin) @@ -122,6 +126,7 @@ def dl_progress(count, block_size, total_size): try: try: urlretrieve(origin, fpath, dl_progress) + #fpath = wget.download(origin) except URLError as e: raise Exception(error_msg.format(origin, e.errno, e.reason)) except HTTPError as e: From 752a47d84bf7d731c62cc2bb628e7f7d34e6cc2b Mon Sep 17 00:00:00 2001 From: Jamal Date: Mon, 1 Jun 2020 11:21:03 -0600 Subject: [PATCH 327/331] Pruned release_03 in preparation for merge to master. --- Pilot1/Attn/0 | 6 - Pilot1/Attn/attn_bin_working_jan7_h5.py | 568 -------- Pilot1/Attn/attn_bin_working_jan7_h5.sh | 51 - Pilot1/Attn/attn_bsub.sh | 57 - Pilot1/Attn/cmd1.sh | 17 - Pilot1/Attn/cmd2.sh | 5 - Pilot1/NT3/nt3_baseline_keras2_tensorrt.py | 429 ------ Pilot1/NT3/nt3_tensorrt_convert.py | 45 - Pilot1/NT3/training.log | 2 - Pilot1/T29/README.candle | 43 - Pilot1/T29/infer.py | 136 -- Pilot1/T29/t29_default_model.txt | 13 - Pilot1/T29/t29res.py | 294 ---- Pilot1/Uno/plangen.py | 1489 -------------------- Pilot1/Uno/topN_to_uno.py | 178 --- 15 files changed, 3333 deletions(-) delete mode 100755 Pilot1/Attn/0 delete mode 100644 Pilot1/Attn/attn_bin_working_jan7_h5.py delete mode 100755 Pilot1/Attn/attn_bin_working_jan7_h5.sh delete mode 100755 Pilot1/Attn/attn_bsub.sh delete mode 100755 Pilot1/Attn/cmd1.sh delete mode 100755 Pilot1/Attn/cmd2.sh delete mode 100644 Pilot1/NT3/nt3_baseline_keras2_tensorrt.py delete mode 100644 Pilot1/NT3/nt3_tensorrt_convert.py delete mode 100644 Pilot1/NT3/training.log delete mode 100644 Pilot1/T29/README.candle delete mode 100644 Pilot1/T29/infer.py delete mode 100644 Pilot1/T29/t29_default_model.txt delete mode 100644 Pilot1/T29/t29res.py delete mode 100644 Pilot1/Uno/plangen.py delete mode 100644 Pilot1/Uno/topN_to_uno.py diff --git a/Pilot1/Attn/0 b/Pilot1/Attn/0 deleted file mode 100755 index e8371f00..00000000 --- a/Pilot1/Attn/0 +++ /dev/null @@ -1,6 +0,0 @@ -0 -1 -2 -3 -4 -5 diff --git a/Pilot1/Attn/attn_bin_working_jan7_h5.py b/Pilot1/Attn/attn_bin_working_jan7_h5.py deleted file mode 100644 index 570fc94f..00000000 --- a/Pilot1/Attn/attn_bin_working_jan7_h5.py +++ /dev/null @@ -1,568 +0,0 @@ -import itertools -import pandas as pd -import numpy as np -import os -import sys -import gzip -import argparse -import sklearn - -import matplotlib -matplotlib.use('Agg') - -import matplotlib.pyplot as plt - -import tensorflow as tf - -import keras as ke -from keras import backend as K - -from keras.layers import Input, Dense, Dropout, Activation, BatchNormalization -from keras.optimizers import SGD, 
Adam, RMSprop, Adadelta -from keras.models import Sequential, Model, model_from_json, model_from_yaml -from keras.utils import np_utils, multi_gpu_model - -from keras.callbacks import ModelCheckpoint, CSVLogger, ReduceLROnPlateau, EarlyStopping - -from sklearn.utils.class_weight import compute_class_weight -from sklearn.model_selection import train_test_split -from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, roc_auc_score, confusion_matrix, balanced_accuracy_score, classification_report -from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler -from sklearn.metrics import recall_score, auc, roc_curve, f1_score, precision_recall_curve - - -file_path = os.path.dirname(os.path.realpath(__file__)) -lib_path = os.path.abspath(os.path.join(file_path, '..', '..', 'common')) -sys.path.append(lib_path) - -psr = argparse.ArgumentParser(description='input agg csv file') -psr.add_argument('--in', default='in_file') -psr.add_argument('--ep', type=int, default=400) -psr.add_argument('--save_dir', default=".") -args=vars(psr.parse_args()) -if not args['save_dir'].endswith('/'): - args['save_dir'] = args['save_dir'] + '/' -print(args) - -EPOCH = args['ep'] -BATCH = 32 -nb_classes = 2 - -data_path = args['in'] - -# df_toss = (pd.read_csv(data_path,nrows=1).values) - -# print('df_toss:', df_toss.shape) - -# PL = df_toss.size -# PS = PL - 1 - -# print('PL=',PL) - -#PL = 6213 # 38 + 60483 -#PS = 6212 # 60483 -DR = 0.2 # Dropout rate - -def r2(y_true, y_pred): - SS_res = K.sum(K.square(y_true - y_pred)) - SS_tot = K.sum(K.square(y_true - K.mean(y_true))) - return (1 - SS_res/(SS_tot + K.epsilon())) - - - -def tf_auc(y_true, y_pred): - auc = tf.metrics.auc(y_true, y_pred)[1] - K.get_session().run(tf.local_variables_initializer()) - return auc - - -#from sklearn.metrics import roc_auc_score -#import tensorflow as tf - -def auroc( y_true, y_pred ) : - score = tf.py_func( lambda y_true, y_pred : roc_auc_score( y_true, y_pred, average='macro', sample_weight=None).astype('float32'), - [y_true, y_pred], - 'float32', - stateful=False, - name='sklearnAUC' ) - return score - - -def load_data(): - - # start change # - if args['in'].endswith('h5') or args['in'].endswith('hdf5'): - print ('processing h5 in file {}'.format(args['in'])) - - df_x_train_0 = pd.read_hdf(args['in'], 'x_train_0').astype(np.float32) - df_x_train_1 = pd.read_hdf(args['in'], 'x_train_1').astype(np.float32) - X_train = pd.concat([df_x_train_0, df_x_train_1], axis=1, sort=False) - del df_x_train_0, df_x_train_1 - - df_x_test_0 = pd.read_hdf(args['in'], 'x_test_0').astype(np.float32) - df_x_test_1 = pd.read_hdf(args['in'], 'x_test_1').astype(np.float32) - X_test = pd.concat([df_x_test_0, df_x_test_1], axis=1, sort=False) - del df_x_test_0, df_x_test_1 - - df_x_val_0 = pd.read_hdf(args['in'], 'x_val_0').astype(np.float32) - df_x_val_1 = pd.read_hdf(args['in'], 'x_val_1').astype(np.float32) - X_val = pd.concat([df_x_val_0, df_x_val_1], axis=1, sort=False) - del df_x_val_0, df_x_val_1 - - Y_train = pd.read_hdf(args['in'], 'y_train') - Y_test = pd.read_hdf(args['in'], 'y_test') - Y_val = pd.read_hdf(args['in'], 'y_val') - - # assumes AUC is in the third column at index 2 - # df_y = df['AUC'].astype('int') - # df_x = df.iloc[:,3:].astype(np.float32) - - # assumes dataframe has already been scaled - # scaler = StandardScaler() - # df_x = scaler.fit_transform(df_x) - - else: - print ('expecting in file file suffix h5') - sys.exit() - - - print('x_train shape:', X_train.shape) - print('x_test shape:', 
X_test.shape) - - return X_train, Y_train, X_val, Y_val, X_test, Y_test - - -X_train, _Y_train, X_val, _Y_val, X_test, _Y_test = load_data() -# move this inside the load_data function -Y_train = _Y_train['AUC'] -Y_test = _Y_test['AUC'] -Y_val = _Y_val['AUC'] - -Y_train_neg, Y_train_pos = np.bincount(Y_train) -Y_test_neg, Y_test_pos = np.bincount(Y_test) -Y_val_neg, Y_val_pos = np.bincount(Y_val) - -Y_train_total = Y_train_neg + Y_train_pos -Y_test_total = Y_test_neg + Y_test_pos -Y_val_total = Y_val_neg + Y_val_pos - -total = Y_train_total + Y_test_total + Y_val_total -neg = Y_train_neg + Y_test_neg + Y_val_neg -pos = Y_train_pos + Y_test_pos + Y_val_pos - -print('Examples:\n Total: {}\n Positive: {} ({:.2f}% of total)\n'.format( - total, pos, 100 * pos / total)) - -Y_train = np_utils.to_categorical(Y_train,nb_classes) -Y_test = np_utils.to_categorical(Y_test,nb_classes) -Y_val = np_utils.to_categorical(Y_val,nb_classes) - -# ----------------------- from stack overflow - -y_integers = np.argmax(Y_train, axis=1) -class_weights = compute_class_weight('balanced', np.unique(y_integers), y_integers) -d_class_weights = dict(enumerate(class_weights)) - -print('X_train shape:', X_train.shape) -print('X_test shape:', X_test.shape) - -print('Y_train shape:', Y_train.shape) -print('Y_test shape:', Y_test.shape) - -PS=X_train.shape[1] -inputs = Input(shape=(PS,)) - -x = Dense(1000, activation='relu')(inputs) -x = BatchNormalization()(x) - -a = Dense(1000, activation='relu')(x) -a = BatchNormalization()(a) - -b = Dense(1000, activation='softmax')(x) -x = ke.layers.multiply([a,b]) - -x = Dense(500, activation='relu')(x) -x = BatchNormalization()(x) -x = Dropout(DR)(x) - -x = Dense(250, activation='relu')(x) -x = BatchNormalization()(x) -x = Dropout(DR)(x) - -x = Dense(125, activation='relu')(x) -x = BatchNormalization()(x) -x = Dropout(DR)(x) - -x = Dense(60, activation='relu')(x) -x = BatchNormalization()(x) -x = Dropout(DR)(x) - -x = Dense(30, activation='relu')(x) -x = BatchNormalization()(x) -x = Dropout(DR)(x) - -outputs = Dense(2, activation='softmax')(x) - -model = Model(inputs=inputs, outputs=outputs) - -model.summary() - -#parallel_model = multi_gpu_model(model, gpus=4) -#parallel_model.compile(loss='mean_squared_error', -# optimizer=SGD(lr=0.0001, momentum=0.9), -# metrics=['mae',r2]) - -model.compile(loss='categorical_crossentropy', - optimizer=SGD(lr=0.00001, momentum=0.9), -# optimizer=Adam(lr=0.00001), -# optimizer=RMSprop(lr=0.0001), -# optimizer=Adadelta(), - metrics=['acc',tf_auc]) - -# set up a bunch of callbacks to do work during model training.. 
- -checkpointer = ModelCheckpoint(filepath=args['save_dir'] + 'Agg_attn_bin.autosave.model.h5', verbose=1, save_weights_only=False, save_best_only=True) -csv_logger = CSVLogger(args['save_dir'] + 'Agg_attn_bin.training.log') -reduce_lr = ReduceLROnPlateau(monitor='val_tf_auc', factor=0.20, patience=40, verbose=1, mode='auto', min_delta=0.0001, cooldown=3, min_lr=0.000000001) -early_stop = EarlyStopping(monitor='val_tf_auc', patience=200, verbose=1, mode='auto') - - - -#history = parallel_model.fit(X_train, Y_train, - -history = model.fit(X_train, Y_train, class_weight=d_class_weights, - batch_size=BATCH, - epochs=EPOCH, - verbose=1, - validation_data=(X_val, Y_val), - callbacks = [checkpointer, csv_logger, reduce_lr, early_stop]) - - -score = model.evaluate(X_test, Y_test, verbose=0) - -Y_predict = model.predict(X_test) - -threshold = 0.5 - -Y_pred_int = (Y_predict[:,0] < threshold).astype(np.int) -Y_test_int = (Y_test[:,0] < threshold).astype(np.int) - -print ('creating table of predictions') -f = open(args['save_dir'] + 'Agg_attn_bin.predictions.tsv', 'w') -for index, row in _Y_test.iterrows(): - if row['AUC'] == 1: - if Y_pred_int[index] == 1: - call='TP' - else: - call='FN' - if row['AUC'] == 0: - if Y_pred_int[index] == 0: - call = 'TN' - else: - call = 'FP' - # 1 TN 0 0.6323 NCI60.786-0 NSC.256439 NSC.102816 - print(index, "\t", call, "\t", Y_pred_int[index], "\t", row['AUC'], "\t", row['Sample'], "\t", row['Drug1'], file=f) -f.close() - -#print(Y_test[:,0]) -#print(Y_predict[:,0]) - -false_pos_rate, true_pos_rate, thresholds = roc_curve(Y_test[:,0], Y_predict[:,0]) - -#print(thresholds) - -roc_auc = auc(false_pos_rate, true_pos_rate) - -auc_keras = roc_auc -fpr_keras = false_pos_rate -tpr_keras = true_pos_rate - -print ('creating figure 1 at ', args['save_dir'] + 'Agg_attn_bin.auroc.pdf') -plt.figure(1) -plt.plot([0, 1], [0, 1], 'k--', label="No Skill") -plt.plot(fpr_keras, tpr_keras, label='Keras (area = {:.3f})'.format(auc_keras)) -plt.xlabel('False positive rate') -plt.ylabel('True positive rate') -plt.title('ROC curve') -plt.legend(loc='best') - -plt.savefig(args['save_dir'] + 'Agg_attn_bin.auroc.pdf', bbox_inches='tight') -plt.close() - - -# Zoom in view of the upper left corner. 
-print ('creating figure 2 at ', args['save_dir'] + 'Agg_attn_bin.auroc2.pdf') -plt.figure(2) -plt.xlim(0, 0.2) -plt.ylim(0.8, 1) -plt.plot([0, 1], [0, 1], 'k--', label="No Skill") -plt.plot(fpr_keras, tpr_keras, label='Keras (area = {:.3f})'.format(auc_keras)) -plt.xlabel('False positive rate') -plt.ylabel('True positive rate') -plt.title('ROC curve (zoomed in at top left)') -plt.legend(loc='best') - -plt.savefig(args['save_dir'] + 'Agg_attn_bin.auroc2.pdf', bbox_inches='tight') -plt.close() - - -f1 = f1_score(Y_test_int, Y_pred_int) - -precision, recall, thresholds = precision_recall_curve(Y_test[:,0], Y_predict[:,0]) - -#print(thresholds) - -pr_auc = auc(recall, precision) - -pr_keras = pr_auc -precision_keras = precision -recall_keras = recall - -print -print - -print('f1=%.3f auroc=%.3f aucpr=%.3f' % (f1, auc_keras, pr_keras)) - -print ('creating figure 3 at ', args['save_dir'] + 'Agg_attn_bin.aurpr.pdf') -plt.figure(1) -no_skill = len(Y_test_int[Y_test_int==1]) / len(Y_test_int) -plt.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill') -plt.plot(recall_keras, precision_keras, label='PR Keras (area = {:.3f})'.format(pr_keras)) -plt.xlabel('Recall') -plt.ylabel('Precision') -plt.title('PR curve') -plt.legend(loc='best') - -plt.savefig(args['save_dir'] + 'Agg_attn_bin.aurpr.pdf', bbox_inches='tight') - -plt.close() - - -def plot_confusion_matrix(cm, classes, - normalize=False, - title='Confusion matrix', - cmap=plt.cm.Blues): - """ - This function prints and plots the confusion matrix. - Normalization can be applied by setting `normalize=True`. - """ - if normalize: - cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] - print("Normalized confusion matrix") - else: - print('Confusion matrix, without normalization') - - print(cm) - - plt.imshow(cm, interpolation='nearest', cmap=cmap) - plt.title(title) - plt.colorbar() - tick_marks = np.arange(len(classes)) - plt.xticks(tick_marks, classes, rotation=45) - plt.yticks(tick_marks, classes) - - fmt = '.2f' if normalize else 'd' - thresh = cm.max() / 2. - for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])): - plt.text(j, i, format(cm[i, j], fmt), - horizontalalignment="center", - color="white" if cm[i, j] > thresh else "black") - - plt.ylabel('True label') - plt.xlabel('Predicted label') - plt.tight_layout() - -class_names=["Non-Response","Response"] - -# Compute confusion matrix -cnf_matrix = sklearn.metrics.confusion_matrix(Y_test_int, Y_pred_int) -np.set_printoptions(precision=2) - -# Plot non-normalized confusion matrix -#plt.figure() -print ('creating figure 4 at ', args['save_dir'] + 'Agg_attn_bin.confusion_without_norm.pdf') -plot_confusion_matrix(cnf_matrix, classes=class_names, - title='Confusion matrix, without normalization') -plt.savefig(args['save_dir'] + 'Agg_attn_bin.confusion_without_norm.pdf', bbox_inches='tight') - -plt.close() - - - -def plot_confusion_matrix(cm, classes, - normalize=False, - title='Confusion matrix', - cmap=plt.cm.Blues): - """ - This function prints and plots the confusion matrix. - Normalization can be applied by setting `normalize=True`. 
- """ - if normalize: - cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] - print("Normalized confusion matrix") - else: - print('Confusion matrix, without normalization') - - print(cm) - - plt.imshow(cm, interpolation='nearest', cmap=cmap) - plt.title(title) - plt.colorbar() - tick_marks = np.arange(len(classes)) - plt.xticks(tick_marks, classes, rotation=45) - plt.yticks(tick_marks, classes) - - fmt = '.2f' if normalize else 'd' - thresh = cm.max() / 2. - for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])): - plt.text(j, i, format(cm[i, j], fmt), - horizontalalignment="center", - color="white" if cm[i, j] > thresh else "black") - - plt.ylabel('True label') - plt.xlabel('Predicted label') - plt.tight_layout() - -class_names=["Non-Response","Response"] - -# Compute confusion matrix -cnf_matrix = sklearn.metrics.confusion_matrix(Y_test_int, Y_pred_int) -np.set_printoptions(precision=2) - -# Plot non-normalized confusion matrix -#plt.figure() -plot_confusion_matrix(cnf_matrix, classes=class_names, - title='Confusion matrix, without normalization') -plt.savefig(args['save_dir'] + 'Agg_attn_bin.confusion_without_norm.pdf', bbox_inches='tight') - -plt.close() - -# Plot normalized confusion matrix -#plt.figure() -plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True, - title='Normalized confusion matrix') -plt.savefig(args['save_dir'] + 'Agg_attn_bin.confusion_with_norm.pdf', bbox_inches='tight') - -plt.close() - - -print('Examples:\n Total: {}\n Positive: {} ({:.2f}% of total)\n'.format( - total, pos, 100 * pos / total)) - - -print(sklearn.metrics.roc_auc_score(Y_test_int, Y_pred_int)) - -print(sklearn.metrics.balanced_accuracy_score(Y_test_int, Y_pred_int)) - -print(sklearn.metrics.classification_report(Y_test_int, Y_pred_int)) - -print(sklearn.metrics.confusion_matrix(Y_test_int, Y_pred_int)) - -print("score") -print(score) - -#exit() - -# summarize history for accuracy -plt.plot(history.history['acc']) -plt.plot(history.history['val_acc']) -plt.title('Model Accuracy') -plt.ylabel('accuracy') -plt.xlabel('epoch') -plt.legend(['train', 'test'], loc='upper left') - -plt.savefig(args['save_dir'] + 'Agg_attn_bin.accuracy.png', bbox_inches='tight') -plt.savefig(args['save_dir'] + 'Agg_attn_bin.accuracy.pdf', bbox_inches='tight') - -plt.close() - -# summarize history for loss -plt.plot(history.history['loss']) -plt.plot(history.history['val_loss']) -plt.title('Model Loss') -plt.ylabel('loss') -plt.xlabel('epoch') -plt.legend(['train', 'test'], loc='upper left') - -plt.savefig(args['save_dir'] + 'Agg_attn_bin.loss.png', bbox_inches='tight') -plt.savefig(args['save_dir'] + 'Agg_attn_bin.loss.pdf', bbox_inches='tight') - - -print('Test val_loss:', score[0]) -print('Test accuracy:', score[1]) - -# serialize model to JSON -model_json = model.to_json() -with open(args['save_dir'] + "Agg_attn_bin.model.json", "w") as json_file: - json_file.write(model_json) - -# serialize model to YAML -model_yaml = model.to_yaml() -with open(args['save_dir'] + "Agg_attn_bin.model.yaml", "w") as yaml_file: - yaml_file.write(model_yaml) - - -# serialize weights to HDF5 -model.save_weights(args['save_dir'] + "Agg_attn_bin.model.h5") -print("Saved model to disk") - -# load json and create model -json_file = open(args['save_dir'] + 'Agg_attn_bin.model.json', 'r') -loaded_model_json = json_file.read() -json_file.close() -loaded_model_json = model_from_json(loaded_model_json) - - -# load yaml and create model -yaml_file = open(args['save_dir'] + 'Agg_attn_bin.model.yaml', 'r') 
-loaded_model_yaml = yaml_file.read() -yaml_file.close() -loaded_model_yaml = model_from_yaml(loaded_model_yaml) - - -# load weights into new model -loaded_model_json.load_weights(args['save_dir'] + "Agg_attn_bin.model.h5") -print("Loaded json model from disk") - -# evaluate json loaded model on test data -loaded_model_json.compile(loss='binary_crossentropy', optimizer='SGD', metrics=['accuracy']) -score_json = loaded_model_json.evaluate(X_test, Y_test, verbose=0) - -print('json Validation loss:', score_json[0]) -print('json Validation accuracy:', score_json[1]) - -print("json %s: %.2f%%" % (loaded_model_json.metrics_names[1], score_json[1]*100)) - - -# load weights into new model -loaded_model_yaml.load_weights(args['save_dir'] + "Agg_attn_bin.model.h5") -print("Loaded yaml model from disk") - -# evaluate loaded model on test data -loaded_model_yaml.compile(loss='binary_crossentropy', optimizer='SGD', metrics=['accuracy']) -score_yaml = loaded_model_yaml.evaluate(X_test, Y_test, verbose=0) - -print('yaml Validation loss:', score_yaml[0]) -print('yaml Validation accuracy:', score_yaml[1]) - -print("yaml %s: %.2f%%" % (loaded_model_yaml.metrics_names[1], score_yaml[1]*100)) - -# predict using loaded yaml model on test and training data - -predict_yaml_train = loaded_model_yaml.predict(X_train) - -predict_yaml_test = loaded_model_yaml.predict(X_test) - - -print('Yaml_train_shape:', predict_yaml_train.shape) -print('Yaml_test_shape:', predict_yaml_test.shape) - - -predict_yaml_train_classes = np.argmax(predict_yaml_train, axis=1) -predict_yaml_test_classes = np.argmax(predict_yaml_test, axis=1) - -np.savetxt(args['save_dir'] + "Agg_attn_bin_predict_yaml_train.csv", predict_yaml_train, delimiter=",", fmt="%.3f") -np.savetxt(args['save_dir'] + "Agg_attn_bin_predict_yaml_test.csv", predict_yaml_test, delimiter=",", fmt="%.3f") - -np.savetxt(args['save_dir'] + "Agg_attn_bin_predict_yaml_train_classes.csv", predict_yaml_train_classes, delimiter=",",fmt="%d") -np.savetxt(args['save_dir'] + "Agg_attn_bin_predict_yaml_test_classes.csv", predict_yaml_test_classes, delimiter=",",fmt="%d") diff --git a/Pilot1/Attn/attn_bin_working_jan7_h5.sh b/Pilot1/Attn/attn_bin_working_jan7_h5.sh deleted file mode 100755 index 3d75e083..00000000 --- a/Pilot1/Attn/attn_bin_working_jan7_h5.sh +++ /dev/null @@ -1,51 +0,0 @@ -#!/bin/bash - -prefix="/gpfs/alpine/scratch/brettin/med106" -local_prefix="/mnt/bb/$USER" - -m=$1 -datadir=$2 - -echo "input arg file: $m" -echo "input datadir: $datadir" - -for i in $(cat $m) ; do - - device=$(($i % 6)) - - # pad with zeros to conform to input file names - if [ $i -lt 10 ] ; then - n=00"$i" - else - n=0"$i" - fi - - export CUDA_VISIBLE_DEVICES=$device - - # should test if JSM_GPU_ASSIGNMENTS is empty - if [ $JSM_GPU_ASSIGNMENTS -eq $device ] ; then - echo "processing line value $i from infile $m using device $device on input $n" - mkdir -p "$prefix"/save/"$datadir"/"$n" - mkdir -p "$local_prefix"/save/"$datadir"/"$n" - mkdir -p "$local_prefix"/"$datadir" - - echo "copying files to $local_prefix/$datadir" - cp "$prefix"/Data_sets/"$datadir"/top_21_1fold_"$n".h5 \ - $local_prefix/"$datadir"/ - - ls $local_prefix/"$datadir" - - echo "running attn_bin_working_jan7_h5.py --in $local_prefix/$datadir/top_21_1fold_"$n".h5" - python attn_bin_working_jan7_h5.py --in $local_prefix/"$datadir"/top_21_1fold_"$n".h5 \ - --ep 200 \ - --save_dir "$local_prefix"/save/"$datadir"/"$n"/ > "$local_prefix"/save/"$datadir"/"$n".log & - sleep 2 - - fi - -done - -wait - -echo "running cp -r 
$local_prefix/save/* $prefix/save/" -cp -r $local_prefix/save/* $prefix/save/ diff --git a/Pilot1/Attn/attn_bsub.sh b/Pilot1/Attn/attn_bsub.sh deleted file mode 100755 index 5f6bdae4..00000000 --- a/Pilot1/Attn/attn_bsub.sh +++ /dev/null @@ -1,57 +0,0 @@ -#!/bin/bash -#BSUB -W 12:00 -#BSUB -nnodes 160 -#BSUB -P med106 -#BSUB -alloc_flags NVME -#BSUB -J attn1 - -# need 92 nodes for 12 hr run -# with 12 hour run should be able to do 180 (15*12) epochs -# -# at 17 nodes per data set, need to run 6 datasets (102 nodes) -# - -module load gcc/4.8.5 -module load spectrum-mpi/10.3.0.1-20190611 -module load cuda/10.1.168 -export PATH="/ccs/proj/med106/gounley1/summit/miniconda37/bin:$PATH" - -for i in $(seq 1 16) ; do - jsrun -n 6 -a 1 -c 7 -g 1 ./attn_bin_working_jan7_h5.sh $i top21_baseline > attn1.top21_baseline."$i".log 2>&1 & -done - -for i in $(seq 1 16) ; do - jsrun -n 6 -a 1 -c 7 -g 1 ./attn_bin_working_jan7_h5.sh $i top21_r.0_baseline > attn1.top21_r.0_baseline."$i".log 2>&1 & -done - -for i in $(seq 1 16) ; do - jsrun -n 6 -a 1 -c 7 -g 1 ./attn_bin_working_jan7_h5.sh $i top21_r.0_gap1 > attn1.top21_r.0_gap1."$i".log 2>&1 & -done - -for i in $(seq 1 16) ; do - jsrun -n 6 -a 1 -c 7 -g 1 ./attn_bin_working_jan7_h5.sh $i top21_r.0_gap2 > attn1.top21_r.0_gap2."$i".log 2>&1 & -done - -for i in $(seq 1 16) ; do - jsrun -n 6 -a 1 -c 7 -g 1 ./attn_bin_working_jan7_h5.sh $i top21_r.5_baseline > attn1.top21_r.5_baseline."$i".log 2>&1 & -done - -for i in $(seq 1 16) ; do - jsrun -n 6 -a 1 -c 7 -g 1 ./attn_bin_working_jan7_h5.sh $i top21_r.5_gap1 > attn1.top21_r.5_gap1."$i".log 2>&1 & -done - -for i in $(seq 1 16) ; do - jsrun -n 6 -a 1 -c 7 -g 1 ./attn_bin_working_jan7_h5.sh $i top21_r.5_gap2 > attn1.top21_r.5_gap2."$i".log 2>&1 & -done - -for i in $(seq 1 16) ; do - jsrun -n 6 -a 1 -c 7 -g 1 ./attn_bin_working_jan7_h5.sh $i top21_r.9_baseline > attn1.top21_r.9_baseline."$i".log 2>&1 & -done - -for i in $(seq 1 16) ; do - jsrun -n 6 -a 1 -c 7 -g 1 ./attn_bin_working_jan7_h5.sh $i top21_r.9_gap1 > attn1.top21_r.9_gap1."$i".log 2>&1 & -done - -for i in $(seq 1 16) ; do - jsrun -n 6 -a 1 -c 7 -g 1 ./attn_bin_working_jan7_h5.sh $i top21_r.9_gap2 > attn1.top21_r.9_gap2."$i".log 2>&1 & -done diff --git a/Pilot1/Attn/cmd1.sh b/Pilot1/Attn/cmd1.sh deleted file mode 100755 index 104543d2..00000000 --- a/Pilot1/Attn/cmd1.sh +++ /dev/null @@ -1,17 +0,0 @@ -prefix=/scratch/brettin/Agg_attn_bin_iter1 -# prefix=$HOME - -for m in $(seq -w 0 7); do - - device=$(($m % 8)) - n="00$m" - - export CUDA_VISIBLE_DEVICES=$device - mkdir -p $prefix/save/$n - - python attn_bin_working_jan7_h5.py --in /scratch/data/benchmarks/binary_811_splits/top_21_1fold_"$n".h5 \ - --ep 200 \ - --save_dir $prefix/save/"$n"/ > $prefix/save/$n.log & - - sleep 2 -done diff --git a/Pilot1/Attn/cmd2.sh b/Pilot1/Attn/cmd2.sh deleted file mode 100755 index ca4dc21f..00000000 --- a/Pilot1/Attn/cmd2.sh +++ /dev/null @@ -1,5 +0,0 @@ - -for n in $(cat $1) ; do - echo $n - ./attn_bin_working_jan7_h5.sh $n -done diff --git a/Pilot1/NT3/nt3_baseline_keras2_tensorrt.py b/Pilot1/NT3/nt3_baseline_keras2_tensorrt.py deleted file mode 100644 index 64303e28..00000000 --- a/Pilot1/NT3/nt3_baseline_keras2_tensorrt.py +++ /dev/null @@ -1,429 +0,0 @@ -from __future__ import print_function -import pandas as pd -import numpy as np -import os -import sys -import gzip -import argparse -try: - import configparser -except ImportError: - import ConfigParser as configparser - -from keras import backend as K - -from keras.layers import Input, Dense, Dropout, Activation, 
Conv1D, MaxPooling1D, Flatten -from keras.optimizers import SGD, Adam, RMSprop -from keras.models import Sequential, Model, model_from_json, model_from_yaml -from keras.utils import np_utils -from keras.callbacks import ModelCheckpoint, CSVLogger, ReduceLROnPlateau - -from sklearn.metrics import accuracy_score -from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler - -TIMEOUT=3600 # in sec; set this to -1 for no timeout -file_path = os.path.dirname(os.path.realpath(__file__)) -lib_path = os.path.abspath(os.path.join(file_path, '..', 'common')) -sys.path.append(lib_path) -lib_path2 = os.path.abspath(os.path.join(file_path, '..', '..', 'common')) -sys.path.append(lib_path2) - -import nt3 as bmk -import candle - -''' Import Tensorflow Modules ''' -import tensorflow as tf -from tensorflow.python.framework import graph_io -from tensorflow.python.tools import freeze_graph -from tensorflow.core.protobuf import saver_pb2 -from tensorflow.python.training import saver as saver_lib - - -#url_nt3 = 'ftp://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/normal-tumor/' -#file_train = 'nt_train2.csv' -#file_test = 'nt_test2.csv' - -#EPOCH = 400 -#BATCH = 20 -#CLASSES = 2 - -#PL = 60484 # 1 + 60483 these are the width of the RNAseq datasets -#P = 60483 # 60483 -#DR = 0.1 # Dropout rate - -''' -def common_parser(parser): - - parser.add_argument("--config_file", dest='config_file', type=str, - default=os.path.join(file_path, 'nt3_default_model.txt'), - help="specify model configuration file") - - # Parse has been split between arguments that are common with the default neon parser - # and all the other options - parser = p1_common.get_default_neon_parse(parser) - parser = p1_common.get_p1_common_parser(parser) - - return parser - -def get_nt3_parser(): - - parser = argparse.ArgumentParser(prog='nt3_baseline', formatter_class=argparse.ArgumentDefaultsHelpFormatter, - description='Train Autoencoder - Pilot 1 Benchmark NT3') - - return common_parser(parser) - -def read_config_file(file): - config = configparser.ConfigParser() - config.read(file) - section = config.sections() - fileParams = {} - - fileParams['data_url'] = eval(config.get(section[0],'data_url')) - fileParams['train_data'] = eval(config.get(section[0],'train_data')) - fileParams['test_data'] = eval(config.get(section[0],'test_data')) - fileParams['model_name'] = eval(config.get(section[0],'model_name')) - fileParams['conv'] = eval(config.get(section[0],'conv')) - fileParams['dense'] = eval(config.get(section[0],'dense')) - fileParams['activation'] = eval(config.get(section[0],'activation')) - fileParams['out_activation'] = eval(config.get(section[0],'out_activation')) - fileParams['loss'] = eval(config.get(section[0],'loss')) - fileParams['optimizer'] = eval(config.get(section[0],'optimizer')) - fileParams['metrics'] = eval(config.get(section[0],'metrics')) - fileParams['epochs'] = eval(config.get(section[0],'epochs')) - fileParams['batch_size'] = eval(config.get(section[0],'batch_size')) - fileParams['learning_rate'] = eval(config.get(section[0], 'learning_rate')) - fileParams['dropout'] = eval(config.get(section[0],'dropout')) - fileParams['classes'] = eval(config.get(section[0],'classes')) - fileParams['pool'] = eval(config.get(section[0],'pool')) - fileParams['save_path'] = eval(config.get(section[0], 'save_path')) - - # parse the remaining values - for k,v in config.items(section[0]): - if not k in fileParams: - fileParams[k] = eval(v) - - return fileParams - -def initialize_parameters(): - # Get command-line 
parameters - parser = get_nt3_parser() - args = parser.parse_args() - #print('Args:', args) - # Get parameters from configuration file - fileParameters = read_config_file(args.config_file) - #print ('Params:', fileParameters) - # Consolidate parameter set. Command-line parameters overwrite file configuration - gParameters = p1_common.args_overwrite_config(args, fileParameters) - return gParameters - ''' - -def initialize_parameters(default_model = 'nt3_default_model.txt'): - - # Build benchmark object - nt3Bmk = bmk.BenchmarkNT3(bmk.file_path, default_model, 'keras', - prog='nt3_baseline_tensorrt', desc='1D CNN to classify RNA sequence data in normal or tumor classes') - - # Initialize parameters - gParameters = candle.finalize_parameters(nt3Bmk) - #benchmark.logger.info('Params: {}'.format(gParameters)) - - return gParameters - -def load_data(train_path, test_path, gParameters): - - print('Loading data...') - df_train = (pd.read_csv(train_path,header=None).values).astype('float32') - df_test = (pd.read_csv(test_path,header=None).values).astype('float32') - print('done') - - print('df_train shape:', df_train.shape) - print('df_test shape:', df_test.shape) - - seqlen = df_train.shape[1] - - df_y_train = df_train[:,0].astype('int') - df_y_test = df_test[:,0].astype('int') - - Y_train = np_utils.to_categorical(df_y_train,gParameters['classes']) - Y_test = np_utils.to_categorical(df_y_test,gParameters['classes']) - - df_x_train = df_train[:, 1:seqlen].astype(np.float32) - df_x_test = df_test[:, 1:seqlen].astype(np.float32) - -# X_train = df_x_train.as_matrix() -# X_test = df_x_test.as_matrix() - - X_train = df_x_train - X_test = df_x_test - - scaler = MaxAbsScaler() - mat = np.concatenate((X_train, X_test), axis=0) - mat = scaler.fit_transform(mat) - - X_train = mat[:X_train.shape[0], :] - X_test = mat[X_train.shape[0]:, :] - - return X_train, Y_train, X_test, Y_test - - -def run(gParameters): - - print ('Params:', gParameters) - - file_train = gParameters['train_data'] - file_test = gParameters['test_data'] - url = gParameters['data_url'] - - train_file = candle.get_file(file_train, url+file_train, cache_subdir='Pilot1') - test_file = candle.get_file(file_test, url+file_test, cache_subdir='Pilot1') - - X_train, Y_train, X_test, Y_test = load_data(train_file, test_file, gParameters) - - print('X_train shape:', X_train.shape) - print('X_test shape:', X_test.shape) - - print('Y_train shape:', Y_train.shape) - print('Y_test shape:', Y_test.shape) - - x_train_len = X_train.shape[1] - - # this reshaping is critical for the Conv1D to work - - X_train = np.expand_dims(X_train, axis=2) - X_test = np.expand_dims(X_test, axis=2) - - print('X_train shape:', X_train.shape) - print('X_test shape:', X_test.shape) - - model = Sequential() - - layer_list = list(range(0, len(gParameters['conv']), 3)) - for l, i in enumerate(layer_list): - filters = gParameters['conv'][i] - filter_len = gParameters['conv'][i+1] - stride = gParameters['conv'][i+2] - print(int(i/3), filters, filter_len, stride) - if gParameters['pool']: - pool_list=gParameters['pool'] - if type(pool_list) != list: - pool_list=list(pool_list) - - if filters <= 0 or filter_len <= 0 or stride <= 0: - break - if 'locally_connected' in gParameters: - model.add(LocallyConnected1D(filters, filter_len, strides=stride, padding='valid', input_shape=(x_train_len, 1))) - else: - #input layer - if i == 0: - model.add(Conv1D(filters=filters, kernel_size=filter_len, strides=stride, padding='valid', input_shape=(x_train_len, 1))) - else: - 
model.add(Conv1D(filters=filters, kernel_size=filter_len, strides=stride, padding='valid')) - model.add(Activation(gParameters['activation'])) - if gParameters['pool']: - model.add(MaxPooling1D(pool_size=pool_list[int(i/3)])) - - model.add(Flatten()) - - for layer in gParameters['dense']: - if layer: - model.add(Dense(layer)) - model.add(Activation(gParameters['activation'])) - # This has to be disabled for tensorrt otherwise I am getting an error - if False and gParameters['dropout']: - model.add(Dropout(gParameters['dropout'])) - #model.add(Dense(gParameters['classes'])) - #model.add(Activation(gParameters['out_activation']), name='activation_5') - model.add(Dense(gParameters['classes'], activation=gParameters['out_activation'], name='activation_5')) - -#Reference case -#model.add(Conv1D(filters=128, kernel_size=20, strides=1, padding='valid', input_shape=(P, 1))) -#model.add(Activation('relu')) -#model.add(MaxPooling1D(pool_size=1)) -#model.add(Conv1D(filters=128, kernel_size=10, strides=1, padding='valid')) -#model.add(Activation('relu')) -#model.add(MaxPooling1D(pool_size=10)) -#model.add(Flatten()) -#model.add(Dense(200)) -#model.add(Activation('relu')) -#model.add(Dropout(0.1)) -#model.add(Dense(20)) -#model.add(Activation('relu')) -#model.add(Dropout(0.1)) -#model.add(Dense(CLASSES)) -#model.add(Activation('softmax')) - - kerasDefaults = candle.keras_default_config() - - # Define optimizer - optimizer = candle.build_optimizer(gParameters['optimizer'], - gParameters['learning_rate'], - kerasDefaults) - - model.summary() - for layer in model.layers: - print(layer.name) - - print([x.op.name for x in model.outputs]) - - model.compile(loss=gParameters['loss'], - optimizer=optimizer, - metrics=[gParameters['metrics']]) - - output_dir = gParameters['output_dir'] - - if not os.path.exists(output_dir): - os.makedirs(output_dir) - - # calculate trainable and non-trainable params - gParameters.update(candle.compute_trainable_params(model)) - - # set up a bunch of callbacks to do work during model training.. 
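# Illustrative sketch, not part of nt3_baseline_keras2_tensorrt.py: the loop above reads
# gParameters['conv'] as a flat list of (filters, kernel_size, stride) triples.  A
# self-contained decoding of that convention, with made-up hyperparameter values:
conv_example = [128, 20, 1, 128, 10, 1]   # two hypothetical Conv1D layers
for i in range(0, len(conv_example), 3):
    filters, kernel_size, stride = conv_example[i:i + 3]
    print(i // 3, filters, kernel_size, stride)
# prints: 0 128 20 1
#         1 128 10 1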
- model_name = gParameters['model_name'] - path = '{}/{}.autosave.model.h5'.format(output_dir, model_name) - # checkpointer = ModelCheckpoint(filepath=path, verbose=1, save_weights_only=False, save_best_only=True) - csv_logger = CSVLogger('{}/training.log'.format(output_dir)) - reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=10, verbose=1, mode='auto', epsilon=0.0001, cooldown=0, min_lr=0) - candleRemoteMonitor = candle.CandleRemoteMonitor(params=gParameters) - timeoutMonitor = candle.TerminateOnTimeOut(TIMEOUT) - history = model.fit(X_train, Y_train, - batch_size=gParameters['batch_size'], - epochs=gParameters['epochs'], - verbose=1, - validation_data=(X_test, Y_test), - callbacks = [csv_logger, reduce_lr, candleRemoteMonitor, timeoutMonitor]) - - score = model.evaluate(X_test, Y_test, verbose=0) - - #Begin tensorrt code - config = { - # Where to save models (Tensorflow + TensorRT) - #"graphdef_file": "/gpfs/jlse-fs0/users/pbalapra/tensorrt/Benchmarks/Pilot1/NT3/nt3.pb", - #"frozen_model_file": "/gpfs/jlse-fs0/users/pbalapra/tensorrt/Benchmarks/Pilot1/NT3/nt3_frozen_model.pb", - #"snapshot_dir": "/gpfs/jlse-fs0/users/pbalapra/tensorrt/Benchmarks/Pilot1/NT3/snapshot", - #"engine_save_dir": "/gpfs/jlse-fs0/users/pbalapra/tensorrt/Benchmarks/Pilot1/NT3", - "graphdef_file": "nt3.pb", - "frozen_model_file": "nt3_frozen_model.pb", - "snapshot_dir": "snapshot", - "engine_save_dir": ".", - - # Needed for TensorRT - "inference_batch_size": 1, # inference batch size - "input_layer": "conv1d_1", # name of the input tensor in the TF computational graph - "out_layer": "activation_5/Softmax", # name of the output tensorf in the TF conputational graph - "output_size" : 2, # number of classes in output (5) - "precision": "fp32" # desired precision (fp32, fp16) "test_image_path" : "/home/data/val/roses" - } - - # Now, let's use the Tensorflow backend to get the TF graphdef and frozen graph - K.set_learning_phase(0) - sess = K.get_session() - saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V2) - - # save model weights in TF checkpoint - checkpoint_path = saver.save(sess, config['snapshot_dir'], global_step=0, latest_filename='checkpoint_state') - - # remove nodes not needed for inference from graph def - train_graph = sess.graph - inference_graph = tf.graph_util.remove_training_nodes(train_graph.as_graph_def()) - - #print(len([n.name for n in tf.get_default_graph().as_graph_def().node])) - - # write the graph definition to a file. - # You can view this file to see your network structure and - # to determine the names of your network's input/output layers. - graph_io.write_graph(inference_graph, '.', config['graphdef_file']) - - # specify which layer is the output layer for your graph. - # In this case, we want to specify the softmax layer after our - # last dense (fully connected) layer. - out_names = config['out_layer'] - - # freeze your inference graph and save it for later! 
(Tensorflow) - freeze_graph.freeze_graph( - config['graphdef_file'], - '', - False, - checkpoint_path, - out_names, - "save/restore_all", - "save/Const:0", - config['frozen_model_file'], - False, - "" - ) - - - if False: - print('Test score:', score[0]) - print('Test accuracy:', score[1]) - # serialize model to JSON - model_json = model.to_json() - with open("{}/{}.model.json".format(output_dir, model_name), "w") as json_file: - json_file.write(model_json) - - # serialize model to YAML - model_yaml = model.to_yaml() - with open("{}/{}.model.yaml".format(output_dir, model_name), "w") as yaml_file: - yaml_file.write(model_yaml) - - # serialize weights to HDF5 - model.save_weights("{}/{}.weights.h5".format(output_dir, model_name)) - print("Saved model to disk") - - # load json and create model - json_file = open('{}/{}.model.json'.format(output_dir, model_name), 'r') - loaded_model_json = json_file.read() - json_file.close() - loaded_model_json = model_from_json(loaded_model_json) - - - # load yaml and create model - yaml_file = open('{}/{}.model.yaml'.format(output_dir, model_name), 'r') - loaded_model_yaml = yaml_file.read() - yaml_file.close() - loaded_model_yaml = model_from_yaml(loaded_model_yaml) - - - # load weights into new model - loaded_model_json.load_weights('{}/{}.weights.h5'.format(output_dir, model_name)) - print("Loaded json model from disk") - - # evaluate json loaded model on test data - loaded_model_json.compile(loss=gParameters['loss'], - optimizer=gParameters['optimizer'], - metrics=[gParameters['metrics']]) - score_json = loaded_model_json.evaluate(X_test, Y_test, verbose=0) - - print('json Test score:', score_json[0]) - print('json Test accuracy:', score_json[1]) - - print("json %s: %.2f%%" % (loaded_model_json.metrics_names[1], score_json[1]*100)) - - # load weights into new model - loaded_model_yaml.load_weights('{}/{}.weights.h5'.format(output_dir, model_name)) - print("Loaded yaml model from disk") - - # evaluate loaded model on test data - loaded_model_yaml.compile(loss=gParameters['loss'], - optimizer=gParameters['optimizer'], - metrics=[gParameters['metrics']]) - score_yaml = loaded_model_yaml.evaluate(X_test, Y_test, verbose=0) - - print('yaml Test score:', score_yaml[0]) - print('yaml Test accuracy:', score_yaml[1]) - - print("yaml %s: %.2f%%" % (loaded_model_yaml.metrics_names[1], score_yaml[1]*100)) - - return history - -def main(): - - gParameters = initialize_parameters() - run(gParameters) - -if __name__ == '__main__': - main() - try: - K.clear_session() - except AttributeError: # theano does not have this function - pass diff --git a/Pilot1/NT3/nt3_tensorrt_convert.py b/Pilot1/NT3/nt3_tensorrt_convert.py deleted file mode 100644 index 019f87ea..00000000 --- a/Pilot1/NT3/nt3_tensorrt_convert.py +++ /dev/null @@ -1,45 +0,0 @@ -''' Import TensorRT Modules ''' -import tensorrt as trt -import uff -from tensorrt.parsers import uffparser - -config = { - # Where to save models (Tensorflow + TensorRT) - "graphdef_file": "/gpfs/jlse-fs0/users/pbalapra/tensorrt/Benchmarks/Pilot1/NT3/nt3.pb", - "frozen_model_file": "/gpfs/jlse-fs0/users/pbalapra/tensorrt/Benchmarks/Pilot1/NT3/nt3_frozen_model.pb", - "snapshot_dir": "/gpfs/jlse-fs0/users/pbalapra/tensorrt/Benchmarks/Pilot1/NT3/snapshot", - "engine_save_dir": "/gpfs/jlse-fs0/users/pbalapra/tensorrt/Benchmarks/Pilot1/NT3", - # Needed for TensorRT - "inference_batch_size": 1, # inference batch size - "input_layer": "conv1d_1", # name of the input tensor in the TF computational graph - "out_layer": "activation_5/Softmax", 
# name of the output tensorf in the TF conputational graph - "output_size" : 2, # number of classes in output (5) - "precision": "fp32", # desired precision (fp32, fp16) - "test_image_path" : "/home/data/val/roses" -} - -G_LOGGER = trt.infer.ConsoleLogger(trt.infer.LogSeverity.INFO) -INPUT_LAYERS = [config['input_layer']] -OUTPUT_LAYERS = [config['out_layer']] -INFERENCE_BATCH_SIZE = config['inference_batch_size'] - -# Load your newly created Tensorflow frozen model and convert it to UFF -uff_model = uff.from_tensorflow_frozen_model(config['frozen_model_file'], OUTPUT_LAYERS) - -# Create a UFF parser to parse the UFF file created from your TF Frozen model -parser = uffparser.create_uff_parser() -parser.register_input(INPUT_LAYERS[0],(1,60464,128),0) -parser.register_output(OUTPUT_LAYERS[0]) - -# Build your TensorRT inference engine -if(config['precision'] == 'fp32'): - engine = trt.utils.uff_to_trt_engine(G_LOGGER, uff_model, parser, INFERENCE_BATCH_SIZE, 1<<20, trt.infer.DataType.FLOAT) -elif(config['precision'] == 'fp16'): - engine = trt.utils.uff_to_trt_engine(G_LOGGER, uff_model, parser, INFERENCE_BATCH_SIZE, 1<<20, trt.infer.DataType.HALF) - - # Serialize TensorRT engine to a file for when you are ready to deploy your model. -save_path = str(config['engine_save_dir']) + "keras_vgg19_b" + str(INFERENCE_BATCH_SIZE) + "_"+ str(config['precision']) + ".engine" - -trt.utils.write_engine_to_file(save_path, engine.serialize()) - -print("Saved TRT engine to {}".format(save_path)) diff --git a/Pilot1/NT3/training.log b/Pilot1/NT3/training.log deleted file mode 100644 index 5395ad58..00000000 --- a/Pilot1/NT3/training.log +++ /dev/null @@ -1,2 +0,0 @@ -epoch,accuracy,loss,val_accuracy,val_loss -0,0.5705357,0.6902586913534573,0.6178571581840515,0.6843732084546771 diff --git a/Pilot1/T29/README.candle b/Pilot1/T29/README.candle deleted file mode 100644 index 17f4a743..00000000 --- a/Pilot1/T29/README.candle +++ /dev/null @@ -1,43 +0,0 @@ -curl -o rip.it.test.csv.gz ftp://ftp.mcs.anl.gov/pub/candle/public/tutorials/t29res/rip.it.test.csv.gz -curl -o rip.it.train.csv.gz ftp://ftp.mcs.anl.gov/pub/candle/public/tutorials/t29res/rip.it.train.csv.gz -gunzip rip.it.test.csv.gz -gunzip rip.it.train.csv.gz - - -git checkout release_01 - -def initialize_parameters(): - t29_common = candle_keras.Benchmark(file_path, 't29_default_model.txt','keras', - prog='t29res.py',desc='resnet') - gParameters = candle.keras.initialize_parameters(t29_common) - return gParameters - - -# In the run method, get default settings for keras objects, -# such as those for the the optimizer. - -kerasDefaults = candle_keras.xkeras_default_config() -kerasDefaults['momentum_sgd'] = gParameters['momentum'] - -# In the run method, create the optimizer using user supplied -# parameters as well as those in the keras defaults. - -OPTIMIZER = keras_utils.build_optimizer(gParameters['optimizer'], - gParameters['learning_rate'], - kerasDefaults) - -# Add additional arguements that are not represented in the default -# arguments (Need a reference to the list of default arguements). 
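The additional argument definitions that follow use the same name/default/type/help
fields as argparse options. A hedged sketch of how such a list could be turned into
command-line flags (candle's actual Benchmark machinery may differ):

    import argparse

    defs = [{'name': 'connections', 'default': 1, 'type': int,
             'help': 'The number of residual connections.'}]
    parser = argparse.ArgumentParser()
    for d in defs:
        parser.add_argument('--' + d['name'], default=d['default'],
                            type=d['type'], help=d['help'])
    print(parser.parse_args(['--connections', '2']))   # Namespace(connections=2)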
- -additional_definitions = [ - {'name':'connections', - 'default':1, - 'type':int, - 'help':'The number of residual connections.'}, - {'name':'distance', - 'default':1, - 'type':int, - 'help':'Residual connection distance between dense layers.'} -] - -# To configure the width of the dense layers diff --git a/Pilot1/T29/infer.py b/Pilot1/T29/infer.py deleted file mode 100644 index e0d7608f..00000000 --- a/Pilot1/T29/infer.py +++ /dev/null @@ -1,136 +0,0 @@ -import pandas as pd -import numpy as np -import os -import sys -import keras as ke -from keras.models import Sequential, Model, model_from_json, model_from_yaml -from keras.utils import np_utils -from keras import backend as K -from keras.callbacks import ModelCheckpoint, CSVLogger, ReduceLROnPlateau, LearningRateScheduler -from sklearn.metrics import accuracy_score -from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler - - -file_path = os.path.dirname(os.path.realpath(__file__)) - -# candle -sys.path.append('/raid/brettin/Benchmarks/common') -import candle - -# This needs to be fixed -# candle -def initialize_parameters(): - t29_common = candle.Benchmark(file_path, 't29_default_model.txt','keras', - prog='t29res.py',desc='resnet') - - # Need a pointer to the docs showing what is provided - # by default - additional_definitions = [ - {'name':'connections', - 'default':1, - 'type':int, - 'help':'The number of residual connections.'}, - {'name':'distance', - 'default':1, - 'type':int, - 'help':'Residual connection distance between dense layers.'}, - {'name':'model', - 'default':'model.json', - 'type':str, - 'help':'Name of json model description file.'}, - {'name':'weights', - 'default':'model.h5', - 'type':str, - 'help':'Name of h5 weights file.'}, - {'name':'n_pred', - 'default':1, - 'type':int, - 'help':'Number of predictions to do on each sample.'} - ] - t29_common.additional_definitions = additional_definitions - gParameters = candle.finalize_parameters(t29_common) - return gParameters - - -def load_data(gParameters): - train_path=gParameters['train_path'] - test_path=gParameters['test_path'] - df_train = (pd.read_csv(train_path,header=None).values).astype('float32') - df_test = (pd.read_csv(test_path,header=None).values).astype('float32') - - print('df_train shape:', df_train.shape) - print('df_test shape:', df_test.shape) - - df_y_train = df_train[:,0].astype('int') - df_y_test = df_test[:,0].astype('int') - - Y_train = np_utils.to_categorical(df_y_train,gParameters['classes']) - train_classes = np.argmax(Y_train, axis=1) - - Y_test = np_utils.to_categorical(df_y_test,gParameters['classes']) - test_classes = np.argmax(Y_test, axis=1) - - df_x_train = df_train[:, 1:df_train.shape[1]].astype(np.float32) - df_x_test = df_test[:, 1:df_train.shape[1]].astype(np.float32) - - # not sure the extra variable is needed, and is this a copy or reference - X_train = df_x_train - X_test = df_x_test - - scaler = MaxAbsScaler() - mat = np.concatenate((X_train, X_test), axis=0) - mat = scaler.fit_transform(mat) - - X_train = mat[:X_train.shape[0], :] - X_test = mat[X_train.shape[0]:, :] - - return X_train, Y_train, X_test, Y_test - -# This is required for candle compliance. 
-# It essentially wraps what was in the implicit main funcion -def run(gParameters): - print ('gParameters: ', gParameters) - - # load the data - X_train, Y_train, X_test, Y_test = load_data(gParameters) - - # load json and create model - json_file = open(gParameters['model'], 'r') - loaded_model_json = json_file.read() - json_file.close() - loaded_model_json = model_from_json(loaded_model_json) - - # load weights into new model - loaded_model_json.load_weights(gParameters['weights']) - print("Loaded json model from disk") - - # predict using loaded yaml model on test and training data - pred_test_df = pd.DataFrame() - pred_test_classes_df = pd.DataFrame() - - for x in range(gParameters['n_pred']): - predict_test = loaded_model_json.predict(X_test) - pred_test_df[str(x)] = np.amax(predict_test, axis=1) - pred_test_classes_df[str(x)] = np.argmax(predict_test, axis=1) - - pred_test_df['mean'] = pred_test_df.mean(axis=1) - pred_test_df['std'] = pred_test_df.std(axis=1) - - pred_test_df.to_csv("predict_test.csv") - pred_test_classes_df.to_csv("predict_test_classes.csv") - return - -# This is also added for candle compliance so that the program can -# still be executed independently from the command line. -def main(): - - gParameters = initialize_parameters() - run(gParameters) - -if __name__ == '__main__': - main() - try: - ke.clear_session() - except AttributeError: # theano does not have this function - pass - diff --git a/Pilot1/T29/t29_default_model.txt b/Pilot1/T29/t29_default_model.txt deleted file mode 100644 index 33b04760..00000000 --- a/Pilot1/T29/t29_default_model.txt +++ /dev/null @@ -1,13 +0,0 @@ -[Global_Params] -train_path='./rip.it.train.csv' -test_path='./rip.it.test.csv' -batch_size=64 -epochs=100 -dropout=0.2 -classes=2 -optimizer='sgd' -learning_rate=0.002 -momentum=0.42 -loss='categorical_crossentropy' -activation='relu' -CHECK=1 diff --git a/Pilot1/T29/t29res.py b/Pilot1/T29/t29res.py deleted file mode 100644 index 3e0054f9..00000000 --- a/Pilot1/T29/t29res.py +++ /dev/null @@ -1,294 +0,0 @@ -import pandas as pd -import numpy as np -import os -import sys -import gzip -import matplotlib -matplotlib.use('Agg') -import matplotlib.pyplot as plt -import keras as ke -from keras.layers import Input, Dense, Dropout, Activation -from keras.optimizers import SGD, Adam, RMSprop -from keras.models import Sequential, Model, model_from_json, model_from_yaml -from keras.utils import np_utils -from keras import backend as K -from keras.callbacks import ModelCheckpoint, CSVLogger, ReduceLROnPlateau, LearningRateScheduler -from sklearn.metrics import accuracy_score -from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler - - -file_path = os.path.dirname(os.path.realpath(__file__)) - -# candle -file_path = os.path.dirname(os.path.realpath(__file__)) -lib_path2 = os.path.abspath(os.path.join(file_path, '..', '..', 'common')) -sys.path.append(lib_path2) -import candle - -# candle -def initialize_parameters(default_model = 't29_default_model.txt'): - t29_common = candle.Benchmark(file_path, default_model,'keras', - prog='t29res.py',desc='resnet') - - # Need a pointer to the docs showing what is provided - # by default - additional_definitions = [ - {'name':'connections', - 'default':1, - 'type':int, - 'help':'The number of residual connections.'}, - {'name':'distance', - 'default':1, - 'type':int, - 'help':'Residual connection distance between dense layers.'} - ] - t29_common.additional_definitions = additional_definitions - gParameters = 
candle.finalize_parameters(t29_common) - return gParameters - - -def load_data(nb_classes, PL, gParameters): - train_path=gParameters['train_path'] - test_path=gParameters['test_path'] - df_train = (pd.read_csv(train_path,header=None).values).astype('float32') - df_test = (pd.read_csv(test_path,header=None).values).astype('float32') - - print('df_train shape:', df_train.shape) - print('df_test shape:', df_test.shape) - - df_y_train = df_train[:,0].astype('int') - df_y_test = df_test[:,0].astype('int') - - Y_train = np_utils.to_categorical(df_y_train,nb_classes) - train_classes = np.argmax(Y_train, axis=1) - np.savetxt("train_classes.csv", train_classes, delimiter=",", fmt="%d") - - Y_test = np_utils.to_categorical(df_y_test,nb_classes) - test_classes = np.argmax(Y_test, axis=1) - np.savetxt("test_classes.csv", test_classes, delimiter=",", fmt="%d") - - df_x_train = df_train[:, 1:PL].astype(np.float32) - df_x_test = df_test[:, 1:PL].astype(np.float32) - - # not sure the extra variable is needed, and is this a copy or reference - X_train = df_x_train - X_test = df_x_test - - scaler = MaxAbsScaler() - mat = np.concatenate((X_train, X_test), axis=0) - mat = scaler.fit_transform(mat) - - X_train = mat[:X_train.shape[0], :] - X_test = mat[X_train.shape[0]:, :] - - return X_train, Y_train, X_test, Y_test - -# Create residual connections -# x is input -# distance is distance to residual connection - -# this is a function I added so that we could include -# the distance between residually connected layers -# and the number of residual connections needed -def f(x, gParameters, distance=1): - input = x - for i in range(distance): - if 'dropout' in gParameters: - x = Dropout(gParameters['dropout'])(x) - x = Dense(1000, activation=gParameters['activation'])(x) - y = ke.layers.add([input,x]) - return y - -# This is required for candle compliance. -# It essentially wraps what was in the implicit main funcion -def run(gParameters): - print ('gParameters: ', gParameters) - - EPOCH = gParameters['epochs'] - BATCH = gParameters['batch_size'] - nb_classes = gParameters['classes'] - DR = gParameters['dropout'] - ACTIVATION = gParameters['activation'] - kerasDefaults = candle.keras_default_config() - kerasDefaults['momentum_sgd'] = gParameters['momentum'] - OPTIMIZER = candle.build_optimizer(gParameters['optimizer'], - gParameters['learning_rate'], - kerasDefaults) - PL = 6213 # 38 + 60483 - PS = 6212 # 60483 - - X_train, Y_train, X_test, Y_test = load_data(nb_classes, PL, gParameters) - - print('X_train shape:', X_train.shape) - print('X_test shape:', X_test.shape) - - print('Y_train shape:', Y_train.shape) - print('Y_test shape:', Y_test.shape) - - - inputs = Input(shape=(PS,)) - - x = Dense(2000, activation=ACTIVATION)(inputs) - x = Dense(1000, activation=ACTIVATION)(x) - - for i in range(gParameters['connections']): - x = f(x, gParameters, distance=gParameters['distance'] ) - - x = Dropout(DR)(x) - - x = Dense(500, activation=ACTIVATION)(x) - x = Dropout(DR)(x) - x = Dense(250, activation=ACTIVATION)(x) - x = Dropout(DR)(x) - x = Dense(125, activation=ACTIVATION)(x) - x = Dropout(DR)(x) - x = Dense(62, activation=ACTIVATION)(x) - x = Dropout(DR)(x) - x = Dense(30, activation=ACTIVATION)(x) - x = Dropout(DR)(x) - outputs = Dense(2, activation='softmax')(x) - - model = Model(inputs=inputs, outputs=outputs) - model.summary() - model.compile(loss='categorical_crossentropy', - optimizer=OPTIMIZER, - metrics=['accuracy']) - - # set up a bunch of callbacks to do work during model training. 
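# Illustrative sketch, not part of t29res.py: the f() helper above builds a residual
# block by adding a Dense layer's output back onto its own input (the distance=1 case).
# A minimal stand-alone version using the same keras functional API:
from keras.layers import Input, Dense, add
from keras.models import Model

res_in = Input(shape=(1000,))
res_out = add([res_in, Dense(1000, activation='relu')(res_in)])   # skip connection
Model(inputs=res_in, outputs=res_out).summary()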
- checkpointer = ModelCheckpoint(filepath='t29res.autosave.model.h5', verbose=0, save_weights_only=False, save_best_only=True) - csv_logger = CSVLogger('t29res.training.log') - reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.4, patience=10, verbose=1, mode='auto', epsilon=0.0001, cooldown=3, min_lr=0.000000001) - callbacks = [checkpointer, csv_logger, reduce_lr] - - def warmup_scheduler(epoch): - lr=gParameters['learning_rate'] - if epoch <= 4: - K.set_value(model.optimizer.lr, (lr * (epoch+1) / 5)) - print ('Epoch {}: lr={}'.format(epoch, K.get_value(model.optimizer.lr))) - return K.get_value(model.optimizer.lr) - - if 'warmup_lr' in gParameters: - - warmup_lr = LearningRateScheduler(warmup_scheduler) - print("adding LearningRateScheduler") - callbacks.append(warmup_lr) - - - history = model.fit(X_train, Y_train, - batch_size=BATCH, - epochs=EPOCH, - verbose=1, - validation_data=(X_test, Y_test), - callbacks = callbacks) - - score = model.evaluate(X_test, Y_test, verbose=0) - - # summarize history for accuracy - plt.plot(history.history['acc']) - plt.plot(history.history['val_acc']) - plt.title('Model Accuracy') - plt.ylabel('accuracy') - plt.xlabel('epoch') - plt.legend(['train', 'test'], loc='upper left') - - plt.savefig('t29res.accuracy.png', bbox_inches='tight') - plt.savefig('t29res.accuracy.pdf', bbox_inches='tight') - - plt.close() - - # summarize history for loss - plt.plot(history.history['loss']) - plt.plot(history.history['val_loss']) - plt.title('Model Loss') - plt.ylabel('loss') - plt.xlabel('epoch') - plt.legend(['train', 'test'], loc='upper left') - - plt.savefig('t29res.loss.png', bbox_inches='tight') - plt.savefig('t29res.loss.pdf', bbox_inches='tight') - - print('Test val_loss:', score[0]) - print('Test accuracy:', score[1]) - - # serialize model to JSON - model_json = model.to_json() - with open("t29res.model.json", "w") as json_file: - json_file.write(model_json) - - # serialize model to YAML - model_yaml = model.to_yaml() - with open("t29res.model.yaml", "w") as yaml_file: - yaml_file.write(model_yaml) - - # serialize weights to HDF5 - model.save_weights("t29res.model.h5") - print("Saved model to disk") - - # load json and create model - json_file = open('t29res.model.json', 'r') - loaded_model_json = json_file.read() - json_file.close() - loaded_model_json = model_from_json(loaded_model_json) - - # load yaml and create model - yaml_file = open('t29res.model.yaml', 'r') - loaded_model_yaml = yaml_file.read() - yaml_file.close() - loaded_model_yaml = model_from_yaml(loaded_model_yaml) - - # load weights into new model - loaded_model_json.load_weights("t29res.model.h5") - print("Loaded json model from disk") - - # evaluate json loaded model on test data - loaded_model_json.compile(loss='binary_crossentropy', optimizer=gParameters['optimizer'], metrics=['accuracy']) - score_json = loaded_model_json.evaluate(X_test, Y_test, verbose=0) - - print('json Validation loss:', score_json[0]) - print('json Validation accuracy:', score_json[1]) - print("json %s: %.2f%%" % (loaded_model_json.metrics_names[1], score_json[1]*100)) - - # load weights into new model - loaded_model_yaml.load_weights("t29res.model.h5") - print("Loaded yaml model from disk") - - # evaluate loaded model on test data - loaded_model_yaml.compile(loss='binary_crossentropy', optimizer=gParameters['optimizer'], metrics=['accuracy']) - score_yaml = loaded_model_yaml.evaluate(X_test, Y_test, verbose=0) - - print('yaml Validation loss:', score_yaml[0]) - print('yaml Validation accuracy:', score_yaml[1]) 
- print("yaml %s: %.2f%%" % (loaded_model_yaml.metrics_names[1], score_yaml[1]*100)) - - # predict using loaded yaml model on test and training data - predict_yaml_train = loaded_model_yaml.predict(X_train) - predict_yaml_test = loaded_model_yaml.predict(X_test) - - print('Yaml_train_shape:', predict_yaml_train.shape) - print('Yaml_test_shape:', predict_yaml_test.shape) - - predict_yaml_train_classes = np.argmax(predict_yaml_train, axis=1) - predict_yaml_test_classes = np.argmax(predict_yaml_test, axis=1) - - np.savetxt("predict_yaml_train.csv", predict_yaml_train, delimiter=",", fmt="%.3f") - np.savetxt("predict_yaml_test.csv", predict_yaml_test, delimiter=",", fmt="%.3f") - - np.savetxt("predict_yaml_train_classes.csv", predict_yaml_train_classes, delimiter=",",fmt="%d") - np.savetxt("predict_yaml_test_classes.csv", predict_yaml_test_classes, delimiter=",",fmt="%d") - - return history - -# This is also added for candle compliance so that the program can -# still be executed independently from the command line. -def main(): - - gParameters = initialize_parameters() - run(gParameters) - -if __name__ == '__main__': - main() - try: - ke.clear_session() - except AttributeError: # theano does not have this function - pass - diff --git a/Pilot1/Uno/plangen.py b/Pilot1/Uno/plangen.py deleted file mode 100644 index 5eccdcca..00000000 --- a/Pilot1/Uno/plangen.py +++ /dev/null @@ -1,1489 +0,0 @@ - -from collections import deque -from collections import namedtuple -from enum import Enum -import glob -import itertools as it -import json -import numpy as np -import os -import sys -import sqlite3 -from sqlite3 import Error as db_Error - -# import planargs - -from abc import ABC, abstractmethod # abstract class support -from collections import OrderedDict -from scipy.special import comb -from pprint import pprint as pp -from datetime import datetime - -ISO_TIMESTAMP = "seconds" # timestamp to ISO string -ISO_TIMESTAMP_ENCODE = "%Y-%m-%dT%H:%M:%S" # ISO string to timestamp -DEBUG_SQL = False - -def isempty(path): - """Determine whether the given directory is empty.""" - flist = glob.glob(os.path.join(path,'*')) - return flist == [] - - -def validate_args(args): - """Validate the execution arguments as defined in planargs.py. - - This function validates input arguments defined in the 'args' namespace. - The inputs are lists series of feature-set names (fs_names), files - (fs_paths) and partitioning attributes (fs_parts). fs_names and fs_files - must designate the same number of parameters. For example: - - --fs_names CELL DRUG --fs_paths cells.txt drugs.txt - - The CELL name is paired with the cells.txt file, DRUG with drugs.txt, etc. - Currently, this one for one correspondence also applies to the fs_part arg, - which specifies the number of partitions the feature-set list is broken - into at every level of the plan generation recursion. A complete example - might look like this: - - --fsnames CELL DRUG --fs_paths cells.txt drugs.txt --fs_parts 2 2 - - An output directory for the plan in any of its formats is given by out_dir. - An input directory may be specified via in_dir to simplify the coding of - fs_paths. Otherwise, feature-set files must be fully specified. Each of the - files is read and returned. - - Returns - Upon success, a tuple is returned. It contains: - - t[0] - the generator class implementing the appropriate partition() - function. - - t[1] - a list of feature-set entry lists is returned. All entries - are stripped of white-space, all white-space lines have been removed. 
- For example: - - [[CELL1 ... CELLn] [DRUG1 ... DRUGn]] - - Additionally, an args.lines list is created where each entry contains - the entry count of the corresponding fs_paths file argument. - """ - params = {} - verbose = args.verbose - - fs_names_len = len(args.fs_names) - fs_paths_len = len(args.fs_paths) - fs_parts_len = len(args.fs_parts) - - nbr_feature_sets = fs_names_len - test_lengths = [fs_names_len, fs_paths_len, fs_parts_len] - reqd_lengths = [nbr_feature_sets] * 3 - - if test_lengths != reqd_lengths: - sys.exit("Error: The lengths of all feature set definition args (fs_<>) must be identical") - - if nbr_feature_sets <= 1: - sys.exit("Error: Partitioning requires multiple feature sets") - - for nparts in args.fs_parts: - if nparts < 1 or nparts >= 8: - sys.exit("Error: Invalid partitioning value %d" % nparts) - - # validate input and output directories - if args.in_dir and not os.path.isdir(args.in_dir): - sys.exit("Error: --in_dir must designate a directory, '%s' is not valid" % args.in_dir) - - if not os.path.isdir(args.out_dir): - sys.exit("Error: --out_dir must designate a directory, '%s' is not valid" % args.out_dir) - - if not args.overwrite and not isempty(args.out_dir): - sys.exit("Error: --out_dir '%s' is not empty, --overwrite not specified" % args.out_dir) - - if verbose: - print("Writing plan information to %s" % os.path.abspath(args.out_dir)) - - # expand, validate and load input feature-set content lists - fs_content = [] - args.fs_lines = [] - file_error = False - if args.in_dir == None: - args.in_dir = '' # prepare for use in os.path.join() - - for i, path in enumerate(args.fs_paths): - fullpath = os.path.join(args.in_dir, path) - if not os.path.exists(fullpath): - file_error = True - print("Error: %s file not found" % fullpath) - else: - with open(fullpath, 'r') as f: # read text and sanitize - raw_lines = f.readlines() - - text = [line.strip() for line in raw_lines] - text = [l for l in text if l != ''] - fs_content.append(text) - args.fs_lines.append(len(text)) - - if verbose: - print("Loading '%s' feature set definition from %s - %d lines" - % (args.fs_names[i], fullpath, len(text))) - - if file_error: - sys.exit("Terminating due to error") - - # construct a partitioning object exporting a partion() function - if args.partition_strategy == 'leaveout': - generator = LeaveoutSubsetGenerator() - - # return feature-set contents lists - return generator, fs_content - - -class SubsetGenerator(ABC): - """Abstract class implementing a data partitioning method. - - The SubsetGenerator class provides a template for subclasses that implement - mechanisms for dividing sets of lists into sublists for the purpose of - defining unique ML training and validation sets. - - Subclasses must implement those methods defined as @abstractmethod. - The validate() function provided here does a sanity test for all anticipated - partitioning schemes. Subclasses should implement their specializations. - """ - - def __init__(self, name=''): - self.name = name - self.term_msg = "Terminating due to error" - - @abstractmethod - def partition( - self, - base, - size=None, - count=None, - name='-unspecified-' - ): - """Partition a feature-set array. - - Partition the 'base', a list of elements, using the abstract arguments - 'size' and 'count' to tailor the implementation's algorithm. 'name' is - used in error reporting and is optional. 
- """ - validate(self, base, size, count, name) - return [] - - def get_plan_label(self, plan_dict, root_name): - root = plan_dict[root_name] - return root['label'] - - def _validation_error(self, base_len, size, count, name='-unspecified-'): - """Provide a common error reporting function. """ - print("Base list length: %d requested %d sublists of length %d" % - (base_len, count, size)) - - def validate(self, base, size=None, count=None, name='-unspecified-'): - """Provide basic request validation, specific generators may impose - additional requirements. - """ - berror = False - base_len = len(base) - - if size == None or size <= 0 or size > base_len: - berror = True - else: - unique_combos = comb(base_len, size) # implements N take K - if count > unique_combos: - berror = True - if berror: - SubsetGenerator._validation_error(self, base_len, size, count, name) - - return not berror - -# -# UNDER EVALUATION ????????????????????????????????????????????????????? -# - -class IterativeSubsetGenerator(SubsetGenerator): - """ Tom Brettin method... subset generation via iteration over base""" - def __init__(self): - SubsetGenerator.__init__(self, 'IterativeSubsetGenerator') - - def partition(self, base, size=None, count=0, name=None): - """ """ - - if size is None: - print("Error: Unspecified list partitioning size") - sys.exit(3) - - """ - base_len = len(base) - if count == 0: # a simplification useful in the iterative approach - count = base_len - """ - - is_valid = SubsetGenerator.validate(self, base, size, count, name) - if not is_valid: - print(self.term_msg) - sys.exit(1) - - if count > base_len: - SubsetGenerator._validation_error(self, base_len, size, count, name) - print(self.term_msg) - sys.exit(2) - - np_base = np.array(base) - selected_sublists = [] - omit_size = base_len - size - increment = min(size, omit_size) - - # omit consecutive blocks of feature-name entries - for i in range(count): - org = i * increment - if org >= base_len: - org = org % base_len - if org == 0 and i > 0: - print("Warning: %d sublists of %s completed short of the requested %d" - % (i, name, count)) - break - - end = org + size - sublist = np_base.take(range(org, end), mode='wrap') - print(sublist) - selected_sublists.append(sublist) - - return selected_sublists - - -class LeaveoutSubsetGenerator(SubsetGenerator): - """CANDLE milestone 13 style feature set partitioning. - - All SubsetGenerator subclasses are required to implement partition(), - plan_init() and plan_term() functions. 
- """ - - def __init__(self): - SubsetGenerator.__init__(self, 'LeaveoutSubsetGenerator') - self.strategy = "leaveout" - - def plan_init(self, fs_names, fs_paths, fs_lines, fs_parts, maxdepth, root_name='1'): - """Initialize - collect plan metadata """ - currtime = datetime.now() - details = {'fs_names': fs_names, 'fs_filepaths':fs_paths, 'fs_parts': fs_parts} - details['create_date'] = currtime.isoformat(timespec=ISO_TIMESTAMP) - details['strategy'] = self.strategy - - label = '' - for i in range(len(fs_names)): - if i != 0: - label += '_' - s = '{}{}-p{}'.format(fs_names[i], fs_lines[i], fs_parts[i]) - label += s - - if maxdepth > 0: - label += '-maxdepth{}'.format(maxdepth) - - details['label'] = label - plan_dict = OrderedDict() - plan_dict[root_name] = details - return root_name, plan_dict - - def plan_term(self, plan_dict, root_name, nbr_subplans): - """Completion - post plan summary metadata """ - meta = plan_dict[root_name] - meta['nbr_subplans'] = nbr_subplans - - - def partition(self, base, size='n/a', count=None, name=None): - """Partition a feature-set list into lists of equal sized elements. - - This partitioner accepts a list of feature-set names and returns - 'count' lists, the elements evenly divided between these lists. - The last sublist will contain more or fewer elements if the base - list cannot be evenly divided. - - Args - base: A list of feature-set names. - size: Ignored, not used in this implementation. - count: The number of equal sized partitions requested, required. - name: A tag used for debug/error tracing. Not used in this - implementation. - - These arguments are common to all partition functions defined in - SubsetGenerator subclasses. - - Returns - When the input 'base' list contains a number of entries equal to or - greater than 'count', a list of 'count' sublists is returned. For - example: - - [[CELL1, ..., CELL4], [CELL5, ..., CELL7]] - - Otherwise the base list is returned as a list of lists, each list - containing one feature from the input list. This implementation - maintains compatibility with the "standard" return format discussed - above. - """ - - base_len = len(base) - if base_len < count: # can partition any further? - return [[feature] for feature in base] - - size = base_len // count - sublists = [] - - for i in range(count): - org = i * size - end = org + size - if i != count - 1: - part = base[org:end] - else: - part = base[org:] - sublists.append(part) - - return sublists - -#------------------------------------------------------------------------------ -# Database support, table and column definitions, DDL and DML -# Refer to the plan_prep() function for a discussion of the "planstat" and -# "runhist" tables defined below. 
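# Illustrative, stand-alone condensation of LeaveoutSubsetGenerator.partition() above:
# a feature list is split into `count` near-equal sublists, the last sublist absorbing
# any remainder; lists shorter than `count` fall back to one feature per sublist.
def _partition_sketch(base, count):
    if len(base) < count:
        return [[feature] for feature in base]
    size = len(base) // count
    return [base[i * size:(i + 1) * size] if i < count - 1 else base[i * size:]
            for i in range(count)]

print(_partition_sketch(['CELL1', 'CELL2', 'CELL3', 'CELL4', 'CELL5', 'CELL6', 'CELL7'], 2))
# [['CELL1', 'CELL2', 'CELL3'], ['CELL4', 'CELL5', 'CELL6', 'CELL7']]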
-#------------------------------------------------------------------------------ - -class RunType(Enum): - RUN_ALL = 0 - RESTART = 1 - -class RunStat(Enum): # subplan execution status - SCHEDULED = 'scheduled' - COMPLETE = 'complete' - -# planstat table, rows are returned via the PlanstatRow namedtuple - -_planstat_ddl = """ - CREATE TABLE IF NOT EXISTS planstat ( - plan_name TEXT NOT NULL PRIMARY KEY, - create_date TEXT NOT NULL, - feature_sets TEXT NOT NULL, - partitions TEXT NOT NULL, - nbr_subplans INTEGER - ); """ - -PlanstatRow = namedtuple('PlanstatRow', - [ - 'rowid', - 'plan_name', - 'create_date', - 'feature_sets', - 'partitions', - 'nbr_subplans' - ] -) - -_select_row_from_planstat = """ - SELECT rowid, - plan_name, create_date, feature_sets, partitions, nbr_subplans - FROM planstat - WHERE plan_name='{}' - """ - -_insert_planstat_plan = """ - INSERT INTO planstat ( - plan_name, create_date, feature_sets, partitions, nbr_subplans) - VALUES ('{}', '{}', '{}', '{}', {}) - """ - -_delete_planstat_plan = """ - DELETE FROM planstat where rowid = {} - """ - -# runhist table, rows are returned via the RunhistRow namedtuple - -_runhist_ddl = """ - CREATE TABLE IF NOT EXISTS runhist ( - plan_id INTEGER NOT NULL, - subplan_id TEXT NOT NULL, - status TEXT NOT NULL, - start_time TEXT NOT NULL, - stop_time TEXT, - run_mins INT, - mae REAL, - mse REAL, - r_square REAL, - other_info TEXT, - weights_fn TEXT, - PRIMARY KEY (plan_id, subplan_id) - ); """ - -RunhistRow = namedtuple('RunhistRow', - [ - 'plan_id', - 'subplan_id', - 'status', - 'start_time', - 'stop_time', - 'run_mins', - 'mae', - 'mse', - 'r_square', - 'other_info', - 'weights_fn' - ] -) - -_select_row_from_runhist = """ - SELECT plan_id, subplan_id, status, - start_time, stop_time, run_mins, - mae, mse, r_square, other_info, weights_fn - FROM runhist - WHERE plan_id = {} and subplan_id = '{}' - """ - -_insupd_scheduled_runhist = """ - REPLACE INTO runhist(plan_id, subplan_id, status, start_time, - stop_time, run_mins, mae, mse, r_square, other_info, weights_fn) - VALUES({}, '{}', '{}', '{}', - NULL, NULL, NULL, NULL, NULL, NULL, NULL) - """ - -_insupd_completed_runhist = """ - UPDATE runhist SET - status = '{}', - stop_time = '{}', - run_mins = {}, - mae = {}, - mse = {}, - r_square = {}, - other_info = '{}', - weights_fn = '{}' - WHERE - plan_id = {} AND subplan_id='{}' - """ - -_delete_from_runhistory = """ - DELETE FROM runhist where plan_id = {} - """ - -#------------------------------------------------------------------------------ -# "Plan management" Database functions -# -# db_connect - establish database connection returning conn handle -# execute_sql_stmt - execute a SQL statement with optional error trap -# plan_prep - prepare for the execution of a multi-step "plan" -# start_subplan - start a subplan, (ex. '1.4.8'), write RunhistRow -# stop_subplan - stop a subplan, update RunhistRow -# get_subplan_runhist - return a RunhistRow for a given subplan -# plan_remove - remove all database records for the named plan -#------------------------------------------------------------------------------ - -def execute_sql_stmt(conn, stmt, cursor=None, trap_exception=False): - """Execute a SQL statement. - - This is a convenience function that wraps the execution of a given SQL - statement with exception handling and cleanup logic. - - Args - conn: An open database connection handle - stmt: A fully instantiated SQL statement - - cursor: Optionally, a cursor managed by the caller. If - local cursor is used. 
Provide a cursor if you must - operate on it after completion, fetchall() for example. - - trap_exception: By default exceptions raised by the database must be - handled by the caller. If True, errors are reflected - by the boolean return value and the cursor and/or - connection handle provided by the caller are closed.. - - Returns - False indicates that an exception occurred, else True. - """ - - if cursor: - lclcsr = cursor - else: - lclcsr = conn.cursor() - try: - if DEBUG_SQL: - with open("plangen_db.log", "a") as fp: - fp.write("STMT: " + stmt + "\n") - - db_exception = False - lclcsr.execute(stmt) - - except db_Error as e: - db_exception = True - print('execute_sql_stmt:', stmt) - print('execute_sql_stmt:', e) - if not trap_exception: - raise - finally: - if not cursor: - lclcsr.close() - - if db_exception: - if cursor: - cursor.close() - conn.close() - - return not db_exception - - -def db_connect(db_path): - """Connect to the plan management database. - - Establish a connection to the sqlite3 database contained in the named file. - A plan management database is created and populated at db_path if the file - does not exist. - - Args - db_path: A relative or absolute path or ":memory:" - - Returns - A connection handle is returned upon success, else None - """ - - if db_path == ':memory:' or not os.path.exists(db_path): - prev_allocated = False - else: - prev_allocated = True - - try: - conn = sqlite3.connect(db_path) - except db_Error as error: - print('db_connect', error) - raise - - # create plan management tables on initial database allocation - if conn and not prev_allocated: - complete = execute_sql_stmt(conn, _planstat_ddl) - complete &= execute_sql_stmt(conn, _runhist_ddl) - - if complete: - conn.commit() - else: - conn.close() - conn = None - return conn - - -def plan_remove(db_path, plan_path): - """Delete the named plan from the plan managment database. - - The relative plan name is extracted from the plan_path by removing the - leading directories and the trailing filetype suffix from the given - plan_path. The planstat row is retrieved and the associated rowid is - the plan_id identifying the target runhist table rows. - - Returns - Zero indicates deletion complete, -1 if the plan name is not matched. - """ - - status = 0 - conn = db_connect(db_path) - plan_key = _get_planstat_key(plan_path) - stmt = _select_row_from_planstat.format(plan_key) - csr = conn.cursor() - execute_sql_stmt(conn, stmt, cursor=csr) - nrow = csr.rowcount - row = csr.fetchone() - - print("%d run history rows deleted" % nrow) - - if not row: - print("Error: CLEANUP request failed - %s has not been run" % plan_key) - status = -1 - else: - plan_rec = PlanstatRow._make(row) # column-name addressable - rowid = plan_rec.rowid # the unique rowid is the plan uniquifier - _delete_runhistory(conn, rowid) - stmt = _delete_planstat_plan.format(rowid) - status = execute_sql_stmt(conn, stmt) - - csr.close() - conn.close() - return status - - -def plan_prep(db_path, plan_path, run_type=RunType.RUN_ALL): - """Prepare to run a plan, a hierarchy of interdependent subplans. - - Plan names and related information are stored in the planstat (PLAN STATUS) - table. There is one row for each plan submitted. A positive, unique integer - called the 'rowid' is assigned to table rows by the database manager. The - rowid of a planstat table row is defined here as the "plan_id". 
The plan_id - together with a textual "subplan_id" (example: '1.2.4') form a composite - key that is the primary key of the runhist (RUN HISTORY) table. The purpose - of this function is to register the plan and return the associated plan_id. - - RunTypes - When a new plan is presented it is registered in the planstat table and - during its execution a large number of runhist (RUN HISTORY) table - entries are created and then updated. To prevent unintended loss of - data one of the following "RunTypes" is specified on the initial - plan_prep() call and again on subsequent start_subplan() calls. - - Specify RUN_ALL on the first attempt to run a plan. If the plan name - is already registered, the request fails and neither the planstat or - runstat tables are changed. - - Specify RESTART if a prior attempt to run a plan did not complete. The - presence of a corresponding planstat record is verified. start_subplan() - returns a SKIP status if the associated runhist row (if any) is marked - COMPLETE. - - Args - db_path: plan management database path (relative or absolute) - plan_path: JSON plan file (relative or absolute) - run_type: RunType.RUN_ALL, the default, or RunType.RESTART - - Returns - A negative value indicates a fatal error. - - Otherwise the integer returned is the plan_id used together with a - subplan_id string used in subsequent start_subplan(), stop_subplan() - and get_subplan_hist() calls. - """ - - # load the plan and retrieve identity info - plan_dict = load_plan(plan_path) - create_date = get_plan_create_date(plan_dict) - feature_sets = get_plan_fs_names(plan_dict) - partitions = get_plan_fs_parts(plan_dict) - nbr_subplans = get_plan_nbr_subplans(plan_dict) - - # de termine if a plan of the given name has already been registered - conn = db_connect(db_path) - plan_key = _get_planstat_key(plan_path) - stmt = _select_row_from_planstat.format(plan_key) - csr = conn.cursor() - execute_sql_stmt(conn, stmt, cursor=csr) - row = csr.fetchone() - - if not row: - rowid = -1 - else: - plan_rec = PlanstatRow._make(row) # column-name addressable - rowid = plan_rec.rowid # the unique rowid will be the uniquifier returned - - # compare run_type to initial expectations - error = False - - if run_type == RunType.RUN_ALL and rowid > 0: - print("Error: RUN_ALL specified but plan: %s has already been defined" % plan_key) - error = True - - elif run_type == RunType.RESTART and rowid < 0: - print("Warning: RESTART specified but plan: %s has not been previously run" % plan_key) - - elif rowid > 0 and create_date != create_date: # DEBUG ???????????????????????????????????? 
-
-
-def start_subplan(db_path, plan_path, plan_id=None, subplan_id=None, run_type=None):
-    """Schedule the execution of a subplan.
-
-    This function writes a RunhistRow record to the runhist table indicating that
-    the named plan/subplan has been SCHEDULED. The row includes the "start time".
-    If the given run_type is RESTART, it is possible that the subplan has already
-    run, as indicated by the status returned.
-
-    Args
-        db_path: plan management database path (relative or absolute)
-        plan_path: JSON plan file (relative or absolute)
-        plan_id: the plan identifier returned by plan_prep()
-        subplan_id: the subplan identifier, ex. '1.4.8'
-        run_type: RunType.RUN_ALL or RunType.RESTART
-
-    Returns
-        Zero indicates that a RunhistRow record has been created to represent
-        the subplan. -1 is returned from a RESTART call if a RunhistRow
-        already exists for the plan/subplan and is marked COMPLETE.
-    """
-
-    conn = db_connect(db_path)
-    csr = conn.cursor()
-    skip = False
-
-    # skip previously completed work if RESTART
-    if run_type == RunType.RESTART:
-        stmt = _select_row_from_runhist.format(plan_id, subplan_id)
-        execute_sql_stmt(conn, stmt, cursor=csr)
-        row = csr.fetchone()
-
-        if row:
-            runhist_rec = RunhistRow._make(row)
-            if runhist_rec.status == RunStat.COMPLETE.name:
-                skip = True
-
-    # construct/reinit a new runhist record
-    if not skip:
-        currtime = datetime.now()
-        start_time = currtime.isoformat(timespec=ISO_TIMESTAMP)
-
-        stmt = _insupd_scheduled_runhist.format(
-            plan_id,
-            subplan_id,
-            RunStat.SCHEDULED.name,
-            start_time
-        )
-
-        execute_sql_stmt(conn, stmt, cursor=csr)
-
-    csr.close()
-    conn.commit()
-    conn.close()
-
-    if skip:
-        return -1
-    else:
-        return 0
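The RESTART/skip convention could be exercised roughly as follows, continuing the previous sketch; the subplan name '1.2' is illustrative only.

```python
# Hypothetical sketch: schedule one subplan; a negative status means a RESTART
# found the corresponding runhist row already marked COMPLETE, so skip it.
status = plangen.start_subplan(db_path, plan_path,
                               plan_id=plan_id,
                               subplan_id='1.2',          # illustrative name
                               run_type=RunType.RESTART)
if status < 0:
    print('subplan 1.2 already COMPLETE - skipping')
else:
    pass    # train the model for this subplan, then call stop_subplan()
```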
-
-
-def stop_subplan(db_path, plan_id=None, subplan_id=None, comp_info_dict={}):
-    """Complete the execution of a subplan.
-
-    This function updates the RunhistRow record created by start_subplan(),
-    setting the status to COMPLETE, the completion timestamp, and the "user
-    fields" (such as MAE, MSE, R2) returned by the model.
-
-    A comp_dict dictionary is populated with the names and default values
-    of the columns implemented in the RunhistRow table. Values matching those
-    names are extracted from comp_info_dict and written to the table.
-
-    Args
-        db_path: plan management database path (relative or absolute)
-        plan_id: the plan identifier returned by plan_prep()
-        subplan_id: the subplan identifier, ex. '1.4.8'
-        comp_info_dict: supplemental completion data dictionary
-    """
-
-    conn = db_connect(db_path)
-    csr = conn.cursor()
-    curr_time = datetime.now()
-    stop_time = curr_time.isoformat(timespec=ISO_TIMESTAMP)
-
-    comp_dict = dict(mae=0.0, mse=0.0, r_square=0.0, weights_fn='N/A', unprocessed='')
-    remainder = _acquire_actuals(comp_dict, comp_info_dict)
-
-    if len(remainder) == 0:
-        other_info = ''
-    else:
-        other_info = json.dumps(remainder)
-
-    # fetch row to retrieve schedule info
-    stmt = _select_row_from_runhist.format(plan_id, subplan_id)
-    execute_sql_stmt(conn, stmt, csr)
-    row = csr.fetchone()
-
-    if row:     # expected, caller error if already marked COMPLETED
-        runhist_rec = RunhistRow._make(row)
-        if runhist_rec.status != RunStat.COMPLETE.name:
-            start_time = datetime.strptime(runhist_rec.start_time, ISO_TIMESTAMP_ENCODE)
-            duration = curr_time - start_time
-            run_mins = int((duration.total_seconds() + 59) / 60)
-
-            # update runhist record
-            stmt = _insupd_completed_runhist.format(
-                # column values
-                RunStat.COMPLETE.name,
-                stop_time,
-                run_mins,
-                comp_dict['mae'],
-                comp_dict['mse'],
-                comp_dict['r_square'],
-                other_info,
-                comp_dict['weights_fn'],
-                # key spec
-                plan_id,
-                subplan_id
-            )
-
-            execute_sql_stmt(conn, stmt)
-
-    # cleanup
-    csr.close()
-    conn.commit()
-    conn.close()
-
-
-def get_subplan_runhist(db_path, plan_id=None, subplan_id=None):
-    """Return the RunhistRow record for a given plan/subplan.
-
-    Args
-        db_path: plan management database path (relative or absolute)
-        plan_id: the plan identifier returned by plan_prep()
-        subplan_id: the subplan identifier, ex. '1.4.8'
-
-    Returns
-        The RunhistRow associated with the given plan/subplan is returned if
-        found.
-    """
-    conn = db_connect(db_path)
-    stmt = _select_row_from_runhist.format(plan_id, subplan_id)
-    csr = conn.cursor()
-    execute_sql_stmt(conn, stmt, csr)
-    row = csr.fetchone()
-
-    if not row:
-        plan_rec = None
-    else:
-        plan_rec = RunhistRow._make(row)
-
-    return plan_rec
-
-
-def _acquire_actuals(dft_dict, actuals_dict):
-    """Extract values from a dictionary, overlaying the defaults."""
-    actuals = actuals_dict.copy()
-    for key, value in dft_dict.items():
-        if key in actuals:
-            dft_dict[key] = actuals[key]
-            actuals.pop(key)
-
-    return actuals      # possibly empty
-
-
-def _get_planstat_key(plan_path):
-    """Extract the name portion of a plan from a filepath."""
-    basename = os.path.basename(plan_path)
-    basepfx = basename.split(sep='.')
-    return basepfx[0]
-
-
-def _delete_runhistory(conn, plan_id):
-    """Delete RunhistRows containing the given plan_id."""
-    csr = conn.cursor()
-    stmt = _delete_from_runhistory.format(plan_id)
-    execute_sql_stmt(conn, stmt, cursor=csr, trap_exception=True)
-    rowcount = csr.rowcount
-    print("CLEANUP processing removed %d run history records" % rowcount)
-    csr.close()
-    return rowcount
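To round out the lifecycle, a sketch of recording completion data and reading it back; the metric values, plan_id and weights file name are illustrative.

```python
# Hypothetical sketch: mark a subplan COMPLETE and retrieve its history row.
import plangen

db_path, plan_id, subplan_id = 'plan_sql.db', 1, '1.2'   # illustrative values

plangen.stop_subplan(
    db_path,
    plan_id=plan_id,
    subplan_id=subplan_id,
    # keys matching RunhistRow columns are stored directly; anything else is
    # JSON-encoded into the other_info column
    comp_info_dict=dict(mae=0.021, mse=0.0011, r_square=0.83,
                        weights_fn='saved.weights.h5')
)

row = plangen.get_subplan_runhist(db_path, plan_id=plan_id, subplan_id=subplan_id)
if row is not None:
    print(row.status, row.start_time)
```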
-
-
-#------------------------------------------------------------------------------
-# Plan navigation, content retrieval
-#------------------------------------------------------------------------------
-
-def load_plan(filepath):
-    """Load a JSON transfer learning plan.
-
-    The named JSON transfer learning plan file is loaded in a manner that preserves
-    the entry order imposed when the plan was created. This allows the root entry
-    to be easily located regardless of the plan entry naming scheme in use.
-
-    Args
-        filepath: A relative or absolute path to the JSON file.
-
-    Returns
-        An entry-ordered plan in OrderedDict format is returned.
-    """
-
-    with open(filepath, 'r') as f:
-        ordered_plan_dict = json.load(f, object_pairs_hook=OrderedDict)
-        return ordered_plan_dict
-
-def get_plan_create_date(plan_dict):
-    _, value = _get_first_entry(plan_dict)
-    return value['create_date']
-
-def get_plan_fs_names(plan_dict):
-    _, value = _get_first_entry(plan_dict)
-    return value['fs_names']
-
-def get_plan_fs_parts(plan_dict):
-    _, value = _get_first_entry(plan_dict)
-    return value['fs_parts']
-
-def get_plan_nbr_subplans(plan_dict):
-    _, value = _get_first_entry(plan_dict)
-    return value['nbr_subplans']
-
-def _get_first_entry(ordered_dict):
-    key, value = next(iter(ordered_dict.items()))
-    return key, value
-
-def get_subplan(plan_dict, subplan_id=None):
-    """Retrieve the content of a named subplan or the root plan.
-
-    Args
-        plan_dict: The plan dictionary as returned by load_plan().
-        subplan_id: The name of the desired subplan. Omit this arg to acquire
-            the content and name of the plan tree root.
-
-    Returns
-        A (content, subplan_id) pair is returned. The returned name is useful when
-        using default arguments to retrieve the root plan.
-    """
-
-    if subplan_id is None:
-        subplan_id, content = _get_first_entry(plan_dict)
-    else:
-        content = plan_dict.get(subplan_id)
-    return content, subplan_id
-
-
-def get_predecessor(plan_dict, subplan_id):
-    """Acquire the name of the predecessor (parent) of a given subplan.
-
-    The plan tree is a true tree. All subplans have exactly one
-    predecessor/parent. Use this function to walk 'up' the tree.
-
-    Args
-        plan_dict: The plan dictionary as returned by load_plan().
-        subplan_id: The name of the target subplan.
-
-    Returns
-        The name of the parent subplan is returned. If the root plan name
-        is specified None is returned.
-    """
-
-    segments = subplan_id.split(sep='.')
-    if len(segments) <= 1:
-        subplan_id = None
-    else:
-        segments.pop()
-        subplan_id = '.'.join(segments)
-    return subplan_id
-
-
-def get_successors(plan_dict, subplan_id):
-    """Acquire the names of the successors (children) of a given subplan.
-
-    All subplans other than 'leaf' subplans have at least one successor. Use
-    this function to walk 'down' one or more plan subtrees.
-
-    Args
-        plan_dict: The plan dictionary as returned by load_plan().
-        subplan_id: The name of the target subplan.
-
-    Returns
-        A list of the names of all successors (children) of the given subplan
-        is returned. The list may be empty.
-    """
-    successor_names = []
-    for i in it.count(start=1):
-        new_name = subplan_id + '.' + str(i)
-        value = plan_dict.get(new_name)
-        if not value:
-            break
-        successor_names.append(new_name)
-
-    return successor_names
-
-
-def _get_named_set(plan_dict, subplan_id, section_tag, fs_name, collector, parent_features=None):
-    """Collect the fs_name feature lists from the given section ('train' or 'val')
-    of a subplan into collector, optionally walking up through the subplan's
-    ancestors; the root plan carries no feature data and is skipped.
-    """
-
-    while True:
-        content, _ = get_subplan(plan_dict, subplan_id)
-        assert(content)
-
-        section = content[section_tag]
-        for i, section_features in enumerate(section):
-            feature_list = section_features[fs_name]
-            collector.insert(i, feature_list)
-
-        if not parent_features:
-            break
-
-        # visit parent node, root has no feature information and ends upward traversal
-        subplan_id = get_predecessor(plan_dict, subplan_id)
-        grand_parent_id = get_predecessor(plan_dict, subplan_id)
-
-        if not grand_parent_id:
-            break
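Before the feature-retrieval routine below, a short sketch of how these navigation helpers compose into a plan-tree walk; the plan file name is a placeholder.

```python
# Hypothetical sketch: breadth-first walk of a plan tree using load_plan(),
# get_subplan(), get_successors() and get_predecessor().
from collections import deque
import plangen

plan_dict = plangen.load_plan('plangen_cell8-p2_drug8-p2.json')   # placeholder
_, root_name = plangen.get_subplan(plan_dict)    # root entry holds metadata only

queue = deque([root_name])
while queue:
    name = queue.popleft()
    children = plangen.get_successors(plan_dict, name)
    queue.extend(children)
    parent = plangen.get_predecessor(plan_dict, name)
    print('{:12s} parent={} children={}'.format(name, parent, len(children)))
```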
-
-
-def get_subplan_features(plan_dict, subplan_id, parent_features=False):
-    """Return train and validation features associated with a named subplan.
-
-    Args
-        plan_dict: The plan dictionary as returned by load_plan().
-        subplan_id: The name of the target subplan
-        parent_features: True or False
-
-    Returns
-        The result is a four-tuple (t0, t1, t2, t3) constructed as follows.
-        Some applications may choose to discard some of the returns, t0 and
-        t1, for example.
-
-        t0 - the result dictionary which is disassembled as follows
-        t1 - a list of feature names found in the train/validate sets
-        t2 - training feature set dictionary as described below
-        t3 - validation feature set dictionary as described below
-
-        t2 and t3 are dictionaries that represent one or more training sets
-        and one or more validation sets, respectively. The key of each entry
-        is a feature-set name as returned in the t1 list, ['cell', 'drug'] for
-        example. The value of each is a list of lists.
-
-        Consider a training feature set dictionary returned as follows:
-
-            {
-                'cell': [[C1, C2, C3, C4], [C5, C6, C7, C8]],
-                'drug': [[D1, D2], [D3, D4]]
-            }
-
-        The feature sets defined here are the combination of (cell[0], drug[0])
-        and (cell[1], drug[1]). The lengths, i.e. the number of sublists of each
-        dictionary entry, are always equal.
-    """
-
-    # acquire feature_set names populated in the plan
-    content, _ = get_subplan(plan_dict, subplan_id)
-    if not content:
-        return None, None
-
-    # peek inside the training set to capture active feature-set names
-    train_set = content['train'][0]
-    fs_names = [name for name in train_set.keys()]
-
-    # categorize the results
-    result = {}
-    result[0] = fs_names
-    result['train'] = {}
-    result['val'] = {}
-
-    for set_name, pf in [('train', True), ('val', False)]:
-        if pf == True:
-            pf = parent_features
-
-        for fs_name in fs_names:
-            collector = []
-            _get_named_set(
-                plan_dict,
-                subplan_id,
-                set_name,
-                fs_name,
-                collector,
-                parent_features=pf
-            )
-
-            result[set_name][fs_name] = collector
-
-    return result, result[0], result['train'], result['val']
-
-#------------------------------------------------------------------------------
-# Plan construction
-#------------------------------------------------------------------------------
-
-def build_dictionary_from_lists(seq_list, names):
-    """Create a dictionary with 'names' as labels and 'seq_list' values."""
-    dict = {}
-    for seq, tag in zip(seq_list, names):
-        dict[tag] = list(seq)
-    return dict
-
-
-def build_plan_tree(args, feature_set_content, parent_plan_id='', depth=0, data_pfx='', plan_pfx=''):
-    """Generate a plan supporting training, transfer-learning, resume-training.
-
-    ADD GENERAL DOC
-
-    This function is recursive.
-
-    Arguments:
-        args: A namespace capturing the values of command line arguments
-            and parameter values derived from those arguments. Refer to
-            validate_args().
-
-        feature_set_content: This is a list of sublists, where each sublist
-            contains the names of the nth group of feature-set elements.
-
-        parent_plan_id: This is the name of the parent's plan. The name
-            is extended with '.nn' at each level of the recursion to
-            ensure that parentage/lineage is fully conveyed in each
-            (subplan) plan_id.
-
-        depth: Specify 0 on the root call. This arg can be used to
-            determine/set the current level of the recursion.
-
-        data_pfx: Reserved for constructing feature-set name files.
-        plan_pfx: Reserved for constructing plan control files.
-
-    Returns
-        args.plan_dict contains a dictionary representing the plan. This may be
-        JSONized.
-
-        The number of planning steps (nbr of subplans in the plan tree) is explicitly
-        returned.
- """ - curr_depth = depth + 1 - if args.maxdepth > 0 and curr_depth >= args.maxdepth: - return 0 - - all_parts = [] - - #flat_partitions = [] # preserve, used for file-based approach - #files = [] # preserve, used for file-based approach - #sequence = 0 # preserve, used for file-based approach - xxx = False - - for i in range(len(args.fs_names)): - group = feature_set_content[i] - count = args.fs_parts[i] - feature_set_name = args.fs_names[i] - partitions = args.generator.partition(feature_set_content[i], count=count) - all_parts.append(partitions) - - # acquire a cross-product of all feature-set partitions - parts_xprod = np.array(list(it.product(*all_parts))) - steps = len(parts_xprod) - - if steps > 1: - substeps = 0 - for step in range(steps): - train = [] - val = [] - - # split into validation and training components - for i, plan in enumerate(parts_xprod): - section = build_dictionary_from_lists(plan, args.fs_names) - if i == step: - val.append(section) - else: - train.append(section) - - # generate next depth/level (successor) plans - curr_plan_id = '{}.{}'.format(parent_plan_id, step + 1) - args.plan_dict[curr_plan_id] = {'val': val, 'train': train} - data_name = '{}.{}'.format(data_pfx, step + 1) - plan_name = '{}.{}'.format(plan_pfx, step + 1) - - # depth-first, shorthand representation of tree showing first feature names - if args.debug: - indent = ' ' * (depth * 4) - print(indent, curr_plan_id) - indent += ' ' * 4 - fs = parts_xprod[step] - for i in range(len(fs)): - print(indent, args.fs_names[i], 'count:', len(fs[i]), 'first:', fs[i][0]) - - substeps += build_plan_tree( - args, - parts_xprod[step], - parent_plan_id=curr_plan_id, - depth=curr_depth, - data_pfx=data_name, - plan_pfx=plan_name - ) - - steps += substeps - return steps - - """ - # THIS IS A WORK-IN-PROGRESS ... 
GENERATING FILES FOR DATA AND PLAN - - files.append([]) - files_ndx = len(files) - 1 - - for j in range(len(partitions)): - part = partitions[j] - flat_partitions.append(part) - if len(part) == 0: - sys.exit("big trouble ?????????????") - - sequence += 1 - file_name = '{}.{}.{}'.format(data_pfx, sequence, feature_set_name) - print("writing file %s with %d entries" % (file_name, len(part))) # write out 'part' - #write_file(file_name, part) - pair = (feature_set_name, file_name) - files[files_ndx].append(pair) - - file_xprod = np.array(list(it.product(*files))) - nbr_plans = len(file_xprod) - - for seq in range(nbr_plans): - plan_string = '' - - for ndx, curr in enumerate(file_xprod): - if ndx == seq: - plan_string += '--val (' - else: - plan_string += '--inc (' - for (tag, fname) in curr: - plan_string += '{}-{} '.format(tag, fname) - plan_string += ')' - - file_name = '{}.{}'.format(plan_pfx, seq + 1) - print(file_name) - plan_lines = list(plan_string) - #write_file(file_name, plan_lines) - - # construct list of omitted feature entries - - for seq in range(nbr_plans): - omitted_feature_content = [] - org = 0 - - for i in partition_spec: - omitted_feature_content.append(flat_partitions[org]) - org = i - - data_name = '{}.{}'.format(data_pfx, seq + 1) - plan_name = '{}.{}'.format(plan_pfx, seq + 1) - - steps = build_plan_tree( - args, - omitted_feature_content, - parent_plan_id=curr_plan_id, - depth=curr_depth, - data_pfx=data_name, - plan_pfx=plan_name - ) - return - """ - -def write_file(fname, title, string_list): - """Write text expressed as an array of lines to file.""" - with open(fname, 'w') as f: - for line in string_list: - f.write(line) - -def write_dict_to_json(dictionary, fname): - """Write dictionary to a json file.""" - with open(fname, 'w') as f: - json.dump(dictionary, f) - -#---------------------------------------------------------------------------------- -# various hard-coded lists, test cases - the synthetic feature-sets remain useful -#---------------------------------------------------------------------------------- - -""" -synthetic_cell_names = ['cell_' + '%04d' % (x) for x in range(1000)] -synthetic_drug_names = ['drug_' + '%04d' % (x) for x in range(1000)] -""" - -#---------------------------------------------------------------------------------- -# mainline -#---------------------------------------------------------------------------------- - -def main(): - # Acquire and validate arguments - args = planargs.parse_arguments() - args.json = True # the only available option thus far - - generator, feature_set_content = validate_args(args) - args.generator = generator - - root_name, args.plan_dict = generator.plan_init( - fs_names = args.fs_names, # validated cmdline arg - fs_paths = args.fs_paths, # validated cmdline arg - fs_lines = args.fs_lines, # created by validate_args - fs_parts = args.fs_parts, # validated cmdline arg - maxdepth = args.maxdepth - ) - - # feature_set_content = [cell_names, drug_names] - # feature_set_content = [synthetic_cell_names, synthetic_drug_names] - - # remove by-1 dimensions, they do not need to be represented in the plan explicitly - while True: - try: - ndx = args.fs_parts.index(1) - args.fs_names.pop(ndx) - args.fs_paths.pop(ndx) - args.fs_lines.pop(ndx) - args.fs_parts.pop(ndx) - except ValueError: - break - - # Plan generation - data_fname_pfx = os.path.join(args.out_dir, 'DATA.1') - plan_fname_pfx = os.path.join(args.out_dir, 'PLAN.1') - - steps = build_plan_tree( - args, # command line argument namespace - feature_set_content, # 
for example [[cell1 ... celln] [drug1 ... drugn]] - parent_plan_id=root_name, # name of root plan, subplan names created from this stem - data_pfx=data_fname_pfx, # DATA file prefix, building block for feature name files - plan_pfx=plan_fname_pfx # PLAN file prefix, building block for plan name files - ) - - generator.plan_term(args.plan_dict, root_name, steps) - print("Plan generation complete, total steps: %d" % steps) - - if args.json: - label = args.generator.get_plan_label(args.plan_dict, root_name) - qualified_name = 'plangen_' + label + '.json' - json_file_name = os.path.join(args.out_dir, qualified_name) - json_abspath = os.path.abspath(json_file_name) - write_dict_to_json(args.plan_dict, json_abspath) - print("%s JSON file written" % json_abspath) - - if args.print_tree: - print("Plan dictionary generated") - pp(args.plan_dict, width=160) # DEBUG comment this out for large plans - - if args.test: - test1(json_abspath, "test1_sql.db") - # test2(json_abspath, "test2_sql.db") - -#---------------------------------------------------------------------------------- -# test plan navigation and subplan entry retrieval -#---------------------------------------------------------------------------------- - -def test2(plan_path, db_path): - run_type = RunType.RESTART - #run_type = RunType.RUN_ALL - - plan_name = os.path.basename(plan_path) - plan_id = plan_prep(db_path, plan_name, run_type) - - plan_dict = load_plan(plan_path) - metadata, root_name = get_subplan(plan_dict) - - queue = deque() - queue.append(root_name) - - print("Test2 start") - for iloop in it.count(start = 0): - if len(queue) == 0: - print("Test2 complete - proc loop count: %d" % iloop) - break - - curr_subplan = queue.popleft() - successor_names = get_successors(plan_dict, curr_subplan) - for successor in successor_names: - queue.append(successor) - - if len(curr_subplan) == 1: - continue - - status = start_subplan( - db_path, - plan_path, - plan_id=plan_id, - subplan_id=curr_subplan, - run_type=run_type - ) - - if status < 0: - continue - - completion_status = dict(mse=1.1, mae=2.2, r_square=.555) - - stop_subplan( - db_path, - plan_id=plan_id, - subplan_id=curr_subplan, - comp_info_dict=completion_status - ) - print("Completing subplan %6d" % iloop) - -#---------------------------------------------------------------------------------- -# -def test1(plan_path, db_path): - run_type = RunType.RESTART - #run_type = RunType.RUN_ALL - - plan_name = os.path.basename(plan_path) - plan_id = plan_prep(db_path, plan_name, run_type) - - if (plan_id < 0): - sys.exit("Terminating due to database detected error") - - print("\nBegin plan navigation and subplan retrieval test") - plan_dict = load_plan(plan_path) - - # plan root name value returned when subplan_id= is omitted - metadata, root_name = get_subplan(plan_dict) - - # the root has no parent / predecessor - parent_name = get_predecessor(plan_dict, root_name) - print("Demonstrate that root \'%s\' predecessor is not defined: %s" % (root_name, parent_name)) - - # the root contains metadata, it is not a run specification - successor_names = get_successors(plan_dict, root_name) - print("\nThe first runable configurations are defined in %s\n" % successor_names) - - # the root is the predecessor of these first level runables - for sname in successor_names: - parent_name = get_predecessor(plan_dict, sname) - print("The parent of %s is %s" % (sname, parent_name)) - - # run the right subtree - print("\nRun the rightmost subtree \n") - for i in it.count(start = 1): - listlen = 
len(successor_names) - if listlen == 0: - break - - for name in successor_names: - status = start_subplan( - db_path, - plan_path, - plan_id=plan_id, - subplan_id=name, - run_type=run_type - ) - - if status < 0: - print("subplan: %s skipped, previously processed" % name) - - select_one = successor_names[listlen - 1] - parent_name = get_predecessor(plan_dict, select_one) - print("%-16s is a successor of %-16s - all successors: %s" % (select_one, parent_name, successor_names)) - -# ??????????????????????????????????????????????????????????? - value,_ = get_subplan(plan_dict, select_one) - - if i < 3: - for pf in [False, True]: - _, fs_name_list, train_list, val_list = get_subplan_features(plan_dict, select_one, parent_features=pf) - print("\nsubplan original:", select_one, "parent features:", pf) - pp(plan_dict[select_one]) - print("\nflattened TRAIN") - pp(train_list) - print("\nflattened VAL") - pp(val_list) - -# ??????????????????????????????????????????????????????????? - - # test retrieval api - row = get_subplan_runhist(db_path, plan_id=plan_id, subplan_id=select_one) - #print(row) - - # post subplan termination - completion_status = dict(mse=1.1, mae=2.2, r_square=.555, misc='no such column', data=123) - - stop_subplan( - db_path, - plan_id=plan_id, - subplan_id=select_one, - comp_info_dict=completion_status - ) - - successor_names = get_successors(plan_dict, select_one) - - print("\nEnd of branch reached") -# plan_remove(db_path, "plangen_cell8-p2_drug8-p2.json") - -#---------------------------------------------------------------------------------- - -if __name__ == "__main__": - main() diff --git a/Pilot1/Uno/topN_to_uno.py b/Pilot1/Uno/topN_to_uno.py deleted file mode 100644 index 6aaf2e31..00000000 --- a/Pilot1/Uno/topN_to_uno.py +++ /dev/null @@ -1,178 +0,0 @@ -import argparse -import os -import json -from collections import OrderedDict -import pandas as pd -import numpy as np - - -def parse_arguments(): - parser = argparse.ArgumentParser() - parser.add_argument('--dataframe_from', type=str, default='top21_dataframe_8x8.csv', - help='Dataframe file name contains all data points') - parser.add_argument('--plan', type=str, default='plan.json', - help='Plan data file') - parser.add_argument('--node', type=str, default=None, - help='node number to execute') - parser.add_argument('--incremental', action='store_true', - help='True for building dataset incrementally') - parser.add_argument('--fold', type=str, default=None, - help='pre-calculated indexes for cross fold validation') - parser.add_argument('--cell_feature_selection', default=None, - help='Plain text list for cell feature filtering. one item per line') - parser.add_argument('--drug_feature_selection', default=None, - help='Plain text list for drug feature filtering. one item per line') - parser.add_argument('--output', type=str, default='topN.uno.h5', - help='output filename') - - args, unparsed = parser.parse_known_args() - return args, unparsed - - -def read_plan(filename, node): - print("reading {} file for node {}".format(filename, node)) - with open(filename, 'r') as plan_file: - plan = json.load(plan_file) - if node is None: - return plan - - if node in plan: - return plan[node] - else: - raise Exception('Node index "{}" was not found in plan file'.format(node)) - - -def build_masks(args, df): - if args.node is None: - print('node is None. 
Generate Random split') - mask = get_random_mask(df) - return mask, ~mask - - print('from new build_mask: {} {} {}'.format(args.plan, args.node, args.incremental)) - import plangen - plan = read_plan(args.plan, None) - ids = {} - mask = {} - _, _, ids['train'], ids['val'] = plangen.get_subplan_features(plan, args.node, args.incremental) - - for partition in ['train', 'val']: - _mask = df['Sample'] == None - for i in range(len(ids[partition]['cell'])): - if 'cell' in ids[partition] and 'drug' in ids[partition]: - cl_filter = ids[partition]['cell'][i] - dr_filter = ids[partition]['drug'][i] - __mask = df['Sample'].isin(cl_filter) & df['Drug1'].isin(dr_filter) - elif 'cell' in ids[partition]: - cl_filter = ids[partition]['cell'][i] - __mask = df['Sample'].isin(cl_filter) - elif 'drug' in ids[partition]: - dr_filter = ids[partition]['drug'][i] - __mask = df['Drug1'].isin(dr_filter) - _mask = _mask | __mask - mask[partition] = _mask - return mask['train'], mask['val'] - - -def get_random_mask(df): - return np.random.rand(len(df)) < 0.8 - - -def read_dataframe(args): - _, ext = os.path.splitext(args.dataframe_from) - if ext == '.h5' or ext == '.hdf5': - store = pd.HDFStore(args.dataframe_from, 'r') - df = store.get('df') - store.close() - elif ext == '.feather': - df = pd.read_feather(args.dataframe_from).fillna(0) - elif ext == '.parquet': - df = pd.read_parquet(args.dataframe_from).fillna(0) - else: - df = pd.read_csv(args.dataframe_from, low_memory=False, na_values='na').fillna(0) - - df.rename(columns={'CELL': 'Sample', 'DRUG': 'Drug1'}, inplace=True) - df_y = df[['AUC', 'Sample', 'Drug1']] - - cols = df.columns.to_list() - cl_columns = list(filter(lambda x: x.startswith('GE_'), cols)) - dd_columns = list(filter(lambda x: x.startswith('DD_'), cols)) - - if args.cell_feature_selection is not None: - features = set(pd.read_csv(args.cell_feature_selection, skip_blank_lines=True, header=None)[0].to_list()) - cl_columns = list(filter(lambda x: x in features, cl_columns)) - - if args.drug_feature_selection is not None: - features = set(pd.read_csv(args.drug_feature_selection, skip_blank_lines=True, header=None)[0].to_list()) - dd_columns = list(filter(lambda x: x in features, dd_columns)) - - df_cl = df.loc[:, cl_columns] - df_dd = df.loc[:, dd_columns] - - return df_y, df_cl, df_dd - - -def build_dataframe(args): - df_y, df_cl, df_dd = read_dataframe(args) - - if args.fold is not None: - tr_id = pd.read_csv('{}_tr_id.csv'.format(args.fold)) - vl_id = pd.read_csv('{}_vl_id.csv'.format(args.fold)) - tr_idx = tr_id.iloc[:, 0].dropna().values.astype(int).tolist() - vl_idx = vl_id.iloc[:, 0].dropna().values.astype(int).tolist() - tr_vl_idx = tr_idx + vl_idx - - y_train = df_y.iloc[tr_idx, :].reset_index(drop=True) - y_val = df_y.iloc[vl_idx, :].reset_index(drop=True) - y_test = df_y.loc[~df_y.index.isin(tr_vl_idx), :].reset_index(drop=True) - - x_train_0 = df_cl.iloc[tr_idx, :].reset_index(drop=True) - x_train_1 = df_dd.iloc[tr_idx, :].reset_index(drop=True) - x_train_1.columns = [''] * len(x_train_1.columns) - - x_val_0 = df_cl.iloc[vl_idx, :].reset_index(drop=True) - x_val_1 = df_dd.iloc[vl_idx, :].reset_index(drop=True) - x_val_1.columns = [''] * len(x_val_1.columns) - - x_test_0 = df_cl.iloc[~df_cl.index.isin(tr_vl_idx), :].reset_index(drop=True) - x_test_1 = df_dd.iloc[~df_dd.index.isin(tr_vl_idx), :].reset_index(drop=True) - x_test_1.columns = [''] * len(x_val_1.columns) - else: - train_mask, val_mask = build_masks(args, df_y) - - y_train = 
pd.DataFrame(data=df_y[train_mask].reset_index(drop=True)) - y_val = pd.DataFrame(data=df_y[val_mask].reset_index(drop=True)) - - x_train_0 = df_cl[train_mask].reset_index(drop=True) - x_train_1 = df_dd[train_mask].reset_index(drop=True) - x_train_1.columns = [''] * len(x_train_1.columns) - - x_val_0 = df_cl[val_mask].reset_index(drop=True) - x_val_1 = df_dd[val_mask].reset_index(drop=True) - x_val_1.columns = [''] * len(x_val_1.columns) - - # store - store = pd.HDFStore(args.output, 'w', complevel=9, complib='blosc:snappy') - store.put('y_train', y_train, format='table') - store.put('y_val', y_val, format='table') - store.put('x_train_0', x_train_0, format='table') - store.put('x_train_1', x_train_1, format='table') - store.put('x_val_0', x_val_0, format='table') - store.put('x_val_1', x_val_1, format='table') - - # keep input feature list and shape - cl_width = len(df_cl.columns) - dd_width = len(df_dd.columns) - store.put('model', pd.DataFrame()) - store.get_storer('model').attrs.input_features = OrderedDict([('cell.rnaseq', 'cell.rnaseq'), ('drug1.descriptors', 'drug.descriptors')]) - store.get_storer('model').attrs.feature_shapes = OrderedDict([('cell.rnaseq', (cl_width,)), ('drug.descriptors', (dd_width,))]) - - if y_test is not None: - store.put('y_test', y_test, format='table') - store.put('x_test_0', x_test_0, format='table') - store.put('x_test_1', x_test_1, format='table') - store.close() - - -if __name__ == '__main__': - parsed, unparsed = parse_arguments() - build_dataframe(parsed) From 995b94fb614e14366128c82ab95faf860ed7e1b0 Mon Sep 17 00:00:00 2001 From: Jamal Date: Wed, 3 Jun 2020 09:34:21 -0600 Subject: [PATCH 328/331] Fixed 'gpus' issue, fixed formatting. --- common/default_utils.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/common/default_utils.py b/common/default_utils.py index 1a741ace..4e649a2c 100644 --- a/common/default_utils.py +++ b/common/default_utils.py @@ -667,8 +667,10 @@ def get_common_parser(parser): # Backend configuration - parser.add_argument("--gpus", action="store", nargs='*', - default=[], type=int, + parser.add_argument("--gpus", nargs="*", + default=argparse.SUPPRESS, + #default=[0], + type=int, help="set IDs of GPUs to use") # profiling flags @@ -679,26 +681,26 @@ def get_common_parser(parser): # cyclic learning rate parser.add_argument("--clr_flag", default=argparse.SUPPRESS, - #default=None, + #default=None, type=str2bool, - help="CLR flag (boolean)") + help="CLR flag (boolean)") parser.add_argument("--clr_mode", default=argparse.SUPPRESS, - #default=None, + #default=None, type=str, choices=['trng1', 'trng2', 'exp'], - help="CLR mode (default: trng1)") + help="CLR mode (default: trng1)") parser.add_argument("--clr_base_lr", type=float, default=argparse.SUPPRESS, - #default=1e-4, - help="Base lr for cycle lr.") + #default=1e-4, + help="Base lr for cycle lr.") parser.add_argument("--clr_max_lr", type=float, default=argparse.SUPPRESS, - #default=1e-3, - help="Max lr for cycle lr.") + #default=1e-3, + help="Max lr for cycle lr.") parser.add_argument("--clr_gamma", type=float, default=argparse.SUPPRESS, - #default=0.999994, - help="Gamma parameter for learning cycle LR.") + #default=0.999994, + help="Gamma parameter for learning cycle LR.") return parser @@ -949,7 +951,6 @@ def read_config_file(self, file): for k,v in config.items(sec): if not k in fileParams: fileParams[k] = eval(v) - fileParams = self.format_benchmark_config_arguments(fileParams) #pprint(fileParams) From 
538f98cc96c279eab3d4bfd8642a6763d67cc37a Mon Sep 17 00:00:00 2001 From: Jamal Date: Wed, 3 Jun 2020 09:38:09 -0600 Subject: [PATCH 329/331] Added required gpus keyword to model files. --- Pilot1/Uno/uno_auc_clr_model.txt | 2 +- Pilot1/Uno/uno_auc_model.txt | 2 +- Pilot1/Uno/uno_by_drug_example.txt | 1 + Pilot1/Uno/uno_clr_model.txt | 1 + Pilot1/Uno/uno_default_model.txt | 1 + Pilot1/Uno/uno_fom_model.txt | 1 + Pilot1/Uno/uno_perf_bench_model.txt | 1 + 7 files changed, 7 insertions(+), 2 deletions(-) diff --git a/Pilot1/Uno/uno_auc_clr_model.txt b/Pilot1/Uno/uno_auc_clr_model.txt index 363b8467..437b19e9 100644 --- a/Pilot1/Uno/uno_auc_clr_model.txt +++ b/Pilot1/Uno/uno_auc_clr_model.txt @@ -30,7 +30,7 @@ verbose=False preprocess_rnaseq='source_scale' -gpus=1 +gpus=[0] use_landmark_genes=True no_feature_source=True no_response_source=True diff --git a/Pilot1/Uno/uno_auc_model.txt b/Pilot1/Uno/uno_auc_model.txt index 3b1f6e0d..13168906 100644 --- a/Pilot1/Uno/uno_auc_model.txt +++ b/Pilot1/Uno/uno_auc_model.txt @@ -30,7 +30,7 @@ verbose=False preprocess_rnaseq='source_scale' -gpus=1 +gpus=[0] use_landmark_genes=True no_feature_source=True no_response_source=True diff --git a/Pilot1/Uno/uno_by_drug_example.txt b/Pilot1/Uno/uno_by_drug_example.txt index 81dc30a1..6fb41364 100644 --- a/Pilot1/Uno/uno_by_drug_example.txt +++ b/Pilot1/Uno/uno_by_drug_example.txt @@ -34,6 +34,7 @@ use_landmark_genes=True partition_by='cell' by_drug='paclitaxel' cache='cache.pac' +gpus = [0] [Monitor_Params] timeout=3600 diff --git a/Pilot1/Uno/uno_clr_model.txt b/Pilot1/Uno/uno_clr_model.txt index 0ef55e80..b077cbb7 100644 --- a/Pilot1/Uno/uno_clr_model.txt +++ b/Pilot1/Uno/uno_clr_model.txt @@ -27,6 +27,7 @@ rng_seed=2018 save_path='save/uno' no_gen=False verbose = False +gpus = [0] [Monitor_Params] timeout=3600 diff --git a/Pilot1/Uno/uno_default_model.txt b/Pilot1/Uno/uno_default_model.txt index 8f406d35..72ddeb45 100644 --- a/Pilot1/Uno/uno_default_model.txt +++ b/Pilot1/Uno/uno_default_model.txt @@ -27,6 +27,7 @@ rng_seed=2018 save_path='save/uno' no_gen=False verbose = False +gpus = [0] [Monitor_Params] timeout=3600 diff --git a/Pilot1/Uno/uno_fom_model.txt b/Pilot1/Uno/uno_fom_model.txt index 2c9eb14d..99cfd839 100644 --- a/Pilot1/Uno/uno_fom_model.txt +++ b/Pilot1/Uno/uno_fom_model.txt @@ -32,6 +32,7 @@ preprocess_rnaseq='source_scale' no_feature_source=True no_response_source=True single=True +gpus = [0] [Monitor_Params] timeout=-1 diff --git a/Pilot1/Uno/uno_perf_bench_model.txt b/Pilot1/Uno/uno_perf_bench_model.txt index a35d2b55..b8f7b213 100644 --- a/Pilot1/Uno/uno_perf_bench_model.txt +++ b/Pilot1/Uno/uno_perf_bench_model.txt @@ -28,6 +28,7 @@ save_path='save/uno' no_gen=False verbose = False use_landmark_genes=True +gpus = [0] [Monitor_Params] timeout=3600 From 3ef7c9fb212201a7824b3f4a2db0a16843635158 Mon Sep 17 00:00:00 2001 From: Harry Yoo Date: Wed, 3 Jun 2020 13:29:19 -0500 Subject: [PATCH 330/331] fix document heading hierarchy --- examples/ADRP/README.md | 4 +++- examples/M16/README.md | 26 ++++++++++++++------------ 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/examples/ADRP/README.md b/examples/ADRP/README.md index 412c224e..2c6acf58 100644 --- a/examples/ADRP/README.md +++ b/examples/ADRP/README.md @@ -1,4 +1,6 @@ -The Pilot1 ADRP Benchmark loads a csv file +# Pilot1 ADRP Benchmark + +## loads a csv file Benchmark auto downloads the file below: http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/ (~500MB) diff --git a/examples/M16/README.md 
b/examples/M16/README.md index 024baa2e..2d9cb78f 100644 --- a/examples/M16/README.md +++ b/examples/M16/README.md @@ -1,4 +1,6 @@ -# Background +# Data preprocessing - feature selection examples + +## Background Data preprocessing is an important front-end step in data analysis that prepares data for subsequent analysis. It not only enables the subsequent analysis by processing and transforming data, but also influences the quality of subsequent analysis sometimes significantly. @@ -56,13 +58,13 @@ To perform co-expression extrapolation (COXEN) analysis [3] that selects predict To extend the COXEN approach for selecting genes to predict the response of tumor cells to multiple drugs in precision oncology applications. -# Running the example +## Running the example The code demonstrates feature selection methods that CANDLE provides. It can be run by executing ``` python M16_test.py ``` -## Download data +### Download data Code ```python # download all the data if needed from the repo @@ -88,7 +90,7 @@ Origin = http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_ Origin = http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Data_For_Testing/CCLE_NCI60_Gene_Expression_Full_Data.txt ``` -## Download gene set +### Download gene set Code ```python # download all the gene_set files needed @@ -124,7 +126,7 @@ Origin = http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_ Gene Set data is locally stored at /Users/hsyoo/projects/CANDLE/Benchmarks/common/../Data/examples/Gene_Sets/MSigDB.v7.0/ ``` -# Select features based on missing values +### Select features based on missing values Code ```python print('Testing select_features_by_missing_values') @@ -161,7 +163,7 @@ Select features with missing rates smaller than 0.3 Feature IDs [0 1 2 3 4 5 6 9] ``` -# Select features based on variation +### Select features based on variation Code ```python print('Testing select_features_by_variation') @@ -182,7 +184,7 @@ Select the top 2 features with the largest standard deviation Feature IDs [0 5] ``` -# Select decorrelated features +### Select decorrelated features Code ```python print('Testing select_decorrelated_features') @@ -202,7 +204,7 @@ Select features whose absolute mutual Spearman correlation coefficient is smalle Feature IDs [0 2 6 9] ``` -# Generate cross-validation partitions of data +### Generate cross-validation partitions of data Code ```python print('Testing generate_cross_validation_partition') @@ -248,7 +250,7 @@ Fitting L/S model and finding priors Finding parametric adjustments ``` -# Quantile normalization of gene expression data +### Quantile normalization of gene expression data Code ```python print('Testing quantile_normalization') @@ -301,7 +303,7 @@ Max difference of median between cell lines is 0.02 Max difference of first quartile between cell lines is 0.06 ``` -# Generate gene-set-level data +### Generate gene-set-level data ```python print('Testing generate_gene_set_data') gene_set_data = candle.generate_gene_set_data(np.transpose(norm_data), [i[0] for i in norm_data.index], gene_name_type='entrez', @@ -348,7 +350,7 @@ CCL_1078 -10.355489 ... 
-26.232325 [897 rows x 186 columns] ``` -# Combat batch normalization on gene expression data +### Combat batch normalization on gene expression data Code ```python print('Testing combat_batch_effect_removal') @@ -431,7 +433,7 @@ Average median of CCLE cell lines is 2.72 Average first quartile of CCLE cell lines is 0.13 ``` -# References +## References 1. Bolstad BM, Irizarry RA, Astrand M, et al. \(2003\) *A comparison of normalization methods for high density oligonucleotide array data based on variance and bias* Bioinformatics. 2003 Jan 22;19\(2\):185-93. From e2770da95d3c6659e30044c356809dfa974d18e8 Mon Sep 17 00:00:00 2001 From: Jamal Date: Tue, 16 Jun 2020 09:18:36 -0600 Subject: [PATCH 331/331] Fix file_utils error with untarring files. --- common/file_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/common/file_utils.py b/common/file_utils.py index c7d637d9..46e065d2 100644 --- a/common/file_utils.py +++ b/common/file_utils.py @@ -9,7 +9,6 @@ from six.moves.urllib.request import urlopen from six.moves.urllib.error import URLError, HTTPError -import wget import requests from generic_utils import Progbar @@ -88,11 +87,13 @@ def get_file(fname, origin, untar=False, fnamesplit = fname.split('.tgz') untar_fpath = os.path.join(datadir, fnamesplit[0]) untar = True + else: + untar_fpath = None fpath = os.path.join(datadir, fname) download = False - if os.path.exists(fpath) or os.path.exists(untar_fpath): + if os.path.exists(fpath) or (untar_fpath is not None and os.path.exists(untar_fpath)): # file found; verify integrity if a hash was provided if md5_hash is not None: if not validate_file(fpath, md5_hash): @@ -126,7 +127,6 @@ def dl_progress(count, block_size, total_size): try: try: urlretrieve(origin, fpath, dl_progress) - #fpath = wget.download(origin) except URLError as e: raise Exception(error_msg.format(origin, e.errno, e.reason)) except HTTPError as e: