Merge branch 'serenejiang-dev-update' into main

knightlab-analyses · Apr 16, 2021 · b7e920c · b7e920c
2 parents 62bb31e + cc04e8e
commit b7e920c
Show file tree

Hide file tree

Showing 175 changed files with 60,330 additions and 1 deletion.
diff --git a/README.md b/README.md
@@ -1,2 +1,89 @@
 # stability-analyses
-code for simulation studies and experimental microbiome data applications in Stability manuscript
+This set of codes are used for reproducing all the simulation studies and experimental microbiome data applications in Stability manuscript.
+
+I. General code:
+
+code_method folder: contain codes to reproduce simulation results for continuous outcomes
+
+	getStability.R: function to calculate Stability Index
+
+	cv_method.R: code for 4 selected feature selection methods with user-defined parameter grids and cross-validations for parameter tuning when applied to continuous outcomes
+
+	cv_method_binary_update.R: code for 4 selected feature selection methods with user-defined parameter grids and cross-validations for parameter tuning when applied to binary outcomes
+
+	stab_data_applications.R: function to perform hypothesis testing using bootstrap for continuous outcomes
+
+	stab_data_applications_binary.R: function to perform hypothesis testing using bootstrap for binary outcomes
+
+	bootstrap_test_compLasso_rf.R: general functions for comparing feature selection methods using hypothesis testing based on bootstrap when applied to continuous outcomes
+
+	bootstrap_test_compLasso_rf_binary.R: general functions for comparing feature selection methods using hypothesis testing based on bootstrap when applied to binary outcomes
+
+	source code for compositional lasso (continuous outcome) is available at: https://www.math.pku.edu.cn/teachers/linw/software.html
+	source code for compositional lasso (binary outcome) is available at: https://github.com/UVic-omics/Microbiome-Variable-Selection
+
+
+II. Simulation part (within simulations folder):
+
+sim_data_generation folder: contain codes to generate simulated data
+
+	sim_dat_ind_toeplitz: code to generate simulated data with Independent and Toeplitz correlation designs
+	sim_dat_block.R: code to generate simulated data with Block correlation design
+	run_sim_data.sh: bash commands for running simulation data generation code on HPC
+
+code_sim_cts folder: contain codes to reproduce simulation results for continuous outcomes 
+
+	cv_sim_apply.R: general functions for applying selected feature selection methods to simulated data when applied to continuous outcomes
+
+	1. compute Stability and MSE for different simulation scenarios
+	ind_results.R: code for comparing 3 methods (lasso, elastic net, random forests) in simulated data with Independent design and continuous outcomes
+	toe_results.R: code for comparing 3 methods (lasso, elastic net, random forests) in simulated data with Toeplitz design and continuous outcomes
+	block_results.R: code for comparing 3 methods (lasso, elastic net, random forests) in simulated data with Block design and continuous outcomes
+	CL_sim_apply.R: code for obtaining results for compositional lasso in all simulation correlation designs with continuous outcomes
+
+	2. hypothesis testing with bootstrap for selected simulation scenarios
+	boot_CL_testing.R: code for calculating bootstrapped confidence interval for compositional lasso method in simulated data with continous outcomes
+	boot_RF_testing.R: code for calculating bootstrapped confidence interval for random forests method in simulated data with continous outcomes
+
+	3. bash commands
+	run_sim_cts.sh: bash commands for running simulation code for continous outcomes on HPC
+
+
+code_sim_bin folder: contain codes to reproduce simulation results for binary outcomes
+
+	cv_sim_apply_binary_update.R: general functions for applying selected feature selection methods to simulated data when applied to binary outcomes
+
+	1. compute Stability and AUC for different simulation scenarios
+	ind_results_binary_update.R: code for comparing all 4 methods in simulated data with Independent design and binary outcomes
+	toe_results_binary_update.R: code for comparing all 4 methods in simulated data with Toeplitz design and binary outcomes
+	block_results_binary_update.R: code for comparing all 4 methods in simulated data with Block design and binary outcomes
+
+	2. hypothesis testing with bootstrap for selected simulation scenarios
+	boot_sim_binary.R: code for calculating bootstrapped confidence interval for compositional lasso and random forests methods in simulated data with binary outcomes
+
+	3. bash commands
+	run_sim_bin.sh: bash commands for running simulation code for binary outcomes on HPC
+
+notebooks_sim_cts folder: contain notebooks (R) to summarize simulation results for continuous outcome
+
+notebooks_sim_bin folder: contain notebooks (R) to summarize simulation results for binary outcome	
+
+results_summary_cts folder: contain outputs of tables from notebooks in notebooks_sim_cts folder
+
+results_summary_bin folder: contain outputs of tables from notebooks in notebooks_sim_bin folder
+
+figures_combined folder: contain figures generated for both continous and binary outcomes based on notebook 6_make_figures_combined in notebooks_sim_bin folder
+
+
+III. application part (within data_application folder):
+
+	code_cts folder: contain code for real data applications to BMI & soil datasets for continuous outcomes
+
+	code_bin folder: contain code for real data applications to BMI & soil datasets for binary outcomes
+
+	notebooks_applications folder: contain notebooks (R) to summarize microbiome application results for continuous and binary outcomes
+
+	88soils folder: contain data and application results for soil datast
+
+	BMI folder: contain data and application results for BMI datast
+
diff --git a/code_method/.DS_Store b/code_method/.DS_Store
diff --git a/code_method/bootstrap_test_compLasso_rf.R b/code_method/bootstrap_test_compLasso_rf.R
@@ -0,0 +1,126 @@
+##########################################################
+### hypothesis testing with bootstraps ###################
+##########################################################
+
+source('cv_method.R') 
+source('getStability.R')
+
+## set up parallel computing
+library(foreach)
+library(doParallel)
+numCores <- detectCores() - 2 
+registerDoParallel(numCores)  # use multicore, set to the number of our cores
+
+boot_stab_sim = function(num_boot=100, sim_file, method, seednum=31, ratio.training=0.8, fold.cv=10, 
+					  family='gaussian', lambda.grid=exp(seq(-4, -2, 0.2)), alpha.grid=seq(0.1, 0.9, 0.1), 
+	                  mtry.grid=seq(5, 25, 5), num_trees = 500, pval_thr = 0.05){
+
+	# load simulated data
+	load(sim_file, dat <- new.env())
+
+	idx.start = 1; idx.stop = 100 # 100 repetitions for each simulated scenario
+	rou = dat$sim_array[[1]]$rou # rou, n, p are same across all repetitions
+	n = dat$sim_array[[1]]$n
+	p = dat$sim_array[[1]]$p
+
+	## get a vector of stability index from bootstrapped data
+	stab_index = rep(0, num_boot)
+	b = 0
+	for (i in idx.start:idx.stop){
+		b = b + 1
+		print(paste('index', i, sep=':'))
+	    sub = dat$sim_array[[i]]
+
+	    # bootsrap with parallelization
+		selections = foreach (i=1:num_boot) %dopar% {
+			N = length(sub$Y) # number of samples
+			boot_ids = sample(N, size=N, replace=TRUE)
+			boot_Z = sub$Z[boot_ids, ]
+			boot_Y = sub$Y[boot_ids, ]
+
+		    ## select features from lasso/elnet
+		    if (method == 'compLasso'){
+		        result.lin = cons_lasso_cv(y=boot_Y, datx=boot_Z, seednum=i, ratio.training=ratio.training)
+		        select.lin = result.lin$coef.chosen  # since 1 represents intercept
+
+		    } else if (method == 'RF'){
+		        result.rf = randomForest_cv(y=boot_Y, datx=boot_Z, seednum=i, fold.cv=fold.cv, 
+	    							        num_trees=num_trees, mtry.grid = mtry.grid, pval_thr=pval_thr)
+	            select.rf = result.rf$coef.chosen 
+		    }
+		}
+
+		# calculate stability index from bootstrapped data
+		stability_table = matrix(rep(0, num_boot * p), ncol=p)
+		for (j in 1:num_boot){
+		stability_table[j, selections[[j]]] = 1 
+		}
+
+		stab_index[b] = round(getStability(stability_table)$stability, 2)
+	}	
+
+	results=list(rou=rou, n=n, p=p, num_boot=num_boot, method=method, stab_index=stab_index)
+
+}
+
+
+########################################################
+## double bootstrap applied to real data application ###
+########################################################
+boot_stab_data = function(num_boot=100, data_file, method, seednum=31, ratio.training=0.8, fold.cv=10, 
+					      family='gaussian', lambda.grid=exp(seq(-4, -2, 0.2)), alpha.grid=seq(0.1, 0.9, 0.1), 
+	                      mtry.grid=seq(5, 25, 5), num_trees = 500, pval_thr = 0.05){
+
+	# load simulated data
+	set.seed(seednum)
+	load(data_file)
+	p = dim(taxa)[2]
+
+	stab_index = rep(0, num_boot)
+	MSE_list = list()
+	# first loop of bootstrap to generate num_boot bootstrapped datasets
+	for (k in 1:num_boot){
+        print(paste('num_boot', k, sep=':'))
+		N = length(y) # number of samples
+		sample_ids = seq(1, N, 1)
+
+		# bootstrapped samples
+	    boot_ids = sample(sample_ids, size=N, replace=TRUE)
+	    boot_taxa = taxa[boot_ids, ]
+	    boot_mf = y[boot_ids]
+
+	    ## second loop of bootstrap to perform variable selection
+		results = foreach (i=1:num_boot) %dopar% {
+			boot_ids_second = sample(N, size=N, replace=TRUE)
+			boot_Z = boot_taxa[boot_ids_second, ]
+			boot_Y = boot_mf[boot_ids_second]
+
+		    ## select features from lasso/elnet
+		    if (method == 'compLasso'){
+		        result.lin = cons_lasso_cv(y=boot_Y, datx=boot_Z, seednum=i, ratio.training=ratio.training)
+		        output.lin = c(result.lin$MSE, result.lin$coef.chosen) 
+
+		    } else if (method == 'RF'){
+		        result.rf = randomForest_cv(y=boot_Y, datx=boot_Z, seednum=i, fold.cv=fold.cv, 
+	    							        num_trees=num_trees, mtry.grid = mtry.grid, pval_thr=pval_thr)
+	            output.rf = c(result.rf$MSE, result.rf$coef.chosen) 
+		    }
+		}
+
+		# reformat results (stability & MSE)
+		stability_table = matrix(rep(0, num_boot * p), ncol=p)
+		results_mse = results_chosen = list()
+		for (b in 1:num_boot){
+			results_mse[b] = results[[b]][1]
+			results_chosen[[b]] = results[[b]][-1]
+			stability_table[b, results_chosen[[b]]] = 1 
+		}
+
+		stab_index[k] = round(getStability(stability_table)$stability, 2)
+		MSE_list[[k]] = results_mse
+	}
+
+
+	results=list(num_boot=num_boot, method=method, stab_index=stab_index, MSE_list=MSE_list)
+
+}
diff --git a/code_method/bootstrap_test_compLasso_rf_binary.R b/code_method/bootstrap_test_compLasso_rf_binary.R
@@ -0,0 +1,137 @@
+##########################################################
+### estimate correlation between stability index #########
+##########################################################
+
+source('cv_method_binary_update.R')
+source('getStability.R')
+
+## set up parallel computing
+library(foreach)
+library(doParallel)
+numCores <- detectCores() - 2 # 6 cores
+registerDoParallel(numCores)  # use multicore, set to the number of our cores
+
+########################################################
+## double bootstrap applied to simulations ###
+########################################################
+boot_stab_sim = function(num_boot=100, sim_file, method, seednum=31, 
+                         ratio.training=0.8, fold.cv=10, 
+                         family='binomial', lambda.grid=exp(seq(-4, -2, 0.2)), 
+                         alpha.grid=seq(0.1, 0.9, 0.1), 
+                         mtry.grid=seq(5, 25, 5), num_trees = 500, pval_thr = 0.05,
+                         lambda.coda=seq(0.1, 0.2, 0.01), data.split=FALSE){
+
+	# load simulated data
+	load(sim_file, dat <- new.env())
+
+	idx.start = 1; idx.stop = 100 # 100 repetitions for each simulated scenario
+	rou = dat$sim_array[[1]]$rou # rou, n, p are same across all repetitions
+	n = dat$sim_array[[1]]$n
+	p = dat$sim_array[[1]]$p
+
+	## get a vector of stability index from bootstrapped data
+	stab_index = rep(0, num_boot)
+	b = 0
+	for (i in idx.start:idx.stop){
+		b = b + 1
+		print(paste('index', i, sep=':'))
+	    sub = dat$sim_array[[i]]
+        y_binary = as.factor(ifelse(sub$Y >= median(sub$Y), 1, 0))
+
+	    # bootsrap with parallelization
+		selections = foreach (i=1:num_boot) %dopar% {
+			N = length(y_binary) # number of samples
+			boot_ids = sample(N, size=N, replace=TRUE)
+			boot_Z = sub$Z[boot_ids, ]
+			boot_Y = y_binary[boot_ids]
+            boot_X = sub$X[boot_ids, ] # for generalized compositional lasso 
+
+		    ## select features 
+		    if (method == 'GenCompLasso'){# generalized compositional lasso use X instead of Z 
+		        result.lin = gen_cons_lasso_cv(y=boot_Y, datx=boot_X, seednum=i, ratio.training=ratio.training,
+                                               lambda.coda=lambda.coda, data.split=data.split)
+		        select.lin = result.lin$coef.chosen 
+
+		    } else if (method == 'RF'){
+		        result.rf = randomForest_cv(y=boot_Y, datx=boot_Z, seednum=i, fold.cv=fold.cv, 
+	    							        num_trees=num_trees, mtry.grid = mtry.grid, pval_thr=pval_thr)
+	            select.rf = result.rf$coef.chosen 
+		    }
+		}
+
+		# calculate stability index from bootstrapped data
+		stability_table = matrix(rep(0, num_boot * p), ncol=p)
+		for (j in 1:num_boot){
+		stability_table[j, selections[[j]]] = 1 
+		}
+
+		stab_index[b] = round(getStability(stability_table)$stability, 2)
+	}	
+
+	results=list(rou=rou, n=n, p=p, num_boot=num_boot, method=method, stab_index=stab_index)
+
+}
+
+
+########################################################
+## double bootstrap applied to real data application ###
+########################################################
+boot_stab_data = function(num_boot=100, data_file, method, seednum=31, 
+                          ratio.training=0.8, fold.cv=10, 
+					      family='binomial', lambda.grid=exp(seq(-4, -2, 0.2)), 
+                          alpha.grid=seq(0.1, 0.9, 0.1), 
+	                      mtry.grid=seq(5, 25, 5), num_trees = 500, pval_thr = 0.05,
+                          lambda.coda=seq(0.1, 0.2, 0.01), data.split=FALSE){
+
+	# load simulated data
+	set.seed(seednum)
+	load(data_file)
+	p = dim(taxa)[2]
+
+	stab_index = rep(0, num_boot)
+	ROC_list = list()
+	# first loop of bootstrap to generate num_boot bootstrapped datasets
+	for (k in 1:num_boot){
+        print(paste('num_boot', k, sep=':'))
+		N = length(y) # number of samples
+		sample_ids = seq(1, N, 1)
+
+		# bootstrapped samples
+	    boot_ids = sample(sample_ids, size=N, replace=TRUE)
+	    boot_taxa = taxa[boot_ids, ]
+	    boot_mf = y[boot_ids]
+
+	    ## second loop of bootstrap to perform variable selection
+		results = foreach (i=1:num_boot) %dopar% {
+			boot_ids_second = sample(N, size=N, replace=TRUE)
+			boot_Z = boot_taxa[boot_ids_second, ]
+			boot_Y = boot_mf[boot_ids_second]
+
+		    ## select features from lasso/elnet
+		    if (method == 'GenCompLasso'){ # use X instead of Z (log-transformed)
+		        result.lin = gen_cons_lasso_cv(y=boot_Y, datx=exp(boot_Z), seednum=i, ratio.training=ratio.training)
+		        output.lin = c(result.lin$ROC, result.lin$coef.chosen)                           
+		    } else if (method == 'RF'){
+		        result.rf = randomForest_cv(y=boot_Y, datx=boot_Z, seednum=i, fold.cv=fold.cv, 
+	    							        num_trees=num_trees, mtry.grid = mtry.grid, pval_thr=pval_thr)
+	            output.rf = c(result.rf$ROC, result.rf$coef.chosen) 
+		    }
+		}
+
+		# reformat results (stability & ROC)
+		stability_table = matrix(rep(0, num_boot * p), ncol=p)
+		results_mse = results_chosen = list()
+		for (b in 1:num_boot){
+			results_mse[b] = results[[b]][1]
+			results_chosen[[b]] = results[[b]][-1]
+			stability_table[b, results_chosen[[b]]] = 1 
+		}
+
+		stab_index[k] = round(getStability(stability_table)$stability, 2)
+		ROC_list[[k]] = results_mse
+	}
+
+
+	results=list(num_boot=num_boot, method=method, stab_index=stab_index, ROC_list=ROC_list)
+
+}
diff --git a/code_method/code_method/.DS_Store b/code_method/code_method/.DS_Store