diff --git a/inst/scripts/crossvalidation_coRmiT.R b/inst/scripts/crossvalidation_coRmiT.R index 8ffa464..ce2dbf0 100755 --- a/inst/scripts/crossvalidation_coRmiT.R +++ b/inst/scripts/crossvalidation_coRmiT.R @@ -2,20 +2,21 @@ library(dplyr) filter_and_rank <- function(cv_cont_table){ - # cv_cont_table <- cv_cont_table[cv_cont_table$Pvalue <= 0.05,] + # cv_cont_table <- cv_cont_table[cv_cont_table$Pvalue <= 0.05,] if (nrow(cv_cont_table) == 0 ) return(NULL) cv_cont_table <- cv_cont_table[,c("miRNA","strategy", "corr_cutoff", "crossval_status", "Odds_ratio", "TP")] - cv_cont_table <- reshape(cv_cont_table, idvar = c("miRNA","strategy", "corr_cutoff"), timevar = "crossval_status", direction = "wide") - if (sum(c("Odds_ratio.test", "Odds_ratio.train") %in% colnames(cv_cont_table)) != 2) return(NULL) + #cv_cont_table2 <- reshape(cv_cont_table, idvar = c("miRNA","strategy", "corr_cutoff"), timevar = "crossval_status", direction = "wide") + #if ("Odds_ratio.train" %in% colnames(cv_cont_table)) return(NULL) # cv_cont_table <- cv_cont_table %>% arrange(desc(Odds_ratio.test), desc(TP.test), corr_cutoff) # cv_cont_table$OR_test_rank <- seq(1, nrow(cv_cont_table)) # str(cv_cont_table) - cv_cont_table$OR_test_rank <- rank(-cv_cont_table$Odds_ratio.test, ties.method ="min") - cv_cont_table$OR_train_rank <- rank(-cv_cont_table$Odds_ratio.train,ties.method ="min") + # cv_cont_table$OR_test_rank <- rank(-cv_cont_table$Odds_ratio.test, ties.method ="min") + cv_cont_table$OR_train_rank <- rank(-cv_cont_table$Odds_ratio,ties.method ="min") return(cv_cont_table) } + get_test_rank <- function(cv_cont_table){ top_rank_train_val <- min(cv_cont_table$OR_train_rank) top_rank_train <- cv_cont_table[cv_cont_table$OR_train_rank ==top_rank_train_val,] @@ -78,48 +79,49 @@ cormit_path <- file.path(opt$input_cormit, "temp.RData") load(cormit_path) p_val_cutoff <- 0.05 -all_pairs <- miRNA_cor_results$all_pairs -miRNA_strat_result <- miRNA_cor_results$miRNA_cont_tables -miRNA_strat_result <- miRNA_strat_result[miRNA_strat_result$Pvalue <= 0.05 & miRNA_strat_result$db_group == "multimir",] -best_miRNA_strategies <- as.data.frame(select_best_strategy(miRNA_strat_result)) -strat_combinations <- as.data.frame(unique(miRNA_strat_result[, c("corr_cutoff", "strategy")])) -miRNA_cont_tables <- data.frame() +# all_pairs <- miRNA_cor_results$all_pairs +# miRNA_strat_result <- miRNA_cor_results$miRNA_cont_tables +# miRNA_strat_result <- miRNA_strat_result[miRNA_strat_result$Pvalue <= 0.05 & miRNA_strat_result$db_group == "multimir",] +# best_miRNA_strategies <- as.data.frame(select_best_strategy(miRNA_strat_result)) +# strat_combinations <- as.data.frame(unique(miRNA_strat_result[, c("corr_cutoff", "strategy")])) +# miRNA_cont_tables <- data.frame() -for (combination in seq(1,nrow(strat_combinations))) { - corr_cutoff <- strat_combinations[combination, "corr_cutoff"] - strategy <- strat_combinations[combination, "strategy"] +# for (combination in seq(1,nrow(strat_combinations))) { +# corr_cutoff <- strat_combinations[combination, "corr_cutoff"] +# strategy <- strat_combinations[combination, "strategy"] - contingency_tables <- prepare_for_stats(all_pairs, strategy, corr_cutoff, - db_groups = "multimir", p_val_cutoff = p_val_cutoff , crossval = TRUE, test_sample = opt$test_size) - miRNA_cont_tables <- rbind(miRNA_cont_tables, contingency_tables[["strat_miRNA_ct"]]) -} +# contingency_tables <- prepare_for_stats(all_pairs, strategy, corr_cutoff, +# db_groups = "multimir", p_val_cutoff = p_val_cutoff , crossval = TRUE, test_sample = opt$test_size) +# miRNA_cont_tables <- rbind(miRNA_cont_tables, contingency_tables[["strat_miRNA_ct"]]) +# } -message("Stats prepared") -miRNA_cont_tables <- v_get_stats(miRNA_cont_tables, - selected_stats = c("v.fisher.test","odds_ratio")) +# message("Stats prepared") +# miRNA_cont_tables <- v_get_stats(miRNA_cont_tables, +# selected_stats = c("v.fisher.test","odds_ratio")) -miRNA_cont_tables$iterations <- paste(miRNA_cont_tables$miRNA, miRNA_cont_tables$crossval_it, sep = "_") +# miRNA_cont_tables$iterations <- paste(miRNA_cont_tables$miRNA, miRNA_cont_tables$crossval_it, sep = "_") -splitted_cont_tables <- split(miRNA_cont_tables, miRNA_cont_tables$iterations) +# splitted_cont_tables <- split(miRNA_cont_tables, miRNA_cont_tables$iterations) -cross_rank <- lapply(splitted_cont_tables, filter_and_rank) -full_ranking <-as.data.frame(data.table::rbindlist(cross_rank, - use.names = TRUE, - idcol = "iters")) -cross_rank <- lapply(cross_rank, get_test_rank) +# cross_rank <- lapply(splitted_cont_tables, filter_and_rank) +# full_ranking <-as.data.frame(data.table::rbindlist(cross_rank, +# use.names = TRUE, +# idcol = "iters")) +### cross_rank <- lapply(cross_rank, get_test_rank) message("Train and test datasets ordered") -cross_rank <-as.data.frame(data.table::rbindlist(cross_rank, - use.names = TRUE, - idcol = "iters")) -colnames(cross_rank) <- c("iteration", "miRNA", "strategy","corr_cutoff","test_rank") +#### cross_rank <-as.data.frame(data.table::rbindlist(cross_rank, +# use.names = TRUE, +# idcol = "iters")) +#### colnames(cross_rank) <- c("iteration", "miRNA", "strategy","corr_cutoff","test_rank") +# save(full_ranking,best_miRNA_strategies, file = "test.Rdata") +load("test.Rdata") best_miRNA_strategies$acc_ratio <- 0 best_miRNA_strategies$total_strats <- 0 -# save(cross_rank,best_miRNA_strategies,full_ranking, file = "test.Rdata") -# load("test.Rdata") + train_rank_dist <- data.frame() for (miRNA in best_miRNA_strategies$miRNA) { best_miRNA_strat <- unlist(best_miRNA_strategies[best_miRNA_strategies$miRNA == miRNA,]) @@ -130,7 +132,8 @@ for (miRNA in best_miRNA_strategies$miRNA) { train_rank_dist <- rbind(train_rank_dist, best_rank_in_train) } -str(full_ranking) +# save(train_rank_dist, file = "test_stats.RData") +# q() message("Computing report") dir.create(opt$output_files) outf <- file.path(normalizePath(opt$output_files),"coRmiT_cv.html") diff --git a/inst/templates/cormit_cv.Rmd b/inst/templates/cormit_cv.Rmd index 7d3b472..77eb8cc 100644 --- a/inst/templates/cormit_cv.Rmd +++ b/inst/templates/cormit_cv.Rmd @@ -33,8 +33,12 @@ output: ## Rank of the test dataset for the best strategies in train -```{r ranking_dist, echo = FALSE, warning =FALSE, message=FALSE, results = 'asis'} -mirna_names <- translate_miRNA_ids(cross_rank$miRNA) +```{r config, echo = FALSE, warning =FALSE, message=FALSE, results = 'asis'} + +mirna_names <- translate_miRNA_ids(unique(full_ranking$miRNA)) +``` + +```{r ranking_dist, echo = FALSE, warning =FALSE, message=FALSE, results = 'asis', eval = FALSE} cross_rank$significant <- ifelse(cross_rank$miRNA %in% best_miRNA_strategies$miRNA, "significant", "not significant") @@ -57,11 +61,18 @@ Ranking values of the best strategy in corMiT but using random sections of datab train_rank_dist$miRNA_n <- mirna_names[match(train_rank_dist$miRNA, mirna_names$Accession), "TargetName"] + +p_meds <- plyr::ddply(train_rank_dist, plyr::.(miRNA_n), summarise, med = median(OR_train_rank)) + ggplot2::ggplot(train_rank_dist, ggplot2::aes(x = miRNA_n, y = OR_train_rank)) + ggplot2::geom_boxplot()+ +ggplot2::geom_text(data = p_meds, ggplot2::aes(x = miRNA_n, y = med, label = med), + size = 3, vjust = -1.5)+ - ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 315,hjust=0,vjust=0)) + + ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 60,hjust=1,vjust=1, size = 18), + axis.text.y = ggplot2::element_text(size = 18), + axis.title.y = ggplot2::element_text(size = 15, face = "bold")) + ggplot2::labs(y= "Ranking position in 75% of DB", x = NULL)+ ggplot2::scale_y_continuous(limits = c(0, 25)) @@ -83,23 +94,23 @@ plot_objects <- list() full_ranking <- full_ranking[full_ranking$miRNA %in% best_miRNA_strategies$miRNA,] full_ranking$miRNA_n <- mirna_names[match(full_ranking$miRNA, mirna_names$Accession), "TargetName"] miRNA_cor_results$miRNA_cont_tables$miRNA_n <- mirna_names[match(miRNA_cor_results$miRNA_cont_tables$miRNA, mirna_names$Accession), "TargetName"] -train_rank_dist <- data.frame() +train_rank_dist2 <- data.frame() for (miRNA in unique(full_ranking$miRNA_n)) { - miRNA_ct <- miRNA_cor_results$miRNA_cont_tables[miRNA_cor_results$miRNA_cont_tables$miRNA_n == miRNA,] + miRNA_ct <- miRNA_cor_results$miRNA_cont_tables[miRNA_cor_results$miRNA_cont_tables$miRNA_n == miRNA,] miRNA_ct <- miRNA_ct[miRNA_ct$db_group == "multimir",] - miRNA_ct$full_db_rank <- rank(-miRNA_ct$Odds_ratio, ties.method ="min") + miRNA_rank <- full_ranking[full_ranking$miRNA_n == miRNA,] full_and_sample_eq <- match(paste0(miRNA_rank$strategy, miRNA_rank$corr_cutoff), paste0(miRNA_ct$strategy, miRNA_ct$corr_cutoff)) miRNA_rank$full_db_rank <- miRNA_ct$full_db_rank[full_and_sample_eq] miRNA_rank$Pvalue <- miRNA_ct$Pvalue[full_and_sample_eq] + # miRNA_rank <- miRNA_rank[miRNA_rank$Pvalue <= 0.05,] - miRNA_rank$full_db_rank[is.na(miRNA_rank$full_db_rank)] <- max(miRNA_rank$full_db_rank[!is.na(miRNA_rank$full_db_rank)]) - miRNA_rank$OR_train_rank[is.na(miRNA_rank$OR_train_rank)] <- max(miRNA_rank$OR_train_rank[!is.na(miRNA_rank$OR_train_rank)]) + + top_miRNA_rank <- miRNA_rank[miRNA_rank$full_db_rank == 1 & !is.na(miRNA_rank$full_db_rank),] - top_miRNA_rank <- miRNA_rank[miRNA_rank$full_db_rank == 1,] - train_rank_dist <- rbind(train_rank_dist,top_miRNA_rank) + train_rank_dist2 <- rbind(train_rank_dist2,top_miRNA_rank) pp <- ggplot2::ggplot(miRNA_rank, ggplot2::aes(y = OR_train_rank, x = full_db_rank))+ ggplot2::geom_jitter(size = 0.5,width = 0.2, height = 0.2, alpha = 0.3) + ggplot2::geom_boxplot(data = top_miRNA_rank, mapping= ggplot2::aes(y = OR_train_rank, x = 0))+ @@ -110,9 +121,30 @@ for (miRNA in unique(full_ranking$miRNA_n)) { ggplot2::geom_vline(xintercept = 1.5, linetype="dashed") plot_objects[[miRNA]] <- pp } - +# save(train_rank_dist, file = "/mnt/scratch/users/bio_267_uma/josecordoba/NGS_projects/LaforaRNAseq/analysis/target_wf_pearson/ctrl_vs_mut_deff/coRmiT.R_0000/test2.Rdata") gridExtra::grid.arrange(grobs = plot_objects, ncol = 3, nrow = ceiling(length(plot_objects)/3), left = "Ranking position in 75% of DB", bottom = "Ranking position in 100% of DB") ``` +## Rank of the best strategy 2 + +Ranking values of the best strategy in corMiT but using random sections of databases + +```{r ranking_dist_train2, echo = FALSE, warning =FALSE, message=FALSE, results = 'asis'} + +train_rank_dist2$miRNA_n <- mirna_names[match(train_rank_dist2$miRNA, mirna_names$Accession), "TargetName"] + +ggplot2::ggplot(train_rank_dist2, ggplot2::aes(x = miRNA_n, y = OR_train_rank)) + +ggplot2::geom_boxplot()+ + + + ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 60,hjust=1,vjust=1, size = 18), + axis.text.y = ggplot2::element_text(size = 18), + axis.title.y = ggplot2::element_text(size = 15, face = "bold")) + + ggplot2::labs(y= "Ranking position in 75% of DB", x = NULL)+ + ggplot2::scale_y_continuous(limits = c(0, 25)) + + + +``` \ No newline at end of file