Skip to content

Commit

Permalink
CV cormit updated
Browse files Browse the repository at this point in the history
  • Loading branch information
JoseCorCab committed Nov 10, 2023
1 parent 1bef175 commit 7518f2f
Show file tree
Hide file tree
Showing 2 changed files with 81 additions and 46 deletions.
73 changes: 38 additions & 35 deletions inst/scripts/crossvalidation_coRmiT.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,21 @@
library(dplyr)

filter_and_rank <- function(cv_cont_table){
# cv_cont_table <- cv_cont_table[cv_cont_table$Pvalue <= 0.05,]
# cv_cont_table <- cv_cont_table[cv_cont_table$Pvalue <= 0.05,]
if (nrow(cv_cont_table) == 0 ) return(NULL)

cv_cont_table <- cv_cont_table[,c("miRNA","strategy", "corr_cutoff", "crossval_status", "Odds_ratio", "TP")]
cv_cont_table <- reshape(cv_cont_table, idvar = c("miRNA","strategy", "corr_cutoff"), timevar = "crossval_status", direction = "wide")
if (sum(c("Odds_ratio.test", "Odds_ratio.train") %in% colnames(cv_cont_table)) != 2) return(NULL)
#cv_cont_table2 <- reshape(cv_cont_table, idvar = c("miRNA","strategy", "corr_cutoff"), timevar = "crossval_status", direction = "wide")
#if ("Odds_ratio.train" %in% colnames(cv_cont_table)) return(NULL)
# cv_cont_table <- cv_cont_table %>% arrange(desc(Odds_ratio.test), desc(TP.test), corr_cutoff)
# cv_cont_table$OR_test_rank <- seq(1, nrow(cv_cont_table))
# str(cv_cont_table)
cv_cont_table$OR_test_rank <- rank(-cv_cont_table$Odds_ratio.test, ties.method ="min")
cv_cont_table$OR_train_rank <- rank(-cv_cont_table$Odds_ratio.train,ties.method ="min")
# cv_cont_table$OR_test_rank <- rank(-cv_cont_table$Odds_ratio.test, ties.method ="min")
cv_cont_table$OR_train_rank <- rank(-cv_cont_table$Odds_ratio,ties.method ="min")
return(cv_cont_table)

}

get_test_rank <- function(cv_cont_table){
top_rank_train_val <- min(cv_cont_table$OR_train_rank)
top_rank_train <- cv_cont_table[cv_cont_table$OR_train_rank ==top_rank_train_val,]
Expand Down Expand Up @@ -78,48 +79,49 @@ cormit_path <- file.path(opt$input_cormit, "temp.RData")
load(cormit_path)

p_val_cutoff <- 0.05
all_pairs <- miRNA_cor_results$all_pairs
miRNA_strat_result <- miRNA_cor_results$miRNA_cont_tables
miRNA_strat_result <- miRNA_strat_result[miRNA_strat_result$Pvalue <= 0.05 & miRNA_strat_result$db_group == "multimir",]
best_miRNA_strategies <- as.data.frame(select_best_strategy(miRNA_strat_result))
strat_combinations <- as.data.frame(unique(miRNA_strat_result[, c("corr_cutoff", "strategy")]))
miRNA_cont_tables <- data.frame()
# all_pairs <- miRNA_cor_results$all_pairs
# miRNA_strat_result <- miRNA_cor_results$miRNA_cont_tables
# miRNA_strat_result <- miRNA_strat_result[miRNA_strat_result$Pvalue <= 0.05 & miRNA_strat_result$db_group == "multimir",]
# best_miRNA_strategies <- as.data.frame(select_best_strategy(miRNA_strat_result))
# strat_combinations <- as.data.frame(unique(miRNA_strat_result[, c("corr_cutoff", "strategy")]))
# miRNA_cont_tables <- data.frame()



for (combination in seq(1,nrow(strat_combinations))) {
corr_cutoff <- strat_combinations[combination, "corr_cutoff"]
strategy <- strat_combinations[combination, "strategy"]
# for (combination in seq(1,nrow(strat_combinations))) {
# corr_cutoff <- strat_combinations[combination, "corr_cutoff"]
# strategy <- strat_combinations[combination, "strategy"]

contingency_tables <- prepare_for_stats(all_pairs, strategy, corr_cutoff,
db_groups = "multimir", p_val_cutoff = p_val_cutoff , crossval = TRUE, test_sample = opt$test_size)
miRNA_cont_tables <- rbind(miRNA_cont_tables, contingency_tables[["strat_miRNA_ct"]])
}
# contingency_tables <- prepare_for_stats(all_pairs, strategy, corr_cutoff,
# db_groups = "multimir", p_val_cutoff = p_val_cutoff , crossval = TRUE, test_sample = opt$test_size)
# miRNA_cont_tables <- rbind(miRNA_cont_tables, contingency_tables[["strat_miRNA_ct"]])
# }


message("Stats prepared")
miRNA_cont_tables <- v_get_stats(miRNA_cont_tables,
selected_stats = c("v.fisher.test","odds_ratio"))
# message("Stats prepared")
# miRNA_cont_tables <- v_get_stats(miRNA_cont_tables,
# selected_stats = c("v.fisher.test","odds_ratio"))

miRNA_cont_tables$iterations <- paste(miRNA_cont_tables$miRNA, miRNA_cont_tables$crossval_it, sep = "_")
# miRNA_cont_tables$iterations <- paste(miRNA_cont_tables$miRNA, miRNA_cont_tables$crossval_it, sep = "_")

splitted_cont_tables <- split(miRNA_cont_tables, miRNA_cont_tables$iterations)
# splitted_cont_tables <- split(miRNA_cont_tables, miRNA_cont_tables$iterations)


cross_rank <- lapply(splitted_cont_tables, filter_and_rank)
full_ranking <-as.data.frame(data.table::rbindlist(cross_rank,
use.names = TRUE,
idcol = "iters"))
cross_rank <- lapply(cross_rank, get_test_rank)
# cross_rank <- lapply(splitted_cont_tables, filter_and_rank)
# full_ranking <-as.data.frame(data.table::rbindlist(cross_rank,
# use.names = TRUE,
# idcol = "iters"))
### cross_rank <- lapply(cross_rank, get_test_rank)
message("Train and test datasets ordered")
cross_rank <-as.data.frame(data.table::rbindlist(cross_rank,
use.names = TRUE,
idcol = "iters"))
colnames(cross_rank) <- c("iteration", "miRNA", "strategy","corr_cutoff","test_rank")
#### cross_rank <-as.data.frame(data.table::rbindlist(cross_rank,
# use.names = TRUE,
# idcol = "iters"))
#### colnames(cross_rank) <- c("iteration", "miRNA", "strategy","corr_cutoff","test_rank")
# save(full_ranking,best_miRNA_strategies, file = "test.Rdata")
load("test.Rdata")
best_miRNA_strategies$acc_ratio <- 0
best_miRNA_strategies$total_strats <- 0
# save(cross_rank,best_miRNA_strategies,full_ranking, file = "test.Rdata")
# load("test.Rdata")

train_rank_dist <- data.frame()
for (miRNA in best_miRNA_strategies$miRNA) {
best_miRNA_strat <- unlist(best_miRNA_strategies[best_miRNA_strategies$miRNA == miRNA,])
Expand All @@ -130,7 +132,8 @@ for (miRNA in best_miRNA_strategies$miRNA) {
train_rank_dist <- rbind(train_rank_dist, best_rank_in_train)
}

str(full_ranking)
# save(train_rank_dist, file = "test_stats.RData")
# q()
message("Computing report")
dir.create(opt$output_files)
outf <- file.path(normalizePath(opt$output_files),"coRmiT_cv.html")
Expand Down
54 changes: 43 additions & 11 deletions inst/templates/cormit_cv.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,12 @@ output:

## Rank of the test dataset for the best strategies in train

```{r ranking_dist, echo = FALSE, warning =FALSE, message=FALSE, results = 'asis'}
mirna_names <- translate_miRNA_ids(cross_rank$miRNA)
```{r config, echo = FALSE, warning =FALSE, message=FALSE, results = 'asis'}
mirna_names <- translate_miRNA_ids(unique(full_ranking$miRNA))
```

```{r ranking_dist, echo = FALSE, warning =FALSE, message=FALSE, results = 'asis', eval = FALSE}
cross_rank$significant <- ifelse(cross_rank$miRNA %in% best_miRNA_strategies$miRNA, "significant", "not significant")
Expand All @@ -57,11 +61,18 @@ Ranking values of the best strategy in corMiT but using random sections of datab
train_rank_dist$miRNA_n <- mirna_names[match(train_rank_dist$miRNA, mirna_names$Accession), "TargetName"]
p_meds <- plyr::ddply(train_rank_dist, plyr::.(miRNA_n), summarise, med = median(OR_train_rank))
ggplot2::ggplot(train_rank_dist, ggplot2::aes(x = miRNA_n, y = OR_train_rank)) +
ggplot2::geom_boxplot()+
ggplot2::geom_text(data = p_meds, ggplot2::aes(x = miRNA_n, y = med, label = med),
size = 3, vjust = -1.5)+
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 315,hjust=0,vjust=0)) +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 60,hjust=1,vjust=1, size = 18),
axis.text.y = ggplot2::element_text(size = 18),
axis.title.y = ggplot2::element_text(size = 15, face = "bold")) +
ggplot2::labs(y= "Ranking position in 75% of DB", x = NULL)+
ggplot2::scale_y_continuous(limits = c(0, 25))
Expand All @@ -83,23 +94,23 @@ plot_objects <- list()
full_ranking <- full_ranking[full_ranking$miRNA %in% best_miRNA_strategies$miRNA,]
full_ranking$miRNA_n <- mirna_names[match(full_ranking$miRNA, mirna_names$Accession), "TargetName"]
miRNA_cor_results$miRNA_cont_tables$miRNA_n <- mirna_names[match(miRNA_cor_results$miRNA_cont_tables$miRNA, mirna_names$Accession), "TargetName"]
train_rank_dist <- data.frame()
train_rank_dist2 <- data.frame()
for (miRNA in unique(full_ranking$miRNA_n)) {
miRNA_ct <- miRNA_cor_results$miRNA_cont_tables[miRNA_cor_results$miRNA_cont_tables$miRNA_n == miRNA,]
miRNA_ct <- miRNA_cor_results$miRNA_cont_tables[miRNA_cor_results$miRNA_cont_tables$miRNA_n == miRNA,]
miRNA_ct <- miRNA_ct[miRNA_ct$db_group == "multimir",]
miRNA_ct$full_db_rank <- rank(-miRNA_ct$Odds_ratio, ties.method ="min")
miRNA_rank <- full_ranking[full_ranking$miRNA_n == miRNA,]
full_and_sample_eq <- match(paste0(miRNA_rank$strategy, miRNA_rank$corr_cutoff),
paste0(miRNA_ct$strategy, miRNA_ct$corr_cutoff))
miRNA_rank$full_db_rank <- miRNA_ct$full_db_rank[full_and_sample_eq]
miRNA_rank$Pvalue <- miRNA_ct$Pvalue[full_and_sample_eq]
# miRNA_rank <- miRNA_rank[miRNA_rank$Pvalue <= 0.05,]
miRNA_rank$full_db_rank[is.na(miRNA_rank$full_db_rank)] <- max(miRNA_rank$full_db_rank[!is.na(miRNA_rank$full_db_rank)])
miRNA_rank$OR_train_rank[is.na(miRNA_rank$OR_train_rank)] <- max(miRNA_rank$OR_train_rank[!is.na(miRNA_rank$OR_train_rank)])
top_miRNA_rank <- miRNA_rank[miRNA_rank$full_db_rank == 1 & !is.na(miRNA_rank$full_db_rank),]
top_miRNA_rank <- miRNA_rank[miRNA_rank$full_db_rank == 1,]
train_rank_dist <- rbind(train_rank_dist,top_miRNA_rank)
train_rank_dist2 <- rbind(train_rank_dist2,top_miRNA_rank)
pp <- ggplot2::ggplot(miRNA_rank, ggplot2::aes(y = OR_train_rank, x = full_db_rank))+
ggplot2::geom_jitter(size = 0.5,width = 0.2, height = 0.2, alpha = 0.3) +
ggplot2::geom_boxplot(data = top_miRNA_rank, mapping= ggplot2::aes(y = OR_train_rank, x = 0))+
Expand All @@ -110,9 +121,30 @@ for (miRNA in unique(full_ranking$miRNA_n)) {
ggplot2::geom_vline(xintercept = 1.5, linetype="dashed")
plot_objects[[miRNA]] <- pp
}
# save(train_rank_dist, file = "/mnt/scratch/users/bio_267_uma/josecordoba/NGS_projects/LaforaRNAseq/analysis/target_wf_pearson/ctrl_vs_mut_deff/coRmiT.R_0000/test2.Rdata")
gridExtra::grid.arrange(grobs = plot_objects, ncol = 3, nrow = ceiling(length(plot_objects)/3),
left = "Ranking position in 75% of DB", bottom = "Ranking position in 100% of DB")
```


## Rank of the best strategy 2

Ranking values of the best strategy in corMiT but using random sections of databases

```{r ranking_dist_train2, echo = FALSE, warning =FALSE, message=FALSE, results = 'asis'}
train_rank_dist2$miRNA_n <- mirna_names[match(train_rank_dist2$miRNA, mirna_names$Accession), "TargetName"]
ggplot2::ggplot(train_rank_dist2, ggplot2::aes(x = miRNA_n, y = OR_train_rank)) +
ggplot2::geom_boxplot()+
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 60,hjust=1,vjust=1, size = 18),
axis.text.y = ggplot2::element_text(size = 18),
axis.title.y = ggplot2::element_text(size = 15, face = "bold")) +
ggplot2::labs(y= "Ranking position in 75% of DB", x = NULL)+
ggplot2::scale_y_continuous(limits = c(0, 25))
```

0 comments on commit 7518f2f

Please sign in to comment.