diff --git a/DESCRIPTION b/DESCRIPTION index 6086e0f..796185e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: pictograph Title: Tools for modeling clonal evolution of a cancer -Version: 1.2.1.1 +Version: 1.3.0 Authors@R: person(given = "Jiaying", family = "Lai", @@ -15,13 +15,19 @@ Suggests: testthat, knitr, rmarkdown -RoxygenNote: 7.2.3 +RoxygenNote: 7.3.1 VignetteBuilder: knitr Imports: - igraph, ggplot2, - tidyr, - dplyr, rjags, - ggmcmc, - stringr \ No newline at end of file + stringr, + readr (>= 2.1.4), + tidyverse (>= 2.0.0), + dplyr (>= 1.1.1), + tidyr (>= 1.3.0), + LaplacesDemon (>= 16.1.6), + igraph (>= 2.0.1), + viridis (>= 0.6.5), + ggmcmc (>= 1.5.1), + UpSetR (>= 1.4.0), + cluster (>= 2.1.6) diff --git a/NAMESPACE b/NAMESPACE index 08516d7..fe3a600 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,37 +1,44 @@ # Generated by roxygen2: do not edit by hand +S3method(mutate,admat) +S3method(mutate,admat.2) +S3method(mutate,admat.3) +S3method(mutate,column) +S3method(mutate,n.columns) +S3method(mutate,n.columns.clusterprob) export(calcSubcloneProportions) -export(calcSubcloneProportions2) export(calcTreeScores) -export(clusterSep) +export(calculateTreeScoreMutations) export(collectBestKChains) export(colorScheme) -export(enumerateSpanningTrees) export(enumerateSpanningTreesModified) -export(estimateCCFs) export(estimateClusterAssignments) +export(estimateICN) +export(estimateMCFs) +export(estimateMultiplicity) +export(estimateMultiplicityMatrix) export(filterEdgesBasedOnCCFs) -export(forceCCFs) export(generateAllTrees) -export(importCSV) +export(importFiles) +export(mcmcMain) export(mergeSetChains) -export(plotAllZProb) -export(plotBIC) -export(plotCCFViolin) -export(plotChainsCCF) -export(plotClusterAssignmentProb) +export(plotChainsMCF) export(plotClusterAssignmentProbVertical) -export(plotDensityCCF) export(plotEnsembleTree) -export(plotPPD) +export(plotMCFViolin) export(plotSubcloneBar) export(plotSubclonePie) export(plotTree) export(prepareGraph) -export(prepareGraphForGabowMyers) +export(runMCMCForAllBoxes) +export(separateMutationsBySamplePresence) export(writeClusterAssignmentsTable) -export(writeClusterCCFsTable) +export(writeClusterMCFsTable) +export(writeIcnTable) +export(writeMultiplicityTable) export(writeSetKTable) +import(UpSetR) +import(cluster) import(dplyr) import(ggmcmc) import(ggplot2) @@ -40,13 +47,10 @@ import(magrittr) import(methods) import(readr) import(rjags) +import(stringr) import(tibble) import(tidyr) +import(tidyverse) +import(viridis) importFrom(epiR,epi.betabuster) -importFrom(ggmcmc,ggs) -importFrom(magrittr,set_colnames) importFrom(stringr,str_replace) -importFrom(stringr,str_replace_all) -importFrom(tidyr,pivot_longer) -importFrom(tidyr,unite) -importFrom(viridis,viridis) diff --git a/R/MCMC-clustering.R b/R/MCMC-clustering.R new file mode 100644 index 0000000..d8002e4 --- /dev/null +++ b/R/MCMC-clustering.R @@ -0,0 +1,396 @@ +#' run MCMC using JAGS +#' +#' @export +runMCMCForAllBoxes <- function(sep_list, + sample_presence=TRUE, + ploidy=2, + max_K = 5, + min_mutation_per_cluster = 5, + cluster_diff_thresh=0.05, + n.iter = 5000, + n.burn = 1000, + thin = 10, + mc.cores = 4, + inits = list(".RNG.name" = "base::Wichmann-Hill", + ".RNG.seed" = 123)){ + + if (!sample_presence) { + all_set_results <- vector("list", 1) + names(all_set_results) <- paste0(rep("1", ncol(sep_list$y)), collapse = "") + params = c("z", "mcf", "icn", "m", "ystar") + + temp_box <- sep_list + temp_box$pattern <- paste0(rep("1", ncol(sep_list$y)), collapse = "") + temp_max_K <- min(max_K, floor(nrow(temp_box$y)/min_mutation_per_cluster)) + temp_max_K <- max(temp_max_K, 1) + + temp_samps_list <- runMutSetMCMC(temp_box, + ploidy=ploidy, + n.iter = n.iter, + n.burn = n.burn, + thin = thin, + mc.cores = mc.cores, + inits = inits, + temp_max_K = temp_max_K, + params = params, + min_mutation_per_cluster = min_mutation_per_cluster, + cluster_diff_thresh = cluster_diff_thresh, + sample_presence=sample_presence) + + all_set_results[[1]] <- temp_samps_list + + } else { + all_set_results <- vector("list", length(sep_list)) + names(all_set_results) <- names(sep_list) + params = c("z", "mcf", "icn", "m", "ystar") + + for (i in seq_len(length(sep_list))) { + temp_box <- sep_list[[i]] + # Max number of clusters cannot be more than number of mutations/min_mutation_per_cluster + temp_max_K <- min(max_K, floor(length(temp_box$mutation_indices)/min_mutation_per_cluster)) + temp_max_K <- max(temp_max_K, 1) + temp_samps_list <- runMutSetMCMC(temp_box, + ploidy=ploidy, + n.iter = n.iter, + n.burn = n.burn, + thin = thin, + mc.cores = mc.cores, + inits = inits, + temp_max_K = temp_max_K, + params = params, + min_mutation_per_cluster = min_mutation_per_cluster, + cluster_diff_thresh = cluster_diff_thresh, + sample_presence=sample_presence) + + all_set_results[[i]] <- temp_samps_list + } + } + return(all_set_results) +} + +runMutSetMCMC <- function(temp_box, + ploidy=2, + n.iter = 10000, + n.burn = 1000, + thin = 10, + mc.cores = 1, + inits = list(".RNG.name" = "base::Wichmann-Hill", + ".RNG.seed" = 123), + temp_max_K = 5, + params = c("z", "mcf", "icn", "m", "ystar"), + min_mutation_per_cluster = 1, + cluster_diff_thresh=0.05, + sample_presence=FALSE) { + + temp_samps_list <- runMCMCForABox(temp_box, + ploidy=ploidy, + n.iter = n.iter, + n.burn = n.burn, + thin = thin, + mc.cores = mc.cores, + inits = inits, + params = params, + max_K = temp_max_K, + sample_presence=sample_presence) + + # Format chains + if (length(temp_samps_list) == 1) { + samps_list <- list(formatChains(temp_samps_list)) + names(samps_list) <- "K1" + } else { + samps_list <- parallel::mclapply(temp_samps_list, formatChains, + mc.cores = mc.cores) + } + + # check whether: + # 1) number of mutations per cluster is at least min_mutation_per_cluster + # 2) difference between any two cluster less than cluster_diff_thresh + filtered_samps_list <- filterK(samps_list, min_mutation_per_cluster = min_mutation_per_cluster, + cluster_diff_thresh = cluster_diff_thresh) + + # Calculate BIC and silhouette + K_tested <- seq_len(length(filtered_samps_list)) + if (temp_max_K > 1) { + box_indata <- getBoxInputData(temp_box, ploidy) + + bic_vec <- unname(unlist(parallel::mclapply(filtered_samps_list, + function(chains) calcChainBIC(chains=chains, input.data=box_indata, pattern=temp_box$pattern), + mc.cores = mc.cores))) + bic_tb <- tibble(K_tested = K_tested, + BIC = bic_vec) + BIC_best_chains <- samps_list[[which.min(bic_vec)]] + sc_vec <- unname(unlist(parallel::mclapply(filtered_samps_list, + function(chains) calcChainSilhouette(chains=chains, input.data=box_indata, pattern=temp_box$pattern), + mc.cores = mc.cores))) + + sc_tb <- tibble(K_tested = K_tested, + silhouette = sc_vec) + sc_best_chains <- samps_list[[which.max(sc_vec)]] + res_list <- list(all_chains = samps_list, + silhouette = sc_tb, + BIC = bic_tb, + BIC_best_chains = BIC_best_chains, + sc_best_chains = sc_best_chains, + BIC_best_K = which.min(bic_vec), + silhouette_best_K = which.max(sc_vec)) + } else { + # only 1 variant, so must be 1 cluster and don't need to check BIC + res_list <- list(all_chains = filtered_samps_list, + silhouette = NA, + BIC = NA, + BIC_best_chains = filtered_samps_list[[1]], + sc_best_chains = filtered_samps_list[[1]], + BIC_best_K = 1, + silhouette_best_K = 1) + } + return(res_list) +} + +runMCMCForABox <- function(box, + ploidy=2, + n.iter = 10000, + n.burn = 1000, + thin = 10, + mc.cores = 1, + inits = list(".RNG.name" = "base::Wichmann-Hill", + ".RNG.seed" = 123), + params = c("z", "mcf", "icn", "m", "ystar"), + max_K = 5, + sample_presence=FALSE) { + + # select columns if the presence pattern is 1 + box_input_data <- getBoxInputData(box, ploidy) + + extdir <- system.file("extdata", package="pictograph") + + # choose sample in which mutations are present + sample_to_sort <- which(colSums(box_input_data$y) > 0)[1] + + jags.file <- file.path(extdir, "model.jags") + jags.file.K1 <- file.path(extdir, "model_K1.jags") + + + samps_K1 <- runMCMC(box_input_data, + 1, + jags.file.K1, + inits, + params, + n.iter=n.iter, + thin=thin, + n.burn=n.burn) + + if(box_input_data$S == 1) { + colnames(samps_K1[[1]])[which(colnames(samps_K1[[1]]) == "mcf")] <- "mcf[1,1]" + } + + if (sample_presence) { + samps_K1 <- reverseDrop(samps_K1, box$pattern, n.iter) + } + + if (max_K > 1) { + + box_input_data$sample_to_sort <- sample_to_sort + + samps_2 <- parallel::mclapply(2:max_K, + function(k) runMCMC(box_input_data, k, + jags.file, inits, params, + n.iter=n.iter, thin=thin, + n.burn=n.burn), + mc.cores=mc.cores) + + if (sample_presence) { + for (i in seq_len(length(samps_2))) { + samps_2[[i]] <- reverseDrop(samps_2[[i]], box$pattern, n.iter) + } + } + samps_list <- c(list(samps_K1), samps_2) + names(samps_list) <- paste0("K", 1:max_K) + return(samps_list) + + } else { + names(samps_K1) <- "K1" + return(samps_K1) + } + +} + +getBoxInputData <- function(box, ploidy=2) { + sample_list = vector() + + # include samples if the pattern is 1; i.e. presence of mutations in the sample + for (j in 1:ncol(box$y)) { + if (strsplit(box$pattern, "")[[1]][j] == "1") { + sample_list <- append(sample_list, j) + } + } + box_input_data <- list(I = nrow(box$y), + S = length(sample_list), + y = box$y[,sample_list,drop=FALSE], + n = box$n[,sample_list,drop=FALSE], + tcn = box$tcn[,sample_list,drop=FALSE], + is_cn = box$is_cn, + mtp = box$mtp, + cncf = box$cncf[,sample_list,drop=FALSE], + icn = box$icn, + purity=box$purity, + ploidy=ploidy) + + # set tcn to 2 if 0 + box_input_data$tcn[box_input_data$tcn==0] <- 2 + return(box_input_data) +} + +runMCMC <- function(box_input_data, + K, + jags.file, + inits, + params, + n.iter=10000, + thin=10, + n.chains=1, + n.adapt=1000, + n.burn=1000) { + if (K > 1) box_input_data$K <- K + jags.m <- jags.model(jags.file, + box_input_data, + n.chains = n.chains, + inits = inits, + n.adapt = n.adapt) + if (n.burn > 0) update(jags.m, n.burn) + samps <- coda.samples(jags.m, params, n.iter=n.iter, thin=thin) + + return(samps) +} + +reverseDrop <- function(samps, pattern, n.iter) { + total_sample = nchar(pattern) + sample_list = vector() + for (j in seq_len(nchar(pattern))) { + if (strsplit(pattern, "")[[1]][j] == "1") { + sample_list <- append(sample_list, j) + } + } + k_list = vector() + ystar_list = vector() + # replace current sample id by true sample id from pattern + for (i in seq_len(length(colnames(samps[[1]])))) { + if (startsWith(colnames(samps[[1]])[i], "mcf")) { + para <- str_extract_all(colnames(samps[[1]])[i], "[0-9]+")[[1]] + colnames(samps[[1]])[i] <- paste("mcf[", para[1], ",", sample_list[strtoi(para[2])], "]", sep = "") + k_list <- c(k_list, para[1]) + } else if (startsWith(colnames(samps[[1]])[i], "ystar")) { + para <- str_extract_all(colnames(samps[[1]])[i], "[0-9]+")[[1]] + colnames(samps[[1]])[i] <- paste("ystar[", para[1], ",", sample_list[strtoi(para[2])], "]", sep = "") + ystar_list <- c(ystar_list, para[1]) + } + } + k_list <- unique(k_list) + ystar_list <- unique(ystar_list) + + # add back dropped samples + absent_sample <- vector() + for (sample in seq_len(total_sample)) { + if (! sample %in% sample_list) { + absent_sample <- append(absent_sample, sample) + } + } + for (k in seq_len(length(k_list))) { + for (j in seq_len(length(absent_sample))) { + col = paste("mcf[", k_list[k], ",", absent_sample[j], "]", sep = "") + samps[[1]] <- cbind(samps[[1]], col=0) + colnames(samps[[1]])[colnames(samps[[1]]) == 'col'] <- col + } + } + for (ystar in seq_len(length(ystar_list))) { + for (j in seq_len(length(absent_sample))) { + col = paste("ystar[", ystar_list[ystar], ",", absent_sample[j], "]", sep = "") + samps[[1]] <- cbind(samps[[1]], col=0) + colnames(samps[[1]])[colnames(samps[[1]]) == 'col'] <- col + } + } + + samps[[1]] <- samps[[1]][,order(colnames(samps[[1]]))] + + return(samps) +} + + +filterK <- function(samps_list, min_mutation_per_cluster=1, cluster_diff_thresh=0.05) { + filtered_samps_list <- list() + toBreak = F + for (i in seq_len(length(samps_list))) { + k = as.numeric(gsub("\\D", "", names(samps_list)[i])) + if (k > 1) { + mcf_chain = samps_list[[i]]$mcf_chain + z_chain = samps_list[[i]]$z_chain + clusterTable = writeClusterAssignmentsTable(z_chain) + # check whether all cluster contains at least one mutation + if (length(unique(clusterTable$Cluster))==k) { + # check whether all cluster contains at least min_mutation_per_cluster mutations + if (any(table(clusterTable$Cluster) < min_mutation_per_cluster)) { + break + } + } else { + break + } + mcfTable = writeClusterMCFsTable(mcf_chain) + # check whether mcf for any cluster is less than cluster_diff_thresh in all samples + for (j1 in seq_len(k)) { + if (all(mcfTable[j1,2:ncol(mcfTable)] < cluster_diff_thresh)) { + toBreak = T + } + } + # check whether mcf difference between any two clusters less than cluster_diff_thresh in all samples + for (j1 in seq_len(k-1)) { + for (j2 in seq(j1+1, k)) { + diff = abs(mcfTable[j1,2:ncol(mcfTable)] - mcfTable[j2,2:ncol(mcfTable)]) + if (all(diff < cluster_diff_thresh)) { + toBreak = T + } + } + } + } + if (toBreak) { break } + filtered_samps_list[[names(samps_list)[i]]] <- samps_list[[i]] + } + return(filtered_samps_list) +} + +formatChains <- function(samps) { + temp_z <- get.parameter.chain("z", ggmcmc::ggs(samps)) %>% + mutate(Parameter = as.character(Parameter)) + if (nrow(temp_z) == 0) { + temp_z <- get.parameter.chain("z", ggmcmc::ggs(samps) %>% mutate(Parameter = gsub("z","z[1]",Parameter))) %>% + mutate(Parameter = as.character(Parameter)) + } + temp_mcf <- get.parameter.chain("mcf", ggmcmc::ggs(samps)) %>% + mutate(Parameter = as.character(Parameter)) + if (nrow(temp_mcf) == 0) { + temp_mcf <- get.parameter.chain("mcf", ggmcmc::ggs(samps) %>% mutate(Parameter = gsub("mcf","mcf[1,1]",Parameter))) %>% + mutate(Parameter = as.character(Parameter)) + } + temp_ystar <- get.parameter.chain("ystar", ggmcmc::ggs(samps)) %>% + mutate(Parameter = as.character(Parameter)) + temp_icn <- get.parameter.chain("icn", ggmcmc::ggs(samps)) %>% + mutate(Parameter = as.character(Parameter)) + if (nrow(temp_icn) == 0) { + temp_icn <- get.parameter.chain("icn", ggmcmc::ggs(samps) %>% mutate(Parameter = gsub("icn","icn[1]",Parameter))) %>% + mutate(Parameter = as.character(Parameter)) + } + temp_m <- get.parameter.chain("m", ggmcmc::ggs(samps)) %>% + mutate(Parameter = as.character(Parameter)) + if (nrow(temp_m) == 0) { + temp_m <- get.parameter.chain("m", ggmcmc::ggs(samps) %>% mutate(Parameter = gsub("\\bm\\b","m[1]",Parameter))) %>% + mutate(Parameter = as.character(Parameter)) + } + samps_list_formatted <- list(mcf_chain = temp_mcf, + z_chain = temp_z, + icn_chain = temp_icn, + m_chain = temp_m, + ystar_chain = temp_ystar) + return(samps_list_formatted) +} + +get.parameter.chain <- function(param, chains) { + chains[grep(paste0(param, "\\["), chains$Parameter), ] +} diff --git a/R/MCMC-main.R b/R/MCMC-main.R new file mode 100644 index 0000000..632bcfb --- /dev/null +++ b/R/MCMC-main.R @@ -0,0 +1,247 @@ +#' run PICTograph in an automated pipeline +#' +#' run MCMC chains to infer the clonal evolution of tumors from single or multi-region sequencing data. +#' This function automatically runs a pipeline of the tool. It models uncertainty of mutation cellular +#' fraction (MCF) in small somatic mutations (SSMs) and copy number alterations (CNAs), assigning SSMs +#' and CNAs to subclones using a Bayesian hierarchical model, and reconstruct tumor evolutionary trees +#' that are constrained based on principles of lineage precedence, sum condition, and optionally by +#' sample-presence. +#' +#' @param mutation_file a csv file that include information for SSMs. +#' @param outputDir output directory for saving all files. +#' @param sample_presence whether to use sample presence to separate the mutations. Not applicable if dual_model is set to FALSE and a copy number file is provided. +#' @param score scoring function to estimate the number of clusters. silhouette or BIC. +#' @param max_K user defined maximum number of clusters. +#' @param min_mutation_per_cluster minumum number of mutations in each cluster. +#' @param n.iter number of iterations by JAGS. +#' @param n.burn number of burns by JAGS. +#' @param thin number of thin by JAGS. +#' @param mc.cores number of cores to use for parallel computing; not applicable to windows. +#' @param inits additional parameters by JAGS. +#' @param cluster_diff_thresh threshold to merge two clusters. +#' @export +mcmcMain <- function(mutation_file, + outputDir=NULL, + sample_presence=TRUE, + score="silhouette", # either BIC or silhouette + max_K = 10, + min_mutation_per_cluster=5, + cluster_diff_thresh=0.05, + n.iter=5000, + n.burn=1000, + thin=10, + mc.cores=8, + inits=list(".RNG.name" = "base::Wichmann-Hill",".RNG.seed" = 123), + alt_reads_thresh = 0, # placeholder + vaf_thresh = 0 # placeholder + ) { + + data <- importFiles(mutation_file, + outputDir, + alt_reads_thresh=alt_reads_thresh, + vaf_thresh=vaf_thresh) + + # use working directory to save outputs if outputDir is not provided + if (is.null(outputDir)) { + outputDir = getwd() + } + + # save upset plot if more than one sample + if (ncol(data$y) > 1) { + data_matrix <- ifelse(data$y[data$is_cn==0,]>0, 1, 0) + png(paste(outputDir, "upsetR.png", sep="/"), res=100) + print(upset(as.data.frame(data_matrix), text.scale = c(1.5, 1.5, 1.5, 1.5, 1.5, 1.5), keep.order = T, sets = rev(colnames(data_matrix)))) + dev.off() + } + + data <- assign("data", data, envir = .GlobalEnv) + + if (sample_presence) { + message("Using sample presence; SSM only") + input_data <- list(y=data$y, + n=data$n, + tcn=data$tcn, + is_cn=data$is_cn, + mtp=data$mtp, + icn=data$icn, + cncf=data$cncf, + MutID=data$MutID, + purity=data$purity) + + input_data <- assign("input_data", input_data, envir = .GlobalEnv) + + # separate mutations by sample presence + sep_list <- separateMutationsBySamplePresence(input_data) + + # For each presence set, run clustering MCMC, calculate silhouette and BIC and choose best K + all_set_results <- runMCMCForAllBoxes(sep_list, sample_presence=sample_presence, max_K = max_K, min_mutation_per_cluster = min_mutation_per_cluster, + cluster_diff_thresh = cluster_diff_thresh, inits = inits, + n.iter = n.iter, n.burn = n.burn, thin = thin, mc.cores = mc.cores) + } else { + message("Not using sample presence; SSM only") + input_data <- list(y=data$y, + n=data$n, + tcn=data$tcn, + is_cn=data$is_cn, + mtp=data$mtp, + icn=data$icn, + cncf=data$cncf, + MutID=data$MutID, + purity=data$purity) + + input_data <- assign("input_data", input_data, envir = .GlobalEnv) + + all_set_results <- runMCMCForAllBoxes(input_data, sample_presence=sample_presence, max_K = max_K, min_mutation_per_cluster = min_mutation_per_cluster, + cluster_diff_thresh = cluster_diff_thresh, inits = inits, + n.iter = n.iter, n.burn = n.burn, thin = thin, mc.cores = mc.cores) + } + + all_set_results <- assign("all_set_results", all_set_results, envir = .GlobalEnv) + + # pick K: silhouette or BIC + set_k_choices <- writeSetKTable(all_set_results) + set_k_choices <- assign("set_k_choices", set_k_choices, envir = .GlobalEnv) + + # collect best chains + if (score=="silhouette") { + best_set_chains <- collectBestKChains(all_set_results, chosen_K = set_k_choices$silhouette_K) + } else { + best_set_chains <- collectBestKChains(all_set_results, chosen_K = set_k_choices$BIC_K) + } + chains <- mergeSetChains(best_set_chains, input_data) + + # plot MCMC tracing + png(paste(outputDir, "mcf.png", sep="/")) + print( + plotChainsMCF(chains$mcf_chain) + ) + dev.off() + + # write mcf table + mcfTable = writeClusterMCFsTable(chains$mcf_chain) + colnames(mcfTable)=c("Cluster",c(colnames(data$y))) + write.table(mcfTable, file=paste(outputDir, "mcf.csv", sep="/"), quote = FALSE, sep = ",", row.names = F) + + # write cluster assignment table + clusterAssingmentTable = writeClusterAssignmentsTable(chains$z_chain, Mut_ID = input_data$MutID) + write.table(clusterAssingmentTable, file=paste(outputDir, "clusterAssign.csv", sep="/"), quote = FALSE, sep = ",", row.names = F) + + # record estimated icn and multiplicity information + icnTable <- writeIcnTable(chains$icn_chain, Mut_ID = input_data$MutID) + write.table(icnTable, file=paste(outputDir, "icn_all.csv", sep="/"), quote = FALSE, sep = ",", row.names = F) + + multiplicityTable <- writeMultiplicityTable(chains$m_chain, Mut_ID = input_data$MutID) + write.table(multiplicityTable, file=paste(outputDir, "multiplicity_all.csv", sep="/"), quote = FALSE, sep = ",", row.names = F) + + # generate trees using different set of thresholds until at least one tree is available + threshes <- allThreshes() + for (thresh in threshes) { + generateAllTrees(chains$mcf_chain, data$purity, lineage_precedence_thresh = thresh[1], sum_filter_thresh = thresh[2]) + if (length(all_spanning_trees) > 0) { + break + } + } + + cncfTable <- data$cncf + # scores <- calcTreeScores(chains$mcf_chain, all_spanning_trees, purity=data$purity) + scores <- calculateTreeScoreMutations(chains$mcf_chain, data, icnTable, cncfTable, multiplicityTable, clusterAssingmentTable, data$purity, all_spanning_trees) + + # plot all possible trees + plotAllTrees(outputDir, scores, all_spanning_trees, mcfTable, data) + + # highest scoring tree + best_tree <- all_spanning_trees[[which(scores == max(scores))[length(which(scores == max(scores)))]]] + write.table(best_tree, file=paste(outputDir, "tree.csv", sep="/"), quote = FALSE, sep = ",", row.names = F) + + # plot best and ensemble tree + if (nrow(best_tree) >1 ) { + png(paste(outputDir, "tree.png", sep="/")) + plotTree(best_tree, palette = viridis::viridis) + dev.off() + } + + if (length(all_spanning_trees[which(scores == max(scores))])) { + png(paste(outputDir, "tree_ensemble.png", sep="/")) + plotEnsembleTree(all_spanning_trees, palette = viridis::viridis) + dev.off() + } + + # estimate purity + cc <- best_tree %>% filter(parent=="root") %>% select(child) + purity <- mcfTable %>% filter(Cluster %in% cc$child) %>% summarise(across(everything(), sum)) %>% select(-Cluster) + colnames(purity) <- colnames(data$y) + write.table(purity, file=paste(outputDir, "purity.csv", sep="/"), quote = FALSE, sep = ",", row.names = F) + + # estimate subclone proportion + subclone_props <- calcSubcloneProportions(mcf_mat, best_tree) + rownames(subclone_props) = mcfTable$Cluster + colnames(subclone_props) = colnames(data$y) + + write.csv(subclone_props, file=paste(outputDir, "subclone_proportion.csv", sep="/"), quote = FALSE) + + png(paste(outputDir, "subclone_props.png", sep="/")) + print(plotSubclonePie(subclone_props, sample_names=colnames(input_data$y))) + dev.off() + + # save all data + save.image(file=paste(outputDir, "PICTograph.RData", sep="/")) + +} + + +#' Plot all trees with the highest scores +plotAllTrees <- function(outputDir, scores, all_spanning_trees, mcfTable, data) { + # plot all tree with best scores + + outputDir = paste(outputDir, "all_trees", sep = "/") + suppressWarnings(dir.create(outputDir)) + + for (i in seq_len(length(which(scores == max(scores))))) { + idx = which(scores == max(scores))[i] + + best_tree <- all_spanning_trees[[idx]] + if (nrow(best_tree) >1 ) { + write.table(best_tree, file=paste(outputDir, "/tree", i, ".csv", sep=""), quote = FALSE, sep = ",", row.names = F) + + png(paste(outputDir, "/tree", i, ".png", sep="")) + # plot tree + plotTree(best_tree, palette = viridis::viridis) + # plotEnsembleTree(all_spanning_trees, palette = viridis::viridis) + dev.off() + + cc <- best_tree %>% filter(parent=="root") %>% select(child) + purity <- mcfTable %>% filter(Cluster %in% cc$child) %>% summarise(across(everything(), sum)) %>% select(-Cluster) + colnames(purity) <- colnames(data$y) + write.table(purity, file=paste(outputDir, "/tree_", i, "_purity.csv", sep=""), quote = FALSE, sep = ",", row.names = F) + + subclone_props <- calcSubcloneProportions(mcf_mat, best_tree) + rownames(subclone_props) = mcfTable$Cluster + colnames(subclone_props) = colnames(data$y) + + write.csv(subclone_props, file=paste(outputDir, "/tree_", i, "_subclone_proportion.csv", sep=""), quote = FALSE) + + png(paste(outputDir, "/tree_", i, "_subclone_proportion.png", sep="")) + print(plotSubclonePie(subclone_props, sample_names=colnames(input_data$y))) + dev.off() + } + } +} + +#' defines the thresholds to be used for tree building +allThreshes <- function() { + threshes <- list() + threshes[[1]] <- c(0,0) + threshes[[2]] <- c(0.1,0) + threshes[[3]] <- c(0,0.1) + threshes[[4]] <- c(0.1,0.1) + threshes[[5]] <- c(0.1,0.2) + threshes[[6]] <- c(0.2,0.1) + threshes[[7]] <- c(0.2,0.2) + threshes[[8]] <- c(0.1,0.3) + threshes[[9]] <- c(0.3,0.1) + threshes[[10]] <- c(0.2,0.3) + threshes[[11]] <- c(0.3,0.2) + threshes[[12]] <- c(0.3,0.3) + threshes[[13]] <- c(0.4,0.4) + threshes +} \ No newline at end of file diff --git a/R/MCMC-process-clusters.R b/R/MCMC-process-clusters.R new file mode 100644 index 0000000..aeea522 --- /dev/null +++ b/R/MCMC-process-clusters.R @@ -0,0 +1,523 @@ +#' Determine most probable integer assignments by taking those with highest posterior probability +#' +#' @export +#' @param icn_chain MCMC chain of integer copy number assignment values +estimateICN <- function(icn_chain) { + it <- max(icn_chain$Iteration) + mcmc_icn <- icn_chain %>% + group_by(Parameter, value) %>% + reframe(n=n(), + maxiter=it) %>% + mutate(probability=n/maxiter) %>% + ungroup() + map_icn <- mcmc_icn %>% + group_by(Parameter) %>% + reframe(value=value[probability==max(probability)]) %>% + ungroup() + + # choose first cluster if equal probability + map_icn_count <- map_icn %>% + group_by(Parameter) %>% + reframe(map_count = n()) %>% + ungroup() + if (any(map_icn_count$map_count > 1)) { + mut_ind <- which(map_icn_count$map_count > 1) + for (i in mut_ind) { + dup_var <- as.numeric(gsub("icn\\[|]", "", map_icn_count$Parameter[i])) + map_icn_dups <- which(gsub("icn\\[|]", "", map_icn$Parameter) == dup_var) + dup_ind <- map_icn_dups[-1] + map_icn <- map_icn[-dup_ind, ] + } + } + return(map_icn) +} +#' Determine most probable integer copy number by taking those with highest posterior probability. +#' +#' @export +#' @param icn_chain MCMC chain of integer copy number +#' @param Mut_ID Vector of mutation IDs, same order as provided as input data (e.g. indata$Mut_ID) +#' @return A tibble listing mutation IDs and their cluster assignments +writeIcnTable <- function(icn_chain, Mut_ID = NULL) { + map_icn <- estimateICN(icn_chain) + if (is.null(Mut_ID)) { + Mut_ID <- paste0("Mut", 1:nrow(map_icn)) + } + map_icn <- map_icn %>% + mutate(Parameter_n = as.numeric(gsub("icn\\[(\\d+)\\]","\\1",Parameter)))%>% + arrange(Parameter_n)%>% + mutate(Mut_ID = Mut_ID, icn = value)%>% + select(Mut_ID, icn) + + # map_icn <- map_icn %>% + # arrange(Cluster) + return(map_icn) +} + +#' Determine most probable mutation cluster assignments by taking those with highest posterior probability +#' +#' @export +#' @param m_chain MCMC chain of multiplicity assignment values +estimateMultiplicity <- function(m_chain) { + it <- max(m_chain$Iteration) + mcmc_m <- m_chain %>% + group_by(Parameter, value) %>% + reframe(n=n(), + maxiter=it) %>% + mutate(probability=n/maxiter) %>% + ungroup() + map_m <- mcmc_m %>% + group_by(Parameter) %>% + reframe(value=value[probability==max(probability)]) %>% + ungroup() + + # choose first cluster if equal probability + map_m_count <- map_m %>% + group_by(Parameter) %>% + reframe(map_count = n()) %>% + ungroup() + if (any(map_m_count$map_count > 1)) { + mut_ind <- which(map_m_count$map_count > 1) + for (i in mut_ind) { + dup_var <- as.numeric(gsub("m\\[|]", "", map_m_count$Parameter[i])) + map_m_dups <- which(gsub("m\\[|]", "", map_m$Parameter) == dup_var) + dup_ind <- map_m_dups[-1] + map_m <- map_m[-dup_ind, ] + } + } + return(map_m) +} + +#' Determine most probable multiplicity assignments by taking those with highest posterior probability. +#' +#' @export +#' @param m_chain MCMC chain of mutation cluster assignment values +#' @param Mut_ID Vector of mutation IDs, same order as provided as input data (e.g. indata$Mut_ID) +#' @return A tibble listing mutation IDs and their cluster assignments +writeMultiplicityTable <- function(m_chain, Mut_ID = NULL) { + map_m <- estimateMultiplicity(m_chain) + if (is.null(Mut_ID)) { + Mut_ID <- paste0("Mut", 1:nrow(map_m)) + } + map_m <- map_m %>% + mutate(Parameter_n = as.numeric(gsub("m\\[(\\d+)\\]","\\1",Parameter)))%>% + arrange(Parameter_n)%>% + mutate(Mut_ID = Mut_ID, Multiplicity = value)%>% + select(Mut_ID, Multiplicity) + + return(map_m) +} + +#' Determine the most probable cluster MCF values by taking the mean of the posterior distributions +#' +#' @export +#' @param mcf_chain MCMC chain of mCF values +#' @return matrix of estimated cluster MCFs +estimateMCFs <- function(mcf_chain) { + S <- numberSamples(mcf_chain) + K <- numberClusters(mcf_chain) + temp <- mcf_chain %>% + mutate( + I = as.numeric(gsub("mcf\\[([0-9]+),[0-9]+\\]", "\\1", Parameter)), + J = as.numeric(gsub("mcf\\[[0-9]+,([0-9]+)\\]", "\\1", Parameter))) %>% + group_by(I,J) %>% + summarise(mean_value = mean(value), .groups = 'drop') + mcf.map.matrix <- matrix(NA, nrow = K, ncol = S, dimnames = list(1:K, 1:S)) + for(row in 1:nrow(temp)) { + mcf.map.matrix[temp$I[row], temp$J[row]] <- round(temp$mean_value[row],3) + } + return(mcf.map.matrix) +} + +#' @importFrom stringr str_replace +numberSamples <- function(mcf_stats){ + params <- as.character(mcf_stats$Parameter) + nSamples <- strsplit(params, ",") %>% + sapply("[", 2) %>% + stringr::str_replace("\\]", "") %>% + as.numeric() %>% + max() + nSamples +} + +#' @importFrom stringr str_replace +numberClusters <- function(mcf_stats){ + params <- as.character(mcf_stats$Parameter) + K <- strsplit(params, ",") %>% + sapply("[", 1) %>% + str_replace("mcf\\[", "") %>% + as.numeric() %>% + max() + K +} + +#' Determine the most probable cluster MCF values by taking the mode of the posterior distributions +#' +#' @export +#' @param mcf_chain MCMC chain of MCF values +#' @param Sample_ID Vector of sample IDs, same order as provided as input data (e.g. indata$Sample_ID) +#' @return A tibble of estimated cluster MCFs in each sample +writeClusterMCFsTable <- function(mcf_chain, Sample_ID = NULL) { + map_mcf <- as.data.frame(estimateMCFs(mcf_chain)) + + if (is.null(Sample_ID)) { + Sample_ID <- paste0("Sample ", 1:ncol(map_mcf)) + } + colnames(map_mcf) <- Sample_ID + map_mcf <- map_mcf %>% + as_tibble() %>% + bind_cols(tibble(Cluster = 1:nrow(map_mcf)), .) + return(map_mcf) +} + +#' Determine most probable mutation cluster assignments by taking those with highest posterior probability +#' +#' @export +#' @param z_chain MCMC chain of mutation cluster assignment values +estimateClusterAssignments <- function(z_chain) { + it <- max(z_chain$Iteration) + mcmc_z <- z_chain %>% + group_by(Parameter, value) %>% + reframe(n=n(), + maxiter=it) %>% + mutate(probability=n/maxiter) %>% + ungroup() + map_z <- mcmc_z %>% + group_by(Parameter) %>% + reframe(value=value[probability==max(probability)]) %>% + ungroup() + + # choose first cluster if equal probability + map_z_count <- map_z %>% + group_by(Parameter) %>% + reframe(map_count = n()) %>% + ungroup() + if (any(map_z_count$map_count > 1)) { + mut_ind <- which(map_z_count$map_count > 1) + for (i in mut_ind) { + dup_var <- as.numeric(gsub("z\\[|]", "", map_z_count$Parameter[i])) + map_z_dups <- which(gsub("z\\[|]", "", map_z$Parameter) == dup_var) + dup_ind <- map_z_dups[-1] + map_z <- map_z[-dup_ind, ] + } + } + return(map_z) +} + +#' Determine most probable mutation cluster assignments by taking those with highest posterior probability. +#' +#' @export +#' @param z_chain MCMC chain of mutation cluster assignment values +#' @param Mut_ID Vector of mutation IDs, same order as provided as input data (e.g. indata$Mut_ID) +#' @return A tibble listing mutation IDs and their cluster assignments +writeClusterAssignmentsTable <- function(z_chain, mcf_chain=NULL, cncf=NULL, Mut_ID = NULL) { + map_z <- estimateClusterAssignments(z_chain) + if (is.null(Mut_ID)) { + Mut_ID <- paste0("Mut", 1:nrow(map_z)) + } + map_z <- map_z %>% + mutate(Parameter_n = as.numeric(gsub("z\\[(\\d+)\\]","\\1",Parameter)))%>% + arrange(Parameter_n)%>% + mutate(Mut_ID = Mut_ID, Cluster = value)%>% + select(Mut_ID, Cluster) + + if (!is.null(cncf)) { + if (is.null(mcf_chain)) { + warning("mcf_chain information is required to add CNA to cluster assignment table") + } else { + w_mat <- estimateMCFs(mcf_chain) + for (i in seq_len(nrow(cncf))) { + cls = which(apply(w_mat, 1, function(x) return(all(x == cncf_update[i,])))) + map_z <- map_z %>% add_row(Mut_ID=rownames(cncf)[i], Cluster=cls) + } + } + } + + return(map_z) +} + +#' Collect chains for best K of each mutation set +#' +#' @export +#' @param all_set_results List of MCMC results for each mutation set +#' @param chosen_K (Optional) Vector of K to choose for each mutation set, in the same order as all_set_results. If left blank, function will select best K automatically selected by \code{clusterSep} +collectBestKChains <- function(all_set_results, chosen_K = NULL) { + # best_set_chains <- lapply(all_set_results, function(x) x$all_chains[[length(x$all_chains)]]) + if (is.null(chosen_K)) { + best_set_chains <- lapply(all_set_results, function(x) x$silhouette_best_chains) + } else { + best_set_chains <- mapply(function(set_res, choose_K) set_res$all_chains[[choose_K]], + set_res = all_set_results, + chosen_K, + SIMPLIFY = FALSE) + } + return(best_set_chains) +} + +#' Relabel chains for all sets and merge +#' +#' @export +#' @import dplyr +#' @param best_set_chains List of lists of MCMC chains (mcf_chain, z_chain, ystar_chain) for each mutation set +#' @param indata List of input data objects (same as provided to clusterSep) +mergeSetChains <- function(best_set_chains, indata) { + best_K_vals <- unname(sapply(best_set_chains, function(x) max(x$z_chain$value))) + sep_list <- separateMutationsBySamplePresence(indata) + + # first set doesn't need to change cluster labels + mcf_chain <- best_set_chains[[1]]$mcf_chain + temp_m_chain <- best_set_chains[[1]]$m_chain + temp_icn_chain <- best_set_chains[[1]]$icn_chain + temp_z_chain <- best_set_chains[[1]]$z_chain + temp_ystar_chain <- best_set_chains[[1]]$ystar_chain + + if (length(best_set_chains) > 1) { + # still need to change mutation indices if more than 1 box + z_chain <- relabel_z_chain_mut_only(temp_z_chain, sep_list[[1]]$mutation_indices) + m_chain <- relabel_m_chain_mut_only(temp_m_chain, sep_list[[1]]$mutation_indices) + icn_chain <- relabel_icn_chain_mut_only(temp_icn_chain, sep_list[[1]]$mutation_indices) + ystar_chain <- relabel_ystar_chain(temp_ystar_chain, + sep_list[[1]]$mutation_indices) + for (i in 2:length(best_set_chains)) { + temp_mcf_chain <- best_set_chains[[i]]$mcf_chain + temp_m_chain <- best_set_chains[[i]]$m_chain + temp_icn_chain <- best_set_chains[[i]]$icn_chain + temp_z_chain <- best_set_chains[[i]]$z_chain + temp_ystar_chain <- best_set_chains[[i]]$ystar_chain + new_cluster_labels <- seq_len(best_K_vals[i]) + sum(best_K_vals[1:(i-1)]) + + temp_relabeled_mcf_chain <- relabel_mcf_chain(temp_mcf_chain, new_cluster_labels) + temp_relabeled_icn_chain <- relabel_icn_chain(temp_icn_chain, new_cluster_labels, + sep_list[[i]]$mutation_indices) + temp_relabeled_m_chain <- relabel_m_chain(temp_m_chain, new_cluster_labels, + sep_list[[i]]$mutation_indices) + temp_relabeled_z_chain <- relabel_z_chain(temp_z_chain, new_cluster_labels, + sep_list[[i]]$mutation_indices) + temp_relabeled_ystar_chain <- relabel_ystar_chain(temp_ystar_chain, + sep_list[[i]]$mutation_indices) + + mcf_chain <- rbind(mcf_chain, temp_relabeled_mcf_chain) + icn_chain <- rbind(icn_chain, temp_relabeled_icn_chain) + m_chain <- rbind(m_chain, temp_relabeled_m_chain) + z_chain <- rbind(z_chain, temp_relabeled_z_chain) + ystar_chain <- rbind(ystar_chain, temp_relabeled_ystar_chain) + } + } else { + z_chain <- temp_z_chain + icn_chain <- temp_icn_chain + m_chain <- temp_m_chain + ystar_chain <- temp_ystar_chain + } + + # set levels for Parameter + mcf_chain <- mcf_chain %>% + mutate(k = as.numeric(gsub("mcf\\[", "", + sapply(mcf_chain$Parameter, + function(x) strsplit(as.character(x), ",")[[1]][1])))) %>% + mutate(s = as.numeric(gsub("\\]", "", + sapply(mcf_chain$Parameter, + function(x) strsplit(as.character(x), ",")[[1]][2])))) %>% + arrange(k, s) %>% + mutate(Parameter = factor(Parameter, levels = unique(mcf_chain$Parameter))) %>% + select(Iteration, Chain, Parameter, value) + + z_chain_param_order <- tibble(Parameter = unique(z_chain$Parameter)) %>% + mutate(Variant = as.numeric(gsub("z\\[", "", + gsub("\\]", "", + unique(z_chain$Parameter))))) %>% + arrange(Variant) + z_chain <- z_chain %>% + mutate(Parameter = factor(Parameter, levels = z_chain_param_order$Parameter)) + + icn_chain_param_order <- tibble(Parameter = unique(icn_chain$Parameter)) %>% + mutate(Variant = as.numeric(gsub("icn\\[", "", + gsub("\\]", "", + unique(icn_chain$Parameter))))) %>% + arrange(Variant) + icn_chain <- icn_chain %>% + mutate(Parameter = factor(Parameter, levels = icn_chain_param_order$Parameter)) + + m_chain_param_order <- tibble(Parameter = unique(m_chain$Parameter)) %>% + mutate(Variant = as.numeric(gsub("m\\[", "", + gsub("\\]", "", + unique(m_chain$Parameter))))) %>% + arrange(Variant) + m_chain <- m_chain %>% + mutate(Parameter = factor(Parameter, levels = m_chain_param_order$Parameter)) + + ystar_chain <- ystar_chain %>% + mutate(Mutation_index = as.numeric(gsub("ystar\\[", "", + sapply(ystar_chain$Parameter, + function(x) strsplit(as.character(x), ",")[[1]][1]))), + s = as.numeric(gsub("\\]", "", + sapply(ystar_chain$Parameter, + function(x) strsplit(as.character(x), ",")[[1]][2])))) + ystar_chain <- ystar_chain %>% + arrange(Mutation_index, s) %>% + mutate(Parameter = factor(Parameter, levels = unique(Parameter))) + + chains <- list(mcf_chain = mcf_chain, + z_chain = z_chain, + icn_chain = icn_chain, + m_chain = m_chain, + ystar_chain = ystar_chain) + return(chains) +} + +relabel_z_chain <- function(z_chain, new_cluster_labels, mutation_indices) { + # new_cluster_labels = numeric vector of labels that map to 1:length(new_cluster_labels) + # mutation_indices = numeric vector of original mutation indices prior to separating by sample presence + if (length(mutation_indices) != length(unique(z_chain$Parameter))) { + stop("number of supplied mutation indices does not match the number of mutations in z_chain") + } + ## would break when no mutation is assigned to a cluster + ## poor choice of k, would prob lower the k + if (length(new_cluster_labels) < length(unique(z_chain$value))) { + stop("number of supplied new cluster labels does not match the number of clusters in z_chain") + } + new_z <- z_chain %>% + mutate(i = as.numeric(gsub("\\]", "", + gsub("z\\[", "", + sapply(z_chain$Parameter, + function(x) strsplit(as.character(x), ",")[[1]][1]))))) + new_z <- new_z %>% + mutate(new_i = mutation_indices[i], + value = new_cluster_labels[new_z$value]) %>% + mutate(Parameter = paste0("z[", new_i, "]")) %>% + arrange(new_i) %>% + select(Iteration, Chain, Parameter, value) + return(new_z) +} + +relabel_m_chain <- function(m_chain, new_cluster_labels, mutation_indices) { + # new_cluster_labels = numeric vector of labels that map to 1:length(new_cluster_labels) + # mutation_indices = numeric vector of original mutation indices prior to separating by sample presence + if (length(mutation_indices) != length(unique(m_chain$Parameter))) { + stop("number of supplied mutation indices does not match the number of mutations in m_chain") + } + new_m <- m_chain %>% + mutate(i = as.numeric(gsub("\\]", "", + gsub("m\\[", "", + sapply(m_chain$Parameter, + function(x) strsplit(as.character(x), ",")[[1]][1]))))) + new_m <- new_m %>% + mutate(new_i = mutation_indices[i], + value = new_m$value) %>% + mutate(Parameter = paste0("m[", new_i, "]")) %>% + arrange(new_i) %>% + select(Iteration, Chain, Parameter, value) + return(new_m) +} + +relabel_icn_chain <- function(icn_chain, new_cluster_labels, mutation_indices) { + # new_cluster_labels = numeric vector of labels that map to 1:length(new_cluster_labels) + # mutation_indices = numeric vector of original mutation indices prior to separating by sample presence + if (length(mutation_indices) != length(unique(icn_chain$Parameter))) { + stop("number of supplied mutation indices does not match the number of mutations in icn_chain") + } + new_icn <- icn_chain %>% + mutate(i = as.numeric(gsub("\\]", "", + gsub("icn\\[", "", + sapply(icn_chain$Parameter, + function(x) strsplit(as.character(x), ",")[[1]][1]))))) + new_icn <- new_icn %>% + mutate(new_i = mutation_indices[i], + value = new_icn$value) %>% + mutate(Parameter = paste0("icn[", new_i, "]")) %>% + arrange(new_i) %>% + select(Iteration, Chain, Parameter, value) + return(new_icn) +} + +relabel_m_chain_mut_only <- function(m_chain, mutation_indices) { + # mutation_indices = numeric vector of original mutation indices prior to separating by sample presence + # cluster labels are left unchanged + if (length(mutation_indices) != length(unique(m_chain$Parameter))) { + stop("number of supplied mutation indices does not match the number of mutations in m_chain") + } + new_m <- m_chain %>% + mutate(i = as.numeric(gsub("\\]", "", + gsub("m\\[", "", + sapply(m_chain$Parameter, + function(x) strsplit(as.character(x), ",")[[1]][1]))))) + new_m <- new_m %>% + mutate(new_i = mutation_indices[i]) %>% + mutate(Parameter = paste0("m[", new_i, "]")) %>% + arrange(new_i) %>% + select(Iteration, Chain, Parameter, value) + return(new_m) +} + +relabel_icn_chain_mut_only <- function(icn_chain, mutation_indices) { + # mutation_indices = numeric vector of original mutation indices prior to separating by sample presence + # cluster labels are left unchanged + if (length(mutation_indices) != length(unique(icn_chain$Parameter))) { + stop("number of supplied mutation indices does not match the number of mutations in icn_chain") + } + new_icn <- icn_chain %>% + mutate(i = as.numeric(gsub("\\]", "", + gsub("icn\\[", "", + sapply(icn_chain$Parameter, + function(x) strsplit(as.character(x), ",")[[1]][1]))))) + new_icn <- new_icn %>% + mutate(new_i = mutation_indices[i]) %>% + mutate(Parameter = paste0("icn[", new_i, "]")) %>% + arrange(new_i) %>% + select(Iteration, Chain, Parameter, value) + return(new_icn) +} + +relabel_z_chain_mut_only <- function(z_chain, mutation_indices) { + # mutation_indices = numeric vector of original mutation indices prior to separating by sample presence + # cluster labels are left unchanged + if (length(mutation_indices) != length(unique(z_chain$Parameter))) { + stop("number of supplied mutation indices does not match the number of mutations in z_chain") + } + new_z <- z_chain %>% + mutate(i = as.numeric(gsub("\\]", "", + gsub("z\\[", "", + sapply(z_chain$Parameter, + function(x) strsplit(as.character(x), ",")[[1]][1]))))) + new_z <- new_z %>% + mutate(new_i = mutation_indices[i]) %>% + mutate(Parameter = paste0("z[", new_i, "]")) %>% + arrange(new_i) %>% + select(Iteration, Chain, Parameter, value) + return(new_z) +} + +relabel_ystar_chain <- function(ystar_chain, mutation_indices) { + # mutation_indices = numeric vector of original mutation indices prior to separating by sample presence + i_s <- gsub("ystar\\[|]", "", ystar_chain$Parameter) + i <- sapply(i_s, function(x) strsplit(x, ",")[[1]][1]) %>% + as.numeric + s <- sapply(i_s, function(x) strsplit(x, ",")[[1]][2]) %>% + as.numeric + new_ystar <- ystar_chain %>% + mutate(i = i, + s = s) + new_ystar <- new_ystar %>% + mutate(new_i = mutation_indices[i]) %>% + mutate(Parameter = paste0("ystar[", new_i, ",", s, "]")) %>% + arrange(new_i) %>% + select(Iteration, Chain, Parameter, value) + return(new_ystar) +} + +relabel_mcf_chain <- function(mcf_chain, new_cluster_labels) { + # new_cluster_labels = numeric vector of labels that map to 1:length(new_cluster_labels) + new_mcf <- mcf_chain %>% + mutate(k = as.numeric(gsub("mcf\\[", "", + sapply(mcf_chain$Parameter, + function(x) strsplit(as.character(x), ",")[[1]][1])))) %>% + mutate(s = as.numeric(gsub("\\]", "", + sapply(mcf_chain$Parameter, + function(x) strsplit(as.character(x), ",")[[1]][2])))) + if (length(new_cluster_labels) != length(unique(new_mcf$k))) { + stop("number of supplied new cluster labels does not match the number of clusters in mcf_chain") + } + new_mcf <- new_mcf %>% + mutate(k_new = new_cluster_labels[new_mcf$k]) %>% + mutate(Parameter = paste0("mcf[", k_new, ",", s, "]")) %>% + select(Iteration, Chain, Parameter, value) + return(new_mcf) +} diff --git a/R/MCMC_plot_clusters.R b/R/MCMC_plot_clusters.R new file mode 100644 index 0000000..186a099 --- /dev/null +++ b/R/MCMC_plot_clusters.R @@ -0,0 +1,238 @@ +#' Plot probabilities of mutation cluster assignments - vertical +#' +#' @export +#' @import ggplot2 +#' @import tibble +#' @import dplyr +#' @import tidyr +#' @param z_chain MCMC chain of mutation cluster assignment values, which is the second item in the list returned by \code{clusterSep} +#' @param mcf_chain MCMC chain of CCF values, which is the first item in the list returned by \code{clusterSep} +#' @param filter_thresh Lowest posterior probability to include cluster assignment. Default value is 0.05 (inclusive) +#' @param MutID (Optional) Vector of mutation IDs for labeling purposes. Same order as supplied as input data (e.g. indata$Mut_ID) +#' @param SampleID (Optional) Vector of sample IDs for labeling purposes. Same order as supplied as input data (e.g. indata$Sample_ID) +plotClusterAssignmentProbVertical <- function(z_chain, + mcf_chain, + filter_thresh = 0.05, + MutID = NULL, + SampleID = NULL) { + + mcmc_z <- generateZPostSummary(z_chain, mcf_chain, filter_thresh, MutID, SampleID) + K <- max(mcmc_z$value) + z.seg.tb <- mcmc_z %>% + group_by(Parameter) %>% + summarize(z1 = min(value), z2 = max(value)) %>% + ungroup() %>% + mutate(Variant = as.numeric(gsub("z\\[|]", "", Parameter)), + Mut_ID = mcmc_z$Mut_ID[match(Variant, mcmc_z$Variant)], + Sample_presence = mcmc_z$Sample_presence[match(Variant, mcmc_z$Variant)]) + + z.plot <- ggplot(mcmc_z, aes(x = value, y = Mut_ID, color = probability)) + + theme_light() + + scale_y_discrete(drop = T, name = "Variant") + + scale_x_continuous(breaks = 1:K, name = "Cluster", labels = 1:K) + + geom_segment(data = z.seg.tb, + aes(y=Mut_ID, yend=Mut_ID, + x=z1, xend=z2), + color="black", linetype=2) + + geom_point() + + theme(panel.grid.minor = element_blank(), + strip.background=element_blank(), + strip.text = element_text(colour = 'black'), + strip.text.y = element_text(angle = 0)) + + facet_grid(Sample_presence~., scales = "free", space = "free") + + scale_color_gradient(limits = c(0,1)) + return(z.plot) +} + +generateZPostSummary <- function(z_chain, + mcf_chain, + filter_thresh = 0.05, + MutID = NULL, + SampleID = NULL) { + + map_z <- estimateClusterAssignments(z_chain) + map_mcf <- estimateMCFs(mcf_chain) + + I <- length(unique(z_chain$Parameter)) + K <- max(unique(z_chain$value)) + num_iter <- max(z_chain$Iteration) + S <- ncol(map_mcf) + + if (is.null(MutID)) { + mut_labels <- 1:I + } else { + mut_labels <- MutID + } + if (is.null(SampleID)) { + sample_labels <- paste0("Sample ", 1:S) + } else { + sample_labels <- SampleID + } + + + + tiers <- generateTiers(map_mcf, sample_labels) + + mcmc_z <- summarizeZPost(z_chain) %>% + filter(probability >= filter_thresh) + + # Variant sample presence + var_sample_pres <- map_z %>% + ungroup() %>% + mutate(cluster_num = value, + Variant = as.numeric(gsub("z\\[|]", "", Parameter)), + Mut_ID = mut_labels[Variant], + Sample_presence = tiers$samples[value]) + # sample presence order + sample_pres_order <- tiers %>% + select(samples, tier) %>% + distinct() %>% + arrange(-tier) %>% + pull(samples) + # variant order + var_order <- map_z %>% + arrange(-value) %>% + mutate(Variant = as.numeric(gsub("z\\[|]", "", Parameter)), + Mut_ID = mut_labels[Variant]) %>% + pull(Mut_ID) + + mcmc_z <- mcmc_z %>% + mutate(Variant = as.numeric(gsub("z\\[|]", "", Parameter)), + Mut_ID = factor(mut_labels[Variant], var_order), + Sample_presence = factor(var_sample_pres$Sample_presence[Variant], + sample_pres_order)) + return(mcmc_z) +} + +summarizeZPost <- function(z_chain) { + I <- length(unique(z_chain$Parameter)) + K <- max(unique(z_chain$value)) + num_iter <- max(z_chain$Iteration) + mcmc_z <- z_chain %>% + group_by(Parameter, value) %>% + summarize(n=n(), + num_iter=num_iter) %>% + mutate(probability=n/num_iter) %>% + ungroup() + return(mcmc_z) +} + + +generateTiers <- function(w_mat, Sample_ID) { + clusters <- paste0("Cluster ", seq_len(nrow(w_mat))) + bin <- w_mat > 0 + samples <- apply(bin, 1, function(x) paste(Sample_ID[x], collapse = ",\n")) + tier <- rowSums(bin) + tiers <- tibble(cluster = clusters, + cluster_num = seq_len(nrow(w_mat)), + samples = samples, + tier = tier) + return(tiers) +} + +#' Plot CCF chain trace +#' +#' @export +#' @param mcf_chain MCMC chain of CCF values, which is the first item in the list returned by \code{mergeSetChains} +plotChainsMCF <- function(mcf_chain) { + cluster <- strsplit(as.character(mcf_chain$Parameter), ",") %>% + sapply(., function(x) gsub("mcf\\[", "", x[1])) %>% + as.numeric + sample <- strsplit(as.character(mcf_chain$Parameter), ",") %>% + sapply(., function(x) gsub("\\]", "", x[2])) %>% + as.numeric + + mcf_chain <- mcf_chain %>% + mutate(Cluster = factor(paste0("Cluster ", cluster), + levels = paste0("Cluster ", sort(unique(cluster)))), + Sample = factor(paste0("Sample ", sample), + levels = paste0("Sample ", sort(unique(sample))))) + + ggplot(mcf_chain, aes(x = Iteration, y = value)) + + geom_line() + + theme_light() + + facet_grid(Cluster ~ Sample) + + ylab("Cancer Cell Fraction") +} + +#' Plot cluster CCF posterior distributions as violin plots +#' +#' @export +#' @param mcf_chain MCMC chain of CCF values +#' @param z_chain (Optional) MCMC chain of mutation cluster assignment values. If provided, cluster names will show the number of mutations assigned in brackets +#' @param indata (Optional) List of input data items +plotMCFViolin <- function(mcf_chain, z_chain = NULL, indata = NULL) { + # process data + vdat <- violinProcessData(mcf_chain, indata) + + if (!is.null(z_chain)) { + num_muts_in_clusters <- estimateClusterAssignments(z_chain) %>% + group_by(value) %>% + summarize(num_muts = n()) %>% + ungroup() %>% + rename(cluster = value) + num_muts <- num_muts_in_clusters$num_muts[match(vdat$cluster_num, num_muts_in_clusters$cluster)] + new_cluster_labels <- paste0("Cluster ", + vdat$cluster_num, + " [", num_muts,"]") + vdat <- vdat %>% + mutate(cluster = factor(new_cluster_labels, unique(new_cluster_labels))) + } + + # plot violins + vplot <- plotViolin(vdat) + return(vplot) +} + +violinProcessData <- function(mcf_chain, indata = NULL) { + mcf_mat <- estimateMCFs(mcf_chain) + est_K <- nrow(mcf_mat) + + if (is.null(indata$Sample_ID)) { + sample_names <- paste0("Sample ", 1:ncol(mcf_mat)) + } else { + sample_names <- indata$Sample_ID + } + + vdat <- mcf_chain %>% + mutate(sample=stringr::str_replace_all(Parameter, "mcf\\[[:digit:]+,", ""), + sample=stringr::str_replace_all(sample, "\\]", ""), + cluster=stringr::str_replace_all(Parameter, "mcf\\[", ""), + cluster=stringr::str_replace_all(cluster, ",[:digit:]\\]", "")) %>% + mutate(sample=as.numeric(sample), + sample=sample_names[sample], + sample=factor(sample, sample_names), + cluster=as.numeric(cluster), + cluster=paste0("Cluster ", cluster), + cluster=factor(cluster, level=paste("Cluster", 1:est_K))) + + tiers <- generateTiers(mcf_mat, sample_names) + + vdat <- vdat %>% + mutate(cluster=as.character(cluster)) %>% + left_join(tiers, by="cluster") %>% + mutate(cluster=factor(cluster, tiers$cluster), + tier=factor(tier, sort(unique(tiers$tier)))) + return(vdat) +} + +plotViolin <- function(vdat) { + vplot <- ggplot(vdat, aes(sample, value)) + + geom_violin(aes(fill=tier), + alpha=0.6, + scale="width", + draw_quantiles=c(0.25, 0.5, 0.75), + color="white") + + geom_violin(fill="transparent", color="black", + scale="width", draw_quantiles=0.5) + + theme_bw(base_size=12) + + theme(strip.background=element_blank(), + axis.text.x=element_text(size=12), + panel.grid=element_blank(), + legend.pos="bottom") + + facet_wrap(~cluster, nrow=1) + + ylab("Posterior CCF") + xlab("") +ylim(c(0, 1)) + + guides(fill=guide_legend("Sample-presence")) + + theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) + return(vplot) +} \ No newline at end of file diff --git a/R/bic.R b/R/bic.R deleted file mode 100644 index cd90484..0000000 --- a/R/bic.R +++ /dev/null @@ -1,220 +0,0 @@ -calcLogLik <- function(z.iter, w.iter, input.data) { - W <- w.iter[z.iter, ] - - if (is.null(input.data$purity)) { - theta <- calcTheta(input.data$m, input.data$tcn, W) - } else { - purity <- input.data$purity - P <- matrix(rep(purity, each = input.data$I), nrow = input.data$I, ncol = input.data$S) - theta <- calcTheta2(input.data$m, input.data$tcn, W, P) - } - - sum(dbinom(as.matrix(input.data$y), as.matrix(input.data$n), as.matrix(theta), log=T)) -} - -calcChainLogLik <- function(chains, input.data, est_K) { - num_iter <- max(chains$z_chain$Iteration) - - lik <- c() - for(iter in 1:num_iter) { - z.iter <- chains$z_chain %>% - filter(Iteration == iter) %>% - pull(value) - w.iter <- filter(chains$w_chain, Iteration == iter) %>% - reshapeW() - lik <- c(lik, calcLogLik(z.iter, w.iter, input.data)) - } - return(mean(lik)) -} - - -#' @importFrom stringr str_replace_all -reshapeW <- function(w.chain.iter) { - w.mat <- w.chain.iter %>% - mutate(sample=stringr::str_replace_all(Parameter, "w\\[[:digit:]+,", ""), - sample=as.numeric(stringr::str_replace_all(sample, "\\]", "")), - cluster=stringr::str_replace_all(Parameter, "w\\[", ""), - cluster=as.numeric(stringr::str_replace_all(cluster, ",[:digit:]\\]", ""))) %>% - select(cluster, sample, value) - S <- max(w.mat$sample) - w.mat <- w.mat %>% - pivot_wider(names_from = sample, - values_from = value) - w.mat$cluster <- NULL - w.mat <- as.matrix(w.mat) - colnames(w.mat) <- paste0("sample", 1:S) - return(w.mat) -} - -calcBIC <- function(n, k, ll) log(n)*k - 2*ll - -#' @import magrittr -calcChainBIC <- function(chains, input.data) { - n <- input.data$I * input.data$S - est_K <- estimateCCFs(chains$w_chain) %>% - nrow(.) - ll <- calcChainLogLik(chains, input.data, est_K) - - BIC <- calcBIC(n, est_K, ll) - return(BIC) -} - -calcBICForRangeK <- function(samps.list, kToTest, input.data) { - mapply(function(samps, k) - calcBIC(input.data$I*input.data$S, k, calcChainLogLik(samps, input.data, k)), - samps = samps.list, k = kToTest) -} - -calcTheta <- function(m, tcn, w) { - (m * w) / (tcn * w + 2*(1-w)) -} - -calcTheta2 <- function(m, tcn, w, p) { - (m * w * p) / (tcn * p + 2*(1-p)) -} - -#' Make table listing possible choices of K (minimum BIC and elbow of BIC plot) for each mutation set -#' -#' @export -#' @import dplyr -#' @param all_set_results List of MCMC results for each mutation set; returned by \code{clusterSep} -#' @param sample_names (Optional) Vector of sample IDs, same order as provided as input data (e.g. indata$Sample_ID) -writeSetKTable <- function(all_set_results, sample_names = NULL) { - min_bic_k <- sapply(all_set_results, function(x) x$best_K) - elbow_k <- sapply(all_set_results, function(x) ifelse(is.logical(x$BIC), 1, findElbow2(x$BIC$BIC))) - knee_k <- sapply(all_set_results, function(x) ifelse(is.logical(x$BIC), 1, findKnee(x$BIC$BIC))) - min_bic_k_tb <- tibble(set_name_bin = names(all_set_results), - min_BIC = min_bic_k, - elbow = elbow_k, - knee = knee_k) - - if (!is.null(sample_names)) { - min_bic_k_tb <- min_bic_k_tb %>% - mutate(set_name_full = sapply(min_bic_k_tb$set_name_bin, - function(x) getSetName(x, sample_names, collapse_string = ","))) %>% - select(set_name_bin, set_name_full, min_BIC, elbow, knee) - } - - # write chosen K if elbow, knee, and minimum BIC all agree - chosen_K <- rep(NA, length(all_set_results)) - for (i in seq_len(length(all_set_results))) { - chosen_K[i] <- ifelse(min_bic_k_tb$min_BIC[i] == min_bic_k_tb$elbow[i] & - min_bic_k_tb$min_BIC[i] == min_bic_k_tb$knee[i], - min_bic_k_tb$min_BIC[i], - min(min_bic_k_tb$min_BIC[i],min_bic_k_tb$elbow[i],min_bic_k_tb$knee[i])) - } - - min_bic_k_tb <- min_bic_k_tb %>% - mutate(chosen_K = chosen_K) - - return(min_bic_k_tb) -} - -# find elbow of bic plot -findElbow1 <- function(BIC) { - - # return first ind if all increasing - if(all (BIC >= BIC[1])) return(1) - - delta1 <- diff(BIC) - delta2 <- diff(delta1) - elbow_ind <- which.min(delta2) + 2 - return(elbow_ind) -} - -findElbow2 <- function(BIC) { - - # return first ind if all increasing - if(all (BIC >= BIC[1])) return(1) - - K_vec <- seq_len(length(BIC)) - # line defined by two end-points of BIC plot P1 = (x1, y1) and P2 = (x2, y2) - x1 <- K_vec[1] - y1 <- BIC[1] - x2 <- length(BIC) - y2 <- BIC[length(BIC)] - - # calculate distance of each point (x0, y0) to line - perp_dist <- sapply(K_vec, function(i) calcDistFromPointToLine(K_vec[i], BIC[i], - x1, y1, - x2, y2)) - return(which.max(perp_dist)) -} - -calcDistFromPointToLine <- function (x0, y0, - x1, y1, - x2, y2) { - numerator <- abs( (x2-x1)*(y1-y0) - (x1-x0)*(y2-y1) ) - denominator <- sqrt( (x2-x1)^2 + (y2-y1)^2 ) - distance <- numerator / denominator - return(distance) -} - -# angle-based method for knee point detection of BIC -# Zhao et al 2008 -# returns index of knee point -findKnee <- function(BIC, n = 5) { - if (length(BIC) == 1) return(1) - - # return first ind if all increasing - if(all (BIC >= BIC[1])) return(1) - - # initialize - curr_val <- BIC[1] - prev_val <- BIC[1] - next_val <- BIC[1] - - # begin - diff_fun <- rep(NA, length(BIC)) - for (m in seq_len(length(BIC))) { - curr_val <- BIC[m] - next_val <- BIC[min(m+1, length(BIC))] - diff_fun[m] <- DiffFun(prev_val, next_val, curr_val) - prev_val <- curr_val - } - - # find first n local maximas in diff_fun - local_max <- localMaxima(diff_fun) - local_max <- local_max[1:min(length(local_max), n)] - if (length(local_max) == 1) return(local_max) - - # for each n with decreasing order of LocalMax value, - angle <- c() - for (n in local_max) { - angle <- c(angle, AngleFun(BIC[max(n-1, 1)], BIC[min(n+1, length(BIC))], BIC[n])) - } - # return m with the first minima angle - return(local_max[localMinima(angle)[1]]) -} -DiffFun <- function(prev_val, next_val, curr_val) { - prev_val + next_val - 2*curr_val -} -AngleFun <- function(prev_val, next_val, curr_val) { - atan( 1 / abs(curr_val - prev_val) ) + atan( 1 / abs(next_val - curr_val) ) -} - -# detect local maxima -localMaxima <- function(x) { - # Use -Inf instead if x is numeric (non-integer) - y <- diff(c(-.Machine$integer.max, x)) > 0L # for maxima - #y <- diff(c(.Machine$integer.max, x)) < 0L # for mimuma - rle(y)$lengths - y <- cumsum(rle(y)$lengths) - y <- y[seq.int(1L, length(y), 2L)] - if (x[[1]] == x[[2]]) { - y <- y[-1] - } - return(y) -} -localMinima <- function(x) { - # Use -Inf instead if x is numeric (non-integer) - #y <- diff(c(-.Machine$integer.max, x)) > 0L # for maxima - y <- diff(c(.Machine$integer.max, x)) < 0L # for mimuma - rle(y)$lengths - y <- cumsum(rle(y)$lengths) - y <- y[seq.int(1L, length(y), 2L)] - if (x[[1]] == x[[2]]) { - y <- y[-1] - } - return(y) -} diff --git a/R/calculate-BIC.R b/R/calculate-BIC.R new file mode 100644 index 0000000..c6fee11 --- /dev/null +++ b/R/calculate-BIC.R @@ -0,0 +1,229 @@ +#' @import magrittr +calcChainBIC <- function(chains, input.data, pattern) { + options(dplyr.summarise.inform = FALSE) + mcf <- chains$mcf_chain%>% + mutate(value = round(value,5), + Cluster = as.numeric(gsub("mcf\\[(.*),.*","\\1", Parameter)), + Sample = as.numeric(gsub(".*,(.*)\\]","\\1", Parameter)))%>% + group_by(Cluster,Sample)%>% + reframe(mcf = mean(value))%>% + ungroup()%>% + spread(key = Sample, value = mcf) + + ww <- writeClusterAssignmentsTable(chains$z_chain)%>% + mutate(Mut_ID = as.numeric(gsub("Mut","",Mut_ID)))%>% + arrange(Mut_ID)%>% + left_join(mcf, by = "Cluster")%>% + select(-c("Mut_ID","Cluster"))%>% + as.matrix() + + ww <- ww[,which(strsplit(pattern, split="")[[1]]=="1"),drop=FALSE] + + mm <- writeMultiplicityTable(chains$m_chain)%>% + mutate(Mut_ID = as.numeric(gsub("Mut","",Mut_ID)))%>% + arrange(Mut_ID)%>% + select(c("Multiplicity")) %>% + as.matrix() + + mm <- replicate(input.data$S, mm[, 1]) + + is_cn <- replicate(input.data$S, input.data$is_cn) + vaf <- ifelse(is_cn==0, (ww + (mm-1) * input.data$cncf)/input.data$tcn, (ww * mm + 1 - ww) / input.data$tcn) + vaf <- ifelse(vaf<=0, 0.001, vaf) + vaf <- ifelse(vaf>=1, 0.999, vaf) + + lik <- sum(dbinom(input.data$y,input.data$n,vaf,log = T)) + + est_K <- estimateMCFs(chains$mcf_chain) %>% nrow(.) + + BIC <- log(input.data$I*input.data$S)*est_K-2*lik + return(BIC) +} + +#' @import magrittr +calcChainSilhouette <- function(chains, input.data, pattern) { + + vaf <- input.data$y / input.data$n + + zz <- writeClusterAssignmentsTable(chains$z_chain) %>% + mutate(Mut_ID = as.numeric(gsub("Mut","",Mut_ID))) %>% + arrange(Mut_ID)%>% + select(c("Cluster")) %>% + as.matrix() + zz <- zz[,1] + + mcf1 <- chains$mcf_chain%>% + mutate(value = round(value,5), + Cluster = as.numeric(gsub("mcf\\[(.*),.*","\\1", Parameter)), + Sample = as.numeric(gsub(".*,(.*)\\]","\\1", Parameter)))%>% + group_by(Cluster,Sample)%>% + reframe(mcf = mean(value))%>% + ungroup()%>% + spread(key = Sample, value = mcf) + + ww <- writeClusterAssignmentsTable(chains$z_chain)%>% + mutate(Mut_ID = as.numeric(gsub("Mut","",Mut_ID)))%>% + arrange(Mut_ID)%>% + left_join(mcf1, by = "Cluster")%>% + select(-c("Mut_ID","Cluster"))%>% + as.matrix() + + ww <- ww[,which(strsplit(pattern, split="")[[1]]=="1"),drop=FALSE] + + mm <- writeMultiplicityTable(chains$m_chain)%>% + mutate(Mut_ID = as.numeric(gsub("Mut","",Mut_ID)))%>% + arrange(Mut_ID)%>% + select(c("Multiplicity")) %>% + as.matrix() + mm <- replicate(input.data$S, mm[, 1]) + + is_cn <- replicate(input.data$S, input.data$is_cn) + + mcf <- ifelse(is_cn==0, input.data$tcn * vaf - (mm-1) * input.data$cncf, (input.data$tcn * vaf - 1) / (mm - 1)) + mcf <- ifelse(mcf<0, ww, mcf) + mcf <- ifelse(mcf>1, ww, mcf) + + sil_widths <- silhouette(zz, dist(mcf)) + if (length(sil_widths)==1) { + mean_silhouette_score <- 0 + } else { + mean_silhouette_score <- mean(sil_widths[, "sil_width"]) + } + + return(mean_silhouette_score) +} + +#' Make table listing possible choices of K (minimum BIC and elbow of BIC plot) for each mutation set +#' +#' @export +#' @import dplyr +#' @param all_set_results List of MCMC results for each mutation set; returned by \code{clusterSep} +#' @param sample_names (Optional) Vector of sample IDs, same order as provided as input data (e.g. indata$Sample_ID) +writeSetKTable <- function(all_set_results, sample_names = NULL) { + +min_bic_k <- sapply(all_set_results, function(x) x$BIC_best_K) + elbow_k <- sapply(all_set_results, function(x) ifelse(is.logical(x$BIC), 1, findElbow(x$BIC$BIC))) + knee_k <- sapply(all_set_results, function(x) ifelse(is.logical(x$BIC), 1, findKnee(x$BIC$BIC))) + min_bic_k_tb <- tibble(set_name_bin = names(all_set_results), + min_BIC = min_bic_k, + elbow = elbow_k, + knee = knee_k) + + if (!is.null(sample_names)) { + min_bic_k_tb <- min_bic_k_tb %>% + mutate(set_name_full = sapply(min_bic_k_tb$set_name_bin, + function(x) getSetName(x, sample_names, collapse_string = ","))) %>% + select(set_name_bin, set_name_full, min_BIC, elbow, knee) + } + + min_bic_k_tb <- min_bic_k_tb %>% + mutate(BIC_K = round((min_BIC + elbow + knee) / 3)) + + silhouette_k <- sapply(all_set_results, function(x) x$silhouette_best_K) + min_bic_k_tb <- min_bic_k_tb %>% + mutate(silhouette_K = silhouette_k) + + return(min_bic_k_tb) +} + +findElbow <- function(BIC) { + + # return first ind if all increasing + if(all (BIC >= BIC[1])) return(1) + + K_vec <- seq_len(length(BIC)) + # line defined by two end-points of BIC plot P1 = (x1, y1) and P2 = (x2, y2) + x1 <- K_vec[1] + y1 <- BIC[1] + x2 <- length(BIC) + y2 <- BIC[length(BIC)] + + # calculate distance of each point (x0, y0) to line + perp_dist <- sapply(K_vec, function(i) calcDistFromPointToLine(K_vec[i], BIC[i], + x1, y1, + x2, y2)) + return(which.max(perp_dist)) +} + +getmode <- function(v) { + uniqv <- unique(v) + uniqv[which.max(tabulate(match(v, uniqv)))] +} + +calcDistFromPointToLine <- function (x0, y0, + x1, y1, + x2, y2) { + numerator <- abs( (x2-x1)*(y1-y0) - (x1-x0)*(y2-y1) ) + denominator <- sqrt( (x2-x1)^2 + (y2-y1)^2 ) + distance <- numerator / denominator + return(distance) +} + +# angle-based method for knee point detection of BIC +# Zhao et al 2008 +# returns index of knee point +findKnee <- function(BIC, n = 5) { + if (length(BIC) == 1) return(1) + + # return first ind if all increasing + if(all (BIC >= BIC[1])) return(1) + + # initialize + curr_val <- BIC[1] + prev_val <- BIC[1] + next_val <- BIC[1] + + # begin + diff_fun <- rep(NA, length(BIC)) + for (m in seq_len(length(BIC))) { + curr_val <- BIC[m] + next_val <- BIC[min(m+1, length(BIC))] + diff_fun[m] <- DiffFun(prev_val, next_val, curr_val) + prev_val <- curr_val + } + + # find first n local maximas in diff_fun + local_max <- localMaxima(diff_fun) + local_max <- local_max[1:min(length(local_max), n)] + if (length(local_max) == 1) return(local_max) + + # for each n with decreasing order of LocalMax value, + angle <- c() + for (n in local_max) { + angle <- c(angle, AngleFun(BIC[max(n-1, 1)], BIC[min(n+1, length(BIC))], BIC[n])) + } + # return m with the first minima angle + return(local_max[localMinima(angle)[1]]) +} +DiffFun <- function(prev_val, next_val, curr_val) { + prev_val + next_val - 2*curr_val +} +AngleFun <- function(prev_val, next_val, curr_val) { + atan( 1 / abs(curr_val - prev_val) ) + atan( 1 / abs(next_val - curr_val) ) +} + +# detect local maxima +localMaxima <- function(x) { + # Use -Inf instead if x is numeric (non-integer) + y <- diff(c(-.Machine$integer.max, x)) > 0L # for maxima + #y <- diff(c(.Machine$integer.max, x)) < 0L # for mimuma + rle(y)$lengths + y <- cumsum(rle(y)$lengths) + y <- y[seq.int(1L, length(y), 2L)] + if (x[[1]] == x[[2]]) { + y <- y[-1] + } + return(y) +} +localMinima <- function(x) { + # Use -Inf instead if x is numeric (non-integer) + #y <- diff(c(-.Machine$integer.max, x)) > 0L # for maxima + y <- diff(c(.Machine$integer.max, x)) < 0L # for mimuma + rle(y)$lengths + y <- cumsum(rle(y)$lengths) + y <- y[seq.int(1L, length(y), 2L)] + if (x[[1]] == x[[2]]) { + y <- y[-1] + } + return(y) +} diff --git a/R/cluster_separately.R b/R/cluster_separately.R deleted file mode 100644 index 60046c5..0000000 --- a/R/cluster_separately.R +++ /dev/null @@ -1,507 +0,0 @@ -#' Run MCMC to cluster mutations and estimate CCFs -#' -#' @export -#' @importFrom ggmcmc ggs -#' @param input_data list of input data objects; -#' @param n.iter number of iterations to run MCMC -#' @param n.burn number of iterations for burn in -#' @param thin thinning parameter -#' @param mc.cores number of cores for parallelization -#' @param max_K maximum number of clusters to assess for each mutation set -#' @param model_type hierarchical model type for ("spike_and_slab" or "simple) -#' @param beta.prior option to run an initial MCMC chain and use results to specify beta priors for a second MCMC chain -#' @param one_box option to run the MCMC chain without using sample presence -clusterSep <- function(input_data, - n.iter = 10000, n.burn = 1000, thin = 10, mc.cores = 1, - inits = list(".RNG.name" = "base::Wichmann-Hill", - ".RNG.seed" = 123), - max_K = 5, - model_type = "spike_and_slab", - beta.prior = FALSE, - drop_zero = FALSE, - one_box = TRUE) { - # 1. separate mutations by sample presence - if (one_box) { - input_data$mutation_indices <- seq_len(input_data$I) - sep_list <- vector("list", 1) - sep_list[[1]] <- input_data - names(sep_list) <- "one_box" - } else { - sep_list <- separateMutationsBySamplePresence(input_data) - } - - # 2a. For each presence set, run clustering MCMC, calc BIC and choose best K (min BIC) - all_set_results <- vector("list", length(sep_list)) - names(all_set_results) <- names(sep_list) - params = c("z", "w", "ystar") - - for (i in seq_len(length(sep_list))) { - temp_box <- sep_list[[i]] - # Max number of clusters cannot be more than number of mutations - temp_max_K <- min(max_K, length(temp_box$mutation_indices)) - - temp_samps_list <- runMutSetMCMC(temp_box, - n.iter = n.iter, n.burn = n.burn, thin = thin, - mc.cores = mc.cores, - inits = inits, - temp_max_K = temp_max_K, - model_type = model_type, - params = params, - beta.prior = beta.prior, - drop_zero = drop_zero) - all_set_results[[i]] <- temp_samps_list - } - - return(all_set_results) -} - -separateMutationsBySamplePresence <- function(input_data) { - # returns list of lists -- - # each item of list contains input data for a mutation sample presence set - # original mutation indices from input_data are recorded in $mutation_indices - pres <- ifelse(input_data$y > 0, 1, 0) - pat <- apply(pres, 1, function(x) paste0(x, collapse="")) - types <- sort(names(table(pat)), decreasing=TRUE) - if (length(types) == 1) { - type_indices <- list() - type_indices[[types]] <- seq_len(input_data$I) - } else { - type_indices <- lapply(types, function(x) which(pat == x)) - names(type_indices) <- types - } - - sep_list <- list() - for (t in seq_len(length(types))) { - sep_list[[types[t]]] <- list(pattern = types[t], - mutation_indices = type_indices[[types[t]]], - purity = input_data$purity, - I = length(type_indices[[types[t]]]), - S = input_data$S, - y = input_data$y[type_indices[[types[t]]], ,drop=FALSE], - n = input_data$n[type_indices[[types[t]]], ,drop=FALSE], - tcn = input_data$tcn[type_indices[[types[t]]], ,drop=FALSE], - m = input_data$m[type_indices[[types[t]]], ,drop=FALSE]) - if (input_data$S == 1) { - break - } - } - return(sep_list) -} - -getBoxInputData <- function(box) { - list(purity = box$purity, - I = box$I, - S = box$S, - y = box$y, - n = box$n, - m = box$m, - tcn = box$tcn) -} - -reverseDrop <- function(samps, pattern, n.iter) { - total_sample = nchar(pattern) - sample_list = vector() - for (j in seq_len(nchar(pattern))) { - if (strsplit(pattern, "")[[1]][j] == "1") { - sample_list <- append(sample_list, j) - } - } - k_list = vector() - ystar_list = vector() - # replace current sample id by true sample id from pattern - for (i in seq_len(length(colnames(samps[[1]])))) { - # print(colnames(samps[[1]])[i]) - if (startsWith(colnames(samps[[1]])[i], "w")) { - para <- str_extract_all(colnames(samps[[1]])[i], "[0-9]+")[[1]] - colnames(samps[[1]])[i] <- paste("w[", para[1], ",", sample_list[strtoi(para[2])], "]", sep = "") - k_list <- c(k_list, para[1]) - } else if (startsWith(colnames(samps[[1]])[i], "ystar")) { - para <- str_extract_all(colnames(samps[[1]])[i], "[0-9]+")[[1]] - colnames(samps[[1]])[i] <- paste("ystar[", para[1], ",", sample_list[strtoi(para[2])], "]", sep = "") - ystar_list <- c(ystar_list, para[1]) - } - } - k_list <- unique(k_list) - ystar_list <- unique(ystar_list) - - # add back dropped samples - absent_sample <- vector() - for (sample in seq_len(total_sample)) { - if (! sample %in% sample_list) { - absent_sample <- append(absent_sample, sample) - } - } - for (k in seq_len(length(k_list))) { - for (j in seq_len(length(absent_sample))) { - col = paste("w[", k_list[k], ",", absent_sample[j], "]", sep = "") - samps[[1]] <- cbind(samps[[1]], col=0) - colnames(samps[[1]])[colnames(samps[[1]]) == 'col'] <- col - } - } - for (ystar in seq_len(length(ystar_list))) { - for (j in seq_len(length(absent_sample))) { - col = paste("ystar[", ystar_list[ystar], ",", absent_sample[j], "]", sep = "") - samps[[1]] <- cbind(samps[[1]], col=0) - colnames(samps[[1]])[colnames(samps[[1]]) == 'col'] <- col - } - } - - samps[[1]] <- samps[[1]][,order(colnames(samps[[1]]))] - - return(samps) -} - -runMCMCForABox <- function(box, - n.iter = 10000, n.burn = 1000, thin = 10, mc.cores = 1, - inits = list(".RNG.name" = "base::Wichmann-Hill", - ".RNG.seed" = 123), - params = c("z", "w", "ystar"), - max_K = 5, model_type = "simple", - beta.prior = FALSE, - drop_zero = FALSE) { - # returns samps_list - box_input_data <- getBoxInputData(box) - - # modify box_input_data so it only contain non-zero samples - if (drop_zero) { - sample_list = vector() - for (j in 1:box$S) { - if (strsplit(box$pattern, "")[[1]][j] == "1") { - sample_list <- append(sample_list, j) - } - } - # print(temp_box) - box_input_data$purity <- box_input_data$purity[sample_list] - box_input_data$y <- box_input_data$y[,sample_list,drop=FALSE] - box_input_data$n <- box_input_data$n[,sample_list,drop=FALSE] - box_input_data$tcn <- box_input_data$tcn[,sample_list,drop=FALSE] - box_input_data$m <- box_input_data$m[,sample_list,drop=FALSE] - box_input_data$S <- length(sample_list) - } - - extdir <- system.file("extdata", package="pictograph") - if (box$I == 1) { - jags.file.K1 <- file.path(extdir, "spike_and_slab_purity_2_K1_I1.jags") - box_input_data$I <- NULL - } else { - jags.file.K1 <- file.path(extdir, "spike_and_slab_purity_2_K1.jags") - } - - if (model_type == "simple") { - jags.file <- file.path(extdir, "model-test.jags") # fixes order of CCFs in one sample, not spike and slab - } else if (model_type == "spike_and_slab") { - jags.file <- file.path(extdir, "spike_and_slab_purity_ident.jags") # fixing order of CCFs in one sample - } else stop("provide model_type either 'spike_and_slab' or 'simple'") - - # choose sample in which mutations are present - sample_to_sort <- which(colSums(box_input_data$y) > 0)[1] - - samps_K1 <- runMCMC(box_input_data, 1, jags.file.K1, - inits, params, n.iter=n.iter, thin=thin, n.burn=n.burn) - - if(box_input_data$S == 1) { - colnames(samps_K1[[1]])[which(colnames(samps_K1[[1]]) == "w")] <- "w[1,1]" - } - - if (drop_zero) { - samps_K1 <- reverseDrop(samps_K1, box$pattern, n.iter) - } - - if(box$I == 1) { - colnames(samps_K1[[1]])[which(colnames(samps_K1[[1]]) == "z")] <- "z[1]" - } - - # Max number of clusters cannot be more than number of mutations - max_K <- min(max_K, length(box$mutation_indices)) - if (max_K > 1) { - - box_input_data$sample_to_sort <- sample_to_sort - - samps_2 <- parallel::mclapply(2:max_K, - function(k) runMCMC(box_input_data, k, - jags.file, inits, params, - n.iter=n.iter, thin=thin, - n.burn=n.burn, - beta.prior=beta.prior), - mc.cores=mc.cores) - - if (drop_zero) { - for (i in seq_len(length(samps_2))) { - samps_2[[i]] <- reverseDrop(samps_2[[i]], box$pattern, n.iter) - } - } - - samps_list <- c(list(samps_K1), samps_2) - names(samps_list) <- paste0("K", 1:max_K) - return(samps_list) - - } else { - names(samps_K1) <- "K1" - return(samps_K1) - } - -} - -formatChains <- function(samps) { - temp_z <- get.parameter.chain("z", ggmcmc::ggs(samps)) %>% - mutate(Parameter = as.character(Parameter)) - temp_w <- get.parameter.chain("w", ggmcmc::ggs(samps)) %>% - mutate(Parameter = as.character(Parameter)) - if (nrow(temp_w) == 0) { - temp_w <- get.parameter.chain("w", ggmcmc::ggs(samps) %>% mutate(Parameter = gsub("w","w[1,1]",Parameter))) %>% - mutate(Parameter = as.character(Parameter)) - } - temp_ystar <- get.parameter.chain("ystar", ggmcmc::ggs(samps)) %>% - mutate(Parameter = as.character(Parameter)) - samps_list_formatted <- list(w_chain = temp_w, - z_chain = temp_z, - ystar_chain = temp_ystar) - return(samps_list_formatted) -} - -runMutSetMCMC <- function(temp_box, - n.iter = 10000, n.burn = 1000, thin = 10, mc.cores = 1, - inits = list(".RNG.name" = "base::Wichmann-Hill", - ".RNG.seed" = 123), - temp_max_K = 5, - model_type = "spike_and_slab", - params = c("z", "w", "ystar"), - beta.prior = FALSE, - drop_zero = FALSE) { - - # Run MCMC - if (temp_max_K == 1) { - # only 1 possible cluster - temp_samps_list <- runMCMCForABox(temp_box, - n.iter = n.iter, n.burn = n.burn, - thin = thin, mc.cores = mc.cores, - inits = inits, - params = params, - max_K = temp_max_K, - drop_zero = drop_zero) - } else { - # assess range of K: [1, temp_max_K] - temp_samps_list <- runMCMCForABox(temp_box, - n.iter = n.iter, n.burn = n.burn, - thin = thin, mc.cores = mc.cores, - inits = inits, - params = params, - max_K = temp_max_K, - model = model_type, - beta.prior = beta.prior, - drop_zero = drop_zero) - } - - # Format chains - if (drop_zero && temp_box$I == 1) { - samps_list <- list(formatChains(temp_samps_list)) - names(samps_list) <- "K1" - } else { - samps_list <- parallel::mclapply(temp_samps_list, formatChains, - mc.cores = mc.cores) - } - - # Calculate BIC - K_tested <- seq_len(temp_max_K) - if (temp_max_K > 1) { - box_indata <- getBoxInputData(temp_box) - bic_vec <- unname(unlist(parallel::mclapply(samps_list, - function(chains) calcChainBIC(chains, box_indata), - mc.cores = mc.cores))) - bic_tb <- tibble(K_tested = K_tested, - BIC = bic_vec) - best_chains <- samps_list[[which.min(bic_vec)]] - res_list <- list(all_chains = samps_list, - BIC = bic_tb, - best_chains = best_chains, - best_K = which.min(bic_vec)) - } else { - # only 1 variant, so must be 1 cluster and don't need to check BIC - res_list <- list(all_chains = samps_list, - BIC = NA, - best_chains = samps_list[[1]], - best_K = 1) - } - - return(res_list) -} - - - - - - -#' Collect chains for best K of each mutation set -#' -#' @export -#' @param all_set_results List of MCMC results for each mutation set; returned by \code{clusterSep} -#' @param chosen_K (Optional) Vector of K to choose for each mutation set, in the same order as all_set_results. If left blank, function will select best K automatically selected by \code{clusterSep} -collectBestKChains <- function(all_set_results, chosen_K = NULL) { - if (is.null(chosen_K)) { - best_set_chains <- lapply(all_set_results, function(x) x$best_chains) - } else { - best_set_chains <- mapply(function(set_res, choose_K) set_res$all_chains[[choose_K]], - set_res = all_set_results, - chosen_K, - SIMPLIFY = FALSE) - } - return(best_set_chains) -} - -#' Relabel chains for all sets and merge -#' -#' @export -#' @import dplyr -#' @param best_set_chains List of lists of MCMC chains (w_chain, z_chain, ystar_chain) for each mutation set -#' @param indata List of input data objects (same as provided to clusterSep) -mergeSetChains <- function(best_set_chains, indata) { - best_K_vals <- unname(sapply(best_set_chains, function(x) max(x$z_chain$value))) - sep_list <- separateMutationsBySamplePresence(indata) - - # first set doesn't need to change cluster labels - w_chain <- best_set_chains[[1]]$w_chain - temp_z_chain <- best_set_chains[[1]]$z_chain - temp_ystar_chain <- best_set_chains[[1]]$ystar_chain - - if (length(best_set_chains) > 1) { - # still need to change mutation indices if more than 1 box - z_chain <- relabel_z_chain_mut_only(temp_z_chain, sep_list[[1]]$mutation_indices) - ystar_chain <- relabel_ystar_chain(temp_ystar_chain, - sep_list[[1]]$mutation_indices) - for (i in 2:length(best_set_chains)) { - temp_w_chain <- best_set_chains[[i]]$w_chain - temp_z_chain <- best_set_chains[[i]]$z_chain - temp_ystar_chain <- best_set_chains[[i]]$ystar_chain - new_cluster_labels <- seq_len(best_K_vals[i]) + sum(best_K_vals[1:(i-1)]) - - temp_relabeled_w_chain <- relabel_w_chain(temp_w_chain, new_cluster_labels) - temp_relabeled_z_chain <- relabel_z_chain(temp_z_chain, new_cluster_labels, - sep_list[[i]]$mutation_indices) - temp_relabeled_ystar_chain <- relabel_ystar_chain(temp_ystar_chain, - sep_list[[i]]$mutation_indices) - - w_chain <- rbind(w_chain, temp_relabeled_w_chain) - z_chain <- rbind(z_chain, temp_relabeled_z_chain) - ystar_chain <- rbind(ystar_chain, temp_relabeled_ystar_chain) - } - } else { - z_chain <- temp_z_chain - ystar_chain <- temp_ystar_chain - } - - # set levels for Parameter - w_chain <- w_chain %>% - mutate(k = as.numeric(gsub("w\\[", "", - sapply(w_chain$Parameter, - function(x) strsplit(as.character(x), ",")[[1]][1])))) %>% - mutate(s = as.numeric(gsub("\\]", "", - sapply(w_chain$Parameter, - function(x) strsplit(as.character(x), ",")[[1]][2])))) %>% - arrange(k, s) %>% - mutate(Parameter = factor(Parameter, levels = unique(w_chain$Parameter))) %>% - select(Iteration, Chain, Parameter, value) - - z_chain_param_order <- tibble(Parameter = unique(z_chain$Parameter)) %>% - mutate(Variant = as.numeric(gsub("z\\[", "", - gsub("\\]", "", - unique(z_chain$Parameter))))) %>% - arrange(Variant) - z_chain <- z_chain %>% - mutate(Parameter = factor(Parameter, levels = z_chain_param_order$Parameter)) - - ystar_chain <- ystar_chain %>% - mutate(Mutation_index = as.numeric(gsub("ystar\\[", "", - sapply(ystar_chain$Parameter, - function(x) strsplit(as.character(x), ",")[[1]][1]))), - s = as.numeric(gsub("\\]", "", - sapply(ystar_chain$Parameter, - function(x) strsplit(as.character(x), ",")[[1]][2])))) - ystar_chain <- ystar_chain %>% - arrange(Mutation_index, s) %>% - mutate(Parameter = factor(Parameter, levels = unique(Parameter))) - - chains <- list(w_chain = w_chain, - z_chain = z_chain, - ystar_chain = ystar_chain) - return(chains) -} - -relabel_w_chain <- function(w_chain, new_cluster_labels) { - # new_cluster_labels = numeric vector of labels that map to 1:length(new_cluster_labels) - new_w <- w_chain %>% - mutate(k = as.numeric(gsub("w\\[", "", - sapply(w_chain$Parameter, - function(x) strsplit(as.character(x), ",")[[1]][1])))) %>% - mutate(s = as.numeric(gsub("\\]", "", - sapply(w_chain$Parameter, - function(x) strsplit(as.character(x), ",")[[1]][2])))) - if (length(new_cluster_labels) != length(unique(new_w$k))) { - stop("number of supplied new cluster labels does not match the number of clusters in w_chain") - } - new_w <- new_w %>% - mutate(k_new = new_cluster_labels[new_w$k]) %>% - mutate(Parameter = paste0("w[", k_new, ",", s, "]")) %>% - select(Iteration, Chain, Parameter, value) - return(new_w) -} - -relabel_z_chain <- function(z_chain, new_cluster_labels, mutation_indices) { - # new_cluster_labels = numeric vector of labels that map to 1:length(new_cluster_labels) - # mutation_indices = numeric vector of original mutation indices prior to separating by sample presence - if (length(mutation_indices) != length(unique(z_chain$Parameter))) { - stop("number of supplied mutation indices does not match the number of mutations in z_chain") - } - ## would break when no mutation is assigned to a cluster - ## poor choice of k, would prob lower the k - if (length(new_cluster_labels) < length(unique(z_chain$value))) { - stop("number of supplied new cluster labels does not match the number of clusters in z_chain") - } - new_z <- z_chain %>% - mutate(i = as.numeric(gsub("\\]", "", - gsub("z\\[", "", - sapply(z_chain$Parameter, - function(x) strsplit(as.character(x), ",")[[1]][1]))))) - new_z <- new_z %>% - mutate(new_i = mutation_indices[i], - value = new_cluster_labels[new_z$value]) %>% - mutate(Parameter = paste0("z[", new_i, "]")) %>% - arrange(new_i) %>% - select(Iteration, Chain, Parameter, value) - return(new_z) -} - -relabel_ystar_chain <- function(ystar_chain, mutation_indices) { - # mutation_indices = numeric vector of original mutation indices prior to separating by sample presence - i_s <- gsub("ystar\\[|]", "", ystar_chain$Parameter) - i <- sapply(i_s, function(x) strsplit(x, ",")[[1]][1]) %>% - as.numeric - s <- sapply(i_s, function(x) strsplit(x, ",")[[1]][2]) %>% - as.numeric - new_ystar <- ystar_chain %>% - mutate(i = i, - s = s) - new_ystar <- new_ystar %>% - mutate(new_i = mutation_indices[i]) %>% - mutate(Parameter = paste0("ystar[", new_i, ",", s, "]")) %>% - arrange(new_i) %>% - select(Iteration, Chain, Parameter, value) - return(new_ystar) -} - -relabel_z_chain_mut_only <- function(z_chain, mutation_indices) { - # mutation_indices = numeric vector of original mutation indices prior to separating by sample presence - # cluster labels are left unchanged - if (length(mutation_indices) != length(unique(z_chain$Parameter))) { - stop("number of supplied mutation indices does not match the number of mutations in z_chain") - } - new_z <- z_chain %>% - mutate(i = as.numeric(gsub("\\]", "", - gsub("z\\[", "", - sapply(z_chain$Parameter, - function(x) strsplit(as.character(x), ",")[[1]][1]))))) - new_z <- new_z %>% - mutate(new_i = mutation_indices[i]) %>% - mutate(Parameter = paste0("z[", new_i, "]")) %>% - arrange(new_i) %>% - select(Iteration, Chain, Parameter, value) - return(new_z)} \ No newline at end of file diff --git a/R/clustering_functions.R b/R/clustering_functions.R deleted file mode 100644 index 98f6014..0000000 --- a/R/clustering_functions.R +++ /dev/null @@ -1,481 +0,0 @@ -#' @import rjags -#' @importFrom epiR epi.betabuster -runMCMC <- function(data, K, jags.file, inits, params, - n.iter=10000, thin=10, n.chains=1, - n.adapt=1000, n.burn=1000, - beta.prior=FALSE) { - if (K > 1) data$K <- K - jags.m <- jags.model(jags.file, - data, - n.chains = n.chains, - inits = inits, - n.adapt = n.adapt) - if (n.burn > 0) update(jags.m, n.burn) - samps <- coda.samples(jags.m, params, n.iter=n.iter, thin=thin) - - if (beta.prior & K > 1) { - # use initial MCMC to estimate beta priors for identified clusters - initial_chains <- formatChains(samps) - est_ccfs <- initial_chains$w_chain %>% - estimateCCFs %>% - as.data.frame %>% - magrittr::set_colnames(1:ncol(.)) %>% - tibble %>% - mutate(cluster = 1:nrow(.)) %>% - pivot_longer(cols = colnames(.)[colnames(.) != "cluster"], - names_to = "sample", - values_to = "ccf") %>% - mutate(sample = as.numeric(sample)) - - - beta_shapes <- lapply(est_ccfs$ccf, estimateBetaPriors) - cluster_beta_params <- est_ccfs %>% - mutate(shape1 = sapply(beta_shapes, function(x) x[1]), - shape2 = sapply(beta_shapes, function(x) x[2])) - - cluster_shape1 <- cluster_beta_params %>% - select(cluster, shape1, sample) %>% - pivot_wider(names_from = sample, - values_from = shape1) %>% - select(-c(cluster)) %>% - as.matrix - cluster_shape2 <- cluster_beta_params %>% - select(cluster, shape2, sample) %>% - pivot_wider(names_from = sample, - values_from = shape2) %>% - select(-c(cluster)) %>% - as.matrix - - # run second MCMC with specified beta priors - data$cluster_shape1 <- cluster_shape1 - data$cluster_shape2 <- cluster_shape2 - - extdir <- system.file("extdata", package="pictograph") - jags.file.beta <- file.path(extdir, "model-simple-set-beta.jags") - - jags.m.beta <- rjags::jags.model(jags.file.beta, - data, - n.chains = n.chains, - inits = inits, - n.adapt = n.adapt) - if (n.burn > 0) update(jags.m.beta, n.burn) - samps.beta <- rjags::coda.samples(jags.m.beta, params, n.iter=n.iter, thin=thin) - return(samps.beta) - } - - return(samps) -} - -#' @importFrom epiR epi.betabuster -estimateBetaPriors <- function(ccf) { - if (ccf == 0) ccf <- 0.001 - suppressWarnings( - param_list <- epiR::epi.betabuster(ccf, - conf = 0.5, - greaterthan = TRUE, - ccf, - conf.level = 0.8, - max.shape1 = 10) - ) - - if (length(param_list$shape1) > 1) { - shape1 <- mean(param_list$shape1) - } else { - shape1 <- param_list$shape1 - } - if (length(param_list$shape2) > 1) { - shape2 <- mean(param_list$shape2) - } else { - shape2 <- param_list$shape2 - } - return(c(shape1, shape2)) -} - - -getParamChain <- function(samps, param) { - chains <- do.call(rbind, samps) - chain <- chains[, grep(param, colnames(chains))] - chain -} - -relabelZ <- function(z.chain, I, K){ - mcmc_z <- z.chain %>% - group_by(Parameter, value) %>% - summarize(n=n(), - maxiter=max(Iteration)) %>% - mutate(probability=n/maxiter) - map_z <- mcmc_z %>% - group_by(Parameter) %>% - summarize(map=value[probability==max(probability)]) %>% - mutate(original_label=rep(1:K, each=I/K)) %>% - ungroup() %>% - group_by(original_label) %>% - summarize(value=names(sort(table(map), decreasing=TRUE))[1], - value=as.numeric(value)) %>% - ungroup() - sim_cluster <- mcmc_z %>% - group_by(Parameter) %>% - summarize(n=n()) %>% - mutate(sim_cluster=rep(1:K, each=I/K)) %>% - select(-n) - mcmc_z <- z.chain %>% - left_join(map_z, by="value") %>% - left_join(sim_cluster, by="Parameter") %>% - mutate(chain_value=value, - value=original_label) %>% - select(-original_label) %>% - group_by(Parameter, value) %>% - summarize(n=n(), - maxiter=max(Iteration), - sim_cluster=unique(sim_cluster)) %>% - mutate(probability=n/maxiter) - ##filter(value==sim_cluster) -} - -get_mode <- function(v) { - uniqv <- unique(v) - uniqv[which.max(tabulate(match(v, uniqv)))] -} - -orderW <- function(w, map_z){ - mcmc_cluster_numbering <- matrix(map_z$value, nrow = 10) - - mcmc_cluster_numbering <- apply(mcmc_cluster_numbering, 2, get_mode) - true_to_mcmc_w_ordering <- match(1:K, mcmc_cluster_numbering) - w_ordered <- w[true_to_mcmc_w_ordering, ] - mcmc_vals <- w.chain %>% - group_by(Parameter) %>% - summarize(mean=mean(value), - q1=quantile(value, 0.025), - q3=quantile(value, 0.975)) %>% - mutate(truth=as.numeric(t(w_ordered))) - mcmc_vals -} - - -#' @importFrom stringr str_replace -numberClusters <- function(mcf_stats){ - params <- as.character(mcf_stats$Parameter) - K <- strsplit(params, ",") %>% - sapply("[", 1) %>% - str_replace("w\\[", "") %>% - as.numeric() %>% - max() - K -} - -#' @importFrom stringr str_replace -numberSamples <- function(mcf_stats){ - params <- as.character(mcf_stats$Parameter) - nSamples <- strsplit(params, ",") %>% - sapply("[", 2) %>% - stringr::str_replace("\\]", "") %>% - as.numeric() %>% - max() - nSamples -} - -mcfMatrix <- function(mcf_stats, parameter="mean"){ - K <- numberClusters(mcf_stats) - S <- numberSamples(mcf_stats) - if(parameter=="mean") - MCF <- matrix(mcf_stats$mean, K, S, byrow=TRUE) - if(parameter=="sd") - MCF <- matrix(mcf_stats$sd, K, S, byrow=TRUE) - MCF -} - -##base.admat <- function(MCF, zero.thresh=0.01) { -chec.mcf_stats.format <- function(mcf_stats) { - if(all.equal(colnames(mcf_stats), c("Parameter", "sd", "mean"))) TRUE - else FALSE -} - - -sample.w <- function(w.chain, K) { - numIter <- max(w.chain$Iteration) - randIter <- sample(numIter, size = 1) - w.sample <- w.chain[w.chain$Iteration == randIter, ] - matrix(w.sample$value, nrow = K, byrow = T) -} - -#' Determine most probable mutation cluster assignments by taking those with highest posterior probability -#' -#' @export -#' @param z_chain MCMC chain of mutation cluster assignment values, which is the second item in the list returned by \code{clusterSep} -estimateClusterAssignments <- function(z_chain) { - it <- max(z_chain$Iteration) - mcmc_z <- z_chain %>% - group_by(Parameter, value) %>% - summarize(n=n(), - maxiter=it) %>% - mutate(probability=n/maxiter) %>% - ungroup() - map_z <- mcmc_z %>% - group_by(Parameter) %>% - summarize(value=value[probability==max(probability)]) - - # choose first cluster if equal probability - map_z_count <- map_z %>% - group_by(Parameter) %>% - summarize(map_count = n()) %>% - ungroup() - if (any(map_z_count$map_count > 1)) { - mut_ind <- which(map_z_count$map_count > 1) - for (i in mut_ind) { - dup_var <- as.numeric(gsub("z\\[|]", "", map_z_count$Parameter[i])) - map_z_dups <- which(gsub("z\\[|]", "", map_z$Parameter) == dup_var) - dup_ind <- map_z_dups[-1] - map_z <- map_z[-dup_ind, ] - } - } - return(map_z) -} - -#' Determine most probable mutation cluster assignments by taking those with highest posterior probability. -#' -#' @export -#' @param z_chain MCMC chain of mutation cluster assignment values, which is the second item in the list returned by \code{clusterSep} -#' @param Mut_ID Vector of mutation IDs, same order as provided as input data (e.g. indata$Mut_ID) -#' @return A tibble listing mutation IDs and their cluster assignments -writeClusterAssignmentsTable <- function(z_chain, Mut_ID = NULL) { - map_z <- estimateClusterAssignments(z_chain) - if (is.null(Mut_ID)) { - Mut_ID <- paste0("Mut", 1:nrow(map_z)) - } - map_z <- map_z %>% - mutate(Mut_ID = Mut_ID, Cluster = value) %>% - select(Mut_ID, Cluster) %>% - arrange(Cluster) - # map_z <- map_z %>% - # add_column(Mut_ID = Mut_ID) %>% - # mutate(Cluster = value) %>% - # select(Mut_ID, Cluster) %>% - # arrange(Cluster) - return(map_z) -} - -#' Determine the most probable cluster CCF values by taking the mode of the posterior distributions -#' -#' @export -#' @param w_chain MCMC chain of CCF values, which is the first item in the list returned by \code{clusterSep} -#' @return matrix of estimated cluster CCFs -estimateCCFs <- function(w_chain) { - S <- numberSamples(w_chain) - K <- numberClusters(w_chain) - # density plot - w.dens <- ggplot(w_chain, aes(x = value)) + - geom_density() + - facet_wrap(~Parameter, ncol = S, scales = "free_y") + - theme_light() - # find peak for MAP w - w.dens.p <- ggplot_build(w.dens)$data[[1]] - w.map <- w.dens.p %>% - as_tibble() %>% - group_by(PANEL) %>% - summarize(value = x[max(y) == y]) - w.map <- w.map %>% - mutate(Parameter = unique(w_chain$Parameter), - value_rounded = round(value, 2)) - # return w matrix - w.map.matrix <- matrix(w.map$value_rounded, K, S, byrow=TRUE) - return(w.map.matrix) -} - -#' Determine the most probable cluster CCF values by taking the mode of the posterior distributions -#' -#' @export -#' @param w_chain MCMC chain of CCF values, which is the first item in the list returned by \code{clusterSep} -#' @param Sample_ID Vector of sample IDs, same order as provided as input data (e.g. indata$Sample_ID) -#' @return A tibble of estimated cluster CCFs in each sample -writeClusterCCFsTable <- function(w_chain, Sample_ID = NULL) { - map_w <- as.data.frame(estimateCCFs(w_chain)) - - if (is.null(Sample_ID)) { - Sample_ID <- paste0("Sample ", 1:ncol(map_w)) - } - colnames(map_w) <- Sample_ID - map_w <- map_w %>% - as_tibble() %>% - bind_cols(tibble(Cluster = 1:nrow(map_w)), .) - return(map_w) -} - -# function to check if clustering respects sample presence -test.pres <- function(samps, pres.tb) { - # input: samps = one item in samps.list - # pres.tb = tibble with columns mut_id and sample_presence (label of sample-presence set) - # output: TRUE or FALSE - chains <- ggs(samps) - z.chain <- get.parameter.chain("z", chains) - map.z <- get.map.z(z.chain) - - # check that clusters respect sample presence - pres.tb$cluster <- map.z$Cluster[match(pres.tb$mut_id, map.z$Mutation)] - clusts <- unique(pres.tb$cluster) - pres.equal <- all(sapply(clusts, function(x) - length(unique(pres.tb$sample_presence[pres.tb$cluster == x])) == 1)) -} - -plot_ppd <- function(samps, test.data, K) { - ## - ## 50 mutations x 10 samples - chains <- do.call(rbind, samps) - ystar <- chains[, grep("ystar", colnames(chains))] - ## each row of MCMC is in column-major order - orig.order <- tibble(statistic=colnames(ystar)) - ppd.summaries <- ystar %>% - as_tibble() %>% - gather("statistic", "value") %>% - group_by(statistic) %>% - summarize(mean=mean(value), - q1=quantile(value, 0.025), - q3=quantile(value, 0.975)) - ppd.summaries2 <- left_join(orig.order, - ppd.summaries, by="statistic") %>% - mutate(observed=as.numeric(test.data$y)) %>% - mutate(sample=paste0("sample", rep(1:test.data$S, each=test.data$I)), - variant=rep(1:test.data$I, test.data$S)) - ggplot(ppd.summaries2, aes(x=statistic, y=mean, - ymin=q1, - ymax=q3)) + - geom_errorbar() + - geom_point(aes(x=statistic, y=observed), - size=1, color="steelblue") + - theme(axis.text.x=element_blank(), - axis.ticks.x=element_blank(), - panel.background=element_rect(fill="white", - color="black")) + - ylab("Middle 95% of posterior\npredictive distribution") + - xlab("observation index (column-major order)") + - facet_wrap(~sample) + - ggtitle(paste0("K = ", K)) -} - - -simulateVAF <- function(mcf, nvarClust, avg_depth=100, sd_depth=20){ - nClust <- nrow(mcf) - nSamp <- ncol(mcf) - nVariants <- sum(nvarClust) - nObs <- nVariants * nSamp - stopifnot(length(nvarClust) == nClust) - ##pi <- rep(1/3, 3) - ## 10 mutations for each of the 3 clusters - z <- rep(1:nClust, nvarClust) - - MCF <- mcf[z, ] - dimnames(MCF) <- list(paste0("variant", seq_len(nVariants)), - paste0("sample", seq_len(nSamp))) - mult <- tcn <- MCF; - tcn[,] <- 2 - mult[,] <- sample(1:2, nVariants*nSamp, replace=TRUE) - vaf <- (mult * MCF)/(tcn * MCF + 2*(1-MCF)) - ## add more variation to sequencing depth - mus <- rnorm(nVariants, avg_depth, sd_depth) - mus <- ifelse(mus < 0, 0, mus) - depth <- matrix(rpois(nObs, mus), nVariants, nSamp) - y <- rbinom(nObs, as.numeric(depth), as.numeric(vaf)) - y <- ifelse(y < 0, 0, y) - tibble(variant=rep(seq_len(nVariants), nSamp), - sample=rep(seq_len(nSamp), each=nVariants), - cluster=rep(z, nSamp), - y=y, - n=as.numeric(depth), - multiplicity=as.numeric(mult), - copy_number=as.numeric(tcn), - mcf=as.numeric(MCF)) -} - - -listJagInputs <- function(dat){ - w <- group_by(dat, cluster, sample) %>% - summarize(mcf=unique(mcf)) %>% - ungroup() %>% - spread(sample, mcf) %>% - select(-cluster) %>% - as.matrix() - K <- length(unique(dat$cluster)) - tcn <- dat %>% - select(variant, sample, copy_number) %>% - spread(sample, copy_number) %>% - select(-variant) %>% - as.matrix() - S <- length(unique(dat$sample)) - m <- dat %>% - select(variant, sample, multiplicity) %>% - spread(sample, multiplicity) %>% - select(-variant) %>% - as.matrix() - y <- dat %>% - select(variant, sample, y) %>% - spread(sample, y) %>% - select(-variant) %>% - as.matrix() - n <- dat %>% - select(variant, sample, n) %>% - spread(sample, n) %>% - select(-variant) %>% - as.matrix() - I <- length(unique(dat$variant)) - - list(I=I, S=S, K=K, - y=y, n=n, m=m, tcn=tcn) -} - -getmode <- function(v) { - uniqv <- unique(v) - uniqv[which.max(tabulate(match(v, uniqv)))] -} - -get.z.mapping <- function(true.z, z.chain) { - mcmc_z <- z.chain %>% - group_by(Parameter, value) %>% - summarize(n=n(), - maxiter=max(Iteration)) %>% - mutate(probability=n/maxiter) %>% - ungroup() - map_z <- mcmc_z %>% - group_by(Parameter) %>% - summarize(value=value[probability==max(probability)]) %>% - mutate(Variant = 1:length(true.z), true_z = true.z) - z_mappings <- map_z %>% - select(Variant, true_z, value) %>% - rename("mcmc_z" = "value") - z_mappings <- z_mappings %>% - group_by(true_z) %>% - summarize(mcmc_z = getmode(mcmc_z)) - z_mappings -} - -relabel.w.z.chains <- function(true.z, chains) { - w.chain <- get.parameter.chain("w", chains) - z.chain <- get.parameter.chain("z", chains) - - z_mappings <- get.z.mapping(true.z, z.chain) - - # relabel w.chain - w.chain.relabeled <- w.chain %>% - mutate(k = as.numeric(gsub("w\\[", "", sapply(w.chain$Parameter, function(x) strsplit(as.character(x), ",")[[1]][1])))) %>% - mutate(s = gsub("\\]", "", sapply(w.chain$Parameter, function(x) strsplit(as.character(x), ",")[[1]][2]))) - w.chain.relabeled <- w.chain.relabeled %>% - mutate(new_k = match(w.chain.relabeled$k, z_mappings$mcmc_z)) - w.chain.relabeled <- w.chain.relabeled %>% - mutate(new_Parameter = paste0("w[", new_k, ",", s, "]")) %>% - arrange(new_k, s) - w.chain.relabeled <- w.chain.relabeled %>% - select(Iteration, Chain, new_Parameter, value) %>% - rename("Parameter" = "new_Parameter") - w.chain.relabeled <- w.chain.relabeled %>% - mutate(Parameter = factor(w.chain.relabeled$Parameter, levels = unique(w.chain.relabeled$Parameter))) - - # relabel z.chain - z.chain.relabeled <- z.chain %>% - mutate(new_value = match(value, z_mappings$mcmc_z)) - z.chain.relabeled <- z.chain.relabeled %>% - select(Iteration, Chain, Parameter, new_value) %>% - rename("value" = "new_value") - - list(w.chain=w.chain.relabeled, - z.chain=z.chain.relabeled) -} - diff --git a/R/estimate-multiplicity.R b/R/estimate-multiplicity.R new file mode 100644 index 0000000..071a3b2 --- /dev/null +++ b/R/estimate-multiplicity.R @@ -0,0 +1,91 @@ +get95CI <- function(y, n) { + test <- prop.test(y, n, correct = T) + ci_lower <- test$conf.int[1] + ci_upper <- test$conf.int[2] + return(c(ci_lower, ci_upper)) +} + +calcmC <- function(VAF, purity, tcn) { + mC <- (VAF * (purity * tcn + (1-purity) * 2)) / purity + return(mC) +} + +getmCCI <- function(y, n, purity, tcn) { + VAF_CI <- get95CI(y, n) + lower_mC <- calcmC(VAF_CI[1], purity, tcn) + upper_mC <- calcmC(VAF_CI[2], purity, tcn) + return(c(lower_mC, upper_mC)) +} + +assignMultiplicity <- function(lower_mC, upper_mC, tcn) { + # (1) If the CI for mC overlaps an integer value, + # that value is estimated to indicate the multiplicity + # of the mutation and the mutation is clonal (C=1) + if (ceiling(lower_mC) == floor(upper_mC)) { + test_m <- ceiling(lower_mC) + # cap m at tcn + if (test_m > tcn) return (tcn) + return(test_m) + } + + # (2) If the upper bound of the CI for mC is below 1, + # the multiplicity is set to 1, and the mutation is + # subclonal, unless the resulting estimate for C is + # within a tolerance threshold (0.25) of 1 + if (upper_mC < 1) return(1) + + # (3) If the CI for mC is above 1 and does not overlap + # any integer values, multiplicity is greater than 1 + # and m is set such that the confidence interval for C + # falls within the expected intervals of [0,1] + if (lower_mC > 1) { + keep_going <- TRUE + m <- 1 + while(keep_going) { + # cap m at tcn + if (m == tcn) return(m) + + m <- m + 1 + C_lower <- lower_mC / m + C_upper <- upper_mC / m + if (C_lower > 0 & C_upper < 1) { + keep_going <- FALSE + } + } + return(m) + } + + # CI for mC spans more than 1 integer value including 1 + if (lower_mC <= 1 && upper_mC >= 1) return(1) + + return(NA) +} + +estimateMultiplicity1 <- function(y, n, tcn) { + # if no variant reads, assigning multiplicity of 1 + if (y == 0) return(1) + mC_CI <- getmCCI(y, n, 0.9, tcn) + m <- max(1, assignMultiplicity(mC_CI[1], mC_CI[2], tcn)) + return(m) +} + +#' @export +estimateMultiplicityMatrix <- function(data) { + Y = data$y + N = data$n + Tcn = data$tcn + I = data$I + S = data$S + M = matrix(100, I, S) + + for (i in 1:I) { + for (s in 1:S) { + y = Y[[i, s]] + n = N[[i, s]] + tcn = Tcn[[i, s]] + m = estimateMultiplicity1(y, n, tcn) + M[i, s] = m + } + } + return(M) +} \ No newline at end of file diff --git a/R/gabowmyers.R b/R/gabowmyers.R deleted file mode 100644 index cb96d16..0000000 --- a/R/gabowmyers.R +++ /dev/null @@ -1,390 +0,0 @@ -pop <- function(edges_tb, tb_name) { - assign(tb_name, edges_tb[-1, ], envir = .GlobalEnv) - return(edges_tb[1, ]) -} - -verticesInGraph <- function(tb) { - unique(c(tb$parent, tb$child)) -} - -bridgeTest <- function(graph_G, edge_e) { - node_to_check <- edge_e$child - !(node_to_check %in% graph_G$child) -} - -bridgeTestBFS <- function(graph_G, edge_e) { - node_to_check <- edge_e$child - - nodes_connected_to_root <- bfsLong2(graph_G) - !(node_to_check %in% nodes_connected_to_root) -} - -bfsLong2 <- function(graph_G) { - # returns vector of nodes in main tree (connected to root) including "root" - # starting at root - # does not stop if there is a cycle in graph - graph_G$parent <- as.character(graph_G$parent) - children <- graph_G[(graph_G$parent == "root"), ]$child - nodes <- c("root", children) - - while(length(children) > 0) { - c <- children[1] - temp.children <- graph_G[(graph_G$parent == c), ]$child - - # remove children already seen - if (any(temp.children %in% nodes)) { - temp.children <- temp.children[! temp.children %in% nodes] - } - - children <- c(children, temp.children) - - nodes <- c(nodes, temp.children) - children <- children[-1] - } - return(nodes) -} - -grow <- function(tree_T, all_vertices, w, thresh=0.2) { - - if (length(verticesInGraph(tree_T)) == length(all_vertices) & nrow(tree_T) == (length(all_vertices)-1)) { - assign("all_spanning_trees", c(all_spanning_trees, list(tree_T)), envir = .GlobalEnv) - - if (satisfiesSumCondition(tree_T, w, thresh)) { - assign("filtered_trees", c(filtered_trees, list(tree_T)), envir = .GlobalEnv) - } - - } else { - FF <- tibble(parent = character(), child = character()) - - bridge <- FALSE - while(!bridge) { - # new tree edge - if (nrow(F_tb) == 0) stop("F_tb is empty") - edge_e <- pop(F_tb, "F_tb") - v <- edge_e$child - tree_T <- rbind(tree_T, edge_e) - - # update F - ## push each edge (v,w), w not in T onto F - in_T <- verticesInGraph(tree_T) - temp_add_to_F <- filter(graph_G, parent == v, !(child %in% in_T)) - # temp_add_to_F - assign("F_tb", rbind(temp_add_to_F, F_tb), envir = .GlobalEnv) - - ## remove each edge (w,v), w in T from F - w_in_T <- verticesInGraph(tree_T) - removed_edges <- filter(F_tb, parent %in% w_in_T, child == v) - assign("F_tb", filter(F_tb, !edge %in% removed_edges$edge), envir = .GlobalEnv) - - # recurse - grow(tree_T, all_vertices, w, thresh) - #tree_L <- all_spanning_trees[[length(all_spanning_trees)]] - - # restore F - # pop each edge (v,w), w not in T, from F - not_in_T <- all_vertices[!all_vertices %in% verticesInGraph(tree_T)] - if (length(not_in_T) > 0 & nrow(F_tb) > 0) { - edges_to_remove_9 <- paste0(v, "->", not_in_T) - assign("F_tb", filter(F_tb, !edge %in% edges_to_remove_9), envir = .GlobalEnv) - } - # restore each edge (w,v), w in T, in F - assign("F_tb", rbind(removed_edges, F_tb), envir = .GlobalEnv) - - # delete e from T and from G, add e to FF - tree_T <- tree_T[tree_T$edge != edge_e$edge, ] - assign("graph_G",graph_G[graph_G$edge != edge_e$edge, ], envir = .GlobalEnv) - FF <- rbind(edge_e, FF) - - # bridge test - bridge <- bridgeTestBFS(graph_G, edge_e) - } - - # pop each edge e from FF, push e onto F,and add e to G - if (nrow(FF) > 0) { - - # pop and push all edges at once (same order) - # assign("F_tb", rbind(FF, F_tb), envir = .GlobalEnv) - assign("graph_G", rbind(FF, graph_G), envir = .GlobalEnv) - - # pop and push edges one by one (rev order in F) - while (nrow(FF) > 0) { - assign("F_tb", rbind(FF[1, ], F_tb), envir = .GlobalEnv) - FF <- FF[-1, ] - } - } - } -} - -growModified <- function(tree_T, all_vertices, w, sum_thresh=0.2) { - - if (length(verticesInGraph(tree_T)) == length(all_vertices) & nrow(tree_T) == (length(all_vertices)-1)) { - assign("all_spanning_trees", c(all_spanning_trees, list(tree_T)), envir = .GlobalEnv) - - } else { - FF <- tibble(parent = character(), child = character()) - - bridge <- FALSE - while(!bridge) { - # new tree edge - if (nrow(F_tb) == 0) stop("F_tb is empty") - edge_e <- pop(F_tb, "F_tb") - v <- edge_e$child - tree_T <- rbind(tree_T, edge_e) - - # check if adding this node does not violate the constraint - if (satisfiesSumCondition(tree_T, w, sum_thresh)) { - # update F - ## push each edge (v,w), w not in T onto F - in_T <- verticesInGraph(tree_T) - temp_add_to_F <- filter(graph_G, parent == v, !(child %in% in_T)) - # temp_add_to_F - assign("F_tb", rbind(temp_add_to_F, F_tb), envir = .GlobalEnv) - - ## remove each edge (w,v), w in T from F - w_in_T <- verticesInGraph(tree_T) - removed_edges <- filter(F_tb, parent %in% w_in_T, child == v) - assign("F_tb", filter(F_tb, !edge %in% removed_edges$edge), envir = .GlobalEnv) - - # recurse - growModified(tree_T, all_vertices, w, sum_thresh) - - # restore F - # pop each edge (v,w), w not in T, from F - not_in_T <- all_vertices[!all_vertices %in% verticesInGraph(tree_T)] - if (length(not_in_T) > 0 & nrow(F_tb) > 0) { - edges_to_remove_9 <- paste0(v, "->", not_in_T) - assign("F_tb", filter(F_tb, !edge %in% edges_to_remove_9), envir = .GlobalEnv) - } - # restore each edge (w,v), w in T, in F - assign("F_tb", rbind(removed_edges, F_tb), envir = .GlobalEnv) - - } - # delete e from T and from G, add e to FF - tree_T <- tree_T[tree_T$edge != edge_e$edge, ] - assign("graph_G",graph_G[graph_G$edge != edge_e$edge, ], envir = .GlobalEnv) - FF <- rbind(edge_e, FF) - - # bridge test - bridge <- bridgeTestBFS(graph_G, edge_e) - } - - # pop each edge e from FF, push e onto F,and add e to G - if (nrow(FF) > 0) { - - # pop and push all edges at once (same order) - # assign("F_tb", rbind(FF, F_tb), envir = .GlobalEnv) - assign("graph_G", rbind(FF, graph_G), envir = .GlobalEnv) - - # pop and push edges one by one (rev order in F) - while (nrow(FF) > 0) { - assign("F_tb", rbind(FF[1, ], F_tb), envir = .GlobalEnv) - FF <- FF[-1, ] - } - } - } -} - -#' Create tibble of possible edges from CCF values based on sample-presence -#' -#' @export -#' @import tibble -#' @import dplyr -#' @import tidyr -#' @importFrom tidyr pivot_longer unite -#' @param w matrix of CCF values (rows = clusters, columns = samples) -#' @param chains list of MCMC chains (must contain w_chain and z_chain) -#' @param input_data list of input data; same as supplied for clustering -#' @return graph_G tibble of possible edges with columns edge, parent, child -prepareGraphForGabowMyers <- function(w, chains, input_data) { - graph_G <- constrainedEdges(w, chains, input_data) %>% - filter(possible_edge == TRUE) %>% - mutate(parent = as.character(parent)) %>% - select(edge, parent, child) - return(graph_G) -} - -#' Create tibble of possible edges from CCF values based on w_mat only -#' -#' @export -#' @import tibble -#' @import dplyr -#' @import tidyr -#' @param w matrix of CCF values (rows = clusters, columns = samples) -#' @return graph_G tibble of possible edges with columns edge, parent, child -prepareGraph <- function(w_mat, thresh) { - graph_pre <- data.frame(edge = character(), parent = character(), child = character()) - for (i in seq_len(nrow(w_mat))) { - graph_pre <- graph_pre %>% add_row(edge = paste("root->", i, sep = ""), parent = "root", child = as.character(i)) - for (j in seq_len(nrow(w_mat))) { - if (i!=j) { - i_row = w_mat[i, ] - j_row = w_mat[j, ] - if (all(j_row-i_row > -thresh)) { - graph_pre <- graph_pre %>% add_row(edge = paste(j, "->", i, sep = ""), parent = as.character(j), child = as.character(i)) - } - } - } - } - return(graph_pre) -} - -#' Enumerate all spanning trees and filter based on Sum Condition -#' -#' @export -#' @import tibble -#' @import dplyr -#' @import tidyr -#' @param graph_G tibble of possible edges with columns edge, parent, child -#' @param w matrix of CCF values (rows = clusters, columns = samples) -#' @param sum_filter_thresh thresh maximum allowed violation of Sum Condition (default = 0.2) -enumerateSpanningTrees <- function(graph_G, w, sum_filter_thresh=0.2) { - # all_spanning_trees must be set as an empty list, global variable, before function is called - # graph_G must be set as global variable before function is called - all_spanning_trees <- assign("all_spanning_trees", list(), envir = .GlobalEnv) - filtered_trees <- assign("filtered_trees", list(), envir = .GlobalEnv) - F_tb <- assign("F_tb", filter(graph_G, parent == "root"), envir = .GlobalEnv) - all_vertices <- verticesInGraph(graph_G) - tree_T <- tibble(parent = character(), child = character()) - - grow(tree_T, all_vertices, w, sum_filter_thresh) - -# return(list(all_spanning_trees = all_spanning_trees, -# filtered_trees = filtered_trees)) -} - -#' Enumerate all spanning trees using modified Gabow-Myers wrapper -#' -#' @export -#' @param w matrix of CCF values (rows = clusters, columns = samples) -#' @param lineage_precedence_thresh maximum allowed violation of lineage precedence (default = 0.1) -#' @param sum_filter_thresh thresh maximum allowed violation of Sum Condition (default = 0.2) -generateAllTrees <- function(w, lineage_precedence_thresh=0.1, sum_filter_thresh=0.2) { - w_mat <- estimateCCFs(w) - w_mat <- assign("w_mat", w_mat, envir = .GlobalEnv) - graph_G_pre <- prepareGraph(w_mat, lineage_precedence_thresh) - graph_G <- filterEdgesBasedOnCCFs(graph_G_pre, w_mat, thresh = lineage_precedence_thresh) - graph_G <- assign("graph_G", graph_G, envir = .GlobalEnv) - enumerateSpanningTreesModified(graph_G, w_mat, sum_filter_thresh = sum_filter_thresh) -} - -#' Enumerate all spanning trees using modified Gabow-Myers -#' -#' @export -#' @import tibble -#' @import dplyr -#' @import tidyr -#' @param graph_G tibble of possible edges with columns edge, parent, child -#' @param w matrix of CCF values (rows = clusters, columns = samples) -#' @param sum_filter_thresh thresh maximum allowed violation of Sum Condition (default = 0.2) -enumerateSpanningTreesModified <- function(graph_G, w, sum_filter_thresh=0.2) { - # all_spanning_trees must be set as an empty list, global variable, before function is called - # graph_G must be set as global variable before function is called - all_spanning_trees <- assign("all_spanning_trees", list(), envir = .GlobalEnv) - #filtered_trees <- assign("filtered_trees", list(), envir = .GlobalEnv) - F_tb <- assign("F_tb", filter(graph_G, parent == "root"), envir = .GlobalEnv) - all_vertices <- verticesInGraph(graph_G) - tree_T <- tibble(parent = character(), child = character()) - - growModified(tree_T, all_vertices, w, sum_filter_thresh) -} - -satisfiesSumCondition <- function(edges, w, thresh = 0.2) { - # returns TRUE if sum condition is not violated with given threshold (default 0.2) - - edges$parent <- as.character(edges$parent) - all_parents <- unique(edges$parent) - - for (p in all_parents) { - # get parent CCF - if (p == "root") { - parent_ccf <- rep(1, ncol(w)) - } else { - parent_ccf <- w[as.numeric(p), ] - } - - # get children CCF (sum if more than 1 child) - children <- as.numeric(filter(edges, parent == p)$child) - if (length(children) > 1) { - children_ccf <- colSums(w[children, ,drop=FALSE]) - } else { - children_ccf <- w[children, ] - } - - diff <- children_ccf - parent_ccf - if (any(diff > thresh)) return(FALSE) - } - - # sum condition is never violated, return TRUE - return(TRUE) -} - -#' Filter possible edges based on lineage precedence -#' -#' @export -#' @import tibble -#' @import dplyr -#' @import tidyr -#' @param graph_G tibble of possible edges with columns edge, parent, child -#' @param w matrix of CCF values (rows = clusters, columns = samples) -#' @param thresh maximum allowed violation of lineage precedence (default = 0.1) -filterEdgesBasedOnCCFs <- function(graph_G, w, thresh = 0.1) { - check_edges_logical <- apply(graph_G, 1, function(edge) checkEdge(edge, w, thresh)) - filtered_graph_G <- graph_G[check_edges_logical, ] - return(filtered_graph_G) -} - -checkEdge <- function(edge, w, thresh = 0.2) { - # returns TRUE if satisfies lineage precedence with given threshold - # returns FALSE if violates i.e. child_ccf - parent_ccf > thresh in any sample - # edge is in the format c(edge_name, parent, child) - - # in case of factors - p <- as.character(edge[2]) - c <- as.character(edge[3]) - - if (p == "root") { - parent_ccfs <- rep(1, ncol(w)) - } else { - parent_ccfs <- w[as.numeric(p), ] - } - child_ccfs <- w[as.numeric(c), ] - - diff <- child_ccfs - parent_ccfs - if (any(diff > thresh)) { - return(FALSE) - } else { - return(TRUE) - } -} - -splitGraphG <- function(graph_G, num_trees_per_run = 100000) { - # returns list of graph_Gs - # figure out which nodes for which to keep edges constant - # each split graph_G should have tree space upper bound < num_trees_per_run - num_edges_per_child <- graph_G %>% - group_by(child) %>% - summarize(n = n()) %>% - arrange(desc(n)) - - # split graph_G tree space is larger than num_trees_per_run - if (prod(num_edges_per_child$n) > num_trees_per_run) { - nodes_to_hold <- c() - for (i in seq_len(nrow(num_edges_per_child))) { - upper_bound_per_run <- prod(num_edges_per_child$n[i:nrow(num_edges_per_child)]) - if (upper_bound_per_run <= num_trees_per_run) break - nodes_to_hold <- c(nodes_to_hold, num_edges_per_child$child[i]) - } - - common_edges <- filter(graph_G, ! child %in% nodes_to_hold) - edges_to_split <- filter(graph_G, child %in% nodes_to_hold) %>% - group_by(child) %>% - group_split() - edges_list <- lapply(edges_to_split, function(x) x$edge) - all_combinations <- expand.grid(edges_list, stringsAsFactors = F) - split_graph_Gs <- apply(all_combinations, 1, function(x) rbind(filter(graph_G, edge %in% x), common_edges)) - return(split_graph_Gs) - - } else { - return(list(graph_G)) - } -} diff --git a/R/graph_ops.R b/R/graph_ops.R deleted file mode 100644 index f8c519f..0000000 --- a/R/graph_ops.R +++ /dev/null @@ -1,764 +0,0 @@ -constrainedEdgesMatrix <- function(wmat, chains, input_data) { - ## - ## Rules: - ## - cluster (node) cannot connect to itself - ## - a cluster with near-zero MCF cannot have children - ## - a cluster present in X multiple samples cannot connect to a cluster present in Y samples - ## if X < Y - ## - X < Y implies ... - ## - ##cluster.sample.presence <- apply(w, 1, function(x) which( x>= zero.thresh)) - samp_pres <- matchSamplePresence(wmat, chains, input_data) - if (input_data$S == 1) { - cluster.sample.presence <- lapply(samp_pres, function(x) which(x==1)) - } else { - cluster.sample.presence <- apply(samp_pres, 1, function(x) which(x==1)) - } - K <- nrow(wmat) - S <- ncol(wmat) - admat <- matrix(T, K, K) - - for(i in 1:K){ - for(j in 1:K){ - - if (is.matrix(cluster.sample.presence)) { - from.samples <- cluster.sample.presence[, i] - to.samples <- cluster.sample.presence[, j] - } else if (is.list(cluster.sample.presence)) { - from.samples <- cluster.sample.presence[[i]] - to.samples <- cluster.sample.presence[[j]] - } else if (is.vector(cluster.sample.presence)) { - from.samples <- cluster.sample.presence[i] - to.samples <- cluster.sample.presence[j] - } - - if (setequal(from.samples, to.samples)) next() - if(length(from.samples) < length(to.samples)) { - admat[i, j] <- F - next() - } - if (all(to.samples %in% from.samples)) next() - admat[i, j] <- F - } - } - diag(admat) <- F - am2 <- rbind(T, admat) - dimnames(am2) <- list(c("root", 1:K), 1:K) - return(am2) -} - -matchSamplePresence <- function(w_mat, chains, input_data) { - map_z <- estimateClusterAssignments(chains$z_chain) - - # pull one variant for each cluster - single_var_clust <- map_z[match(unique(map_z$value), map_z$value), ] %>% - mutate(mut_ind = as.numeric(gsub("z\\[|]", "", Parameter))) %>% - arrange(value) - - samp_pres <- ifelse(input_data$y[single_var_clust$mut_ind, ] >=1, 1, 0) - return(samp_pres) -} - - -## refactored base.admat -constrainedEdges <- function(wmat, chains, input_data) { - am2 <- constrainedEdgesMatrix(wmat, chains, input_data) - am2.long <- as_tibble(am2) %>% - mutate(parent=rownames(am2)) %>% - pivot_longer(-parent, - names_to="child", - values_to="possible_edge") %>% - filter(parent != child) %>% - unite("edge", c("parent", "child"), sep="->", - remove=FALSE) %>% - mutate(parent=factor(parent, levels=unique(parent))) %>% - mutate(connected=0) - am2.long -} - -calcConstrianedTreeSpace <- function(mcf_matrix, zero.thresh = 0.01) { - # input: - # - mcf_matrix = matrix of cell fraction values where rows are clusters, columns are samples - # - zero.thresh = minimum cell fraction to be considered "present" in sample (default = 0.01) - # output: number of possible trees, given constraints - ce <- constrainedEdgesMatrix(mcf_matrix, zero.thresh) - possible_from_edges_per_node <- colSums(ce) - tree_space <- prod(possible_from_edges_per_node) - return(tree_space) -} - -getAllNodes <- function(am.long) { - # returns vector of all nodes in graph - am.long$parent <- as.character(am.long$parent) - unique(c(am.long$parent, am.long$child)) -} - -reversedConnection <- function(am) { - connections <- setNames(am$connected, am$edge) - reversed_connections <- connections[am$reverse_edge] %>% - "["(!is.na(.)) - reversed <- setNames(rep(0, nrow(am)), am$reverse_edge) - reversed[names(reversed_connections)] <- reversed_connections - reversed -} - -isBidirectional <- function(am) { - am %>% - mutate(bi_directional=(reverse_edge %in% edge) & - connected==1 & - reversed_connected == 1) %>% - pull(bi_directional) -} - -updateGraphElements <- function(am) { - am %>% - mutate(parent=factor(parent, levels=unique(parent))) %>% - mutate(reversed_connected=reversedConnection(.)) %>% - mutate(bi_directional=isBidirectional(.)) %>% - mutate(root_connected=isRootConnected(.)) -} - - -reversedEdges <- function(am) { - am2 <- am %>% - filter(!is.na(connected)) %>% - unite("reverse_edge", c("child", "parent"), sep="->", - remove=FALSE) - am2 -} - -getEdgeName <- function(from, to) { - paste0(from, "->", to) -} - -numNodesConnectedToRoot <- function(am.long) { - sum(am.long[am.long$parent == "root", ]$connected) -} - -randAdmatUnchecked <- function(am.long, max.num.root.children) { - # input: blank am.long (from either constrainedEdges or toLong(initEmptyAdmatFromK(K)) - # - $connected = 0 - # - $possible_edge = T/F if from constrainedEdges - # output: random graph (not necessarily valid) - blank <- am.long # save copy of original am.long - parent_levels <- levels(blank$parent) - - am.long$parent <- as.character(am.long$parent) - - all.nodes <- getAllNodes(am.long) - node.pool <- all.nodes[all.nodes != "root"] # nodes left to connect in graph - # possible edges may be limited by constraints - if ("possible_edge" %in% colnames(am.long)) { - possible.edges <- am.long %>% - filter(possible_edge == TRUE) - has_constraints <- TRUE - } else { - possible.edges <- am.long - has_constraints <- FALSE - } - parent.pool <- unique(possible.edges$parent) # possible parent nodes - - # choose node to connect to root - # select "to" node from parent.pool to prevent getting stuck if max.num.root.children == 1 - temp.possible.root.children <- filter(possible.edges, parent == "root")$child - if (length(temp.possible.root.children) > 1) { - temp.node <- sample(temp.possible.root.children, 1) - } else { - temp.node <- temp.possible.root.children - } - - am.long[am.long$edge == getEdgeName("root", temp.node), ]$connected <- 1 - node.pool <- node.pool[node.pool != temp.node] - - # connect nodes that are left - while(length(node.pool) > 0) { - #for (n in node.pool) { - if (length(node.pool) > 1) { - n <- sample(node.pool, 1) - } else { - n <- node.pool - } - - # all possible edges to node n - # check if there are constraints present - if (has_constraints) { - temp.possible.edges <- am.long %>% - filter(possible_edge == T) - } else { - temp.possible.edges <- am.long - } - # can't connect to root if max.num.root.children quota is satisfied - if (numNodesConnectedToRoot(am.long) >= max.num.root.children) { - temp.possible.edges <- temp.possible.edges %>% - filter(child == n, parent != "root") - } else { - temp.possible.edges <- temp.possible.edges %>% - filter(child == n) - } - - # choose edge to connect -- randomly sample if more than 1 possible edge - if(nrow(temp.possible.edges) > 1) { - temp.edge <- temp.possible.edges[sample(nrow(temp.possible.edges), 1), ]$edge - } else if (nrow(temp.possible.edges) == 1) { - temp.edge <- temp.possible.edges$edge - } else { - # if no possible edges, start over - return(randAdmatUnchecked(blank, max.num.root.children)) - } - - # connect edge - am.long[am.long$edge == temp.edge, ]$connected <- 1 - - node.pool <- node.pool[node.pool != n] - } - - am.long <- am.long %>% - mutate(parent = factor(am.long$parent, levels = parent_levels)) - - am.long <- reversedEdges(am.long) %>% - mutate(reversed_connected=reversedConnection(.), - bi_directional=NA, - root_connected=NA) - am.long <- updateGraphElements(am.long) - - return(am.long) -} - -randAdmat <- function(am.long, max.num.root.children) { - # input: blank am.long (from either constrainedEdges or toLong(initEmptyAdmatFromK(K)) - # - $connected = 0 - # - $possible_edge = T/F if from constrainedEdges - # output: random graph - blank <- am.long - has_constraints <- ifelse("possible_edge" %in% colnames(am.long), - ifelse(any(!am.long$possible_edge), T, F), - F) - - am.long$parent <- as.character(am.long$parent) - - all.nodes <- getAllNodes(am.long) - node.pool <- all.nodes[all.nodes != "root"] # nodes left to connect in graph - possible.edges <- am.long %>% - filter(!is.na(connected)) - parent.pool <- unique(possible.edges$parent) # possible parent nodes - - # choose node to connect to root - # select "to" node from parent.pool to prevent getting stuck if max.num.root.children == 1 - temp.node <- sample(parent.pool[parent.pool != "root"], 1) - am.long[am.long$edge == getEdgeName("root", temp.node), ]$connected <- 1 - node.pool <- node.pool[node.pool != temp.node] - from.nodes <- c("root", temp.node) - - while(length(node.pool) > 0) { - - # remove "root" from possible parents if max.num.root.children quota satisfied - if(numNodesConnectedToRoot(am.long) < max.num.root.children) { - from.nodes.pool <- from.nodes - } else { - from.nodes.pool <- from.nodes[-1] - } - - if (length(from.nodes.pool) == 1) { - temp.from <- from.nodes.pool - } else { - temp.from <- sample(from.nodes.pool, 1) - } - - # remove possible "to" nodes based on NA constraints - temp.to.pool <- am.long %>% - filter(parent==temp.from) %>% - filter(!is.na(connected)) - # remove "to" nodes if not in node.pool - temp.to.pool <- temp.to.pool$child - temp.to.pool <- temp.to.pool[temp.to.pool %in% node.pool] - # sample possible children nodes to be "to" node and connect edge - if (length(temp.to.pool) > 1) { - temp.to <- sample(temp.to.pool, 1) - } else if (length(temp.to.pool) == 1) { - temp.to <- temp.to.pool - } else { - # remove temp.from from from.nodes - } - - am.long[am.long$edge == getEdgeName(temp.from, temp.to), ]$connected <- 1 - - # add temp.to to possible from.nodes if it is a possible parent - if (temp.to %in% parent.pool) { - from.nodes <- c(from.nodes, temp.to) - } - - node.pool <- node.pool[node.pool != temp.to] - } - - - - am.long <- reversedEdges(am.long) %>% - mutate(reversed_connected=reversedConnection(.), - bi_directional=NA, - root_connected=NA) - am.long <- updateGraphElements(am.long) - am.long -} - -isParentConnected <- function(am) { - am %>% - mutate(parent=factor(parent, levels=unique(parent))) %>% - group_by(parent) %>% - summarize(n=sum(connected)) %>% - pull(n) > 0 -} - -isRootConnected <- function(am) isParentConnected(am)[1] - -isDirected <- function(am) !any(am$bi_directional) - -isFullyConnected <- function(am.long) { - # checks if graph (am.long format) is fully connected - all_nodes <- getAllNodes(am.long) - nodes_in_main_tree <- bfsLong(am.long) - length(all_nodes) == length(nodes_in_main_tree) -} - -containsCycle <- function(am.long) { - # returns nodes in main tree (connected to root) including "root" - # starting at root - am.long$parent <- as.character(am.long$parent) - children <- am.long[(am.long$parent == "root" & am.long$connected == 1), ]$child - nodes <- c("root", children) - - while(length(children) > 0) { - c <- children[1] - temp.children <- am.long[(am.long$parent == c & am.long$connected == 1), ]$child - children <- c(children, temp.children) - if (any(temp.children %in% nodes)) return(TRUE) - nodes <- c(nodes, temp.children) - - children <- children[-1] - } - FALSE -} - -validGraph <- function(am) { - isDirected(am) && - isRootConnected(am) && - !containsCycle(am) && - isFullyConnected(am) -} - -bfsLong <- function(am.long) { - # returns vector of nodes in main tree (connected to root) including "root" - # starting at root - # stops if there is a cycle present in graph - am.long$parent <- as.character(am.long$parent) - children <- am.long[(am.long$parent == "root" & am.long$connected == 1), ]$child - nodes <- c("root", children) - - while(length(children) > 0) { - c <- children[1] - temp.children <- am.long[(am.long$parent == c & am.long$connected == 1), ]$child - children <- c(children, temp.children) - if (any(temp.children %in% nodes)) stop("graph has cycle") - nodes <- c(nodes, temp.children) - children <- children[-1] - } - return(nodes) -} - -addEdge <- function(am, new_edge) { - c <- new_edge$child - # disconnect existing edge connecting to child - am[which(am$child == c & am$connected == 1), ]$connected <- 0 - # connect new edge - am[which(am$edge == new_edge$edge), ]$connected <- 1 - # update graph elements - am <- updateGraphElements(am) - return(am) -} - -initializeGraph <- function(mcf, max.num.root.children=1, zero.thresh=0.01){ - # clusters <- seq_len(nrow(mcf)) - # nsamp <- ncol(mcf) - # samples <- seq_len(nsamp) - # nclust <- length(clusters) - # mcf.long <- tibble(cluster_id=as.character(rep(clusters, nsamp)), - # sample_id=as.character(rep(samples, each=nclust)), - # mean=as.numeric(mcf)) - am.long <- constrainedEdges(mcf, zero.thresh=zero.thresh) - am.long2 <- randAdmatUnchecked(am.long, max.num.root.children) - while (!validGraph(am.long2)) { - am.long2 <- randAdmatUnchecked(am.long, max.num.root.children) - } - return(am.long2) -} - -initializeGraphFromPost <- function(post_am, max.num.root.children=1, thresh=0.1) { - constrained_am <- post_am %>% - group_by(child) %>% - mutate(max_post_for_child = max(posterior_prob)) %>% - ungroup() %>% - mutate(possible_edge = (max_post_for_child-posterior_prob) <= thresh) %>% - mutate(connected = 0) %>% - select(edge, parent, child, possible_edge, connected) - am <- randAdmatUnchecked(constrained_am, max.num.root.children) - while (!validGraph(am)) { - am <- randAdmatUnchecked(constrained_am, max.num.root.children) - } - return(am) -} - -initializeGraphFromPost2 <- function(post_am, max.num.root.children=1) { - K <- length(unique(post_am$child)) - thresh <- (1/K)/2 - constrained_am <- post_am %>% - mutate(possible_edge = posterior_prob >= thresh) %>% - mutate(connected = 0) %>% - select(edge, parent, child, possible_edge, connected) - am <- randAdmatUnchecked(constrained_am, max.num.root.children) - while (!validGraph(am)) { - am <- randAdmatUnchecked(constrained_am, max.num.root.children) - } - return(am) -} - - -toWide <- function(am.long){ - am.long$child <- as.numeric(am.long$child) - am.long %>% select(parent, child, connected) %>% - tidyr::spread(child, connected) %>% - select(-parent) %>% - as.matrix() -} - -toLong <- function(am) { - am.long <- as_tibble(am) %>% - mutate(parent=rownames(am)) %>% - pivot_longer(-parent, - names_to="child", - values_to="connected") %>% - filter(parent != child) %>% - unite("edge", c("parent", "child"), sep="->", - remove=FALSE) %>% - mutate(parent=factor(parent, levels=unique(parent))) - return(am.long) -} - -isMoveValid <- function(a, possible_move, max.num.root.children) { - # a = am.long format of current graph - # possible_move = a row in possible_moves tibble (am.long format) - # max.num.root.children = maximum number of nodes allowed to be connected to root - # returns TRUE or FALSE - astar <- addEdge(a, possible_move) - - is_valid <- validGraph(astar) & (numNodesConnectedToRoot(astar) <= max.num.root.children) - return(is_valid) -} - -sampleNewEdge <- function(a, max.num.root.children, mc.cores=1){ - ## a move is connecting a new edge and disconnecting the pre-existing edge connected to the new edge's child - possible_moves <- filter(a, connected==0, possible_edge==T) - possible_moves_list <- possible_moves %>% - group_by(edge) %>% - group_split() - is_valid <- unlist(parallel::mclapply(possible_moves_list, function(x) isMoveValid(a, x, max.num.root.children), - mc.cores = mc.cores)) - move_set <- possible_moves_list[is_valid] - #ix <- sample(seq_len(length(move_set)), 1) - ix <- tryCatch(sample(seq_len(length(move_set)), 1), error=function(e) NULL) - if(is.null(ix)) { - #print("no moves :(") - #print(a, n=100) - #saveRDS(list(am = a, move_set), "/mnt/disk005/data/projects/pictograph/scripts/method-comparison/pictograph/log-sample-error/test.rds") - return(a) - } else { - astar <- addEdge(a, move_set[[ix]]) - return(astar) - } -} - -initEmptyAdmatFromK <- function(K) { - admat <- matrix(0, K, K) - diag(admat) <- NA - am2 <- rbind(0, admat) - dimnames(am2) <- list(c("root", 1:K), 1:K) - return(am2) -} - -generateRandomGraphFromK <- function(K, max.num.root.children) { - # input: number of mutation clusters, K - # output: mutation tree; adjacency matrix - am.long <- toLong(initEmptyAdmatFromK(K)) - rand.am.long <- randAdmat(am.long, max.num.root.children) - if (!validGraph(rand.am.long)) warning("graph is not valid") - return(rand.am.long) -} - -plotGraph <- function(am.long, v_color){ - # make sure am.long is sorted by parent and child - am.long <- mutate(am.long, child = as.numeric(am.long$child)) %>% - arrange(parent, child) - am.long <- mutate(am.long, child = as.character(am.long$child)) - - # change to wide format and plot - am <- toWide(am.long) - rownames(am) <- c("root", colnames(am)) - am <- cbind(root=0, am) ## add column for root - colnames(am) <- rownames(am) - - am[is.na(am)] <- 0 - - ig <- igraph::graph_from_adjacency_matrix(am, mode = "directed", weighted = TRUE, - diag = FALSE, add.row = TRUE) - V(ig)$color <- as.list(v_color %>% arrange(match(v_sorted, names(V(ig)))) %>% select(colors))$colors - par(mar=c(0,0,0,0)+.1) - igraph::plot.igraph(ig, layout = igraph::layout_as_tree(ig), - vertex.size=34, vertex.frame.color = "#000000", vertex.label.cex = 1.5, - vertex.label.family = "Helvetica", vertex.label.color = "#000000", - edge.arrow.size = 0.5, edge.arrow.width = 2) -} - -getPosteriorAmLong <- function(am_chain) { - # input: chain from tree MCMC of trees in am.long format - # output: posterior am.long - num_trees <- length(am_chain) - - combined_am_chain <- am_chain %>% - bind_rows - post_am <- combined_am_chain %>% - group_by(edge) %>% - mutate(posterior_prob = sum(connected) / num_trees) %>% - ungroup() %>% - select(edge, parent, child, posterior_prob) %>% - distinct() - post_am -} - -toWidePostAm <- function(post_am) { - post_am <- post_am %>% - mutate(child = as.numeric(post_am$child)) - if(!is.factor(post_am$parent)) { - post_am <- post_am %>% - mutate(parent = factor(parent, levels = c("root", 1:max(post_am$parent)))) - } - post_am %>% - select(parent, child, posterior_prob) %>% - tidyr::spread(child, posterior_prob) %>% - select(-parent) %>% - as.matrix() -} - -filterAdmat <- function(admat, filter1 = TRUE, filter1.threshold = 0.1, - filter2 = TRUE, filter2.threshold = 0.1) { - # filter1 filters columns (am wide format) for edges with posterior prob > (max(column) - filter1.threshold) - # filter2 filters entire matrix for prob > filter2.threshold - - if (filter1) { - admat <- apply(admat, 2, function(x) ifelse(x > (max(x)-filter1.threshold), x, 0)) - } - - if (filter2) { - admat[admat <= filter2.threshold] <- 0 - } - - return(admat) -} - -prepPostAmForGraphing <- function(post_am) { - post_am_mat <- toWidePostAm(post_am) - - # add column for root - post_am_mat <- cbind(0, post_am_mat) - colnames(post_am_mat)[1] <- "root" - rownames(post_am_mat) <- colnames(post_am_mat) - admat <- round(as.matrix(post_am_mat), 2) - admat[is.na(admat)] <- 0 - - return(admat) -} - -plotPosteriorAmLong <- function(post_am, v_color, filter1 = TRUE, filter1.threshold = 0.1, - filter2 = TRUE, filter2.threshold = 0.1) { - # filter1 filters columns (am wide format) for edges with posterior prob > (max(column) - filter1.threshold) - admat <- prepPostAmForGraphing(post_am) - - # filter edges of low freq - admat <- filterAdmat(admat, filter1 = filter1, filter1.threshold = filter1.threshold, - filter2 = filter2, filter2.threshold = filter2.threshold) - - ig <- igraph::graph_from_adjacency_matrix(admat, mode = "directed", weighted = TRUE, - diag = FALSE, add.row = TRUE) - - igraph::E(ig)$lty <- ifelse(igraph::E(ig)$weight < 0.25, 2, 1) - - # make edge black if only 1 edge to vertex - e <- igraph::ends(ig, igraph::E(ig)) - numTo <- table(e[,2]) - edgeColors <- sapply(e[,2], function(x) ifelse(x %in% names(which(numTo==1)), "black", "darkgrey")) - igraph::E(ig)$color <- edgeColors - - igraph::V(ig)$label.cex <- 1.5 - - igraph::V(ig)$color <- as.list(v_color %>% arrange(match(v_sorted, names(V(ig)))) %>% select(colors))$colors - - par(mar=c(0,0,0,0)+.1) - igraph::plot.igraph(ig, layout = igraph::layout_as_tree(ig), - vertex.size=34, vertex.label.family = "Helvetica", - edge.arrow.size = 0.5, edge.arrow.width = 2, - edge.width = igraph::E(ig)$weight*3) -} - -plotPosteriorAmLongSim <- function(post_am, filter1 = TRUE, filter1.threshold = 0.1, - filter2 = TRUE, filter2.threshold = 0.1) { - # filter1 filters columns (am wide format) for edges with posterior prob > (max(column) - filter1.threshold) - admat <- prepPostAmForGraphing(post_am) - - # filter edges of low freq - admat <- filterAdmat(admat, filter1 = filter1, filter1.threshold = filter1.threshold, - filter2 = filter2, filter2.threshold = filter2.threshold) - - ig <- igraph::graph_from_adjacency_matrix(admat, mode = "directed", weighted = TRUE, - diag = FALSE, add.row = TRUE) - - igraph::E(ig)$lty <- ifelse(igraph::E(ig)$weight < 0.25, 2, 1) - - # make edge black if only 1 edge to vertex - e <- igraph::ends(ig, igraph::E(ig)) - numTo <- table(e[,2]) - edgeColors <- sapply(e[,2], function(x) ifelse(x %in% names(which(numTo==1)), "black", "darkgrey")) - igraph::E(ig)$color <- edgeColors - - igraph::V(ig)$label.cex <- 1 - - par(mar=c(0,0,0,0)+.1) - igraph::plot.igraph(ig, layout = igraph::layout_as_tree(ig), - vertex.color = "white", vertex.label.family = "Helvetica", - vertex.size = 30, - edge.arrow.size = 0.6, edge.arrow.width = 2, - edge.width = igraph::E(ig)$weight*5, - asp = 0) -} - -plotPosteriorAmLong2 <- function(post_am, cluster_key_genes_tb, - filter1 = TRUE, filter1.threshold = 0.1, - filter2 = TRUE, filter2.threshold = 0.1) { - # filter1 filters columns (am wide format) for edges with posterior prob > (max(column) - filter1.threshold) - admat <- prepPostAmForGraphing(post_am) - - # filter edges of low freq - admat <- filterAdmat(admat, filter1 = filter1, filter1.threshold = filter1.threshold, - filter2 = filter2, filter2.threshold = filter2.threshold) - - ig <- igraph::graph_from_adjacency_matrix(admat, mode = "directed", weighted = TRUE, - diag = FALSE, add.row = TRUE) - - igraph::E(ig)$lty <- ifelse(igraph::E(ig)$weight < 0.25, 2, 1) - - # make edge black if only 1 edge to vertex - e <- igraph::ends(ig, igraph::E(ig)) - numTo <- table(e[,2]) - edgeColors <- sapply(e[,2], function(x) ifelse(x %in% names(which(numTo==1)), "black", "darkgrey")) - igraph::E(ig)$color <- edgeColors - - igraph::V(ig)$label.cex <- 0.5 - vertex_colors <- c("white", ifelse(cluster_key_genes_tb$contains_key_gene, "lightblue", "white")) - igraph::V(ig)$color <- vertex_colors - # highlight single sample vertices - igraph::V(ig)$frame.color <- c("black", ifelse(cluster_key_genes_tb$single_sample, "#E69F00", "black")) - - par(mar=c(0,0,0,0)+.1) - igraph::plot.igraph(ig, layout = igraph::layout_as_tree(ig), - #vertex.color = "white", - vertex.label.family = "Helvetica", - edge.arrow.size = 0.2, edge.arrow.width = 2, - edge.width = igraph::E(ig)$weight*3) -} - -plotPosteriorAmLong3 <- function(post_am, cluster_key_genes_tb, - filter1 = TRUE, filter1.threshold = 0.1, - filter2 = TRUE, filter2.threshold = 0.1) { - # don't plot edges with low freq (<0.25) - # filter1 filters columns (am wide format) for edges with posterior prob > (max(column) - filter1.threshold) - admat <- prepPostAmForGraphing(post_am) - - # filter edges of low freq - admat <- filterAdmat(admat, filter1 = filter1, filter1.threshold = filter1.threshold, - filter2 = filter2, filter2.threshold = filter2.threshold) - - # filter out edges < 0.2 prob - admat[admat < 0.2] <- 0 - ig <- igraph::graph_from_adjacency_matrix(admat, mode = "directed", weighted = TRUE, - diag = FALSE, add.row = TRUE) - # edges < 0.25 prob - igraph::E(ig)$lty <- ifelse(igraph::E(ig)$weight < 0.25, 2, 1) - - # make edge black if only 1 edge to vertex - e <- igraph::ends(ig, igraph::E(ig)) - numTo <- table(e[,2]) - edgeColors <- sapply(e[,2], function(x) ifelse(x %in% names(which(numTo==1)), "black", "darkgrey")) - igraph::E(ig)$color <- edgeColors - - igraph::V(ig)$label.cex <- 0.5 - vertex_colors <- c("white", ifelse(cluster_key_genes_tb$contains_key_gene, "lightblue", "white")) - igraph::V(ig)$color <- vertex_colors - # highlight single sample vertices - igraph::V(ig)$frame.color <- c("black", ifelse(cluster_key_genes_tb$single_sample, "#E69F00", "black")) - - par(mar=c(0,0,0,0)+.1) - igraph::plot.igraph(ig, layout = igraph::layout_as_tree(ig), - #vertex.color = "white", - vertex.label.family = "Helvetica", - edge.arrow.size = 0.2, edge.arrow.width = 2, - edge.width = igraph::E(ig)$weight*3) -} - -labelMAPEdgesFromPostAM <- function(post_am) { - post_am %>% - group_by(child) %>% - mutate(map_edge = posterior_prob == max(posterior_prob)) %>% - ungroup() -} - -getMAPGraphFromPostAM <- function(post_am) { - map_am <- labelMAPEdgesFromPostAM(post_am) %>% - mutate(connected = ifelse(map_edge, 1, 0)) - return(map_am) -} - -edgeTibbleToAmLong <- function(edge_tb, root = 0) { - K <- length(unique(c(edge_tb$From, edge_tb$To))) - 1 - am_long <- toLong(initEmptyAdmatFromK(K)) - edge_tb$From <- as.character(edge_tb$From) - edge_tb[edge_tb == as.character(root)] <- "root" - for (i in seq_len(nrow(edge_tb))) { - temp_edge <- getEdgeName(edge_tb$From[i], edge_tb$To[i]) - am_long$connected[which(am_long$edge == temp_edge)] <- 1 - } - return(am_long) -} - -# getParents <- function(am_long, node, prev_parents = c()) { -# temp_parent <- filter(am_long, connected == 1, child == node)$parent -# if (temp_parent == "root") { -# return(c(prev_parents, "root")) -# } else { -# return(getParents(am_long, temp_parent, c(prev_parents, temp_parent))) -# } -# } - -getBTColumn <- function(am_long, node) { - BT_column <- rep(0, length(unique(am_long$child))) - path <- getPathFromRoot(node, am_long) - BT_column[path] <- 1 - return(BT_column) -} - -amToBT <- function(am_long) { - # returns binary perfect phylogeny matrix - # where columns are clones and rows are mutation clusters - # clones have the same numeric ID as the last mutation cluster on their path from the root - children <- unique(am_long$child) - children_num <- sort(as.numeric(children)) - - BT_cols <- sapply(children_num, function(node) getBTColumn(am_long, node)) - - return(BT_cols) -} \ No newline at end of file diff --git a/R/help.R b/R/help.R deleted file mode 100644 index 1a99056..0000000 --- a/R/help.R +++ /dev/null @@ -1,14 +0,0 @@ -#' pictograph -#' -#' @docType package -#' @name pictograph -#' @import methods -#' @import dplyr -#' @import readr -#' @import magrittr -#' @import rjags -#' @import tidyr -#' @import ggplot2 -#' @import ggmcmc -#' @import igraph -NULL diff --git a/R/metrics.R b/R/metrics.R deleted file mode 100644 index 28d4c54..0000000 --- a/R/metrics.R +++ /dev/null @@ -1,199 +0,0 @@ -# ----------------------------------------------------------------------------- -# ----------------------------------------------------------------------------- -# Tree metric: proportion of correct ancestral relationships - -getParent <- function(node, am.long) { - am.long <- am.long %>% - mutate(parent = as.character(parent)) - e <- am.long %>% - filter(child == node, connected == 1) - if (nrow(e) > 1) warning(paste0("node ", node, " has ", nrow(e), " parents")) - return(e$parent[1]) -} - -getPathFromRoot <- function(node, am.long) { - path <- node - curr_parent <- getParent(node, am.long) - while(curr_parent != "root") { - path <- c(path, curr_parent) - curr_parent <- getParent(curr_parent, am.long) - } - as.numeric(path) -} - -getRelationshipTypeOfClusters <- function(cluster1, cluster2, am.long) { - # input: cluster1 and cluster2 can be either characters or numeric - # returns type of ancestral relationship of cluster1 and cluster2 - # 1 = cluster1 and cluster2 are the same - # 2 = cluster1 is ancestral to cluster2 - # 3 = cluster2 is ancestral to cluster1 - # 4 = cluster1 and cluster2 are on distinct branches - - if (is.numeric(cluster1)) cluster1 <- as.character(cluster1) - if (is.numeric(cluster2)) cluster2 <- as.character(cluster2) - - # clusters are the same - if (cluster1 == cluster2) return(1) - - # cluster1 is ancestral to cluster2 - temp_path <- getPathFromRoot(cluster2, am.long) - if (cluster1 %in% temp_path) return(2) - - # cluster2 is ancestral to cluster1 - temp_path <- getPathFromRoot(cluster1, am.long) - if (cluster2 %in% temp_path) return(3) - - # cluster1 and cluster2 are on distinct branches - return(4) -} - -getMutRelTb <- function(z, mut_ind, am) { - # input: - # z = vector of cluster assignments for each mutation - # mut_ind = vector of mutation indices e.g. 1:I - # am = am.long format of graph - # output: tibble with columns mut1, mut2, cluster1, cluster2, relationship_type - # relationship_type is 1:4 -- see getRelationshipTypeOfCluster() - - # mutation pairs are all pairwise combinations, not including self - all_mut_pairs <- t(combn(mut_ind, 2)) - - # cluster pairs are all permutations of 2, including self - all_cluster_pairs <- rbind(t(combn(unique(z), 2)), t(combn(rev(unique(z)), 2)), - matrix(rep(unique(z), 2), nrow=length(unique(z)), byrow = F)) - all_cluster_pairs_tb <- tibble(cluster1 = all_cluster_pairs[, 1], - cluster2 = all_cluster_pairs[, 2]) - - true_rel_types <- mapply(function(c1, c2) getRelationshipTypeOfClusters(c1, c2, am), - c1=all_cluster_pairs[, 1], c2=all_cluster_pairs[, 2]) - - rel_tb <- tibble(cluster1 = all_cluster_pairs[, 1], - cluster2 = all_cluster_pairs[, 2], - relationship_type = true_rel_types) - - true_mut_rel <- tibble(mut1 = all_mut_pairs[, 1], - mut2 = all_mut_pairs[, 2], - cluster1 = z[all_mut_pairs[, 1]], - cluster2 = z[all_mut_pairs[, 2]]) - - left_join(true_mut_rel, rel_tb, by = c("cluster1", "cluster2")) -} - -calcPropRelationshipsCorrect <- function(test_mut_rel, true_mut_rel) { - # input: tibbles from getMutRelTb() - # returns proportion of relationships in test_mut_rel that match true_mut_rel - test_comp <- left_join(true_mut_rel, test_mut_rel, by = c("mut1", "mut2")) - rel_same <- test_comp$relationship_type.x == test_comp$relationship_type.y - return(sum(rel_same) / length(rel_same)) -} - -calcTreeMetricSingleIter <- function(z, am, sim_data) { - true_mut_rel <- getMutRelTb(sim_data$z, 1:sim_data$I, sim_data$am.long) - if (length(unique(z$value)) == 1) { - this_rel <- tibble(mut1 = true_mut_rel$mut1, - mut2 = true_mut_rel$mut2, - cluster1 = 1, - cluster2 = 1, - relationship_type = 1) - } else { - this_rel <- getMutRelTb(z$value, 1:sim_data$I, am) - } - - prop_true <- calcPropRelationshipsCorrect(this_rel, true_mut_rel) - return(prop_true) -} - -calcTreeMetricChain <- function(z_chain_list, am_chain, sim_data, - mc.cores=1) { - - tree_metric <- parallel::mcmapply(function(z, am) calcTreeMetricSingleIter(z, am, sim_data), - z_chain_list, am_chain, - mc.cores = mc.cores) - return(tree_metric) -} - -# ----------------------------------------------------------------------------- -# ----------------------------------------------------------------------------- -# CCF metrics (as described in PASTRI paper) - - -calcMetric1 <- function(true_w, w_star) { - # Metric 1 -- A measure of sensitivity, matches the true clones to the - # nearest reported clusters - # input: matrices with ncol = number of samples, nrow = number of clusters - # returns: numeric value - - # number of clusters can be different, but number of samples must be equal - if (ncol(true_w) != ncol(w_star)) { - stop("true_w and w_star have different number of samples") - } - - min_diff <- rep(Inf, nrow(true_w)) - - for (i in 1:nrow(true_w)) { - for (j in 1:nrow(w_star)) { - temp_diff <- mean(abs(true_w[i, ] - w_star[j, ])) - if (temp_diff < min_diff[i]) min_diff[i] <- temp_diff - } - } - - return(sum(min_diff)) -} - -calcMetric2 <- function(true_w, w_star) { - # Metric 2 -- A measure of specificity, matches the reported clusters to - # the nearest true clones - # input: matrices with ncol = number of samples, nrow = number of clusters - # returns: numeric value - - # number of clusters can be different, but number of samples must be equal - if (ncol(true_w) != ncol(w_star)) { - stop("true_w and w_star have different number of samples") - } - - min_diff <- rep(Inf, nrow(w_star)) - - for (j in 1:nrow(w_star)) { - for (i in 1:nrow(true_w)) { - temp_diff <- mean(abs(true_w[i, ] - w_star[j, ])) - if (temp_diff < min_diff[j]) min_diff[j] <- temp_diff - } - } - - return(sum(min_diff)) -} - -wTibbleToMatrix <- function(w_tb) { - # input: w_chain tibble with columns Iteration, Parameter, value for a single Iteration - # output: w_matrix with rows = clusters, columns = samples - S <- numberSamples(w_tb) - K <- numberClusters(w_tb) - w_mat <- matrix(w_tb$value, nrow = K, ncol = S, byrow = T) - return(w_mat) -} - -calcMetric1Chain <- function(true_w, w_chain, mc.cores) { - w_chain_list <- w_chain %>% - group_by(Iteration) %>% - group_split() - w_chain_list_matrix <- parallel::mclapply(w_chain_list, - wTibbleToMatrix, - mc.cores = mc.cores) - m1_chain <- parallel::mclapply(w_chain_list_matrix, - function(w_star) calcMetric1(true_w, w_star), - mc.cores = mc.cores) - return(unlist(m1_chain)) -} - -calcMetric2Chain <- function(true_w, w_chain, mc.cores) { - w_chain_list <- w_chain %>% - group_by(Iteration) %>% - group_split() - w_chain_list_matrix <- parallel::mclapply(w_chain_list, - wTibbleToMatrix, - mc.cores = mc.cores) - m2_chain <- parallel::mclapply(w_chain_list_matrix, - function(w_star) calcMetric2(true_w, w_star), - mc.cores = mc.cores) - return(unlist(m2_chain)) -} \ No newline at end of file diff --git a/R/pictograph-package.R b/R/pictograph-package.R new file mode 100644 index 0000000..351d1a9 --- /dev/null +++ b/R/pictograph-package.R @@ -0,0 +1,23 @@ +#' pictograph +#' +#' infer tumor evolution using SNV and CNA +#' +#' @import readr +#' @import tidyverse +#' @import dplyr +#' @import tidyr +#' @import rjags +#' @import ggmcmc +#' @import magrittr +#' @import methods +#' @importFrom epiR epi.betabuster +#' @import ggplot2 +#' @importFrom stringr str_replace +#' @import viridis +#' @import igraph +#' @import stringr +#' @import UpSetR +#' @import cluster +#' @name pictograph-package +#' @aliases pictograph +NULL diff --git a/R/plot.R b/R/plot.R deleted file mode 100644 index 684b57e..0000000 --- a/R/plot.R +++ /dev/null @@ -1,661 +0,0 @@ -plot.chain.trace <- function(chain, K) { - ggplot(chain, aes(x = Iteration, y = value)) + - geom_line() + - facet_wrap(~Parameter, nrow = K) + - theme_light() -} - -get.parameter.chain <- function(param, chains) { - chains[grep(paste0(param, "\\["), chains$Parameter), ] -} - -#' Plot probabilities of mutation cluster assignments -#' -#' @export -#' @import ggplot2 -#' @param z_chain MCMC chain of mutation cluster assignment values, which is the second item in the list returned by \code{clusterSep} -plotClusterAssignmentProb <- function(z_chain) { - I <- length(unique(z_chain$Parameter)) - K <- max(unique(z_chain$value)) - num_iter <- max(z_chain$Iteration) - - mcmc_z <- z_chain %>% - group_by(Parameter, value) %>% - summarize(n=n(), - num_iter=num_iter) %>% - mutate(probability=n/num_iter) %>% - ungroup() - - z.seg.tb <- mcmc_z %>% - group_by(Parameter) %>% - summarize(z1 = min(value), z2 = max(value)) - - z.plot <- ggplot(mcmc_z, aes(x = Parameter, y = value, color = probability)) + - theme_light() + - scale_x_discrete(labels = 1:I, name = "Variant") + - scale_y_continuous(breaks = 1:K, name = "Cluster") + - geom_segment(data = z.seg.tb, - aes(x=Parameter, xend=Parameter, - y=z1, yend=z2), - color="black", linetype=2) + - geom_point() + - theme(panel.grid.minor = element_blank()) - return(z.plot) -} - -summarizeZPost <- function(z_chain) { - I <- length(unique(z_chain$Parameter)) - K <- max(unique(z_chain$value)) - num_iter <- max(z_chain$Iteration) - mcmc_z <- z_chain %>% - group_by(Parameter, value) %>% - summarize(n=n(), - num_iter=num_iter) %>% - mutate(probability=n/num_iter) %>% - ungroup() - return(mcmc_z) -} - -#' Plot probabilities of mutation cluster assignments - vertical -#' -#' @export -#' @import ggplot2 -#' @import tibble -#' @import dplyr -#' @import tidyr -#' @param z_chain MCMC chain of mutation cluster assignment values, which is the second item in the list returned by \code{clusterSep} -#' @param w_chain MCMC chain of CCF values, which is the first item in the list returned by \code{clusterSep} -#' @param filter_thresh Lowest posterior probability to include cluster assignment. Default value is 0.05 (inclusive) -#' @param MutID (Optional) Vector of mutation IDs for labeling purposes. Same order as supplied as input data (e.g. indata$Mut_ID) -#' @param SampleID (Optional) Vector of sample IDs for labeling purposes. Same order as supplied as input data (e.g. indata$Sample_ID) -plotClusterAssignmentProbVertical <- function(z_chain, - w_chain, - filter_thresh = 0.05, - MutID = NULL, - SampleID = NULL) { - - mcmc_z <- generateZPostSummary(z_chain, w_chain, filter_thresh, MutID, SampleID) - K <- max(mcmc_z$value) - z.seg.tb <- mcmc_z %>% - group_by(Parameter) %>% - summarize(z1 = min(value), z2 = max(value)) %>% - ungroup() %>% - mutate(Variant = as.numeric(gsub("z\\[|]", "", Parameter)), - Mut_ID = mcmc_z$Mut_ID[match(Variant, mcmc_z$Variant)], - Sample_presence = mcmc_z$Sample_presence[match(Variant, mcmc_z$Variant)]) - - z.plot <- ggplot(mcmc_z, aes(x = value, y = Mut_ID, color = probability)) + - theme_light() + - scale_y_discrete(drop = T, name = "Variant") + - scale_x_continuous(breaks = 1:K, name = "Cluster", labels = 1:K) + - geom_segment(data = z.seg.tb, - aes(y=Mut_ID, yend=Mut_ID, - x=z1, xend=z2), - color="black", linetype=2) + - geom_point() + - theme(panel.grid.minor = element_blank(), - strip.background=element_blank(), - strip.text = element_text(colour = 'black'), - strip.text.y = element_text(angle = 0)) + - facet_grid(Sample_presence~., scales = "free", space = "free") + - scale_color_gradient(limits = c(0,1)) - return(z.plot) -} - -generateZPostSummary <- function(z_chain, - w_chain, - filter_thresh = 0.05, - MutID = NULL, - SampleID = NULL) { - - map_z <- estimateClusterAssignments(z_chain) - map_w <- estimateCCFs(w_chain) - - I <- length(unique(z_chain$Parameter)) - K <- max(unique(z_chain$value)) - num_iter <- max(z_chain$Iteration) - S <- ncol(map_w) - - if (is.null(MutID)) { - mut_labels <- 1:I - } else { - mut_labels <- MutID - } - if (is.null(SampleID)) { - sample_labels <- paste0("Sample ", 1:S) - } else { - sample_labels <- SampleID - } - - - - tiers <- generateTiers(map_w, sample_labels) - - mcmc_z <- summarizeZPost(z_chain) %>% - filter(probability >= filter_thresh) - - # Variant sample presence - var_sample_pres <- map_z %>% - ungroup() %>% - mutate(cluster_num = value, - Variant = as.numeric(gsub("z\\[|]", "", Parameter)), - Mut_ID = mut_labels[Variant], - Sample_presence = tiers$samples[value]) - # sample presence order - sample_pres_order <- tiers %>% - select(samples, tier) %>% - distinct() %>% - arrange(-tier) %>% - pull(samples) - # variant order - var_order <- map_z %>% - arrange(-value) %>% - mutate(Variant = as.numeric(gsub("z\\[|]", "", Parameter)), - Mut_ID = mut_labels[Variant]) %>% - pull(Mut_ID) - - mcmc_z <- mcmc_z %>% - mutate(Variant = as.numeric(gsub("z\\[|]", "", Parameter)), - Mut_ID = factor(mut_labels[Variant], var_order), - Sample_presence = factor(var_sample_pres$Sample_presence[Variant], - sample_pres_order)) - return(mcmc_z) -} - -grabBIC <- function(all_set_results) { - bic_list <- lapply(all_set_results, function(x) x$BIC) - bic_list_filt <- bic_list[sapply(bic_list, function(x) typeof(x) != "logical")] - bic_tb <- bic_list_filt %>% - bind_rows(.id = 'Set') %>% - mutate(Set = factor(Set, levels = names(all_set_results))) - return(bic_tb) -} - -grabBestK <- function(all_set_results) { - best_k <- lapply(all_set_results, function(x) x$best_K) %>% - stack() %>% - rename(Set = ind, - Best_K = values) %>% - as_tibble() - return(best_k) -} - -#' Plot probabilities of mutation cluster assignments (vertical) for tested K across all mutation sets -#' -#' @export -#' @import ggplot2 -#' @import dplyr -#' @param all_set_results List of MCMC results for each mutation set; returned by \code{clusterSep} -#' @param outdir Path to directory for output of plots -#' @param SampleID (Optional) Vector of sample IDs for labeling purposes. Same order as supplied as input data (e.g. indata$Sample_ID) -#' @param filter_thresh Lowest posterior probability to include cluster assignment. Default value is 0.05 (inclusive) -#' @param compare Option to only plot cluster probabilities for K chosen by minimum BIC, elbow or knee of plot when different (default FALSE plots all K tested) -plotAllZProb <- function(all_set_results, outdir, SampleID = NULL, filter_thresh = 0.05, compare = FALSE) { - if (is.null(SampleID)) { - S <- estimateCCFs(all_set_results[[1]]$all_chains[[1]]$w_chain) %>% - ncol - sample_names <- paste0("Sample ", 1:S) - } else { - sample_names <- SampleID - } - - num_sets <- length(all_set_results) - set_names_bin <- names(all_set_results) - - if (compare) k_tb <- writeSetKTable(all_set_results) - - for (set in set_names_bin) { - set_name_full <- sample_names[as.logical(as.numeric(strsplit(set, "")[[1]]))] %>% - paste0(., collapse = ",\n") - - all_set_chains <- all_set_results[[set]]$all_chains - k_tested <- as.numeric(gsub("K", "", names(all_set_chains))) - S <- all_set_chains[[1]]$w_chain %>% - estimateCCFs %>% - ncol - I <- all_set_chains[[1]]$z_chain %>% - estimateClusterAssignments %>% - nrow - num_iter <- all_set_chains[[1]]$z_chain %>% - pull(Iteration) %>% - max - - if (I == 1) next # plot is trivial if only one mutation in set; skip plotting - - if (compare) { - # only plot for K = minimum BIC and K = elbow - k_to_plot <- k_tb %>% - filter(set_name_bin == set) %>% - select(min_BIC, elbow, knee) %>% - unlist %>% - unname %>% - unique - if (any(is.na(k_to_plot))) k_to_plot <- k_to_plot[-which(is.na(k_to_plot))] - - } else { - k_to_plot <- k_tested - } - - for (k in k_to_plot) { - z_plot_file <- file.path(outdir, paste0(set, "_", names(all_set_chains)[k], "_z_plot.pdf")) - plot_title <- paste0(set_name_full, ": ", names(all_set_chains)[k]) - temp_chains <- all_set_chains[[k]] - - mcmc_z <- summarizeZPost(temp_chains$z_chain) %>% - filter(probability >= filter_thresh) %>% - mutate(Variant = as.numeric(gsub("z\\[|]", "", Parameter))) - - # order varints by highest probability cluster assignment - var_order <- mcmc_z %>% - group_by(Variant) %>% - summarize(map_z = value[which.max(probability)]) %>% - arrange(-map_z) %>% - pull(Variant) - - mcmc_z <- mcmc_z %>% - mutate(Variant = factor(Variant, levels = var_order)) - - # segments connecting multiple assignments for each variant - z.seg.tb <- mcmc_z %>% - group_by(Variant) %>% - summarize(z1 = min(value), z2 = max(value)) %>% - ungroup() - - z.plot <- ggplot(mcmc_z, aes(x = value, y = Variant, color = probability)) + - theme_light() + - scale_y_discrete(drop = T, name = "Variant", labels = NULL) + - scale_x_continuous(breaks = 1:k, name = "Cluster", labels = 1:k) + - geom_segment(data = z.seg.tb, - aes(y=Variant, yend=Variant, - x=z1, xend=z2), - color="black", linetype=2) + - geom_point() + - theme(panel.grid.minor = element_blank(), - strip.background=element_blank(), - strip.text = element_text(colour = 'black'), - strip.text.y = element_text(angle = 0)) + - scale_color_gradient(limits = c(0,1)) + - ggtitle(plot_title) - plot_height <- max(3, I/15) + S/2 - ggsave(z_plot_file, plot = z.plot, height = plot_height, width = max(3, k)) - } - } -} - -#' Plot BIC for all mutation sets -#' -#' @export -#' @import dplyr -#' @import ggplot2 -#' @param all_set_results List of MCMC results for each mutation set; returned by \code{clusterSep} -#' @param SampleID (Optional) Vector of sample IDs for labeling purposes. Same order as supplied as input data (e.g. indata$Sample_ID) -plotBIC <- function(all_set_results, Sample_ID = NULL) { - bic_tb <- grabBIC(all_set_results) - k_tb <- writeSetKTable(all_set_results) - min_BIC <- grabBestK(all_set_results) %>% - rename(K_tested = Best_K) %>% - left_join(., bic_tb, - by = c("Set", "K_tested")) %>% - mutate(Choice = "Minimum") - elbow <- k_tb %>% - select(K_tested = elbow, Set = set_name_bin) %>% - mutate(Set = factor(Set, levels(bic_tb$Set))) %>% - left_join(., bic_tb, - by = c("Set", "K_tested")) %>% - mutate(Choice = "Elbow") - knee <- k_tb %>% - select(K_tested = knee, Set = set_name_bin) %>% - mutate(Set = factor(Set, levels(bic_tb$Set))) %>% - left_join(., bic_tb, - by = c("Set", "K_tested")) %>% - mutate(Choice = "Knee") - - if (any(is.na(min_BIC$BIC))) min_BIC <- min_BIC[-which(is.na(min_BIC$BIC)), ] - if (any(is.na(elbow$BIC))) elbow <- elbow[-which(is.na(elbow$BIC)), ] - if (any(is.na(knee$BIC))) knee <- knee[-which(is.na(knee$BIC)), ] - - # rename sets - if (is.null(Sample_ID)) { - S <- all_set_results[[1]]$best_chains$w_chain %>% - estimateCCFs %>% - ncol - Sample_ID <- paste0("Sample ", 1:S) - } - - set_name_tb <- tibble(set_bin = unique(bic_tb$Set), - set_name = sapply(unique(bic_tb$Set), function(x) getSetName(x, Sample_ID))) %>% - mutate(set_name = factor(set_name, set_name)) - - bic_tb <- bic_tb %>% - mutate(Set_name = sapply(bic_tb$Set, function(x) getSetName(x, Sample_ID))) %>% - mutate(Set_name = factor(Set_name, set_name_tb$set_name)) - k_choices <- bind_rows(min_BIC, elbow, knee) %>% - mutate(Set_name = sapply(Set, function(x) getSetName(x, Sample_ID))) %>% - mutate(Set_name = factor(Set_name, set_name_tb$set_name)) %>% - mutate(Choice = factor(Choice, c("Minimum", "Elbow", "Knee"))) - - # plot - bic_plot <- ggplot(bic_tb, aes(x = K_tested, y = BIC)) + - theme_light() + - facet_wrap(~Set_name, scales = "free") + - geom_line() + - geom_point(data = k_choices, aes(color = Choice, size = Choice, shape = Choice), stroke = 1) + - scale_shape_manual(values=c(19, 1, 1))+ - scale_color_manual(values=c('#999999',"#56B1F7", "#E69F00")) + - scale_size_manual(values=c(1, 3, 5)) + - #geom_point(data = elbow, color = , size = 4, shape = 1) - theme(strip.background=element_blank(), - strip.text = element_text(colour = 'black'), - panel.grid.minor = element_blank()) + - xlab("K") - - return(bic_plot) -} - -#' Convert binary set names to long form with Sample_ID -getSetName <- function(binary_name, Sample_ID, collapse_string = ", \n") { - split_bin <- strsplit(as.character(binary_name), "") %>% - .[[1]] %>% - as.numeric() %>% - as.logical() - samples_present <- Sample_ID[split_bin] - set_name <- paste0(samples_present, collapse = collapse_string) - return(set_name) -} - -#' Plot cluster CCF posterior distributions -#' -#' @export -#' @import ggplot2 -#' @import tibble -#' @import dplyr -#' @import tidyr -#' @param w_chain MCMC chain of CCF values, which is the first item in the list returned by \code{clusterSep} -plotDensityCCF <- function(w_chain) { - w_chain <- w_chain %>% - mutate(Cluster = as.numeric(gsub("w\\[", "", sapply(w_chain$Parameter, function(x) strsplit(as.character(x), ",")[[1]][1])))) %>% - mutate(Sample = gsub("\\]", "", sapply(w_chain$Parameter, function(x) strsplit(as.character(x), ",")[[1]][2]))) - K <- max(as.numeric(w_chain$Cluster)) - S <- max(as.numeric(w_chain$Sample)) - suppressWarnings(print( - ggplot(w_chain, aes(x = value)) + - geom_density() + - facet_wrap(~Cluster + Sample, ncol = S, scales = "free_y") + - theme_light() + - ylab("Density") + xlab("CCF") + - theme(panel.grid.major = element_blank(), - panel.grid.minor = element_blank(), - strip.background = element_blank(), - #strip.text = element_text(colour = 'black'), - strip.text.x = element_blank(), - #axis.text.x=element_blank(), - #axis.ticks.x=element_blank(), - axis.text.y=element_blank(), - axis.ticks.y=element_blank()) + - scale_x_continuous(breaks=c(0, 0.5, 1)) - )) -} - -#' Plot cluster CCF posterior distributions as violin plots -#' -#' @export -#' @import ggplot2 -#' @import tibble -#' @import dplyr -#' @import tidyr -#' @importFrom stringr str_replace_all -#' @param w_chain MCMC chain of CCF values -#' @param z_chain (Optional) MCMC chain of mutation cluster assignment values. If provided, cluster names will show the number of mutations assigned in brackets -#' @param indata (Optional) List of input data items -plotCCFViolin <- function(w_chain, z_chain = NULL, indata = NULL) { - # process data - vdat <- violinProcessData(w_chain, indata) - - if (!is.null(z_chain)) { - num_muts_in_clusters <- estimateClusterAssignments(z_chain) %>% - group_by(value) %>% - summarize(num_muts = n()) %>% - ungroup() %>% - rename(cluster = value) - num_muts <- num_muts_in_clusters$num_muts[match(vdat$cluster_num, num_muts_in_clusters$cluster)] - new_cluster_labels <- paste0("Cluster ", - vdat$cluster_num, - " [", num_muts,"]") - vdat <- vdat %>% - mutate(cluster = factor(new_cluster_labels, unique(new_cluster_labels))) - } - - # plot violins - vplot <- plotViolin(vdat) - return(vplot) -} - -generateTiers <- function(w_mat, Sample_ID) { - clusters <- paste0("Cluster ", seq_len(nrow(w_mat))) - bin <- w_mat > 0 - samples <- apply(bin, 1, function(x) paste(Sample_ID[x], collapse = ",\n")) - tier <- rowSums(bin) - tiers <- tibble(cluster = clusters, - cluster_num = seq_len(nrow(w_mat)), - samples = samples, - tier = tier) - return(tiers) -} - -violinProcessData <- function(w_chain, indata = NULL) { - w_mat <- estimateCCFs(w_chain) - est_K <- nrow(w_mat) - - if (is.null(indata$Sample_ID)) { - sample_names <- paste0("Sample ", 1:ncol(w_mat)) - } else { - sample_names <- indata$Sample_ID - } - - vdat <- w_chain %>% - mutate(sample=stringr::str_replace_all(Parameter, "w\\[[:digit:]+,", ""), - sample=stringr::str_replace_all(sample, "\\]", ""), - cluster=stringr::str_replace_all(Parameter, "w\\[", ""), - cluster=stringr::str_replace_all(cluster, ",[:digit:]\\]", "")) %>% - mutate(sample=as.numeric(sample), - sample=sample_names[sample], - sample=factor(sample, sample_names), - cluster=as.numeric(cluster), - cluster=paste0("Cluster ", cluster), - cluster=factor(cluster, level=paste("Cluster", 1:est_K))) - - tiers <- generateTiers(w_mat, sample_names) - - vdat <- vdat %>% - mutate(cluster=as.character(cluster)) %>% - left_join(tiers, by="cluster") %>% - mutate(cluster=factor(cluster, tiers$cluster), - tier=factor(tier, sort(unique(tiers$tier)))) - return(vdat) -} - -plotViolin <- function(vdat) { - vplot <- ggplot(vdat, aes(sample, value)) + - geom_violin(aes(fill=tier), - alpha=0.6, - scale="width", - draw_quantiles=c(0.25, 0.5, 0.75), - color="white") + - geom_violin(fill="transparent", color="black", - scale="width", draw_quantiles=0.5) + - theme_bw(base_size=12) + - theme(strip.background=element_blank(), - axis.text.x=element_text(size=12), - panel.grid=element_blank(), - legend.pos="bottom") + - facet_wrap(~cluster, nrow=1) + - ylab("Posterior CCF") + xlab("") +ylim(c(0, 1)) + - guides(fill=guide_legend("Sample-presence")) + - theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) - return(vplot) -} - -#' Plot single tree -#' -#' @export -#' @param edges tibble of edges with columns edge, parent, child -#' @import tibble -#' @import dplyr -#' @import tidyr -#' @import ggplot2 -plotTree <- function(edges, palette=viridis::viridis) { - plotGraph(edgesToAmLong(edges), colorScheme(edges, palette)) -} - -#' generate colors for each vertice -#' @export -colorScheme <- function(edges, palette=viridis::viridis) { - v_sorted = sort(unique(c(edges$parent, edges$child))) - v_sorted = c(sort(as.integer(v_sorted[!v_sorted=='root'])), "root") - # root_idx <- which(v_sorted=="root") - colors <- c(palette(length(v_sorted)-1), "white") - v_color <- tibble(v_sorted, colors) - return(v_color) -} - -#' Plot ensemble tree -#' -#' @export -#' @param trees list of tibbles of edges, each with columns edge, parent, child -#' @import tibble -#' @import dplyr -#' @import tidyr -#' @import ggplot2 -plotEnsembleTree <- function(trees, palette=viridis::viridis) { - am_chain <- lapply(trees, edgesToAmLong) - post_am <- getPosteriorAmLong(am_chain) - plotPosteriorAmLong(post_am, colorScheme(trees[[1]], palette)) -} - -#' Plot CCF chain trace -#' -#' @export -#' @import ggplot2 -#' @import tibble -#' @import dplyr -#' @import tidyr -#' @param w_chain MCMC chain of CCF values, which is the first item in the list returned by \code{mergeSetChains} -plotChainsCCF <- function(w_chain) { - cluster <- strsplit(as.character(w_chain$Parameter), ",") %>% - sapply(., function(x) gsub("w\\[", "", x[1])) %>% - as.numeric - sample <- strsplit(as.character(w_chain$Parameter), ",") %>% - sapply(., function(x) gsub("\\]", "", x[2])) %>% - as.numeric - - w_chain <- w_chain %>% - mutate(Cluster = factor(paste0("Cluster ", cluster), - levels = paste0("Cluster ", sort(unique(cluster)))), - Sample = factor(paste0("Sample ", sample), - levels = paste0("Sample ", sort(unique(sample))))) - - ggplot(w_chain, aes(x = Iteration, y = value)) + - geom_line() + - theme_light() + - facet_grid(Cluster ~ Sample) + - ylab("Cancer Cell Fraction") -} - -#' Plot posterior predictive distribution for number of variant reads -#' -#' @export -#' @import ggplot2 -#' @import tibble -#' @import tidyr -#' @param ystar_chain MCMC chain of ystar values, which is the third item in the list returned by \code{clusterSep} -#' @param indata List of input data items -#' @param SampleID (Optional) Vector of sample IDs for labeling purposes. Same order as supplied as input data (e.g. indata$Sample_ID). If not provided, function will use the Sample_ID in indata -#' @param Mutation_ID (Optional) Vector of mutation IDs. If not provided, function will use the MutID in indata -plotPPD <- function(ystar_chain, indata, - Sample_ID = NULL, - Mutation_ID = NULL) { - I <- indata$I - if (is.null(Sample_ID)) Sample_ID <- indata$Sample_ID - if (is.null(Mutation_ID)) Mutation_ID <- indata$Mut_ID - - ppd.summaries <- ystar_chain %>% - group_by(Parameter) %>% - summarize(mean=mean(value), - median=median(value), - q1=quantile(value, 0.025), - q3=quantile(value, 0.975)) - - observed_y <- indata$y %>% - magrittr::set_colnames(Sample_ID) %>% - as_tibble() %>% - mutate(Mutation_index = 1:I) %>% - pivot_longer(cols = Sample_ID, - names_to = "Sample", - values_to = "observed_y") %>% - mutate(s = match(Sample, Sample_ID), - Parameter = paste0("ystar[", Mutation_index, ",", s, "]")) - - ppd.summaries2 <- ppd.summaries %>% - left_join(., observed_y, by = "Parameter") - points <- ppd.summaries2 %>% - select(Parameter, Mutation_index, Sample, observed_y, median) %>% - rename("Observed variant read count" = observed_y, - "Posterior median" = median) %>% - pivot_longer(cols = c("Observed variant read count", "Posterior median"), - names_to = "type", - values_to = "value") %>% - left_join(., ppd.summaries2, by = c("Parameter", - "Mutation_index", - "Sample")) - points_order <- ppd.summaries2 %>% - filter(Sample == Sample_ID[1]) %>% - arrange(observed_y) %>% - pull(Mutation_index) - - variant_gene_names <- Mutation_ID %>% - sapply(., function(x) strsplit(x, "_")[[1]][1]) %>% - as.character() - variant_gene_names_ordered <- variant_gene_names[points_order] - points2 <- points %>% - mutate(Mutation_index = factor(Mutation_index, - levels = points_order)) - - # plot - points3 <- points2 %>% - mutate(type=gsub("Observed variant read count", - "Observed variant\nread count", - type)) - colors <- setNames(c("gray40", "steelblue"), unique(points3$type)) - fill <- setNames(c("white", "steelblue"), unique(points3$type)) - points3 %>% - ggplot(aes(x=mean, y=Mutation_index, - xmin=q1, - xmax=q3)) + - geom_errorbar(color="gray") + - geom_point(aes(x = value, y = Mutation_index, color=type, fill=type), - size=2, pch=21) + - scale_y_discrete(labels = variant_gene_names_ordered, - breaks = points_order) + - theme_bw(base_size=15) + - theme(#axis.text.y=element_blank(), - axis.title.x=element_text(size=20), - axis.text.x=element_text(size=17), - strip.text=element_text(size=22), - panel.grid=element_blank(), - axis.ticks.y=element_blank(), - legend.text=element_text(size=18), - panel.background=element_rect(fill="white", - color="black"), - legend.pos="bottom", - strip.background=element_blank()) + - scale_color_manual(name="", - labels=names(colors), - values=colors) + - scale_fill_manual(name="", - labels=names(colors), - values=fill) + - xlab("Variant allele count") + - ylab("") + - facet_wrap(~Sample) + - guides(color=guide_legend(override.aes=list(size=3))) -} diff --git a/R/preprocessing.R b/R/preprocessing.R index e3f8cb4..a8ed103 100644 --- a/R/preprocessing.R +++ b/R/preprocessing.R @@ -1,10 +1,28 @@ -#' read input data file and store in PICTOGRAPH input format +#' read input data file and store in required format #' #' @export -#' @param input_file input data file; -importCSV <- function(inputFile, alt_reads_thresh = 6, vaf_thresh = 0.02) { - data <- read_csv(inputFile, show_col_types = FALSE) - data <- data %>% filter(alt_reads/total_reads>vaf_thresh, alt_reads>alt_reads_thresh) +#' @param mutation_file mutation data file; see inst/extdata/examples/*_snv.csv for examples +#' @param outputDir output directory for saving output data +importFiles <- function(mutation_file, + outputDir=NULL, + alt_reads_thresh = 0, # to be tested + vaf_thresh = 0 # to be tested + ) { + + if (is.null(outputDir)) { + outputDir = getwd() + } + + mutation_data = importMutationFileOnly(mutation_file, alt_reads_thresh, vaf_thresh) + mutation_data$is_cn <- c(rep(0, nrow(mutation_data$y))) + + return(mutation_data) +} + +#' import mutation file +importMutationFileOnly <- function(mutation_file, alt_reads_thresh = 0, vaf_thresh = 0) { + data <- read_csv(mutation_file, show_col_types = FALSE) + data <- data %>% filter(alt_reads/total_reads>=vaf_thresh, alt_reads>=alt_reads_thresh) output_data <- list() output_data$y <- as.matrix(data[c("mutation", "sample", "alt_reads")] %>% pivot_wider(names_from = sample, values_from = alt_reads, values_fill = 0)) @@ -28,169 +46,69 @@ importCSV <- function(inputFile, alt_reads_thresh = 6, vaf_thresh = 0.02) { colnames(output_data$n) = colname if (any((output_data$y - output_data$n) > 0)) { - warning("Total read count must be equal or bigger than alt read count. Please check input data before proceeding!") - stop() + stop("Total read count must be equal or bigger than alt read count. Please check input data before proceeding!") } if (any(output_data$n==0)) { - warning("Total read counts of 0 encoutered. Replaced 0 with mean total read count.") + print("Total read counts of 0 encoutered. Replaced 0 with mean total read count.") output_data$n[output_data$n==0] <- round(mean(output_data$n)) } - output_data$tcn <- as.matrix(data[c("mutation", "sample", "tumor_integer_copy_number")] %>% pivot_wider(names_from = sample, values_from = tumor_integer_copy_number, values_fill = 2)) - rownames(output_data$tcn) <- output_data$tcn[,'mutation'] - output_data$tcn <- output_data$tcn[,-1, drop=FALSE] - rowname = rownames(output_data$tcn) - colname = colnames(output_data$tcn) - output_data$tcn <- matrix(as.numeric(output_data$tcn), ncol = ncol(output_data$tcn)) - rownames(output_data$tcn) = rowname - colnames(output_data$tcn) = colname - - output_data$purity <- as.matrix(data[c("mutation", "sample", "purity")] %>% pivot_wider(names_from = sample, values_from = purity, values_fill = 0)) - rownames(output_data$purity) <- output_data$purity[,'mutation'] - output_data$purity <- output_data$purity[,-1, drop=FALSE] - rowname = rownames(output_data$purity) - colname = colnames(output_data$purity) - output_data$purity <- matrix(as.numeric(output_data$purity), ncol = ncol(output_data$purity)) - rownames(output_data$purity) = rowname - colnames(output_data$purity) = colname - output_data$purity = colSums(output_data$purity) / colSums(!!output_data$purity) - # if (ncol(output_data$purity) == 1) { - # if (length(unique(output_data$purity[,])) != 1) { - # warning("purity not consistent for the same sample; taking the mean purity") - # output_data$purity <- round(colMeans(output_data$purity), digit=2) - # } else { - # output_data$purity <- unique(output_data$purity[,]) - # } - # } else { - # if (nrow(unique(output_data$purity[,])) != 1) { - # warning("Purity not consistent for the same sample; taking the mean purity") - # output_data$purity <- round(colMeans(output_data$purity), digit=2) - # } else { - # output_data$purity <- unique(output_data$purity[,])[1,] - # } - # } + output_data$icn <- as.matrix(data[c("mutation", "sample", "tumor_integer_copy_number")] %>% pivot_wider(names_from = sample, values_from = tumor_integer_copy_number, values_fill = 2)) + rownames(output_data$icn) <- output_data$icn[,'mutation'] + output_data$icn <- output_data$icn[,2:ncol(output_data$icn)] + output_data$icn <- matrix(as.numeric(output_data$icn), ncol=ncol(output_data$y)) + output_data$icn <- apply(output_data$icn, 1, function(x) { + if (all(x == 2)) { + return(2) + } else { + return(mean(x[x != 2])) + } + }) + + output_data$cncf <- as.matrix(data[c("mutation", "sample", "cncf")] %>% pivot_wider(names_from = sample, values_from = cncf, values_fill = 0)) + rownames(output_data$cncf) <- output_data$cncf[,'mutation'] + output_data$cncf <- output_data$cncf[,-1,drop=FALSE] + rowname = rownames(output_data$cncf) + colname = colnames(output_data$cncf) + output_data$cncf <- matrix(as.numeric(output_data$cncf), ncol = ncol(output_data$cncf)) + rownames(output_data$cncf) = rowname + colnames(output_data$cncf) = colname output_data$S = ncol(output_data$y) output_data$I = nrow(output_data$y) - if ("multiplicity" %in% colnames(data)) { - output_data$m <- as.matrix(data[c("mutation", "sample", "multiplicity")] %>% pivot_wider(names_from = sample, values_from = multiplicity, values_fill = 1)) - rownames(output_data$m) <- output_data$m[,'mutation'] - output_data$m <- output_data$m[,-1, drop=FALSE] - rowname = rownames(output_data$m) - colname = colnames(output_data$m) - output_data$m <- matrix(as.numeric(output_data$m), ncol = ncol(output_data$m)) - rownames(output_data$m) = rowname - colnames(output_data$m) = colname - if(any(data$multiplicity == 0)){ - warning("Multiplicity of 0 encoutered in the input data file which may cause issue for jags model!") - } + output_data$tcn = output_data$icn * output_data$cncf + 2 * ( 1 - output_data$cncf) + + if ("major_integer_copy_number" %in% colnames(data)) { + output_data$mtp <- as.matrix(data[c("mutation", "sample", "major_integer_copy_number")] %>% pivot_wider(names_from = sample, values_from = major_integer_copy_number, values_fill = 1)) + rownames(output_data$mtp) <- output_data$mtp[,'mutation'] + output_data$mtp <- output_data$mtp[,2:ncol(output_data$mtp)] + output_data$mtp <- matrix(as.numeric(output_data$mtp), ncol=ncol(output_data$y)) + output_data$mtp <- apply(output_data$mtp, 1, function(x) { + if (all(x == 1)) { + return(1) + } else { + return(mean(x[x != 1])) + } + }) } else { - output_data$m <- estimateMultiplicityMatrix(output_data) - } - - # output_data$y[output_data$y / output_data$n < vaf_thresh] = 0 - # output_data$y[output_data$y < alt_reads_thresh] = 0 - - # output_data$n = output_data$n[rowSums(output_data$y) > 0,] - # output_data$tcn = output_data$tcn[rowSums(output_data$y) > 0,] - # output_data$m = output_data$m[rowSums(output_data$y) > 0,] - # output_data$MutID = output_data$MutID[rowSums(output_data$y) > 0] - # output_data$y = output_data$y[rowSums(output_data$y) > 0,] - # output_data$I = nrow(output_data$y) - return(output_data) -} - -get95CI <- function(y, n) { - test <- prop.test(y, n, correct = T) - ci_lower <- test$conf.int[1] - ci_upper <- test$conf.int[2] - return(c(ci_lower, ci_upper)) -} - -calcmC <- function(VAF, purity, tcn) { - mC <- (VAF * (purity * tcn + (1-purity) * 2)) / purity - return(mC) -} - -getmCCI <- function(y, n, purity, tcn) { - VAF_CI <- get95CI(y, n) - lower_mC <- calcmC(VAF_CI[1], purity, tcn) - upper_mC <- calcmC(VAF_CI[2], purity, tcn) - return(c(lower_mC, upper_mC)) -} - -assignMultiplicity <- function(lower_mC, upper_mC, tcn) { - # (1) If the CI for mC overlaps an integer value, - # that value is estimated to indicate the multiplicity - # of the mutation and the mutation is clonal (C=1) - if (ceiling(lower_mC) == floor(upper_mC)) { - test_m <- ceiling(lower_mC) - # cap m at tcn - if (test_m > tcn) return (tcn) - return(test_m) + output_data$mtp <- estimateMultiplicityMatrix(output_data)[,1] } - # (2) If the upper bound of the CI for mC is below 1, - # the multiplicity is set to 1, and the mutation is - # subclonal, unless the resulting estimate for C is - # within a tolerance threshold (0.25) of 1 - if (upper_mC < 1) return(1) - - # (3) If the CI for mC is above 1 and does not overlap - # any integer values, multiplicity is greater than 1 - # and m is set such that the confidence interval for C - # falls within the expected intervals of [0,1] - if (lower_mC > 1) { - keep_going <- TRUE - m <- 1 - while(keep_going) { - # cap m at tcn - if (m == tcn) return(m) - - m <- m + 1 - C_lower <- lower_mC / m - C_upper <- upper_mC / m - if (C_lower > 0 & C_upper < 1) { - keep_going <- FALSE - } - } - return(m) + if ("purity" %in% colnames(data)) { + output_data$purity <- as.matrix(data[c("mutation", "sample", "purity")] %>% pivot_wider(names_from = sample, values_from = purity, values_fill = 0)) + rownames(output_data$purity) <- output_data$purity[,'mutation'] + output_data$purity <- output_data$purity[,-1, drop=FALSE] + rowname = rownames(output_data$purity) + colname = colnames(output_data$purity) + output_data$purity <- matrix(as.numeric(output_data$purity), ncol = ncol(output_data$purity)) + rownames(output_data$purity) = rowname + colnames(output_data$purity) = colname + output_data$purity = colSums(output_data$purity) / colSums(!!output_data$purity) + } else { + output_data$purity = rep(0.8, ncol(output_data$y)) } - # CI for mC spans more than 1 integer value including 1 - if (lower_mC <= 1 && upper_mC >= 1) return(1) - - return(NA) -} - -estimateMultiplicity <- function(y, n, purity, tcn) { - # if no variant reads, assigning multiplicity of 1 - if (y == 0) return(1) - mC_CI <- getmCCI(y, n, purity, tcn) - m <- max(1, assignMultiplicity(mC_CI[1], mC_CI[2], tcn)) - return(m) + return(output_data) } - -estimateMultiplicityMatrix <- function(data) { - Y = data$y - N = data$n - Tcn = data$tcn - Purity = data$purity - I = data$I - S = data$S - M = matrix(100, I, S) - - for (i in 1:I) { - for (s in 1:S) { - y = Y[[i, s]] - n = N[[i, s]] - tcn = Tcn[[i, s]] - purity = Purity[s] - m = estimateMultiplicity(y, n, purity, tcn) - M[i, s] = m - } - } - return(M) -} \ No newline at end of file diff --git a/R/sample-presence.R b/R/sample-presence.R new file mode 100644 index 0000000..e004921 --- /dev/null +++ b/R/sample-presence.R @@ -0,0 +1,30 @@ +#' sample presence for MCMC +#' @export +separateMutationsBySamplePresence <- function(input_data) { + # returns list of lists -- + # each item of list contains input data for a mutation sample presence set + # original mutation indices from input_data are recorded in $mutation_indices + pres <- ifelse(input_data$y > 0 & !input_data$is_cn, 1, 0) + ifelse(input_data$is_cn & input_data$tcn != 2, 1, 0) + + pat <- apply(pres, 1, function(x) paste0(x, collapse="")) + types <- sort(names(table(pat)), decreasing=TRUE) + + type_indices <- lapply(types, function(x) which(pat == x)) + names(type_indices) <- types + + sep_list <- list() + for (t in seq_len(length(types))) { + sep_list[[types[t]]] <- list(pattern = types[t], + mutation_indices = type_indices[[types[t]]], + y = input_data$y[type_indices[[types[t]]], ,drop=FALSE], + n = input_data$n[type_indices[[types[t]]], ,drop=FALSE], + tcn = input_data$tcn[type_indices[[types[t]]], ,drop=FALSE], + is_cn = input_data$is_cn[type_indices[[types[t]]]], + cncf = input_data$cncf[type_indices[[types[t]]], ,drop=FALSE], + mtp = input_data$mtp[type_indices[[types[t]]]], + icn = input_data$icn[type_indices[[types[t]]]], + MutID = input_data$MutID[type_indices[[types[t]]]], + purity = input_data$purity) + } + return(sep_list) +} \ No newline at end of file diff --git a/R/simulation.R b/R/simulation.R new file mode 100644 index 0000000..718ce77 --- /dev/null +++ b/R/simulation.R @@ -0,0 +1,510 @@ +simulation_type3_jags <- function(eta=0.8, S=3, K=2, I=50, num_cn=10, seed=12345, depth=50) { + set.seed(12345) + eta <- 0.85 + S <- 2 + K <- 3 + I <- 70 + num_cn=10 + depth = 50 + mcf <- ifelse(runif(K*S, 0, 1) < eta, 1, 0) * + rbeta(K*S, 1, 1) %>% + matrix(K, S) + pi <- rdirichlet(1, rep(1, K)) + z <- replicate(I, sample(seq_len(K), size=1, + replace=TRUE, prob=pi)) + ## indicator for whether mutation is in copy-altered or LOH region + h <- rep(c(0, 1), c(I-num_cn, num_cn)) + + ## integer total copy number in the tumor + icn <- rep(NA, I) + icn[ h==0 ] <- 2 + icn[ h==1 ] <- rpois(sum(h==1), 2) + + ## m is the number of copies of the chromosome for an allele/variant + m <- rep(NA, I) + m[ h == 0 ] <- 1 + m[ h == 1 ] <- pmin(icn[h==1], rpois(sum(h==1), 1)) + + indices <- sample(1:2, sum(h==1), replace = TRUE) + m[ h==1 ] <- ifelse(indices == 1, m[h==1], icn[h==1]-m[h==1]) + + ## tcn is the fraction copy number of mixture of normal and tumor cells + epsilon <- 0.1 + tcn <- matrix(2, I, S) + for (i in 1:I) { + for (s in 1:S) { + if (h[i] == 1) { + tcn[i,s] <- rnorm(1, icn[i] * mcf[z[i], s] + 2 * (1-mcf[z[i], s]), epsilon) + } + } + } + + ## indicator for whether a SSM is diploid + ## cncf if the fraction of copy number alteration that overlaps a SSM + q <- rep(0, I) + for (i in 1:I) { + if (i <= (I-num_cn)) { + prob <- runif(1) + if (prob > 0.9) { + q[i] <- sample((I-num_cn+1):I,1) + tcn[i,] <- tcn[q[i],] + icn[i] <- icn[q[i]] + m[i] <- sample(c(m[q[i]], icn[q[i]]-m[q[i]]),1) + } else { + q[i] <- i + } + } else { + q[i] <- i + } + } + + vaf <- matrix(NA, I, S) + + for(i in seq_len(I)){ + for(s in seq_len(S)){ + if(h[i] == 0){ + vaf[i,s] <- max(0, (mcf[z[i], s] + (m[i] - 1) * mcf[z[q[i]],s]) / tcn[i,s]) + vaf[i,s] <- min(1, vaf[i,s]) + } else{ + vaf[i,s] <- max(0, (mcf[z[i], s]*m[i] + 1 - mcf[z[i], s]) / tcn[i,s]) + vaf[i,s] <- min(1, vaf[i,s]) + } + } + } + + n <- matrix(rpois(n = I*S, lambda = depth), I, S) + y <- matrix(rbinom(n=I*S, size=as.numeric(n), prob=as.numeric(vaf)), I, S) + + model <- jags.model(file = "~/JHU/scripts/R/pictograph2/inst/extdata/type3.jags", + data = list(K=K, S=S, I=I, y=y, is_cn=h, n=n, tcn=tcn, q=q), + inits = list(".RNG.name" = "base::Wichmann-Hill", + ".RNG.seed" = 123)) + + update(model, n.iter = 1000) + + Nrep = 10000 + + posterior_sample <- coda.samples(model, + variable.names = c("mcf", "z", "icn", "m"), + n.iter = Nrep) + # plot(posterior_sample[,1]) + # summary(posterior_sample) + + # get.parameter.chain("z", ggmcmc::ggs(posterior_sample)) + + chains <- formatChains(posterior_sample) + + mcf + mcf_est <- writeClusterMCFsTable(chains$mcf_chain) + mcf_est + plotChainsCCF(chains$mcf_chain) + + # which(z==1) + # which(z==2) + # z + z_est <- writeClusterAssignmentsTable(chains$z_chain) + # z_est %>% filter(Cluster==2) + z_est <- z_est %>% mutate(idx = as.numeric(gsub("Mut(\\d+)","\\1", Mut_ID))) %>% arrange(idx) + # z_est$Cluster + table(z, z_est$Cluster) + + m + m_est <- writeMultiplicityTable(chains$m_chain) + m_est$Multiplicity + table(m, m_est$Multiplicity) + # m_tmp <- get.parameter.chain("m", ggmcmc::ggs(posterior_sample)) + # m_tmp <- m_tmp %>% filter(Parameter=="m[46]") + + icn + icn_est <- writeIcnTable(chains$icn_chain) + icn_est$icn + table(icn, icn_est$icn) + # icn_tmp <- get.parameter.chain("icn", ggmcmc::ggs(posterior_sample)) + # icn_tmp <- icn_tmp %>% filter(Parameter=="icn[50]") + # icn_tmp <- chains$icn_chain %>% group_by(Parameter) %>% reframe(val=round(mean(value))) %>% mutate(idx = as.numeric(gsub("icn\\[(\\d+)\\]","\\1", Parameter))) %>% arrange(idx) +} + +simulation_type1_jags <- function(ets=0.8, S=3, K=2, I=50, num_cn=10, seed=12345) { + set.seed(123) + eta <- 0.8 + S <- 4 + K <- 3 + I <- 70 + num_cn=10 + mcf <- ifelse(runif(K*S, 0, 1) < eta, 1, 0) * + rbeta(K*S, 1, 1) %>% + matrix(K, S) + pi <- rdirichlet(1, rep(1, K)) + z <- replicate(I, sample(seq_len(K), size=1, + replace=TRUE, prob=pi)) + ## indicator for whether mutation is a copy number alteration + h <- rep(c(0, 1), c(I-num_cn, num_cn)) + ## integer total copy number in the tumor + icn <- rep(NA, I) + icn[ h==0 ] <- 2 + # multinom <- function(p){ + # sample(0:(p-1), sum(h==1), replace=TRUE, + # prob=rep(1/p, p)) + # } + # icn[ h==1 ] <- multinom(8) + icn[ h==1 ] <- rpois(sum(h==1), 2) + + ## m is the number of copies of the chromosome for an allele/variant + m <- rep(NA, I) + m[ h == 0 ] <- 1 + # m[ h == 1 ] <- pmin(icn[h==1], multinom(4)) + m[ h == 1 ] <- pmin(icn[h==1], rpois(sum(h==1), 1)) + + indices <- sample(1:2, sum(h==1), replace = TRUE) + m[ h==1 ] <- ifelse(indices == 1, m[h==1], icn[h==1]-m[h==1]) + + vaf <- matrix(NA, I, S) + tcn <- matrix(2, I, S) + epsilon <- 0.1 + for(i in seq_len(I)){ + for(s in seq_len(S)){ + if(h[i] == 0){ + vaf[i, s] <- mcf[z[i], s]/tcn[i,s] + } else{ + tcn[i,s] <- max(0.01, rnorm(1, icn[i] * mcf[z[i], s] + 2 * (1-mcf[z[i], s]), epsilon)) + vaf[i,s] <- (mcf[z[i], s]*m[i] + 1 - mcf[z[i], s])/tcn[i,s] + } + } + } + + n <- matrix(rpois(n = I*S, lambda = 50), I, S) + y <- matrix(rbinom(n=I*S, size=as.numeric(n), prob=as.numeric(vaf)), I, S) + + model <- jags.model(file = "~/JHU/scripts/R/pictograph2/inst/extdata/type1.jags", + data = list(K=K, S=S, I=I, y=y, is_cn=h, n=n, tcn=tcn), + inits = list(".RNG.name" = "base::Wichmann-Hill", + ".RNG.seed" = 123)) + + update(model, n.iter = 1000) + + Nrep = 10000 + + posterior_sample <- coda.samples(model, + variable.names = c("mcf", "z", "icn", "m"), + n.iter = Nrep) + # plot(posterior_sample[,1]) + # summary(posterior_sample) + + # get.parameter.chain("z", ggmcmc::ggs(posterior_sample)) + + chains <- formatChains(posterior_sample) + + mcf + mcf_est <- writeClusterMCFsTable(chains$mcf_chain) + mcf_est + plotChainsMCF(chains$mcf_chain) + + # which(z==1) + # which(z==2) + # z + z_est <- writeClusterAssignmentsTable(chains$z_chain) + # z_est %>% filter(Cluster==2) + z_est <- z_est %>% mutate(idx = as.numeric(gsub("Mut(\\d+)","\\1", Mut_ID))) %>% arrange(idx) + # z_est$Cluster + table(z, z_est$Cluster) + + m + m_est <- writeMultiplicityTable(chains$m_chain) + m_est$Multiplicity + table(m, m_est$Multiplicity) + # m_tmp <- get.parameter.chain("m", ggmcmc::ggs(posterior_sample)) + # m_tmp <- m_tmp %>% filter(Parameter=="m[46]") + + icn + icn_est <- writeIcnTable(chains$icn_chain) + icn_est$icn + table(icn, icn_est$icn) + # icn_tmp <- get.parameter.chain("icn", ggmcmc::ggs(posterior_sample)) + # icn_tmp <- icn_tmp %>% filter(Parameter=="icn[50]") + # icn_tmp <- chains$icn_chain %>% group_by(Parameter) %>% reframe(val=round(mean(value))) %>% mutate(idx = as.numeric(gsub("icn\\[(\\d+)\\]","\\1", Parameter))) %>% arrange(idx) +} + +simulation_type2_jags <- function(ets=0.8, S=3, K=2, I=50, num_cn=10, seed=12345) { + set.seed(1234) + eta <- 0.8 + S <- 3 + K <- 2 + I <- 50 + num_cn <- 10 + mcf <- ifelse(runif(K*S, 0, 1) < eta, 1, 0) * + rbeta(K*S, 1, 1) %>% + matrix(K, S) + pi <- rdirichlet(1, rep(1, K)) + z <- replicate(I, sample(seq_len(K), size=1, + replace=TRUE, prob=pi)) + ## indicator for whether mutation is in copy-altered or LOH region + h <- rep(c(0, 1), c(I-num_cn, num_cn)) + + ## integer total copy number in the tumor + icn <- rep(NA, I) + icn[ h==0 ] <- 2 + icn[ h==1 ] <- rpois(sum(h==1), 2) + + ## m is the number of copies of the chromosome for an allele/variant + m <- rep(NA, I) + m[ h == 0 ] <- 1 + m[ h == 1 ] <- pmin(icn[h==1], rpois(sum(h==1), 1)) + + indices <- sample(1:2, sum(h==1), replace = TRUE) + m[ h==1 ] <- ifelse(indices == 1, m[h==1], icn[h==1]-m[h==1]) + + + ## tcn is the fraction copy number of mixture of normal and tumor cells + epsilon <- 0.1 + tcn <- matrix(2, I, S) + for (i in 1:I) { + for (s in 1:S) { + if (h[i] == 1) { + tcn[i,s] <- rnorm(1, icn[i] * mcf[z[i], s] + 2 * (1-mcf[z[i], s]), epsilon) + } + } + } + + ## indicator for whether a SSM is diploid + ## cncf if the fraction of copy number alteration that overlaps a SSM + q <- rep(0, I) + cncf <- matrix(0, I, S) + for (i in 1:I) { + if (i <= (I-num_cn)) { + prob <- runif(1) + if (prob > 0.9) { + q[i] <- sample((I-num_cn+1):I,1) + if (icn[q[i]]!=0) { + cncf[i,] <- mcf[z[q[i]],] + icn[i] <- icn[q[i]] + m[i] <- sample(c(m[q[i]], icn[q[i]]-m[q[i]]),1) + tcn[i,] <- tcn[q[i],] + } else { + q[i] <- 0 + } + } + } else { + cncf[i,] <- mcf[z[i],] + } + } + + vaf <- matrix(NA, I, S) + for(i in seq_len(I)){ + for(s in seq_len(S)){ + if(h[i] == 0){ + vaf[i, s] <- max(0, (mcf[z[i], s] + (m[i] - 1) * cncf[i,s]) / tcn[i,s]) + } else{ + vaf[i,s] <- max(0, (mcf[z[i], s]*m[i] + 1 - mcf[z[i], s]) / tcn[i,s]) + } + } + } + + n <- matrix(rpois(n = I*S, lambda = 50), I, S) + y <- matrix(rbinom(n=I*S, size=as.numeric(n), prob=as.numeric(vaf)), I, S) + + model <- jags.model(file = "~/JHU/scripts/R/pictograph2/inst/extdata/type2.jags", + data = list(K=K, S=S, I=I, y=y, is_cn=h, n=n, tcn=tcn, mtp=m, cncf=cncf, icn=icn), + inits = list(".RNG.name" = "base::Wichmann-Hill", + ".RNG.seed" = 123)) + + update(model, n.iter = 1000) + + Nrep = 10000 + + posterior_sample <- coda.samples(model, + variable.names = c("mcf", "z", "icn", "m"), + n.iter = Nrep) + # plot(posterior_sample[,1]) + # summary(posterior_sample) + + # get.parameter.chain("z", ggmcmc::ggs(posterior_sample)) + + chains <- formatChains(posterior_sample) + + mcf + mcf_est <- writeClusterMCFsTable(chains$mcf_chain) + mcf_est + plotChainsCCF(chains$mcf_chain) + + # which(z==1) + # which(z==2) + # z + z_est <- writeClusterAssignmentsTable(chains$z_chain) + # z_est %>% filter(Cluster==2) + z_est <- z_est %>% mutate(idx = as.numeric(gsub("Mut(\\d+)","\\1", Mut_ID))) %>% arrange(idx) + # z_est$Cluster + table(z, z_est$Cluster) + + m + m_est <- writeMultiplicityTable(chains$m_chain) + m_est$Multiplicity + table(m, m_est$Multiplicity) + # m_tmp <- get.parameter.chain("m", ggmcmc::ggs(posterior_sample)) + # m_tmp <- m_tmp %>% filter(Parameter=="m[46]") + + icn + icn_est <- writeIcnTable(chains$icn_chain) + icn_est$icn + table(icn, icn_est$icn) + # icn_tmp <- get.parameter.chain("icn", ggmcmc::ggs(posterior_sample)) + # icn_tmp <- icn_tmp %>% filter(Parameter=="icn[50]") + # icn_tmp <- chains$icn_chain %>% group_by(Parameter) %>% reframe(val=round(mean(value))) %>% mutate(idx = as.numeric(gsub("icn\\[(\\d+)\\]","\\1", Parameter))) %>% arrange(idx) +} + + +simulation <- function(mcf=0, icn=2, minor_cn=1, depth=30, num_SNV=30, seed=NULL, normal_prop=0) { + # mcf=0.5 + # icn=4 + # minor_cn=1 + # depth=100 + # num_SNV=100 + # normal_prop=0 # proportion of SNV is actually from CN-neutral region + # seed=123 + + if (!is.null(seed)) { + set.seed(seed) + } + + tcn = 2 * (1-mcf) + icn * mcf + + # if a proportion of SNVs on CNA is actually on CN-neutral region + num_neutral = round(num_SNV * normal_prop) + SNV_depth_neutral = rpois(n = num_neutral, lambda = depth) + SNV_alt_neutral = numeric(num_neutral) + for (i in seq_len(num_neutral)) { + SNV_alt_neutral[i] <- rbinom(n=1, size=SNV_depth_neutral[i], prob=0.5) + } + + vaf_neutral = SNV_alt_neutral / SNV_depth_neutral + + # Actual tumor proportion + num_SNV = num_SNV - num_neutral + + # generate depth for each SNV + depth_total = rpois(n = num_SNV, lambda = depth * tcn / 2) + + # assign each SNV to one copy + SNV_assignment = sample(c(1, 2), size = num_SNV, replace = TRUE) # which segment + + # generate depth for germline + SNV_depth_germline = numeric(num_SNV) + for (i in seq_len(num_SNV)) { + SNV_depth_germline[i] <- rbinom(n=1, size=depth_total[i], prob=2 * (1-mcf)/tcn) + } + + SNV_alt_germline = numeric(num_SNV) + for (i in seq_len(num_SNV)) { + SNV_alt_germline[i] <- rbinom(n=1, size=SNV_depth_germline[i], prob=0.5) + } + + vaf_germline <- SNV_alt_germline/SNV_depth_germline + + # generate depth for tumor + SNV_depth_tumor = depth_total - SNV_depth_germline + SNV_alt_tumor = numeric(num_SNV) + for (i in seq_len(num_SNV)) { + if (icn == 0) { + SNV_alt_tumor[i] <- 0 + } else { + tumor_vaf = minor_cn / icn + if (SNV_assignment[i] == 1) { + tumor_vaf = 1 - (minor_cn / icn) + } + SNV_alt_tumor[i] <- rbinom(n=1, size=SNV_depth_tumor[i], prob=tumor_vaf) + } + } + + # vaf_tumor <- ifelse(SNV_depth_tumor==0, 0, SNV_alt_tumor/SNV_depth_tumor) + vaf_tumor = c((SNV_alt_germline + SNV_alt_tumor) / depth_total, vaf_neutral) + # return(vaf_neutral) + title = paste("mcf: ", mcf, ", num_SNV: ", num_SNV, ", depth: ", depth, ", \nICN: ", icn, ", minor CN: ", minor_cn, + ", true_prop: ", 1-normal_prop, ", unimodal: ", is.unimodal(vaf_tumor), sep="") + plot(density(vaf_tumor), xlim=c(0,1), main = title, cex.main=0.9) + + # plot(density(vaf_tumor), xlim=c(0,1), main = "VAF_tumor") + # qqnorm(vaf_tumor, main = "tumor") + # qqline(vaf_tumor, col="grey") + + is.unimodal(vaf_tumor) + # return(um) +} + +simulation_normal <- function(depth=30, num_SNV=30) { + + # generate depth for each SNV + SNV_depth = rpois(n = num_SNV, lambda = depth) + + # generate alt read counts for each SNV + SNV_alt = numeric(num_SNV) + for (i in seq_len(num_SNV)) { + SNV_alt[i] <- rbinom(n=1, size=SNV_depth[i], prob=0.5) + } + + # calculate VAF for all SNVs + vaf <- SNV_alt/SNV_depth + + # plot the SNVs + title = paste("num_SNV: ", num_SNV, ", depth: ", depth, ", unimodal: ", is.unimodal(vaf), sep="") + plot(density(vaf), xlim=c(0,1), main = title) +} + +simulation_CNA_germline_SNV <- function(mcf=0.9, icn=2, minor_cn=1, depth=30, num_SNV=30, seed=NULL, normal_prop=1) { + + if (!is.null(seed)) { + set.seed(seed) + } + + tcn = 2 * (1-mcf) + icn * mcf + + # if a proportion of SNVs on CNA is actually on CN-neutral region + num_neutral = round(num_SNV * normal_prop) + SNV_depth_neutral = rpois(n = num_neutral, lambda = depth) + SNV_alt_neutral = numeric(num_neutral) + for (i in seq_len(num_neutral)) { + SNV_alt_neutral[i] <- rbinom(n=1, size=SNV_depth_neutral[i], prob=0.5) + } + + vaf_neutral = SNV_alt_neutral / SNV_depth_neutral + + # Actual tumor proportion + num_SNV = num_SNV - num_neutral + + # generate depth for each SNV + depth_total = rpois(n = num_SNV, lambda = depth * tcn / 2) + + # assign each SNV to one copy + SNV_assignment = sample(c(1, 2), size = num_SNV, replace = TRUE) # which segment + + # generate depth for germline + SNV_depth_germline = numeric(num_SNV) + for (i in seq_len(num_SNV)) { + SNV_depth_germline[i] <- rbinom(n=1, size=depth_total[i], prob=2 * (1-mcf)/tcn) + } + + SNV_alt_germline = numeric(num_SNV) + for (i in seq_len(num_SNV)) { + SNV_alt_germline[i] <- rbinom(n=1, size=SNV_depth_germline[i], prob=0.5) + } + + vaf_germline <- SNV_alt_germline/SNV_depth_germline + + # generate depth for tumor + SNV_depth_tumor = depth_total - SNV_depth_germline + SNV_alt_tumor = numeric(num_SNV) + for (i in seq_len(num_SNV)) { + if (icn == 0) { + SNV_alt_tumor[i] <- 0 + } else { + tumor_vaf = minor_cn / icn + if (SNV_assignment[i] == 1) { + tumor_vaf = 1 - (minor_cn / icn) + } + SNV_alt_tumor[i] <- rbinom(n=1, size=SNV_depth_tumor[i], prob=tumor_vaf) + } + } + + vaf_tumor = c((SNV_alt_germline + SNV_alt_tumor) / depth_total, vaf_neutral) + + is.unimodal(vaf_tumor) +} + diff --git a/R/subclone-proportion.R b/R/subclone-proportion.R new file mode 100644 index 0000000..883da99 --- /dev/null +++ b/R/subclone-proportion.R @@ -0,0 +1,113 @@ +#' Calculate proportions of subclones in each sample (assumes CCFs comply with lineage precedence and sum condition) +#' +#' @param w_mat Matrix of CCF estimates (from \code{estimateCCFs}) +#' @param tree_edges Tibble of tree edges with columns edge, parent, and child +#' @export +calcSubcloneProportions <- function(w_mat, tree_edges) { + K <- nrow(w_mat) + S <- ncol(w_mat) + subclone_props <- matrix(NA, nrow = K, ncol = S) + + for (i in seq_len(nrow(w_mat))) { + children <- tree_edges %>% + filter(parent == as.character(i)) %>% + pull(child) %>% + as.numeric() + + if (length(children) == 1) { + children_ccfs <- w_mat[children, ] + } else if (length(children) > 1) { + children_ccfs <- w_mat[children, ,drop=FALSE] %>% + colSums + } else { + children_ccfs <- rep(0, ncol(w_mat)) + } + + subclone_props[i, ] <- w_mat[i, ] - children_ccfs + } + + # normalize subclone_props matrix so the props add up to 1 + subclone_props[subclone_props < 0] = 0 + subclone_props = round(t(t(subclone_props) / colSums(subclone_props)),digit = 3) + + return(subclone_props) +} + +#' Plot pie charts for subclone proportions in each sample +#' +#' @param subclone_props matrix of subclone proportions (returned from \code{calcSubcloneProportions}) +#' @param sample_names (Optional) Vector of sample names. Should be in the order of columns of subclone_props +#' @export +plotSubclonePie <- function(subclone_props, palette=viridis::viridis, sample_names = NULL, title_size=16, legend_size=10) { + if (is.null(sample_names)) sample_names <- paste0("Sample ", 1:ncol(subclone_props)) + props_tb <- subclone_props %>% + magrittr::set_colnames(sample_names) %>% + as_tibble() %>% + mutate(Subclone = factor(paste0("Clone ", 1:nrow(.)), + levels = paste0("Clone ", 1:nrow(.)))) %>% + pivot_longer(cols = sample_names, + names_to = "Sample", + values_to = "Proportion") + + clone_colors <- palette(nrow(subclone_props)) + ggplot(props_tb, aes(x="", y=Proportion, fill = Subclone)) + + geom_bar(stat="identity", width=1, color="white") + + coord_polar("y", start=0) + + scale_fill_manual(values = clone_colors, drop = F) + + theme_void() + + theme(legend.position = "bottom") + + theme(legend.text = element_text(size=legend_size), legend.title = element_text(size=legend_size)) + + facet_wrap(~Sample) + + theme(strip.text.x = element_text(size=title_size)) + +} + +#' Plot subclone proportions in each sample as stacked bar chart +#' +#' @param subclone_props matrix of subclone proportions (returned from \code{calcSubcloneProportions}) +#' @param sample_names (Optional) Vector of sample names. Should be in the order of columns of subclone_props +#' @param label_cluster (Default FALSE) Whether to add cluster label to text on stacked bar +#' @export +plotSubcloneBar <- function(subclone_props, palette=viridis::viridis, sample_names = NULL, label_cluster = FALSE) { + if (is.null(sample_names)) { + sample_names <- paste0("Sample ", 1:ncol(subclone_props)) + } + + clone_colors <- palette(nrow(subclone_props)) + color_half <- nrow(subclone_props) / 2 + color_half_vec <- factor(ifelse(1:nrow(subclone_props) < color_half, "white", "black"), + c("white", "black")) + + props_tb <- subclone_props %>% + magrittr::set_colnames(sample_names) %>% + as_tibble %>% + mutate(Clone = as.factor(1:nrow(.)), + text_color = color_half_vec) %>% + pivot_longer(cols = all_of(sample_names), + names_to = "Sample", + values_to = "Proportion of Tumor") + if (label_cluster) { + text_label <- paste0("Clone ", props_tb$Clone, ": ", props_tb$`Proportion of Tumor`) + props_tb <- props_tb %>% + mutate(text_label = text_label) + } else { + props_tb <- props_tb %>% + mutate(text_label = `Proportion of Tumor`) + } + + stacked_bar <- ggplot(props_tb, aes(x = Sample, y = `Proportion of Tumor`, fill = Clone)) + + theme_light() + + geom_bar(stat = "identity") + + scale_fill_manual(values = clone_colors) + + geom_text(data = subset(props_tb, `Proportion of Tumor` != 0), + aes(label = text_label, colour = text_color), + size = 4, position = position_stack(vjust = 0.5)) + + xlab("") + + scale_x_discrete(guide = guide_axis(angle = 0)) + + scale_color_manual(values = c("white", "black"), guide = "none") + + ylim(0,1.05) + + theme(axis.text=element_text(size=12), axis.title=element_text(size=14,face="bold")) + + theme(legend.text = element_text(size=12), legend.title = element_text(size=12)) + + return(stacked_bar) +} \ No newline at end of file diff --git a/R/subclone-proportions.R b/R/subclone-proportions.R deleted file mode 100644 index a97a431..0000000 --- a/R/subclone-proportions.R +++ /dev/null @@ -1,308 +0,0 @@ -#' Calculate proportions of subclones in each sample -#' -#' @param w_mat Matrix of CCF estimates (from \code{estimateCCFs}) -#' @param tree_edges Tibble of tree edges with columns edge, parent, and child -#' @import dplyr -#' @export -calcSubcloneProportions2 <- function(w_mat, tree_edges) { - K <- nrow(w_mat) - S <- ncol(w_mat) - subclone_props <- matrix(NA, nrow = K, ncol = S) - - leaves <- seq_len(K)[sapply(as.character(seq_len(K)), function(x) isLeaf(x, tree_edges))] - - # subclone proportions of leaves is just CCFs - for (leaf in leaves) { - subclone_props[leaf, ] <- w_mat[leaf, ] - } - - nodes_left <- as.character(seq_len(K))[sapply(as.character(seq_len(K)), function(x) !isLeaf(x, tree_edges))] - new_children <- leaves - - # calculate proportions of other subclones (bottom up) - while(length(nodes_left) > 0) { - temp_nodes <- tree_edges %>% - filter(child %in% new_children, parent != "root") %>% - pull(parent) %>% - unique() - - # check if nodes are branch points - for (node in temp_nodes) { - if (isBranchPoint(node, tree_edges)) { - - branch_children <- tree_edges %>% - filter(parent == node) %>% - pull(child) - - # only add ccf if all children have been added - if (any(branch_children %in% as.character(nodes_left))) { - temp_nodes <- temp_nodes[-which(temp_nodes == node)] - next - } - - node_ccf <- w_mat[as.numeric(node), ] - ##sum_child_ccfs <- colSums(w_mat[as.numeric(branch_children), ]) - # sum up proportions for all children (all generations) - temp_child_prop <- lapply(branch_children, - function(x) calcDescendantProps(x, tree_edges, subclone_props) + - subclone_props[as.numeric(x), ]) %>% - unlist() %>% - matrix(., nrow = length(branch_children), byrow = T) %>% - colSums() - - subclone_props[as.numeric(node), ] <- sapply(node_ccf - temp_child_prop, function(x) max(x, 0)) - nodes_left <- nodes_left[-which(nodes_left == node)] - - } else { - # add ccf like normal - - # sum up proportions for all children (all generations) - temp_child_prop <- calcDescendantProps(node, tree_edges, subclone_props) - - # proportion is node ccf - sum(all children proportions) - subclone_props[as.numeric(node), ] <- sapply(w_mat[as.numeric(node), ] - temp_child_prop, function(x) max(x, 0)) - nodes_left <- nodes_left[-which(nodes_left == node)] - } - } - new_children <- temp_nodes - } - - # @TODO normalize proportions -- columns should be no more than 1 - subclone_props <- normalizeProps(subclone_props) - - return(subclone_props) -} - -normalizeProps <- function(subclone_props) { - props_sums <- colSums(subclone_props) - if (all(props_sums == 1)) return(subclone_props) - - for (s in seq_len(ncol(subclone_props))) { - if (props_sums[s] != 1) { - new_col <- round(subclone_props[, s] / props_sums[s], digits = 3) - subclone_props[, s] <- new_col - } - } - return(subclone_props) -} - -calcDescendantProps <- function(node, tree_edges, subclone_props) { - temp_child <- tree_edges %>% - filter(parent == node) %>% - pull(child) - temp_child_prop <- subclone_props[as.numeric(temp_child), ] - while(!isLeaf(temp_child, tree_edges)) { - temp_child <- tree_edges %>% - filter(parent == temp_child) %>% - pull(child) - temp_child_prop <- temp_child_prop + subclone_props[as.numeric(temp_child), ] - } - return(temp_child_prop) -} - -isBranchPoint <- function(node, tree_edges) { - node <- as.character(node) # make sure node is character - children <- tree_edges %>% - filter(parent == node) %>% - pull(child) - return(length(children) > 1) -} - -isLeaf <- function(node, tree_edges) { - node <- as.character(node) # make sure node is character - children <- tree_edges %>% - filter(parent == node) %>% - pull(child) - return(length(children) == 0) -} - -#' Plot pie charts for subclone proportions in each sample -#' -#' @param subclone_props matrix of subclone proportions (returned from \code{calcSubcloneProportions}) -#' @param sample_names (Optional) Vector of sample names. Should be in the order of columns of subclone_props -#' @export -#' @import ggplot2 -#' @importFrom magrittr set_colnames -#' @importFrom viridis viridis -plotSubclonePie <- function(subclone_props, palette=viridis::viridis, sample_names = NULL, title_size=16, legend_size=10) { - if (is.null(sample_names)) sample_names <- paste0("Sample ", 1:ncol(subclone_props)) - props_tb <- subclone_props %>% - magrittr::set_colnames(sample_names) %>% - as_tibble() %>% - mutate(Subclone = factor(paste0("Clone ", 1:nrow(.)), - levels = paste0("Clone ", 1:nrow(.)))) %>% - pivot_longer(cols = sample_names, - names_to = "Sample", - values_to = "Proportion") - - clone_colors <- palette(nrow(subclone_props)) - - ggplot(props_tb, aes(x="", y=Proportion, fill = Subclone)) + - geom_bar(stat="identity", width=1, color="white") + - coord_polar("y", start=0) + - scale_fill_manual(values = clone_colors, drop = F) + - theme_void() + - theme(legend.position = "bottom") + - theme(legend.text = element_text(size=legend_size), legend.title = element_text(size=legend_size)) + - facet_wrap(~Sample) + - theme(strip.text.x = element_text(size=title_size)) - -} - -#' Force CCFs to comply with lineage precedence and sum condition -#' -#' @param w_mat Matrix of CCF estimates (from \code{estimateCCFs}) -#' @param tree_edges Tibble of tree edges with columns edge, parent, and child -#' @export -#' @import dplyr -forceCCFs <- function(w_mat, tree_edges) { - - K <- nrow(w_mat) - S <- ncol(w_mat) - fixed_w_mat <- matrix(NA, nrow = K, ncol = S) - - curr_edges <- tree_edges %>% - filter(parent == "root") - - while (nrow(curr_edges) > 0) { - next_edges <- tibble() - for (p in unique(curr_edges$parent)) { - # grab parent node CCF - if (p == "root") { - parent_ccfs <- rep(1, S) - } else { - parent_ccfs <- fixed_w_mat[as.numeric(p), ] - } - - temp_curr_edges <- curr_edges %>% - filter(parent == p) - - if (isBranchPoint(p, tree_edges)) { - # if branch point, check sum condition - child_ccfs <- w_mat[as.numeric(temp_curr_edges$child), ] - sample_sums <- colSums(child_ccfs) - for (s in 1:S) { - # if sum condition is violated, normalize children CCFs to parent CCF - if (sample_sums[s] > parent_ccfs[s]) { - for (i in seq_len(length(temp_curr_edges$child))) { - child_num <- as.numeric(temp_curr_edges$child)[i] - fixed_w_mat[child_num, s] <- round(child_ccfs[i, s] / sample_sums[s] * parent_ccfs[s], 2) - } - } else { - # else sum condition is not violated, children CCFs are fine as is - # not violating sum condition guarantees compliance with lineage precedence - for (child in as.numeric(temp_curr_edges$child)) { - fixed_w_mat[child, s] <- w_mat[child, s] - } - } - } - } else { - for (child in as.numeric(temp_curr_edges$child)) { - child_ccfs <- w_mat[child, ] - fixed_ccfs <- mapply(function(child_ccf, parent_ccf) ifelse(child_ccf > parent_ccf, parent_ccf, child_ccf), - child_ccf = child_ccfs, - parent_ccf = parent_ccfs) - fixed_w_mat[child, ] <- fixed_ccfs - } - } - - - child_next_edges <- tree_edges %>% - filter(parent %in% temp_curr_edges$child) - next_edges <- bind_rows(next_edges, - child_next_edges) - } - curr_edges <- next_edges - } - return(fixed_w_mat) -} - -#' Calculate proportions of subclones in each sample (assumes CCFs comply with lineage precedence and sum condition) -#' -#' @param w_mat Matrix of CCF estimates (from \code{estimateCCFs}) -#' @param tree_edges Tibble of tree edges with columns edge, parent, and child -#' @export -#' @import dplyr -calcSubcloneProportions <- function(w_mat, tree_edges) { - K <- nrow(w_mat) - S <- ncol(w_mat) - subclone_props <- matrix(NA, nrow = K, ncol = S) - - for (i in seq_len(nrow(w_mat))) { - children <- tree_edges %>% - filter(parent == as.character(i)) %>% - pull(child) %>% - as.numeric() - - if (length(children) == 1) { - children_ccfs <- w_mat[children, ] - } else if (length(children) > 1) { - children_ccfs <- w_mat[children, ] %>% - colSums - } else { - children_ccfs <- rep(0, ncol(w_mat)) - } - - subclone_props[i, ] <- w_mat[i, ] - children_ccfs - } - - # normalize subclone_props matrix so the props add up to 1 - subclone_props[subclone_props < 0] = 0 - subclone_props = round(t(t(subclone_props) / colSums(subclone_props)),digit = 3) - - return(subclone_props) -} - -#' Plot subclone proportions in each sample as stacked bar chart -#' -#' @param subclone_props matrix of subclone proportions (returned from \code{calcSubcloneProportions}) -#' @param sample_names (Optional) Vector of sample names. Should be in the order of columns of subclone_props -#' @param label_cluster (Default FALSE) Whether to add cluster label to text on stacked bar -#' @export -#' @import ggplot2 -#' @importFrom magrittr set_colnames -#' @importFrom viridis viridis -plotSubcloneBar <- function(subclone_props, palette=viridis::viridis, sample_names = NULL, label_cluster = FALSE) { - if (is.null(sample_names)) { - sample_names <- paste0("Sample ", 1:ncol(subclone_props)) - } - - clone_colors <- palette(nrow(subclone_props)) - color_half <- nrow(subclone_props) / 2 - color_half_vec <- factor(ifelse(1:nrow(subclone_props) < color_half, "white", "black"), - c("white", "black")) - - props_tb <- subclone_props %>% - magrittr::set_colnames(sample_names) %>% - as_tibble %>% - mutate(Clone = as.factor(1:nrow(.)), - text_color = color_half_vec) %>% - pivot_longer(cols = all_of(sample_names), - names_to = "Sample", - values_to = "Proportion of Tumor") - if (label_cluster) { - text_label <- paste0("Clone ", props_tb$Clone, ": ", props_tb$`Proportion of Tumor`) - props_tb <- props_tb %>% - mutate(text_label = text_label) - } else { - props_tb <- props_tb %>% - mutate(text_label = `Proportion of Tumor`) - } - - - stacked_bar <- ggplot(props_tb, aes(x = Sample, y = `Proportion of Tumor`, fill = Clone)) + - theme_light() + - geom_bar(stat = "identity") + - scale_fill_manual(values = clone_colors) + - geom_text(data = subset(props_tb, `Proportion of Tumor` != 0), - aes(label = text_label, colour = text_color), - size = 4, position = position_stack(vjust = 0.5)) + - xlab("") + - scale_x_discrete(guide = guide_axis(angle = 0)) + - scale_color_manual(values = c("white", "black"), guide = "none") + - ylim(0,1.05) + - theme(axis.text=element_text(size=12), axis.title=element_text(size=14,face="bold")) + - theme(legend.text = element_text(size=12), legend.title = element_text(size=12)) - - return(stacked_bar) -} diff --git a/R/tree-gabowmyers.R b/R/tree-gabowmyers.R new file mode 100644 index 0000000..ef1ea03 --- /dev/null +++ b/R/tree-gabowmyers.R @@ -0,0 +1,231 @@ +#' Enumerate all spanning trees using modified Gabow-Myers wrapper +#' +#' @export +#' @param w matrix of CCF values (rows = clusters, columns = samples) +#' @param lineage_precedence_thresh maximum allowed violation of lineage precedence (default = 0.1) +#' @param sum_filter_thresh thresh maximum allowed violation of Sum Condition (default = 0.2) +generateAllTrees <- function(mcf, purity, lineage_precedence_thresh=0.1, sum_filter_thresh=0.2) { + mcf_mat <- estimateMCFs(mcf) + mcf_mat <- assign("mcf_mat", mcf_mat, envir = .GlobalEnv) + graph_G_pre <- prepareGraph(mcf_mat, lineage_precedence_thresh) + graph_G <- filterEdgesBasedOnCCFs(graph_G_pre, mcf_mat, thresh = lineage_precedence_thresh) + graph_G <- assign("graph_G", graph_G, envir = .GlobalEnv) + enumerateSpanningTreesModified(graph_G, mcf_mat, purity, sum_filter_thresh = sum_filter_thresh) +} + +#' Create tibble of possible edges from CCF values based on w_mat only +#' +#' @export +#' @param w matrix of CCF values (rows = clusters, columns = samples) +#' @return graph_G tibble of possible edges with columns edge, parent, child +prepareGraph <- function(mcf_mat, thresh) { + graph_pre <- data.frame(edge = character(), parent = character(), child = character()) + for (i in seq_len(nrow(mcf_mat))) { + graph_pre <- graph_pre %>% add_row(edge = paste("root->", i, sep = ""), parent = "root", child = as.character(i)) + for (j in seq_len(nrow(mcf_mat))) { + if (i!=j) { + i_row = mcf_mat[i, ] + j_row = mcf_mat[j, ] + if (all(j_row-i_row >= -thresh)) { + graph_pre <- graph_pre %>% add_row(edge = paste(j, "->", i, sep = ""), parent = as.character(j), child = as.character(i)) + } + } + } + } + return(graph_pre) +} + +#' Filter possible edges based on lineage precedence +#' +#' @export +#' @param graph_G tibble of possible edges with columns edge, parent, child +#' @param w matrix of CCF values (rows = clusters, columns = samples) +#' @param thresh maximum allowed violation of lineage precedence (default = 0.1) +filterEdgesBasedOnCCFs <- function(graph_G, mcf, thresh = 0.1) { + check_edges_logical <- apply(graph_G, 1, function(edge) checkEdge(edge, mcf, thresh)) + filtered_graph_G <- graph_G[check_edges_logical, ] + return(filtered_graph_G) +} + +checkEdge <- function(edge, mcf, thresh = 0.2) { + # returns TRUE if satisfies lineage precedence with given threshold + # returns FALSE if violates i.e. child_ccf - parent_ccf > thresh in any sample + # edge is in the format c(edge_name, parent, child) + + # in case of factors + p <- as.character(edge[2]) + c <- as.character(edge[3]) + + if (p == "root") { + parent_ccfs <- rep(1, ncol(mcf)) + } else { + parent_ccfs <- mcf[as.numeric(p), ] + } + child_ccfs <- mcf[as.numeric(c), ] + + diff <- child_ccfs - parent_ccfs + if (any(diff > thresh)) { + return(FALSE) + } else { + return(TRUE) + } +} + +#' Enumerate all spanning trees using modified Gabow-Myers +#' +#' @export +#' @param graph_G tibble of possible edges with columns edge, parent, child +#' @param w matrix of CCF values (rows = clusters, columns = samples) +#' @param sum_filter_thresh thresh maximum allowed violation of Sum Condition (default = 0.2) +enumerateSpanningTreesModified <- function(graph_G, mcf, purity, sum_filter_thresh=0.2) { + # all_spanning_trees must be set as an empty list, global variable, before function is called + # graph_G must be set as global variable before function is called + all_spanning_trees <- assign("all_spanning_trees", list(), envir = .GlobalEnv) + #filtered_trees <- assign("filtered_trees", list(), envir = .GlobalEnv) + F_tb <- assign("F_tb", filter(graph_G, parent == "root"), envir = .GlobalEnv) + all_vertices <- verticesInGraph(graph_G) + tree_T <- tibble(parent = character(), child = character()) + + growModified(tree_T, all_vertices, mcf, purity, sum_filter_thresh) +} + +verticesInGraph <- function(tb) { + unique(c(tb$parent, tb$child)) +} + +growModified <- function(tree_T, all_vertices, w, purity, sum_thresh=0.2) { + + if (length(verticesInGraph(tree_T)) == length(all_vertices) & nrow(tree_T) == (length(all_vertices)-1)) { + assign("all_spanning_trees", c(all_spanning_trees, list(tree_T)), envir = .GlobalEnv) + + } else { + FF <- tibble(parent = character(), child = character()) + + bridge <- FALSE + while(!bridge) { + # new tree edge + if (nrow(F_tb) == 0) stop("F_tb is empty") + edge_e <- pop(F_tb, "F_tb") + v <- edge_e$child + tree_T <- rbind(tree_T, edge_e) + + # check if adding this node does not violate the constraint + if (satisfiesSumCondition(tree_T, w, purity, sum_thresh)) { + # update F + ## push each edge (v,w), w not in T onto F + in_T <- verticesInGraph(tree_T) + temp_add_to_F <- filter(graph_G, parent == v, !(child %in% in_T)) + # temp_add_to_F + assign("F_tb", rbind(temp_add_to_F, F_tb), envir = .GlobalEnv) + + ## remove each edge (w,v), w in T from F + w_in_T <- verticesInGraph(tree_T) + removed_edges <- filter(F_tb, parent %in% w_in_T, child == v) + assign("F_tb", filter(F_tb, !edge %in% removed_edges$edge), envir = .GlobalEnv) + + # recurse + growModified(tree_T, all_vertices, w, purity, sum_thresh) + + # restore F + # pop each edge (v,w), w not in T, from F + not_in_T <- all_vertices[!all_vertices %in% verticesInGraph(tree_T)] + if (length(not_in_T) > 0 & nrow(F_tb) > 0) { + edges_to_remove_9 <- paste0(v, "->", not_in_T) + assign("F_tb", filter(F_tb, !edge %in% edges_to_remove_9), envir = .GlobalEnv) + } + # restore each edge (w,v), w in T, in F + assign("F_tb", rbind(removed_edges, F_tb), envir = .GlobalEnv) + + } + # delete e from T and from G, add e to FF + tree_T <- tree_T[tree_T$edge != edge_e$edge, ] + assign("graph_G",graph_G[graph_G$edge != edge_e$edge, ], envir = .GlobalEnv) + FF <- rbind(edge_e, FF) + + # bridge test + bridge <- bridgeTestBFS(graph_G, edge_e) + } + + # pop each edge e from FF, push e onto F,and add e to G + if (nrow(FF) > 0) { + + # pop and push all edges at once (same order) + # assign("F_tb", rbind(FF, F_tb), envir = .GlobalEnv) + assign("graph_G", rbind(FF, graph_G), envir = .GlobalEnv) + + # pop and push edges one by one (rev order in F) + while (nrow(FF) > 0) { + assign("F_tb", rbind(FF[1, ], F_tb), envir = .GlobalEnv) + FF <- FF[-1, ] + } + } + } +} + +satisfiesSumCondition <- function(edges, w, purity, thresh = 0.2) { + # returns TRUE if sum condition is not violated with given threshold (default 0.2) + + edges$parent <- as.character(edges$parent) + all_parents <- unique(edges$parent) + + for (p in all_parents) { + # get parent CCF + if (p == "root") { + # parent_ccf <- rep(1, ncol(w)) + parent_ccf <- purity + } else { + parent_ccf <- w[as.numeric(p), ] + } + + # get children CCF (sum if more than 1 child) + children <- as.numeric(filter(edges, parent == p)$child) + if (length(children) > 1) { + children_ccf <- colSums(w[children, ,drop=FALSE]) + } else { + children_ccf <- w[children, ] + } + + diff <- children_ccf - parent_ccf + if (any(diff > thresh)) return(FALSE) + } + + # sum condition is never violated, return TRUE + return(TRUE) +} + +pop <- function(edges_tb, tb_name) { + assign(tb_name, edges_tb[-1, ], envir = .GlobalEnv) + return(edges_tb[1, ]) +} + +bridgeTestBFS <- function(graph_G, edge_e) { + node_to_check <- edge_e$child + + nodes_connected_to_root <- bfsLong2(graph_G) + !(node_to_check %in% nodes_connected_to_root) +} + +bfsLong2 <- function(graph_G) { + # returns vector of nodes in main tree (connected to root) including "root" + # starting at root + # does not stop if there is a cycle in graph + graph_G$parent <- as.character(graph_G$parent) + children <- graph_G[(graph_G$parent == "root"), ]$child + nodes <- c("root", children) + + while(length(children) > 0) { + c <- children[1] + temp.children <- graph_G[(graph_G$parent == c), ]$child + + # remove children already seen + if (any(temp.children %in% nodes)) { + temp.children <- temp.children[! temp.children %in% nodes] + } + + children <- c(children, temp.children) + + nodes <- c(nodes, temp.children) + children <- children[-1] + } + return(nodes) +} \ No newline at end of file diff --git a/R/tree-plot.R b/R/tree-plot.R new file mode 100644 index 0000000..62de812 --- /dev/null +++ b/R/tree-plot.R @@ -0,0 +1,84 @@ +#' Plot ensemble tree +#' +#' @export +#' @param trees list of tibbles of edges, each with columns edge, parent, child +plotEnsembleTree <- function(trees, palette=viridis::viridis) { + am_chain <- lapply(trees, edgesToAmLong) + post_am <- getPosteriorAmLong(am_chain) + plotPosteriorAmLong(post_am, colorScheme(trees[[1]], palette)) +} + +plotPosteriorAmLong <- function(post_am, v_color, filter1 = TRUE, filter1.threshold = 0.1, + filter2 = TRUE, filter2.threshold = 0.1) { + # filter1 filters columns (am wide format) for edges with posterior prob > (max(column) - filter1.threshold) + admat <- prepPostAmForGraphing(post_am) + + # filter edges of low freq + admat <- filterAdmat(admat, filter1 = filter1, filter1.threshold = filter1.threshold, + filter2 = filter2, filter2.threshold = filter2.threshold) + + ig <- igraph::graph_from_adjacency_matrix(admat, mode = "directed", weighted = TRUE, + diag = FALSE, add.row = TRUE) + + igraph::E(ig)$lty <- ifelse(igraph::E(ig)$weight < 0.25, 2, 1) + + # make edge black if only 1 edge to vertex + e <- igraph::ends(ig, igraph::E(ig)) + numTo <- table(e[,2]) + edgeColors <- sapply(e[,2], function(x) ifelse(x %in% names(which(numTo==1)), "black", "darkgrey")) + igraph::E(ig)$color <- edgeColors + + igraph::V(ig)$label.cex <- 1.5 + + igraph::V(ig)$color <- as.list(v_color %>% arrange(match(v_sorted, names(V(ig)))) %>% select(colors))$colors + + par(mar=c(0,0,0,0)+.1) + + igraph::plot.igraph(ig, layout = igraph::layout_as_tree(ig), + vertex.label.family = "Helvetica", vertex.size=20, + edge.arrow.size = 0.5, edge.arrow.width = 2, + edge.width = igraph::E(ig)$weight*3) +} + +#' Plot single tree +#' +#' @export +#' @param edges tibble of edges with columns edge, parent, child +plotTree <- function(edges, palette=viridis::viridis) { + plotGraph(edgesToAmLong(edges), colorScheme(edges, palette)) +} + +#' generate colors for each vertice +#' @export +colorScheme <- function(edges, palette=viridis::viridis) { + v_sorted = sort(unique(c(edges$parent, edges$child))) + v_sorted = c(sort(as.integer(v_sorted[!v_sorted=='root'])), "root") + # root_idx <- which(v_sorted=="root") + colors <- c(palette(length(v_sorted)-1), "white") + v_color <- tibble(v_sorted, colors) + return(v_color) +} + +plotGraph <- function(am.long, v_color){ + # make sure am.long is sorted by parent and child + am.long <- mutate(am.long, child = as.numeric(am.long$child)) %>% + arrange(parent, child) + am.long <- mutate(am.long, child = as.character(am.long$child)) + + # change to wide format and plot + am <- toWide(am.long) + rownames(am) <- c("root", colnames(am)) + am <- cbind(root=0, am) ## add column for root + colnames(am) <- rownames(am) + + am[is.na(am)] <- 0 + + ig <- igraph::graph_from_adjacency_matrix(am, mode = "directed", weighted = TRUE, + diag = FALSE, add.row = TRUE) + V(ig)$color <- as.list(v_color %>% arrange(match(v_sorted, names(V(ig)))) %>% select(colors))$colors + par(mar=c(0,0,0,0)+.1) + igraph::plot.igraph(ig, layout = igraph::layout_as_tree(ig), + vertex.size=24, vertex.frame.color = "#000000", vertex.label.cex = 1.5, + vertex.label.family = "Helvetica", vertex.label.color = "#000000", + edge.arrow.size = 0.5, edge.arrow.width = 2) +} \ No newline at end of file diff --git a/R/tree-score.R b/R/tree-score.R new file mode 100644 index 0000000..b1cebc7 --- /dev/null +++ b/R/tree-score.R @@ -0,0 +1,1374 @@ +constrainedEdgesMatrix <- function(wmat, chains, input_data) { + ## + ## Rules: + ## - cluster (node) cannot connect to itself + ## - a cluster with near-zero MCF cannot have children + ## - a cluster present in X multiple samples cannot connect to a cluster present in Y samples + ## if X < Y + ## - X < Y implies ... + ## + ##cluster.sample.presence <- apply(w, 1, function(x) which( x>= zero.thresh)) + samp_pres <- matchSamplePresence(wmat, chains, input_data) + if (input_data$S == 1) { + cluster.sample.presence <- lapply(samp_pres, function(x) which(x==1)) + } else { + cluster.sample.presence <- apply(samp_pres, 1, function(x) which(x==1)) + } + K <- nrow(wmat) + S <- ncol(wmat) + admat <- matrix(T, K, K) + + for(i in 1:K){ + for(j in 1:K){ + + if (is.matrix(cluster.sample.presence)) { + from.samples <- cluster.sample.presence[, i] + to.samples <- cluster.sample.presence[, j] + } else if (is.list(cluster.sample.presence)) { + from.samples <- cluster.sample.presence[[i]] + to.samples <- cluster.sample.presence[[j]] + } else if (is.vector(cluster.sample.presence)) { + from.samples <- cluster.sample.presence[i] + to.samples <- cluster.sample.presence[j] + } + + if (setequal(from.samples, to.samples)) next() + if(length(from.samples) < length(to.samples)) { + admat[i, j] <- F + next() + } + if (all(to.samples %in% from.samples)) next() + admat[i, j] <- F + } + } + diag(admat) <- F + am2 <- rbind(T, admat) + dimnames(am2) <- list(c("root", 1:K), 1:K) + return(am2) +} + +matchSamplePresence <- function(w_mat, chains, input_data) { + map_z <- estimateClusterAssignments(chains$z_chain) + + # pull one variant for each cluster + single_var_clust <- map_z[match(unique(map_z$value), map_z$value), ] %>% + mutate(mut_ind = as.numeric(gsub("z\\[|]", "", Parameter))) %>% + arrange(value) + + samp_pres <- ifelse(input_data$y[single_var_clust$mut_ind, ] >=1, 1, 0) + return(samp_pres) +} + + +## refactored base.admat +constrainedEdges <- function(wmat, chains, input_data) { + am2 <- constrainedEdgesMatrix(wmat, chains, input_data) + am2.long <- as_tibble(am2) %>% + mutate(parent=rownames(am2)) %>% + pivot_longer(-parent, + names_to="child", + values_to="possible_edge") %>% + filter(parent != child) %>% + unite("edge", c("parent", "child"), sep="->", + remove=FALSE) %>% + mutate(parent=factor(parent, levels=unique(parent))) %>% + mutate(connected=0) + am2.long +} + +calcConstrianedTreeSpace <- function(mcf_matrix, zero.thresh = 0.01) { + # input: + # - mcf_matrix = matrix of cell fraction values where rows are clusters, columns are samples + # - zero.thresh = minimum cell fraction to be considered "present" in sample (default = 0.01) + # output: number of possible trees, given constraints + ce <- constrainedEdgesMatrix(mcf_matrix, zero.thresh) + possible_from_edges_per_node <- colSums(ce) + tree_space <- prod(possible_from_edges_per_node) + return(tree_space) +} + +getAllNodes <- function(am.long) { + # returns vector of all nodes in graph + am.long$parent <- as.character(am.long$parent) + unique(c(am.long$parent, am.long$child)) +} + +reversedConnection <- function(am) { + connections <- setNames(am$connected, am$edge) + reversed_connections <- connections[am$reverse_edge] %>% + "["(!is.na(.)) + reversed <- setNames(rep(0, nrow(am)), am$reverse_edge) + reversed[names(reversed_connections)] <- reversed_connections + reversed +} + +isBidirectional <- function(am) { + am %>% + mutate(bi_directional=(reverse_edge %in% edge) & + connected==1 & + reversed_connected == 1) %>% + pull(bi_directional) +} + +updateGraphElements <- function(am) { + am %>% + mutate(parent=factor(parent, levels=unique(parent))) %>% + mutate(reversed_connected=reversedConnection(.)) %>% + mutate(bi_directional=isBidirectional(.)) %>% + mutate(root_connected=isRootConnected(.)) +} + + +reversedEdges <- function(am) { + am2 <- am %>% + filter(!is.na(connected)) %>% + unite("reverse_edge", c("child", "parent"), sep="->", + remove=FALSE) + am2 +} + +getEdgeName <- function(from, to) { + paste0(from, "->", to) +} + +numNodesConnectedToRoot <- function(am.long) { + sum(am.long[am.long$parent == "root", ]$connected) +} + +randAdmatUnchecked <- function(am.long, max.num.root.children) { + # input: blank am.long (from either constrainedEdges or toLong(initEmptyAdmatFromK(K)) + # - $connected = 0 + # - $possible_edge = T/F if from constrainedEdges + # output: random graph (not necessarily valid) + blank <- am.long # save copy of original am.long + parent_levels <- levels(blank$parent) + + am.long$parent <- as.character(am.long$parent) + + all.nodes <- getAllNodes(am.long) + node.pool <- all.nodes[all.nodes != "root"] # nodes left to connect in graph + # possible edges may be limited by constraints + if ("possible_edge" %in% colnames(am.long)) { + possible.edges <- am.long %>% + filter(possible_edge == TRUE) + has_constraints <- TRUE + } else { + possible.edges <- am.long + has_constraints <- FALSE + } + parent.pool <- unique(possible.edges$parent) # possible parent nodes + + # choose node to connect to root + # select "to" node from parent.pool to prevent getting stuck if max.num.root.children == 1 + temp.possible.root.children <- filter(possible.edges, parent == "root")$child + if (length(temp.possible.root.children) > 1) { + temp.node <- sample(temp.possible.root.children, 1) + } else { + temp.node <- temp.possible.root.children + } + + am.long[am.long$edge == getEdgeName("root", temp.node), ]$connected <- 1 + node.pool <- node.pool[node.pool != temp.node] + + # connect nodes that are left + while(length(node.pool) > 0) { + #for (n in node.pool) { + if (length(node.pool) > 1) { + n <- sample(node.pool, 1) + } else { + n <- node.pool + } + + # all possible edges to node n + # check if there are constraints present + if (has_constraints) { + temp.possible.edges <- am.long %>% + filter(possible_edge == T) + } else { + temp.possible.edges <- am.long + } + # can't connect to root if max.num.root.children quota is satisfied + if (numNodesConnectedToRoot(am.long) >= max.num.root.children) { + temp.possible.edges <- temp.possible.edges %>% + filter(child == n, parent != "root") + } else { + temp.possible.edges <- temp.possible.edges %>% + filter(child == n) + } + + # choose edge to connect -- randomly sample if more than 1 possible edge + if(nrow(temp.possible.edges) > 1) { + temp.edge <- temp.possible.edges[sample(nrow(temp.possible.edges), 1), ]$edge + } else if (nrow(temp.possible.edges) == 1) { + temp.edge <- temp.possible.edges$edge + } else { + # if no possible edges, start over + return(randAdmatUnchecked(blank, max.num.root.children)) + } + + # connect edge + am.long[am.long$edge == temp.edge, ]$connected <- 1 + + node.pool <- node.pool[node.pool != n] + } + + am.long <- am.long %>% + mutate(parent = factor(am.long$parent, levels = parent_levels)) + + am.long <- reversedEdges(am.long) %>% + mutate(reversed_connected=reversedConnection(.), + bi_directional=NA, + root_connected=NA) + am.long <- updateGraphElements(am.long) + + return(am.long) +} + +randAdmat <- function(am.long, max.num.root.children) { + # input: blank am.long (from either constrainedEdges or toLong(initEmptyAdmatFromK(K)) + # - $connected = 0 + # - $possible_edge = T/F if from constrainedEdges + # output: random graph + blank <- am.long + has_constraints <- ifelse("possible_edge" %in% colnames(am.long), + ifelse(any(!am.long$possible_edge), T, F), + F) + + am.long$parent <- as.character(am.long$parent) + + all.nodes <- getAllNodes(am.long) + node.pool <- all.nodes[all.nodes != "root"] # nodes left to connect in graph + possible.edges <- am.long %>% + filter(!is.na(connected)) + parent.pool <- unique(possible.edges$parent) # possible parent nodes + + # choose node to connect to root + # select "to" node from parent.pool to prevent getting stuck if max.num.root.children == 1 + temp.node <- sample(parent.pool[parent.pool != "root"], 1) + am.long[am.long$edge == getEdgeName("root", temp.node), ]$connected <- 1 + node.pool <- node.pool[node.pool != temp.node] + from.nodes <- c("root", temp.node) + + while(length(node.pool) > 0) { + + # remove "root" from possible parents if max.num.root.children quota satisfied + if(numNodesConnectedToRoot(am.long) < max.num.root.children) { + from.nodes.pool <- from.nodes + } else { + from.nodes.pool <- from.nodes[-1] + } + + if (length(from.nodes.pool) == 1) { + temp.from <- from.nodes.pool + } else { + temp.from <- sample(from.nodes.pool, 1) + } + + # remove possible "to" nodes based on NA constraints + temp.to.pool <- am.long %>% + filter(parent==temp.from) %>% + filter(!is.na(connected)) + # remove "to" nodes if not in node.pool + temp.to.pool <- temp.to.pool$child + temp.to.pool <- temp.to.pool[temp.to.pool %in% node.pool] + # sample possible children nodes to be "to" node and connect edge + if (length(temp.to.pool) > 1) { + temp.to <- sample(temp.to.pool, 1) + } else if (length(temp.to.pool) == 1) { + temp.to <- temp.to.pool + } else { + # remove temp.from from from.nodes + } + + am.long[am.long$edge == getEdgeName(temp.from, temp.to), ]$connected <- 1 + + # add temp.to to possible from.nodes if it is a possible parent + if (temp.to %in% parent.pool) { + from.nodes <- c(from.nodes, temp.to) + } + + node.pool <- node.pool[node.pool != temp.to] + } + + am.long <- reversedEdges(am.long) %>% + mutate(reversed_connected=reversedConnection(.), + bi_directional=NA, + root_connected=NA) + am.long <- updateGraphElements(am.long) + am.long +} + +isParentConnected <- function(am) { + am %>% + mutate(parent=factor(parent, levels=unique(parent))) %>% + group_by(parent) %>% + summarize(n=sum(connected)) %>% + pull(n) > 0 +} + +isRootConnected <- function(am) isParentConnected(am)[1] + +isDirected <- function(am) !any(am$bi_directional) + +isFullyConnected <- function(am.long) { + # checks if graph (am.long format) is fully connected + all_nodes <- getAllNodes(am.long) + nodes_in_main_tree <- bfsLong(am.long) + length(all_nodes) == length(nodes_in_main_tree) +} + +containsCycle <- function(am.long) { + # returns nodes in main tree (connected to root) including "root" + # starting at root + am.long$parent <- as.character(am.long$parent) + children <- am.long[(am.long$parent == "root" & am.long$connected == 1), ]$child + nodes <- c("root", children) + + while(length(children) > 0) { + c <- children[1] + temp.children <- am.long[(am.long$parent == c & am.long$connected == 1), ]$child + children <- c(children, temp.children) + if (any(temp.children %in% nodes)) return(TRUE) + nodes <- c(nodes, temp.children) + + children <- children[-1] + } + FALSE +} + +validGraph <- function(am) { + isDirected(am) && + isRootConnected(am) && + !containsCycle(am) && + isFullyConnected(am) +} + +bfsLong <- function(am.long) { + # returns vector of nodes in main tree (connected to root) including "root" + # starting at root + # stops if there is a cycle present in graph + am.long$parent <- as.character(am.long$parent) + children <- am.long[(am.long$parent == "root" & am.long$connected == 1), ]$child + nodes <- c("root", children) + + while(length(children) > 0) { + c <- children[1] + temp.children <- am.long[(am.long$parent == c & am.long$connected == 1), ]$child + children <- c(children, temp.children) + if (any(temp.children %in% nodes)) stop("graph has cycle") + nodes <- c(nodes, temp.children) + children <- children[-1] + } + return(nodes) +} + +addEdge <- function(am, new_edge) { + c <- new_edge$child + # disconnect existing edge connecting to child + am[which(am$child == c & am$connected == 1), ]$connected <- 0 + # connect new edge + am[which(am$edge == new_edge$edge), ]$connected <- 1 + # update graph elements + am <- updateGraphElements(am) + return(am) +} + +toWide <- function(am.long){ + am.long$child <- as.numeric(am.long$child) + am.long %>% select(parent, child, connected) %>% + tidyr::spread(child, connected) %>% + select(-parent) %>% + as.matrix() +} + +toLong <- function(am) { + am.long <- as_tibble(am) %>% + mutate(parent=rownames(am)) %>% + pivot_longer(-parent, + names_to="child", + values_to="connected") %>% + filter(parent != child) %>% + unite("edge", c("parent", "child"), sep="->", + remove=FALSE) %>% + mutate(parent=factor(parent, levels=unique(parent))) + return(am.long) +} + +isMoveValid <- function(a, possible_move, max.num.root.children) { + # a = am.long format of current graph + # possible_move = a row in possible_moves tibble (am.long format) + # max.num.root.children = maximum number of nodes allowed to be connected to root + # returns TRUE or FALSE + astar <- addEdge(a, possible_move) + + is_valid <- validGraph(astar) & (numNodesConnectedToRoot(astar) <= max.num.root.children) + return(is_valid) +} + +sampleNewEdge <- function(a, max.num.root.children, mc.cores=1){ + ## a move is connecting a new edge and disconnecting the pre-existing edge connected to the new edge's child + possible_moves <- filter(a, connected==0, possible_edge==T) + possible_moves_list <- possible_moves %>% + group_by(edge) %>% + group_split() + is_valid <- unlist(parallel::mclapply(possible_moves_list, function(x) isMoveValid(a, x, max.num.root.children), + mc.cores = mc.cores)) + move_set <- possible_moves_list[is_valid] + ix <- tryCatch(sample(seq_len(length(move_set)), 1), error=function(e) NULL) + if(is.null(ix)) { + return(a) + } else { + astar <- addEdge(a, move_set[[ix]]) + return(astar) + } +} + +initEmptyAdmatFromK <- function(K) { + admat <- matrix(0, K, K) + diag(admat) <- NA + am2 <- rbind(0, admat) + dimnames(am2) <- list(c("root", 1:K), 1:K) + return(am2) +} + +generateRandomGraphFromK <- function(K, max.num.root.children) { + # input: number of mutation clusters, K + # output: mutation tree; adjacency matrix + am.long <- toLong(initEmptyAdmatFromK(K)) + rand.am.long <- randAdmat(am.long, max.num.root.children) + if (!validGraph(rand.am.long)) warning("graph is not valid") + return(rand.am.long) +} + +getPosteriorAmLong <- function(am_chain) { + # input: chain from tree MCMC of trees in am.long format + # output: posterior am.long + num_trees <- length(am_chain) + + combined_am_chain <- am_chain %>% + bind_rows + post_am <- combined_am_chain %>% + group_by(edge) %>% + mutate(posterior_prob = sum(connected) / num_trees) %>% + ungroup() %>% + select(edge, parent, child, posterior_prob) %>% + distinct() + post_am +} + +toWidePostAm <- function(post_am) { + post_am <- post_am %>% + mutate(child = as.numeric(post_am$child)) + if(!is.factor(post_am$parent)) { + post_am <- post_am %>% + mutate(parent = factor(parent, levels = c("root", 1:max(post_am$parent)))) + } + post_am %>% + select(parent, child, posterior_prob) %>% + tidyr::spread(child, posterior_prob) %>% + select(-parent) %>% + as.matrix() +} + +filterAdmat <- function(admat, filter1 = TRUE, filter1.threshold = 0.1, + filter2 = TRUE, filter2.threshold = 0.1) { + # filter1 filters columns (am wide format) for edges with posterior prob > (max(column) - filter1.threshold) + # filter2 filters entire matrix for prob > filter2.threshold + + if (filter1) { + admat <- apply(admat, 2, function(x) ifelse(x > (max(x)-filter1.threshold), x, 0)) + } + + if (filter2) { + admat[admat <= filter2.threshold] <- 0 + } + + return(admat) +} + +prepPostAmForGraphing <- function(post_am) { + post_am_mat <- toWidePostAm(post_am) + + # add column for root + post_am_mat <- cbind(0, post_am_mat) + colnames(post_am_mat)[1] <- "root" + rownames(post_am_mat) <- colnames(post_am_mat) + admat <- round(as.matrix(post_am_mat), 2) + admat[is.na(admat)] <- 0 + + return(admat) +} + +labelMAPEdgesFromPostAM <- function(post_am) { + post_am %>% + group_by(child) %>% + mutate(map_edge = posterior_prob == max(posterior_prob)) %>% + ungroup() +} + +getMAPGraphFromPostAM <- function(post_am) { + map_am <- labelMAPEdgesFromPostAM(post_am) %>% + mutate(connected = ifelse(map_edge, 1, 0)) + return(map_am) +} + +edgeTibbleToAmLong <- function(edge_tb, root = 0) { + K <- length(unique(c(edge_tb$From, edge_tb$To))) - 1 + am_long <- toLong(initEmptyAdmatFromK(K)) + edge_tb$From <- as.character(edge_tb$From) + edge_tb[edge_tb == as.character(root)] <- "root" + for (i in seq_len(nrow(edge_tb))) { + temp_edge <- getEdgeName(edge_tb$From[i], edge_tb$To[i]) + am_long$connected[which(am_long$edge == temp_edge)] <- 1 + } + return(am_long) +} + +getBTColumn <- function(am_long, node) { + BT_column <- rep(0, length(unique(am_long$child))) + path <- getPathFromRoot(node, am_long) + BT_column[path] <- 1 + return(BT_column) +} + +amToBT <- function(am_long) { + # returns binary perfect phylogeny matrix + # where columns are clones and rows are mutation clusters + # clones have the same numeric ID as the last mutation cluster on their path from the root + children <- unique(am_long$child) + children_num <- sort(as.numeric(children)) + + BT_cols <- sapply(children_num, function(node) getBTColumn(am_long, node)) + + return(BT_cols) +} + +summarizeWChain <- function(w.chain) { + # output: mcf_stats + mcf_stats <- w.chain %>% + group_by(Parameter) %>% + summarize(sd=sd(value), + mean=mean(value)) + return(mcf_stats) +} + +create.cpov <- function(mcf_stats, alpha=0.05, zero.thresh=0.001, mcf_matrix = NULL, restriction.val = 1) { + cpov <- NA + MCF <- NA + + ## if mcf_matrix is supplied, use that to create cpov + if (is.null(mcf_matrix)) { + cpov <- initializeAdjacencyMatrix(mcf_stats = mcf_stats, zero.thresh = zero.thresh) + cpov[is.na(cpov)] <- restriction.val + MCF <- mcfMatrix(mcf_stats) + } else { + cpov <- initializeAdjacencyMatrix(mcf_matrix = mcf_matrix, zero.thresh = zero.thresh) + cpov[is.na(cpov)] <- restriction.val + MCF <- mcf_matrix + } + + sds <- mcfMatrix(mcf_stats, parameter="sd") + ##S <- ncol(mcmc_w) # number of samples + S <- numberSamples(mcf_stats) + ##cpov <- cpov[-1, ] + ## root can go to anyone -- all 0's (default base admat value) + for (r in 2:nrow(cpov)) { + for (c in 1:ncol(cpov)) { + if (cpov[r,c] == restriction.val) next # skip restricted position + from <- r-1 # 'from' cluster node + to <- c # 'to' cluster node + statistic <- 0 + pval <- 0 + for(s in seq_len(S)) { + ##d <- mcmc_w[from,s] - mcmc_w[to,s] + d <- MCF[from, s] - MCF[to, s] + d_sd <- sqrt(sds[from, s]^2 + sds[to, s]^2) + ##d_sd <- sqrt((mcmc_w_sd[from,s])^2 + (mcmc_w_sd[to,s])^2) + I <- sum(d < 0) + ## cumulative sum of the + ## number of standard deviations for the difference in + ## MCFs between 2 samples + if (d == 0 || is.nan(d / d_sd)) { + next + } else { + statistic <- statistic + (d / d_sd)^2 * I + } + + for (k in 0:S) { + pval <- pval + ((1 - pchisq(statistic, k)) * + choose(S, k) / (2^S)) + } + } + + pval <- ifelse(is.na(pval), 0, pval) + pval <- ifelse(is.nan(pval), 0, pval) + ## + ## edge seems to be based on this ad-hoc statistic, not + ## the probability of the tree + ## + cpov[r,c] <- decide.ht(pval, alpha) + } + } + cpov +} + +decide.ht <- function(pval, alpha=0.05) { + # 1 signals rejection event for null of i -> j + if (pval <= alpha) return(1) + else return(0) +} + +calcTopologyCost <- function(am, cpov, am_format = "long") { + TC <- 0 + + if (am_format == "long") { + am <- toWide(am) + } + + edges <- which(am == 1, arr.ind=TRUE) + N <- nrow(edges) + for (i in seq_len(N)) { + TC <- TC + cpov[edges[i,1], edges[i,2]] + } + + TC +} + +getEdges <- function(am.long) { + am.long %>% + filter(connected == 1) %>% + mutate(parent = as.character(parent)) +} + +getChildren <- function(am.long, node) { + # returns vector of children nodes + edges <- am.long %>% + mutate(parent = as.character(parent)) %>% + filter(connected == 1) %>% + filter(parent == node) + return(edges$child) +} + +calcMassCost <- function(am, mcf_matrix, purity, am_format="long") { + num_samples <- ncol(mcf_matrix) + + if (am_format == "long") { + edges <- getEdges(am) + + parent_nodes <- unique(edges$parent) + mass_cost <- rep(0, length(parent_nodes)) # mass cost of each parent node + + for (i in seq_len(length(parent_nodes))) { + parent_node <- parent_nodes[i] + + # root MCF is purity instead of 1 + if (parent_node == "root") { + # parent_w <- rep(1, num_samples) # 1 replaced by purity + parent_w <- purity + } else { + parent_w <- mcf_matrix[as.numeric(parent_node), ,drop=FALSE] + } + + kids <- getChildren(am, parent_node) + if (length(kids) > 1) { + children_w <- colSums(mcf_matrix[as.numeric(kids), ,drop=FALSE]) + } else { + children_w <- mcf_matrix[as.numeric(kids), ,drop=FALSE] + } + + mc_s <- ifelse(parent_w >= children_w, 0, children_w - parent_w) + #mass_cost[i] <- sqrt(sum(mc_s^2)) + mass_cost[i] <- max(mc_s) # take max across samples instead of euclidean distance + } + return(sum(mass_cost)) + + } else if (am_format == "wide") { + num_children <- rowSums(am, na.rm = T) + nodes <- which(num_children > 0, arr.ind = T) # not leaves + mc_node <- rep(0, length(nodes)) + + for (i in 1:length(nodes)) { + node <- nodes[i] + + # root node: MCF = 1 + parent_w <- rep(1, ncol(mcf_matrix)) + # not root node: look up MCF in mcf_matrix + if (node != 1) { + parent_w <- mcf_matrix[node-1,] + } + + kids <- which(am[node,] == 1, arr.ind = T) + if (num_children[node] > 1) { + children_w <- colSums(mcf_matrix[kids, ]) + } else { + children_w <- mcf_matrix[kids, ] + } + + mc_s <- ifelse(parent_w >= children_w, 0, children_w - parent_w) + mc_node[i] <- sqrt(sum(mc_s^2)) + } + return(sum(mc_node)) + } +} + +edgesToAmLong <- function(edges) { + am_wide <- initEmptyAdmatFromK(length(unique(edges$child))) + edges[edges$parent == "root", "parent"] <- "0" + edges <- edges %>% + mutate(parent = as.numeric(parent) + 1, + child = as.numeric(child)) %>% + select(parent, child) + edges <- as.matrix(edges) + for (r in 1:nrow(edges)) { + am_wide[edges[r,1], edges[r,2]] <- 1 + } + admat <- toLong(am_wide) + admat <- reversedEdges(admat) %>% + mutate(reversed_connected=reversedConnection(.), + bi_directional=NA, + root_connected=NA) + admat <- updateGraphElements(admat) + return(admat) +} + +calcTreeFitness <- function(admat, cpov, mcf_matrix, purity, am_format = "long", weight_mass = 1, weight_topology = 1, scaling_coeff=5) { + # if only edges are given, change into long format + if (am_format == "edges") { + admat <- edgesToAmLong(admat) + am_format <- "long" + } + + TC <- calcTopologyCost(admat, cpov, am_format) + MC <- calcMassCost(admat, mcf_matrix, purity, am_format) + Z <- weight_topology * TC + weight_mass * MC + fitness <- exp(-scaling_coeff * Z) + fitness +} + +satisfiesCCFSumProperties <- function(am_long, mcf_matrix, threshold = 0.2) { + # threshold = max value that children CCFs can be larger than parent; + # i.e. function returns false if (sum of children CCFs) - parent CCF > threshold + + edges <- getEdges(am_long) + parent_nodes <- unique(edges$parent) + + for (i in seq_len(length(parent_nodes))) { + parent_node <- parent_nodes[i] + + # root CCF is 1 + if (parent_node == "root") { + parent_w <- rep(1, ncol(mcf_matrix)) + } else { + parent_w <- mcf_matrix[as.numeric(parent_node), ] + } + + kids <- getChildren(am_long, parent_node) + if (length(kids) > 1) { + children_w <- colSums(mcf_matrix[as.numeric(kids), ]) + } else { + children_w <- mcf_matrix[as.numeric(kids), ] + } + + # return false if violates sum properties + if (any(children_w - parent_w > threshold)) return(FALSE) + } + + return(TRUE) +} + +#' Calculate SCHISM fitness scores for trees +#' +#' @export +#' @param w_chain MCMC chain of CCF values, which is the first item in the list returned by \code{clusterSep} +#' @param trees list of tibbles, where each tibble contains edges of a tree with columns edge, parent, child +calcTreeScores <- function(mcf_chain, trees, purity, mc.cores = 1) { + # calculate mean and sd of mcf for each cluster in each sample + mcf_stats <- summarizeWChain(mcf_chain) + + # create cpov matrix + # first: using sample presence to create a binary matrix; 0 is i can be a ancestor of j; 1 if not + # second: for i,j pair that pass the sample presence test, calc stats of the difference between mcf of all + # samples; return binary matrix + # problem: is the stats calculation in create.cpov correct? only using cluster so is actually POV instead + # of CPOV + cpov <- create.cpov(mcf_stats) + mcf_mat <- estimateMCFs(mcf_chain) + + # first calculate topology cost: sum of the cpov matrix over all edges + # potential problem: mcf[i]=0.4->mcf[j]=0.6 has same weight as mcf[i]=0.5->mcf[j]=0.6 + # second calculate mass cost: take the max mass violation among all samples; is there a better strategy? + # fitness is exp(-5*(topology cost + mass cost)) + schism_scores <- unlist(parallel:::mclapply(trees, + function(x) calcTreeFitness(x, cpov, mcf_mat, purity, am_format = "edges"), + mc.cores = mc.cores)) + return(schism_scores) +} + +#' Calculate SCHISM fitness scores for trees +#' +#' @export +calculateTreeScoreMutations <- function(mcf_chain, data, icnTable, cncfTable, multiplicityTable, clusterAssingmentTable, purity, trees, restriction.val = 1, mc.cores = 8) { + + mcfMutations1 <- (data$y * (icnTable$icn*cncfTable+2-2*cncfTable) - data$n * (multiplicityTable$Multiplicity-1)*cncfTable) / data$n + mcfMutations1[mcfMutations1>1] <-1 + mcfMutations1[mcfMutations1<0] <-0 + mcfMutations2 <- (data$n - 2 * data$y) / (data$y * icnTable$icn - 2 * data$y - data$n*multiplicityTable$Multiplicity + data$n) + mcfMutations2[mcfMutations2>1] <-1 + mcfMutations2[mcfMutations2<0] <-0 + + mcfMutations <- matrix(0, nrow = nrow(mcfMutations1), ncol = ncol(mcfMutations1)) + mcfMutations[data$is_cn == 1, ] <- mcfMutations2[data$is_cn == 1, ] + mcfMutations[data$is_cn == 0, ] <- mcfMutations1[data$is_cn == 0, ] + mcfMutations[is.nan(mcfMutations)] <- 0 + + colnames(mcfMutations) <- seq_len(ncol(mcfMutations)) + + mutation_chain <- as_tibble(mcfMutations) %>% rownames_to_column("Row") %>% + pivot_longer( + cols = -Row, + names_to = "Column", + values_to = "value" + ) %>% mutate(Row = as.integer(Row)) + + mutation_chain <- clusterAssingmentTable %>% + mutate(row = row_number()) %>% + inner_join(mutation_chain, by = c("row" = "Row")) %>% + mutate(Parameter = paste("mcf[", row, ",", gsub("V", "", Column), "]", sep = "")) %>% + select(Parameter, value, Cluster) + + mutation_stats <- mutation_chain %>% + group_by(Cluster) %>% + summarize(sd=sd(value),mean=mean(value)) + + mutation_stats <- mutation_chain %>% + left_join(mutation_stats, by="Cluster") %>% + select(Parameter, sd, value) %>% + mutate(mean = value) %>% + select(Parameter, sd, mean) + + # create pov for mutaiton pairs + pov <- create.cpov(mutation_stats) + pov <- pov[2:nrow(pov),] + + cpov <- initializeAdjacencyMatrix(mcf_stats = summarizeWChain(mcf_chain)) + cpov[is.na(cpov)] <- restriction.val + + for (r in 2:nrow(cpov)) { + for (c in 1:ncol(cpov)) { + if (cpov[r,c] == restriction.val) next + from <- r-1 # 'from' cluster node + to <- c # 'to' cluster node + fromMutations <- which(clusterAssingmentTable$Cluster==from) + toMutations <- which(clusterAssingmentTable$Cluster==to) + + totPov <- 0 + for (fromIdx in fromMutations) { + for (toIdx in toMutations) { + totPov = totPov + pov[fromIdx, toIdx] + } + } + totPov = totPov / (length(fromMutations)*length(toMutations)) + cpov[r,c] = totPov + } + } + # scores <- calcTreeScores(chains$mcf_chain, all_spanning_trees, purity) + mcf_mat <- estimateMCFs(mcf_chain) + + # first calculate topology cost: sum of the cpov matrix over all edges + # potential problem: mcf[i]=0.4->mcf[j]=0.6 has same weight as mcf[i]=0.5->mcf[j]=0.6 + # second calculate mass cost: take the max mass violation among all samples; is there a better strategy? + # fitness is exp(-5*(topology cost + mass cost)) + schism_scores <- unlist(parallel:::mclapply(trees, + function(x) calcTreeFitness(x, cpov, mcf_mat, purity, am_format = "edges"), + mc.cores = mc.cores)) +} + +rand.admat <- function(admat) { + for(col in 1:ncol(admat)) { + ind.0 <- which(admat[,col] == 0) # possible positions (0's) + + # place 1 if only 1 position available + if (length(ind.0) == 1) { + admat[ind.0, col] <- 1 + + } else { # else randomly pick which position to place 1 + rand.ind <- sample(ind.0, size=1) + admat[rand.ind,col] <- 1 + } + } + + if (sum(admat[1, ], na.rm=TRUE) == 0) { + # pick random cluster to be connected to root + rand.k <- sample(1:ncol(admat), size = 1) + curr.1 <- which(admat[, rand.k] == 1) + admat[curr.1, rand.k] <- 0 + admat[1, rand.k] <- 1 + } + + while (is.bidirectional(admat)) { + admat <- fix.bidirectional(admat) + } + + admat +} + +restrictions.from.post.admat <- function(post.admat, threshold=0.1) { + # input: posterior adjacency matrix + # output: adjacency matrix with restrictions (0's in allowed positions, NAs in restricted positions) + # For each column, posible positions are the position with the max posterior + # probability and those within the threshold of the max (> (max - 0.1)) + + thresh <- apply(post.admat, 2, max) - threshold + apply(post.admat, 2, function(x) ifelse(x > (max(x)-threshold), 0, NA)) +} + +initializeAdjacencyMatrix <- function(mcf_stats=NULL, mcf_matrix=NULL, zero.thresh=0.01) { + if (!is.null(mcf_stats)) { + MCF <- mcfMatrix(mcf_stats) + } else if (!is.null(mcf_matrix)) { + MCF <- mcf_matrix + } else stop("must supply either mcf_stats or mcf_matrix") + + ## + ## for each row (cluster) of the MCF matrix, + ## list indices of samples for which the cluster is present + ## + K <- nrow(MCF) + S <- ncol(MCF) + # cluster.sample.presence should be a list + MCF_list <- split(MCF, seq(nrow(MCF))) + cluster.sample.presence <- lapply(MCF_list, + function(x) which(x > zero.thresh)) + all.samples <- seq_len(S) + ## initialize adjacency matrix + ##admat <- matrix(data=0, nrow=(1+K), ncol=K) + admat <- matrix(data=0, K, K) + ## can't go to self + diag(admat) <- NA + for(from in seq_len(K)){ + for(to in seq_len(K)){ + ## can't go to self + if (from == to) next() + ## hierarchy restraints + from.samples <- cluster.sample.presence[[from]] + to.samples <- cluster.sample.presence[[to]] + ## no restraints if same sample presence + if (identical(from.samples, to.samples)) next() + ## restraint if # from.samples < # to.samples + if(length(from.samples) < length(to.samples)) { + admat[from, to] <- NA + next() + } + ## no restraints if to.samples is subset of from.samples + if (all(to.samples %in% from.samples)) { + next + } else { + admat[from, to] <- NA + } + } + } + ## Add root + ## can go from root to anyone + admat <- rbind(0, admat) + dimnames(admat) <- list(c("root", paste0("cluster", seq_len(K))), + paste0("cluster", seq_len(K))) + admat +} + +init.admat <- function(w, zero.thresh) { + base <- base.admat(w, zero.thresh) + rand.admat(base) +} + +#' @export +mutate.admat <- function(admat, ncol.to.mutate) { + ## choose a column(s) to mutate + K <- ncol(admat) + rand.ks <- sample(seq_len(K), size=ncol.to.mutate) + ## mutate columns + new.admat <- admat + for(k in rand.ks){ + + temp.admat <- mutate.column(new.admat, k) + # make sure admat is fully connected + while (!is.fully.connected(temp.admat)) { + temp.admat <- mutate.column(new.admat, k) + } + new.admat <- temp.admat + } + + # replace root edge if missing + if (sum(new.admat[1, ]) == 0) { + otherCol <- sample(seq_len(K)[-rand.ks], size = 1) + ind.1 <- which(admat[, otherCol] == 1) + new.admat[1, otherCol] <- 1 + new.admat[ind.1, otherCol] <- 0 + } + + # fix bidirectional edge if present + while (is.bidirectional(new.admat)) { + new.admat <- fix.bidirectional(new.admat) + } + + new.admat +} + +#' @export +mutate.admat.2 <- function(admat, ncol.to.mutate) { + + new.admat <- mutate.n.columns(admat, ncol.to.mutate) + + # make sure admat is fully connected + while (!is.fully.connected(new.admat)) { + new.admat <- mutate.n.columns(admat, ncol.to.mutate) + } + + new.admat +} + +#' @export +mutate.n.columns <- function(admat, ncol.to.mutate) { + K <- ncol(admat) + # columns with more than 1 possible position + columns.to.choose.from <- seq_len(K)[apply(admat, 2, function(x) sum(is.na(x)) < K)] + # sample columns from those with more than 1 possible position + rand.ks <- sample(columns.to.choose.from, size=ncol.to.mutate) + for(k in rand.ks){ + admat <- mutate.column(admat, k) + } + admat +} + +#' @export +mutate.column <- function(admat, k) { + ## possible positions (0's) + possiblePos <- which(!is.na(admat[, k]) & admat[, k] != 1) + ## current position with 1 + ind.1 <- which(admat[, k] == 1) + ## select new position + if (length(possiblePos) == 1) { + new.1 <- possiblePos + } else { + new.1 <- sample(possiblePos, size=1) + } + + admat[ind.1, k] <- 0 + admat[new.1, k] <- 1 + admat +} + +#' @export +mutate.admat.3 <- function(admat, ncol.to.mutate, mcf_matrix) { + + mutate.prob.tb <- get.cluster.mutate.prob(mcf_matrix) + + new.admat <- mutate.n.columns.clusterprob(admat, ncol.to.mutate, mutate.prob.tb) + + # make sure admat is fully connected + while (!is.fully.connected(new.admat)) { + new.admat <- mutate.n.columns.clusterprob(admat, ncol.to.mutate, mutate.prob.tb) + } + + new.admat +} + +#' @export +mutate.n.columns.clusterprob <- function(admat, ncol.to.mutate, mutate.prob.tb) { + K <- ncol(admat) + rand.ks <- sample(seq_len(K), size=ncol.to.mutate, prob = mutate.prob.tb$cluster_prob) + for(k in rand.ks){ + admat <- mutate.column(admat, k) + } + admat +} + +get.cluster.mutate.prob <- function(mcf_matrix) { + sample.presence <- get.sample.presence(mcf_matrix) + tiers <- sort(unique(sample.presence$num_samples)) + tier.prob <- get.tier.probs(tiers) + mutate.prob.tb <- sample.presence %>% + mutate(tier_prob = tier.prob$tier_prob[match(sample.presence$num_samples, tier.prob$num_samples)]) + mutate.prob.tb <- mutate.prob.tb %>% + group_by(num_samples) %>% + mutate(cluster_prob = tier_prob / n()) + mutate.prob.tb +} + +get.sample.presence <- function(mcf_matrix) { + mcf <- round(mcf_matrix, 2) + sample.presence <- tibble(cluster = paste0("cluster", 1:nrow(mcf_matrix)), + num_samples = rowSums(mcf > 0)) + sample.presence +} + +get.tier.probs <- function(tiers) { + numTiers <- length(tiers) + tibble(num_samples = tiers, + tier_prob = round(rev(10^(numTiers))/sum(10^(numTiers)), 5)) + #tier_prob = rev(1:numTiers / sum(1:numTiers))) +} + +is.fully.connected <- function(admat) { + # checks if admat is fully connected + numClusters <- nrow(admat) + nodesInMainTree <- bfs(admat) + numNodesInMainTree <- length(nodesInMainTree) + numClusters == numNodesInMainTree +} + +bfs <- function(admat) { + # starting at root + children <- names(which(admat[1, ] == 1)) + nodes <- c("root", children) + + while(length(children) > 0) { + c <- children[1] + children <- children[-1] + temp.children <- names(which(admat[c, ] == 1)) + children <- c(children, temp.children) + nodes <- c(nodes, temp.children) + } + nodes +} + +is.bidirectional <- function(admat) { + numClusters <- ncol(admat) + for (i in seq_len(numClusters)) { + for (j in seq_len(numClusters)) { + s <- sum(admat[i+1, j], admat[j+1, i], na.rm = TRUE) + if (s == 2) return(TRUE) + } + } + + FALSE +} + +fix.bidirectional <- function(admat) { + numClusters <- ncol(admat) + for (i in seq_len(numClusters)) { + for (j in seq_len(numClusters)) { + s <- sum(admat[i+1, j], admat[j+1, i], na.rm = TRUE) + if (s == 2) { + # pick one edge to change + to <- sample(c(i,j), size = 1) + + new.admat <- admat + ## possible positions (0's) + possiblePos <- which(!is.na(admat[, to]) & admat[, to] != 1) + ## current position with 1 + ind.1 <- which(admat[, to] == 1) + ## select new position + if (length(possiblePos) == 1) { + new.1 <- possiblePos + } else { + new.1 <- sample(possiblePos, size=1) + } + + new.admat[ind.1, to] <- 0 + new.admat[new.1, to] <- 1 + return(new.admat) + } + } + } +} + + +plotDAG <- function(admat){ + admat.untouched <- admat + admat <- cbind(0, admat) ## add column for root + dimnames(admat)[[2]][1] <- "root" + dimnames(admat) <- lapply(dimnames(admat), function(x) gsub("cluster", "", x)) + + admat[is.na(admat)] <- 0 + + net <- network::network(admat, directed=TRUE) + GGally::ggnet2(net, label=TRUE, arrow.size=12, + arrow.gap=0.025, mode = get.DAG.coords.2(admat.untouched)) +} + +get.DAG.coords <- function(admat) { + dat <- data.frame(label = rownames(admat), + x = 0, + y = 0) + + # fix root position + dat[dat$label=="root", ]$x <- 0.5 + dat[dat$label=="root", ]$y <- 1 + + + lvls <- getNodesInLevels(admat) + #lvls <- lapply(lvls, function(x) gsub("cluster", "", x)) + + yvals <- seq(1, 0, by = -1/length(lvls))[-1] + for (i in 1:length(lvls)) { + nodes <- lvls[[i]] + dat[match(nodes, dat$label), ]$y <- yvals[i] + + xvals <- seq(0, 1, by = 1/(length(nodes) + 1))[-c(1, length(nodes) + 2)] + dat[match(nodes, dat$label), ]$x <- xvals + } + cbind(dat$x, dat$y) +} + + +get.DAG.coords.2 <- function(admat) { + nodeInfo <- getNodeInfo(admat) + nodeInfo$x <- 0 + nodeInfo$y <- 0 + + # fix root position + nodeInfo[nodeInfo$node=="root", ]$x <- 0.5 + nodeInfo[nodeInfo$node=="root", ]$y <- 1 + + yvals <- seq(1, 0, by = -1/max(nodeInfo$level))[-1] + + + for (i in 1:max(nodeInfo$level)-1) { + + parents <- nodeInfo[nodeInfo$level == i, ]$node + for (parent in parents) { + if(nodeInfo[nodeInfo$node == parent, ]$numKids == 0) next + kids <- nodeInfo[which(nodeInfo$parent == parent), ]$node + + # set y vals + nodeInfo[match(kids, nodeInfo$node), ]$y <- yvals[i+1] + + # set x vals + if (parent == "root") { + xvals <- seq(0, 1, by = 1/(length(kids) + 1))[-c(1, length(kids) + 2)] + nodeInfo[match(kids, nodeInfo$node), ]$x <- xvals + } else { + p.x <- nodeInfo[which(nodeInfo$node == parent), ]$x + if (length(kids) == 1) { + nodeInfo[match(kids, nodeInfo$node), ]$x <- p.x + } else { + r <- 0.025*(max(nodeInfo$level)-i)* length(kids) + xvals <- seq(p.x - (r/2), p.x + (r/2), length.out = length(kids)) + nodeInfo[match(kids, nodeInfo$node), ]$x <- xvals + } + + } + } + } + cbind(nodeInfo$x, nodeInfo$y) +} + +getNodeInfo <- function(admat) { + nodeInfo <- getLevels(admat) + nodeInfo$parent <- NA + for (r in 2:nrow(nodeInfo)) { + nodeInfo$parent[r] <- names(which(admat[, r-1] == 1)) + + } + nodeInfo$numKids <- rowSums(admat, na.rm = T) + nodeInfo +} + +getLevels <- function(admat) { + nodeNames <- rownames(admat) + numNodes <- nrow(admat) + + lvl <- data.frame(node = nodeNames, + level = 0, + stringsAsFactors = F) + + currParents <- "root" + currKids <- unname(unlist(sapply(currParents, function(x) names(which(admat[x, ] == 1))))) + currlvl <- 1 + + while (length(currKids) > 0) { + lvl[match(currKids, lvl$node), ]$level <- currlvl + currParents <- currKids + currKids <- unname(unlist(sapply(currParents, function(x) names(which(admat[x, ] == 1))))) + + currlvl <- currlvl + 1 + } + lvl +} + +getNodesInLevels <- function(admat) { + nodeNames <- rownames(admat) + numNodes <- nrow(admat) + + lvls <- list() + + currParents <- "root" + currKids <- unname(unlist(lapply(currParents, function(x) names(which(admat[x, ] == 1))))) + currlvl <- 1 + + while (length(currKids) > 0) { + lvls[[currlvl]] <- currKids + currParents <- currKids + currKids <- unname(unlist(lapply(currParents, function(x) names(which(admat[x, ] == 1))))) + currlvl <- currlvl + 1 + } + lvls +} + + + +numericRepresentation <- function(x){ + x[is.na(x)] <- 0 + x <- as.numeric(x) + paste(x, collapse="") +} + +plainAdmat <- function(admat) { + plain <- cbind(root = rep(0, nrow(admat)), admat) + plain[is.na(plain)] <- 0 + plain +} + +calc.tree.prop.true <- function(admat, true.admat) { + # inputs: + # - admat = an adjacency matrix + # - true.admat = the true adjacency matrix + # output: proportion of edges that are correct [0,1] + true.edges <- which(true.admat == 1) + admat.edges <- which(admat == 1) + sum(admat.edges %in% true.edges) / length(true.edges) +} + +plotEnsembleDAG <- function(post.admat, filter1 = TRUE, filter1.threshold = 0.1) { + # filter1 filters columns for edges with posterior prob > (max(column) - filter1.threshold) + + admat <- cbind(0, post.admat) ## add column for root + dimnames(admat)[[2]][1] <- "root" + dimnames(admat) <- lapply(dimnames(admat), function(x) gsub("cluster", "", x)) + admat <- as.matrix(admat) + + ad <- admat + # filter edges + if (filter1) { + #thresh <- apply(admat, 2, max) - filter1.threshold + ad <- apply(admat, 2, function(x) ifelse(x > (max(x)-filter1.threshold), x, 0)) + } + + ig <- graph_from_adjacency_matrix(ad, mode = "directed", weighted = TRUE, + diag = FALSE, add.row = TRUE) + + E(ig)$lty <- ifelse(E(ig)$weight < 0.25, 2, 1) + + # make edge black if only 1 edge to vertex + e <- ends(ig, E(ig)) + numTo <- table(e[,2]) + edgeColors <- sapply(e[,2], function(x) ifelse(x %in% names(which(numTo==1)), "black", "darkgrey")) + E(ig)$color <- edgeColors + + V(ig)$label.cex <- 0.5 + + plot.igraph(ig, layout = layout_as_tree(ig), + vertex.color = "white", vertex.label.family = "Helvetica", + edge.arrow.size = 0.2, edge.arrow.width = 2, + edge.width = E(ig)$weight*3) +} + +mcfMatrix <- function(mcf_stats, parameter="mean"){ + K <- numberClusters(mcf_stats) + S <- numberSamples(mcf_stats) + if(parameter=="mean") + MCF <- matrix(mcf_stats$mean, K, S, byrow=TRUE) + if(parameter=="sd") + MCF <- matrix(mcf_stats$sd, K, S, byrow=TRUE) + MCF +} diff --git a/R/tree.R b/R/tree.R deleted file mode 100644 index ad48b46..0000000 --- a/R/tree.R +++ /dev/null @@ -1,474 +0,0 @@ -rand.admat <- function(admat) { - for(col in 1:ncol(admat)) { - ind.0 <- which(admat[,col] == 0) # possible positions (0's) - - # place 1 if only 1 position available - if (length(ind.0) == 1) { - admat[ind.0, col] <- 1 - - } else { # else randomly pick which position to place 1 - rand.ind <- sample(ind.0, size=1) - admat[rand.ind,col] <- 1 - } - } - - if (sum(admat[1, ], na.rm=TRUE) == 0) { - # pick random cluster to be connected to root - rand.k <- sample(1:ncol(admat), size = 1) - curr.1 <- which(admat[, rand.k] == 1) - admat[curr.1, rand.k] <- 0 - admat[1, rand.k] <- 1 - } - - while (is.bidirectional(admat)) { - admat <- fix.bidirectional(admat) - } - - admat -} - -restrictions.from.post.admat <- function(post.admat, threshold=0.1) { - # input: posterior adjacency matrix - # output: adjacency matrix with restrictions (0's in allowed positions, NAs in restricted positions) - # For each column, posible positions are the position with the max posterior - # probability and those within the threshold of the max (> (max - 0.1)) - - thresh <- apply(post.admat, 2, max) - threshold - apply(post.admat, 2, function(x) ifelse(x > (max(x)-threshold), 0, NA)) -} - -initializeAdjacencyMatrix <- function(mcf_stats=NULL, mcf_matrix=NULL, zero.thresh=0.01) { - if (!is.null(mcf_stats)) { - MCF <- mcfMatrix(mcf_stats) - } else if (!is.null(mcf_matrix)) { - MCF <- mcf_matrix - } else stop("must supply either mcf_stats or mcf_matrix") - - ## - ## for each row (cluster) of the MCF matrix, - ## list indices of samples for which the cluster is present - ## - K <- nrow(MCF) - S <- ncol(MCF) - # cluster.sample.presence should be a list - MCF_list <- split(MCF, seq(nrow(MCF))) - cluster.sample.presence <- lapply(MCF_list, - function(x) which(x > zero.thresh)) - all.samples <- seq_len(S) - ## initialize adjacency matrix - ##admat <- matrix(data=0, nrow=(1+K), ncol=K) - admat <- matrix(data=0, K, K) - ## can't go to self - diag(admat) <- NA - for(from in seq_len(K)){ - for(to in seq_len(K)){ - ## can't go to self - if (from == to) next() - ## hierarchy restraints - from.samples <- cluster.sample.presence[[from]] - to.samples <- cluster.sample.presence[[to]] - ## no restraints if same sample presence - if (identical(from.samples, to.samples)) next() - ## restraint if # from.samples < # to.samples - if(length(from.samples) < length(to.samples)) { - admat[from, to] <- NA - next() - } - ## no restraints if to.samples is subset of from.samples - if (all(to.samples %in% from.samples)) { - next - } else { - admat[from, to] <- NA - } - } - } - ## Add root - ## can go from root to anyone - admat <- rbind(0, admat) - dimnames(admat) <- list(c("root", paste0("cluster", seq_len(K))), - paste0("cluster", seq_len(K))) - admat -} - -init.admat <- function(w, zero.thresh) { - base <- base.admat(w, zero.thresh) - rand.admat(base) -} - - -mutate.admat <- function(admat, ncol.to.mutate) { - ## choose a column(s) to mutate - K <- ncol(admat) - rand.ks <- sample(seq_len(K), size=ncol.to.mutate) - ## mutate columns - new.admat <- admat - for(k in rand.ks){ - - temp.admat <- mutate.column(new.admat, k) - # make sure admat is fully connected - while (!is.fully.connected(temp.admat)) { - temp.admat <- mutate.column(new.admat, k) - } - new.admat <- temp.admat - } - - # replace root edge if missing - if (sum(new.admat[1, ]) == 0) { - otherCol <- sample(seq_len(K)[-rand.ks], size = 1) - ind.1 <- which(admat[, otherCol] == 1) - new.admat[1, otherCol] <- 1 - new.admat[ind.1, otherCol] <- 0 - } - - # fix bidirectional edge if present - while (is.bidirectional(new.admat)) { - new.admat <- fix.bidirectional(new.admat) - } - - new.admat -} - -mutate.admat.2 <- function(admat, ncol.to.mutate) { - - new.admat <- mutate.n.columns(admat, ncol.to.mutate) - - # make sure admat is fully connected - while (!is.fully.connected(new.admat)) { - new.admat <- mutate.n.columns(admat, ncol.to.mutate) - } - - new.admat -} - -mutate.n.columns <- function(admat, ncol.to.mutate) { - K <- ncol(admat) - # columns with more than 1 possible position - columns.to.choose.from <- seq_len(K)[apply(admat, 2, function(x) sum(is.na(x)) < K)] - # sample columns from those with more than 1 possible position - rand.ks <- sample(columns.to.choose.from, size=ncol.to.mutate) - for(k in rand.ks){ - admat <- mutate.column(admat, k) - } - admat -} - -mutate.column <- function(admat, k) { - ## possible positions (0's) - possiblePos <- which(!is.na(admat[, k]) & admat[, k] != 1) - ## current position with 1 - ind.1 <- which(admat[, k] == 1) - ## select new position - if (length(possiblePos) == 1) { - new.1 <- possiblePos - } else { - new.1 <- sample(possiblePos, size=1) - } - - admat[ind.1, k] <- 0 - admat[new.1, k] <- 1 - admat -} - -mutate.admat.3 <- function(admat, ncol.to.mutate, mcf_matrix) { - - mutate.prob.tb <- get.cluster.mutate.prob(mcf_matrix) - - new.admat <- mutate.n.columns.clusterprob(admat, ncol.to.mutate, mutate.prob.tb) - - # make sure admat is fully connected - while (!is.fully.connected(new.admat)) { - new.admat <- mutate.n.columns.clusterprob(admat, ncol.to.mutate, mutate.prob.tb) - } - - new.admat -} - -mutate.n.columns.clusterprob <- function(admat, ncol.to.mutate, mutate.prob.tb) { - K <- ncol(admat) - rand.ks <- sample(seq_len(K), size=ncol.to.mutate, prob = mutate.prob.tb$cluster_prob) - for(k in rand.ks){ - admat <- mutate.column(admat, k) - } - admat -} - -get.cluster.mutate.prob <- function(mcf_matrix) { - sample.presence <- get.sample.presence(mcf_matrix) - tiers <- sort(unique(sample.presence$num_samples)) - tier.prob <- get.tier.probs(tiers) - mutate.prob.tb <- sample.presence %>% - mutate(tier_prob = tier.prob$tier_prob[match(sample.presence$num_samples, tier.prob$num_samples)]) - mutate.prob.tb <- mutate.prob.tb %>% - group_by(num_samples) %>% - mutate(cluster_prob = tier_prob / n()) - mutate.prob.tb -} - -get.sample.presence <- function(mcf_matrix) { - mcf <- round(mcf_matrix, 2) - sample.presence <- tibble(cluster = paste0("cluster", 1:nrow(mcf_matrix)), - num_samples = rowSums(mcf > 0)) - sample.presence -} - -get.tier.probs <- function(tiers) { - numTiers <- length(tiers) - tibble(num_samples = tiers, - tier_prob = round(rev(10^(numTiers))/sum(10^(numTiers)), 5)) - #tier_prob = rev(1:numTiers / sum(1:numTiers))) -} - -is.fully.connected <- function(admat) { - # checks if admat is fully connected - numClusters <- nrow(admat) - nodesInMainTree <- bfs(admat) - numNodesInMainTree <- length(nodesInMainTree) - numClusters == numNodesInMainTree -} - -bfs <- function(admat) { - # starting at root - children <- names(which(admat[1, ] == 1)) - nodes <- c("root", children) - - while(length(children) > 0) { - c <- children[1] - children <- children[-1] - temp.children <- names(which(admat[c, ] == 1)) - children <- c(children, temp.children) - nodes <- c(nodes, temp.children) - } - nodes -} - -is.bidirectional <- function(admat) { - numClusters <- ncol(admat) - for (i in seq_len(numClusters)) { - for (j in seq_len(numClusters)) { - s <- sum(admat[i+1, j], admat[j+1, i], na.rm = TRUE) - if (s == 2) return(TRUE) - } - } - - FALSE -} - -fix.bidirectional <- function(admat) { - numClusters <- ncol(admat) - for (i in seq_len(numClusters)) { - for (j in seq_len(numClusters)) { - s <- sum(admat[i+1, j], admat[j+1, i], na.rm = TRUE) - if (s == 2) { - # pick one edge to change - to <- sample(c(i,j), size = 1) - - new.admat <- admat - ## possible positions (0's) - possiblePos <- which(!is.na(admat[, to]) & admat[, to] != 1) - ## current position with 1 - ind.1 <- which(admat[, to] == 1) - ## select new position - if (length(possiblePos) == 1) { - new.1 <- possiblePos - } else { - new.1 <- sample(possiblePos, size=1) - } - - new.admat[ind.1, to] <- 0 - new.admat[new.1, to] <- 1 - return(new.admat) - } - } - } -} - - -plotDAG <- function(admat){ - admat.untouched <- admat - admat <- cbind(0, admat) ## add column for root - dimnames(admat)[[2]][1] <- "root" - dimnames(admat) <- lapply(dimnames(admat), function(x) gsub("cluster", "", x)) - - admat[is.na(admat)] <- 0 - - net <- network::network(admat, directed=TRUE) - GGally::ggnet2(net, label=TRUE, arrow.size=12, - arrow.gap=0.025, mode = get.DAG.coords.2(admat.untouched)) -} - -get.DAG.coords <- function(admat) { - dat <- data.frame(label = rownames(admat), - x = 0, - y = 0) - - # fix root position - dat[dat$label=="root", ]$x <- 0.5 - dat[dat$label=="root", ]$y <- 1 - - - lvls <- getNodesInLevels(admat) - #lvls <- lapply(lvls, function(x) gsub("cluster", "", x)) - - yvals <- seq(1, 0, by = -1/length(lvls))[-1] - for (i in 1:length(lvls)) { - nodes <- lvls[[i]] - dat[match(nodes, dat$label), ]$y <- yvals[i] - - xvals <- seq(0, 1, by = 1/(length(nodes) + 1))[-c(1, length(nodes) + 2)] - dat[match(nodes, dat$label), ]$x <- xvals - } - cbind(dat$x, dat$y) -} - - -get.DAG.coords.2 <- function(admat) { - nodeInfo <- getNodeInfo(admat) - nodeInfo$x <- 0 - nodeInfo$y <- 0 - - # fix root position - nodeInfo[nodeInfo$node=="root", ]$x <- 0.5 - nodeInfo[nodeInfo$node=="root", ]$y <- 1 - - yvals <- seq(1, 0, by = -1/max(nodeInfo$level))[-1] - - - for (i in 1:max(nodeInfo$level)-1) { - - parents <- nodeInfo[nodeInfo$level == i, ]$node - for (parent in parents) { - if(nodeInfo[nodeInfo$node == parent, ]$numKids == 0) next - kids <- nodeInfo[which(nodeInfo$parent == parent), ]$node - - # set y vals - nodeInfo[match(kids, nodeInfo$node), ]$y <- yvals[i+1] - - # set x vals - if (parent == "root") { - xvals <- seq(0, 1, by = 1/(length(kids) + 1))[-c(1, length(kids) + 2)] - nodeInfo[match(kids, nodeInfo$node), ]$x <- xvals - } else { - p.x <- nodeInfo[which(nodeInfo$node == parent), ]$x - if (length(kids) == 1) { - nodeInfo[match(kids, nodeInfo$node), ]$x <- p.x - } else { - r <- 0.025*(max(nodeInfo$level)-i)* length(kids) - xvals <- seq(p.x - (r/2), p.x + (r/2), length.out = length(kids)) - nodeInfo[match(kids, nodeInfo$node), ]$x <- xvals - } - - } - } - } - cbind(nodeInfo$x, nodeInfo$y) -} - -getNodeInfo <- function(admat) { - nodeInfo <- getLevels(admat) - nodeInfo$parent <- NA - for (r in 2:nrow(nodeInfo)) { - nodeInfo$parent[r] <- names(which(admat[, r-1] == 1)) - - } - nodeInfo$numKids <- rowSums(admat, na.rm = T) - nodeInfo -} - -getLevels <- function(admat) { - nodeNames <- rownames(admat) - numNodes <- nrow(admat) - - lvl <- data.frame(node = nodeNames, - level = 0, - stringsAsFactors = F) - - currParents <- "root" - currKids <- unname(unlist(sapply(currParents, function(x) names(which(admat[x, ] == 1))))) - currlvl <- 1 - - while (length(currKids) > 0) { - lvl[match(currKids, lvl$node), ]$level <- currlvl - currParents <- currKids - currKids <- unname(unlist(sapply(currParents, function(x) names(which(admat[x, ] == 1))))) - - currlvl <- currlvl + 1 - } - lvl -} - -getNodesInLevels <- function(admat) { - nodeNames <- rownames(admat) - numNodes <- nrow(admat) - - lvls <- list() - - currParents <- "root" - currKids <- unname(unlist(lapply(currParents, function(x) names(which(admat[x, ] == 1))))) - currlvl <- 1 - - while (length(currKids) > 0) { - lvls[[currlvl]] <- currKids - currParents <- currKids - currKids <- unname(unlist(lapply(currParents, function(x) names(which(admat[x, ] == 1))))) - currlvl <- currlvl + 1 - } - lvls -} - - - -numericRepresentation <- function(x){ - x[is.na(x)] <- 0 - x <- as.numeric(x) - paste(x, collapse="") -} - -plainAdmat <- function(admat) { - plain <- cbind(root = rep(0, nrow(admat)), admat) - plain[is.na(plain)] <- 0 - plain -} - -calc.tree.prop.true <- function(admat, true.admat) { - # inputs: - # - admat = an adjacency matrix - # - true.admat = the true adjacency matrix - # output: proportion of edges that are correct [0,1] - true.edges <- which(true.admat == 1) - admat.edges <- which(admat == 1) - sum(admat.edges %in% true.edges) / length(true.edges) -} - -plotEnsembleDAG <- function(post.admat, filter1 = TRUE, filter1.threshold = 0.1) { - # filter1 filters columns for edges with posterior prob > (max(column) - filter1.threshold) - - admat <- cbind(0, post.admat) ## add column for root - dimnames(admat)[[2]][1] <- "root" - dimnames(admat) <- lapply(dimnames(admat), function(x) gsub("cluster", "", x)) - admat <- as.matrix(admat) - - ad <- admat - # filter edges - if (filter1) { - #thresh <- apply(admat, 2, max) - filter1.threshold - ad <- apply(admat, 2, function(x) ifelse(x > (max(x)-filter1.threshold), x, 0)) - } - - ig <- graph_from_adjacency_matrix(ad, mode = "directed", weighted = TRUE, - diag = FALSE, add.row = TRUE) - - E(ig)$lty <- ifelse(E(ig)$weight < 0.25, 2, 1) - - # make edge black if only 1 edge to vertex - e <- ends(ig, E(ig)) - numTo <- table(e[,2]) - edgeColors <- sapply(e[,2], function(x) ifelse(x %in% names(which(numTo==1)), "black", "darkgrey")) - E(ig)$color <- edgeColors - - V(ig)$label.cex <- 0.5 - - plot.igraph(ig, layout = layout_as_tree(ig), - vertex.color = "white", vertex.label.family = "Helvetica", - edge.arrow.size = 0.2, edge.arrow.width = 2, - edge.width = E(ig)$weight*3) -} \ No newline at end of file diff --git a/R/tree_scoring.R b/R/tree_scoring.R deleted file mode 100644 index 0424d5e..0000000 --- a/R/tree_scoring.R +++ /dev/null @@ -1,243 +0,0 @@ -summarizeWChain <- function(w.chain) { - # output: mcf_stats - mcf_stats <- w.chain %>% - group_by(Parameter) %>% - summarize(sd=sd(value), - mean=mean(value)) - return(mcf_stats) -} - -create.cpov <- function(mcf_stats, alpha=0.05, zero.thresh=0.01, mcf_matrix = NULL, restriction.val = 1) { - cpov <- NA - MCF <- NA - - ## if mcf_matrix is supplied, use that to create cpov - if (is.null(mcf_matrix)) { - cpov <- initializeAdjacencyMatrix(mcf_stats = mcf_stats, zero.thresh = zero.thresh) - cpov[is.na(cpov)] <- restriction.val - MCF <- mcfMatrix(mcf_stats) - } else { - cpov <- initializeAdjacencyMatrix(mcf_matrix = mcf_matrix, zero.thresh = zero.thresh) - cpov[is.na(cpov)] <- restriction.val - MCF <- mcf_matrix - } - - sds <- mcfMatrix(mcf_stats, parameter="sd") - ##S <- ncol(mcmc_w) # number of samples - S <- numberSamples(mcf_stats) - ##cpov <- cpov[-1, ] - ## root can go to anyone -- all 0's (default base admat value) - for (r in 2:nrow(cpov)) { - for (c in 1:ncol(cpov)) { - if (cpov[r,c] == restriction.val) next # skip restricted position - from <- r-1 # 'from' cluster node - to <- c # 'to' cluster node - statistic <- 0 - pval <- 0 - for(s in seq_len(S)) { - ##d <- mcmc_w[from,s] - mcmc_w[to,s] - d <- MCF[from, s] - MCF[to, s] - d_sd <- sqrt(sds[from, s]^2 + sds[to, s]^2) - ##d_sd <- sqrt((mcmc_w_sd[from,s])^2 + (mcmc_w_sd[to,s])^2) - I <- sum(d < 0) - ## cumulative sum of the - ## number of standard deviations for the difference in - ## MCFs between 2 samples - if (d == 0 || is.nan(d / d_sd)) { - next - } else { - statistic <- statistic + (d / d_sd)^2 * I - } - - for (k in 0:S) { - pval <- pval + ((1 - pchisq(statistic, k)) * - choose(S, k) / (2^S)) - } - } - ## - ## edge seems to be based on this ad-hoc statistic, not - ## the probability of the tree - ## - cpov[r,c] <- decide.ht(pval, alpha) - } - } - cpov -} - -decide.ht <- function(pval, alpha=0.05) { - # 1 signals rejection event for null of i -> j - if (pval <= alpha) return(1) - else return(0) -} - -calcTopologyCost <- function(am, cpov, am_format = "long") { - TC <- 0 - - if (am_format == "long") { - am <- toWide(am) - } - - edges <- which(am == 1, arr.ind=TRUE) - N <- nrow(edges) - for (i in seq_len(N)) { - TC <- TC + cpov[edges[i,1], edges[i,2]] - } - - TC -} - -getEdges <- function(am.long) { - am.long %>% - filter(connected == 1) %>% - mutate(parent = as.character(parent)) -} - -getChildren <- function(am.long, node) { - # returns vector of children nodes - edges <- am.long %>% - mutate(parent = as.character(parent)) %>% - filter(connected == 1) %>% - filter(parent == node) - return(edges$child) -} - -calcMassCost <- function(am, mcf_matrix, am_format="long") { - num_samples <- ncol(mcf_matrix) - - if (am_format == "long") { - edges <- getEdges(am) - - parent_nodes <- unique(edges$parent) - mass_cost <- rep(0, length(parent_nodes)) # mass cost of each parent node - - for (i in seq_len(length(parent_nodes))) { - parent_node <- parent_nodes[i] - - # root CCF is 1 - if (parent_node == "root") { - parent_w <- rep(1, num_samples) - } else { - parent_w <- mcf_matrix[as.numeric(parent_node), ,drop=FALSE] - } - - kids <- getChildren(am, parent_node) - if (length(kids) > 1) { - children_w <- colSums(mcf_matrix[as.numeric(kids), ,drop=FALSE]) - } else { - children_w <- mcf_matrix[as.numeric(kids), ,drop=FALSE] - } - - mc_s <- ifelse(parent_w >= children_w, 0, children_w - parent_w) - #mass_cost[i] <- sqrt(sum(mc_s^2)) - mass_cost[i] <- max(mc_s) # take max across samples instead of euclidean distance - } - return(sum(mass_cost)) - - } else if (am_format == "wide") { - num_children <- rowSums(am, na.rm = T) - nodes <- which(num_children > 0, arr.ind = T) # not leaves - mc_node <- rep(0, length(nodes)) - - for (i in 1:length(nodes)) { - node <- nodes[i] - - # root node: MCF = 1 - parent_w <- rep(1, ncol(mcf_matrix)) - # not root node: look up MCF in mcf_matrix - if (node != 1) { - parent_w <- mcf_matrix[node-1,] - } - - kids <- which(am[node,] == 1, arr.ind = T) - if (num_children[node] > 1) { - children_w <- colSums(mcf_matrix[kids, ]) - } else { - children_w <- mcf_matrix[kids, ] - } - - mc_s <- ifelse(parent_w >= children_w, 0, children_w - parent_w) - mc_node[i] <- sqrt(sum(mc_s^2)) - } - return(sum(mc_node)) - } -} - -edgesToAmLong <- function(edges) { - am_wide <- initEmptyAdmatFromK(length(unique(edges$child))) - edges[edges$parent == "root", "parent"] <- "0" - edges <- edges %>% - mutate(parent = as.numeric(parent) + 1, - child = as.numeric(child)) %>% - select(parent, child) - edges <- as.matrix(edges) - for (r in 1:nrow(edges)) { - am_wide[edges[r,1], edges[r,2]] <- 1 - } - admat <- toLong(am_wide) - admat <- reversedEdges(admat) %>% - mutate(reversed_connected=reversedConnection(.), - bi_directional=NA, - root_connected=NA) - admat <- updateGraphElements(admat) - return(admat) -} - -calcTreeFitness <- function(admat, cpov, mcf_matrix, am_format = "long", weight_mass = 1, weight_topology = 1, scaling_coeff=5) { - # if only edges are given, change into long format - if (am_format == "edges") { - admat <- edgesToAmLong(admat) - am_format <- "long" - } - - TC <- calcTopologyCost(admat, cpov, am_format) - MC <- calcMassCost(admat, mcf_matrix, am_format) - Z <- weight_topology * TC + weight_mass * MC - fitness <- exp(-scaling_coeff * Z) - fitness -} - -satisfiesCCFSumProperties <- function(am_long, mcf_matrix, threshold = 0.2) { - # threshold = max value that children CCFs can be larger than parent; - # i.e. function returns false if (sum of children CCFs) - parent CCF > threshold - - edges <- getEdges(am_long) - parent_nodes <- unique(edges$parent) - - for (i in seq_len(length(parent_nodes))) { - parent_node <- parent_nodes[i] - - # root CCF is 1 - if (parent_node == "root") { - parent_w <- rep(1, ncol(mcf_matrix)) - } else { - parent_w <- mcf_matrix[as.numeric(parent_node), ] - } - - kids <- getChildren(am_long, parent_node) - if (length(kids) > 1) { - children_w <- colSums(mcf_matrix[as.numeric(kids), ]) - } else { - children_w <- mcf_matrix[as.numeric(kids), ] - } - - # return false if violates sum properties - if (any(children_w - parent_w > threshold)) return(FALSE) - } - - return(TRUE) -} - -#' Calculate SCHISM fitness scores for trees -#' -#' @export -#' @param w_chain MCMC chain of CCF values, which is the first item in the list returned by \code{clusterSep} -#' @param trees list of tibbles, where each tibble contains edges of a tree with columns edge, parent, child -calcTreeScores <- function(w_chain, trees, mc.cores = 1) { - mcf_stats <- summarizeWChain(w_chain) - cpov <- create.cpov(mcf_stats) - w_mat <- estimateCCFs(w_chain) - schism_scores <- unlist(parallel:::mclapply(trees, - function(x) calcTreeFitness(x, cpov, w_mat, am_format = "edges"), - mc.cores = mc.cores)) - return(schism_scores) -} \ No newline at end of file diff --git a/data/example_results.rda b/data/example_results.rda deleted file mode 100644 index 65c9ad6..0000000 Binary files a/data/example_results.rda and /dev/null differ diff --git a/data/sim_data_1.rda b/data/sim_data_1.rda deleted file mode 100644 index e764857..0000000 Binary files a/data/sim_data_1.rda and /dev/null differ diff --git a/inst/extdata/example1_snv.csv b/inst/extdata/example1_snv.csv new file mode 100644 index 0000000..88e11f5 --- /dev/null +++ b/inst/extdata/example1_snv.csv @@ -0,0 +1,191 @@ +sample,mutation,total_reads,alt_reads,tumor_integer_copy_number,cncf +sample1,chr1-57-clone1,100,67,4,0.8 +sample1,chr1-110-clone1,100,67,4,0.8 +sample1,chr1-167-clone1,100,67,4,0.8 +sample1,chr1-386-clone1,100,40.0,2,0 +sample1,chr1-441-clone1,100,40.0,2,0 +sample1,chr1-1276-clone1,100,40.0,2,0 +sample1,chr1-1496-clone1,100,40.0,2,0 +sample1,chr1-1631-clone1,100,40.0,2,0 +sample1,chr1-1735-clone1,100,40.0,2,0 +sample1,chr1-2036-clone1,100,40.0,2,0 +sample1,chr1-2428-clone1,100,40.0,2,0 +sample1,chr1-2650-clone1,100,40.0,2,0 +sample1,chr1-2684-clone1,100,40.0,2,0 +sample1,chr1-2730-clone1,100,40.0,2,0 +sample1,chr1-2857-clone1,100,40.0,2,0 +sample1,chr1-2931-clone1,100,40.0,2,0 +sample1,chr1-2977-clone1,100,40.0,2,0 +sample1,chr1-3061-clone1,100,40.0,2,0 +sample1,chr1-3113-clone1,100,40.0,2,0 +sample1,chr1-3173-clone1,100,40.0,2,0 +sample1,chr1-3371-clone1,100,40.0,2,0 +sample1,chr1-3812-clone1,100,40.0,2,0 +sample1,chr1-4278-clone1,100,40.0,2,0 +sample1,chr1-4428-clone1,100,40.0,2,0 +sample1,chr1-4696-clone1,100,40.0,2,0 +sample1,chr1-4893-clone1,100,40.0,2,0 +sample1,chr1-5277-clone1,100,40.0,2,0 +sample1,chr1-5563-clone1,100,40.0,2,0 +sample1,chr1-5597-clone1,100,40.0,2,0 +sample1,chr1-5815-clone1,100,40.0,2,0 +sample1,chr1-6037-clone1,100,40.0,2,0 +sample1,chr1-6113-clone1,100,40.0,2,0 +sample1,chr1-6145-clone1,100,40.0,2,0 +sample1,chr1-6753-clone1,100,40.0,2,0 +sample1,chr1-6792-clone1,100,40.0,2,0 +sample1,chr1-6808-clone1,100,40.0,2,0 +sample1,chr1-6826-clone1,100,40.0,2,0 +sample1,chr1-7094-clone1,100,40.0,2,0 +sample1,chr1-7148-clone1,100,40.0,2,0 +sample1,chr1-7518-clone1,100,40.0,2,0 +sample1,chr1-8257-clone1,100,40.0,2,0 +sample1,chr1-8368-clone1,100,40.0,2,0 +sample1,chr1-8579-clone1,100,40.0,2,0 +sample1,chr1-8701-clone1,100,40.0,2,0 +sample1,chr1-9064-clone1,100,40.0,2,0 +sample1,chr1-9210-clone1,100,40.0,2,0 +sample1,chr1-9274-clone1,100,40.0,2,0 +sample1,chr1-9472-clone1,100,40.0,2,0 +sample1,chr1-9505-clone1,100,40.0,2,0 +sample1,chr1-9586-clone1,100,40.0,2,0 +sample2,chr1-57-clone1,100,62,4,0.7 +sample2,chr1-110-clone1,100,62,4,0.7 +sample2,chr1-154-clone2,100,6,4,0.7 +sample2,chr1-167-clone1,100,62,4,0.7 +sample2,chr1-386-clone1,100,35.0,2,0 +sample2,chr1-441-clone1,100,35.0,2,0 +sample2,chr1-660-clone2,100,10.0,2,0 +sample2,chr1-681-clone2,100,10.0,2,0 +sample2,chr1-1276-clone1,100,35.0,2,0 +sample2,chr1-1496-clone1,100,35.0,2,0 +sample2,chr1-1631-clone1,100,35.0,2,0 +sample2,chr1-1735-clone1,100,35.0,2,0 +sample2,chr1-1963-clone2,100,10.0,2,0 +sample2,chr1-2036-clone1,100,35.0,2,0 +sample2,chr1-2128-clone2,100,10.0,2,0 +sample2,chr1-2428-clone1,100,35.0,2,0 +sample2,chr1-2463-clone2,100,10.0,2,0 +sample2,chr1-2650-clone1,100,35.0,2,0 +sample2,chr1-2684-clone1,100,35.0,2,0 +sample2,chr1-2690-clone2,100,10.0,2,0 +sample2,chr1-2730-clone1,100,35.0,2,0 +sample2,chr1-2857-clone1,100,35.0,2,0 +sample2,chr1-2931-clone1,100,35.0,2,0 +sample2,chr1-2977-clone1,100,35.0,2,0 +sample2,chr1-3045-clone2,100,10.0,2,0 +sample2,chr1-3061-clone1,100,35.0,2,0 +sample2,chr1-3113-clone1,100,35.0,2,0 +sample2,chr1-3173-clone1,100,35.0,2,0 +sample2,chr1-3371-clone1,100,35.0,2,0 +sample2,chr1-3546-clone2,100,10.0,2,0 +sample2,chr1-3754-clone2,100,10.0,2,0 +sample2,chr1-3807-clone2,100,10.0,2,0 +sample2,chr1-3812-clone1,100,35.0,2,0 +sample2,chr1-4223-clone2,100,10.0,2,0 +sample2,chr1-4278-clone1,100,35.0,2,0 +sample2,chr1-4428-clone1,100,35.0,2,0 +sample2,chr1-4696-clone1,100,35.0,2,0 +sample2,chr1-4893-clone1,100,35.0,2,0 +sample2,chr1-4937-clone2,100,10.0,2,0 +sample2,chr1-5277-clone1,100,35.0,2,0 +sample2,chr1-5405-clone2,100,10.0,2,0 +sample2,chr1-5563-clone1,100,35.0,2,0 +sample2,chr1-5597-clone1,100,35.0,2,0 +sample2,chr1-5815-clone1,100,35.0,2,0 +sample2,chr1-6037-clone1,100,35.0,2,0 +sample2,chr1-6113-clone1,100,35.0,2,0 +sample2,chr1-6145-clone1,100,35.0,2,0 +sample2,chr1-6753-clone1,100,35.0,2,0 +sample2,chr1-6792-clone1,100,35.0,2,0 +sample2,chr1-6808-clone1,100,35.0,2,0 +sample2,chr1-6826-clone1,100,35.0,2,0 +sample2,chr1-7094-clone1,100,35.0,2,0 +sample2,chr1-7106-clone2,100,10.0,2,0 +sample2,chr1-7148-clone1,100,35.0,2,0 +sample2,chr1-7518-clone1,100,35.0,2,0 +sample2,chr1-8180-clone2,100,10.0,2,0 +sample2,chr1-8257-clone1,100,35.0,2,0 +sample2,chr1-8368-clone1,100,35.0,2,0 +sample2,chr1-8491-clone2,100,10.0,2,0 +sample2,chr1-8579-clone1,100,35.0,2,0 +sample2,chr1-8701-clone1,100,35.0,2,0 +sample2,chr1-9064-clone1,100,35.0,2,0 +sample2,chr1-9095-clone2,100,10.0,2,0 +sample2,chr1-9210-clone1,100,35.0,2,0 +sample2,chr1-9274-clone1,100,35.0,2,0 +sample2,chr1-9472-clone1,100,35.0,2,0 +sample2,chr1-9505-clone1,100,35.0,2,0 +sample2,chr1-9586-clone1,100,35.0,2,0 +sample2,chr1-9644-clone2,100,10.0,2,0 +sample2,chr1-9765-clone2,100,10.0,2,0 +sample3,chr1-57-clone1,100,62,4,0.7 +sample3,chr1-110-clone1,100,62,4,0.7 +sample3,chr1-167-clone1,100,62,4,0.7 +sample3,chr1-386-clone1,100,35.0,2,0 +sample3,chr1-441-clone1,100,50,1,0.6 +sample3,chr1-1276-clone1,100,50,1,0.6 +sample3,chr1-1496-clone1,100,50,1,0.6 +sample3,chr1-1631-clone1,100,35.0,2,0 +sample3,chr1-1735-clone1,100,35.0,2,0 +sample3,chr1-2036-clone1,100,35.0,2,0 +sample3,chr1-2428-clone1,100,35.0,2,0 +sample3,chr1-2650-clone1,100,35.0,2,0 +sample3,chr1-2684-clone1,100,35.0,2,0 +sample3,chr1-2730-clone1,100,35.0,2,0 +sample3,chr1-2857-clone1,100,35.0,2,0 +sample3,chr1-2931-clone1,100,35.0,2,0 +sample3,chr1-2977-clone1,100,35.0,2,0 +sample3,chr1-3061-clone1,100,35.0,2,0 +sample3,chr1-3113-clone1,100,35.0,2,0 +sample3,chr1-3173-clone1,100,35.0,2,0 +sample3,chr1-3371-clone1,100,35.0,2,0 +sample3,chr1-3812-clone1,100,35.0,2,0 +sample3,chr1-4278-clone1,100,35.0,2,0 +sample3,chr1-4428-clone1,100,35.0,2,0 +sample3,chr1-4696-clone1,100,35.0,2,0 +sample3,chr1-4893-clone1,100,35.0,2,0 +sample3,chr1-5277-clone1,100,35.0,2,0 +sample3,chr1-5563-clone1,100,35.0,2,0 +sample3,chr1-5597-clone1,100,35.0,2,0 +sample3,chr1-5815-clone1,100,35.0,2,0 +sample3,chr1-6037-clone1,100,35.0,2,0 +sample3,chr1-6113-clone1,100,35.0,2,0 +sample3,chr1-6145-clone1,100,35.0,2,0 +sample3,chr1-6753-clone1,100,35.0,2,0 +sample3,chr1-6792-clone1,100,35.0,2,0 +sample3,chr1-6808-clone1,100,35.0,2,0 +sample3,chr1-6826-clone1,100,35.0,2,0 +sample3,chr1-7094-clone1,100,27,3,0.6 +sample3,chr1-7148-clone1,100,27,3,0.6 +sample3,chr1-7518-clone1,100,35.0,2,0 +sample3,chr1-8257-clone1,100,35.0,2,0 +sample3,chr1-8368-clone1,100,35.0,2,0 +sample3,chr1-8579-clone1,100,35.0,2,0 +sample3,chr1-8701-clone1,100,35.0,2,0 +sample3,chr1-9064-clone1,100,35.0,2,0 +sample3,chr1-9210-clone1,100,35.0,2,0 +sample3,chr1-9274-clone1,100,35.0,2,0 +sample3,chr1-9472-clone1,100,35.0,2,0 +sample3,chr1-9505-clone1,100,35.0,2,0 +sample3,chr1-9586-clone1,100,35.0,2,0 +sample3,chr1-8716-clone3,100,30.0,2,0 +sample3,chr1-9104-clone3,100,30.0,2,0 +sample3,chr1-8700-clone3,100,30.0,2,0 +sample3,chr1-8321-clone3,100,30.0,2,0 +sample3,chr1-8790-clone3,100,30.0,2,0 +sample3,chr1-8663-clone3,100,30.0,2,0 +sample3,chr1-8026-clone3,100,30.0,2,0 +sample3,chr1-8101-clone3,100,30.0,2,0 +sample3,chr1-8124-clone3,100,30.0,2,0 +sample3,chr1-9457-clone3,100,30.0,2,0 +sample3,chr1-8911-clone3,100,30.0,2,0 +sample3,chr1-8160-clone3,100,30.0,2,0 +sample3,chr1-8794-clone3,100,30.0,2,0 +sample3,chr1-9636-clone3,100,30.0,2,0 +sample3,chr1-9885-clone3,100,30.0,2,0 +sample3,chr1-8367-clone3,100,30.0,2,0 +sample3,chr1-9615-clone3,100,30.0,2,0 +sample3,chr1-8023-clone3,100,30.0,2,0 +sample3,chr1-9581-clone3,100,30.0,2,0 +sample3,chr1-9458-clone3,100,30.0,2,0 diff --git a/inst/extdata/example1_snv_with_purity.csv b/inst/extdata/example1_snv_with_purity.csv new file mode 100644 index 0000000..35cc190 --- /dev/null +++ b/inst/extdata/example1_snv_with_purity.csv @@ -0,0 +1,191 @@ +sample,mutation,total_reads,alt_reads,tumor_integer_copy_number,cncf,purity +sample1,chr1-57-clone1,100,67,4,0.8,0.8 +sample1,chr1-110-clone1,100,67,4,0.8,0.8 +sample1,chr1-167-clone1,100,67,4,0.8,0.8 +sample1,chr1-386-clone1,100,40.0,2,0,0.8 +sample1,chr1-441-clone1,100,40.0,2,0,0.8 +sample1,chr1-1276-clone1,100,40.0,2,0,0.8 +sample1,chr1-1496-clone1,100,40.0,2,0,0.8 +sample1,chr1-1631-clone1,100,40.0,2,0,0.8 +sample1,chr1-1735-clone1,100,40.0,2,0,0.8 +sample1,chr1-2036-clone1,100,40.0,2,0,0.8 +sample1,chr1-2428-clone1,100,40.0,2,0,0.8 +sample1,chr1-2650-clone1,100,40.0,2,0,0.8 +sample1,chr1-2684-clone1,100,40.0,2,0,0.8 +sample1,chr1-2730-clone1,100,40.0,2,0,0.8 +sample1,chr1-2857-clone1,100,40.0,2,0,0.8 +sample1,chr1-2931-clone1,100,40.0,2,0,0.8 +sample1,chr1-2977-clone1,100,40.0,2,0,0.8 +sample1,chr1-3061-clone1,100,40.0,2,0,0.8 +sample1,chr1-3113-clone1,100,40.0,2,0,0.8 +sample1,chr1-3173-clone1,100,40.0,2,0,0.8 +sample1,chr1-3371-clone1,100,40.0,2,0,0.8 +sample1,chr1-3812-clone1,100,40.0,2,0,0.8 +sample1,chr1-4278-clone1,100,40.0,2,0,0.8 +sample1,chr1-4428-clone1,100,40.0,2,0,0.8 +sample1,chr1-4696-clone1,100,40.0,2,0,0.8 +sample1,chr1-4893-clone1,100,40.0,2,0,0.8 +sample1,chr1-5277-clone1,100,40.0,2,0,0.8 +sample1,chr1-5563-clone1,100,40.0,2,0,0.8 +sample1,chr1-5597-clone1,100,40.0,2,0,0.8 +sample1,chr1-5815-clone1,100,40.0,2,0,0.8 +sample1,chr1-6037-clone1,100,40.0,2,0,0.8 +sample1,chr1-6113-clone1,100,40.0,2,0,0.8 +sample1,chr1-6145-clone1,100,40.0,2,0,0.8 +sample1,chr1-6753-clone1,100,40.0,2,0,0.8 +sample1,chr1-6792-clone1,100,40.0,2,0,0.8 +sample1,chr1-6808-clone1,100,40.0,2,0,0.8 +sample1,chr1-6826-clone1,100,40.0,2,0,0.8 +sample1,chr1-7094-clone1,100,40.0,2,0,0.8 +sample1,chr1-7148-clone1,100,40.0,2,0,0.8 +sample1,chr1-7518-clone1,100,40.0,2,0,0.8 +sample1,chr1-8257-clone1,100,40.0,2,0,0.8 +sample1,chr1-8368-clone1,100,40.0,2,0,0.8 +sample1,chr1-8579-clone1,100,40.0,2,0,0.8 +sample1,chr1-8701-clone1,100,40.0,2,0,0.8 +sample1,chr1-9064-clone1,100,40.0,2,0,0.8 +sample1,chr1-9210-clone1,100,40.0,2,0,0.8 +sample1,chr1-9274-clone1,100,40.0,2,0,0.8 +sample1,chr1-9472-clone1,100,40.0,2,0,0.8 +sample1,chr1-9505-clone1,100,40.0,2,0,0.8 +sample1,chr1-9586-clone1,100,40.0,2,0,0.8 +sample2,chr1-57-clone1,100,62,4,0.7,0.7 +sample2,chr1-110-clone1,100,62,4,0.7,0.7 +sample2,chr1-154-clone2,100,6,4,0.7,0.7 +sample2,chr1-167-clone1,100,62,4,0.7,0.7 +sample2,chr1-386-clone1,100,35.0,2,0,0.7 +sample2,chr1-441-clone1,100,35.0,2,0,0.7 +sample2,chr1-660-clone2,100,10.0,2,0,0.7 +sample2,chr1-681-clone2,100,10.0,2,0,0.7 +sample2,chr1-1276-clone1,100,35.0,2,0,0.7 +sample2,chr1-1496-clone1,100,35.0,2,0,0.7 +sample2,chr1-1631-clone1,100,35.0,2,0,0.7 +sample2,chr1-1735-clone1,100,35.0,2,0,0.7 +sample2,chr1-1963-clone2,100,10.0,2,0,0.7 +sample2,chr1-2036-clone1,100,35.0,2,0,0.7 +sample2,chr1-2128-clone2,100,10.0,2,0,0.7 +sample2,chr1-2428-clone1,100,35.0,2,0,0.7 +sample2,chr1-2463-clone2,100,10.0,2,0,0.7 +sample2,chr1-2650-clone1,100,35.0,2,0,0.7 +sample2,chr1-2684-clone1,100,35.0,2,0,0.7 +sample2,chr1-2690-clone2,100,10.0,2,0,0.7 +sample2,chr1-2730-clone1,100,35.0,2,0,0.7 +sample2,chr1-2857-clone1,100,35.0,2,0,0.7 +sample2,chr1-2931-clone1,100,35.0,2,0,0.7 +sample2,chr1-2977-clone1,100,35.0,2,0,0.7 +sample2,chr1-3045-clone2,100,10.0,2,0,0.7 +sample2,chr1-3061-clone1,100,35.0,2,0,0.7 +sample2,chr1-3113-clone1,100,35.0,2,0,0.7 +sample2,chr1-3173-clone1,100,35.0,2,0,0.7 +sample2,chr1-3371-clone1,100,35.0,2,0,0.7 +sample2,chr1-3546-clone2,100,10.0,2,0,0.7 +sample2,chr1-3754-clone2,100,10.0,2,0,0.7 +sample2,chr1-3807-clone2,100,10.0,2,0,0.7 +sample2,chr1-3812-clone1,100,35.0,2,0,0.7 +sample2,chr1-4223-clone2,100,10.0,2,0,0.7 +sample2,chr1-4278-clone1,100,35.0,2,0,0.7 +sample2,chr1-4428-clone1,100,35.0,2,0,0.7 +sample2,chr1-4696-clone1,100,35.0,2,0,0.7 +sample2,chr1-4893-clone1,100,35.0,2,0,0.7 +sample2,chr1-4937-clone2,100,10.0,2,0,0.7 +sample2,chr1-5277-clone1,100,35.0,2,0,0.7 +sample2,chr1-5405-clone2,100,10.0,2,0,0.7 +sample2,chr1-5563-clone1,100,35.0,2,0,0.7 +sample2,chr1-5597-clone1,100,35.0,2,0,0.7 +sample2,chr1-5815-clone1,100,35.0,2,0,0.7 +sample2,chr1-6037-clone1,100,35.0,2,0,0.7 +sample2,chr1-6113-clone1,100,35.0,2,0,0.7 +sample2,chr1-6145-clone1,100,35.0,2,0,0.7 +sample2,chr1-6753-clone1,100,35.0,2,0,0.7 +sample2,chr1-6792-clone1,100,35.0,2,0,0.7 +sample2,chr1-6808-clone1,100,35.0,2,0,0.7 +sample2,chr1-6826-clone1,100,35.0,2,0,0.7 +sample2,chr1-7094-clone1,100,35.0,2,0,0.7 +sample2,chr1-7106-clone2,100,10.0,2,0,0.7 +sample2,chr1-7148-clone1,100,35.0,2,0,0.7 +sample2,chr1-7518-clone1,100,35.0,2,0,0.7 +sample2,chr1-8180-clone2,100,10.0,2,0,0.7 +sample2,chr1-8257-clone1,100,35.0,2,0,0.7 +sample2,chr1-8368-clone1,100,35.0,2,0,0.7 +sample2,chr1-8491-clone2,100,10.0,2,0,0.7 +sample2,chr1-8579-clone1,100,35.0,2,0,0.7 +sample2,chr1-8701-clone1,100,35.0,2,0,0.7 +sample2,chr1-9064-clone1,100,35.0,2,0,0.7 +sample2,chr1-9095-clone2,100,10.0,2,0,0.7 +sample2,chr1-9210-clone1,100,35.0,2,0,0.7 +sample2,chr1-9274-clone1,100,35.0,2,0,0.7 +sample2,chr1-9472-clone1,100,35.0,2,0,0.7 +sample2,chr1-9505-clone1,100,35.0,2,0,0.7 +sample2,chr1-9586-clone1,100,35.0,2,0,0.7 +sample2,chr1-9644-clone2,100,10.0,2,0,0.7 +sample2,chr1-9765-clone2,100,10.0,2,0,0.7 +sample3,chr1-57-clone1,100,62,4,0.7,0.7 +sample3,chr1-110-clone1,100,62,4,0.7,0.7 +sample3,chr1-167-clone1,100,62,4,0.7,0.7 +sample3,chr1-386-clone1,100,35.0,2,0,0.7 +sample3,chr1-441-clone1,100,50,1,0.6,0.7 +sample3,chr1-1276-clone1,100,50,1,0.6,0.7 +sample3,chr1-1496-clone1,100,50,1,0.6,0.7 +sample3,chr1-1631-clone1,100,35.0,2,0,0.7 +sample3,chr1-1735-clone1,100,35.0,2,0,0.7 +sample3,chr1-2036-clone1,100,35.0,2,0,0.7 +sample3,chr1-2428-clone1,100,35.0,2,0,0.7 +sample3,chr1-2650-clone1,100,35.0,2,0,0.7 +sample3,chr1-2684-clone1,100,35.0,2,0,0.7 +sample3,chr1-2730-clone1,100,35.0,2,0,0.7 +sample3,chr1-2857-clone1,100,35.0,2,0,0.7 +sample3,chr1-2931-clone1,100,35.0,2,0,0.7 +sample3,chr1-2977-clone1,100,35.0,2,0,0.7 +sample3,chr1-3061-clone1,100,35.0,2,0,0.7 +sample3,chr1-3113-clone1,100,35.0,2,0,0.7 +sample3,chr1-3173-clone1,100,35.0,2,0,0.7 +sample3,chr1-3371-clone1,100,35.0,2,0,0.7 +sample3,chr1-3812-clone1,100,35.0,2,0,0.7 +sample3,chr1-4278-clone1,100,35.0,2,0,0.7 +sample3,chr1-4428-clone1,100,35.0,2,0,0.7 +sample3,chr1-4696-clone1,100,35.0,2,0,0.7 +sample3,chr1-4893-clone1,100,35.0,2,0,0.7 +sample3,chr1-5277-clone1,100,35.0,2,0,0.7 +sample3,chr1-5563-clone1,100,35.0,2,0,0.7 +sample3,chr1-5597-clone1,100,35.0,2,0,0.7 +sample3,chr1-5815-clone1,100,35.0,2,0,0.7 +sample3,chr1-6037-clone1,100,35.0,2,0,0.7 +sample3,chr1-6113-clone1,100,35.0,2,0,0.7 +sample3,chr1-6145-clone1,100,35.0,2,0,0.7 +sample3,chr1-6753-clone1,100,35.0,2,0,0.7 +sample3,chr1-6792-clone1,100,35.0,2,0,0.7 +sample3,chr1-6808-clone1,100,35.0,2,0,0.7 +sample3,chr1-6826-clone1,100,35.0,2,0,0.7 +sample3,chr1-7094-clone1,100,27,3,0.6,0.7 +sample3,chr1-7148-clone1,100,27,3,0.6,0.7 +sample3,chr1-7518-clone1,100,35.0,2,0,0.7 +sample3,chr1-8257-clone1,100,35.0,2,0,0.7 +sample3,chr1-8368-clone1,100,35.0,2,0,0.7 +sample3,chr1-8579-clone1,100,35.0,2,0,0.7 +sample3,chr1-8701-clone1,100,35.0,2,0,0.7 +sample3,chr1-9064-clone1,100,35.0,2,0,0.7 +sample3,chr1-9210-clone1,100,35.0,2,0,0.7 +sample3,chr1-9274-clone1,100,35.0,2,0,0.7 +sample3,chr1-9472-clone1,100,35.0,2,0,0.7 +sample3,chr1-9505-clone1,100,35.0,2,0,0.7 +sample3,chr1-9586-clone1,100,35.0,2,0,0.7 +sample3,chr1-8716-clone3,100,30.0,2,0,0.7 +sample3,chr1-9104-clone3,100,30.0,2,0,0.7 +sample3,chr1-8700-clone3,100,30.0,2,0,0.7 +sample3,chr1-8321-clone3,100,30.0,2,0,0.7 +sample3,chr1-8790-clone3,100,30.0,2,0,0.7 +sample3,chr1-8663-clone3,100,30.0,2,0,0.7 +sample3,chr1-8026-clone3,100,30.0,2,0,0.7 +sample3,chr1-8101-clone3,100,30.0,2,0,0.7 +sample3,chr1-8124-clone3,100,30.0,2,0,0.7 +sample3,chr1-9457-clone3,100,30.0,2,0,0.7 +sample3,chr1-8911-clone3,100,30.0,2,0,0.7 +sample3,chr1-8160-clone3,100,30.0,2,0,0.7 +sample3,chr1-8794-clone3,100,30.0,2,0,0.7 +sample3,chr1-9636-clone3,100,30.0,2,0,0.7 +sample3,chr1-9885-clone3,100,30.0,2,0,0.7 +sample3,chr1-8367-clone3,100,30.0,2,0,0.7 +sample3,chr1-9615-clone3,100,30.0,2,0,0.7 +sample3,chr1-8023-clone3,100,30.0,2,0,0.7 +sample3,chr1-9581-clone3,100,30.0,2,0,0.7 +sample3,chr1-9458-clone3,100,30.0,2,0,0.7 diff --git a/inst/extdata/example2_snv.csv b/inst/extdata/example2_snv.csv new file mode 100644 index 0000000..87c7e3f --- /dev/null +++ b/inst/extdata/example2_snv.csv @@ -0,0 +1,191 @@ +sample,mutation,total_reads,alt_reads,tumor_integer_copy_number,major_integer_copy_number,cncf +sample1,chr1-57-clone1,100,67,4,3,0.8 +sample1,chr1-110-clone1,100,67,4,3,0.8 +sample1,chr1-167-clone1,100,67,4,3,0.8 +sample1,chr1-386-clone1,100,40.0,2,1,0 +sample1,chr1-441-clone1,100,40.0,2,1,0 +sample1,chr1-1276-clone1,100,40.0,2,1,0 +sample1,chr1-1496-clone1,100,40.0,2,1,0 +sample1,chr1-1631-clone1,100,40.0,2,1,0 +sample1,chr1-1735-clone1,100,40.0,2,1,0 +sample1,chr1-2036-clone1,100,40.0,2,1,0 +sample1,chr1-2428-clone1,100,40.0,2,1,0 +sample1,chr1-2650-clone1,100,40.0,2,1,0 +sample1,chr1-2684-clone1,100,40.0,2,1,0 +sample1,chr1-2730-clone1,100,40.0,2,1,0 +sample1,chr1-2857-clone1,100,40.0,2,1,0 +sample1,chr1-2931-clone1,100,40.0,2,1,0 +sample1,chr1-2977-clone1,100,40.0,2,1,0 +sample1,chr1-3061-clone1,100,40.0,2,1,0 +sample1,chr1-3113-clone1,100,40.0,2,1,0 +sample1,chr1-3173-clone1,100,40.0,2,1,0 +sample1,chr1-3371-clone1,100,40.0,2,1,0 +sample1,chr1-3812-clone1,100,40.0,2,1,0 +sample1,chr1-4278-clone1,100,40.0,2,1,0 +sample1,chr1-4428-clone1,100,40.0,2,1,0 +sample1,chr1-4696-clone1,100,40.0,2,1,0 +sample1,chr1-4893-clone1,100,40.0,2,1,0 +sample1,chr1-5277-clone1,100,40.0,2,1,0 +sample1,chr1-5563-clone1,100,40.0,2,1,0 +sample1,chr1-5597-clone1,100,40.0,2,1,0 +sample1,chr1-5815-clone1,100,40.0,2,1,0 +sample1,chr1-6037-clone1,100,40.0,2,1,0 +sample1,chr1-6113-clone1,100,40.0,2,1,0 +sample1,chr1-6145-clone1,100,40.0,2,1,0 +sample1,chr1-6753-clone1,100,40.0,2,1,0 +sample1,chr1-6792-clone1,100,40.0,2,1,0 +sample1,chr1-6808-clone1,100,40.0,2,1,0 +sample1,chr1-6826-clone1,100,40.0,2,1,0 +sample1,chr1-7094-clone1,100,40.0,2,1,0 +sample1,chr1-7148-clone1,100,40.0,2,1,0 +sample1,chr1-7518-clone1,100,40.0,2,1,0 +sample1,chr1-8257-clone1,100,40.0,2,1,0 +sample1,chr1-8368-clone1,100,40.0,2,1,0 +sample1,chr1-8579-clone1,100,40.0,2,1,0 +sample1,chr1-8701-clone1,100,40.0,2,1,0 +sample1,chr1-9064-clone1,100,40.0,2,1,0 +sample1,chr1-9210-clone1,100,40.0,2,1,0 +sample1,chr1-9274-clone1,100,40.0,2,1,0 +sample1,chr1-9472-clone1,100,40.0,2,1,0 +sample1,chr1-9505-clone1,100,40.0,2,1,0 +sample1,chr1-9586-clone1,100,40.0,2,1,0 +sample2,chr1-57-clone1,100,62,4,3,0.7 +sample2,chr1-110-clone1,100,62,4,3,0.7 +sample2,chr1-154-clone2,100,6,4,3,0.7 +sample2,chr1-167-clone1,100,62,4,3,0.7 +sample2,chr1-386-clone1,100,35.0,2,1,0 +sample2,chr1-441-clone1,100,35.0,2,1,0 +sample2,chr1-660-clone2,100,10.0,2,1,0 +sample2,chr1-681-clone2,100,10.0,2,1,0 +sample2,chr1-1276-clone1,100,35.0,2,1,0 +sample2,chr1-1496-clone1,100,35.0,2,1,0 +sample2,chr1-1631-clone1,100,35.0,2,1,0 +sample2,chr1-1735-clone1,100,35.0,2,1,0 +sample2,chr1-1963-clone2,100,10.0,2,1,0 +sample2,chr1-2036-clone1,100,35.0,2,1,0 +sample2,chr1-2128-clone2,100,10.0,2,1,0 +sample2,chr1-2428-clone1,100,35.0,2,1,0 +sample2,chr1-2463-clone2,100,10.0,2,1,0 +sample2,chr1-2650-clone1,100,35.0,2,1,0 +sample2,chr1-2684-clone1,100,35.0,2,1,0 +sample2,chr1-2690-clone2,100,10.0,2,1,0 +sample2,chr1-2730-clone1,100,35.0,2,1,0 +sample2,chr1-2857-clone1,100,35.0,2,1,0 +sample2,chr1-2931-clone1,100,35.0,2,1,0 +sample2,chr1-2977-clone1,100,35.0,2,1,0 +sample2,chr1-3045-clone2,100,10.0,2,1,0 +sample2,chr1-3061-clone1,100,35.0,2,1,0 +sample2,chr1-3113-clone1,100,35.0,2,1,0 +sample2,chr1-3173-clone1,100,35.0,2,1,0 +sample2,chr1-3371-clone1,100,35.0,2,1,0 +sample2,chr1-3546-clone2,100,10.0,2,1,0 +sample2,chr1-3754-clone2,100,10.0,2,1,0 +sample2,chr1-3807-clone2,100,10.0,2,1,0 +sample2,chr1-3812-clone1,100,35.0,2,1,0 +sample2,chr1-4223-clone2,100,10.0,2,1,0 +sample2,chr1-4278-clone1,100,35.0,2,1,0 +sample2,chr1-4428-clone1,100,35.0,2,1,0 +sample2,chr1-4696-clone1,100,35.0,2,1,0 +sample2,chr1-4893-clone1,100,35.0,2,1,0 +sample2,chr1-4937-clone2,100,10.0,2,1,0 +sample2,chr1-5277-clone1,100,35.0,2,1,0 +sample2,chr1-5405-clone2,100,10.0,2,1,0 +sample2,chr1-5563-clone1,100,35.0,2,1,0 +sample2,chr1-5597-clone1,100,35.0,2,1,0 +sample2,chr1-5815-clone1,100,35.0,2,1,0 +sample2,chr1-6037-clone1,100,35.0,2,1,0 +sample2,chr1-6113-clone1,100,35.0,2,1,0 +sample2,chr1-6145-clone1,100,35.0,2,1,0 +sample2,chr1-6753-clone1,100,35.0,2,1,0 +sample2,chr1-6792-clone1,100,35.0,2,1,0 +sample2,chr1-6808-clone1,100,35.0,2,1,0 +sample2,chr1-6826-clone1,100,35.0,2,1,0 +sample2,chr1-7094-clone1,100,35.0,2,1,0 +sample2,chr1-7106-clone2,100,10.0,2,1,0 +sample2,chr1-7148-clone1,100,35.0,2,1,0 +sample2,chr1-7518-clone1,100,35.0,2,1,0 +sample2,chr1-8180-clone2,100,10.0,2,1,0 +sample2,chr1-8257-clone1,100,35.0,2,1,0 +sample2,chr1-8368-clone1,100,35.0,2,1,0 +sample2,chr1-8491-clone2,100,10.0,2,1,0 +sample2,chr1-8579-clone1,100,35.0,2,1,0 +sample2,chr1-8701-clone1,100,35.0,2,1,0 +sample2,chr1-9064-clone1,100,35.0,2,1,0 +sample2,chr1-9095-clone2,100,10.0,2,1,0 +sample2,chr1-9210-clone1,100,35.0,2,1,0 +sample2,chr1-9274-clone1,100,35.0,2,1,0 +sample2,chr1-9472-clone1,100,35.0,2,1,0 +sample2,chr1-9505-clone1,100,35.0,2,1,0 +sample2,chr1-9586-clone1,100,35.0,2,1,0 +sample2,chr1-9644-clone2,100,10.0,2,1,0 +sample2,chr1-9765-clone2,100,10.0,2,1,0 +sample3,chr1-57-clone1,100,62,4,3,0.7 +sample3,chr1-110-clone1,100,62,4,3,0.7 +sample3,chr1-167-clone1,100,62,4,3,0.7 +sample3,chr1-386-clone1,100,35.0,2,1,0 +sample3,chr1-441-clone1,100,50,1,1,0.6 +sample3,chr1-1276-clone1,100,50,1,1,0.6 +sample3,chr1-1496-clone1,100,50,1,1,0.6 +sample3,chr1-1631-clone1,100,35.0,2,1,0 +sample3,chr1-1735-clone1,100,35.0,2,1,0 +sample3,chr1-2036-clone1,100,35.0,2,1,0 +sample3,chr1-2428-clone1,100,35.0,2,1,0 +sample3,chr1-2650-clone1,100,35.0,2,1,0 +sample3,chr1-2684-clone1,100,35.0,2,1,0 +sample3,chr1-2730-clone1,100,35.0,2,1,0 +sample3,chr1-2857-clone1,100,35.0,2,1,0 +sample3,chr1-2931-clone1,100,35.0,2,1,0 +sample3,chr1-2977-clone1,100,35.0,2,1,0 +sample3,chr1-3061-clone1,100,35.0,2,1,0 +sample3,chr1-3113-clone1,100,35.0,2,1,0 +sample3,chr1-3173-clone1,100,35.0,2,1,0 +sample3,chr1-3371-clone1,100,35.0,2,1,0 +sample3,chr1-3812-clone1,100,35.0,2,1,0 +sample3,chr1-4278-clone1,100,35.0,2,1,0 +sample3,chr1-4428-clone1,100,35.0,2,1,0 +sample3,chr1-4696-clone1,100,35.0,2,1,0 +sample3,chr1-4893-clone1,100,35.0,2,1,0 +sample3,chr1-5277-clone1,100,35.0,2,1,0 +sample3,chr1-5563-clone1,100,35.0,2,1,0 +sample3,chr1-5597-clone1,100,35.0,2,1,0 +sample3,chr1-5815-clone1,100,35.0,2,1,0 +sample3,chr1-6037-clone1,100,35.0,2,1,0 +sample3,chr1-6113-clone1,100,35.0,2,1,0 +sample3,chr1-6145-clone1,100,35.0,2,1,0 +sample3,chr1-6753-clone1,100,35.0,2,1,0 +sample3,chr1-6792-clone1,100,35.0,2,1,0 +sample3,chr1-6808-clone1,100,35.0,2,1,0 +sample3,chr1-6826-clone1,100,35.0,2,1,0 +sample3,chr1-7094-clone1,100,27,3,2,0.6 +sample3,chr1-7148-clone1,100,27,3,2,0.6 +sample3,chr1-7518-clone1,100,35.0,2,1,0 +sample3,chr1-8257-clone1,100,35.0,2,1,0 +sample3,chr1-8368-clone1,100,35.0,2,1,0 +sample3,chr1-8579-clone1,100,35.0,2,1,0 +sample3,chr1-8701-clone1,100,35.0,2,1,0 +sample3,chr1-9064-clone1,100,35.0,2,1,0 +sample3,chr1-9210-clone1,100,35.0,2,1,0 +sample3,chr1-9274-clone1,100,35.0,2,1,0 +sample3,chr1-9472-clone1,100,35.0,2,1,0 +sample3,chr1-9505-clone1,100,35.0,2,1,0 +sample3,chr1-9586-clone1,100,35.0,2,1,0 +sample3,chr1-8716-clone3,100,30.0,2,1,0 +sample3,chr1-9104-clone3,100,30.0,2,1,0 +sample3,chr1-8700-clone3,100,30.0,2,1,0 +sample3,chr1-8321-clone3,100,30.0,2,1,0 +sample3,chr1-8790-clone3,100,30.0,2,1,0 +sample3,chr1-8663-clone3,100,30.0,2,1,0 +sample3,chr1-8026-clone3,100,30.0,2,1,0 +sample3,chr1-8101-clone3,100,30.0,2,1,0 +sample3,chr1-8124-clone3,100,30.0,2,1,0 +sample3,chr1-9457-clone3,100,30.0,2,1,0 +sample3,chr1-8911-clone3,100,30.0,2,1,0 +sample3,chr1-8160-clone3,100,30.0,2,1,0 +sample3,chr1-8794-clone3,100,30.0,2,1,0 +sample3,chr1-9636-clone3,100,30.0,2,1,0 +sample3,chr1-9885-clone3,100,30.0,2,1,0 +sample3,chr1-8367-clone3,100,30.0,2,1,0 +sample3,chr1-9615-clone3,100,30.0,2,1,0 +sample3,chr1-8023-clone3,100,30.0,2,1,0 +sample3,chr1-9581-clone3,100,30.0,2,1,0 +sample3,chr1-9458-clone3,100,30.0,2,1,0 diff --git a/inst/extdata/example2_snv_with_purity.csv b/inst/extdata/example2_snv_with_purity.csv new file mode 100644 index 0000000..4295875 --- /dev/null +++ b/inst/extdata/example2_snv_with_purity.csv @@ -0,0 +1,191 @@ +sample,mutation,total_reads,alt_reads,tumor_integer_copy_number,major_integer_copy_number,cncf,purity +sample1,chr1-57-clone1,100,67,4,3,0.8,0.8 +sample1,chr1-110-clone1,100,67,4,3,0.8,0.8 +sample1,chr1-167-clone1,100,67,4,3,0.8,0.8 +sample1,chr1-386-clone1,100,40.0,2,1,0,0.8 +sample1,chr1-441-clone1,100,40.0,2,1,0,0.8 +sample1,chr1-1276-clone1,100,40.0,2,1,0,0.8 +sample1,chr1-1496-clone1,100,40.0,2,1,0,0.8 +sample1,chr1-1631-clone1,100,40.0,2,1,0,0.8 +sample1,chr1-1735-clone1,100,40.0,2,1,0,0.8 +sample1,chr1-2036-clone1,100,40.0,2,1,0,0.8 +sample1,chr1-2428-clone1,100,40.0,2,1,0,0.8 +sample1,chr1-2650-clone1,100,40.0,2,1,0,0.8 +sample1,chr1-2684-clone1,100,40.0,2,1,0,0.8 +sample1,chr1-2730-clone1,100,40.0,2,1,0,0.8 +sample1,chr1-2857-clone1,100,40.0,2,1,0,0.8 +sample1,chr1-2931-clone1,100,40.0,2,1,0,0.8 +sample1,chr1-2977-clone1,100,40.0,2,1,0,0.8 +sample1,chr1-3061-clone1,100,40.0,2,1,0,0.8 +sample1,chr1-3113-clone1,100,40.0,2,1,0,0.8 +sample1,chr1-3173-clone1,100,40.0,2,1,0,0.8 +sample1,chr1-3371-clone1,100,40.0,2,1,0,0.8 +sample1,chr1-3812-clone1,100,40.0,2,1,0,0.8 +sample1,chr1-4278-clone1,100,40.0,2,1,0,0.8 +sample1,chr1-4428-clone1,100,40.0,2,1,0,0.8 +sample1,chr1-4696-clone1,100,40.0,2,1,0,0.8 +sample1,chr1-4893-clone1,100,40.0,2,1,0,0.8 +sample1,chr1-5277-clone1,100,40.0,2,1,0,0.8 +sample1,chr1-5563-clone1,100,40.0,2,1,0,0.8 +sample1,chr1-5597-clone1,100,40.0,2,1,0,0.8 +sample1,chr1-5815-clone1,100,40.0,2,1,0,0.8 +sample1,chr1-6037-clone1,100,40.0,2,1,0,0.8 +sample1,chr1-6113-clone1,100,40.0,2,1,0,0.8 +sample1,chr1-6145-clone1,100,40.0,2,1,0,0.8 +sample1,chr1-6753-clone1,100,40.0,2,1,0,0.8 +sample1,chr1-6792-clone1,100,40.0,2,1,0,0.8 +sample1,chr1-6808-clone1,100,40.0,2,1,0,0.8 +sample1,chr1-6826-clone1,100,40.0,2,1,0,0.8 +sample1,chr1-7094-clone1,100,40.0,2,1,0,0.8 +sample1,chr1-7148-clone1,100,40.0,2,1,0,0.8 +sample1,chr1-7518-clone1,100,40.0,2,1,0,0.8 +sample1,chr1-8257-clone1,100,40.0,2,1,0,0.8 +sample1,chr1-8368-clone1,100,40.0,2,1,0,0.8 +sample1,chr1-8579-clone1,100,40.0,2,1,0,0.8 +sample1,chr1-8701-clone1,100,40.0,2,1,0,0.8 +sample1,chr1-9064-clone1,100,40.0,2,1,0,0.8 +sample1,chr1-9210-clone1,100,40.0,2,1,0,0.8 +sample1,chr1-9274-clone1,100,40.0,2,1,0,0.8 +sample1,chr1-9472-clone1,100,40.0,2,1,0,0.8 +sample1,chr1-9505-clone1,100,40.0,2,1,0,0.8 +sample1,chr1-9586-clone1,100,40.0,2,1,0,0.8 +sample2,chr1-57-clone1,100,62,4,3,0.7,0.7 +sample2,chr1-110-clone1,100,62,4,3,0.7,0.7 +sample2,chr1-154-clone2,100,6,4,3,0.7,0.7 +sample2,chr1-167-clone1,100,62,4,3,0.7,0.7 +sample2,chr1-386-clone1,100,35.0,2,1,0,0.7 +sample2,chr1-441-clone1,100,35.0,2,1,0,0.7 +sample2,chr1-660-clone2,100,10.0,2,1,0,0.7 +sample2,chr1-681-clone2,100,10.0,2,1,0,0.7 +sample2,chr1-1276-clone1,100,35.0,2,1,0,0.7 +sample2,chr1-1496-clone1,100,35.0,2,1,0,0.7 +sample2,chr1-1631-clone1,100,35.0,2,1,0,0.7 +sample2,chr1-1735-clone1,100,35.0,2,1,0,0.7 +sample2,chr1-1963-clone2,100,10.0,2,1,0,0.7 +sample2,chr1-2036-clone1,100,35.0,2,1,0,0.7 +sample2,chr1-2128-clone2,100,10.0,2,1,0,0.7 +sample2,chr1-2428-clone1,100,35.0,2,1,0,0.7 +sample2,chr1-2463-clone2,100,10.0,2,1,0,0.7 +sample2,chr1-2650-clone1,100,35.0,2,1,0,0.7 +sample2,chr1-2684-clone1,100,35.0,2,1,0,0.7 +sample2,chr1-2690-clone2,100,10.0,2,1,0,0.7 +sample2,chr1-2730-clone1,100,35.0,2,1,0,0.7 +sample2,chr1-2857-clone1,100,35.0,2,1,0,0.7 +sample2,chr1-2931-clone1,100,35.0,2,1,0,0.7 +sample2,chr1-2977-clone1,100,35.0,2,1,0,0.7 +sample2,chr1-3045-clone2,100,10.0,2,1,0,0.7 +sample2,chr1-3061-clone1,100,35.0,2,1,0,0.7 +sample2,chr1-3113-clone1,100,35.0,2,1,0,0.7 +sample2,chr1-3173-clone1,100,35.0,2,1,0,0.7 +sample2,chr1-3371-clone1,100,35.0,2,1,0,0.7 +sample2,chr1-3546-clone2,100,10.0,2,1,0,0.7 +sample2,chr1-3754-clone2,100,10.0,2,1,0,0.7 +sample2,chr1-3807-clone2,100,10.0,2,1,0,0.7 +sample2,chr1-3812-clone1,100,35.0,2,1,0,0.7 +sample2,chr1-4223-clone2,100,10.0,2,1,0,0.7 +sample2,chr1-4278-clone1,100,35.0,2,1,0,0.7 +sample2,chr1-4428-clone1,100,35.0,2,1,0,0.7 +sample2,chr1-4696-clone1,100,35.0,2,1,0,0.7 +sample2,chr1-4893-clone1,100,35.0,2,1,0,0.7 +sample2,chr1-4937-clone2,100,10.0,2,1,0,0.7 +sample2,chr1-5277-clone1,100,35.0,2,1,0,0.7 +sample2,chr1-5405-clone2,100,10.0,2,1,0,0.7 +sample2,chr1-5563-clone1,100,35.0,2,1,0,0.7 +sample2,chr1-5597-clone1,100,35.0,2,1,0,0.7 +sample2,chr1-5815-clone1,100,35.0,2,1,0,0.7 +sample2,chr1-6037-clone1,100,35.0,2,1,0,0.7 +sample2,chr1-6113-clone1,100,35.0,2,1,0,0.7 +sample2,chr1-6145-clone1,100,35.0,2,1,0,0.7 +sample2,chr1-6753-clone1,100,35.0,2,1,0,0.7 +sample2,chr1-6792-clone1,100,35.0,2,1,0,0.7 +sample2,chr1-6808-clone1,100,35.0,2,1,0,0.7 +sample2,chr1-6826-clone1,100,35.0,2,1,0,0.7 +sample2,chr1-7094-clone1,100,35.0,2,1,0,0.7 +sample2,chr1-7106-clone2,100,10.0,2,1,0,0.7 +sample2,chr1-7148-clone1,100,35.0,2,1,0,0.7 +sample2,chr1-7518-clone1,100,35.0,2,1,0,0.7 +sample2,chr1-8180-clone2,100,10.0,2,1,0,0.7 +sample2,chr1-8257-clone1,100,35.0,2,1,0,0.7 +sample2,chr1-8368-clone1,100,35.0,2,1,0,0.7 +sample2,chr1-8491-clone2,100,10.0,2,1,0,0.7 +sample2,chr1-8579-clone1,100,35.0,2,1,0,0.7 +sample2,chr1-8701-clone1,100,35.0,2,1,0,0.7 +sample2,chr1-9064-clone1,100,35.0,2,1,0,0.7 +sample2,chr1-9095-clone2,100,10.0,2,1,0,0.7 +sample2,chr1-9210-clone1,100,35.0,2,1,0,0.7 +sample2,chr1-9274-clone1,100,35.0,2,1,0,0.7 +sample2,chr1-9472-clone1,100,35.0,2,1,0,0.7 +sample2,chr1-9505-clone1,100,35.0,2,1,0,0.7 +sample2,chr1-9586-clone1,100,35.0,2,1,0,0.7 +sample2,chr1-9644-clone2,100,10.0,2,1,0,0.7 +sample2,chr1-9765-clone2,100,10.0,2,1,0,0.7 +sample3,chr1-57-clone1,100,62,4,3,0.7,0.7 +sample3,chr1-110-clone1,100,62,4,3,0.7,0.7 +sample3,chr1-167-clone1,100,62,4,3,0.7,0.7 +sample3,chr1-386-clone1,100,35.0,2,1,0,0.7 +sample3,chr1-441-clone1,100,50,1,1,0.6,0.7 +sample3,chr1-1276-clone1,100,50,1,1,0.6,0.7 +sample3,chr1-1496-clone1,100,50,1,1,0.6,0.7 +sample3,chr1-1631-clone1,100,35.0,2,1,0,0.7 +sample3,chr1-1735-clone1,100,35.0,2,1,0,0.7 +sample3,chr1-2036-clone1,100,35.0,2,1,0,0.7 +sample3,chr1-2428-clone1,100,35.0,2,1,0,0.7 +sample3,chr1-2650-clone1,100,35.0,2,1,0,0.7 +sample3,chr1-2684-clone1,100,35.0,2,1,0,0.7 +sample3,chr1-2730-clone1,100,35.0,2,1,0,0.7 +sample3,chr1-2857-clone1,100,35.0,2,1,0,0.7 +sample3,chr1-2931-clone1,100,35.0,2,1,0,0.7 +sample3,chr1-2977-clone1,100,35.0,2,1,0,0.7 +sample3,chr1-3061-clone1,100,35.0,2,1,0,0.7 +sample3,chr1-3113-clone1,100,35.0,2,1,0,0.7 +sample3,chr1-3173-clone1,100,35.0,2,1,0,0.7 +sample3,chr1-3371-clone1,100,35.0,2,1,0,0.7 +sample3,chr1-3812-clone1,100,35.0,2,1,0,0.7 +sample3,chr1-4278-clone1,100,35.0,2,1,0,0.7 +sample3,chr1-4428-clone1,100,35.0,2,1,0,0.7 +sample3,chr1-4696-clone1,100,35.0,2,1,0,0.7 +sample3,chr1-4893-clone1,100,35.0,2,1,0,0.7 +sample3,chr1-5277-clone1,100,35.0,2,1,0,0.7 +sample3,chr1-5563-clone1,100,35.0,2,1,0,0.7 +sample3,chr1-5597-clone1,100,35.0,2,1,0,0.7 +sample3,chr1-5815-clone1,100,35.0,2,1,0,0.7 +sample3,chr1-6037-clone1,100,35.0,2,1,0,0.7 +sample3,chr1-6113-clone1,100,35.0,2,1,0,0.7 +sample3,chr1-6145-clone1,100,35.0,2,1,0,0.7 +sample3,chr1-6753-clone1,100,35.0,2,1,0,0.7 +sample3,chr1-6792-clone1,100,35.0,2,1,0,0.7 +sample3,chr1-6808-clone1,100,35.0,2,1,0,0.7 +sample3,chr1-6826-clone1,100,35.0,2,1,0,0.7 +sample3,chr1-7094-clone1,100,27,3,2,0.6,0.7 +sample3,chr1-7148-clone1,100,27,3,2,0.6,0.7 +sample3,chr1-7518-clone1,100,35.0,2,1,0,0.7 +sample3,chr1-8257-clone1,100,35.0,2,1,0,0.7 +sample3,chr1-8368-clone1,100,35.0,2,1,0,0.7 +sample3,chr1-8579-clone1,100,35.0,2,1,0,0.7 +sample3,chr1-8701-clone1,100,35.0,2,1,0,0.7 +sample3,chr1-9064-clone1,100,35.0,2,1,0,0.7 +sample3,chr1-9210-clone1,100,35.0,2,1,0,0.7 +sample3,chr1-9274-clone1,100,35.0,2,1,0,0.7 +sample3,chr1-9472-clone1,100,35.0,2,1,0,0.7 +sample3,chr1-9505-clone1,100,35.0,2,1,0,0.7 +sample3,chr1-9586-clone1,100,35.0,2,1,0,0.7 +sample3,chr1-8716-clone3,100,30.0,2,1,0,0.7 +sample3,chr1-9104-clone3,100,30.0,2,1,0,0.7 +sample3,chr1-8700-clone3,100,30.0,2,1,0,0.7 +sample3,chr1-8321-clone3,100,30.0,2,1,0,0.7 +sample3,chr1-8790-clone3,100,30.0,2,1,0,0.7 +sample3,chr1-8663-clone3,100,30.0,2,1,0,0.7 +sample3,chr1-8026-clone3,100,30.0,2,1,0,0.7 +sample3,chr1-8101-clone3,100,30.0,2,1,0,0.7 +sample3,chr1-8124-clone3,100,30.0,2,1,0,0.7 +sample3,chr1-9457-clone3,100,30.0,2,1,0,0.7 +sample3,chr1-8911-clone3,100,30.0,2,1,0,0.7 +sample3,chr1-8160-clone3,100,30.0,2,1,0,0.7 +sample3,chr1-8794-clone3,100,30.0,2,1,0,0.7 +sample3,chr1-9636-clone3,100,30.0,2,1,0,0.7 +sample3,chr1-9885-clone3,100,30.0,2,1,0,0.7 +sample3,chr1-8367-clone3,100,30.0,2,1,0,0.7 +sample3,chr1-9615-clone3,100,30.0,2,1,0,0.7 +sample3,chr1-8023-clone3,100,30.0,2,1,0,0.7 +sample3,chr1-9581-clone3,100,30.0,2,1,0,0.7 +sample3,chr1-9458-clone3,100,30.0,2,1,0,0.7 diff --git a/inst/extdata/example_S1.csv b/inst/extdata/example_S1.csv deleted file mode 100644 index 726bc67..0000000 --- a/inst/extdata/example_S1.csv +++ /dev/null @@ -1,9 +0,0 @@ -sample,mutation,total_reads,alt_reads,tumor_integer_copy_number,purity,multiplicity -S1,mut1,99,28,2,0.65,1 -S1,mut2,102,65,2,0.65,2 -S1,mut3,93,33,2,0.65,1 -S1,mut4,94,26,2,0.65,1 -S1,mut5,105,32,2,0.65,1 -S1,mut6,96,60,2,0.65,2 -S1,mut7,93,58,2,0.65,2 -S1,mut8,81,52,2,0.65,2 \ No newline at end of file diff --git a/inst/extdata/example_S1_error1.csv b/inst/extdata/example_S1_error1.csv deleted file mode 100644 index 65c4e16..0000000 --- a/inst/extdata/example_S1_error1.csv +++ /dev/null @@ -1,18 +0,0 @@ -sample,mutation,total_reads,alt_reads,tumor_integer_copy_number,purity,multiplicity -S1,mut1,99,28,2,0.1,1 -S1,mut2,102,65,2,0.1,2 -S1,mut3,93,33,2,0.1,1 -S1,mut4,94,26,2,0.2,1 -S1,mut5,105,32,2,0.65,1 -S1,mut6,96,60,2,0.65,2 -S1,mut7,93,58,2,0.65,2 -S1,mut8,81,52,2,0.65,2 -S1,mut9,104,0,2,0.65,1 -S1,mut10,106,0,2,0.65,2 -S1,mut11,84,0,2,0.65,2 -S1,mut12,100,0,2,0.65,2 -S1,mut13,112,0,2,0.65,2 -S1,mut14,108,0,2,0.65,2 -S1,mut15,113,0,2,0.65,2 -S1,mut16,106,0,2,0.65,2 -S1,mut17,109,0,2,0.65,1 \ No newline at end of file diff --git a/inst/extdata/example_S1_error2.csv b/inst/extdata/example_S1_error2.csv deleted file mode 100644 index 025a30f..0000000 --- a/inst/extdata/example_S1_error2.csv +++ /dev/null @@ -1,18 +0,0 @@ -sample,mutation,total_reads,alt_reads,tumor_integer_copy_number,purity,multiplicity -S1,mut1,99,28,2,0.65,0 -S1,mut2,102,65,2,0.65,2 -S1,mut3,93,33,2,0.65,1 -S1,mut4,94,26,2,0.65,1 -S1,mut5,105,32,2,0.65,1 -S1,mut6,96,60,2,0.65,2 -S1,mut7,93,58,2,0.65,2 -S1,mut8,81,52,2,0.65,2 -S1,mut9,104,0,2,0.65,1 -S1,mut10,106,0,2,0.65,2 -S1,mut11,84,0,2,0.65,2 -S1,mut12,100,0,2,0.65,2 -S1,mut13,112,0,2,0.65,2 -S1,mut14,108,0,2,0.65,2 -S1,mut15,113,0,2,0.65,2 -S1,mut16,106,0,2,0.65,2 -S1,mut17,109,0,2,0.65,1 \ No newline at end of file diff --git a/inst/extdata/example_S1_error3.csv b/inst/extdata/example_S1_error3.csv deleted file mode 100644 index 288454f..0000000 --- a/inst/extdata/example_S1_error3.csv +++ /dev/null @@ -1,18 +0,0 @@ -sample,mutation,total_reads,alt_reads,tumor_integer_copy_number,purity,multiplicity -S1,mut1,20,28,2,0.65,1 -S1,mut2,102,65,2,0.65,2 -S1,mut3,93,33,2,0.65,1 -S1,mut4,94,26,2,0.65,1 -S1,mut5,105,32,2,0.65,1 -S1,mut6,96,60,2,0.65,2 -S1,mut7,93,58,2,0.65,2 -S1,mut8,81,52,2,0.65,2 -S1,mut9,104,0,2,0.65,1 -S1,mut10,106,0,2,0.65,2 -S1,mut11,84,0,2,0.65,2 -S1,mut12,100,0,2,0.65,2 -S1,mut13,112,0,2,0.65,2 -S1,mut14,108,0,2,0.65,2 -S1,mut15,113,0,2,0.65,2 -S1,mut16,106,0,2,0.65,2 -S1,mut17,109,0,2,0.65,1 \ No newline at end of file diff --git a/inst/extdata/example_S1_error4.csv b/inst/extdata/example_S1_error4.csv deleted file mode 100644 index e3154c4..0000000 --- a/inst/extdata/example_S1_error4.csv +++ /dev/null @@ -1,18 +0,0 @@ -sample,mutation,total_reads,alt_reads,tumor_integer_copy_number,purity,multiplicity -S1,mut1,0,0,2,0.65,1 -S1,mut2,102,65,2,0.65,2 -S1,mut3,93,33,2,0.65,1 -S1,mut4,94,26,2,0.65,1 -S1,mut5,105,32,2,0.65,1 -S1,mut6,96,60,2,0.65,2 -S1,mut7,93,58,2,0.65,2 -S1,mut8,81,52,2,0.65,2 -S1,mut9,104,0,2,0.65,1 -S1,mut10,106,0,2,0.65,2 -S1,mut11,84,0,2,0.65,2 -S1,mut12,100,0,2,0.65,2 -S1,mut13,112,0,2,0.65,2 -S1,mut14,108,0,2,0.65,2 -S1,mut15,113,0,2,0.65,2 -S1,mut16,106,0,2,0.65,2 -S1,mut17,109,0,2,0.65,1 \ No newline at end of file diff --git a/inst/extdata/example_error1.csv b/inst/extdata/example_error1.csv deleted file mode 100644 index ef207f5..0000000 --- a/inst/extdata/example_error1.csv +++ /dev/null @@ -1,127 +0,0 @@ -sample,mutation,total_reads,alt_reads,tumor_integer_copy_number,purity,multiplicity -S1,mut1,99,28,2,0.65,1 -S1,mut2,102,65,2,0.65,2 -S1,mut3,93,33,2,0.65,0 -S1,mut4,94,26,2,0.65,1 -S1,mut5,105,32,2,0.65,1 -S1,mut6,96,60,2,0.65,2 -S1,mut7,93,58,2,0.65,2 -S1,mut8,81,52,2,0.65,2 -S1,mut9,104,0,2,0.65,1 -S1,mut10,106,0,2,0.65,2 -S1,mut11,84,0,2,0.65,2 -S1,mut12,100,0,2,0.65,2 -S1,mut13,112,0,2,0.65,2 -S1,mut14,108,0,2,0.65,2 -S1,mut15,113,0,2,0.65,2 -S1,mut16,106,0,2,0.65,2 -S1,mut17,109,0,2,0.65,1 -S1,mut18,87,6,2,0.65,1 -S1,mut19,82,4,2,0.65,1 -S1,mut20,87,14,2,0.65,2 -S1,mut21,114,6,2,0.65,1 -S1,mut22,94,6,2,0.65,1 -S1,mut23,110,18,2,0.65,2 -S1,mut24,104,17,2,0.65,2 -S1,mut25,92,5,2,0.65,1 -S1,mut26,95,7,2,0.65,1 -S1,mut27,83,0,2,0.65,1 -S1,mut28,103,0,2,0.65,2 -S1,mut29,93,0,2,0.65,2 -S1,mut30,100,0,2,0.65,2 -S1,mut31,83,0,2,0.65,2 -S1,mut32,92,0,2,0.65,1 -S1,mut33,113,0,2,0.65,1 -S1,mut34,86,0,2,0.65,1 -S1,mut35,94,56,2,0.65,2 -S1,mut36,106,70,2,0.65,2 -S1,mut37,85,27,2,0.65,1 -S1,mut38,113,56,2,0.65,2 -S1,mut39,91,57,2,0.65,2 -S1,mut40,91,22,2,0.65,1 -S1,mut41,100,30,2,0.65,1 -S1,mut42,92,31,2,0.65,1 -S2,mut1,91,13,2,0.85,1 -S2,mut2,99,43,2,0.85,2 -S2,mut3,87,18,2,0.85,1 -S2,mut4,99,25,2,0.85,1 -S2,mut5,102,19,2,0.85,1 -S2,mut6,94,40,2,0.85,2 -S2,mut7,90,31,2,0.85,2 -S2,mut8,90,28,2,0.85,2 -S2,mut9,100,8,2,0.85,1 -S2,mut10,97,14,2,0.85,2 -S2,mut11,104,12,2,0.85,2 -S2,mut12,125,15,2,0.85,2 -S2,mut13,96,10,2,0.85,2 -S2,mut14,100,15,2,0.85,2 -S2,mut15,94,7,2,0.85,2 -S2,mut16,101,16,2,0.85,2 -S2,mut17,92,2,2,0.85,1 -S2,mut18,95,12,2,0.85,1 -S2,mut19,137,16,2,0.85,1 -S2,mut20,98,24,2,0.85,2 -S2,mut21,100,11,2,0.85,1 -S2,mut22,102,5,2,0.85,1 -S2,mut23,101,20,2,0.85,2 -S2,mut24,106,18,2,0.85,2 -S2,mut25,96,7,2,0.85,1 -S2,mut26,116,10,2,0.85,1 -S2,mut27,88,18,2,0.85,1 -S2,mut28,110,47,2,0.85,2 -S2,mut29,95,48,2,0.85,2 -S2,mut30,101,42,2,0.85,2 -S2,mut31,91,42,2,0.85,2 -S2,mut32,94,19,2,0.85,1 -S2,mut33,97,20,2,0.85,1 -S2,mut34,86,27,2,0.85,1 -S2,mut35,95,83,2,0.85,2 -S2,mut36,96,72,2,0.85,2 -S2,mut37,108,47,2,0.85,1 -S2,mut38,107,89,2,0.85,2 -S2,mut39,95,81,2,0.85,2 -S2,mut40,106,52,2,0.85,1 -S2,mut41,91,35,2,0.85,1 -S2,mut42,108,42,2,0.85,1 -S3,mut1,80,12,2,0.58,1 -S3,mut2,105,38,2,0.58,2 -S3,mut3,95,15,2,0.58,1 -S3,mut4,92,13,2,0.58,1 -S3,mut5,91,20,2,0.58,1 -S3,mut6,82,30,2,0.58,2 -S3,mut7,107,39,2,0.58,2 -S3,mut8,93,31,2,0.58,2 -S3,mut9,98,10,2,0.58,1 -S3,mut10,100,20,2,0.58,2 -S3,mut11,109,17,2,0.58,2 -S3,mut12,97,29,2,0.58,2 -S3,mut13,97,20,2,0.58,2 -S3,mut14,101,27,2,0.58,2 -S3,mut15,99,20,2,0.58,2 -S3,mut16,104,17,2,0.58,2 -S3,mut17,94,12,2,0.58,1 -S3,mut18,96,5,2,0.58,1 -S3,mut19,97,7,2,0.58,1 -S3,mut20,116,14,2,0.58,2 -S3,mut21,104,7,2,0.58,1 -S3,mut22,115,5,2,0.58,1 -S3,mut23,82,14,2,0.58,2 -S3,mut24,125,20,2,0.58,2 -S3,mut25,111,10,2,0.58,1 -S3,mut26,106,9,2,0.58,1 -S3,mut27,85,12,2,0.58,1 -S3,mut28,107,28,2,0.58,2 -S3,mut29,100,25,2,0.58,2 -S3,mut30,103,22,2,0.58,2 -S3,mut31,92,17,2,0.58,2 -S3,mut32,98,9,2,0.58,1 -S3,mut33,110,19,2,0.58,1 -S3,mut34,96,15,2,0.58,1 -S3,mut35,108,70,2,0.58,2 -S3,mut36,104,57,2,0.58,2 -S3,mut37,88,21,2,0.58,1 -S3,mut38,97,58,2,0.58,2 -S3,mut39,110,62,2,0.58,2 -S3,mut40,81,25,2,0.58,1 -S3,mut41,86,21,2,0.58,1 -S3,mut42,85,27,2,0.58,1 \ No newline at end of file diff --git a/inst/extdata/example_error2.csv b/inst/extdata/example_error2.csv deleted file mode 100644 index 97cc391..0000000 --- a/inst/extdata/example_error2.csv +++ /dev/null @@ -1,127 +0,0 @@ -sample,mutation,total_reads,alt_reads,tumor_integer_copy_number,purity,multiplicity -S1,mut1,20,28,2,0.65,1 -S1,mut2,102,65,2,0.65,2 -S1,mut3,93,33,2,0.65,1 -S1,mut4,94,26,2,0.65,1 -S1,mut5,105,32,2,0.65,1 -S1,mut6,96,60,2,0.65,2 -S1,mut7,93,58,2,0.65,2 -S1,mut8,81,52,2,0.65,2 -S1,mut9,104,0,2,0.65,1 -S1,mut10,106,0,2,0.65,2 -S1,mut11,84,0,2,0.65,2 -S1,mut12,100,0,2,0.65,2 -S1,mut13,112,0,2,0.65,2 -S1,mut14,108,0,2,0.65,2 -S1,mut15,113,0,2,0.65,2 -S1,mut16,106,0,2,0.65,2 -S1,mut17,109,0,2,0.65,1 -S1,mut18,87,6,2,0.65,1 -S1,mut19,82,4,2,0.65,1 -S1,mut20,87,14,2,0.65,2 -S1,mut21,114,6,2,0.65,1 -S1,mut22,94,6,2,0.65,1 -S1,mut23,110,18,2,0.65,2 -S1,mut24,104,17,2,0.65,2 -S1,mut25,92,5,2,0.65,1 -S1,mut26,95,7,2,0.65,1 -S1,mut27,83,0,2,0.65,1 -S1,mut28,103,0,2,0.65,2 -S1,mut29,93,0,2,0.65,2 -S1,mut30,100,0,2,0.65,2 -S1,mut31,83,0,2,0.65,2 -S1,mut32,92,0,2,0.65,1 -S1,mut33,113,0,2,0.65,1 -S1,mut34,86,0,2,0.65,1 -S1,mut35,94,56,2,0.65,2 -S1,mut36,106,70,2,0.65,2 -S1,mut37,85,27,2,0.65,1 -S1,mut38,113,56,2,0.65,2 -S1,mut39,91,57,2,0.65,2 -S1,mut40,91,22,2,0.65,1 -S1,mut41,100,30,2,0.65,1 -S1,mut42,92,31,2,0.65,1 -S2,mut1,91,13,2,0.85,1 -S2,mut2,99,43,2,0.85,2 -S2,mut3,87,18,2,0.85,1 -S2,mut4,99,25,2,0.85,1 -S2,mut5,102,19,2,0.85,1 -S2,mut6,94,40,2,0.85,2 -S2,mut7,90,31,2,0.85,2 -S2,mut8,90,28,2,0.85,2 -S2,mut9,100,8,2,0.85,1 -S2,mut10,97,14,2,0.85,2 -S2,mut11,104,12,2,0.85,2 -S2,mut12,125,15,2,0.85,2 -S2,mut13,96,10,2,0.85,2 -S2,mut14,100,15,2,0.85,2 -S2,mut15,94,7,2,0.85,2 -S2,mut16,101,16,2,0.85,2 -S2,mut17,92,2,2,0.85,1 -S2,mut18,95,12,2,0.85,1 -S2,mut19,137,16,2,0.85,1 -S2,mut20,98,24,2,0.85,2 -S2,mut21,100,11,2,0.85,1 -S2,mut22,102,5,2,0.85,1 -S2,mut23,101,20,2,0.85,2 -S2,mut24,106,18,2,0.85,2 -S2,mut25,96,7,2,0.85,1 -S2,mut26,116,10,2,0.85,1 -S2,mut27,88,18,2,0.85,1 -S2,mut28,110,47,2,0.85,2 -S2,mut29,95,48,2,0.85,2 -S2,mut30,101,42,2,0.85,2 -S2,mut31,91,42,2,0.85,2 -S2,mut32,94,19,2,0.85,1 -S2,mut33,97,20,2,0.85,1 -S2,mut34,86,27,2,0.85,1 -S2,mut35,95,83,2,0.85,2 -S2,mut36,96,72,2,0.85,2 -S2,mut37,108,47,2,0.85,1 -S2,mut38,107,89,2,0.85,2 -S2,mut39,95,81,2,0.85,2 -S2,mut40,106,52,2,0.85,1 -S2,mut41,91,35,2,0.85,1 -S2,mut42,108,42,2,0.85,1 -S3,mut1,80,12,2,0.58,1 -S3,mut2,105,38,2,0.58,2 -S3,mut3,95,15,2,0.58,1 -S3,mut4,92,13,2,0.58,1 -S3,mut5,91,20,2,0.58,1 -S3,mut6,82,30,2,0.58,2 -S3,mut7,107,39,2,0.58,2 -S3,mut8,93,31,2,0.58,2 -S3,mut9,98,10,2,0.58,1 -S3,mut10,100,20,2,0.58,2 -S3,mut11,109,17,2,0.58,2 -S3,mut12,97,29,2,0.58,2 -S3,mut13,97,20,2,0.58,2 -S3,mut14,101,27,2,0.58,2 -S3,mut15,99,20,2,0.58,2 -S3,mut16,104,17,2,0.58,2 -S3,mut17,94,12,2,0.58,1 -S3,mut18,96,5,2,0.58,1 -S3,mut19,97,7,2,0.58,1 -S3,mut20,116,14,2,0.58,2 -S3,mut21,104,7,2,0.58,1 -S3,mut22,115,5,2,0.58,1 -S3,mut23,82,14,2,0.58,2 -S3,mut24,125,20,2,0.58,2 -S3,mut25,111,10,2,0.58,1 -S3,mut26,106,9,2,0.58,1 -S3,mut27,85,12,2,0.58,1 -S3,mut28,107,28,2,0.58,2 -S3,mut29,100,25,2,0.58,2 -S3,mut30,103,22,2,0.58,2 -S3,mut31,92,17,2,0.58,2 -S3,mut32,98,9,2,0.58,1 -S3,mut33,110,19,2,0.58,1 -S3,mut34,96,15,2,0.58,1 -S3,mut35,108,70,2,0.58,2 -S3,mut36,104,57,2,0.58,2 -S3,mut37,88,21,2,0.58,1 -S3,mut38,97,58,2,0.58,2 -S3,mut39,110,62,2,0.58,2 -S3,mut40,81,25,2,0.58,1 -S3,mut41,86,21,2,0.58,1 -S3,mut42,85,27,2,0.58,1 \ No newline at end of file diff --git a/inst/extdata/example_error3.csv b/inst/extdata/example_error3.csv deleted file mode 100644 index 47632b1..0000000 --- a/inst/extdata/example_error3.csv +++ /dev/null @@ -1,127 +0,0 @@ -sample,mutation,total_reads,alt_reads,tumor_integer_copy_number,purity,multiplicity -S1,mut1,99,28,2,0.1,1 -S1,mut2,102,65,2,0.65,2 -S1,mut3,93,33,2,0.1,1 -S1,mut4,94,26,2,0.65,1 -S1,mut5,105,32,2,0.1,1 -S1,mut6,96,60,2,0.1,2 -S1,mut7,93,58,2,0.1,2 -S1,mut8,81,52,2,0.1,2 -S1,mut9,104,0,2,0.1,1 -S1,mut10,106,0,2,0.1,2 -S1,mut11,84,0,2,0.65,2 -S1,mut12,100,0,2,0.65,2 -S1,mut13,112,0,2,0.65,2 -S1,mut14,108,0,2,0.65,2 -S1,mut15,113,0,2,0.65,2 -S1,mut16,106,0,2,0.65,2 -S1,mut17,109,0,2,0.65,1 -S1,mut18,87,6,2,0.65,1 -S1,mut19,82,4,2,0.65,1 -S1,mut20,87,14,2,0.65,2 -S1,mut21,114,6,2,0.65,1 -S1,mut22,94,6,2,0.65,1 -S1,mut23,110,18,2,0.65,2 -S1,mut24,104,17,2,0.65,2 -S1,mut25,92,5,2,0.65,1 -S1,mut26,95,7,2,0.65,1 -S1,mut27,83,0,2,0.65,1 -S1,mut28,103,0,2,0.65,2 -S1,mut29,93,0,2,0.65,2 -S1,mut30,100,0,2,0.65,2 -S1,mut31,83,0,2,0.65,2 -S1,mut32,92,0,2,0.65,1 -S1,mut33,113,0,2,0.65,1 -S1,mut34,86,0,2,0.65,1 -S1,mut35,94,56,2,0.65,2 -S1,mut36,106,70,2,0.65,2 -S1,mut37,85,27,2,0.65,1 -S1,mut38,113,56,2,0.65,2 -S1,mut39,91,57,2,0.65,2 -S1,mut40,91,22,2,0.65,1 -S1,mut41,100,30,2,0.65,1 -S1,mut42,92,31,2,0.65,1 -S2,mut1,91,13,2,0.85,1 -S2,mut2,99,43,2,0.85,2 -S2,mut3,87,18,2,0.85,1 -S2,mut4,99,25,2,0.85,1 -S2,mut5,102,19,2,0.85,1 -S2,mut6,94,40,2,0.85,2 -S2,mut7,90,31,2,0.85,2 -S2,mut8,90,28,2,0.85,2 -S2,mut9,100,8,2,0.85,1 -S2,mut10,97,14,2,0.85,2 -S2,mut11,104,12,2,0.85,2 -S2,mut12,125,15,2,0.85,2 -S2,mut13,96,10,2,0.85,2 -S2,mut14,100,15,2,0.85,2 -S2,mut15,94,7,2,0.85,2 -S2,mut16,101,16,2,0.85,2 -S2,mut17,92,2,2,0.85,1 -S2,mut18,95,12,2,0.85,1 -S2,mut19,137,16,2,0.85,1 -S2,mut20,98,24,2,0.85,2 -S2,mut21,100,11,2,0.85,1 -S2,mut22,102,5,2,0.85,1 -S2,mut23,101,20,2,0.85,2 -S2,mut24,106,18,2,0.85,2 -S2,mut25,96,7,2,0.85,1 -S2,mut26,116,10,2,0.85,1 -S2,mut27,88,18,2,0.85,1 -S2,mut28,110,47,2,0.85,2 -S2,mut29,95,48,2,0.85,2 -S2,mut30,101,42,2,0.85,2 -S2,mut31,91,42,2,0.85,2 -S2,mut32,94,19,2,0.85,1 -S2,mut33,97,20,2,0.85,1 -S2,mut34,86,27,2,0.85,1 -S2,mut35,95,83,2,0.85,2 -S2,mut36,96,72,2,0.85,2 -S2,mut37,108,47,2,0.85,1 -S2,mut38,107,89,2,0.85,2 -S2,mut39,95,81,2,0.85,2 -S2,mut40,106,52,2,0.85,1 -S2,mut41,91,35,2,0.85,1 -S2,mut42,108,42,2,0.85,1 -S3,mut1,80,12,2,0.58,1 -S3,mut2,105,38,2,0.58,2 -S3,mut3,95,15,2,0.58,1 -S3,mut4,92,13,2,0.58,1 -S3,mut5,91,20,2,0.58,1 -S3,mut6,82,30,2,0.58,2 -S3,mut7,107,39,2,0.58,2 -S3,mut8,93,31,2,0.58,2 -S3,mut9,98,10,2,0.58,1 -S3,mut10,100,20,2,0.58,2 -S3,mut11,109,17,2,0.58,2 -S3,mut12,97,29,2,0.58,2 -S3,mut13,97,20,2,0.58,2 -S3,mut14,101,27,2,0.58,2 -S3,mut15,99,20,2,0.58,2 -S3,mut16,104,17,2,0.58,2 -S3,mut17,94,12,2,0.58,1 -S3,mut18,96,5,2,0.58,1 -S3,mut19,97,7,2,0.58,1 -S3,mut20,116,14,2,0.58,2 -S3,mut21,104,7,2,0.58,1 -S3,mut22,115,5,2,0.58,1 -S3,mut23,82,14,2,0.58,2 -S3,mut24,125,20,2,0.58,2 -S3,mut25,111,10,2,0.58,1 -S3,mut26,106,9,2,0.58,1 -S3,mut27,85,12,2,0.58,1 -S3,mut28,107,28,2,0.58,2 -S3,mut29,100,25,2,0.58,2 -S3,mut30,103,22,2,0.58,2 -S3,mut31,92,17,2,0.58,2 -S3,mut32,98,9,2,0.58,1 -S3,mut33,110,19,2,0.58,1 -S3,mut34,96,15,2,0.58,1 -S3,mut35,108,70,2,0.58,2 -S3,mut36,104,57,2,0.58,2 -S3,mut37,88,21,2,0.58,1 -S3,mut38,97,58,2,0.58,2 -S3,mut39,110,62,2,0.58,2 -S3,mut40,81,25,2,0.58,1 -S3,mut41,86,21,2,0.58,1 -S3,mut42,85,27,2,0.58,1 \ No newline at end of file diff --git a/inst/extdata/example_error4.csv b/inst/extdata/example_error4.csv deleted file mode 100644 index 942bbd7..0000000 --- a/inst/extdata/example_error4.csv +++ /dev/null @@ -1,127 +0,0 @@ -sample,mutation,total_reads,alt_reads,tumor_integer_copy_number,purity,multiplicity -S1,mut1,0,0,2,0.65,1 -S1,mut2,102,65,2,0.65,2 -S1,mut3,0,0,2,0.65,1 -S1,mut4,94,26,2,0.65,1 -S1,mut5,105,32,2,0.65,1 -S1,mut6,96,60,2,0.65,2 -S1,mut7,93,58,2,0.65,2 -S1,mut8,81,52,2,0.65,2 -S1,mut9,104,0,2,0.65,1 -S1,mut10,106,0,2,0.65,2 -S1,mut11,84,0,2,0.65,2 -S1,mut12,100,0,2,0.65,2 -S1,mut13,112,0,2,0.65,2 -S1,mut14,108,0,2,0.65,2 -S1,mut15,113,0,2,0.65,2 -S1,mut16,106,0,2,0.65,2 -S1,mut17,109,0,2,0.65,1 -S1,mut18,87,6,2,0.65,1 -S1,mut19,82,4,2,0.65,1 -S1,mut20,87,14,2,0.65,2 -S1,mut21,114,6,2,0.65,1 -S1,mut22,94,6,2,0.65,1 -S1,mut23,110,18,2,0.65,2 -S1,mut24,104,17,2,0.65,2 -S1,mut25,92,5,2,0.65,1 -S1,mut26,95,7,2,0.65,1 -S1,mut27,83,0,2,0.65,1 -S1,mut28,103,0,2,0.65,2 -S1,mut29,93,0,2,0.65,2 -S1,mut30,100,0,2,0.65,2 -S1,mut31,83,0,2,0.65,2 -S1,mut32,92,0,2,0.65,1 -S1,mut33,113,0,2,0.65,1 -S1,mut34,86,0,2,0.65,1 -S1,mut35,94,56,2,0.65,2 -S1,mut36,106,70,2,0.65,2 -S1,mut37,85,27,2,0.65,1 -S1,mut38,113,56,2,0.65,2 -S1,mut39,91,57,2,0.65,2 -S1,mut40,91,22,2,0.65,1 -S1,mut41,100,30,2,0.65,1 -S1,mut42,92,31,2,0.65,1 -S2,mut1,91,13,2,0.85,1 -S2,mut2,99,43,2,0.85,2 -S2,mut3,87,18,2,0.85,1 -S2,mut4,99,25,2,0.85,1 -S2,mut5,102,19,2,0.85,1 -S2,mut6,94,40,2,0.85,2 -S2,mut7,90,31,2,0.85,2 -S2,mut8,90,28,2,0.85,2 -S2,mut9,100,8,2,0.85,1 -S2,mut10,97,14,2,0.85,2 -S2,mut11,104,12,2,0.85,2 -S2,mut12,125,15,2,0.85,2 -S2,mut13,96,10,2,0.85,2 -S2,mut14,100,15,2,0.85,2 -S2,mut15,94,7,2,0.85,2 -S2,mut16,101,16,2,0.85,2 -S2,mut17,92,2,2,0.85,1 -S2,mut18,95,12,2,0.85,1 -S2,mut19,137,16,2,0.85,1 -S2,mut20,98,24,2,0.85,2 -S2,mut21,100,11,2,0.85,1 -S2,mut22,102,5,2,0.85,1 -S2,mut23,101,20,2,0.85,2 -S2,mut24,106,18,2,0.85,2 -S2,mut25,96,7,2,0.85,1 -S2,mut26,116,10,2,0.85,1 -S2,mut27,88,18,2,0.85,1 -S2,mut28,110,47,2,0.85,2 -S2,mut29,95,48,2,0.85,2 -S2,mut30,101,42,2,0.85,2 -S2,mut31,91,42,2,0.85,2 -S2,mut32,94,19,2,0.85,1 -S2,mut33,97,20,2,0.85,1 -S2,mut34,86,27,2,0.85,1 -S2,mut35,95,83,2,0.85,2 -S2,mut36,96,72,2,0.85,2 -S2,mut37,108,47,2,0.85,1 -S2,mut38,107,89,2,0.85,2 -S2,mut39,95,81,2,0.85,2 -S2,mut40,106,52,2,0.85,1 -S2,mut41,91,35,2,0.85,1 -S2,mut42,108,42,2,0.85,1 -S3,mut1,80,12,2,0.58,1 -S3,mut2,105,38,2,0.58,2 -S3,mut3,95,15,2,0.58,1 -S3,mut4,92,13,2,0.58,1 -S3,mut5,91,20,2,0.58,1 -S3,mut6,82,30,2,0.58,2 -S3,mut7,107,39,2,0.58,2 -S3,mut8,93,31,2,0.58,2 -S3,mut9,98,10,2,0.58,1 -S3,mut10,100,20,2,0.58,2 -S3,mut11,109,17,2,0.58,2 -S3,mut12,97,29,2,0.58,2 -S3,mut13,97,20,2,0.58,2 -S3,mut14,101,27,2,0.58,2 -S3,mut15,99,20,2,0.58,2 -S3,mut16,104,17,2,0.58,2 -S3,mut17,94,12,2,0.58,1 -S3,mut18,96,5,2,0.58,1 -S3,mut19,97,7,2,0.58,1 -S3,mut20,116,14,2,0.58,2 -S3,mut21,104,7,2,0.58,1 -S3,mut22,115,5,2,0.58,1 -S3,mut23,82,14,2,0.58,2 -S3,mut24,125,20,2,0.58,2 -S3,mut25,111,10,2,0.58,1 -S3,mut26,106,9,2,0.58,1 -S3,mut27,85,12,2,0.58,1 -S3,mut28,107,28,2,0.58,2 -S3,mut29,100,25,2,0.58,2 -S3,mut30,103,22,2,0.58,2 -S3,mut31,92,17,2,0.58,2 -S3,mut32,98,9,2,0.58,1 -S3,mut33,110,19,2,0.58,1 -S3,mut34,96,15,2,0.58,1 -S3,mut35,108,70,2,0.58,2 -S3,mut36,104,57,2,0.58,2 -S3,mut37,88,21,2,0.58,1 -S3,mut38,97,58,2,0.58,2 -S3,mut39,110,62,2,0.58,2 -S3,mut40,81,25,2,0.58,1 -S3,mut41,86,21,2,0.58,1 -S3,mut42,85,27,2,0.58,1 \ No newline at end of file diff --git a/inst/extdata/example_input.csv b/inst/extdata/example_input.csv deleted file mode 100644 index 10dbd58..0000000 --- a/inst/extdata/example_input.csv +++ /dev/null @@ -1,127 +0,0 @@ -sample,mutation,total_reads,alt_reads,tumor_integer_copy_number,purity,multiplicity -S1,mut1,99,28,2,0.65,1 -S1,mut2,102,65,2,0.65,2 -S1,mut3,93,33,2,0.65,1 -S1,mut4,94,26,2,0.65,1 -S1,mut5,105,32,2,0.65,1 -S1,mut6,96,60,2,0.65,2 -S1,mut7,93,58,2,0.65,2 -S1,mut8,81,52,2,0.65,2 -S1,mut9,104,0,2,0.65,1 -S1,mut10,106,0,2,0.65,2 -S1,mut11,84,0,2,0.65,2 -S1,mut12,100,0,2,0.65,2 -S1,mut13,112,0,2,0.65,2 -S1,mut14,108,0,2,0.65,2 -S1,mut15,113,0,2,0.65,2 -S1,mut16,106,0,2,0.65,2 -S1,mut17,109,0,2,0.65,1 -S1,mut18,87,6,2,0.65,1 -S1,mut19,82,4,2,0.65,1 -S1,mut20,87,14,2,0.65,2 -S1,mut21,114,6,2,0.65,1 -S1,mut22,94,6,2,0.65,1 -S1,mut23,110,18,2,0.65,2 -S1,mut24,104,17,2,0.65,2 -S1,mut25,92,5,2,0.65,1 -S1,mut26,95,7,2,0.65,1 -S1,mut27,83,0,2,0.65,1 -S1,mut28,103,0,2,0.65,2 -S1,mut29,93,0,2,0.65,2 -S1,mut30,100,0,2,0.65,2 -S1,mut31,83,0,2,0.65,2 -S1,mut32,92,0,2,0.65,1 -S1,mut33,113,0,2,0.65,1 -S1,mut34,86,0,2,0.65,1 -S1,mut35,94,56,2,0.65,2 -S1,mut36,106,70,2,0.65,2 -S1,mut37,85,27,2,0.65,1 -S1,mut38,113,56,2,0.65,2 -S1,mut39,91,57,2,0.65,2 -S1,mut40,91,22,2,0.65,1 -S1,mut41,100,30,2,0.65,1 -S1,mut42,92,31,2,0.65,1 -S2,mut1,91,13,2,0.85,1 -S2,mut2,99,43,2,0.85,2 -S2,mut3,87,18,2,0.85,1 -S2,mut4,99,25,2,0.85,1 -S2,mut5,102,19,2,0.85,1 -S2,mut6,94,40,2,0.85,2 -S2,mut7,90,31,2,0.85,2 -S2,mut8,90,28,2,0.85,2 -S2,mut9,100,8,2,0.85,1 -S2,mut10,97,14,2,0.85,2 -S2,mut11,104,12,2,0.85,2 -S2,mut12,125,15,2,0.85,2 -S2,mut13,96,10,2,0.85,2 -S2,mut14,100,15,2,0.85,2 -S2,mut15,94,7,2,0.85,2 -S2,mut16,101,16,2,0.85,2 -S2,mut17,92,2,2,0.85,1 -S2,mut18,95,12,2,0.85,1 -S2,mut19,137,16,2,0.85,1 -S2,mut20,98,24,2,0.85,2 -S2,mut21,100,11,2,0.85,1 -S2,mut22,102,5,2,0.85,1 -S2,mut23,101,20,2,0.85,2 -S2,mut24,106,18,2,0.85,2 -S2,mut25,96,7,2,0.85,1 -S2,mut26,116,10,2,0.85,1 -S2,mut27,88,18,2,0.85,1 -S2,mut28,110,47,2,0.85,2 -S2,mut29,95,48,2,0.85,2 -S2,mut30,101,42,2,0.85,2 -S2,mut31,91,42,2,0.85,2 -S2,mut32,94,19,2,0.85,1 -S2,mut33,97,20,2,0.85,1 -S2,mut34,86,27,2,0.85,1 -S2,mut35,95,83,2,0.85,2 -S2,mut36,96,72,2,0.85,2 -S2,mut37,108,47,2,0.85,1 -S2,mut38,107,89,2,0.85,2 -S2,mut39,95,81,2,0.85,2 -S2,mut40,106,52,2,0.85,1 -S2,mut41,91,35,2,0.85,1 -S2,mut42,108,42,2,0.85,1 -S3,mut1,80,12,2,0.58,1 -S3,mut2,105,38,2,0.58,2 -S3,mut3,95,15,2,0.58,1 -S3,mut4,92,13,2,0.58,1 -S3,mut5,91,20,2,0.58,1 -S3,mut6,82,30,2,0.58,2 -S3,mut7,107,39,2,0.58,2 -S3,mut8,93,31,2,0.58,2 -S3,mut9,98,10,2,0.58,1 -S3,mut10,100,20,2,0.58,2 -S3,mut11,109,17,2,0.58,2 -S3,mut12,97,29,2,0.58,2 -S3,mut13,97,20,2,0.58,2 -S3,mut14,101,27,2,0.58,2 -S3,mut15,99,20,2,0.58,2 -S3,mut16,104,17,2,0.58,2 -S3,mut17,94,12,2,0.58,1 -S3,mut18,96,5,2,0.58,1 -S3,mut19,97,7,2,0.58,1 -S3,mut20,116,14,2,0.58,2 -S3,mut21,104,7,2,0.58,1 -S3,mut22,115,5,2,0.58,1 -S3,mut23,82,14,2,0.58,2 -S3,mut24,125,20,2,0.58,2 -S3,mut25,111,10,2,0.58,1 -S3,mut26,106,9,2,0.58,1 -S3,mut27,85,12,2,0.58,1 -S3,mut28,107,28,2,0.58,2 -S3,mut29,100,25,2,0.58,2 -S3,mut30,103,22,2,0.58,2 -S3,mut31,92,17,2,0.58,2 -S3,mut32,98,9,2,0.58,1 -S3,mut33,110,19,2,0.58,1 -S3,mut34,96,15,2,0.58,1 -S3,mut35,108,70,2,0.58,2 -S3,mut36,104,57,2,0.58,2 -S3,mut37,88,21,2,0.58,1 -S3,mut38,97,58,2,0.58,2 -S3,mut39,110,62,2,0.58,2 -S3,mut40,81,25,2,0.58,1 -S3,mut41,86,21,2,0.58,1 -S3,mut42,85,27,2,0.58,1 \ No newline at end of file diff --git a/inst/extdata/model-simple-set-beta.jags b/inst/extdata/model-simple-set-beta.jags deleted file mode 100644 index 38ab5a4..0000000 --- a/inst/extdata/model-simple-set-beta.jags +++ /dev/null @@ -1,34 +0,0 @@ -model { - for (i in 1:I) { ## for each variant - for (s in 1:S) { ## for each sample - theta.temp[i,s] <- m[i,s] * w[z[i], s] * purity[s] / - (tcn[i,s] * purity[s] + 2 * (1 - purity[s])) - theta[i,s] <- ifelse(theta.temp[i,s] > 1, 1, theta.temp[i,s]) - y[i, s] ~ dbin(theta[i,s], n[i,s]) - ## posterior predictive distribution - ystar[i, s] ~ dbin(theta[i, s], n[i, s]) - } - z[i] ~ dcat(pi[1:K]) - } - alpha <- rep(1, K) - pi[1:K] ~ ddirch(alpha[1:K]) - - for (k in 1:K) { - - for (s in 1:S) { - #p[k,s] ~ dbeta(2,2) - #z.p[k,s] ~ dcat(c(p[k,s], 1-p[k,s])) - - #w1[k,s] ~ dbeta(1,1)T(0.005, ) - #w_unsorted[k,s] <- ifelse(z.p[k,s] == 1, w1[k,s], 0) - w_unsorted[k,s] ~ dbeta(cluster_shape1[k,s], cluster_shape2[k,s]) - - } - - } - - # order w for sample_to_sort - for (s in 1:S) { - w[1:K, s] <- ifelse(s == sample_to_sort, sort(w_unsorted[1:K, s]), w_unsorted[1:K, s]) - } -} \ No newline at end of file diff --git a/inst/extdata/model-test.jags b/inst/extdata/model-test.jags deleted file mode 100644 index 679bfe4..0000000 --- a/inst/extdata/model-test.jags +++ /dev/null @@ -1,34 +0,0 @@ -model { - for (i in 1:I) { ## for each variant - for (s in 1:S) { ## for each sample - theta.temp[i,s] <- m[i,s] * w[z[i], s] * purity[s] / - (tcn[i,s] * purity[s] + 2 * (1 - purity[s])) - theta[i,s] <- ifelse(theta.temp[i,s] > 1, 1, theta.temp[i,s]) - y[i, s] ~ dbin(theta[i,s], n[i,s]) - ## posterior predictive distribution - ystar[i, s] ~ dbin(theta[i, s], n[i, s]) - } - z[i] ~ dcat(pi[1:K]) - } - alpha <- rep(1, K) - pi[1:K] ~ ddirch(alpha[1:K]) - - for (k in 1:K) { - - for (s in 1:S) { - #p[k,s] ~ dbeta(2,2) - #z.p[k,s] ~ dcat(c(p[k,s], 1-p[k,s])) - - #w1[k,s] ~ dbeta(1,1)T(0.005, ) - #w_unsorted[k,s] <- ifelse(z.p[k,s] == 1, w1[k,s], 0) - w_unsorted[k,s] ~ dbeta(1,1) - - } - - } - - # order w for sample_to_sort - for (s in 1:S) { - w[1:K, s] <- ifelse(s == sample_to_sort, sort(w_unsorted[1:K, s]), w_unsorted[1:K, s]) - } -} \ No newline at end of file diff --git a/inst/extdata/model.jags b/inst/extdata/model.jags new file mode 100644 index 0000000..db73341 --- /dev/null +++ b/inst/extdata/model.jags @@ -0,0 +1,60 @@ +model { + for (i in 1:I) { ## for each variant + for (s in 1:S) { ## for each sample + y[i,s] ~ dbin(vaf[i,s], n[i,s]) + ## posterior predictive distribution + ystar[i,s] ~ dbin(vaf[i,s], n[i,s]) + } + } + + for (i in 1:I) { + for (s in 1:S) { + tcn.snv.est[i,s] <- icn[i] * cncf[i,s] + ploidy * (1 - cncf[i,s]) + tcn.cna.est[i,s] <- icn[i] * mcf[z[i], s] + ploidy * (1 - mcf[z[i], s]) + + vaf.snv[i,s] <- (mcf[z[i], s] + (m[i]-1) * cncf[i,s]) / tcn.snv.est[i,s] + vaf.cna[i,s] <- (mcf[z[i], s] * m[i] + 1 - mcf[z[i], s]) / tcn.cna.est[i,s] + vaf.temp[i,s] <- ifelse(is_cn[i]==0, vaf.snv[i,s], vaf.cna[i,s]) + vaf.temp1[i,s] <- ifelse(vaf.temp[i,s] < 0, 0, vaf.temp[i,s]) + vaf[i,s] <- ifelse(vaf.temp1[i,s] > 1, 1, vaf.temp1[i,s]) + } + } + + for (i in 1:I) { + for (s in 1:S) { + tcn[i,s] ~ dnorm(icn[i] * cncf[i, s] + ploidy * (1 - cncf[i, s]), epsilon)T(0,) + } + } + + epsilon ~ dnorm(0,1)T(0,) + + for (i in 1:I) { + z[i] ~ dcat(pi[1:K]) + + m1[i] ~ dcat(c(1/3,1/3,1/3)) + + #lambda[i] ~ dbin(0.5, 1) + #m[i] <- lambda[i]*mtp[i] + (1-lambda[i])*(icn[i]-mtp[i]) + + m2[i] <- ifelse(m1[i] == 1, 1, ifelse(m1[i] == 2, mtp[i], icn[i]-mtp[i])) + m[i] <- ifelse(is_cn[i] == 0, m2[i], max(icn[i]-mtp[i], mtp[i])) + #m[i] <- ifelse(m1[i] == 1, 1, ifelse(m1[i] == 2, mtp[i], icn[i]-mtp[i])) + } + + alpha <- rep(1, K) + pi[1:K] ~ ddirch(alpha[1:K]) + + for (s in 1:S) { + mcf[1:K, s] <- ifelse(s == sample_to_sort, sort(mcf_unsorted[1:K, s]), mcf_unsorted[1:K, s]) + } + + for (k in 1:K) { + for (s in 1:S) { + eta[k,s] ~ dbeta(5,2) + z.eta[k,s] ~ dcat(c(eta[k,s], 1-eta[k,s])) + + mcf1[k,s] ~ dbeta(1,1)T(0.01, purity[s]) + mcf_unsorted[k,s] <- ifelse(z.eta[k,s] == 1, mcf1[k,s], 0) + } + } +} \ No newline at end of file diff --git a/inst/extdata/model_K1.jags b/inst/extdata/model_K1.jags new file mode 100644 index 0000000..cae443a --- /dev/null +++ b/inst/extdata/model_K1.jags @@ -0,0 +1,54 @@ +model { + for (i in 1:I) { ## for each variant + for (s in 1:S) { ## for each sample + y[i,s] ~ dbin(vaf[i,s], n[i,s]) + ## posterior predictive distribution + ystar[i,s] ~ dbin(vaf[i,s], n[i,s]) + } + } + + for (i in 1:I) { + for (s in 1:S) { + + tcn.snv.est[i,s] <- icn[i] * cncf[i,s] + ploidy * (1 - cncf[i,s]) + tcn.cna.est[i,s] <- icn[i] * mcf[z[i], s] + ploidy * (1 - mcf[z[i], s]) + + vaf.snv[i,s] <- (mcf[z[i], s] + (m[i]-1) * cncf[i,s]) / tcn.snv.est[i,s] + vaf.cna[i,s] <- (mcf[z[i], s] * m[i] + 1 - mcf[z[i], s]) / tcn.cna.est[i,s] + vaf.temp[i,s] <- ifelse(is_cn[i]==0, vaf.snv[i,s], vaf.cna[i,s]) + vaf.temp1[i,s] <- ifelse(vaf.temp[i,s] < 0, 0, vaf.temp[i,s]) + vaf[i,s] <- ifelse(vaf.temp1[i,s] > 1, 1, vaf.temp1[i,s]) + } + } + + for (i in 1:I) { + for (s in 1:S) { + tcn[i,s] ~ dnorm(icn[i] * cncf[i, s] + ploidy * (1 - cncf[i, s]), epsilon)T(0,) + } + } + + epsilon ~ dnorm(0,1)T(0,) + + for (i in 1:I) { + z[i] <- 1 + + m1[i] ~ dcat(c(1/3,1/3,1/3)) + + #lambda[i] ~ dbin(0.5, 1) + #m[i] <- lambda[i]*mtp[i] + (1-lambda[i])*(icn[i]-mtp[i]) + + #m[i] <- ifelse(m1[i] == 1, 1, ifelse(m1[i] == 2, mtp[i], icn[i]-mtp[i])) + m2[i] <- ifelse(m1[i] == 1, 1, ifelse(m1[i] == 2, mtp[i], icn[i]-mtp[i])) + m[i] <- ifelse(is_cn[i]==0, m2[i], max(icn[i]-mtp[i], mtp[i])) + } + + for (s in 1:S) { + + eta[1,s] ~ dbeta(5,2) + z.eta[1,s] ~ dcat(c(eta[1,s], 1-eta[1,s])) + + mcf1[1,s] ~ dbeta(1,1)T(0.01, purity[s]) + mcf[1,s] <- ifelse(z.eta[1,s] == 1, mcf1[1,s], 0) + } + +} \ No newline at end of file diff --git a/inst/extdata/output/all_trees/tree1.csv b/inst/extdata/output/all_trees/tree1.csv new file mode 100644 index 0000000..662f18b --- /dev/null +++ b/inst/extdata/output/all_trees/tree1.csv @@ -0,0 +1,4 @@ +edge,parent,child +root->1,root,1 +1->2,1,2 +1->3,1,3 diff --git a/inst/extdata/output/all_trees/tree1.png b/inst/extdata/output/all_trees/tree1.png new file mode 100644 index 0000000..6fb70db Binary files /dev/null and b/inst/extdata/output/all_trees/tree1.png differ diff --git a/inst/extdata/output/all_trees/tree_1_purity.csv b/inst/extdata/output/all_trees/tree_1_purity.csv new file mode 100644 index 0000000..64bd5e8 --- /dev/null +++ b/inst/extdata/output/all_trees/tree_1_purity.csv @@ -0,0 +1,2 @@ +sample1,sample2,sample3 +0.789,0.7,0.7 diff --git a/inst/extdata/output/all_trees/tree_1_subclone_proportion.csv b/inst/extdata/output/all_trees/tree_1_subclone_proportion.csv new file mode 100644 index 0000000..4d1c62d --- /dev/null +++ b/inst/extdata/output/all_trees/tree_1_subclone_proportion.csv @@ -0,0 +1,4 @@ +,sample1,sample2,sample3 +1,1,0.714,0.14 +2,0,0.286,0 +3,0,0,0.86 diff --git a/inst/extdata/output/all_trees/tree_1_subclone_proportion.png b/inst/extdata/output/all_trees/tree_1_subclone_proportion.png new file mode 100644 index 0000000..d8c02bb Binary files /dev/null and b/inst/extdata/output/all_trees/tree_1_subclone_proportion.png differ diff --git a/inst/extdata/output/clusterAssign.csv b/inst/extdata/output/clusterAssign.csv new file mode 100644 index 0000000..85fcf45 --- /dev/null +++ b/inst/extdata/output/clusterAssign.csv @@ -0,0 +1,91 @@ +Mut_ID,Cluster +chr1-57-clone1,1 +chr1-110-clone1,1 +chr1-167-clone1,1 +chr1-386-clone1,1 +chr1-441-clone1,1 +chr1-1276-clone1,1 +chr1-1496-clone1,1 +chr1-1631-clone1,1 +chr1-1735-clone1,1 +chr1-2036-clone1,1 +chr1-2428-clone1,1 +chr1-2650-clone1,1 +chr1-2684-clone1,1 +chr1-2730-clone1,1 +chr1-2857-clone1,1 +chr1-2931-clone1,1 +chr1-2977-clone1,1 +chr1-3061-clone1,1 +chr1-3113-clone1,1 +chr1-3173-clone1,1 +chr1-3371-clone1,1 +chr1-3812-clone1,1 +chr1-4278-clone1,1 +chr1-4428-clone1,1 +chr1-4696-clone1,1 +chr1-4893-clone1,1 +chr1-5277-clone1,1 +chr1-5563-clone1,1 +chr1-5597-clone1,1 +chr1-5815-clone1,1 +chr1-6037-clone1,1 +chr1-6113-clone1,1 +chr1-6145-clone1,1 +chr1-6753-clone1,1 +chr1-6792-clone1,1 +chr1-6808-clone1,1 +chr1-6826-clone1,1 +chr1-7094-clone1,1 +chr1-7148-clone1,1 +chr1-7518-clone1,1 +chr1-8257-clone1,1 +chr1-8368-clone1,1 +chr1-8579-clone1,1 +chr1-8701-clone1,1 +chr1-9064-clone1,1 +chr1-9210-clone1,1 +chr1-9274-clone1,1 +chr1-9472-clone1,1 +chr1-9505-clone1,1 +chr1-9586-clone1,1 +chr1-154-clone2,2 +chr1-660-clone2,2 +chr1-681-clone2,2 +chr1-1963-clone2,2 +chr1-2128-clone2,2 +chr1-2463-clone2,2 +chr1-2690-clone2,2 +chr1-3045-clone2,2 +chr1-3546-clone2,2 +chr1-3754-clone2,2 +chr1-3807-clone2,2 +chr1-4223-clone2,2 +chr1-4937-clone2,2 +chr1-5405-clone2,2 +chr1-7106-clone2,2 +chr1-8180-clone2,2 +chr1-8491-clone2,2 +chr1-9095-clone2,2 +chr1-9644-clone2,2 +chr1-9765-clone2,2 +chr1-8716-clone3,3 +chr1-9104-clone3,3 +chr1-8700-clone3,3 +chr1-8321-clone3,3 +chr1-8790-clone3,3 +chr1-8663-clone3,3 +chr1-8026-clone3,3 +chr1-8101-clone3,3 +chr1-8124-clone3,3 +chr1-9457-clone3,3 +chr1-8911-clone3,3 +chr1-8160-clone3,3 +chr1-8794-clone3,3 +chr1-9636-clone3,3 +chr1-9885-clone3,3 +chr1-8367-clone3,3 +chr1-9615-clone3,3 +chr1-8023-clone3,3 +chr1-9581-clone3,3 +chr1-9458-clone3,3 diff --git a/inst/extdata/output/icn_all.csv b/inst/extdata/output/icn_all.csv new file mode 100644 index 0000000..7b8858a --- /dev/null +++ b/inst/extdata/output/icn_all.csv @@ -0,0 +1,91 @@ +Mut_ID,icn +chr1-57-clone1,4 +chr1-110-clone1,4 +chr1-167-clone1,4 +chr1-386-clone1,2 +chr1-441-clone1,1 +chr1-1276-clone1,1 +chr1-1496-clone1,1 +chr1-1631-clone1,2 +chr1-1735-clone1,2 +chr1-2036-clone1,2 +chr1-2428-clone1,2 +chr1-2650-clone1,2 +chr1-2684-clone1,2 +chr1-2730-clone1,2 +chr1-2857-clone1,2 +chr1-2931-clone1,2 +chr1-2977-clone1,2 +chr1-3061-clone1,2 +chr1-3113-clone1,2 +chr1-3173-clone1,2 +chr1-3371-clone1,2 +chr1-3812-clone1,2 +chr1-4278-clone1,2 +chr1-4428-clone1,2 +chr1-4696-clone1,2 +chr1-4893-clone1,2 +chr1-5277-clone1,2 +chr1-5563-clone1,2 +chr1-5597-clone1,2 +chr1-5815-clone1,2 +chr1-6037-clone1,2 +chr1-6113-clone1,2 +chr1-6145-clone1,2 +chr1-6753-clone1,2 +chr1-6792-clone1,2 +chr1-6808-clone1,2 +chr1-6826-clone1,2 +chr1-7094-clone1,3 +chr1-7148-clone1,3 +chr1-7518-clone1,2 +chr1-8257-clone1,2 +chr1-8368-clone1,2 +chr1-8579-clone1,2 +chr1-8701-clone1,2 +chr1-9064-clone1,2 +chr1-9210-clone1,2 +chr1-9274-clone1,2 +chr1-9472-clone1,2 +chr1-9505-clone1,2 +chr1-9586-clone1,2 +chr1-154-clone2,4 +chr1-660-clone2,2 +chr1-681-clone2,2 +chr1-1963-clone2,2 +chr1-2128-clone2,2 +chr1-2463-clone2,2 +chr1-2690-clone2,2 +chr1-3045-clone2,2 +chr1-3546-clone2,2 +chr1-3754-clone2,2 +chr1-3807-clone2,2 +chr1-4223-clone2,2 +chr1-4937-clone2,2 +chr1-5405-clone2,2 +chr1-7106-clone2,2 +chr1-8180-clone2,2 +chr1-8491-clone2,2 +chr1-9095-clone2,2 +chr1-9644-clone2,2 +chr1-9765-clone2,2 +chr1-8716-clone3,2 +chr1-9104-clone3,2 +chr1-8700-clone3,2 +chr1-8321-clone3,2 +chr1-8790-clone3,2 +chr1-8663-clone3,2 +chr1-8026-clone3,2 +chr1-8101-clone3,2 +chr1-8124-clone3,2 +chr1-9457-clone3,2 +chr1-8911-clone3,2 +chr1-8160-clone3,2 +chr1-8794-clone3,2 +chr1-9636-clone3,2 +chr1-9885-clone3,2 +chr1-8367-clone3,2 +chr1-9615-clone3,2 +chr1-8023-clone3,2 +chr1-9581-clone3,2 +chr1-9458-clone3,2 diff --git a/inst/extdata/output/mcf.csv b/inst/extdata/output/mcf.csv new file mode 100644 index 0000000..a6f64ee --- /dev/null +++ b/inst/extdata/output/mcf.csv @@ -0,0 +1,4 @@ +Cluster,sample1,sample2,sample3 +1,0.789,0.7,0.7 +2,0,0.2,0 +3,0,0,0.602 diff --git a/inst/extdata/output/mcf.png b/inst/extdata/output/mcf.png new file mode 100644 index 0000000..56c4750 Binary files /dev/null and b/inst/extdata/output/mcf.png differ diff --git a/inst/extdata/output/multiplicity_all.csv b/inst/extdata/output/multiplicity_all.csv new file mode 100644 index 0000000..886700a --- /dev/null +++ b/inst/extdata/output/multiplicity_all.csv @@ -0,0 +1,91 @@ +Mut_ID,Multiplicity +chr1-57-clone1,3 +chr1-110-clone1,3 +chr1-167-clone1,3 +chr1-386-clone1,1 +chr1-441-clone1,1 +chr1-1276-clone1,1 +chr1-1496-clone1,1 +chr1-1631-clone1,1 +chr1-1735-clone1,1 +chr1-2036-clone1,1 +chr1-2428-clone1,1 +chr1-2650-clone1,1 +chr1-2684-clone1,1 +chr1-2730-clone1,1 +chr1-2857-clone1,1 +chr1-2931-clone1,1 +chr1-2977-clone1,1 +chr1-3061-clone1,1 +chr1-3113-clone1,1 +chr1-3173-clone1,1 +chr1-3371-clone1,1 +chr1-3812-clone1,1 +chr1-4278-clone1,1 +chr1-4428-clone1,1 +chr1-4696-clone1,1 +chr1-4893-clone1,1 +chr1-5277-clone1,1 +chr1-5563-clone1,1 +chr1-5597-clone1,1 +chr1-5815-clone1,1 +chr1-6037-clone1,1 +chr1-6113-clone1,1 +chr1-6145-clone1,1 +chr1-6753-clone1,1 +chr1-6792-clone1,1 +chr1-6808-clone1,1 +chr1-6826-clone1,1 +chr1-7094-clone1,1 +chr1-7148-clone1,1 +chr1-7518-clone1,1 +chr1-8257-clone1,1 +chr1-8368-clone1,1 +chr1-8579-clone1,1 +chr1-8701-clone1,1 +chr1-9064-clone1,1 +chr1-9210-clone1,1 +chr1-9274-clone1,1 +chr1-9472-clone1,1 +chr1-9505-clone1,1 +chr1-9586-clone1,1 +chr1-154-clone2,1 +chr1-660-clone2,1 +chr1-681-clone2,1 +chr1-1963-clone2,1 +chr1-2128-clone2,1 +chr1-2463-clone2,1 +chr1-2690-clone2,1 +chr1-3045-clone2,1 +chr1-3546-clone2,1 +chr1-3754-clone2,1 +chr1-3807-clone2,1 +chr1-4223-clone2,1 +chr1-4937-clone2,1 +chr1-5405-clone2,1 +chr1-7106-clone2,1 +chr1-8180-clone2,1 +chr1-8491-clone2,1 +chr1-9095-clone2,1 +chr1-9644-clone2,1 +chr1-9765-clone2,1 +chr1-8716-clone3,1 +chr1-9104-clone3,1 +chr1-8700-clone3,1 +chr1-8321-clone3,1 +chr1-8790-clone3,1 +chr1-8663-clone3,1 +chr1-8026-clone3,1 +chr1-8101-clone3,1 +chr1-8124-clone3,1 +chr1-9457-clone3,1 +chr1-8911-clone3,1 +chr1-8160-clone3,1 +chr1-8794-clone3,1 +chr1-9636-clone3,1 +chr1-9885-clone3,1 +chr1-8367-clone3,1 +chr1-9615-clone3,1 +chr1-8023-clone3,1 +chr1-9581-clone3,1 +chr1-9458-clone3,1 diff --git a/inst/extdata/output/purity.csv b/inst/extdata/output/purity.csv new file mode 100644 index 0000000..64bd5e8 --- /dev/null +++ b/inst/extdata/output/purity.csv @@ -0,0 +1,2 @@ +sample1,sample2,sample3 +0.789,0.7,0.7 diff --git a/inst/extdata/output/subclone_proportion.csv b/inst/extdata/output/subclone_proportion.csv new file mode 100644 index 0000000..4d1c62d --- /dev/null +++ b/inst/extdata/output/subclone_proportion.csv @@ -0,0 +1,4 @@ +,sample1,sample2,sample3 +1,1,0.714,0.14 +2,0,0.286,0 +3,0,0,0.86 diff --git a/inst/extdata/output/subclone_props.png b/inst/extdata/output/subclone_props.png new file mode 100644 index 0000000..d8c02bb Binary files /dev/null and b/inst/extdata/output/subclone_props.png differ diff --git a/inst/extdata/output/tree.csv b/inst/extdata/output/tree.csv new file mode 100644 index 0000000..662f18b --- /dev/null +++ b/inst/extdata/output/tree.csv @@ -0,0 +1,4 @@ +edge,parent,child +root->1,root,1 +1->2,1,2 +1->3,1,3 diff --git a/inst/extdata/output/tree.png b/inst/extdata/output/tree.png new file mode 100644 index 0000000..6fb70db Binary files /dev/null and b/inst/extdata/output/tree.png differ diff --git a/inst/extdata/output/tree_ensemble.png b/inst/extdata/output/tree_ensemble.png new file mode 100644 index 0000000..1adc91d Binary files /dev/null and b/inst/extdata/output/tree_ensemble.png differ diff --git a/inst/extdata/output/upsetR.png b/inst/extdata/output/upsetR.png new file mode 100644 index 0000000..a40eb36 Binary files /dev/null and b/inst/extdata/output/upsetR.png differ diff --git a/inst/extdata/sim1.csv b/inst/extdata/sim1.csv deleted file mode 100644 index d3411e8..0000000 --- a/inst/extdata/sim1.csv +++ /dev/null @@ -1,101 +0,0 @@ -sample,mutation,total_reads,alt_reads,tumor_integer_copy_number,purity,multiplicity -S1,mut1,787,20,2,1,1 -S1,mut2,153,1,2,1,1 -S1,mut3,145,0,2,1,1 -S1,mut4,622,0,2,1,1 -S1,mut5,117,0,2,1,1 -S1,mut6,530,8,2,1,1 -S1,mut7,916,12,2,1,1 -S1,mut8,550,3,2,1,1 -S1,mut9,346,2,2,1,1 -S1,mut10,539,2,2,1,1 -S1,mut11,403,4,2,1,1 -S1,mut12,385,0,2,1,1 -S1,mut13,149,1,2,1,1 -S1,mut14,223,0,2,1,1 -S1,mut15,210,1,2,1,1 -S1,mut16,317,0,2,1,1 -S1,mut17,493,1,2,1,1 -S1,mut18,302,1,2,1,1 -S1,mut19,130,0,2,1,1 -S1,mut20,212,0,2,1,1 -S2,mut1,435,105,2,1,1 -S2,mut2,132,31,2,1,1 -S2,mut3,494,117,2,1,1 -S2,mut4,158,34,2,1,1 -S2,mut5,649,149,2,1,1 -S2,mut6,145,32,2,1,1 -S2,mut7,112,25,2,1,1 -S2,mut8,76,12,2,1,1 -S2,mut9,598,90,2,1,1 -S2,mut10,208,30,2,1,1 -S2,mut11,217,33,2,1,1 -S2,mut12,467,73,2,1,1 -S2,mut13,239,28,2,1,1 -S2,mut14,181,24,2,1,1 -S2,mut15,320,39,2,1,1 -S2,mut16,893,125,2,1,1 -S2,mut17,204,25,2,1,1 -S2,mut18,326,16,2,1,1 -S2,mut19,154,11,2,1,1 -S2,mut20,986,58,2,1,1 -S3,mut1,198,71,2,1,1 -S3,mut2,547,0,2,1,1 -S3,mut3,192,0,2,1,1 -S3,mut4,582,0,2,1,1 -S3,mut5,110,0,2,1,1 -S3,mut6,279,0,2,1,1 -S3,mut7,232,3,2,1,1 -S3,mut8,180,1,2,1,1 -S3,mut9,167,0,2,1,1 -S3,mut10,257,0,2,1,1 -S3,mut11,882,3,2,1,1 -S3,mut12,155,0,2,1,1 -S3,mut13,409,0,2,1,1 -S3,mut14,263,2,2,1,1 -S3,mut15,186,0,2,1,1 -S3,mut16,344,0,2,1,1 -S3,mut17,244,0,2,1,1 -S3,mut18,395,0,2,1,1 -S3,mut19,166,0,2,1,1 -S3,mut20,107,0,2,1,1 -S4,mut1,428,133,2,1,1 -S4,mut2,219,0,2,1,1 -S4,mut3,275,0,2,1,1 -S4,mut4,218,1,2,1,1 -S4,mut5,321,0,2,1,1 -S4,mut6,345,3,2,1,1 -S4,mut7,465,4,2,1,1 -S4,mut8,403,5,2,1,1 -S4,mut9,294,0,2,1,1 -S4,mut10,328,1,2,1,1 -S4,mut11,320,0,2,1,1 -S4,mut12,454,4,2,1,1 -S4,mut13,143,1,2,1,1 -S4,mut14,262,0,2,1,1 -S4,mut15,243,0,2,1,1 -S4,mut16,99,0,2,1,1 -S4,mut17,388,2,2,1,1 -S4,mut18,152,0,2,1,1 -S4,mut19,774,0,2,1,1 -S4,mut20,579,0,2,1,1 -S5,mut1,288,88,2,1,1 -S5,mut2,350,0,2,1,1 -S5,mut3,100,0,2,1,1 -S5,mut4,645,0,2,1,1 -S5,mut5,363,0,2,1,1 -S5,mut6,802,4,2,1,1 -S5,mut7,162,0,2,1,1 -S5,mut8,490,0,2,1,1 -S5,mut9,263,7,2,1,1 -S5,mut10,486,3,2,1,1 -S5,mut11,332,2,2,1,1 -S5,mut12,295,5,2,1,1 -S5,mut13,427,2,2,1,1 -S5,mut14,155,0,2,1,1 -S5,mut15,447,0,2,1,1 -S5,mut16,535,2,2,1,1 -S5,mut17,675,4,2,1,1 -S5,mut18,283,0,2,1,1 -S5,mut19,185,0,2,1,1 -S5,mut20,622,1,2,1,1 \ No newline at end of file diff --git a/inst/extdata/spike_and_slab_purity_2.jags b/inst/extdata/spike_and_slab_purity_2.jags deleted file mode 100644 index 4603dff..0000000 --- a/inst/extdata/spike_and_slab_purity_2.jags +++ /dev/null @@ -1,28 +0,0 @@ -model { - for (i in 1:I) { ## for each variant - for (s in 1:S) { ## for each sample - theta.temp[i,s] <- m[i,s] * w[z[i], s] * purity[s] / - (tcn[i,s] * purity[s] + 2 * (1 - purity[s])) - theta[i,s] <- ifelse(theta.temp[i,s] > 1, 1, theta.temp[i,s]) - y[i, s] ~ dbin(theta[i,s], n[i,s]) - ## posterior predictive distribution - ystar[i, s] ~ dbin(theta[i, s], n[i, s]) - } - z[i] ~ dcat(pi[1:K]) - } - alpha <- rep(1, K) - pi[1:K] ~ ddirch(alpha[1:K]) - - for (k in 1:K) { - for (s in 1:S) { - p[k,s] ~ dbeta(5,2) - z.p[k,s] ~ dcat(c(p[k,s], 1-p[k,s])) - - # w[k,s] ~ dbeta(a[z.p[k,s]], b[z.p[k,s]]) - - w1[k,s] ~ dbeta(1,1)T(0.005, ) - w[k,s] <- ifelse(z.p[k,s] == 1, w1[k,s], 0) - - } - } -} \ No newline at end of file diff --git a/inst/extdata/spike_and_slab_purity_2_K1.jags b/inst/extdata/spike_and_slab_purity_2_K1.jags deleted file mode 100644 index 0676217..0000000 --- a/inst/extdata/spike_and_slab_purity_2_K1.jags +++ /dev/null @@ -1,25 +0,0 @@ -model { - for (i in 1:I) { ## for each variant - for (s in 1:S) { ## for each sample - theta.temp[i,s] <- m[i,s] * w[z[i], s] * purity[s] / - (tcn[i,s] * purity[s] + 2 * (1 - purity[s])) - theta[i,s] <- ifelse(theta.temp[i,s] > 1, 1, theta.temp[i,s]) - y[i, s] ~ dbin(theta[i,s], n[i,s]) - ## posterior predictive distribution - ystar[i, s] ~ dbin(theta[i, s], n[i, s]) - } - - z[i] <- 1 - } - - for (s in 1:S) { - p[1,s] ~ dbeta(5,2) - z.p[1,s] ~ dcat(c(p[1,s], 1-p[1,s])) - - # w[1,s] ~ dbeta(a[z.p[1,s]], b[z.p[1,s]]) - - w1[1,s] ~ dbeta(1,1)T(0.005, ) - w[1,s] <- ifelse(z.p[1,s] == 1, w1[1,s], 0) - - } -} \ No newline at end of file diff --git a/inst/extdata/spike_and_slab_purity_2_K1_I1.jags b/inst/extdata/spike_and_slab_purity_2_K1_I1.jags deleted file mode 100644 index 20032f7..0000000 --- a/inst/extdata/spike_and_slab_purity_2_K1_I1.jags +++ /dev/null @@ -1,22 +0,0 @@ -model { - - for (s in 1:S) { ## for each sample - theta.temp[s] <- m[1,s] * w[1,s] * purity[s] / - (tcn[1,s] * purity[s] + 2 * (1 - purity[s])) - theta[s] <- ifelse(theta.temp[s] > 1, 1, theta.temp[s]) - y[1,s] ~ dbin(theta[s], n[1,s]) - ## posterior predictive distribution - ystar[1,s] ~ dbin(theta[s], n[1,s]) - } - z <- 1 - for (s in 1:S) { - p[1,s] ~ dbeta(5,2) - z.p[1,s] ~ dcat(c(p[1,s], 1-p[1,s])) - - # w[1,s] ~ dbeta(a[z.p[1,s]], b[z.p[1,s]]) - - w1[1,s] ~ dbeta(1,1)T(0.005, ) - w[1,s] <- ifelse(z.p[1,s] == 1, w1[1,s], 0) - - } -} \ No newline at end of file diff --git a/inst/extdata/spike_and_slab_purity_ident.jags b/inst/extdata/spike_and_slab_purity_ident.jags deleted file mode 100644 index 772f44f..0000000 --- a/inst/extdata/spike_and_slab_purity_ident.jags +++ /dev/null @@ -1,34 +0,0 @@ -model { - for (i in 1:I) { ## for each variant - for (s in 1:S) { ## for each sample - theta.temp[i,s] <- m[i,s] * w[z[i], s] * purity[s] / - (tcn[i,s] * purity[s] + 2 * (1 - purity[s])) - theta[i,s] <- ifelse(theta.temp[i,s] > 1, 1, theta.temp[i,s]) - y[i, s] ~ dbin(theta[i,s], n[i,s]) - ## posterior predictive distribution - ystar[i, s] ~ dbin(theta[i, s], n[i, s]) - } - z[i] ~ dcat(pi[1:K]) - } - alpha <- rep(1, K) - pi[1:K] ~ ddirch(alpha[1:K]) - - for (k in 1:K) { - - for (s in 1:S) { - p[k,s] ~ dbeta(5,2) - z.p[k,s] ~ dcat(c(p[k,s], 1-p[k,s])) - - w1[k,s] ~ dbeta(1,1)T(0.005, ) - w_unsorted[k,s] <- ifelse(z.p[k,s] == 1, w1[k,s], 0) - - - } - - } - - # order w for sample_to_sort - for (s in 1:S) { - w[1:K, s] <- ifelse(s == sample_to_sort, sort(w_unsorted[1:K, s]), w_unsorted[1:K, s]) - } -} \ No newline at end of file diff --git a/man/allThreshes.Rd b/man/allThreshes.Rd new file mode 100644 index 0000000..7cd58e6 --- /dev/null +++ b/man/allThreshes.Rd @@ -0,0 +1,11 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/MCMC-main.R +\name{allThreshes} +\alias{allThreshes} +\title{defines the thresholds to be used for tree building} +\usage{ +allThreshes() +} +\description{ +defines the thresholds to be used for tree building +} diff --git a/man/calcSubcloneProportions.Rd b/man/calcSubcloneProportions.Rd index ec355de..92c6e44 100644 --- a/man/calcSubcloneProportions.Rd +++ b/man/calcSubcloneProportions.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/subclone-proportions.R +% Please edit documentation in R/subclone-proportion.R \name{calcSubcloneProportions} \alias{calcSubcloneProportions} \title{Calculate proportions of subclones in each sample (assumes CCFs comply with lineage precedence and sum condition)} diff --git a/man/calcSubcloneProportions2.Rd b/man/calcSubcloneProportions2.Rd deleted file mode 100644 index 3a03686..0000000 --- a/man/calcSubcloneProportions2.Rd +++ /dev/null @@ -1,16 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/subclone-proportions.R -\name{calcSubcloneProportions2} -\alias{calcSubcloneProportions2} -\title{Calculate proportions of subclones in each sample} -\usage{ -calcSubcloneProportions2(w_mat, tree_edges) -} -\arguments{ -\item{w_mat}{Matrix of CCF estimates (from \code{estimateCCFs})} - -\item{tree_edges}{Tibble of tree edges with columns edge, parent, and child} -} -\description{ -Calculate proportions of subclones in each sample -} diff --git a/man/calcTreeScores.Rd b/man/calcTreeScores.Rd index 5c75634..241bb11 100644 --- a/man/calcTreeScores.Rd +++ b/man/calcTreeScores.Rd @@ -1,15 +1,15 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/tree_scoring.R +% Please edit documentation in R/tree-score.R \name{calcTreeScores} \alias{calcTreeScores} \title{Calculate SCHISM fitness scores for trees} \usage{ -calcTreeScores(w_chain, trees, mc.cores = 1) +calcTreeScores(mcf_chain, trees, purity, mc.cores = 1) } \arguments{ -\item{w_chain}{MCMC chain of CCF values, which is the first item in the list returned by \code{clusterSep}} - \item{trees}{list of tibbles, where each tibble contains edges of a tree with columns edge, parent, child} + +\item{w_chain}{MCMC chain of CCF values, which is the first item in the list returned by \code{clusterSep}} } \description{ Calculate SCHISM fitness scores for trees diff --git a/man/calculateTreeScoreMutations.Rd b/man/calculateTreeScoreMutations.Rd new file mode 100644 index 0000000..5e5cbda --- /dev/null +++ b/man/calculateTreeScoreMutations.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/tree-score.R +\name{calculateTreeScoreMutations} +\alias{calculateTreeScoreMutations} +\title{Calculate SCHISM fitness scores for trees} +\usage{ +calculateTreeScoreMutations( + mcf_chain, + data, + icnTable, + cncfTable, + multiplicityTable, + clusterAssingmentTable, + purity, + trees, + restriction.val = 1, + mc.cores = 8 +) +} +\description{ +Calculate SCHISM fitness scores for trees +} diff --git a/man/clusterSep.Rd b/man/clusterSep.Rd deleted file mode 100644 index 494a832..0000000 --- a/man/clusterSep.Rd +++ /dev/null @@ -1,42 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/cluster_separately.R -\name{clusterSep} -\alias{clusterSep} -\title{Run MCMC to cluster mutations and estimate CCFs} -\usage{ -clusterSep( - input_data, - n.iter = 10000, - n.burn = 1000, - thin = 10, - mc.cores = 1, - inits = list(.RNG.name = "base::Wichmann-Hill", .RNG.seed = 123), - max_K = 5, - model_type = "spike_and_slab", - beta.prior = FALSE, - drop_zero = FALSE, - one_box = TRUE -) -} -\arguments{ -\item{input_data}{list of input data objects;} - -\item{n.iter}{number of iterations to run MCMC} - -\item{n.burn}{number of iterations for burn in} - -\item{thin}{thinning parameter} - -\item{mc.cores}{number of cores for parallelization} - -\item{max_K}{maximum number of clusters to assess for each mutation set} - -\item{model_type}{hierarchical model type for ("spike_and_slab" or "simple)} - -\item{beta.prior}{option to run an initial MCMC chain and use results to specify beta priors for a second MCMC chain} - -\item{one_box}{option to run the MCMC chain without using sample presence} -} -\description{ -Run MCMC to cluster mutations and estimate CCFs -} diff --git a/man/collectBestKChains.Rd b/man/collectBestKChains.Rd index 5b2856a..c0d9be1 100644 --- a/man/collectBestKChains.Rd +++ b/man/collectBestKChains.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/cluster_separately.R +% Please edit documentation in R/MCMC-process-clusters.R \name{collectBestKChains} \alias{collectBestKChains} \title{Collect chains for best K of each mutation set} @@ -7,7 +7,7 @@ collectBestKChains(all_set_results, chosen_K = NULL) } \arguments{ -\item{all_set_results}{List of MCMC results for each mutation set; returned by \code{clusterSep}} +\item{all_set_results}{List of MCMC results for each mutation set} \item{chosen_K}{(Optional) Vector of K to choose for each mutation set, in the same order as all_set_results. If left blank, function will select best K automatically selected by \code{clusterSep}} } diff --git a/man/colorScheme.Rd b/man/colorScheme.Rd index 26eeff9..5789628 100644 --- a/man/colorScheme.Rd +++ b/man/colorScheme.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/plot.R +% Please edit documentation in R/tree-plot.R \name{colorScheme} \alias{colorScheme} \title{generate colors for each vertice} diff --git a/man/enumerateSpanningTrees.Rd b/man/enumerateSpanningTrees.Rd deleted file mode 100644 index c04fb38..0000000 --- a/man/enumerateSpanningTrees.Rd +++ /dev/null @@ -1,18 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/gabowmyers.R -\name{enumerateSpanningTrees} -\alias{enumerateSpanningTrees} -\title{Enumerate all spanning trees and filter based on Sum Condition} -\usage{ -enumerateSpanningTrees(graph_G, w, sum_filter_thresh = 0.2) -} -\arguments{ -\item{graph_G}{tibble of possible edges with columns edge, parent, child} - -\item{w}{matrix of CCF values (rows = clusters, columns = samples)} - -\item{sum_filter_thresh}{thresh maximum allowed violation of Sum Condition (default = 0.2)} -} -\description{ -Enumerate all spanning trees and filter based on Sum Condition -} diff --git a/man/enumerateSpanningTreesModified.Rd b/man/enumerateSpanningTreesModified.Rd index 1c00f0d..0c57bc1 100644 --- a/man/enumerateSpanningTreesModified.Rd +++ b/man/enumerateSpanningTreesModified.Rd @@ -1,17 +1,17 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/gabowmyers.R +% Please edit documentation in R/tree-gabowmyers.R \name{enumerateSpanningTreesModified} \alias{enumerateSpanningTreesModified} \title{Enumerate all spanning trees using modified Gabow-Myers} \usage{ -enumerateSpanningTreesModified(graph_G, w, sum_filter_thresh = 0.2) +enumerateSpanningTreesModified(graph_G, mcf, purity, sum_filter_thresh = 0.2) } \arguments{ \item{graph_G}{tibble of possible edges with columns edge, parent, child} -\item{w}{matrix of CCF values (rows = clusters, columns = samples)} - \item{sum_filter_thresh}{thresh maximum allowed violation of Sum Condition (default = 0.2)} + +\item{w}{matrix of CCF values (rows = clusters, columns = samples)} } \description{ Enumerate all spanning trees using modified Gabow-Myers diff --git a/man/estimateCCFs.Rd b/man/estimateCCFs.Rd deleted file mode 100644 index 633f3c4..0000000 --- a/man/estimateCCFs.Rd +++ /dev/null @@ -1,17 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/clustering_functions.R -\name{estimateCCFs} -\alias{estimateCCFs} -\title{Determine the most probable cluster CCF values by taking the mode of the posterior distributions} -\usage{ -estimateCCFs(w_chain) -} -\arguments{ -\item{w_chain}{MCMC chain of CCF values, which is the first item in the list returned by \code{clusterSep}} -} -\value{ -matrix of estimated cluster CCFs -} -\description{ -Determine the most probable cluster CCF values by taking the mode of the posterior distributions -} diff --git a/man/estimateClusterAssignments.Rd b/man/estimateClusterAssignments.Rd index c6c45ef..a645b34 100644 --- a/man/estimateClusterAssignments.Rd +++ b/man/estimateClusterAssignments.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/clustering_functions.R +% Please edit documentation in R/MCMC-process-clusters.R \name{estimateClusterAssignments} \alias{estimateClusterAssignments} \title{Determine most probable mutation cluster assignments by taking those with highest posterior probability} @@ -7,7 +7,7 @@ estimateClusterAssignments(z_chain) } \arguments{ -\item{z_chain}{MCMC chain of mutation cluster assignment values, which is the second item in the list returned by \code{clusterSep}} +\item{z_chain}{MCMC chain of mutation cluster assignment values} } \description{ Determine most probable mutation cluster assignments by taking those with highest posterior probability diff --git a/man/estimateICN.Rd b/man/estimateICN.Rd new file mode 100644 index 0000000..cea0ffa --- /dev/null +++ b/man/estimateICN.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/MCMC-process-clusters.R +\name{estimateICN} +\alias{estimateICN} +\title{Determine most probable integer assignments by taking those with highest posterior probability} +\usage{ +estimateICN(icn_chain) +} +\arguments{ +\item{icn_chain}{MCMC chain of integer copy number assignment values} +} +\description{ +Determine most probable integer assignments by taking those with highest posterior probability +} diff --git a/man/estimateMCFs.Rd b/man/estimateMCFs.Rd new file mode 100644 index 0000000..7bc2150 --- /dev/null +++ b/man/estimateMCFs.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/MCMC-process-clusters.R +\name{estimateMCFs} +\alias{estimateMCFs} +\title{Determine the most probable cluster MCF values by taking the mean of the posterior distributions} +\usage{ +estimateMCFs(mcf_chain) +} +\arguments{ +\item{mcf_chain}{MCMC chain of mCF values} +} +\value{ +matrix of estimated cluster MCFs +} +\description{ +Determine the most probable cluster MCF values by taking the mean of the posterior distributions +} diff --git a/man/estimateMultiplicity.Rd b/man/estimateMultiplicity.Rd new file mode 100644 index 0000000..a8920da --- /dev/null +++ b/man/estimateMultiplicity.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/MCMC-process-clusters.R +\name{estimateMultiplicity} +\alias{estimateMultiplicity} +\title{Determine most probable mutation cluster assignments by taking those with highest posterior probability} +\usage{ +estimateMultiplicity(m_chain) +} +\arguments{ +\item{m_chain}{MCMC chain of multiplicity assignment values} +} +\description{ +Determine most probable mutation cluster assignments by taking those with highest posterior probability +} diff --git a/man/filterEdgesBasedOnCCFs.Rd b/man/filterEdgesBasedOnCCFs.Rd index 387c80c..dc07f07 100644 --- a/man/filterEdgesBasedOnCCFs.Rd +++ b/man/filterEdgesBasedOnCCFs.Rd @@ -1,17 +1,17 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/gabowmyers.R +% Please edit documentation in R/tree-gabowmyers.R \name{filterEdgesBasedOnCCFs} \alias{filterEdgesBasedOnCCFs} \title{Filter possible edges based on lineage precedence} \usage{ -filterEdgesBasedOnCCFs(graph_G, w, thresh = 0.1) +filterEdgesBasedOnCCFs(graph_G, mcf, thresh = 0.1) } \arguments{ \item{graph_G}{tibble of possible edges with columns edge, parent, child} -\item{w}{matrix of CCF values (rows = clusters, columns = samples)} - \item{thresh}{maximum allowed violation of lineage precedence (default = 0.1)} + +\item{w}{matrix of CCF values (rows = clusters, columns = samples)} } \description{ Filter possible edges based on lineage precedence diff --git a/man/forceCCFs.Rd b/man/forceCCFs.Rd deleted file mode 100644 index 60e9b9f..0000000 --- a/man/forceCCFs.Rd +++ /dev/null @@ -1,16 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/subclone-proportions.R -\name{forceCCFs} -\alias{forceCCFs} -\title{Force CCFs to comply with lineage precedence and sum condition} -\usage{ -forceCCFs(w_mat, tree_edges) -} -\arguments{ -\item{w_mat}{Matrix of CCF estimates (from \code{estimateCCFs})} - -\item{tree_edges}{Tibble of tree edges with columns edge, parent, and child} -} -\description{ -Force CCFs to comply with lineage precedence and sum condition -} diff --git a/man/generateAllTrees.Rd b/man/generateAllTrees.Rd index e218ea7..f61a73a 100644 --- a/man/generateAllTrees.Rd +++ b/man/generateAllTrees.Rd @@ -1,17 +1,22 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/gabowmyers.R +% Please edit documentation in R/tree-gabowmyers.R \name{generateAllTrees} \alias{generateAllTrees} \title{Enumerate all spanning trees using modified Gabow-Myers wrapper} \usage{ -generateAllTrees(w, lineage_precedence_thresh = 0.1, sum_filter_thresh = 0.2) +generateAllTrees( + mcf, + purity, + lineage_precedence_thresh = 0.1, + sum_filter_thresh = 0.2 +) } \arguments{ -\item{w}{matrix of CCF values (rows = clusters, columns = samples)} - \item{lineage_precedence_thresh}{maximum allowed violation of lineage precedence (default = 0.1)} \item{sum_filter_thresh}{thresh maximum allowed violation of Sum Condition (default = 0.2)} + +\item{w}{matrix of CCF values (rows = clusters, columns = samples)} } \description{ Enumerate all spanning trees using modified Gabow-Myers wrapper diff --git a/man/getSetName.Rd b/man/getSetName.Rd deleted file mode 100644 index c83981b..0000000 --- a/man/getSetName.Rd +++ /dev/null @@ -1,11 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/plot.R -\name{getSetName} -\alias{getSetName} -\title{Convert binary set names to long form with Sample_ID} -\usage{ -getSetName(binary_name, Sample_ID, collapse_string = ", \\n") -} -\description{ -Convert binary set names to long form with Sample_ID -} diff --git a/man/importCSV.Rd b/man/importCSV.Rd deleted file mode 100644 index 5e1b57a..0000000 --- a/man/importCSV.Rd +++ /dev/null @@ -1,14 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/preprocessing.R -\name{importCSV} -\alias{importCSV} -\title{read input data file and store in PICTOGRAPH input format} -\usage{ -importCSV(inputFile, alt_reads_thresh = 6, vaf_thresh = 0.02) -} -\arguments{ -\item{input_file}{input data file;} -} -\description{ -read input data file and store in PICTOGRAPH input format -} diff --git a/man/importFiles.Rd b/man/importFiles.Rd new file mode 100644 index 0000000..109adb6 --- /dev/null +++ b/man/importFiles.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/preprocessing.R +\name{importFiles} +\alias{importFiles} +\title{read input data file and store in required format} +\usage{ +importFiles( + mutation_file, + outputDir = NULL, + alt_reads_thresh = 0, + vaf_thresh = 0 +) +} +\arguments{ +\item{mutation_file}{mutation data file; see inst/extdata/examples/*_snv.csv for examples} + +\item{outputDir}{output directory for saving output data} +} +\description{ +read input data file and store in required format +} diff --git a/man/importMutationFileOnly.Rd b/man/importMutationFileOnly.Rd new file mode 100644 index 0000000..f5d9d99 --- /dev/null +++ b/man/importMutationFileOnly.Rd @@ -0,0 +1,11 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/preprocessing.R +\name{importMutationFileOnly} +\alias{importMutationFileOnly} +\title{import mutation file} +\usage{ +importMutationFileOnly(mutation_file, alt_reads_thresh = 0, vaf_thresh = 0) +} +\description{ +import mutation file +} diff --git a/man/mcmcMain.Rd b/man/mcmcMain.Rd new file mode 100644 index 0000000..19ad08b --- /dev/null +++ b/man/mcmcMain.Rd @@ -0,0 +1,56 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/MCMC-main.R +\name{mcmcMain} +\alias{mcmcMain} +\title{run PICTograph in an automated pipeline} +\usage{ +mcmcMain( + mutation_file, + outputDir = NULL, + sample_presence = TRUE, + score = "silhouette", + max_K = 10, + min_mutation_per_cluster = 5, + cluster_diff_thresh = 0.05, + n.iter = 5000, + n.burn = 1000, + thin = 10, + mc.cores = 8, + inits = list(.RNG.name = "base::Wichmann-Hill", .RNG.seed = 123), + alt_reads_thresh = 0, + vaf_thresh = 0 +) +} +\arguments{ +\item{mutation_file}{a csv file that include information for SSMs.} + +\item{outputDir}{output directory for saving all files.} + +\item{sample_presence}{whether to use sample presence to separate the mutations. Not applicable if dual_model is set to FALSE and a copy number file is provided.} + +\item{score}{scoring function to estimate the number of clusters. silhouette or BIC.} + +\item{max_K}{user defined maximum number of clusters.} + +\item{min_mutation_per_cluster}{minumum number of mutations in each cluster.} + +\item{cluster_diff_thresh}{threshold to merge two clusters.} + +\item{n.iter}{number of iterations by JAGS.} + +\item{n.burn}{number of burns by JAGS.} + +\item{thin}{number of thin by JAGS.} + +\item{mc.cores}{number of cores to use for parallel computing; not applicable to windows.} + +\item{inits}{additional parameters by JAGS.} +} +\description{ +run MCMC chains to infer the clonal evolution of tumors from single or multi-region sequencing data. +This function automatically runs a pipeline of the tool. It models uncertainty of mutation cellular +fraction (MCF) in small somatic mutations (SSMs) and copy number alterations (CNAs), assigning SSMs +and CNAs to subclones using a Bayesian hierarchical model, and reconstruct tumor evolutionary trees +that are constrained based on principles of lineage precedence, sum condition, and optionally by +sample-presence. +} diff --git a/man/mergeSetChains.Rd b/man/mergeSetChains.Rd index 2f5f64a..c24dcdb 100644 --- a/man/mergeSetChains.Rd +++ b/man/mergeSetChains.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/cluster_separately.R +% Please edit documentation in R/MCMC-process-clusters.R \name{mergeSetChains} \alias{mergeSetChains} \title{Relabel chains for all sets and merge} @@ -7,7 +7,7 @@ mergeSetChains(best_set_chains, indata) } \arguments{ -\item{best_set_chains}{List of lists of MCMC chains (w_chain, z_chain, ystar_chain) for each mutation set} +\item{best_set_chains}{List of lists of MCMC chains (mcf_chain, z_chain, ystar_chain) for each mutation set} \item{indata}{List of input data objects (same as provided to clusterSep)} } diff --git a/man/pictograph-package.Rd b/man/pictograph-package.Rd new file mode 100644 index 0000000..3a79b70 --- /dev/null +++ b/man/pictograph-package.Rd @@ -0,0 +1,9 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pictograph-package.R +\name{pictograph-package} +\alias{pictograph-package} +\alias{pictograph} +\title{pictograph} +\description{ +infer tumor evolution using SNV and CNA +} diff --git a/man/pictograph.Rd b/man/pictograph.Rd deleted file mode 100644 index 733bdcc..0000000 --- a/man/pictograph.Rd +++ /dev/null @@ -1,6 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/help.R -\docType{package} -\name{pictograph} -\alias{pictograph} -\title{pictograph} diff --git a/man/plotAllTrees.Rd b/man/plotAllTrees.Rd new file mode 100644 index 0000000..1f88bad --- /dev/null +++ b/man/plotAllTrees.Rd @@ -0,0 +1,11 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/MCMC-main.R +\name{plotAllTrees} +\alias{plotAllTrees} +\title{Plot all trees with the highest scores} +\usage{ +plotAllTrees(outputDir, scores, all_spanning_trees, mcfTable, data) +} +\description{ +Plot all trees with the highest scores +} diff --git a/man/plotAllZProb.Rd b/man/plotAllZProb.Rd deleted file mode 100644 index 4d06d12..0000000 --- a/man/plotAllZProb.Rd +++ /dev/null @@ -1,28 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/plot.R -\name{plotAllZProb} -\alias{plotAllZProb} -\title{Plot probabilities of mutation cluster assignments (vertical) for tested K across all mutation sets} -\usage{ -plotAllZProb( - all_set_results, - outdir, - SampleID = NULL, - filter_thresh = 0.05, - compare = FALSE -) -} -\arguments{ -\item{all_set_results}{List of MCMC results for each mutation set; returned by \code{clusterSep}} - -\item{outdir}{Path to directory for output of plots} - -\item{SampleID}{(Optional) Vector of sample IDs for labeling purposes. Same order as supplied as input data (e.g. indata$Sample_ID)} - -\item{filter_thresh}{Lowest posterior probability to include cluster assignment. Default value is 0.05 (inclusive)} - -\item{compare}{Option to only plot cluster probabilities for K chosen by minimum BIC, elbow or knee of plot when different (default FALSE plots all K tested)} -} -\description{ -Plot probabilities of mutation cluster assignments (vertical) for tested K across all mutation sets -} diff --git a/man/plotBIC.Rd b/man/plotBIC.Rd deleted file mode 100644 index 82f5bee..0000000 --- a/man/plotBIC.Rd +++ /dev/null @@ -1,16 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/plot.R -\name{plotBIC} -\alias{plotBIC} -\title{Plot BIC for all mutation sets} -\usage{ -plotBIC(all_set_results, Sample_ID = NULL) -} -\arguments{ -\item{all_set_results}{List of MCMC results for each mutation set; returned by \code{clusterSep}} - -\item{SampleID}{(Optional) Vector of sample IDs for labeling purposes. Same order as supplied as input data (e.g. indata$Sample_ID)} -} -\description{ -Plot BIC for all mutation sets -} diff --git a/man/plotChainsCCF.Rd b/man/plotChainsCCF.Rd deleted file mode 100644 index 4fc51dc..0000000 --- a/man/plotChainsCCF.Rd +++ /dev/null @@ -1,14 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/plot.R -\name{plotChainsCCF} -\alias{plotChainsCCF} -\title{Plot CCF chain trace} -\usage{ -plotChainsCCF(w_chain) -} -\arguments{ -\item{w_chain}{MCMC chain of CCF values, which is the first item in the list returned by \code{mergeSetChains}} -} -\description{ -Plot CCF chain trace -} diff --git a/man/plotChainsMCF.Rd b/man/plotChainsMCF.Rd new file mode 100644 index 0000000..e7e1029 --- /dev/null +++ b/man/plotChainsMCF.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/MCMC_plot_clusters.R +\name{plotChainsMCF} +\alias{plotChainsMCF} +\title{Plot CCF chain trace} +\usage{ +plotChainsMCF(mcf_chain) +} +\arguments{ +\item{mcf_chain}{MCMC chain of CCF values, which is the first item in the list returned by \code{mergeSetChains}} +} +\description{ +Plot CCF chain trace +} diff --git a/man/plotClusterAssignmentProb.Rd b/man/plotClusterAssignmentProb.Rd deleted file mode 100644 index 2e379c5..0000000 --- a/man/plotClusterAssignmentProb.Rd +++ /dev/null @@ -1,14 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/plot.R -\name{plotClusterAssignmentProb} -\alias{plotClusterAssignmentProb} -\title{Plot probabilities of mutation cluster assignments} -\usage{ -plotClusterAssignmentProb(z_chain) -} -\arguments{ -\item{z_chain}{MCMC chain of mutation cluster assignment values, which is the second item in the list returned by \code{clusterSep}} -} -\description{ -Plot probabilities of mutation cluster assignments -} diff --git a/man/plotClusterAssignmentProbVertical.Rd b/man/plotClusterAssignmentProbVertical.Rd index 63a315f..cbea76d 100644 --- a/man/plotClusterAssignmentProbVertical.Rd +++ b/man/plotClusterAssignmentProbVertical.Rd @@ -1,12 +1,12 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/plot.R +% Please edit documentation in R/MCMC_plot_clusters.R \name{plotClusterAssignmentProbVertical} \alias{plotClusterAssignmentProbVertical} \title{Plot probabilities of mutation cluster assignments - vertical} \usage{ plotClusterAssignmentProbVertical( z_chain, - w_chain, + mcf_chain, filter_thresh = 0.05, MutID = NULL, SampleID = NULL @@ -15,7 +15,7 @@ plotClusterAssignmentProbVertical( \arguments{ \item{z_chain}{MCMC chain of mutation cluster assignment values, which is the second item in the list returned by \code{clusterSep}} -\item{w_chain}{MCMC chain of CCF values, which is the first item in the list returned by \code{clusterSep}} +\item{mcf_chain}{MCMC chain of CCF values, which is the first item in the list returned by \code{clusterSep}} \item{filter_thresh}{Lowest posterior probability to include cluster assignment. Default value is 0.05 (inclusive)} diff --git a/man/plotDensityCCF.Rd b/man/plotDensityCCF.Rd deleted file mode 100644 index 8dbcc0c..0000000 --- a/man/plotDensityCCF.Rd +++ /dev/null @@ -1,14 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/plot.R -\name{plotDensityCCF} -\alias{plotDensityCCF} -\title{Plot cluster CCF posterior distributions} -\usage{ -plotDensityCCF(w_chain) -} -\arguments{ -\item{w_chain}{MCMC chain of CCF values, which is the first item in the list returned by \code{clusterSep}} -} -\description{ -Plot cluster CCF posterior distributions -} diff --git a/man/plotEnsembleTree.Rd b/man/plotEnsembleTree.Rd index e8e39c7..49f2382 100644 --- a/man/plotEnsembleTree.Rd +++ b/man/plotEnsembleTree.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/plot.R +% Please edit documentation in R/tree-plot.R \name{plotEnsembleTree} \alias{plotEnsembleTree} \title{Plot ensemble tree} diff --git a/man/plotCCFViolin.Rd b/man/plotMCFViolin.Rd similarity index 68% rename from man/plotCCFViolin.Rd rename to man/plotMCFViolin.Rd index d233acc..06a0fdf 100644 --- a/man/plotCCFViolin.Rd +++ b/man/plotMCFViolin.Rd @@ -1,13 +1,13 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/plot.R -\name{plotCCFViolin} -\alias{plotCCFViolin} +% Please edit documentation in R/MCMC_plot_clusters.R +\name{plotMCFViolin} +\alias{plotMCFViolin} \title{Plot cluster CCF posterior distributions as violin plots} \usage{ -plotCCFViolin(w_chain, z_chain = NULL, indata = NULL) +plotMCFViolin(mcf_chain, z_chain = NULL, indata = NULL) } \arguments{ -\item{w_chain}{MCMC chain of CCF values} +\item{mcf_chain}{MCMC chain of CCF values} \item{z_chain}{(Optional) MCMC chain of mutation cluster assignment values. If provided, cluster names will show the number of mutations assigned in brackets} diff --git a/man/plotPPD.Rd b/man/plotPPD.Rd deleted file mode 100644 index 3363105..0000000 --- a/man/plotPPD.Rd +++ /dev/null @@ -1,20 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/plot.R -\name{plotPPD} -\alias{plotPPD} -\title{Plot posterior predictive distribution for number of variant reads} -\usage{ -plotPPD(ystar_chain, indata, Sample_ID = NULL, Mutation_ID = NULL) -} -\arguments{ -\item{ystar_chain}{MCMC chain of ystar values, which is the third item in the list returned by \code{clusterSep}} - -\item{indata}{List of input data items} - -\item{Mutation_ID}{(Optional) Vector of mutation IDs. If not provided, function will use the MutID in indata} - -\item{SampleID}{(Optional) Vector of sample IDs for labeling purposes. Same order as supplied as input data (e.g. indata$Sample_ID). If not provided, function will use the Sample_ID in indata} -} -\description{ -Plot posterior predictive distribution for number of variant reads -} diff --git a/man/plotSubcloneBar.Rd b/man/plotSubcloneBar.Rd index aef6778..456fc8b 100644 --- a/man/plotSubcloneBar.Rd +++ b/man/plotSubcloneBar.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/subclone-proportions.R +% Please edit documentation in R/subclone-proportion.R \name{plotSubcloneBar} \alias{plotSubcloneBar} \title{Plot subclone proportions in each sample as stacked bar chart} diff --git a/man/plotSubclonePie.Rd b/man/plotSubclonePie.Rd index 8e15a22..f9f72a5 100644 --- a/man/plotSubclonePie.Rd +++ b/man/plotSubclonePie.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/subclone-proportions.R +% Please edit documentation in R/subclone-proportion.R \name{plotSubclonePie} \alias{plotSubclonePie} \title{Plot pie charts for subclone proportions in each sample} diff --git a/man/plotTree.Rd b/man/plotTree.Rd index dd55c48..f97e8af 100644 --- a/man/plotTree.Rd +++ b/man/plotTree.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/plot.R +% Please edit documentation in R/tree-plot.R \name{plotTree} \alias{plotTree} \title{Plot single tree} diff --git a/man/prepareGraph.Rd b/man/prepareGraph.Rd index 82a0efc..32cc167 100644 --- a/man/prepareGraph.Rd +++ b/man/prepareGraph.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/gabowmyers.R +% Please edit documentation in R/tree-gabowmyers.R \name{prepareGraph} \alias{prepareGraph} \title{Create tibble of possible edges from CCF values based on w_mat only} \usage{ -prepareGraph(w_mat, thresh) +prepareGraph(mcf_mat, thresh) } \arguments{ \item{w}{matrix of CCF values (rows = clusters, columns = samples)} diff --git a/man/prepareGraphForGabowMyers.Rd b/man/prepareGraphForGabowMyers.Rd deleted file mode 100644 index 722ed18..0000000 --- a/man/prepareGraphForGabowMyers.Rd +++ /dev/null @@ -1,21 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/gabowmyers.R -\name{prepareGraphForGabowMyers} -\alias{prepareGraphForGabowMyers} -\title{Create tibble of possible edges from CCF values based on sample-presence} -\usage{ -prepareGraphForGabowMyers(w, chains, input_data) -} -\arguments{ -\item{w}{matrix of CCF values (rows = clusters, columns = samples)} - -\item{chains}{list of MCMC chains (must contain w_chain and z_chain)} - -\item{input_data}{list of input data; same as supplied for clustering} -} -\value{ -graph_G tibble of possible edges with columns edge, parent, child -} -\description{ -Create tibble of possible edges from CCF values based on sample-presence -} diff --git a/man/runMCMCForAllBoxes.Rd b/man/runMCMCForAllBoxes.Rd new file mode 100644 index 0000000..e4383b3 --- /dev/null +++ b/man/runMCMCForAllBoxes.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/MCMC-clustering.R +\name{runMCMCForAllBoxes} +\alias{runMCMCForAllBoxes} +\title{run MCMC using JAGS} +\usage{ +runMCMCForAllBoxes( + sep_list, + sample_presence = TRUE, + ploidy = 2, + max_K = 5, + min_mutation_per_cluster = 5, + cluster_diff_thresh = 0.05, + n.iter = 5000, + n.burn = 1000, + thin = 10, + mc.cores = 4, + inits = list(.RNG.name = "base::Wichmann-Hill", .RNG.seed = 123) +) +} +\description{ +run MCMC using JAGS +} diff --git a/man/separateMutationsBySamplePresence.Rd b/man/separateMutationsBySamplePresence.Rd new file mode 100644 index 0000000..a929c83 --- /dev/null +++ b/man/separateMutationsBySamplePresence.Rd @@ -0,0 +1,11 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/sample-presence.R +\name{separateMutationsBySamplePresence} +\alias{separateMutationsBySamplePresence} +\title{sample presence for MCMC} +\usage{ +separateMutationsBySamplePresence(input_data) +} +\description{ +sample presence for MCMC +} diff --git a/man/writeClusterAssignmentsTable.Rd b/man/writeClusterAssignmentsTable.Rd index d031881..b6f3a99 100644 --- a/man/writeClusterAssignmentsTable.Rd +++ b/man/writeClusterAssignmentsTable.Rd @@ -1,13 +1,18 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/clustering_functions.R +% Please edit documentation in R/MCMC-process-clusters.R \name{writeClusterAssignmentsTable} \alias{writeClusterAssignmentsTable} \title{Determine most probable mutation cluster assignments by taking those with highest posterior probability.} \usage{ -writeClusterAssignmentsTable(z_chain, Mut_ID = NULL) +writeClusterAssignmentsTable( + z_chain, + mcf_chain = NULL, + cncf = NULL, + Mut_ID = NULL +) } \arguments{ -\item{z_chain}{MCMC chain of mutation cluster assignment values, which is the second item in the list returned by \code{clusterSep}} +\item{z_chain}{MCMC chain of mutation cluster assignment values} \item{Mut_ID}{Vector of mutation IDs, same order as provided as input data (e.g. indata$Mut_ID)} } diff --git a/man/writeClusterCCFsTable.Rd b/man/writeClusterCCFsTable.Rd deleted file mode 100644 index 97ccabb..0000000 --- a/man/writeClusterCCFsTable.Rd +++ /dev/null @@ -1,19 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/clustering_functions.R -\name{writeClusterCCFsTable} -\alias{writeClusterCCFsTable} -\title{Determine the most probable cluster CCF values by taking the mode of the posterior distributions} -\usage{ -writeClusterCCFsTable(w_chain, Sample_ID = NULL) -} -\arguments{ -\item{w_chain}{MCMC chain of CCF values, which is the first item in the list returned by \code{clusterSep}} - -\item{Sample_ID}{Vector of sample IDs, same order as provided as input data (e.g. indata$Sample_ID)} -} -\value{ -A tibble of estimated cluster CCFs in each sample -} -\description{ -Determine the most probable cluster CCF values by taking the mode of the posterior distributions -} diff --git a/man/writeClusterMCFsTable.Rd b/man/writeClusterMCFsTable.Rd new file mode 100644 index 0000000..b24a881 --- /dev/null +++ b/man/writeClusterMCFsTable.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/MCMC-process-clusters.R +\name{writeClusterMCFsTable} +\alias{writeClusterMCFsTable} +\title{Determine the most probable cluster MCF values by taking the mode of the posterior distributions} +\usage{ +writeClusterMCFsTable(mcf_chain, Sample_ID = NULL) +} +\arguments{ +\item{mcf_chain}{MCMC chain of MCF values} + +\item{Sample_ID}{Vector of sample IDs, same order as provided as input data (e.g. indata$Sample_ID)} +} +\value{ +A tibble of estimated cluster MCFs in each sample +} +\description{ +Determine the most probable cluster MCF values by taking the mode of the posterior distributions +} diff --git a/man/writeIcnTable.Rd b/man/writeIcnTable.Rd new file mode 100644 index 0000000..999fc8d --- /dev/null +++ b/man/writeIcnTable.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/MCMC-process-clusters.R +\name{writeIcnTable} +\alias{writeIcnTable} +\title{Determine most probable integer copy number by taking those with highest posterior probability.} +\usage{ +writeIcnTable(icn_chain, Mut_ID = NULL) +} +\arguments{ +\item{icn_chain}{MCMC chain of integer copy number} + +\item{Mut_ID}{Vector of mutation IDs, same order as provided as input data (e.g. indata$Mut_ID)} +} +\value{ +A tibble listing mutation IDs and their cluster assignments +} +\description{ +Determine most probable integer copy number by taking those with highest posterior probability. +} diff --git a/man/writeMultiplicityTable.Rd b/man/writeMultiplicityTable.Rd new file mode 100644 index 0000000..41a9fcb --- /dev/null +++ b/man/writeMultiplicityTable.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/MCMC-process-clusters.R +\name{writeMultiplicityTable} +\alias{writeMultiplicityTable} +\title{Determine most probable multiplicity assignments by taking those with highest posterior probability.} +\usage{ +writeMultiplicityTable(m_chain, Mut_ID = NULL) +} +\arguments{ +\item{m_chain}{MCMC chain of mutation cluster assignment values} + +\item{Mut_ID}{Vector of mutation IDs, same order as provided as input data (e.g. indata$Mut_ID)} +} +\value{ +A tibble listing mutation IDs and their cluster assignments +} +\description{ +Determine most probable multiplicity assignments by taking those with highest posterior probability. +} diff --git a/man/writeSetKTable.Rd b/man/writeSetKTable.Rd index 41f114a..58d0588 100644 --- a/man/writeSetKTable.Rd +++ b/man/writeSetKTable.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/bic.R +% Please edit documentation in R/calculate-BIC.R \name{writeSetKTable} \alias{writeSetKTable} \title{Make table listing possible choices of K (minimum BIC and elbow of BIC plot) for each mutation set} diff --git a/vignettes/pictograph.Rmd b/vignettes/pictograph.Rmd index 2afa419..2686e33 100644 --- a/vignettes/pictograph.Rmd +++ b/vignettes/pictograph.Rmd @@ -16,123 +16,165 @@ knitr::opts_chunk$set( ```{r setup, include = FALSE} library(pictograph) -library(dplyr) ``` ## 1. Introduction -This tutorial walks through how to run PICTograph on a toy example. PICTograph infers the clonal evolution of tumors from multi-region sequencing data. It models uncertainty in assigning mutations to subclones using a Bayesian hierarchical model and reduces the space of possible evolutionary trees by using constraints based on principles of lineage precedence, sum condition, and optionally by sample-presence. The inputs to PICTograph are variant ("alt allele") read counts of small somatic mutations, sequencing depth at the mutation loci, DNA copy number of the tumor genome at the mutation loci, tumor purity, and , if known, the number of mutant alleles (multiplicity). If multiplicity is not entered, PICTograph will estimate it. If multiple tumor samples are considered, an option is available to restrict the number of possible evolutionary trees by partitioning mutations according to sample presence. PICTograph summarizes the posterior distributions of the mutation cluster assignments and the cancer cell fractions (CCF) for each cluster by the mode. The estimates of cluster CCFs are then used to determine the most probable trees. Multiple trees that share the same score can be summarized as an ensemble tree, where edges are weighted by their concordance among constituent trees in the ensemble. +This tutorial walks through how to run PICTograph on a toy example. PICTograph infers the clonal evolution of tumors from multi-region sequencing data. It models uncertainty in assigning mutations to subclones using a Bayesian hierarchical model and reduces the space of possible evolutionary trees by using constraints based on principles of lineage precedence, sum condition, and optionally by sample-presence. The inputs to PICTograph are variant ("alt allele") read counts of small somatic mutations, sequencing depth at the mutation loci, DNA copy number of the tumor genome at the mutation loci, and, if known, the tumor purity and the number of major alleles. If major allele is not entered, PICTograph will estimate it. If multiple tumor samples are considered, an option is available to restrict the number of possible evolutionary trees by partitioning mutations according to sample presence. PICTograph summarizes the posterior distributions of the mutation cluster assignments and the mutation cell fractions (MCF) for each cluster by the mode. The estimates of cluster MCFs are then used to determine the most probable trees. Multiple trees that share the same score can be summarized as an ensemble tree, where edges are weighted by their concordance among constituent trees in the ensemble. ## 2. Input data -The required user input is a csv file that contains at least columns named "sample", "mutation", "total_reads", "alt_reads", "tumor_integer_copy_number", and "purity". All mutations in a sample must share the same purity. Users can also provide their own estimates of mutation multiplicity (number of tumor DNA copies harboring the mutation). An example input file from an individual with 3 tumor samples and 42 mutations is shown below. This data was simulated from a tree containing 5 mutation clusters. - +The required user input is a csv file that contains at least columns named "sample", "mutation", "total_reads", "alt_reads", "tumor_integer_copy_number", and "cncf". ```{r raw_data format} -head(read.csv(system.file('extdata/example_input.csv', package = 'pictograph'))) +head(read.csv(system.file('extdata/example1_snv.csv', package = 'pictograph'))) ``` -The input file can be read using the importCSV function. Alt read count (y), total read count (n), tumor copy number (tcn), and multiplicity (m) are stored in matrices where the columns are samples, and rows are variants. Purity is supplied as a vector. I and S are integers representing the number of variants and number of samples, respectively. The function will also pre-process the data to remove potential false positive read counts. By default, a mutation with alt reads less than 6 or a VAF less than 2% is considered a false positive. Users can change the thresholds using the `alt_reads_thresh` and `vaf_thresh` parameters. +Users can also provide an optional column "major_integer_copy_number" that provides the information of the integer copy number of the major allele. If "major_integer_copy_number" is not provided, it will be estimated using an internal function built in the package. -```{r input_data} -input_data <- importCSV(system.file('extdata/example_input.csv', package = 'pictograph'), - alt_reads_thresh = 0, vaf_thresh = 0) +```{r} +head(read.csv(system.file('extdata/example2_snv.csv', package = 'pictograph'))) ``` -## 3. Clustering mutations and estimating CCFs +Another optional column is "purity" column that provides the information of normal contamination of a sample. Purity of 0.8 wil be used if not provided. +```{r} +head(read.csv(system.file('extdata/example2_snv_with_purity.csv', package = 'pictograph'))) +``` -The first step of evolutionary analysis is clustering mutations and estimating their cancer cell fractions (CCFs). This is comprised of three steps: +## 3. Run PICTograph in one function -a. By default, for individuals with multiple tumor samples, the mutation data is initially split into sets by sample-presence. Users can choose not to apply sample-presence by setting the one_box parameter in the clusterSep function to 'T'. For individuals with many samples and few mutations, this may be preferable. Next, PICTograph estimates the joint posterior of CCFs and cluster assignments by Markov chain Monte Carlo (MCMC) across a range of possible values for the number of clusters, $K$ -b. Selecting the best $K$ for each mutation set -c. Merging the best chains of each mutation set +Run mcmcMain to generate all the data in one function. This will store all the output files in the user-provided output directory. + +```{r} +mcmcMain(mutation_file=system.file('extdata/example1_snv.csv', package = 'pictograph'), + outputDir=system.file('extdata/output', package = 'pictograph'), + sample_presence=TRUE, + score="silhouette", # either BIC or silhouette + max_K = 10, + min_mutation_per_cluster=5, + cluster_diff_thresh=0.05, + n.iter=5000, + n.burn=1000, + thin=10, + mc.cores=8, + inits=list(".RNG.name" = "base::Wichmann-Hill",".RNG.seed" = 123)) +``` -### 3a. Run clustering and CCF estimation separately for each mutation set +## 4. Step-through of the tool in a chain -In toy example, we run a short MCMC chain with only 1000 iterations (`n.iter`), burn-in of 100 (`n.burn`), and no thinning (`thin`). In practice, we recommend running the MCMC for longer e.g. 10,000 iterations, burn-in of 1000, and thinning by 10. By default, PICTograph separates mutations into sets by sample-presence patterns, and clusters mutations separately within each set. The maximum number of clusters can be set with the `max_K` option. To turn off the sample-presence separation, set the `one_box` option to `T`. To run the MCMC chains in parallel, set the `mc.cores` option to the number of desired cores. +If the user want to run each individual function in the tool, the following steps can be used. -```{r cluster, eval = FALSE} -all_set_results <- clusterSep(input_data, - n.iter = 1000, n.burn = 100, thin = 1, - max_K = 5, one_box = T, mc.cores = 2) +### 4.1. Import data + +The input file can be read using the importCSV function. Alt read count (y), total read count (n), tumor copy number (tcn), and multiplicity (m) are stored in matrices where the columns are samples, and rows are variants. Purity is supplied as a vector. I and S are integers representing the number of variants and number of samples, respectively. + +```{r data} +data <- importFiles(mutation_file=system.file('extdata/example1_snv.csv', package = 'pictograph'), + outputDir=system.file('extdata/output', package = 'pictograph')) ``` -This gives us a list with results for each mutation set, which contains `all_chains`, `BIC`, `best_chains`, and `best_K`. +```{r input_data} +input_data <- list(y=data$y, + n=data$n, + tcn=data$tcn, + is_cn=data$is_cn, + mtp=data$mtp, + icn=data$icn, + cncf=data$cncf, + MutID=data$MutID, + purity=data$purity) +``` +### 4.2. Clustering mutations and estimating MCFs -`all_chains` is a list of MCMC chains for each value of $K$ tested. For each $K$, there are chains for cluster CCF (`w_chain`), mutation cluster assignments (`z_chain`), and simulated variant read counts (`ystar_chain`) for posterior predictive distributions. +The first step of evolutionary analysis is clustering mutations and estimating their mutation cell fractions (MCFs). This is comprised of three steps: -`BIC` is a table of the BIC for each $K$ assessed. +a. By default, for individuals with multiple tumor samples, the mutation data is initially split into sets by sample-presence. Next, PICTograph estimates the joint posterior of MCFs and cluster assignments by Markov chain Monte Carlo (MCMC) across a range of possible values for the number of clusters, $K$ +b. Selecting the best $K$ for each mutation set +c. Merging the best chains of each mutation set -As default, PICTograph chooses the $K$ with the lowest BIC. The MCMC chains for this chosen $K$ are under `best_chains` and the $K$ chosen is listed under `best_K`. +#### 4.2a. Run clustering and MCF estimation separately for each mutation set -```{r cluster_res} -str(all_set_results, give.attr = F, max.level = 4) -``` +In toy example, we run a short MCMC chain with only 5000 iterations (`n.iter`), burn-in of 1000 (`n.burn`), and 10 thinning (`thin`). In practice, we recommend running the MCMC for longer e.g. 10,000 iterations, burn-in of 1000, and thinning by 10. By default, PICTograph separates mutations into sets by sample-presence patterns, and clusters mutations separately within each set. The maximum number of clusters can be set with the `max_K` option. To run the MCMC chains in parallel, set the `mc.cores` option to the number of desired cores. -### 3b. Select the best number of clusters, $K$, for each mutation set +```{r} +# using sample presence +sep_list <- separateMutationsBySamplePresence(input_data) +``` -From `all_set_results`, BIC values for all $K$ assessed in each mutation set can be visualized using `plotBIC`. The minimum, elbow, and knee points are marked. +```{r} +all_set_results <- runMCMCForAllBoxes(sep_list, max_K = 5, min_mutation_per_cluster = 10, n.iter=5000, n.burn=1000, thin=10, + cluster_diff_thresh = 0.1, mc.cores = 8) -```{r plot_bic, fig.height = 3, fig.width = 7} -plotBIC(all_set_results) +# if not use sample presence, run the following +# all_set_results <- runMCMCForAllBoxes(input_data, sample_presence = FALSE, max_K = 5, n.iter=5000, n.burn=1000, thin=10, +# min_mutation_per_cluster = 10, +# cluster_diff_thresh = 0.1, mc.cores = 8) ``` -Use `writeSetKTable` to generate a table with the K at minimum, elbow, and knee points of the BIC plot for each mutation set. The default chosen_K is the minimum K among the three, but users can also adjust the chosen_K accordingly. +This gives us a list with results for each mutation set, which contains `all_chains`, `BIC`, `best_chains`, `silhouette`, and `best_K`. + +`all_chains` is a list of MCMC chains for each value of $K$ tested. For each $K$, there are chains for cluster MCF (`mcf_chain`), mutation cluster assignments (`z_chain`), and simulated variant read counts (`ystar_chain`) for posterior predictive distributions. + +As default, PICTograph chooses the $K$ with the highest silhouette coefficient. The MCMC chains for this chosen $K$ are under `best_chains` and the $K$ chosen is listed under `best_K`. + +#### 4.2b. Select the best number of clusters, $K$, for each mutation set + +Use `writeSetKTable` to generate a table with the K at minimum, elbow, and knee points of the BIC plot for each mutation set. ```{r set_choices} set_k_choices <- writeSetKTable(all_set_results) set_k_choices ``` -We can extract the MCMC chains for the best $K$ of each mutation set using `collectBestKChains`. As default, PICTograph chooses the $K$ with the lowest BIC, and these chains will be automatically extracted. Users also have the option to specify the $K$ to choose for each mutation set by supplying a vector of integers to the parameter `chosen_K` in the same order as the listed sets in `all_set_results`. +We can extract the MCMC chains for the best $K$ of each mutation set using `collectBestKChains`. As default, PICTograph chooses the $K$ with the highest silhouette coefficient, and these chains will be automatically extracted. Users also have the option to specify the $K$ to choose for each mutation set by supplying a vector of integers to the parameter `chosen_K` in the same order as the listed sets in `all_set_results`. ```{r collect_best_chains} -best_set_chains <- collectBestKChains(all_set_results, chosen_K = set_k_choices$chosen_K) +best_set_chains <- collectBestKChains(all_set_results, chosen_K = set_k_choices$silhouette_K) str(best_set_chains, give.attr = F, max.level = 2) ``` -### 3c. Merge results for all mutation sets +#### 4.2c. Merge results for all mutation sets Finally, we can merge `best_set_chains` to obtain chains with the final mutation cluster numbering and correct mutation indices (original order provided in input data). -```{r merge, eval = FALSE} +```{r} chains <- mergeSetChains(best_set_chains, input_data) ``` -### Visualizing clustering and CCF estimation results +#### 4.2d Visualizing clustering and MCF estimation results -Traces for CCF chains can be visualized to check for convergence. +Traces for MCF chains can be visualized to check for convergence. -```{r ccf_trace, fig.height = 6, fig.width = 6} -plotChainsCCF(chains$w_chain) +```{r mcf_trace, fig.height = 6, fig.width = 6} +plotChainsMCF(chains$mcf_chain) ``` -The posterior distribtuion of cluster CCFs can be visualized as violin plots. The number of mutations assigned to each cluster is listed in brackets after the cluster name. +The posterior distribtuion of cluster MCFs can be visualized as violin plots. The number of mutations assigned to each cluster is listed in brackets after the cluster name. -```{r ccf_violin, fig.height = 4, fig.width = 6} -plotCCFViolin(chains$w_chain, chains$z_chain, indata = input_data) +```{r mcf_violin, fig.height = 4, fig.width = 6} +plotMCFViolin(chains$mcf_chain, chains$z_chain, indata = input_data) ``` We can also visualize the posterior probabilities of mutation cluster assignments and determine the most probable cluster assignments. In this toy example, there is high concordance of the cluster assignments of mutations across the MCMC chain. ```{r cluster-assignments, fig.height = 6, fig.width = 5} -plotClusterAssignmentProbVertical(chains$z_chain, chains$w_chain) +plotClusterAssignmentProbVertical(chains$z_chain, chains$mcf_chain) ``` -We can write tables for estimated cluster CCFs and mutation cluster assignments. +We can write tables for estimated cluster MCFs and mutation cluster assignments. ```{r tables} -writeClusterCCFsTable(chains$w_chain) +writeClusterMCFsTable(chains$mcf_chain) writeClusterAssignmentsTable(chains$z_chain, Mut_ID = input_data$MutID) ``` -## 4. Tree inference +### 4.3. Tree inference -We can then use the mutation cluster CCF estimates for tree inference. We first estimate the possible edges by applying lineage precedence filters to order the clusters on a tree. The `lineage_precedence_thresh` option allows relaxation of lineage precedence constraints so that the child cluster CCF can exceed a parent cluster CCF by a small amounts. The default of `lineage_precedence_thresh` is 0.1. Filtered edges are stored in an object named `graph_G`. +We can then use the mutation cluster MCF estimates for tree inference. We first estimate the possible edges by applying lineage precedence filters to order the clusters on a tree. The `lineage_precedence_thresh` option allows relaxation of lineage precedence constraints so that the child cluster MCF can exceed a parent cluster MCF by a small amounts. The default of `lineage_precedence_thresh` is 0.1. Filtered edges are stored in an object named `graph_G`. -Next, we enumerate this constrained tree space, and apply a filter based on the sum condition. The `sum_filter_thresh` option allows relaxation of sum condition constraints so that the sum of child cluster CCFs at a branch point can exceed the parent cluster CCF by a small amount. The default of `sum_filter_thresh` is 0.2. +Next, we enumerate this constrained tree space, and apply a filter based on the sum condition. The `sum_filter_thresh` option allows relaxation of sum condition constraints so that the sum of child cluster MCFs at a branch point can exceed the parent cluster MCF by a small amount. The default of `sum_filter_thresh` is 0.2. ```{r generate-all-tree} -generateAllTrees(chains$w_chain, lineage_precedence_thresh = 0.1, sum_filter_thresh = 0.2) +generateAllTrees(chains$mcf_chain, data$purity, lineage_precedence_thresh = 0.1, sum_filter_thresh = 0.2) ``` All spanning trees given by the possible edges that pass the sum condition filter are stored in `all_spanning_trees`. @@ -146,7 +188,7 @@ We then calculate a fitness score for all the trees that have passed our filteri ```{r tree-scoring, fig.height = 4, fig.width = 5} # calculate SCHISM fitness score for all trees -scores <- calcTreeScores(chains$w_chain, all_spanning_trees) +scores <- calcTreeScores(chains$mcf_chain, all_spanning_trees, purity=data$purity) scores # highest scoring tree best_tree <- all_spanning_trees[[which.max(scores)]] @@ -160,41 +202,12 @@ In this toy example, there is only one tree with the maximum score. In some case plotEnsembleTree(all_spanning_trees) ``` -## Subclone proportions +### 4.4 Subclone proportions -For individuals with multiple available tumor samples, the proportion of each subclone in each sample can be calculated using `calcSubcloneProportions`. This is calculated using the estimated cluster CCFs and the ordering of clusters on the best scoring tree. Two available options for visualizing subclone proportions are pie charts using `plotSubclonePie` and stacked bar graphs using `plotSubcloneBar`. +For individuals with multiple available tumor samples, the proportion of each subclone in each sample can be calculated using `calcSubcloneProportions`. This is calculated using the estimated cluster MCFs and the ordering of clusters on the best scoring tree. Two available options for visualizing subclone proportions are pie charts using `plotSubclonePie` and stacked bar graphs using `plotSubcloneBar`. ```{r subclone-props, fig.height = 4, fig.width = 5} -subclone_props <- calcSubcloneProportions(w_mat, best_tree) +subclone_props <- calcSubcloneProportions(mcf_mat, best_tree) plotSubclonePie(subclone_props, sample_names = colnames(input_data$y)) plotSubcloneBar(subclone_props, sample_names = colnames(input_data$y)) ``` - -We can force the cluster CCF estimates to comply with the sum condition for a given tree using `forceCCFs`. - -```{r force_ccfs, fig.height = 4, fig.width = 5} -fixed_w_mat <- forceCCFs(w_mat, best_tree) -fixed_subclone_props <- calcSubcloneProportions(fixed_w_mat, best_tree) -plotSubcloneBar(fixed_subclone_props) -``` - -Subclone proportions can also be calculated for the edges of an ensemble tree that are shared by all summarized trees (backbone). For this example ensemble tree, the "backbone" is made up of the root node and nodes 1, 2, 3, and 5, which are connected by thick black edges. - -```{r backbone, fig.height = 4, fig.width = 5} -selected_trees <- all_spanning_trees -plotEnsembleTree(selected_trees) - -backbone <- bind_rows(selected_trees) %>% - group_by(edge) %>% - mutate(count = n()) %>% - ungroup() %>% - filter(count == length(selected_trees)) %>% - distinct() %>% - select(edge, parent, child) - -fixed_w_mat_backbone <- forceCCFs(w_mat, backbone) -fixed_w_mat_backbone[is.na(fixed_w_mat_backbone)] = 0 - -backbone_subclone_props <- calcSubcloneProportions(fixed_w_mat_backbone, backbone) -plotSubcloneBar(backbone_subclone_props) -```