Merge pull request #11 from KarchinLab/staging

Staging
KarchinLab · Jun 6, 2023 · 6cef8bb · 6cef8bb
2 parents 74f99c0 + d1b8936
commit 6cef8bb
Show file tree

Hide file tree

Showing 3 changed files with 27 additions and 23 deletions.
diff --git a/R/gabowmyers.R b/R/gabowmyers.R
@@ -211,15 +211,15 @@ prepareGraphForGabowMyers <- function(w, chains, input_data) {
 #' @import tidyr
 #' @param w matrix of CCF values (rows = clusters, columns = samples)
 #' @return graph_G tibble of possible edges with columns edge, parent, child
-prepareGraph <- function(w_mat) {
+prepareGraph <- function(w_mat, thresh) {
   graph_pre <- data.frame(edge = character(), parent = character(), child = character())
   for (i in seq_len(nrow(w_mat))) {
     graph_pre <- graph_pre %>% add_row(edge = paste("root->", i, sep = ""), parent = "root", child = as.character(i))
     for (j in seq_len(nrow(w_mat))) {
       if (i!=j) {
         i_row = w_mat[i, ]
         j_row = w_mat[j, ]
-        if (all(j_row[i_row > 0] > 0)) {
+        if (all(j_row-i_row > -thresh)) {
           graph_pre <- graph_pre %>% add_row(edge = paste(j, "->", i, sep = ""), parent = as.character(j), child = as.character(i))
         }
       }
@@ -261,7 +261,7 @@ enumerateSpanningTrees <- function(graph_G, w, sum_filter_thresh=0.2) {
 generateAllTrees <- function(w, lineage_precedence_thresh=0.1, sum_filter_thresh=0.2) {
   w_mat <- estimateCCFs(w)
   w_mat <- assign("w_mat", w_mat, envir = .GlobalEnv)
-  graph_G_pre <- prepareGraph(w_mat)
+  graph_G_pre <- prepareGraph(w_mat, lineage_precedence_thresh)
   graph_G <- filterEdgesBasedOnCCFs(graph_G_pre, w_mat, thresh = lineage_precedence_thresh)
   graph_G <- assign("graph_G", graph_G, envir = .GlobalEnv)
   enumerateSpanningTreesModified(graph_G, w_mat, sum_filter_thresh = sum_filter_thresh)

diff --git a/R/preprocessing.R b/R/preprocessing.R
@@ -2,7 +2,7 @@
 #' 
 #' @export
 #' @param input_file input data file; 
-importCSV <- function(inputFile) {
+importCSV <- function(inputFile, alt_reads_thresh = 6, vaf_thresh = 0.02) {
   data <- read_csv(inputFile, show_col_types = FALSE)
   output_data <- list()
 
@@ -36,7 +36,7 @@ importCSV <- function(inputFile) {
     output_data$n[output_data$n==0] <- round(mean(output_data$n))
   }
 
-  output_data$tcn <- as.matrix(data[c("mutation", "sample", "tumor_integer_copy_number")] %>% pivot_wider(names_from = sample, values_from = tumor_integer_copy_number, values_fill = 0))
+  output_data$tcn <- as.matrix(data[c("mutation", "sample", "tumor_integer_copy_number")] %>% pivot_wider(names_from = sample, values_from = tumor_integer_copy_number, values_fill = 2))
   rownames(output_data$tcn) <- output_data$tcn[,'mutation']
   output_data$tcn <- output_data$tcn[,-1, drop=FALSE]
   rowname = rownames(output_data$tcn)
@@ -45,29 +45,30 @@ importCSV <- function(inputFile) {
   rownames(output_data$tcn) = rowname
   colnames(output_data$tcn) = colname
 
-  output_data$purity <- as.matrix(data[c("mutation", "sample", "purity")] %>% pivot_wider(names_from = sample, values_from = purity))
+  output_data$purity <- as.matrix(data[c("mutation", "sample", "purity")] %>% pivot_wider(names_from = sample, values_from = purity, values_fill = 0))
   rownames(output_data$purity) <- output_data$purity[,'mutation']
   output_data$purity <- output_data$purity[,-1, drop=FALSE]
   rowname = rownames(output_data$purity)
   colname = colnames(output_data$purity)
   output_data$purity <- matrix(as.numeric(output_data$purity), ncol = ncol(output_data$purity))
   rownames(output_data$purity) = rowname
   colnames(output_data$purity) = colname
-  if (ncol(output_data$purity) == 1) {
-    if (length(unique(output_data$purity[,])) != 1) {
-      warning("purity not consistent for the same sample; taking the mean purity")
-      output_data$purity <- round(colMeans(output_data$purity), digit=2)
-    } else {
-      output_data$purity <- unique(output_data$purity[,])
-    }
-  } else {
-    if (nrow(unique(output_data$purity[,])) != 1) {
-      warning("Purity not consistent for the same sample; taking the mean purity")
-      output_data$purity <- round(colMeans(output_data$purity), digit=2)
-    } else {
-      output_data$purity <- unique(output_data$purity[,])[1,]
-    }
-  }
+  output_data$purity = colSums(output_data$purity) / colSums(!!output_data$purity)
+  # if (ncol(output_data$purity) == 1) {
+  #   if (length(unique(output_data$purity[,])) != 1) {
+  #     warning("purity not consistent for the same sample; taking the mean purity")
+  #     output_data$purity <- round(colMeans(output_data$purity), digit=2)
+  #   } else {
+  #     output_data$purity <- unique(output_data$purity[,])
+  #   }
+  # } else {
+  #   if (nrow(unique(output_data$purity[,])) != 1) {
+  #     warning("Purity not consistent for the same sample; taking the mean purity")
+  #     output_data$purity <- round(colMeans(output_data$purity), digit=2)
+  #   } else {
+  #     output_data$purity <- unique(output_data$purity[,])[1,]
+  #   }
+  # }
 
   output_data$S = ncol(output_data$y)
   output_data$I = nrow(output_data$y)
@@ -88,6 +89,8 @@ importCSV <- function(inputFile) {
     output_data$m <- estimateMultiplicityMatrix(output_data)
   }
 
+  output_data$y[output_data$y / output_data$n < vaf_thresh] = 0
+  output_data$y[output_data$y < alt_reads_thresh] = 0
   return(output_data)
 }
 

diff --git a/vignettes/pictograph.Rmd b/vignettes/pictograph.Rmd
@@ -31,10 +31,11 @@ The required user input is a csv file that contains at least columns named "samp
 head(read.csv(system.file('extdata/example_input.csv', package = 'pictograph')))
 ```
 
-The input file can be read using the importCSV function. Alt read count (y), total read count (n), tumor copy number (tcn), and multiplicity (m) are stored in matrices where the columns are samples, and rows are variants. Purity is supplied as a vector. I and S are integers representing the number of variants and number of samples, respectively.  
+The input file can be read using the importCSV function. Alt read count (y), total read count (n), tumor copy number (tcn), and multiplicity (m) are stored in matrices where the columns are samples, and rows are variants. Purity is supplied as a vector. I and S are integers representing the number of variants and number of samples, respectively. The function will also pre-process the data to remove potential false positive read counts. By default, a mutation with alt reads less than 6 or a VAF less than 2% is considered a false positive. Users can change the thresholds using the `alt_reads_thresh` and `vaf_thresh` parameters.
 
 ```{r input_data}
-input_data <- importCSV(system.file('extdata/example_input.csv', package = 'pictograph'))
+input_data <- importCSV(system.file('extdata/example_input.csv', package = 'pictograph'),
+                        alt_reads_thresh = 6, vaf_thresh = 0.02)
 ```
 
 ```{r load_results, include = F}