Merge pull request #1 from taylor-lab/feature/facets2N

Feature/facets2 n
taylor-lab · Mar 2, 2020 · 4f2f8ae · 4f2f8ae
2 parents 5e9c401 + 3a889e9
commit 4f2f8ae
Show file tree

Hide file tree

Showing 21 changed files with 529 additions and 144 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,19 +1,19 @@
 Package: facetsSuite
 Type: Package
 Title: Functions to run and parse output from FACETS
-Version: 2.0.0
+Version: 2.0.4
 Authors@R: person(given = "Philip",
              family = "Jonsson",
              role = c("aut", "cre"),
              email = "[email protected]")
 URL: https://github.com/mskcc/facets-suite
 Description: This package provides functions that wrap the FACETS package for allele-specific copy-number analysis as well as functions to perform post-hoc analyses thereof.
 License: MIT + file LICENSE
-RoxygenNote: 6.1.1
+RoxygenNote: 7.0.2
 LazyData: true
 Encoding: UTF-8
 Depends:
-    R (>= 2.10)
+    R (>= 3.4.0), pctGCdata (>= 0.3.0)
 Imports:
     data.table (>= 1.11.8),
     dplyr (>= 0.8.3),
@@ -25,9 +25,11 @@ Imports:
     egg (>= 0.4.2),
     scales (>= 1.0.0),
     argparse (>= 1.1.1),
-    tibble (>= 2.1.3)
+    tibble (>= 2.1.3),
+    facets (>= 0.5.6), 
+    facets2n (>= 0.2.2)
 NeedsCompilation: no
-Packaged: 2019-02-07 16:38:38 UTC; jonssonp
+Packaged: 2020-03-01 21:56:37 UTC; rptashkin
 Suggests: 
     covr (>= 3.2.0),
     testthat (>= 2.1.0)
diff --git a/NAMESPACE b/NAMESPACE
@@ -12,16 +12,19 @@ export(cf_plot)
 export(check_fit)
 export(closeup_plot)
 export(cnlr_plot)
+export(create_legacy_output)
 export(format_igv_seg)
 export(gene_level_changes)
 export(icn_plot)
+export(load_facets_output)
 export(plot_segmentation)
 export(read_snp_matrix)
+export(read_snp_matrix_facets2n)
 export(run_facets)
-export(create_legacy_output)
-export(load_facets_output)
 export(valor_plot)
 import(data.table)
+import(facets)
+import(facets2n)
 import(ggplot2)
 importFrom(data.table,":=")
 importFrom(data.table,foverlaps)
@@ -45,6 +48,7 @@ importFrom(dplyr,summarize)
 importFrom(dplyr,ungroup)
 importFrom(egg,ggarrange)
 importFrom(grDevices,colorRampPalette)
+importFrom(pctGCdata,getGCpct)
 importFrom(plyr,mapvalues)
 importFrom(purrr,discard)
 importFrom(purrr,map_chr)
@@ -55,3 +59,4 @@ importFrom(scales,pretty_breaks)
 importFrom(stats,dbinom)
 importFrom(tidyr,gather)
 importFrom(tidyr,separate_rows)
+importFrom(utils,read.csv)
diff --git a/R/ccf-annotate-maf.R b/R/ccf-annotate-maf.R
@@ -61,10 +61,10 @@ ccf_annotate_maf = function(maf,
     maf[, c('ccf_Mcopies', 'ccf_Mcopies_lower', 'ccf_Mcopies_upper', 'ccf_Mcopies_prob95', 'ccf_Mcopies_prob90') :=
             estimate_ccf(purity, tcn, tcn - lcn, t_alt_count, t_depth), by = seq_len(nrow(maf))]
     maf[, c('ccf_1copy', 'ccf_1copy_lower', 'ccf_1copy_upper', 'ccf_1copy_prob95', 'ccf_1copy_prob90') :=
-            estimate_ccf(purity, tcn, tcn - lcn, t_alt_count, t_depth), by = seq_len(nrow(maf))]
+            estimate_ccf(purity, tcn, 1, t_alt_count, t_depth), by = seq_len(nrow(maf))]
     maf[, c('ccf_expected_copies', 'ccf_expected_copies_lower', 'ccf_expected_copies_upper',
             'ccf_expected_copies_prob95', 'ccf_expected_copies_prob90') :=
-            estimate_ccf(purity, tcn, tcn - lcn, t_alt_count, t_depth), by = seq_len(nrow(maf))]
+            estimate_ccf(purity, tcn, expected_alt_copies, t_alt_count, t_depth), by = seq_len(nrow(maf))]
     as.data.frame(maf)
 }
 

diff --git a/R/check-fit.R b/R/check-fit.R
@@ -54,7 +54,7 @@ check_fit = function(facets_output,
     dipLogR = facets_output$dipLogR
     alballogr = as.numeric(facets_output$alballogr[, 1])
     purity = facets_output$purity
-    mafr_thresh = facets_output$mafr_thresh
+
     fcna_output = calculate_fraction_cna(segs, facets_output$ploidy, genome, algorithm)
     wgd = fcna_output$genome_doubled
 
@@ -66,9 +66,9 @@ check_fit = function(facets_output,
     setkey(segs, chrom, start, end)
     snps = as.data.table(snps)
     setkey(snps, chrom, maploc)
-
+    
     # Label clonal segments
-    segs[, clonal := cf >= purity - .1]
+    segs[, clonal := cf >= (purity * 0.8)]
 
     # Subset on autosomes
     auto_segs = segs[chrom < 23]
@@ -92,6 +92,34 @@ check_fit = function(facets_output,
     n_dip_imbal_segs = nrow(dip_imbal_segs)
     frac_dip_imbal_segs = sum(dip_imbal_segs$length)/sum(auto_segs$length)
 
+    #############################
+    ### Other metrics
+    #############################
+    ## fraction genome unaltered (2-1) -- too high --> low purity?
+    segs_unaltered = auto_segs[tcn == 2 & lcn == 1, ]    
+    n_segs_unaltered = nrow(segs_unaltered)    
+    frac_genome_unaltered = sum(segs_unaltered$length)/sum(auto_segs$length)
+
+    segs_below_dipLogR = auto_segs[cnlr.median.clust < dipLogR]    # part of ploidy filter
+    n_segs_below_dipLogR = nrow(segs_below_dipLogR)    # part of ploidy filter
+    frac_below_dipLogR = sum(segs_below_dipLogR$length)/sum(auto_segs$length)
+
+    segs_balanced_odd_tcn = auto_segs[mafR <= 0.025 & (tcn %% 2) != 0 & clonal ==T & tcn < 8,]
+    n_segs_balanced_odd_tcn = nrow(segs_balanced_odd_tcn)
+    frac_balanced_odd_tcn = sum(segs_balanced_odd_tcn$length)/sum(auto_segs$length)
+
+    segs_imbalanced_diploid_cn = auto_segs[clonal==T & mafR > 0.1 & !is.na(lcn) & lcn != 0  & (as.double(tcn) / lcn) == 2, ]
+    n_segs_imbalanced_diploid_cn = nrow(segs_imbalanced_diploid_cn)
+    frac_imbalanced_diploid_cn = sum(segs_imbalanced_diploid_cn$length)/sum(auto_segs$length)
+
+    segs_lcn_greater_mcn = auto_segs[clonal==T & lcn < mcn,]
+    n_segs_lcn_greater_mcn = nrow(segs_lcn_greater_mcn)
+    frac_lcn_greater_mcn = sum(segs_lcn_greater_mcn$length)/sum(auto_segs$length)
+
+    mafr_median_all = median(auto_segs$mafR, na.rm = T)
+    mafr_median_clonal = median(auto_segs[clonal==T, ]$mafR, na.rm = T)
+    mafr_n_gt_1 = nrow(auto_segs[mafR > 1, ])
+
     # Number of high-level amplifications and homozygous deletions
     # Clonal homdels, how much of the genome do they represent
     n_amps = nrow(segs[tcn >= 10])
@@ -110,28 +138,32 @@ check_fit = function(facets_output,
     n_cnlr_clusters = length(unique(segs$cnlr.median.clust))
 
     # Number of segments with lcn==NA
-    n_lcn_na = nrow(segs[is.na(lcn)])
+    n_lcn_na = nrow(auto_segs[is.na(lcn)])
+    frac_lcn_na = sum(auto_segs[is.na(lcn)]$length)/sum(auto_segs$length)
 
     # Number of segments with LOH
     loh_segs = auto_segs[lcn == 0,]
     n_loh = nrow(loh_segs)
     frac_loh = sum(loh_segs$length)/sum(auto_segs$length)
 
-    # Check fraction of subclonal events
-    subclonal_segs = segs[clonal == F, ]
+    # Check fraction of subclonal events within autosomal chromosomes
+    subclonal_segs = auto_segs[clonal == F, ]
     n_segs_subclonal = nrow(subclonal_segs)
-    frac_segs_subclonal = sum(subclonal_segs$length)/sum(segs$length)
-
+    frac_segs_subclonal = sum(subclonal_segs$length)/sum(auto_segs$length)
+    
     # Compile SNP statistics
     n_snps = nrow(snps)
     het_snps = snps[het == 1]
     n_het_snps = nrow(het_snps)
     frac_het_snps = n_het_snps/n_snps
-    n_het_snps_hom_in_tumor_1pct = nrow(het_snps[ vafT < 0.01 | vafT > 0.99, ])
-    n_het_snps_hom_in_tumor_5pct = nrow(het_snps[ vafT < 0.05 | vafT > 0.95, ])
+
+    n_snps_with_300x_in_tumor = nrow(snps[rCountT > 300, ])
+    n_het_snps_with_300x_in_tumor = nrow(het_snps[rCountT > 300, ])
+    n_het_snps_hom_in_tumor_1pct = nrow(het_snps[  (vafT < 0.01 | vafT > 0.99), ])
+    n_het_snps_hom_in_tumor_5pct = nrow(het_snps[ (rCountN > 35 & rCountT > 35) & (vafT < 0.05 | vafT > 0.95), ])
     frac_het_snps_hom_in_tumor_1pct = n_het_snps_hom_in_tumor_1pct/n_het_snps
     frac_het_snps_hom_in_tumor_5pct = n_het_snps_hom_in_tumor_5pct/n_het_snps
-
+    
     # Check the mean/standard deviation of the cnlr
     snps[, cnlr_residual := cnlr - median(cnlr), by = seg]
     mean_cnlr_residual = mean(snps$cnlr_residual)
@@ -191,9 +223,19 @@ check_fit = function(facets_output,
         frac_loh = frac_loh,
         n_segs_subclonal = n_segs_subclonal,
         frac_segs_subclonal = frac_segs_subclonal,
+        n_segs_below_dipLogR = n_segs_below_dipLogR,
+        frac_below_dipLogR = frac_below_dipLogR,
+        n_segs_balanced_odd_tcn = n_segs_balanced_odd_tcn,
+        frac_balanced_odd_tcn = frac_balanced_odd_tcn,
+        n_segs_imbalanced_diploid_cn = n_segs_imbalanced_diploid_cn,
+        frac_imbalanced_diploid_cn = frac_imbalanced_diploid_cn,
+        n_segs_lcn_greater_mcn = n_segs_lcn_greater_mcn,
+        frac_lcn_greater_mcn = frac_lcn_greater_mcn,
         n_snps = n_snps,
         n_het_snps = n_het_snps,
         frac_het_snps = frac_het_snps,
+        n_snps_with_300x_in_tumor = n_snps_with_300x_in_tumor,
+        n_het_snps_with_300x_in_tumor = n_het_snps_with_300x_in_tumor,
         n_het_snps_hom_in_tumor_1pct = n_het_snps_hom_in_tumor_1pct,
         n_het_snps_hom_in_tumor_5pct = n_het_snps_hom_in_tumor_5pct,
         frac_het_snps_hom_in_tumor_1pct = frac_het_snps_hom_in_tumor_1pct,
@@ -207,22 +249,27 @@ check_fit = function(facets_output,
         n_segs_discordant_both = discordant_stats$n_discordant_both,
         frac_discordant_both = discordant_stats$length_discordant_both/evaluable_length$both,
         n_segs_icn_cnlor_discordant = n_icn_cnlor_discordant,
-        frac_icn_cnlor_discordant = frac_icn_cnlor_discordant
+        frac_icn_cnlor_discordant = frac_icn_cnlor_discordant,
+        mafr_median_all = mafr_median_all,
+        mafr_median_clonal = mafr_median_clonal,
+        mafr_n_gt_1 = mafr_n_gt_1
     )
 
+
+
     # If input MAF is provided add some stats based on this
     if (!is.null(maf)) {
         maf = as.data.table(maf)
 
         maf[, `:=` (
             Chromosome = ifelse(Chromosome == 'X', 23, Chromosome),
             t_var_freq = t_alt_count/(t_alt_count+t_ref_count)
-            )][, Chromosome := as.integer(Chromosome)]
+        )][, Chromosome := as.integer(Chromosome)]
         setkey(maf, Chromosome, Start_Position, End_Position)
         maf = foverlaps(maf, segs, mult = 'first', nomatch = NA,
                         by.x = c('Chromosome', 'Start_Position', 'End_Position'),
                         by.y = c('chrom', 'start', 'end'))
-
+        
         # Median mutation VAF at clonal 2:1 segments
         output$dip_median_vaf = maf[clonal == TRUE & mcn == 1 & lcn == 1][, median(t_var_freq)]
 

diff --git a/R/plot-facets.R b/R/plot-facets.R
@@ -65,8 +65,8 @@ cnlr_plot = function(facets_data,
 
     ymin = floor(min(segs$cnlr.median, na.rm = T))
     ymax = ceiling(max(segs$cnlr.median, na.rm = T))
-    if (ymin > -3) ymin = -3
-    if (ymax < 3) ymax = 3
+    if (ymin > -2) ymin = -2
+    if (ymax < 2) ymax = 2
 
     if (adjust_dipLogR) {
         snps$cnlr = snps$cnlr - dipLogR
@@ -240,9 +240,11 @@ cf_plot = function(facets_data,
 
     starts = cumsum(c(1, segs$num.mark))[seq_along(segs$num.mark)]
     ends = cumsum(c(segs$num.mark))
+    my_starts = snps[starts, 'chr_maploc']
+    my_ends = snps[ends, 'chr_maploc']
 
     cf = ggplot(segs) +
-        geom_rect(aes(xmin = starts, xmax = ends, ymax = 1, ymin = 0),
+        geom_rect(aes(xmin = my_starts, xmax = my_ends, ymax = 1, ymin = 0),
                   fill = cols, col = 'white', size = 0) +
         scale_x_continuous(breaks = mid, labels = names(mid), expand = c(.01, 0)) +
         scale_y_continuous(expand = c(0, 0)) +
@@ -315,7 +317,7 @@ icn_plot = function(facets_data,
         geom_segment(col = 'red', size = 1, 
                      aes(x = my_lcn_starts$chr_maploc, xend = my_lcn_ends$chr_maploc, 
                          y = my_lcn_starts$lcn, yend = my_lcn_ends$lcn)) +
-        geom_segment(col = 'black', size = 1, 
+        geom_segment(col = 'black', size = 0.8, 
                      aes(x = my_tcn_starts$chr_maploc, xend = my_tcn_ends$chr_maploc, 
                          y = my_tcn_starts$tcn, yend = my_tcn_ends$tcn)) +
         # scale_y_continuous(breaks=c(0:5, 5 + seq_len(35) / 3), labels = 0:40, limits = c(0, NA)) +

diff --git a/R/read-snp-matrix.R b/R/read-snp-matrix.R
@@ -1,28 +1,26 @@
-#' Read counts file
-#'
-#' Reads a gzipped output file from \code{snp-pileup}.
-#'
-#' @param input_file Path to input file.
-#' @param err.thresh Threshold for errors at locus.
-#' @param del.thresh Threshold for deletions at locus.
-#' 
-#' @source \code{snp-pileup} is part of \href{www.github.com/mskcc/facets}{FACETS}.
-#'
-#' @return A SNP count matrix, with the following columns:
-#' \itemize{
-#'       \item{\code{Chromosome} and \code{Position}:} {Chromosomal position of SNP.}
-#'       \item{\code{NOR.DP}:} {Total read depth in normal sample.}
-#'       \item{\code{TUM.DP}:} {Total read depth in tumor sample.}
-#'       \item{\code{NOR.RD}:} {Reference allele read depth in normal sample.}
-#'       \item{\code{TUM.RD}:} {Reference allele read depth in tumor sample.}
-#' }
-#' 
-#' @import data.table
 
-#' @export
 read_snp_matrix = function(input_file,
                            err.thresh = 10,
                            del.thresh = 10) {
+    #' Reads a gzipped output file from \code{snp-pileup}.
+    #'
+    #' @param input_file Path to input file.
+    #' @param err.thresh Threshold for errors at locus.
+    #' @param del.thresh Threshold for deletions at locus.
+    #' 
+    #' @source \code{snp-pileup} is part of \href{www.github.com/mskcc/facets}{FACETS}.
+    #'
+    #' @return A SNP count matrix, with the following columns:
+    #' \itemize{
+    #'       \item{\code{Chromosome} and \code{Position}:} {Chromosomal position of SNP.}
+    #'       \item{\code{NOR.DP}:} {Total read depth in normal sample.}
+    #'       \item{\code{TUM.DP}:} {Total read depth in tumor sample.}
+    #'       \item{\code{NOR.RD}:} {Reference allele read depth in normal sample.}
+    #'       \item{\code{TUM.RD}:} {Reference allele read depth in tumor sample.}
+    #' }
+    #' 
+    #' @import data.table
+    #' @export
 
     read_counts = data.table::fread(cmd = paste('gunzip -c', input_file), key = c('Chromosome', 'Position'))
 
@@ -44,3 +42,34 @@ read_snp_matrix = function(input_file,
 
     read_counts[order(Chromosome, Position)][, list(Chromosome, Position, NOR.DP, TUM.DP, NOR.RD, TUM.RD)]
 }
+
+read_snp_matrix_facets2n = function(filename, err.thresh=Inf, del.thresh=Inf, perl.pileup=FALSE,
+                                    MandUnormal=FALSE, spanT=0.2, spanA=0.2, spanX=0.2, gbuild="hg19",
+                                    ReferencePileupFile=NULL, ReferenceLoessFile=NULL, MinOverlap=0.9, useMatchedX=FALSE){
+    #' #' Read in the snp-pileup generated SNP read count matrix file for facets2n processing
+    #' @importFrom utils read.csv
+    #' @param filename counts file from snp-pileup
+    #' @param skip (character) Skip n number of lines in the input file.
+    #' @param err.thresh (numeric) Error threshold to be used to filter snp-pileup data frame.
+    #' @param del.thresh (numeric) Deletion threshold to be used to filter snp-pileup data frame.
+    #' @param perl.pileup (logical) Is the pileup data generated using perl pileup tool?
+    #' @param MandUnormal (logical) Is CNLR analysis to be peformed using unmatched reference normals?
+    #' @param spanT (numeric) Default span value to be used for loess normalization in tumor sample.
+    #' @param spanA (numeric) Default span value to be used for loess normalization across autosomal chromosomes in the normal sample.
+    #' @param spanX (numeric) Default span value to be used for loess normalization in Chr X in the normal sample.
+    #' @param gbuild (character) Genome build (Default: hg19).
+    #' @param ReferencePileupFile (character) Filepath to an optional snp-pileup generated pileup data of one or more reference normals.
+    #' @param ReferenceLoessFile (character) Filepath to an optional loess data, generated using the facets2n package, of one or more reference normals. The number of normals in this data should match that in the ReferencePileupFile, and should be in the same order.
+    #' @param MinOverlap (numeric) Mininum overlap fraction of loci between a tumor pileup and reference pileup data.
+    #' @param useMatchedX (logical) Is the matched normal to be used for ChrX normalization?
+    #' @return A dataframe of pileup depth values for Tumor and Matched Normal if MandUnormal is FALSE. Else, a list of data frame with pileup depth values of Tumor, matched Normal, and a best unmatched normal, and the associated span values.
+    #' @source \code{snp-pileup} is part of \href{www.github.com/mskcc/facets}{FACETS}.
+    #' @import facets2n
+    #' @importFrom pctGCdata getGCpct
+    #' @export
+    if (MandUnormal){
+        facets2n::readSnpMatrix(filename, MandUnormal = MandUnormal, ReferencePileupFile = ReferencePileupFile, ReferenceLoessFile = ReferenceLoessFile, useMatchedX = useMatchedX)
+    }else{
+        facets2n::readSnpMatrix(filename, MandUnormal = MandUnormal)
+    }
+}