
Commit

Merge pull request #2 from taylor-lab/feature/facets2N
Feature/facets2n
rptashkin authored Apr 30, 2020
2 parents ba49e98 + c3013ea commit 1399b5c
Showing 11 changed files with 161 additions and 68 deletions.
12 changes: 7 additions & 5 deletions DESCRIPTION
@@ -1,19 +1,19 @@
Package: facetsSuite
Type: Package
Title: Functions to run and parse output from FACETS
Version: 1.0.1
Version: 2.0.5
Authors@R: person(given = "Philip",
family = "Jonsson",
role = c("aut", "cre"),
email = "[email protected]")
URL: https://github.com/mskcc/facetsSuite
URL: https://github.com/mskcc/facets-suite
Description: This package provides functions that wrap the FACETS package for allele-specific copy-number analysis as well as functions to perform post-hoc analyses thereof.
License: MIT + file LICENSE
RoxygenNote: 7.0.2
LazyData: true
Encoding: UTF-8
Depends:
R (>= 3.4.0), pctGCdata (>= 0.2.0)
R (>= 3.4.0), pctGCdata (>= 0.3.0)
Imports:
data.table (>= 1.11.8),
dplyr (>= 0.8.3),
@@ -25,9 +25,11 @@ Imports:
egg (>= 0.4.2),
scales (>= 1.0.0),
argparse (>= 1.1.1),
tibble (>= 2.1.3)
tibble (>= 2.1.3),
facets (>= 0.5.6),
facets2n (>= 0.2.6)
NeedsCompilation: no
Packaged: 2020-02-05 12:05:36 UTC; rptashkin
Packaged: 2020-04-29 20:11:41 UTC; rptashkin
Suggests:
covr (>= 3.2.0),
testthat (>= 2.1.0)
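
The updated DESCRIPTION now imports both copy-number engines (facets >= 0.5.6 and facets2n >= 0.2.6) and points at the renamed mskcc/facets-suite repository. A minimal installation sketch follows; the GitHub locations of facets and facets2n are assumptions, not taken from this diff.

    # Hedged sketch: install the two FACETS engines before this package.
    # Repository paths below are assumed, not stated in the DESCRIPTION.
    install.packages("remotes")
    remotes::install_github("mskcc/facets")        # facets (>= 0.5.6)
    remotes::install_github("rptashkin/facets2n")  # facets2n (>= 0.2.6), path assumed
    remotes::install_github("mskcc/facets-suite")  # this package, per the new URL field
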
4 changes: 2 additions & 2 deletions R/ccf-annotate-maf.R
@@ -61,10 +61,10 @@ ccf_annotate_maf = function(maf,
maf[, c('ccf_Mcopies', 'ccf_Mcopies_lower', 'ccf_Mcopies_upper', 'ccf_Mcopies_prob95', 'ccf_Mcopies_prob90') :=
estimate_ccf(purity, tcn, tcn - lcn, t_alt_count, t_depth), by = seq_len(nrow(maf))]
maf[, c('ccf_1copy', 'ccf_1copy_lower', 'ccf_1copy_upper', 'ccf_1copy_prob95', 'ccf_1copy_prob90') :=
estimate_ccf(purity, tcn, tcn - lcn, t_alt_count, t_depth), by = seq_len(nrow(maf))]
estimate_ccf(purity, tcn, 1, t_alt_count, t_depth), by = seq_len(nrow(maf))]
maf[, c('ccf_expected_copies', 'ccf_expected_copies_lower', 'ccf_expected_copies_upper',
'ccf_expected_copies_prob95', 'ccf_expected_copies_prob90') :=
estimate_ccf(purity, tcn, tcn - lcn, t_alt_count, t_depth), by = seq_len(nrow(maf))]
estimate_ccf(purity, tcn, expected_alt_copies, t_alt_count, t_depth), by = seq_len(nrow(maf))]
as.data.frame(maf)
}
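
The fix above stops the one-copy and expected-copy CCF columns from reusing the `tcn - lcn` mutant copy number: `ccf_1copy` now assumes a single mutated copy and `ccf_expected_copies` uses the precomputed `expected_alt_copies`. A hedged illustration of the underlying VAF relationship those calls differ on; `estimate_ccf()` internals are not shown in this diff, so the formula below is the standard expectation, not a quote of the package code.

    # Expected tumor VAF of a clonal mutation carried on `mutant_copies` alleles,
    # in a tumor of the given purity and total copy number (normal assumed diploid).
    expected_vaf = function(purity, tcn, mutant_copies) {
      purity * mutant_copies / (purity * tcn + 2 * (1 - purity))
    }
    expected_vaf(0.6, 4, 1)  # 0.1875 -- the ccf_1copy assumption
    expected_vaf(0.6, 4, 3)  # 0.5625 -- e.g. tcn - lcn with tcn = 4, lcn = 1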

76 changes: 62 additions & 14 deletions R/check-fit.R
@@ -54,7 +54,7 @@ check_fit = function(facets_output,
dipLogR = facets_output$dipLogR
alballogr = as.numeric(facets_output$alballogr[, 1])
purity = facets_output$purity
mafr_thresh = facets_output$mafr_thresh

fcna_output = calculate_fraction_cna(segs, facets_output$ploidy, genome, algorithm)
wgd = fcna_output$genome_doubled

@@ -66,9 +66,9 @@
setkey(segs, chrom, start, end)
snps = as.data.table(snps)
setkey(snps, chrom, maploc)

# Label clonal segments
segs[, clonal := cf >= purity - .1]
segs[, clonal := cf >= (purity * 0.8)]

# Subset on autosomes
auto_segs = segs[chrom < 23]
@@ -92,6 +92,34 @@
n_dip_imbal_segs = nrow(dip_imbal_segs)
frac_dip_imbal_segs = sum(dip_imbal_segs$length)/sum(auto_segs$length)

#############################
### Other metrics
#############################
## fraction genome unaltered (2-1) -- too high --> low purity?
segs_unaltered = auto_segs[tcn == 2 & lcn == 1, ]
n_segs_unaltered = nrow(segs_unaltered)
frac_genome_unaltered = sum(segs_unaltered$length)/sum(auto_segs$length)

segs_below_dipLogR = auto_segs[cnlr.median.clust < dipLogR] # part of ploidy filter
n_segs_below_dipLogR = nrow(segs_below_dipLogR) # part of ploidy filter
frac_below_dipLogR = sum(segs_below_dipLogR$length)/sum(auto_segs$length)

segs_balanced_odd_tcn = auto_segs[mafR <= 0.025 & (tcn %% 2) != 0 & clonal ==T & tcn < 8,]
n_segs_balanced_odd_tcn = nrow(segs_balanced_odd_tcn)
frac_balanced_odd_tcn = sum(segs_balanced_odd_tcn$length)/sum(auto_segs$length)

segs_imbalanced_diploid_cn = auto_segs[clonal==T & mafR > 0.1 & !is.na(lcn) & lcn != 0 & (as.double(tcn) / lcn) == 2, ]
n_segs_imbalanced_diploid_cn = nrow(segs_imbalanced_diploid_cn)
frac_imbalanced_diploid_cn = sum(segs_imbalanced_diploid_cn$length)/sum(auto_segs$length)

segs_lcn_greater_mcn = auto_segs[clonal==T & lcn < mcn,]
n_segs_lcn_greater_mcn = nrow(segs_lcn_greater_mcn)
frac_lcn_greater_mcn = sum(segs_lcn_greater_mcn$length)/sum(auto_segs$length)

mafr_median_all = median(auto_segs$mafR, na.rm = T)
mafr_median_clonal = median(auto_segs[clonal==T, ]$mafR, na.rm = T)
mafr_n_gt_1 = nrow(auto_segs[mafR > 1, ])

# Number of high-level amplifications and homozygous deletions
# Clonal homdels, how much of the genome do they represent
n_amps = nrow(segs[tcn >= 10])
@@ -110,28 +138,32 @@ check_fit = function(facets_output,
n_cnlr_clusters = length(unique(segs$cnlr.median.clust))

# Number of segments with lcn==NA
n_lcn_na = nrow(segs[is.na(lcn)])
n_lcn_na = nrow(auto_segs[is.na(lcn)])
frac_lcn_na = sum(auto_segs[is.na(lcn)]$length)/sum(auto_segs$length)

# Number of segments with LOH
loh_segs = auto_segs[lcn == 0,]
n_loh = nrow(loh_segs)
frac_loh = sum(loh_segs$length)/sum(auto_segs$length)

# Check fraction of subclonal events
subclonal_segs = segs[clonal == F, ]
# Check fraction of subclonal events within autosomal chromosomes
subclonal_segs = auto_segs[clonal == F, ]
n_segs_subclonal = nrow(subclonal_segs)
frac_segs_subclonal = sum(subclonal_segs$length)/sum(segs$length)

frac_segs_subclonal = sum(subclonal_segs$length)/sum(auto_segs$length)
# Compile SNP statistics
n_snps = nrow(snps)
het_snps = snps[het == 1]
n_het_snps = nrow(het_snps)
frac_het_snps = n_het_snps/n_snps
n_het_snps_hom_in_tumor_1pct = nrow(het_snps[ vafT < 0.01 | vafT > 0.99, ])
n_het_snps_hom_in_tumor_5pct = nrow(het_snps[ vafT < 0.05 | vafT > 0.95, ])

n_snps_with_300x_in_tumor = nrow(snps[rCountT > 300, ])
n_het_snps_with_300x_in_tumor = nrow(het_snps[rCountT > 300, ])
n_het_snps_hom_in_tumor_1pct = nrow(het_snps[ (vafT < 0.01 | vafT > 0.99), ])
n_het_snps_hom_in_tumor_5pct = nrow(het_snps[ (rCountN > 35 & rCountT > 35) & (vafT < 0.05 | vafT > 0.95), ])
frac_het_snps_hom_in_tumor_1pct = n_het_snps_hom_in_tumor_1pct/n_het_snps
frac_het_snps_hom_in_tumor_5pct = n_het_snps_hom_in_tumor_5pct/n_het_snps

# Check the mean/standard deviation of the cnlr
snps[, cnlr_residual := cnlr - median(cnlr), by = seg]
mean_cnlr_residual = mean(snps$cnlr_residual)
@@ -173,6 +205,7 @@ check_fit = function(facets_output,
output = list(
dipLogR_flag = dipLogR_flag,
n_alternative_dipLogR = n_alt_dipLogR,
wgd = wgd,
n_dip_bal_segs = n_dip_bal_segs,
frac_dip_bal_segs = frac_dip_bal_segs,
n_dip_imbal_segs = n_dip_imbal_segs,
@@ -190,9 +223,19 @@ check_fit = function(facets_output,
frac_loh = frac_loh,
n_segs_subclonal = n_segs_subclonal,
frac_segs_subclonal = frac_segs_subclonal,
n_segs_below_dipLogR = n_segs_below_dipLogR,
frac_below_dipLogR = frac_below_dipLogR,
n_segs_balanced_odd_tcn = n_segs_balanced_odd_tcn,
frac_balanced_odd_tcn = frac_balanced_odd_tcn,
n_segs_imbalanced_diploid_cn = n_segs_imbalanced_diploid_cn,
frac_imbalanced_diploid_cn = frac_imbalanced_diploid_cn,
n_segs_lcn_greater_mcn = n_segs_lcn_greater_mcn,
frac_lcn_greater_mcn = frac_lcn_greater_mcn,
n_snps = n_snps,
n_het_snps = n_het_snps,
frac_het_snps = frac_het_snps,
n_snps_with_300x_in_tumor = n_snps_with_300x_in_tumor,
n_het_snps_with_300x_in_tumor = n_het_snps_with_300x_in_tumor,
n_het_snps_hom_in_tumor_1pct = n_het_snps_hom_in_tumor_1pct,
n_het_snps_hom_in_tumor_5pct = n_het_snps_hom_in_tumor_5pct,
frac_het_snps_hom_in_tumor_1pct = frac_het_snps_hom_in_tumor_1pct,
@@ -206,22 +249,27 @@ check_fit = function(facets_output,
n_segs_discordant_both = discordant_stats$n_discordant_both,
frac_discordant_both = discordant_stats$length_discordant_both/evaluable_length$both,
n_segs_icn_cnlor_discordant = n_icn_cnlor_discordant,
frac_icn_cnlor_discordant = frac_icn_cnlor_discordant
frac_icn_cnlor_discordant = frac_icn_cnlor_discordant,
mafr_median_all = mafr_median_all,
mafr_median_clonal = mafr_median_clonal,
mafr_n_gt_1 = mafr_n_gt_1
)



# If input MAF is provided add some stats based on this
if (!is.null(maf)) {
maf = as.data.table(maf)

maf[, `:=` (
Chromosome = ifelse(Chromosome == 'X', 23, Chromosome),
t_var_freq = t_alt_count/(t_alt_count+t_ref_count)
)][, Chromosome := as.integer(Chromosome)]
)][, Chromosome := as.integer(Chromosome)]
setkey(maf, Chromosome, Start_Position, End_Position)
maf = foverlaps(maf, segs, mult = 'first', nomatch = NA,
by.x = c('Chromosome', 'Start_Position', 'End_Position'),
by.y = c('chrom', 'start', 'end'))

# Median mutation VAF at clonal 2:1 segments
output$dip_median_vaf = maf[clonal == TRUE & mcn == 1 & lcn == 1][, median(t_var_freq)]

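Taken together, the new segment- and SNP-level metrics give `check_fit()` more handles for flagging questionable fits (possible low purity, ploidy misestimation, noisy chrX, het SNPs that look homozygous in the tumor). A hedged usage sketch follows; the full `check_fit()` signature is not visible in this diff, so the argument names are assumptions based on the function body above.

    # Hypothetical call -- argument names inferred from the code above.
    qc = check_fit(facets_output, genome = 'hg19', algorithm = 'em')
    qc$wgd                           # whole-genome doubling call from calculate_fraction_cna()
    qc$frac_below_dipLogR            # fraction of autosome below dipLogR (ploidy filter)
    qc$frac_balanced_odd_tcn         # balanced clonal segments with odd total copy number
    qc$mafr_median_clonal            # median allelic imbalance of clonal segments
    qc$n_het_snps_hom_in_tumor_5pct  # het SNPs that appear homozygous in the tumor
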
31 changes: 15 additions & 16 deletions R/plot-facets.R
@@ -65,8 +65,8 @@ cnlr_plot = function(facets_data,

ymin = floor(min(segs$cnlr.median, na.rm = T))
ymax = ceiling(max(segs$cnlr.median, na.rm = T))
if (ymin > -3) ymin = -3
if (ymax < 3) ymax = 3
if (ymin > -2) ymin = -2
if (ymax < 2) ymax = 2

if (adjust_dipLogR) {
snps$cnlr = snps$cnlr - dipLogR
@@ -240,9 +240,14 @@ cf_plot = function(facets_data,

starts = cumsum(c(1, segs$num.mark))[seq_along(segs$num.mark)]
ends = cumsum(c(segs$num.mark))
my_starts = snps[starts, 'chr_maploc']
my_ends = snps[ends, 'chr_maploc']

my_starts = snps[starts, 'chr_maploc']
my_ends = snps[ends, 'chr_maploc']

cf = ggplot(segs) +
geom_rect(aes(xmin = starts, xmax = ends, ymax = 1, ymin = 0),
geom_rect(aes(xmin = my_starts, xmax = my_ends, ymax = 1, ymin = 0),
fill = cols, col = 'white', size = 0) +
scale_x_continuous(breaks = mid, labels = names(mid), expand = c(.01, 0)) +
scale_y_continuous(expand = c(0, 0)) +
@@ -277,8 +282,8 @@ icn_plot = function(facets_data,
genome = match.arg(genome, c('hg19', 'hg18', 'hg38'), several.ok = FALSE)
method = match.arg(method, c('em', 'cncf'), several.ok = FALSE)

snps = facets_data$snps
segs = facets_data$segs
snps = facets_data$snps %>% data.table
segs = facets_data$segs %>% data.table
if (!plotX) {
snps = subset(snps, chrom < 23)
segs = subset(segs, chrom < 23)
@@ -310,15 +315,19 @@ icn_plot = function(facets_data,
my_tcn_ends = snps[ends, c('chr_maploc', 'tcn')]
my_lcn_starts = snps[starts, c('chr_maploc', 'lcn')]
my_lcn_ends = snps[ends, c('chr_maploc', 'lcn')]

axis_breaks = c(0:5, 5 + (sort(unique(segs[tcn >5, tcn])) - 5)/3)
axis_labels = c(0:5, sort(unique(segs[tcn >5, tcn])))

icn = ggplot(segs) +
geom_segment(col = 'red', size = 1,
aes(x = my_lcn_starts$chr_maploc, xend = my_lcn_ends$chr_maploc,
y = my_lcn_starts$lcn, yend = my_lcn_ends$lcn)) +
geom_segment(col = 'black', size = 1,
geom_segment(col = 'black', size = 0.8,
aes(x = my_tcn_starts$chr_maploc, xend = my_tcn_ends$chr_maploc,
y = my_tcn_starts$tcn, yend = my_tcn_ends$tcn)) +
# scale_y_continuous(breaks=c(0:5, 5 + seq_len(35) / 3), labels = 0:40, limits = c(0, NA)) +
scale_y_continuous(breaks=axis_breaks, labels=axis_labels) +
scale_x_continuous(breaks = mid, labels = names(mid), expand = c(.01, 0)) +
labs(x = NULL, y = my_ylab) +
theme_bw() +
Expand All @@ -330,16 +339,6 @@ icn_plot = function(facets_data,
panel.grid.minor.y = element_blank(),
plot.margin = unit(c(0, 1, 0, 0), 'lines'))

max_tcn = max(segs$tcn, na.rm = T)
if (max_tcn > 10) {
top = 5*round(43/5)
icn = icn +
scale_y_continuous(breaks = c(0:10, seq(15, 45, 5)), labels = c(0:10, seq(15, 45, 5)), limits = c(0, max_tcn))
} else {
icn = icn +
scale_y_continuous(breaks = 0:max_tcn, labels = 0:max_tcn, limits = c(0, max_tcn))
}

if (!is.null(highlight_gene)) {
if (is.character(highlight_gene)) {
highlight_gene = get_gene_position(gene_pos)
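
The rewritten `icn_plot()` replaces the old fixed break scheme with a compressed y-axis: copy numbers up to 5 plot linearly and anything above 5 is drawn at `5 + (tcn - 5)/3`, so a focal high-level amplification no longer flattens the rest of the profile. A small standalone illustration of the break construction (example values only):

    tcn_values = c(0:5, 8, 12, 30)                  # example total copy numbers
    high = sort(unique(tcn_values[tcn_values > 5]))
    axis_breaks = c(0:5, 5 + (high - 5) / 3)        # where each tcn is drawn
    axis_labels = c(0:5, high)                      # what the tick label says
    rbind(axis_breaks, axis_labels)
    # e.g. tcn = 30 is plotted at y = 5 + 25/3, roughly 13.3, rather than 30
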
5 changes: 3 additions & 2 deletions R/read-snp-matrix.R
@@ -45,7 +45,7 @@

read_snp_matrix_facets2n = function(filename, err.thresh=Inf, del.thresh=Inf, perl.pileup=FALSE,
MandUnormal=FALSE, spanT=0.2, spanA=0.2, spanX=0.2, gbuild="hg19",
ReferencePileupFile=NULL, ReferenceLoessFile=NULL, MinOverlap=0.9, useMatchedX=FALSE){
ReferencePileupFile=NULL, ReferenceLoessFile=NULL, MinOverlap=0.9, useMatchedX=FALSE, refX=FALSE){
#' #' Read in the snp-pileup generated SNP read count matrix file for facets2n processing
#' @importFrom utils read.csv
#' @param filename counts file from snp-pileup
@@ -62,13 +62,14 @@ read_snp_matrix_facets2n = function(filename, err.thresh=Inf, del.thresh=Inf, pe
#' @param ReferenceLoessFile (character) Filepath to an optional loess data, generated using the facets2n package, of one or more reference normals. The number of normals in this data should match that in the ReferencePileupFile, and should be in the same order.
#' @param MinOverlap (numeric) Mininum overlap fraction of loci between a tumor pileup and reference pileup data.
#' @param useMatchedX (logical) Is the matched normal to be used for ChrX normalization?
#' @param refX (logical) Use sex matched reference normal for chrX normalization
#' @return A dataframe of pileup depth values for Tumor and Matched Normal if MandUnormal is FALSE. Else, a list of data frame with pileup depth values of Tumor, matched Normal, and a best unmatched normal, and the associated span values.
#' @source \code{snp-pileup} is part of \href{www.github.com/mskcc/facets}{FACETS}.
#' @import facets2n
#' @importFrom pctGCdata getGCpct
#' @export
if (MandUnormal){
facets2n::readSnpMatrix(filename, MandUnormal = MandUnormal, ReferencePileupFile = ReferencePileupFile, ReferenceLoessFile = ReferenceLoessFile, useMatchedX = useMatchedX)
facets2n::readSnpMatrix(filename, MandUnormal = MandUnormal, ReferencePileupFile = ReferencePileupFile, ReferenceLoessFile = ReferenceLoessFile, useMatchedX = useMatchedX, refX = refX)
}else{
facets2n::readSnpMatrix(filename, MandUnormal = MandUnormal)
}
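
With the new `refX` flag the wrapper can ask facets2n to normalize chrX against a sex-matched reference normal rather than the matched normal. A hedged usage sketch based on the signature above; the file paths are placeholders, not files from this repository.

    counts = read_snp_matrix_facets2n(
        filename            = 'tumor_normal.snp_pileup.gz',   # placeholder path
        MandUnormal         = TRUE,
        ReferencePileupFile = 'reference_normals.pileup.gz',  # placeholder path
        ReferenceLoessFile  = 'reference_normals.loess.txt',  # placeholder path
        useMatchedX         = FALSE,
        refX                = TRUE)  # sex-matched reference normal for chrX
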
22 changes: 20 additions & 2 deletions R/run-facets.R
@@ -37,7 +37,6 @@
#' }
#'
#' @import facets
# @import pctGCdata
#' @importFrom pctGCdata getGCpct
#' @import facets2n

@@ -91,7 +90,7 @@ run_facets = function(read_counts,
gbuild = genome, hetscale = TRUE, unmatched = FALSE, ndepthmax = 5000)
}
out = facets2n::procSample(dat, cval = cval, min.nhet = min_nhet, dipLogR = dipLogR)
fit = facets::emcncf(out, min.nhet = min_nhet)
fit = facets2n::emcncf(out, min.nhet = min_nhet)

}else{
dat = facets::preProcSample(read_counts, ndepth = ndepth, het.thresh = 0.25, snp.nbhd = snp_nbhd, cval = 25,
@@ -223,4 +222,23 @@ create_legacy_output <- function(facets_output, directory, sample_id, counts_fil
out_txt = paste0(out_txt, '# ', run$flags,'\n\n')

cat(out_txt, file=paste0(directory, '/', sample_id, '.out'))

#try default facets2n plotting, continue if no facets2n lib
tryCatch({
outfile = paste0(output_prefix, "_legacy.tiff")
plot_title = paste0(basename(output_prefix),
' | cval=', cval,
' | purity_cval=', purity_cval,
' | purity=', round(fit$purity, 2),
' | ploidy=', round(fit$ploidy, 2),
' | dipLogR=', round(fit$dipLogR, 2))
message(plot_title)
tiff(file = outfile, width = 6.5, height = 8, res = 300,units = 'in')
facets2n::plotSample(x = out, emfit = fit, plot.type = "both", sname = plot_title)
dev.off()
},
error= function(cond) {message(cond); return(NA)},
warning=function(cond) {message(cond); return(NA)},
finally = NULL
)
}
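
`create_legacy_output()` now also attempts a facets2n `plotSample()` TIFF, and the `tryCatch()` lets the text output survive if the facets2n library or the plot call fails. A heavily hedged usage sketch follows; the signature shown above is truncated, so additional required arguments may exist, and all values here are placeholders.

    create_legacy_output(facets_output = facets_output,
                         directory     = 'legacy_out',                    # placeholder
                         sample_id     = 'Tumor_Sample1',                 # placeholder
                         counts_file   = 'tumor_normal.snp_pileup.gz')    # placeholder
    # Expected side effects per the code above: a <sample_id>.out text file plus,
    # when facets2n is available, a *_legacy.tiff copy-number plot.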

