diff --git a/DESCRIPTION b/DESCRIPTION index 80ebb46..732539e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: facetsSuite Type: Package Title: Functions to run and parse output from FACETS -Version: 2.0.0 +Version: 2.0.4 Authors@R: person(given = "Philip", family = "Jonsson", role = c("aut", "cre"), @@ -9,11 +9,11 @@ Authors@R: person(given = "Philip", URL: https://github.com/mskcc/facets-suite Description: This package provides functions that wrap the FACETS package for allele-specific copy-number analysis as well as functions to perform post-hoc analyses thereof. License: MIT + file LICENSE -RoxygenNote: 6.1.1 +RoxygenNote: 7.0.2 LazyData: true Encoding: UTF-8 Depends: - R (>= 2.10) + R (>= 3.4.0), pctGCdata (>= 0.3.0) Imports: data.table (>= 1.11.8), dplyr (>= 0.8.3), @@ -25,9 +25,11 @@ Imports: egg (>= 0.4.2), scales (>= 1.0.0), argparse (>= 1.1.1), - tibble (>= 2.1.3) + tibble (>= 2.1.3), + facets (>= 0.5.6), + facets2n (>= 0.2.2) NeedsCompilation: no -Packaged: 2019-02-07 16:38:38 UTC; jonssonp +Packaged: 2020-03-01 21:56:37 UTC; rptashkin Suggests: covr (>= 3.2.0), testthat (>= 2.1.0) diff --git a/NAMESPACE b/NAMESPACE index bdab53a..4a4af5f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -12,16 +12,19 @@ export(cf_plot) export(check_fit) export(closeup_plot) export(cnlr_plot) +export(create_legacy_output) export(format_igv_seg) export(gene_level_changes) export(icn_plot) +export(load_facets_output) export(plot_segmentation) export(read_snp_matrix) +export(read_snp_matrix_facets2n) export(run_facets) -export(create_legacy_output) -export(load_facets_output) export(valor_plot) import(data.table) +import(facets) +import(facets2n) import(ggplot2) importFrom(data.table,":=") importFrom(data.table,foverlaps) @@ -45,6 +48,7 @@ importFrom(dplyr,summarize) importFrom(dplyr,ungroup) importFrom(egg,ggarrange) importFrom(grDevices,colorRampPalette) +importFrom(pctGCdata,getGCpct) importFrom(plyr,mapvalues) importFrom(purrr,discard) importFrom(purrr,map_chr) @@ -55,3 +59,4 @@ importFrom(scales,pretty_breaks) importFrom(stats,dbinom) importFrom(tidyr,gather) importFrom(tidyr,separate_rows) +importFrom(utils,read.csv) diff --git a/R/ccf-annotate-maf.R b/R/ccf-annotate-maf.R index 43e9316..c9cd13d 100644 --- a/R/ccf-annotate-maf.R +++ b/R/ccf-annotate-maf.R @@ -61,10 +61,10 @@ ccf_annotate_maf = function(maf, maf[, c('ccf_Mcopies', 'ccf_Mcopies_lower', 'ccf_Mcopies_upper', 'ccf_Mcopies_prob95', 'ccf_Mcopies_prob90') := estimate_ccf(purity, tcn, tcn - lcn, t_alt_count, t_depth), by = seq_len(nrow(maf))] maf[, c('ccf_1copy', 'ccf_1copy_lower', 'ccf_1copy_upper', 'ccf_1copy_prob95', 'ccf_1copy_prob90') := - estimate_ccf(purity, tcn, tcn - lcn, t_alt_count, t_depth), by = seq_len(nrow(maf))] + estimate_ccf(purity, tcn, 1, t_alt_count, t_depth), by = seq_len(nrow(maf))] maf[, c('ccf_expected_copies', 'ccf_expected_copies_lower', 'ccf_expected_copies_upper', 'ccf_expected_copies_prob95', 'ccf_expected_copies_prob90') := - estimate_ccf(purity, tcn, tcn - lcn, t_alt_count, t_depth), by = seq_len(nrow(maf))] + estimate_ccf(purity, tcn, expected_alt_copies, t_alt_count, t_depth), by = seq_len(nrow(maf))] as.data.frame(maf) } diff --git a/R/check-fit.R b/R/check-fit.R index 7bb5677..2aec684 100644 --- a/R/check-fit.R +++ b/R/check-fit.R @@ -54,7 +54,7 @@ check_fit = function(facets_output, dipLogR = facets_output$dipLogR alballogr = as.numeric(facets_output$alballogr[, 1]) purity = facets_output$purity - mafr_thresh = facets_output$mafr_thresh + fcna_output = calculate_fraction_cna(segs, facets_output$ploidy, genome, algorithm) wgd = fcna_output$genome_doubled @@ -66,9 +66,9 @@ check_fit = function(facets_output, setkey(segs, chrom, start, end) snps = as.data.table(snps) setkey(snps, chrom, maploc) - + # Label clonal segments - segs[, clonal := cf >= purity - .1] + segs[, clonal := cf >= (purity * 0.8)] # Subset on autosomes auto_segs = segs[chrom < 23] @@ -92,6 +92,34 @@ check_fit = function(facets_output, n_dip_imbal_segs = nrow(dip_imbal_segs) frac_dip_imbal_segs = sum(dip_imbal_segs$length)/sum(auto_segs$length) + ############################# + ### Other metrics + ############################# + ## fraction genome unaltered (2-1) -- too high --> low purity? + segs_unaltered = auto_segs[tcn == 2 & lcn == 1, ] + n_segs_unaltered = nrow(segs_unaltered) + frac_genome_unaltered = sum(segs_unaltered$length)/sum(auto_segs$length) + + segs_below_dipLogR = auto_segs[cnlr.median.clust < dipLogR] # part of ploidy filter + n_segs_below_dipLogR = nrow(segs_below_dipLogR) # part of ploidy filter + frac_below_dipLogR = sum(segs_below_dipLogR$length)/sum(auto_segs$length) + + segs_balanced_odd_tcn = auto_segs[mafR <= 0.025 & (tcn %% 2) != 0 & clonal ==T & tcn < 8,] + n_segs_balanced_odd_tcn = nrow(segs_balanced_odd_tcn) + frac_balanced_odd_tcn = sum(segs_balanced_odd_tcn$length)/sum(auto_segs$length) + + segs_imbalanced_diploid_cn = auto_segs[clonal==T & mafR > 0.1 & !is.na(lcn) & lcn != 0 & (as.double(tcn) / lcn) == 2, ] + n_segs_imbalanced_diploid_cn = nrow(segs_imbalanced_diploid_cn) + frac_imbalanced_diploid_cn = sum(segs_imbalanced_diploid_cn$length)/sum(auto_segs$length) + + segs_lcn_greater_mcn = auto_segs[clonal==T & lcn < mcn,] + n_segs_lcn_greater_mcn = nrow(segs_lcn_greater_mcn) + frac_lcn_greater_mcn = sum(segs_lcn_greater_mcn$length)/sum(auto_segs$length) + + mafr_median_all = median(auto_segs$mafR, na.rm = T) + mafr_median_clonal = median(auto_segs[clonal==T, ]$mafR, na.rm = T) + mafr_n_gt_1 = nrow(auto_segs[mafR > 1, ]) + # Number of high-level amplifications and homozygous deletions # Clonal homdels, how much of the genome do they represent n_amps = nrow(segs[tcn >= 10]) @@ -110,28 +138,32 @@ check_fit = function(facets_output, n_cnlr_clusters = length(unique(segs$cnlr.median.clust)) # Number of segments with lcn==NA - n_lcn_na = nrow(segs[is.na(lcn)]) + n_lcn_na = nrow(auto_segs[is.na(lcn)]) + frac_lcn_na = sum(auto_segs[is.na(lcn)]$length)/sum(auto_segs$length) # Number of segments with LOH loh_segs = auto_segs[lcn == 0,] n_loh = nrow(loh_segs) frac_loh = sum(loh_segs$length)/sum(auto_segs$length) - # Check fraction of subclonal events - subclonal_segs = segs[clonal == F, ] + # Check fraction of subclonal events within autosomal chromosomes + subclonal_segs = auto_segs[clonal == F, ] n_segs_subclonal = nrow(subclonal_segs) - frac_segs_subclonal = sum(subclonal_segs$length)/sum(segs$length) - + frac_segs_subclonal = sum(subclonal_segs$length)/sum(auto_segs$length) + # Compile SNP statistics n_snps = nrow(snps) het_snps = snps[het == 1] n_het_snps = nrow(het_snps) frac_het_snps = n_het_snps/n_snps - n_het_snps_hom_in_tumor_1pct = nrow(het_snps[ vafT < 0.01 | vafT > 0.99, ]) - n_het_snps_hom_in_tumor_5pct = nrow(het_snps[ vafT < 0.05 | vafT > 0.95, ]) + + n_snps_with_300x_in_tumor = nrow(snps[rCountT > 300, ]) + n_het_snps_with_300x_in_tumor = nrow(het_snps[rCountT > 300, ]) + n_het_snps_hom_in_tumor_1pct = nrow(het_snps[ (vafT < 0.01 | vafT > 0.99), ]) + n_het_snps_hom_in_tumor_5pct = nrow(het_snps[ (rCountN > 35 & rCountT > 35) & (vafT < 0.05 | vafT > 0.95), ]) frac_het_snps_hom_in_tumor_1pct = n_het_snps_hom_in_tumor_1pct/n_het_snps frac_het_snps_hom_in_tumor_5pct = n_het_snps_hom_in_tumor_5pct/n_het_snps - + # Check the mean/standard deviation of the cnlr snps[, cnlr_residual := cnlr - median(cnlr), by = seg] mean_cnlr_residual = mean(snps$cnlr_residual) @@ -191,9 +223,19 @@ check_fit = function(facets_output, frac_loh = frac_loh, n_segs_subclonal = n_segs_subclonal, frac_segs_subclonal = frac_segs_subclonal, + n_segs_below_dipLogR = n_segs_below_dipLogR, + frac_below_dipLogR = frac_below_dipLogR, + n_segs_balanced_odd_tcn = n_segs_balanced_odd_tcn, + frac_balanced_odd_tcn = frac_balanced_odd_tcn, + n_segs_imbalanced_diploid_cn = n_segs_imbalanced_diploid_cn, + frac_imbalanced_diploid_cn = frac_imbalanced_diploid_cn, + n_segs_lcn_greater_mcn = n_segs_lcn_greater_mcn, + frac_lcn_greater_mcn = frac_lcn_greater_mcn, n_snps = n_snps, n_het_snps = n_het_snps, frac_het_snps = frac_het_snps, + n_snps_with_300x_in_tumor = n_snps_with_300x_in_tumor, + n_het_snps_with_300x_in_tumor = n_het_snps_with_300x_in_tumor, n_het_snps_hom_in_tumor_1pct = n_het_snps_hom_in_tumor_1pct, n_het_snps_hom_in_tumor_5pct = n_het_snps_hom_in_tumor_5pct, frac_het_snps_hom_in_tumor_1pct = frac_het_snps_hom_in_tumor_1pct, @@ -207,9 +249,14 @@ check_fit = function(facets_output, n_segs_discordant_both = discordant_stats$n_discordant_both, frac_discordant_both = discordant_stats$length_discordant_both/evaluable_length$both, n_segs_icn_cnlor_discordant = n_icn_cnlor_discordant, - frac_icn_cnlor_discordant = frac_icn_cnlor_discordant + frac_icn_cnlor_discordant = frac_icn_cnlor_discordant, + mafr_median_all = mafr_median_all, + mafr_median_clonal = mafr_median_clonal, + mafr_n_gt_1 = mafr_n_gt_1 ) + + # If input MAF is provided add some stats based on this if (!is.null(maf)) { maf = as.data.table(maf) @@ -217,12 +264,12 @@ check_fit = function(facets_output, maf[, `:=` ( Chromosome = ifelse(Chromosome == 'X', 23, Chromosome), t_var_freq = t_alt_count/(t_alt_count+t_ref_count) - )][, Chromosome := as.integer(Chromosome)] + )][, Chromosome := as.integer(Chromosome)] setkey(maf, Chromosome, Start_Position, End_Position) maf = foverlaps(maf, segs, mult = 'first', nomatch = NA, by.x = c('Chromosome', 'Start_Position', 'End_Position'), by.y = c('chrom', 'start', 'end')) - + # Median mutation VAF at clonal 2:1 segments output$dip_median_vaf = maf[clonal == TRUE & mcn == 1 & lcn == 1][, median(t_var_freq)] diff --git a/R/plot-facets.R b/R/plot-facets.R index e327208..37064e9 100644 --- a/R/plot-facets.R +++ b/R/plot-facets.R @@ -65,8 +65,8 @@ cnlr_plot = function(facets_data, ymin = floor(min(segs$cnlr.median, na.rm = T)) ymax = ceiling(max(segs$cnlr.median, na.rm = T)) - if (ymin > -3) ymin = -3 - if (ymax < 3) ymax = 3 + if (ymin > -2) ymin = -2 + if (ymax < 2) ymax = 2 if (adjust_dipLogR) { snps$cnlr = snps$cnlr - dipLogR @@ -240,9 +240,11 @@ cf_plot = function(facets_data, starts = cumsum(c(1, segs$num.mark))[seq_along(segs$num.mark)] ends = cumsum(c(segs$num.mark)) + my_starts = snps[starts, 'chr_maploc'] + my_ends = snps[ends, 'chr_maploc'] cf = ggplot(segs) + - geom_rect(aes(xmin = starts, xmax = ends, ymax = 1, ymin = 0), + geom_rect(aes(xmin = my_starts, xmax = my_ends, ymax = 1, ymin = 0), fill = cols, col = 'white', size = 0) + scale_x_continuous(breaks = mid, labels = names(mid), expand = c(.01, 0)) + scale_y_continuous(expand = c(0, 0)) + @@ -315,7 +317,7 @@ icn_plot = function(facets_data, geom_segment(col = 'red', size = 1, aes(x = my_lcn_starts$chr_maploc, xend = my_lcn_ends$chr_maploc, y = my_lcn_starts$lcn, yend = my_lcn_ends$lcn)) + - geom_segment(col = 'black', size = 1, + geom_segment(col = 'black', size = 0.8, aes(x = my_tcn_starts$chr_maploc, xend = my_tcn_ends$chr_maploc, y = my_tcn_starts$tcn, yend = my_tcn_ends$tcn)) + # scale_y_continuous(breaks=c(0:5, 5 + seq_len(35) / 3), labels = 0:40, limits = c(0, NA)) + diff --git a/R/read-snp-matrix.R b/R/read-snp-matrix.R index 6bb74a5..cf1bcf4 100644 --- a/R/read-snp-matrix.R +++ b/R/read-snp-matrix.R @@ -1,28 +1,26 @@ -#' Read counts file -#' -#' Reads a gzipped output file from \code{snp-pileup}. -#' -#' @param input_file Path to input file. -#' @param err.thresh Threshold for errors at locus. -#' @param del.thresh Threshold for deletions at locus. -#' -#' @source \code{snp-pileup} is part of \href{www.github.com/mskcc/facets}{FACETS}. -#' -#' @return A SNP count matrix, with the following columns: -#' \itemize{ -#' \item{\code{Chromosome} and \code{Position}:} {Chromosomal position of SNP.} -#' \item{\code{NOR.DP}:} {Total read depth in normal sample.} -#' \item{\code{TUM.DP}:} {Total read depth in tumor sample.} -#' \item{\code{NOR.RD}:} {Reference allele read depth in normal sample.} -#' \item{\code{TUM.RD}:} {Reference allele read depth in tumor sample.} -#' } -#' -#' @import data.table -#' @export read_snp_matrix = function(input_file, err.thresh = 10, del.thresh = 10) { + #' Reads a gzipped output file from \code{snp-pileup}. + #' + #' @param input_file Path to input file. + #' @param err.thresh Threshold for errors at locus. + #' @param del.thresh Threshold for deletions at locus. + #' + #' @source \code{snp-pileup} is part of \href{www.github.com/mskcc/facets}{FACETS}. + #' + #' @return A SNP count matrix, with the following columns: + #' \itemize{ + #' \item{\code{Chromosome} and \code{Position}:} {Chromosomal position of SNP.} + #' \item{\code{NOR.DP}:} {Total read depth in normal sample.} + #' \item{\code{TUM.DP}:} {Total read depth in tumor sample.} + #' \item{\code{NOR.RD}:} {Reference allele read depth in normal sample.} + #' \item{\code{TUM.RD}:} {Reference allele read depth in tumor sample.} + #' } + #' + #' @import data.table + #' @export read_counts = data.table::fread(cmd = paste('gunzip -c', input_file), key = c('Chromosome', 'Position')) @@ -44,3 +42,34 @@ read_snp_matrix = function(input_file, read_counts[order(Chromosome, Position)][, list(Chromosome, Position, NOR.DP, TUM.DP, NOR.RD, TUM.RD)] } + +read_snp_matrix_facets2n = function(filename, err.thresh=Inf, del.thresh=Inf, perl.pileup=FALSE, + MandUnormal=FALSE, spanT=0.2, spanA=0.2, spanX=0.2, gbuild="hg19", + ReferencePileupFile=NULL, ReferenceLoessFile=NULL, MinOverlap=0.9, useMatchedX=FALSE){ + #' #' Read in the snp-pileup generated SNP read count matrix file for facets2n processing + #' @importFrom utils read.csv + #' @param filename counts file from snp-pileup + #' @param skip (character) Skip n number of lines in the input file. + #' @param err.thresh (numeric) Error threshold to be used to filter snp-pileup data frame. + #' @param del.thresh (numeric) Deletion threshold to be used to filter snp-pileup data frame. + #' @param perl.pileup (logical) Is the pileup data generated using perl pileup tool? + #' @param MandUnormal (logical) Is CNLR analysis to be peformed using unmatched reference normals? + #' @param spanT (numeric) Default span value to be used for loess normalization in tumor sample. + #' @param spanA (numeric) Default span value to be used for loess normalization across autosomal chromosomes in the normal sample. + #' @param spanX (numeric) Default span value to be used for loess normalization in Chr X in the normal sample. + #' @param gbuild (character) Genome build (Default: hg19). + #' @param ReferencePileupFile (character) Filepath to an optional snp-pileup generated pileup data of one or more reference normals. + #' @param ReferenceLoessFile (character) Filepath to an optional loess data, generated using the facets2n package, of one or more reference normals. The number of normals in this data should match that in the ReferencePileupFile, and should be in the same order. + #' @param MinOverlap (numeric) Mininum overlap fraction of loci between a tumor pileup and reference pileup data. + #' @param useMatchedX (logical) Is the matched normal to be used for ChrX normalization? + #' @return A dataframe of pileup depth values for Tumor and Matched Normal if MandUnormal is FALSE. Else, a list of data frame with pileup depth values of Tumor, matched Normal, and a best unmatched normal, and the associated span values. + #' @source \code{snp-pileup} is part of \href{www.github.com/mskcc/facets}{FACETS}. + #' @import facets2n + #' @importFrom pctGCdata getGCpct + #' @export + if (MandUnormal){ + facets2n::readSnpMatrix(filename, MandUnormal = MandUnormal, ReferencePileupFile = ReferencePileupFile, ReferenceLoessFile = ReferenceLoessFile, useMatchedX = useMatchedX) + }else{ + facets2n::readSnpMatrix(filename, MandUnormal = MandUnormal) + } +} diff --git a/R/run-facets.R b/R/run-facets.R index 7800047..40ebf3a 100644 --- a/R/run-facets.R +++ b/R/run-facets.R @@ -10,6 +10,12 @@ #' @param min_nhet Minimum number of heterozygous SNPs on segment required for clustering, see `facets` help. #' @param genome Genome build. #' @param seed Seed value for random number generation, set to enable full reproducibility. +#' @param facets_lib_path path to facets R library +#' @param facets2n_lib_path path to facets2n R library +#' @param referencePileup facets2n option: Filepath to an optional snp-pileup generated pileup data of one or more reference normals. +#' @param referenceLoess facets2n option: Filepath to an optional loess data, generated using the facets2n package, of one or more reference normals. The number of normals in this data should match that in the ReferencePileupFile, and should be in the same order. +#' @param MandUnormal (logical) facets2n option: Is CNLR analysis to be peformed using unmatched reference normals? +#' @param useMatchedX (logical) facets2n option: Force matched normal to be used for ChrX normalization? #' #' @return A list object containing the following items. See \href{www.github.com/mskcc/facets}{FACETS documentation} for more details: #' \itemize{ @@ -31,41 +37,68 @@ #' } #' #' @import facets -#' @import pctGCdata +#' @importFrom pctGCdata getGCpct +#' @import facets2n #' @export run_facets run_facets = function(read_counts, cval = 100, dipLogR = NULL, - ndepth = 35, + ndepth = 25, snp_nbhd = 250, - min_nhet = 15, + min_nhet = 10, genome = c('hg18', 'hg19', 'hg38', 'mm9', 'mm10'), seed = 100, - facets_lib_path = '') { + facets_lib_path = '', + facets2n_lib_path = '', + MandUnormal = FALSE, + useMatchedX = FALSE, + referencePileup=NULL, + referenceLoess=NULL) { - if ( facets_lib_path != '' ) { - library(facets, lib.loc = facets_lib_path) - } else { - library(facets) + if (facets_lib_path == '' & facets2n_lib_path == ''){ + stop('path to facets library is missing, must supply path to either facets or facets2n', call. = FALSE) } - print(paste0('loaded facets version: ', packageVersion('facets'))) - - # Check input - missing_cols = setdiff(c('Chromosome', 'Position', 'NOR.DP', 'TUM.DP', 'NOR.RD', 'TUM.RD'), names(read_counts)) - if (length(missing_cols) > 0) { - stop(paste0('Input missing column(s)', paste(missing_cols, collapse = ', '), '.'), call. = FALSE) + if (facets2n_lib_path != ''){ + library(facets2n, lib.loc = facets2n_lib_path) + print(paste0('loaded facets2n version: ', packageVersion('facets2n'))) + }else{ + library(facets, lib.loc = facets_lib_path) + print(paste0('loaded facets version: ', packageVersion('facets'))) + # Check input + missing_cols = setdiff(c('Chromosome', 'Position', 'NOR.DP', 'TUM.DP', 'NOR.RD', 'TUM.RD'), names(read_counts)) + if (length(missing_cols) > 0) { + stop(paste0('Input missing column(s)', paste(missing_cols, collapse = ', '), '.'), call. = FALSE) + } + } - + set.seed(seed) genome = match.arg(genome) # Run FACETS algorithm - dat = facets::preProcSample(read_counts, ndepth = ndepth, het.thresh = 0.25, snp.nbhd = snp_nbhd, cval = 25, - gbuild = genome, hetscale = TRUE, unmatched = FALSE, ndepthmax = 1000) - out = facets::procSample(dat, cval = cval, min.nhet = min_nhet, dipLogR = dipLogR) - fit = facets::emcncf(out) - + dat = NULL + out = NULL + fit = NULL + if (facets2n_lib_path != ''){ + if (MandUnormal){ + dat = facets2n::preProcSample(read_counts$rcmat, ndepth = ndepth, het.thresh = 0.25, snp.nbhd = snp_nbhd, cval = 25, + gbuild = genome, hetscale = TRUE, unmatched = FALSE, ndepthmax = 5000, + spanT = read_counts$spanT, spanA = read_counts$spanA, spanX = read_counts$spanX, MandUnormal = MandUnormal) + }else{ + dat = facets2n::preProcSample(read_counts, ndepth = ndepth, het.thresh = 0.25, snp.nbhd = snp_nbhd, cval = 25, + gbuild = genome, hetscale = TRUE, unmatched = FALSE, ndepthmax = 5000) + } + out = facets2n::procSample(dat, cval = cval, min.nhet = min_nhet, dipLogR = dipLogR) + fit = facets2n::emcncf(out, min.nhet = min_nhet) + + }else{ + dat = facets::preProcSample(read_counts, ndepth = ndepth, het.thresh = 0.25, snp.nbhd = snp_nbhd, cval = 25, + gbuild = genome, hetscale = TRUE, unmatched = FALSE, ndepthmax = 5000) + out = facets::procSample(dat, cval = cval, min.nhet = min_nhet, dipLogR = dipLogR) + fit = facets::emcncf(out) + } + # Fix bad NAs fit$cncf = cbind(fit$cncf, cf = out$out$cf, tcn = out$out$tcn, lcn = out$out$lcn) fit$cncf$lcn[fit$cncf$tcn == 1] = 0 @@ -189,4 +222,23 @@ create_legacy_output <- function(facets_output, directory, sample_id, counts_fil out_txt = paste0(out_txt, '# ', run$flags,'\n\n') cat(out_txt, file=paste0(directory, '/', sample_id, '.out')) + + #try default facets2n plotting, continue if no facets2n lib + tryCatch({ + outfile = paste0(output_prefix, "_legacy.tiff") + plot_title = paste0(basename(output_prefix), + ' | cval=', cval, + ' | purity_cval=', purity_cval, + ' | purity=', round(fit$purity, 2), + ' | ploidy=', round(fit$ploidy, 2), + ' | dipLogR=', round(fit$dipLogR, 2)) + message(plot_title) + tiff(file = outfile, width = 6.5, height = 8, res = 300,units = 'in') + facets2n::plotSample(x = out, emfit = fit, plot.type = "both", sname = plot_title) + dev.off() + }, + error= function(cond) {message(cond); return(NA)}, + warning=function(cond) {message(cond); return(NA)}, + finally = NULL + ) } diff --git a/README.md b/README.md index ee81eb8..f1d6b8f 100644 --- a/README.md +++ b/README.md @@ -11,11 +11,16 @@ facetsSuite is an R package with functions to run [FACETS](https://github.com/ms You can install facetsSuite in R from this repository with: +First, follow the [instructions for installing FACETS](https://github.com/mskcc/facets). + +FACETS2N (https://github.com/rptashkin/facets2n) can be installed as: ``` r -devtools::install_github("taylor-lab/facets-suite") +devtools::install_github("rptashkin/facets2n", ref= "master") ``` -Also follow the [instructions for installing FACETS](https://github.com/mskcc/facets). +``` r +devtools::install_github("taylor-lab/facets-suite", ref = "feature/facets2N") +``` _Note: For the wrapper script `snp-pileup-wrapper.R` you need to specify the variable `snp_pileup_path` in the script to point to the installation path of snp-pileup _**or**_ set the environment variable SNP_PILEUP. Alternatively, the [docker](README.md#run-wrappers-from-container) image contains the executable._ @@ -54,20 +59,27 @@ Most use of this package can be done from the command line using three wrapper s --normal-bam normal.bam \ --tumor-bam tumor.bam \ --output-prefix + --unmatched-normal-BAMS ``` The input VCF file should contain polymorphic SNPs, so that FACETS can infer changes in allelic configuration at genomic loci from changes in allele ratios. [dbSNP](https://www.ncbi.nlm.nih.gov/snp/) is a good source for this. By default, `snp-pileup` also estimates the read depth in the input BAM files every 50th base. - `run-facets-wrapper.R`:\ - This wrapper takes above SNP "pileup" as input and executes the FACETS algorithm. The ouputs are in the form of Rdata objects, TXT files, and PNGs of the samples overall copy-number profile. The wrapper allows for running FACETS in a two-pass mode, where first a "purity" run estimates the overall segmentation profile, sample purity and ploidy, and subsequently the dipLogR value from this run seeds a "high-sensitivity" run which may detect more focal events. To run in the two-pass mode, specify the input arguments prefixed by `purity`. The cval (`--purity-cval` and `--cval`) parameters tune the segmentation coarseness.\ + This wrapper takes above SNP "pileup" as input and executes the FACETS algorithm. The ouputs are in the form of Rdata objects, TXT files, and PNGs of the samples overall copy-number profile. The wrapper allows for running FACETS in a two-pass mode, where first a "purity" run estimates the overall segmentation profile, sample purity and ploidy, and subsequently the dipLogR value from this run seeds a "high-sensitivity" run which may detect more focal events. To run in the two-pass mode, specify the input arguments prefixed by `purity`. The cval (`--purity-cval` and `--cval`) parameters tune the segmentation coarseness. One of --facets2n-lib-path or --facets-lib-path is required to run with either facets2n or facets.\ Example command: ```shell run-facets-wrapper.R \ --counts-file tumor_normal.snp_pileup.gz \ --sample-id tumorID__normalID \ - --purity-cval 1000 --cval 500 \ - --everything + --purity-cval 150 --cval 50 \ + --everything \ + --purity-min-nhet 10 \ + --legacy-output \ + --facets2n-lib-path (optional) \ + --facets-lib-path (optional) \ + --reference-snp-pileup (optional) \ + --reference-loess-file (optional) ``` - The above command runs FACETS in the two-pass mode, first at cval 1000, then at cval 500 based on the sample-specific baseline found at the higher cval. The full suite of analysis and QC is run with the `--everything` flag. If no output directory is specified, a directory named `sample-id` is created. + The above command runs FACETS in the two-pass mode, first at cval 150, then at cval 50 based on the sample-specific baseline found at the higher cval. The full suite of analysis and QC is run with the `--everything` flag. If no output directory is specified, a directory named `sample-id` is created. - `annotate-maf-wrapper.R`:\ This script estimates the cancer-cell fractions (CCFs) of somatic mutations using purity and ploidy estimates from FACETS. It requires a input MAF file and a mapping of sample names in the MAF file (column `Tumor_Sample_Barcode`) to FACETS output RDS files (i.e. file paths). Alternatively, it can be run in a single-sample mode by pointing direct to the RDS and providing a MAF file with only mutation calls for the given sample.\ diff --git a/man/arm_level_changes.Rd b/man/arm_level_changes.Rd index c771917..e5654c1 100644 --- a/man/arm_level_changes.Rd +++ b/man/arm_level_changes.Rd @@ -7,8 +7,12 @@ \url{https://www.ncbi.nlm.nih.gov/pubmed/29622463} } \usage{ -arm_level_changes(segs, ploidy, genome = c("hg19", "hg18", "hg38"), - algorithm = c("em", "cncf")) +arm_level_changes( + segs, + ploidy, + genome = c("hg19", "hg18", "hg38"), + algorithm = c("em", "cncf") +) } \arguments{ \item{segs}{FACETS segmentation output.} diff --git a/man/ccf_annotate_maf_legacy.Rd b/man/ccf_annotate_maf_legacy.Rd new file mode 100644 index 0000000..a83bb43 --- /dev/null +++ b/man/ccf_annotate_maf_legacy.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ccf-annotate-maf.R +\name{ccf_annotate_maf_legacy} +\alias{ccf_annotate_maf_legacy} +\title{Estimate CCFs of somatic mutations from cncf.txt file.} +\usage{ +ccf_annotate_maf_legacy( + maf, + cncf_txt_file, + purity, + algorithm = c("em", "cncf") +) +} +\arguments{ +\item{maf}{Input MAF file.} + +\item{cncf_txt_file}{.cncf.txt file created with legacy output of facets-suite.} + +\item{purity}{Sample purity estimate.} + +\item{algorithm}{Choice between FACETS \code{em} and \code{cncf} algorithm.} +} +\value{ +MAF file annotated with clonality estimates for each mutation, where the following column prefixes are used: +\itemize{ + \item{\code{ccf_Mcopies*}:} {Inferred CCF if mutation is on the major allele.} + \item{\code{ccf_1copy*}:} {Inferred CCF if mutation exists in one copy.} + \item{\code{ccf_expected_copies*}:} {Inferred CCF if mutation exists in number of copies expected from observed VAF and local ploidy.} +} +} +\description{ +Based on FACETS data, infer cancer-cell fraction (CCF) for somatic mutations in a sample. +} diff --git a/man/check_fit.Rd b/man/check_fit.Rd index 11375b6..1a44496 100644 --- a/man/check_fit.Rd +++ b/man/check_fit.Rd @@ -4,8 +4,12 @@ \alias{check_fit} \title{Sample QC} \usage{ -check_fit(facets_output, genome = c("hg19", "hg18", "hg38"), - algorithm = c("em", "cncf"), maf = NULL) +check_fit( + facets_output, + genome = c("hg19", "hg18", "hg38"), + algorithm = c("em", "cncf"), + maf = NULL +) } \arguments{ \item{facets_output}{Output from \code{run_facets}.} @@ -19,8 +23,8 @@ check_fit(facets_output, genome = c("hg19", "hg18", "hg38"), \value{ A list object with the following items: \itemize{ - \item{\code{diplogr_flag}:} {Boolean indicating extreme dipLogR value.} - \item{\code{n_alternative_diplogr}:} {Number of alternative dipLogR values.} + \item{\code{dipLogR_flag}:} {Boolean indicating extreme dipLogR value.} + \item{\code{n_alternative_dipLogR}:} {Number of alternative dipLogR values.} \item{\code{n_dip_bal_segs}, \code{frac_dip_bal_segs}:} {Number of balanced segments at dipLogR and the fraction of genome they represent.} \item{\code{n_dip_imbal_segs}, \code{frac_dip_imbal_segs}:} {Number of imbalanced segments at dipLogR and the fraction of genome they represent.} \item{\code{n_amp}:} {Number of segments at total copy number >= 10.} diff --git a/man/copy_number_scores.Rd b/man/copy_number_scores.Rd index 9867f27..ebc4732 100644 --- a/man/copy_number_scores.Rd +++ b/man/copy_number_scores.Rd @@ -12,17 +12,38 @@ \url{https://www.ncbi.nlm.nih.gov/pubmed/26015868} } \usage{ -calculate_fraction_cna(segs, ploidy, genome = c("hg19", "hg18", "hg38"), - algorithm = c("em", "cncf")) +calculate_fraction_cna( + segs, + ploidy, + genome = c("hg19", "hg18", "hg38"), + algorithm = c("em", "cncf") +) -calculate_loh(segs, snps, genome = c("hg19", "hg18", "hg38"), - algorithm = c("em", "cncf"), hypoploidy_threshold = 0.5) +calculate_loh( + segs, + snps, + genome = c("hg19", "hg18", "hg38"), + algorithm = c("em", "cncf"), + hypoploidy_threshold = 0.5 +) -calculate_ntai(segs, ploidy, genome = c("hg19", "hg18", "hg38"), - algorithm = c("em", "cncf"), min_size = 0, min_probes = 250) +calculate_ntai( + segs, + ploidy, + genome = c("hg19", "hg18", "hg38"), + algorithm = c("em", "cncf"), + min_size = 0, + min_probes = 250 +) -calculate_lst(segs, ploidy, genome = c("hg19", "hg18", "hg38"), - algorithm = c("em", "cncf"), min_size = 1e+07, min_probes = 50) +calculate_lst( + segs, + ploidy, + genome = c("hg19", "hg18", "hg38"), + algorithm = c("em", "cncf"), + min_size = 1e+07, + min_probes = 50 +) calculate_hrdloh(segs, ploidy, algorithm = c("em", "cncf")) } diff --git a/man/facetsSuite.Rd b/man/facetsSuite.Rd index a47b04b..802cdf0 100644 --- a/man/facetsSuite.Rd +++ b/man/facetsSuite.Rd @@ -3,7 +3,6 @@ \docType{package} \name{facetsSuite} \alias{facetsSuite} -\alias{facetsSuite-package} \title{\code{facetsSuite} package} \description{ Functions to run and parse output from \href{https://github.com/mskcc/facets}{FACETS}. diff --git a/man/gene_level_changes.Rd b/man/gene_level_changes.Rd index b92192d..182664a 100644 --- a/man/gene_level_changes.Rd +++ b/man/gene_level_changes.Rd @@ -4,8 +4,11 @@ \alias{gene_level_changes} \title{Get gene-level changes in copy number from FACETS output.} \usage{ -gene_level_changes(facets_output, genome = c("hg19", "hg38"), - algorithm = c("em", "cncf")) +gene_level_changes( + facets_output, + genome = c("hg19", "hg38"), + algorithm = c("em", "cncf") +) } \arguments{ \item{facets_output}{Full FACETS output from \code{run_facets}.} diff --git a/man/plot_facets.Rd b/man/plot_facets.Rd index 60cbc89..eaf2ef9 100644 --- a/man/plot_facets.Rd +++ b/man/plot_facets.Rd @@ -9,24 +9,52 @@ \alias{closeup_plot} \title{Plot FACETS output} \usage{ -cnlr_plot(facets_data, colors = c("#0080FF", "#4CC4FF"), plotX = FALSE, - genome = c("hg19", "hg18", "hg38"), highlight_gene = NULL, - adjust_diplogr = TRUE, subset_snps = NULL, return_object = FALSE) +cnlr_plot( + facets_data, + colors = c("#0080FF", "#4CC4FF"), + plotX = FALSE, + genome = c("hg19", "hg18", "hg38"), + highlight_gene = NULL, + adjust_dipLogR = TRUE, + subset_snps = NULL, + return_object = FALSE +) -valor_plot(facets_data, colors = c("#0080FF", "#4CC4FF"), - plotX = FALSE, genome = c("hg19", "hg18", "hg38"), - highlight_gene = NULL, subset_snps = NULL, return_object = FALSE) +valor_plot( + facets_data, + colors = c("#0080FF", "#4CC4FF"), + plotX = FALSE, + genome = c("hg19", "hg18", "hg38"), + highlight_gene = NULL, + subset_snps = NULL, + return_object = FALSE +) -cf_plot(facets_data, method = c("em", "cncf"), plotX = FALSE, - genome = c("hg19", "hg18", "hg38"), return_object = FALSE) +cf_plot( + facets_data, + method = c("em", "cncf"), + plotX = FALSE, + genome = c("hg19", "hg18", "hg38"), + return_object = FALSE +) -icn_plot(facets_data, method = c("em", "cncf"), plotX = FALSE, - genome = c("hg19", "hg18", "hg38"), highlight_gene = NULL, - return_object = FALSE) +icn_plot( + facets_data, + method = c("em", "cncf"), + plotX = FALSE, + genome = c("hg19", "hg18", "hg38"), + highlight_gene = NULL, + return_object = FALSE +) -closeup_plot(facets_data, highlight_gene = NULL, plot_chroms = NULL, - method = c("em", "cncf"), genome = c("hg19", "hg18", "hg38"), - return_object = FALSE) +closeup_plot( + facets_data, + highlight_gene = NULL, + plot_chroms = NULL, + method = c("em", "cncf"), + genome = c("hg19", "hg18", "hg38"), + return_object = FALSE +) } \arguments{ \item{facets_data}{Output object from \code{run_facets}.} @@ -39,7 +67,7 @@ closeup_plot(facets_data, highlight_gene = NULL, plot_chroms = NULL, \item{highlight_gene}{Highlight gene(s), provide gene symbol or mapped positions (internally).} -\item{adjust_diplogr}{Normalize by sample dipLogR.} +\item{adjust_dipLogR}{Normalize by sample dipLogR.} \item{subset_snps}{Subset the SNP profile to reduce weight of plotting, supply a factor by which to reduce or \code{TRUE} for default.} diff --git a/man/plot_segmentation.Rd b/man/plot_segmentation.Rd index 27faee9..ffffeee 100644 --- a/man/plot_segmentation.Rd +++ b/man/plot_segmentation.Rd @@ -4,9 +4,14 @@ \alias{plot_segmentation} \title{Plot segmentation profile} \usage{ -plot_segmentation(segs, plotX = FALSE, sample_order = NULL, - cap_log_ratios = TRUE, colors = c("darkblue", "white", "red"), - return_object = FALSE) +plot_segmentation( + segs, + plotX = FALSE, + sample_order = NULL, + cap_log_ratios = TRUE, + colors = c("darkblue", "white", "red"), + return_object = FALSE +) } \arguments{ \item{segs}{FACETS segment output, IGV formatted.} diff --git a/man/read_snp_matrix.Rd b/man/read_snp_matrix.Rd index 31dd068..eb8cb27 100644 --- a/man/read_snp_matrix.Rd +++ b/man/read_snp_matrix.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/read-snp-matrix.R \name{read_snp_matrix} \alias{read_snp_matrix} -\title{Read counts file} +\title{Reads a gzipped output file from \code{snp-pileup}.} \source{ \code{snp-pileup} is part of \href{www.github.com/mskcc/facets}{FACETS}. } diff --git a/man/read_snp_matrix_facets2n.Rd b/man/read_snp_matrix_facets2n.Rd new file mode 100644 index 0000000..cb2610e --- /dev/null +++ b/man/read_snp_matrix_facets2n.Rd @@ -0,0 +1,60 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/read-snp-matrix.R +\name{read_snp_matrix_facets2n} +\alias{read_snp_matrix_facets2n} +\title{#' Read in the snp-pileup generated SNP read count matrix file for facets2n processing} +\source{ +\code{snp-pileup} is part of \href{www.github.com/mskcc/facets}{FACETS}. +} +\usage{ +read_snp_matrix_facets2n( + filename, + err.thresh = Inf, + del.thresh = Inf, + perl.pileup = FALSE, + MandUnormal = FALSE, + spanT = 0.2, + spanA = 0.2, + spanX = 0.2, + gbuild = "hg19", + ReferencePileupFile = NULL, + ReferenceLoessFile = NULL, + MinOverlap = 0.9, + useMatchedX = FALSE +) +} +\arguments{ +\item{filename}{counts file from snp-pileup} + +\item{err.thresh}{(numeric) Error threshold to be used to filter snp-pileup data frame.} + +\item{del.thresh}{(numeric) Deletion threshold to be used to filter snp-pileup data frame.} + +\item{perl.pileup}{(logical) Is the pileup data generated using perl pileup tool?} + +\item{MandUnormal}{(logical) Is CNLR analysis to be peformed using unmatched reference normals?} + +\item{spanT}{(numeric) Default span value to be used for loess normalization in tumor sample.} + +\item{spanA}{(numeric) Default span value to be used for loess normalization across autosomal chromosomes in the normal sample.} + +\item{spanX}{(numeric) Default span value to be used for loess normalization in Chr X in the normal sample.} + +\item{gbuild}{(character) Genome build (Default: hg19).} + +\item{ReferencePileupFile}{(character) Filepath to an optional snp-pileup generated pileup data of one or more reference normals.} + +\item{ReferenceLoessFile}{(character) Filepath to an optional loess data, generated using the facets2n package, of one or more reference normals. The number of normals in this data should match that in the ReferencePileupFile, and should be in the same order.} + +\item{MinOverlap}{(numeric) Mininum overlap fraction of loci between a tumor pileup and reference pileup data.} + +\item{useMatchedX}{(logical) Is the matched normal to be used for ChrX normalization?} + +\item{skip}{(character) Skip n number of lines in the input file.} +} +\value{ +A dataframe of pileup depth values for Tumor and Matched Normal if MandUnormal is FALSE. Else, a list of data frame with pileup depth values of Tumor, matched Normal, and a best unmatched normal, and the associated span values. +} +\description{ +#' Read in the snp-pileup generated SNP read count matrix file for facets2n processing +} diff --git a/man/run_facets.Rd b/man/run_facets.Rd index 471b420..eec39e3 100644 --- a/man/run_facets.Rd +++ b/man/run_facets.Rd @@ -4,16 +4,29 @@ \alias{run_facets} \title{Run FACETS} \usage{ -run_facets(read_counts, cval = 100, diplogr = NULL, ndepth = 35, - snp_nbhd = 250, min_nhet = 15, genome = c("hg18", "hg19", "hg38", - "mm9", "mm10"), seed = 100) +run_facets( + read_counts, + cval = 100, + dipLogR = NULL, + ndepth = 25, + snp_nbhd = 250, + min_nhet = 10, + genome = c("hg18", "hg19", "hg38", "mm9", "mm10"), + seed = 100, + facets_lib_path = "", + facets2n_lib_path = "", + MandUnormal = FALSE, + useMatchedX = FALSE, + referencePileup = NULL, + referenceLoess = NULL +) } \arguments{ \item{read_counts}{Read counts object, generated by `snp-pileup`.} \item{cval}{Segmentation parameter, higher values for higher sensitivity.} -\item{diplogr}{Manual dipLogR value, if left empty `facets` finds the most likely sample baseline.} +\item{dipLogR}{Manual dipLogR value, if left empty `facets` finds the most likely sample baseline.} \item{ndepth}{Minimum depth in normal to retain SNP, see `facets` help.} @@ -24,6 +37,18 @@ run_facets(read_counts, cval = 100, diplogr = NULL, ndepth = 35, \item{genome}{Genome build.} \item{seed}{Seed value for random number generation, set to enable full reproducibility.} + +\item{facets_lib_path}{path to facets R library} + +\item{facets2n_lib_path}{path to facets2n R library} + +\item{MandUnormal}{(logical) facets2n option: Is CNLR analysis to be peformed using unmatched reference normals?} + +\item{useMatchedX}{(logical) facets2n option: Force matched normal to be used for ChrX normalization?} + +\item{referencePileup}{facets2n option: Filepath to an optional snp-pileup generated pileup data of one or more reference normals.} + +\item{referenceLoess}{facets2n option: Filepath to an optional loess data, generated using the facets2n package, of one or more reference normals. The number of normals in this data should match that in the ReferencePileupFile, and should be in the same order.} } \value{ A list object containing the following items. See \href{www.github.com/mskcc/facets}{FACETS documentation} for more details: @@ -32,8 +57,8 @@ A list object containing the following items. See \href{www.github.com/mskcc/fac \item{\code{segs}:} {Inferred copy-number segmentation.} \item{\code{purity}:} {Inferred sample purity, i.e. fraction of tumor cells of the total cellular population.} \item{\code{ploidy}:} {Inferred sample ploidy.} - \item{\code{diplogr}:} {Inferred dipLogR, the sample-specific baseline corresponding to the diploid state.} - \item{\code{alballogr}:} {Alternative dipLogR value(s) at which a balanced solution was found.} + \item{\code{dipLogR}:} {Inferred dipLogR, the sample-specific baseline corresponding to the diploid state.} + \item{\code{alBalLogR}:} {Alternative dipLogR value(s) at which a balanced solution was found.} \item{\code{flags}:} {Warning flags from the naïve segmentation algorithm.} \item{\code{em_flags}:} {Warning flags from the expectation-maximization segmentation algorithm.} \item{\code{loglik}:} {Log-likelihood value of the fitted model.} diff --git a/run-facets-wrapper.R b/run-facets-wrapper.R index 67b2756..6b97507 100755 --- a/run-facets-wrapper.R +++ b/run-facets-wrapper.R @@ -33,23 +33,33 @@ parser$add_argument('-g', '--genome', required = FALSE, parser$add_argument('-c', '--cval', required = FALSE, type = 'integer', default = 50, help = 'Segmentation parameter (cval) [default %(default)s]') parser$add_argument('-pc', '--purity-cval', required = FALSE, type = 'integer', - default = 100, help = 'If two pass, purity segmentation parameter (cval)') + default = 150, help = 'If two pass, purity segmentation parameter (cval)') parser$add_argument('-m', '--min-nhet', required = FALSE, type = 'integer', - default = 15, help = 'Min. number of heterozygous SNPs required for clustering [default %(default)s]') + default = 10, help = 'Min. number of heterozygous SNPs required for clustering [default %(default)s]') parser$add_argument('-pm', '--purity-min-nhet', required = FALSE, type = 'integer', - default = 15, help = 'If two pass, purity min. number of heterozygous SNPs (cval) [default %(default)s]') + default = 10, help = 'If two pass, purity min. number of heterozygous SNPs (cval) [default %(default)s]') parser$add_argument('-n', '--snp-window-size', required = FALSE, type = 'integer', default = 250, help = 'Window size for heterozygous SNPs [default %(default)s]') parser$add_argument('-nd', '--normal-depth', required = FALSE, type = 'integer', - default = 35, help = 'Min. depth in normal to keep SNPs [default %(default)s]') + default = 25, help = 'Min. depth in normal to keep SNPs [default %(default)s]') parser$add_argument('-d', '--dipLogR', required = FALSE, type = 'double', default = NULL, help = 'Manual dipLogR') parser$add_argument('-S', '--seed', required = FALSE, type = 'integer', default = 100, help = 'Manual seed value [default %(default)s]') -parser$add_argument('-l', '--legacy-output', required = FALSE, type = 'logical', - default = FALSE, help = 'create legacy output files (.RData and .cncf.txt) [default %(default)s]') -parser$add_argument('-fl', '--facets-lib-path', required = TRUE, - default = '', help = 'path to the facets library. if none provided, uses version available to `library(facets)`') +parser$add_argument('-l', '--legacy-output', required = FALSE, action="store_true", + default = FALSE, help = 'create legacy output files (.RData and .cncf.txt)') +parser$add_argument('-fl', '--facets-lib-path', required = FALSE, + default = '', help = 'path to the facets library. must supply either --facets-lib-path or --facets2n-lib-path') +parser$add_argument('-f2l', '--facets2n-lib-path', required = FALSE, + default = '', help = 'path to the facets2n library. must supply either --facets-lib-path or --facets2n-lib-path') +parser$add_argument('-refc', '--reference-snp-pileup', required = FALSE, + default = NULL, help = 'A snp-pileup generated pileup data frame (of reference normals) with sample columns that match with the ReferenceLoess object.') +parser$add_argument('-refl', '--reference-loess-file', required = FALSE, + default = NULL, help = ' A ReferenceLoess matrix with a header and span values in the first row.') +parser$add_argument('-MandU', '--MandUnormal', required = FALSE, action="store_true", + default = FALSE, help = 'facets2n option: Is CNLR analysis to be peformed using unmatched reference normals?') +parser$add_argument('-x', '--useMatchedX', required = FALSE, action="store_true", + default = FALSE, help = 'facets2n option: Force matched normal to be used for ChrX normalization') args = parser$parse_args() @@ -115,16 +125,16 @@ print_plots = function(outfile, ' | ploidy=', round(facets_output$ploidy, 2), ' | dipLogR=', round(facets_output$dipLogR, 2)) - png(file = outfile, width = 850, height = 999, units = 'px', type = 'cairo-png', res = 96) + png(file = outfile, width = 6.5, height = 8, units = 'in', type = 'cairo-png', res = 300) suppressWarnings( egg::ggarrange( plots = list( - cnlr_plot(facets_output), - valor_plot(facets_output), - icn_plot(facets_output, method = 'em'), - cf_plot(facets_output, method = 'em'), - icn_plot(facets_output, method = 'cncf'), - cf_plot(facets_output, method = 'cncf') + cnlr_plot(facets_output,plotX = TRUE), + valor_plot(facets_output, plotX=TRUE), + icn_plot(facets_output, method = 'em',plotX = TRUE), + cf_plot(facets_output, method = 'em', plotX = TRUE), + icn_plot(facets_output, method = 'cncf', plotX = TRUE), + cf_plot(facets_output, method = 'cncf', plotX = TRUE) ), ncol = 1, nrow = 6, @@ -167,7 +177,12 @@ facets_iteration = function(name_prefix, ...) { min_nhet = params$min_nhet, genome = params$genome, seed = params$seed, - facets_lib_path = params$facets_lib_path) + facets_lib_path = params$facets_lib_path, + facets2n_lib_path = params$facets2n_lib_path, + referencePileup= params$reference-snp-pileup, + referenceLoess = params$reference-loess-file, + MandUnormal = params$MandUnormal, + useMatchedX = params$useMatchedX) # No need to print the segmentation # print_segments(outfile = paste0(name_prefix, '.cncf.txt'), @@ -199,7 +214,11 @@ if (dir.exists(directory)) { # Read SNP counts file message(paste('Reading', args$counts_file)) -read_counts = read_snp_matrix(args$counts_file) +if(args$facets2n_lib_path != ''){ + read_counts = read_snp_matrix_facets2n(args$counts_file,MandUnormal= args$MandUnormal, ReferencePileupFile=args$reference_snp_pileup, ReferenceLoessFile=args$reference_loess_file, useMatchedX=args$useMatchedX) +}else{ + read_counts = read_snp_matrix(args$counts_file) +} message(paste('Writing to', directory)) # Determine if running two-pass @@ -213,7 +232,13 @@ if (!is.null(args$purity_cval)) { min_nhet = args$purity_min_nhet, genome = args$genome, seed = args$seed, - facets_lib_path = args$facets_lib_path) + facets_lib_path = args$facets_lib_path, + facets2n_lib_path = args$facets2n_lib_path, + referencePileup= args$reference_snp_pileup, + referenceLoess = args$reference_loess_file, + MandUnormal = args$MandUnormal, + useMatchedX = args$useMatchedX + ) hisens_output = facets_iteration(name_prefix = paste0(name, '_hisens'), dipLogR = purity_output$dipLogR, @@ -223,7 +248,12 @@ if (!is.null(args$purity_cval)) { min_nhet = args$min_nhet, genome = args$genome, seed = args$seed, - facets_lib_path = args$facets_lib_path) + facets_lib_path = args$facets_lib_path, + facets2n_lib_path = args$facets2n_lib_path, + referencePileup= args$reference_snp_pileup, + referenceLoess = args$reference_loess_file, + MandUnormal = args$MandUnormal, + useMatchedX = args$useMatchedX) metadata = NULL if (args$everything) { diff --git a/snp-pileup-wrapper.R b/snp-pileup-wrapper.R index dea9155..539fb4c 100755 --- a/snp-pileup-wrapper.R +++ b/snp-pileup-wrapper.R @@ -17,24 +17,30 @@ parser$add_argument('-sp', '--snp-pileup-path', required = FALSE, help = 'Path to snp-pileup executable [default environment variable $SNP_PILEUP]') parser$add_argument('-vcf', '--vcf-file', required = TRUE, help = 'Path to VCF file containing SNP positions') -parser$add_argument('-n', '--normal-bam', required = TRUE, +parser$add_argument('-n', '--normal-bam', required = FALSE, help = 'Path to normal sample BAM file') -parser$add_argument('-t', '--tumor-bam', required = TRUE, +parser$add_argument('-t', '--tumor-bam', required = FALSE, help = 'Path to tumor sample BAM file') parser$add_argument('-o', '--output-prefix', required = TRUE, help = 'Name prefix for output file') parser$add_argument('-p', '--pseudo-snps', required = FALSE, default = 50, help = 'Do pileup at every p:th position [default %(default)s]') -parser$add_argument('-d', '--max-depth', required = FALSE, default = 4000, +parser$add_argument('-d', '--max-depth', required = FALSE, default = 20000, help = 'Maximum read depth [default %(default)s]') +parser$add_argument('-q', '--min-map-quality', required = FALSE, default = 0, + help = 'Sets the minimum threshold for mapping quality') +parser$add_argument('-Q', '--min-base-quality', required = FALSE, default = 0, + help = 'Sets the minimum threshold for base quality.') +parser$add_argument('-un', '--unmatched-normal-BAMS', required = FALSE, default = FALSE, + help = 'full path(s) as quoted string to unmatched normal BAM(s) to use for log ratio normalization, e.g. "/*-NS_*bam"') args = parser$parse_args() # Prepare output -------------------------------------------------------------------------------------------------- -snp_pileup_env = Sys.getenv('SNP_PILEUP') +snp_pileup_env = Sys.which("snp-pileup") -if (is.null(args$snp_pileup_path)) { +if (!is.null(args$snp_pileup_path)) { if (snp_pileup_env == '') { stop(paste('No snp-pileup path provided or in user environment.'), call. = F) } else { @@ -42,24 +48,42 @@ if (is.null(args$snp_pileup_path)) { } } - output_file = paste0(args$output_prefix, '.snp_pileup.gz') if (file.exists(output_file)) { stop(paste(output_file, 'already exists. Remove before running.'), call. = F) } - default_args = c('--count-orphans --gzip') +#enforce minmum read count of 10 for all normals analyzed +if (args$unmatched_normal_BAMS!=FALSE){ + unmatched_bam_count = length(system(paste("ls ", args$unmatched_normal_BAMS), intern=TRUE)) + if (is.null(args$normal_bam) & is.null(args$tumor_bam)) { + message("generating counts file for ", unmatched_bam_count, " BAMs") + min_read_counts =gsub(", ", ",", toString(c(" --min-read-counts 1", rep("1", unmatched_bam_count-1)))) + }else{ + min_read_counts = gsub(", ", ",", toString(c(" --min-read-counts 10,0", rep("0", unmatched_bam_count)))) + message("incorporating ", unmatched_bam_count, " unmatched bams into analysis") + } +}else{ + min_read_counts = " --min-read-counts 10,0" +} + pileup_cmd = paste( snp_pileup_path, default_args, '-P', args$pseudo_snps, '-d', args$max_depth, + '-q', args$min_map_quality, + '-Q', args$min_base_quality, + min_read_counts, args$vcf_file, output_file, args$normal_bam, - args$tumor_bam - ) + args$tumor_bam, + args$unmatched_normal_BAMS + +) +message("pileup cmd: ", pileup_cmd) system(pileup_cmd)