diff --git a/deeptools4.0.0_changes.md b/deeptools4.0.0_changes.md
index 330fafbb..7b2c01dd 100644
--- a/deeptools4.0.0_changes.md
+++ b/deeptools4.0.0_changes.md
@@ -9,4 +9,10 @@ for -bs > 1, a read is split in multiple contiguous blocks
 
 # normalization
 
-Exactscaling is no longer an option, it's always performed.
\ No newline at end of file
+Exact scaling is no longer an option; it is always performed.
+
+# Todo
+
+ - allow multithreaded bw writing
+ - properly divide region work over threads -> region sorting & taking size into account
+
\ No newline at end of file
diff --git a/pydeeptools/deeptools/bamCompare2.py b/pydeeptools/deeptools/bamCompare2.py
new file mode 100644
index 00000000..fcd96028
--- /dev/null
+++ b/pydeeptools/deeptools/bamCompare2.py
@@ -0,0 +1,287 @@
+import argparse
+import numpy as np
+from deeptools import parserCommon
+from deeptools.hp import r_bamcompare
+# needed by get_scale_factors() below
+from deeptools.SES_scaleFactor import estimateScaleFactor
+from deeptools.getScaleFactor import get_num_kept_reads
+
+def parseArguments():
+    parentParser = parserCommon.getParentArgParse()
+    bamParser = parserCommon.read_options()
+    normalizationParser = parserCommon.normalization_options()
+    requiredArgs = getRequiredArgs()
+    optionalArgs = getOptionalArgs()
+    outputParser = parserCommon.output()
+    parser = argparse.ArgumentParser(
+        parents=[requiredArgs, outputParser, optionalArgs,
+                 parentParser, normalizationParser, bamParser],
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        description='This tool compares two BAM files based on the number of '
+        'mapped reads. To compare the BAM files, the genome is partitioned '
+        'into bins of equal size, then the number of reads found in each bin '
+        'is counted per file, and finally a summary value is '
+        'reported. This value can be the ratio of the number of reads per '
+        'bin, the log2 of the ratio, or the difference. This tool can '
+        'normalize the number of reads in each BAM file using the SES method '
+        'proposed by Diaz et al. (2012) "Normalization, bias correction, and '
+        'peak calling for ChIP-seq". Statistical Applications in Genetics '
+        'and Molecular Biology, 11(3). Normalization based on read counts '
+        'is also available. The output is either a bedgraph or bigWig file '
+        'containing the bin location and the resulting comparison value. '
+        'Note that *each end* in a pair (for paired-end reads) is treated '
+        'independently. If this is undesirable, then use the --samFlagInclude '
+        'or --samFlagExclude options.',
+
+        usage='bamCompare -b1 treatment.bam -b2 control.bam -o log2ratio.bw\n'
+        'help: bamCompare -h / bamCompare --help',
+
+        add_help=False)
+
+    return parser
+
+
+def getRequiredArgs():
+    parser = argparse.ArgumentParser(add_help=False)
+
+    required = parser.add_argument_group('Required arguments')
+
+    # define the arguments
+    required.add_argument('--bamfile1', '-b1',
+                          metavar='BAM file',
+                          help='Sorted BAM file 1. Usually the BAM file '
+                          'for the treatment.',
+                          required=True)
+
+    required.add_argument('--bamfile2', '-b2',
+                          metavar='BAM file',
+                          help='Sorted BAM file 2. Usually the BAM '
+                          'file for the control.',
+                          required=True)
+
+    return parser
+
+
+def getOptionalArgs():
+
+    parser = argparse.ArgumentParser(add_help=False)
+    optional = parser.add_argument_group('Optional arguments')
+
+    optional.add_argument("--help", "-h", action="help",
+                          help="show this help message and exit")
+
+    optional.add_argument('--scaleFactorsMethod',
+                          help='Method to use to scale the samples. '
+                          'If a method is specified, then it will be used to compensate '
+                          'for sequencing depth differences between the samples. '
+                          'As an alternative, this can be set to None and an option from '
+                          '--normalizeUsing can be used. (Default: %(default)s)',
+                          choices=['readCount', 'SES', 'None'],
+                          default='readCount')
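+
+    # Note: --scaleFactorsMethod and --normalizeUsing are effectively mutually
+    # exclusive; main() below turns scaling off (scaleFactorsMethod = 'None')
+    # whenever a --normalizeUsing method is requested.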
+
+    optional.add_argument('--sampleLength', '-l',
+                          help='*Only relevant when SES is chosen for the '
+                          'scaleFactorsMethod.* To compute the SES, specify '
+                          'the length (in bases) of the regions (see --numberOfSamples) '
+                          'that will be randomly sampled to calculate the scaling factors. '
+                          'If you do not have a good sequencing depth for '
+                          'your samples, consider increasing the sampling '
+                          'regions\' size to minimize the probability '
+                          'that zero-coverage regions are used. (Default: %(default)s)',
+                          default=1000,
+                          type=int)
+
+    optional.add_argument('--numberOfSamples', '-n',
+                          help='*Only relevant when SES is chosen for the '
+                          'scaleFactorsMethod.* Number of samplings taken '
+                          'from the genome to compute the scaling factors. (Default: %(default)s)',
+                          default=1e5,
+                          type=int)
+
+    optional.add_argument('--scaleFactors',
+                          help='Set this parameter manually to avoid the computation of '
+                          'scaleFactors. The format is scaleFactor1:scaleFactor2. '
+                          'For example, --scaleFactors 0.7:1 will cause the first BAM file to '
+                          'be multiplied by 0.7, while not scaling '
+                          'the second BAM file (multiplication with 1).',
+                          default=None,
+                          required=False)
+
+    optional.add_argument('--operation',
+                          help='The default is to output the log2 ratio of the '
+                          'two samples. The reciprocal ratio returns '
+                          'the negative of the inverse of the ratio '
+                          'if the ratio is less than 1. The resulting '
+                          'values are interpreted as negative fold changes. '
+                          'Instead of performing a computation using both files, the scaled signal can '
+                          'alternatively be output for the first or second file using '
+                          'the \'--operation first\' or \'--operation second\'. (Default: %(default)s)',
+                          default='log2',
+                          choices=['log2', 'ratio', 'subtract', 'add', 'mean',
+                                   'reciprocal_ratio', 'first', 'second'],
+                          required=False)
+
+    optional.add_argument('--pseudocount',
+                          help='A small number to avoid x/0. Only useful '
+                          'together with --operation log2 or --operation ratio. '
+                          'You can specify different values as pseudocounts for '
+                          'the numerator and the denominator by providing two '
+                          'values (the first value is used as the numerator '
+                          'pseudocount and the second the denominator pseudocount). (Default: %(default)s)',
+                          default=[1],
+                          type=float,
+                          nargs='+',
+                          action=parserCommon.requiredLength(1, 2),
+                          required=False)
+
+    optional.add_argument('--skipZeroOverZero',
+                          help='Skip bins where BOTH BAM files lack coverage. '
+                          'This is determined BEFORE any applicable pseudocount '
+                          'is added.',
+                          action='store_true')
+
+    return parser
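+
+
+# Note on --pseudocount: a single value is duplicated in process_args() below,
+# so e.g. `--pseudocount 1` becomes [1, 1] (numerator and denominator), while
+# `--pseudocount 1 0.5` keeps distinct numerator and denominator pseudocounts.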
+
+
+def process_args(args=None):
+    args = parseArguments().parse_args(args)
+
+    if args.smoothLength and args.smoothLength <= args.binSize:
+        print("Warning: the smooth length given ({}) is smaller than the bin "
+              "size ({}).\n\nNo smoothing will be "
+              "done".format(args.smoothLength,
+                            args.binSize))
+        args.smoothLength = None
+
+    if not args.ignoreForNormalization:
+        args.ignoreForNormalization = []
+
+    if not isinstance(args.pseudocount, list):
+        args.pseudocount = [args.pseudocount]
+
+    if len(args.pseudocount) == 1:
+        args.pseudocount *= 2
+
+    return args
+
+# get_scale_factors() is used for scaling in bamCompare,
+# while get_scale_factor() is used for depth normalization.
+
+
+def get_scale_factors(args, statsList, mappedList):
+
+    if args.scaleFactors:
+        scale_factors = list(map(float, args.scaleFactors.split(":")))
+    elif args.scaleFactorsMethod == 'SES':
+        scalefactors_dict = estimateScaleFactor(
+            [args.bamfile1, args.bamfile2],
+            args.sampleLength, args.numberOfSamples,
+            1,
+            mappingStatsList=mappedList,
+            blackListFileName=args.blackListFileName,
+            numberOfProcessors=args.numberOfProcessors,
+            verbose=args.verbose,
+            chrsToSkip=args.ignoreForNormalization)
+
+        scale_factors = scalefactors_dict['size_factors']
+
+        if args.verbose:
+            print("Size factors using SES: {}".format(scale_factors))
+            print("%s regions of size %s were used " %
+                  (scalefactors_dict['sites_sampled'],
+                   args.sampleLength))
+
+            print("ignoring filtering/blacklists, size factors if the number of mapped "
+                  "reads would have been used:")
+            print(tuple(
+                float(min(mappedList)) / np.array(mappedList)))
+
+    elif args.scaleFactorsMethod == 'readCount':
+        # change the scaleFactor to 1.0
+        args.scaleFactor = 1.0
+        # get the number of kept reads for BAM file 1
+        args.bam = args.bamfile1
+        bam1_mapped, _ = get_num_kept_reads(args, statsList[0])
+        # get the number of kept reads for BAM file 2
+        args.bam = args.bamfile2
+        bam2_mapped, _ = get_num_kept_reads(args, statsList[1])
+
+        mapped_reads = [bam1_mapped, bam2_mapped]
+
+        # new scale_factors (relative to min of the two bams)
+        scale_factors = float(min(bam1_mapped, bam2_mapped)) / np.array(mapped_reads)
+        if args.verbose:
+            print("Size factors using total number "
+                  "of mapped reads: {}".format(scale_factors))
+
+    elif args.scaleFactorsMethod == 'None':
+        scale_factors = None
+
+    return scale_factors
+
+
+def main(args=None):
+    """
+    The algorithm is composed of two steps.
+
+    1. Per-sample scaling / depth normalization:
+
+       If scaling is used (with the SES or readCount method), appropriate
+       scaling factors are determined to account for sequencing depth
+       differences between the samples.
+
+       Optionally, scaling can be turned off and individual samples can be
+       depth-normalized using the RPKM, BPM or CPM methods.
+
+    2. Ratio calculation between the two BAM files:
+
+       The genome is traversed, and the log2 ratio/ratio/difference etc. is
+       computed for bins of a fixed width given by the user.
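+
+    Example invocation (the console script name comes from pyproject.toml;
+    the file names are placeholders):
+
+        bamCompare2 -b1 treatment.bam -b2 control.bam -o log2ratio.bw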
+ + """ + args = process_args(args) + + # Some sanity checks again + if not args.effectiveGenomeSize: + args.effectiveGenomeSize = 0 + # If + if args.normalizeUsing: + print("Normalization of the bam files requested, turning off scaling.") + args.scaleFactorsMethod = 'None' + else: + args.normalizeUsing = 'None' + + r_bamcompare( + args.bamfile1, # bam file 1 + args.bamfile2, # bam file 2 + args.outFileName, # output file + args.outFileFormat, # output file format + args.normalizeUsing, # normalization method + args.scaleFactorsMethod, # scaling method + args.effectiveGenomeSize, # effective genome size + args.numberOfProcessors, # threads + args.binSize, # bin size + [], # regions + True # verbose + ) + + # #if args.normalizeUsing == "RPGC": + # # sys.exit("RPGC normalization (--normalizeUsing RPGC) is not supported with bamCompare!") + # #if args.normalizeUsing == 'None': + # args.normalizeUsing = None # For the sake of sanity + # if args.scaleFactorsMethod != 'None' and args.normalizeUsing: + # sys.exit("`--normalizeUsing {}` is only valid if you also use `--scaleFactorsMethod None`! To prevent erroneous output, I will quit now.\n".format(args.normalizeUsing)) + + # # Get mapping statistics + # bam1, mapped1, unmapped1, stats1 = bamHandler.openBam(args.bamfile1, returnStats=True, nThreads=args.numberOfProcessors) + # bam1.close() + # bam2, mapped2, unmapped2, stats2 = bamHandler.openBam(args.bamfile2, returnStats=True, nThreads=args.numberOfProcessors) + # bam2.close() + + # scale_factors = get_scale_factors(args, [stats1, stats2], [mapped1, mapped2]) + # if scale_factors is None: + # # check whether one of the depth norm methods are selected + # if args.normalizeUsing is not None: + # args.scaleFactor = 1.0 + # # if a normalization is required then compute the scale factors + # args.bam = args.bamfile1 + # scale_factor_bam1 = get_scale_factor(args, stats1) + # args.bam = args.bamfile2 + # scale_factor_bam2 = get_scale_factor(args, stats2) + # scale_factors = [scale_factor_bam1, scale_factor_bam2] + # else: + # scale_factors = [1, 1] \ No newline at end of file diff --git a/pydeeptools/deeptools/bamCoverage2.py b/pydeeptools/deeptools/bamCoverage2.py index c736f595..5b34a1b5 100644 --- a/pydeeptools/deeptools/bamCoverage2.py +++ b/pydeeptools/deeptools/bamCoverage2.py @@ -149,8 +149,5 @@ def main(args=None): args.numberOfProcessors, # threads args.binSize, # bin size [], # regions - True # verbose - ) - import pyBigWig - a = pyBigWig.open(args.outFileName) - a.header() + args.verbose # verbose + ) \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 06190917..414e8fc9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -64,7 +64,6 @@ packages = ["deeptools"] alignmentSieve = "deeptools.alignmentSieve:main" bamCompare = "deeptools.bamCompare:main" bamCoverage = "deeptools.bamCoverage:main" -bamCoverage2 = "deeptools.bamCoverage2:main" bamPEFragmentSize = "deeptools.bamPEFragmentSize:main" bigwigAverage = "deeptools.bigwigAverage:main" bigwigCompare = "deeptools.bigwigCompare:main" @@ -84,3 +83,5 @@ plotFingerprint = "deeptools.plotFingerprint:main" plotHeatmap = "deeptools.plotHeatmap:main" plotPCA = "deeptools.plotPCA:main" plotProfile = "deeptools.plotProfile:main" +bamCoverage2 = "deeptools.bamCoverage2:main" +bamCompare2 = "deeptools.bamCompare2:main" \ No newline at end of file diff --git a/src/bamcompare.rs b/src/bamcompare.rs new file mode 100644 index 00000000..029481cb --- /dev/null +++ b/src/bamcompare.rs @@ -0,0 +1,96 @@ +use pyo3::prelude::*; +use 
diff --git a/src/bamcompare.rs b/src/bamcompare.rs
new file mode 100644
index 00000000..029481cb
--- /dev/null
+++ b/src/bamcompare.rs
@@ -0,0 +1,96 @@
+use pyo3::prelude::*;
+use rayon::prelude::*;
+use rayon::ThreadPoolBuilder;
+use crate::filehandler::bam_stats;
+use crate::normalization::scale_factor_bamcompare;
+use crate::covcalc::{bam_pileup, parse_regions};
+
+#[pyfunction]
+pub fn r_bamcompare(
+    bam_ifile: &str,
+    bam_ifile2: &str,
+    _ofile: &str,
+    _ofiletype: &str,
+    _norm: &str,
+    scalefactorsmethod: &str,
+    effective_genome_size: u64,
+    nproc: usize,
+    binsize: u32,
+    regions: Vec<(String, u64, u64)>,
+    verbose: bool
+) -> PyResult<()> {
+    // Put the statistics into scope; this should probably be rewritten
+    // (can't we always assume at least 2 threads?) and will need to be
+    // revisited for multiBamSummary / computeMatrix.
+    let mut total_reads1: u64 = 0;
+    let mut total_reads2: u64 = 0;
+    let mut mapped_reads1: u64 = 0;
+    let mut mapped_reads2: u64 = 0;
+    let mut unmapped_reads1: u64 = 0;
+    let mut unmapped_reads2: u64 = 0;
+    let mut readlen1: f32 = 0.0;
+    let mut readlen2: f32 = 0.0;
+    let mut fraglen1: f32 = 0.0;
+    let mut fraglen2: f32 = 0.0;
+    // Get the statistics of the bam files.
+    if nproc > 1 {
+        let pool2 = ThreadPoolBuilder::new().num_threads(2).build().unwrap();
+        let bamstatvec: Vec<_> = pool2.install(|| {
+            vec![
+                (bam_ifile, &verbose),
+                (bam_ifile2, &verbose)
+            ]
+            .par_iter()
+            .map(|(bam_ifile, verbose)| bam_stats(bam_ifile, verbose))
+            .collect()
+        });
+        let (_total_reads1, _mapped_reads1, _unmapped_reads1, _readlen1, _fraglen1) = bamstatvec[0];
+        let (_total_reads2, _mapped_reads2, _unmapped_reads2, _readlen2, _fraglen2) = bamstatvec[1];
+        total_reads1 = _total_reads1;
+        total_reads2 = _total_reads2;
+        mapped_reads1 = _mapped_reads1;
+        mapped_reads2 = _mapped_reads2;
+        unmapped_reads1 = _unmapped_reads1;
+        unmapped_reads2 = _unmapped_reads2;
+        readlen1 = _readlen1;
+        readlen2 = _readlen2;
+        fraglen1 = _fraglen1;
+        fraglen2 = _fraglen2;
+    } else {
+        let (_total_reads1, _mapped_reads1, _unmapped_reads1, _readlen1, _fraglen1) = bam_stats(bam_ifile, &verbose);
+        let (_total_reads2, _mapped_reads2, _unmapped_reads2, _readlen2, _fraglen2) = bam_stats(bam_ifile2, &verbose);
+        total_reads1 = _total_reads1;
+        total_reads2 = _total_reads2;
+        mapped_reads1 = _mapped_reads1;
+        mapped_reads2 = _mapped_reads2;
+        unmapped_reads1 = _unmapped_reads1;
+        unmapped_reads2 = _unmapped_reads2;
+        readlen1 = _readlen1;
+        readlen2 = _readlen2;
+        fraglen1 = _fraglen1;
+        fraglen2 = _fraglen2;
+    }
+
+    // Calculate the scale factors.
+    let (scale_factor1, scale_factor2) = scale_factor_bamcompare(
+        scalefactorsmethod,
+        mapped_reads1,
+        mapped_reads2,
+        binsize as u64,
+        effective_genome_size
+    );
+    println!("scalefactors = {} and {}", scale_factor1, scale_factor2);
+    let (regions, _chromsizes) = parse_regions(&regions, bam_ifile);
+    let pool = ThreadPoolBuilder::new().num_threads(nproc).build().unwrap();
+    let _bg1: Vec<(String, u64, u64, f64)> = pool.install(|| {
+        regions.par_iter()
+            .flat_map(|i| bam_pileup(bam_ifile, &i, &binsize, scale_factor1))
+            .collect()
+    });
+    let _bg2: Vec<(String, u64, u64, f64)> = pool.install(|| {
+        regions.par_iter()
+            .flat_map(|i| bam_pileup(bam_ifile2, &i, &binsize, scale_factor2))
+            .collect()
+    });
+    Ok(())
+}
\ No newline at end of file
diff --git a/src/calc.rs b/src/calc.rs
index b36c6262..0c0f1d1a 100644
--- a/src/calc.rs
+++ b/src/calc.rs
@@ -1,10 +1,15 @@
 pub fn median(mut nvec: Vec<u32>) -> f32 {
-    nvec.sort_unstable();
-    let len = nvec.len();
-    if len % 2 == 1 {
-        return nvec[len / 2] as f32;
+    if nvec.is_empty() {
+        return 0.0;
+    } else if nvec.len() == 1 {
+        return nvec[0] as f32;
     } else {
-        return (nvec[len / 2] + nvec[len / 2 - 1]) as f32 / 2.0;
+        nvec.sort_unstable();
+        let len = nvec.len();
+        if len % 2 == 1 {
+            return nvec[len / 2] as f32;
+        } else {
+            return (nvec[len / 2] + nvec[len / 2 - 1]) as f32 / 2.0;
+        }
     }
-    }
 }
\ No newline at end of file
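The calc.rs change above makes median() handle empty and single-element input
explicitly. A minimal test sketch for the new branches (assuming the Vec<u32>
element type shown above, e.g. as a #[cfg(test)] module at the bottom of
calc.rs):

#[cfg(test)]
mod tests {
    use super::median;

    #[test]
    fn median_edge_cases() {
        assert_eq!(median(Vec::new()), 0.0);       // empty input now returns 0.0 instead of panicking
        assert_eq!(median(vec![5]), 5.0);          // a single element is returned as-is, skipping the sort
        assert_eq!(median(vec![3, 1, 2]), 2.0);    // odd length: middle of the sorted values
        assert_eq!(median(vec![1, 2, 3, 4]), 2.5); // even length: mean of the two middle values
    }
}
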
diff --git a/src/lib.rs b/src/lib.rs
index 7a0459ec..a9203a07 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,13 +1,15 @@
 use pyo3::prelude::*;
 mod bamcoverage;
+//mod bamcompare;
 mod covcalc;
 mod alignmentsieve;
-mod bamhandler;
+mod filehandler;
 mod normalization;
 mod calc;
 
 #[pymodule]
 fn hp(m: &Bound<'_, PyModule>) -> PyResult<()> {
     m.add_function(wrap_pyfunction!(bamcoverage::r_bamcoverage, m)?)?;
+    //m.add_function(wrap_pyfunction!(bamcompare::r_bamcompare, m)?)?;
     Ok(())
 }
diff --git a/src/normalization.rs b/src/normalization.rs
index 9e1d68f9..ef04f19f 100644
--- a/src/normalization.rs
+++ b/src/normalization.rs
@@ -1,32 +1,65 @@
-pub fn scale_factor(norm_method: &str, mapped: u64, binsize: u64, effective_genome_size: u64) -> f64 {
+use std::cmp;
+
+pub fn scale_factor(
+    norm_method: &str,
+    mapped: u64,
+    binsize: u32,
+    effective_genome_size: u64,
+    readlen: f32,
+    verbose: &bool
+) -> f64 {
     let mut scale_factor = 1.0;
-    return match norm_method {
+    match norm_method {
         "RPKM" => {
             // RPKM = # reads per tile / total reads (millions) * tile length (kb)
             let mmr = mapped as f64 / 1e6;
             let bs_kb = binsize as f64 / 1000.0;
             scale_factor *= 1.0 / (mmr * bs_kb);
-            scale_factor
         }
         "CPM" => {
             // CPM = # reads per tile / total reads (millions)
             let mmr = mapped as f64 / 1e6;
             scale_factor *= 1.0 / mmr;
-            scale_factor
         }
         "BPM" => {
             // BPM = bins per million mapped reads
             let bs_kb: f64 = binsize as f64 / 1000.0;
             let tmp_scalefactor = (mapped as f64 / bs_kb) / 1e6;
             scale_factor *= 1.0 / (tmp_scalefactor * bs_kb);
-            scale_factor
         }
         "RPGC" => {
-            // RPGC = mapped reads * fragment length / effective genome size
-            scale_factor
+            // RPGC = mapped reads * read length / effective genome size
+            let tmp_scalefactor = (mapped as f64 * readlen as f64) / effective_genome_size as f64;
+            scale_factor *= 1.0 / tmp_scalefactor;
+        }
+        _ => {}
+    };
+    if *verbose {
+        println!("Scale factor: {}", scale_factor);
+    }
+    return scale_factor;
+}
+
+pub fn scale_factor_bamcompare(
+    norm_method: &str,
+    mapped_bam1: u64,
+    mapped_bam2: u64,
+    _binsize: u64,
+    _effective_genome_size: u64
+) -> (f64, f64) {
+    return match norm_method {
+        "readCount" => {
+            let min = cmp::min(mapped_bam1, mapped_bam2);
+            let scale_factor1 = min as f64 / mapped_bam1 as f64;
+            let scale_factor2 = min as f64 / mapped_bam2 as f64;
+            (scale_factor1, scale_factor2)
+        }
+        "SES" => {
+            // to be implemented
+            (1.0, 1.0)
+        }
         _ => {
-            scale_factor
+            (1.0, 1.0)
         }
     }
-}
\ No newline at end of file
+}
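
For reference, the readCount branch of scale_factor_bamcompare above scales
both samples towards the smaller library. A self-contained sketch with
hypothetical mapped-read counts of 30 million and 20 million:

use std::cmp;

fn main() {
    // Hypothetical mapped-read counts for the two BAM files:
    let (mapped_bam1, mapped_bam2) = (30_000_000u64, 20_000_000u64);
    // readCount scaling, as in scale_factor_bamcompare():
    let min = cmp::min(mapped_bam1, mapped_bam2);
    let scale_factor1 = min as f64 / mapped_bam1 as f64; // 20e6 / 30e6 = 0.666...
    let scale_factor2 = min as f64 / mapped_bam2 as f64; // 20e6 / 20e6 = 1.0
    println!("scalefactors = {} and {}", scale_factor1, scale_factor2);
}

Multiplying each sample's per-bin counts by its factor brings both samples to
the depth of the smaller library before the ratio is taken.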