Skip to content

Commit

Permalink
latest WIP for bamCompare
Browse files Browse the repository at this point in the history
  • Loading branch information
WardDeb committed Nov 21, 2024
1 parent f12c04d commit 1a1e011
Show file tree
Hide file tree
Showing 8 changed files with 449 additions and 22 deletions.
8 changes: 7 additions & 1 deletion deeptools4.0.0_changes.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,10 @@ for -bs > 1, a read is split in multiple contiguous blocks

# normalization

Exactscaling is no longer an option, it's always performed.
Exactscaling is no longer an option, it's always performed.

# Todo

- allow multithreaded bw writing
- properly divide region work over threads -> region sorting & taking size into account

287 changes: 287 additions & 0 deletions pydeeptools/deeptools/bamCompare2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,287 @@
import argparse
from deeptools import parserCommon
from deeptools.hp import r_bamcompare

def parseArguments():
parentParser = parserCommon.getParentArgParse()
bamParser = parserCommon.read_options()
normalizationParser = parserCommon.normalization_options()
requiredArgs = getRequiredArgs()
optionalArgs = getOptionalArgs()
outputParser = parserCommon.output()
parser = argparse.ArgumentParser(
parents=[requiredArgs, outputParser, optionalArgs,
parentParser, normalizationParser, bamParser],
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
description='This tool compares two BAM files based on the number of '
'mapped reads. To compare the BAM files, the genome is partitioned '
'into bins of equal size, then the number of reads found in each bin'
' is counted per file, and finally a summary value is '
'reported. This value can be the ratio of the number of reads per '
'bin, the log2 of the ratio, or the difference. This tool can '
'normalize the number of reads in each BAM file using the SES method '
'proposed by Diaz et al. (2012) "Normalization, bias correction, and '
'peak calling for ChIP-seq". Statistical Applications in Genetics '
'and Molecular Biology, 11(3). Normalization based on read counts '
'is also available. The output is either a bedgraph or bigWig file '
'containing the bin location and the resulting comparison value. '
'Note that *each end* in a pair (for paired-end reads) is treated '
'independently. If this is undesirable, then use the --samFlagInclude '
'or --samFlagExclude options.',

usage='bamCompare -b1 treatment.bam -b2 control.bam -o log2ratio.bw\n'
'help: bamCompare -h / bamCompare --help',

add_help=False)

return parser


def getRequiredArgs():
parser = argparse.ArgumentParser(add_help=False)

required = parser.add_argument_group('Required arguments')

# define the arguments
required.add_argument('--bamfile1', '-b1',
metavar='BAM file',
help='Sorted BAM file 1. Usually the BAM file '
'for the treatment.',
required=True)

required.add_argument('--bamfile2', '-b2',
metavar='BAM file',
help='Sorted BAM file 2. Usually the BAM '
'file for the control.',
required=True)

return parser


def getOptionalArgs():

parser = argparse.ArgumentParser(add_help=False)
optional = parser.add_argument_group('Optional arguments')

optional.add_argument("--help", "-h", action="help",
help="show this help message and exit")

optional.add_argument('--scaleFactorsMethod',
help='Method to use to scale the samples. '
'If a method is specified, then it will be used to compensate '
'for sequencing depth differences between the samples. '
'As an alternative, this can be set to None and an option from '
'--normalizeUsing <method> can be used. (Default: %(default)s)',
choices=['readCount', 'SES', 'None'],
default='readCount')

optional.add_argument('--sampleLength', '-l',
help='*Only relevant when SES is chosen for the '
'scaleFactorsMethod.* To compute the SES, specify '
'the length (in bases) of the regions (see --numberOfSamples) '
'that will be randomly sampled to calculate the scaling factors. '
'If you do not have a good sequencing depth for '
'your samples consider increasing the sampling '
'regions\' size to minimize the probability '
'that zero-coverage regions are used. (Default: %(default)s)',
default=1000,
type=int)

optional.add_argument('--numberOfSamples', '-n',
help='*Only relevant when SES is chosen for the '
'scaleFactorsMethod.* Number of samplings taken '
'from the genome to compute the scaling factors. (Default: %(default)s)',
default=1e5,
type=int)

optional.add_argument('--scaleFactors',
help='Set this parameter manually to avoid the computation of '
'scaleFactors. The format is scaleFactor1:scaleFactor2.'
'For example, --scaleFactor 0.7:1 will cause the first BAM file to'
'be multiplied by 0.7, while not scaling '
'the second BAM file (multiplication with 1).',
default=None,
required=False)

optional.add_argument('--operation',
help='The default is to output the log2 ratio of the '
'two samples. The reciprocal ratio returns the '
'the negative of the inverse of the ratio '
'if the ratio is less than 0. The resulting '
'values are interpreted as negative fold changes. '
'Instead of performing a computation using both files, the scaled signal can '
'alternatively be output for the first or second file using '
'the \'--operation first\' or \'--operation second\'. (Default: %(default)s)',
default='log2',
choices=['log2', 'ratio', 'subtract', 'add', 'mean',
'reciprocal_ratio', 'first', 'second'],
required=False)

optional.add_argument('--pseudocount',
help='A small number to avoid x/0. Only useful '
'together with --operation log2 or --operation ratio. '
'You can specify different values as pseudocounts for '
'the numerator and the denominator by providing two '
'values (the first value is used as the numerator '
'pseudocount and the second the denominator pseudocount). (Default: %(default)s)',
default=[1],
type=float,
nargs='+',
action=parserCommon.requiredLength(1, 2),
required=False)

optional.add_argument('--skipZeroOverZero',
help='Skip bins where BOTH BAM files lack coverage. '
'This is determined BEFORE any applicable pseudocount '
'is added.',
action='store_true')

return parser


def process_args(args=None):
args = parseArguments().parse_args(args)

if args.smoothLength and args.smoothLength <= args.binSize:
print("Warning: the smooth length given ({}) is smaller than the bin "
"size ({}).\n\n No smoothing will be "
"done".format(args.smoothLength,
args.binSize))
args.smoothLength = None

if not args.ignoreForNormalization:
args.ignoreForNormalization = []

if not isinstance(args.pseudocount, list):
args.pseudocount = [args.pseudocount]

if len(args.pseudocount) == 1:
args.pseudocount *= 2

return args

# get_scale_factors function is used for scaling in bamCompare
# while get_scale_factor is used for depth normalization


def get_scale_factors(args, statsList, mappedList):

if args.scaleFactors:
scale_factors = list(map(float, args.scaleFactors.split(":")))
elif args.scaleFactorsMethod == 'SES':
scalefactors_dict = estimateScaleFactor(
[args.bamfile1, args.bamfile2],
args.sampleLength, args.numberOfSamples,
1,
mappingStatsList=mappedList,
blackListFileName=args.blackListFileName,
numberOfProcessors=args.numberOfProcessors,
verbose=args.verbose,
chrsToSkip=args.ignoreForNormalization)

scale_factors = scalefactors_dict['size_factors']

if args.verbose:
print("Size factors using SES: {}".format(scale_factors))
print("%s regions of size %s where used " %
(scalefactors_dict['sites_sampled'],
args.sampleLength))

print("ignoring filtering/blacklists, size factors if the number of mapped "
"reads would have been used:")
print(tuple(
float(min(mappedList)) / np.array(mappedList)))

elif args.scaleFactorsMethod == 'readCount':
# change the scaleFactor to 1.0
args.scaleFactor = 1.0
# get num of kept reads for bam file 1
args.bam = args.bamfile1
bam1_mapped, _ = get_num_kept_reads(args, statsList[0])
# get num of kept reads for bam file 2
args.bam = args.bamfile2
bam2_mapped, _ = get_num_kept_reads(args, statsList[1])

mapped_reads = [bam1_mapped, bam2_mapped]

# new scale_factors (relative to min of two bams)
scale_factors = float(min(bam1_mapped, bam2_mapped)) / np.array(mapped_reads)
if args.verbose:
print("Size factors using total number "
"of mapped reads: {}".format(scale_factors))

elif args.scaleFactorsMethod == 'None':
scale_factors = None

return scale_factors


def main(args=None):
"""
The algorithm is composed of two steps.
1. Per-sample scaling / depth Normalization:
+ If scaling is used (using the SES or read counts method), appropriate scaling
factors are determined to account for sequencing depth differences.
+ Optionally scaling can be turned off and individual samples could be depth normalized using
RPKM, BPM or CPM methods
2. Ratio calculation between two bam files:
+ The genome is transversed and computing
the log ratio/ratio/difference etc. for bins of fixed width
given by the user.
"""
args = process_args(args)

# Some sanity checks again
if not args.effectiveGenomeSize:
args.effectiveGenomeSize = 0
# If
if args.normalizeUsing:
print("Normalization of the bam files requested, turning off scaling.")
args.scaleFactorsMethod = 'None'
else:
args.normalizeUsing = 'None'

r_bamcompare(
args.bamfile1, # bam file 1
args.bamfile2, # bam file 2
args.outFileName, # output file
args.outFileFormat, # output file format
args.normalizeUsing, # normalization method
args.scaleFactorsMethod, # scaling method
args.effectiveGenomeSize, # effective genome size
args.numberOfProcessors, # threads
args.binSize, # bin size
[], # regions
True # verbose
)

# #if args.normalizeUsing == "RPGC":
# # sys.exit("RPGC normalization (--normalizeUsing RPGC) is not supported with bamCompare!")
# #if args.normalizeUsing == 'None':
# args.normalizeUsing = None # For the sake of sanity
# if args.scaleFactorsMethod != 'None' and args.normalizeUsing:
# sys.exit("`--normalizeUsing {}` is only valid if you also use `--scaleFactorsMethod None`! To prevent erroneous output, I will quit now.\n".format(args.normalizeUsing))

# # Get mapping statistics
# bam1, mapped1, unmapped1, stats1 = bamHandler.openBam(args.bamfile1, returnStats=True, nThreads=args.numberOfProcessors)
# bam1.close()
# bam2, mapped2, unmapped2, stats2 = bamHandler.openBam(args.bamfile2, returnStats=True, nThreads=args.numberOfProcessors)
# bam2.close()

# scale_factors = get_scale_factors(args, [stats1, stats2], [mapped1, mapped2])
# if scale_factors is None:
# # check whether one of the depth norm methods are selected
# if args.normalizeUsing is not None:
# args.scaleFactor = 1.0
# # if a normalization is required then compute the scale factors
# args.bam = args.bamfile1
# scale_factor_bam1 = get_scale_factor(args, stats1)
# args.bam = args.bamfile2
# scale_factor_bam2 = get_scale_factor(args, stats2)
# scale_factors = [scale_factor_bam1, scale_factor_bam2]
# else:
# scale_factors = [1, 1]
7 changes: 2 additions & 5 deletions pydeeptools/deeptools/bamCoverage2.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,8 +149,5 @@ def main(args=None):
args.numberOfProcessors, # threads
args.binSize, # bin size
[], # regions
True # verbose
)
import pyBigWig
a = pyBigWig.open(args.outFileName)
a.header()
args.verbose # verbose
)
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,6 @@ packages = ["deeptools"]
alignmentSieve = "deeptools.alignmentSieve:main"
bamCompare = "deeptools.bamCompare:main"
bamCoverage = "deeptools.bamCoverage:main"
bamCoverage2 = "deeptools.bamCoverage2:main"
bamPEFragmentSize = "deeptools.bamPEFragmentSize:main"
bigwigAverage = "deeptools.bigwigAverage:main"
bigwigCompare = "deeptools.bigwigCompare:main"
Expand All @@ -84,3 +83,5 @@ plotFingerprint = "deeptools.plotFingerprint:main"
plotHeatmap = "deeptools.plotHeatmap:main"
plotPCA = "deeptools.plotPCA:main"
plotProfile = "deeptools.plotProfile:main"
bamCoverage2 = "deeptools.bamCoverage2:main"
bamCompare2 = "deeptools.bamCompare2:main"
Loading

0 comments on commit 1a1e011

Please sign in to comment.