Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Maturin #1336

Merged
merged 23 commits into from
Nov 21, 2024
Merged

Maturin #1336

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -52,4 +52,11 @@ _sources
._.DS_Store

# Planemo
tool_test*
tool_test*

# local conda env folder
env.yml

# Rust stuff
Cargo.lock
target*
18 changes: 18 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Manifest for the "hp" Rust extension module, built with maturin and
# exposed to Python via pyo3 (imported as deeptools.hp).
[package]
name = "hp"
version = "0.1.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[lib]
name = "hp"
# cdylib: produce a C-compatible shared library loadable as a Python extension.
crate-type = ["cdylib"]

[dependencies]
pyo3 = { version = "0.22.5", features = ["extension-module"] }
rust-htslib = "0.47.0"  # presumably for BAM/CRAM access — confirm usage
rayon = "1.10.0"        # data parallelism
itertools = "0.12.1"
bigtools = "0.5.3"      # presumably for bigWig output — confirm usage
tokio = "*"             # NOTE(review): wildcard version — consider pinning
tempfile = "*"          # NOTE(review): wildcard version — consider pinning
18 changes: 18 additions & 0 deletions deeptools4.0.0_changes.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Counting

Read counting has changed slightly.
For -bs 1, the counting mechanism remains the same.
For -bs > 1, a read is split into multiple contiguous blocks:
- multiple blocks in 1 bin only count as 1
- multiple blocks in multiple bins count as 1 per bin
- one block spanning multiple bins counts as 1 in each bin

# Normalization

Exact scaling is no longer an option; it is always performed.

# Todo

- allow multithreaded bw writing
- properly divide region work over threads -> region sorting & taking size into account

File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
287 changes: 287 additions & 0 deletions pydeeptools/deeptools/bamCompare2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,287 @@
import argparse

import numpy as np

from deeptools import parserCommon
from deeptools.SES_scaleFactor import estimateScaleFactor
from deeptools.getScaleFactor import get_num_kept_reads
from deeptools.hp import r_bamcompare

def parseArguments():
    """Assemble the full bamCompare command-line parser.

    The parser is composed from the local required/optional argument
    groups plus the shared parent parsers provided by parserCommon.
    """
    # Order matters only for how the groups are displayed in --help.
    shared_parents = [
        getRequiredArgs(),
        parserCommon.output(),
        getOptionalArgs(),
        parserCommon.getParentArgParse(),
        parserCommon.normalization_options(),
        parserCommon.read_options(),
    ]
    return argparse.ArgumentParser(
        parents=shared_parents,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description='This tool compares two BAM files based on the number of '
        'mapped reads. To compare the BAM files, the genome is partitioned '
        'into bins of equal size, then the number of reads found in each bin'
        ' is counted per file, and finally a summary value is '
        'reported. This value can be the ratio of the number of reads per '
        'bin, the log2 of the ratio, or the difference. This tool can '
        'normalize the number of reads in each BAM file using the SES method '
        'proposed by Diaz et al. (2012) "Normalization, bias correction, and '
        'peak calling for ChIP-seq". Statistical Applications in Genetics '
        'and Molecular Biology, 11(3). Normalization based on read counts '
        'is also available. The output is either a bedgraph or bigWig file '
        'containing the bin location and the resulting comparison value. '
        'Note that *each end* in a pair (for paired-end reads) is treated '
        'independently. If this is undesirable, then use the --samFlagInclude '
        'or --samFlagExclude options.',
        usage='bamCompare -b1 treatment.bam -b2 control.bam -o log2ratio.bw\n'
        'help: bamCompare -h / bamCompare --help',
        add_help=False)


def getRequiredArgs():
    """Return a parent parser holding the two mandatory BAM-file options."""
    parser = argparse.ArgumentParser(add_help=False)
    group = parser.add_argument_group('Required arguments')

    # Treatment and control BAM files; both must be coordinate-sorted.
    group.add_argument('--bamfile1', '-b1',
                       metavar='BAM file',
                       required=True,
                       help='Sorted BAM file 1. Usually the BAM file '
                            'for the treatment.')
    group.add_argument('--bamfile2', '-b2',
                       metavar='BAM file',
                       required=True,
                       help='Sorted BAM file 2. Usually the BAM '
                            'file for the control.')

    return parser


def getOptionalArgs():
    """Return a parent parser holding bamCompare's optional arguments."""
    parser = argparse.ArgumentParser(add_help=False)
    grp = parser.add_argument_group('Optional arguments')

    # -h is re-declared here because the top-level parser uses add_help=False.
    grp.add_argument("--help", "-h", action="help",
                     help="show this help message and exit")

    # Between-sample scaling strategy.
    grp.add_argument('--scaleFactorsMethod',
                     choices=['readCount', 'SES', 'None'],
                     default='readCount',
                     help='Method to use to scale the samples. '
                     'If a method is specified, then it will be used to compensate '
                     'for sequencing depth differences between the samples. '
                     'As an alternative, this can be set to None and an option from '
                     '--normalizeUsing <method> can be used. (Default: %(default)s)')

    # SES-specific sampling parameters.
    grp.add_argument('--sampleLength', '-l',
                     default=1000,
                     type=int,
                     help='*Only relevant when SES is chosen for the '
                     'scaleFactorsMethod.* To compute the SES, specify '
                     'the length (in bases) of the regions (see --numberOfSamples) '
                     'that will be randomly sampled to calculate the scaling factors. '
                     'If you do not have a good sequencing depth for '
                     'your samples consider increasing the sampling '
                     'regions\' size to minimize the probability '
                     'that zero-coverage regions are used. (Default: %(default)s)')

    grp.add_argument('--numberOfSamples', '-n',
                     default=1e5,
                     type=int,
                     help='*Only relevant when SES is chosen for the '
                     'scaleFactorsMethod.* Number of samplings taken '
                     'from the genome to compute the scaling factors. (Default: %(default)s)')

    # Manual override of the computed scale factors.
    grp.add_argument('--scaleFactors',
                     default=None,
                     required=False,
                     help='Set this parameter manually to avoid the computation of '
                     'scaleFactors. The format is scaleFactor1:scaleFactor2.'
                     'For example, --scaleFactor 0.7:1 will cause the first BAM file to'
                     'be multiplied by 0.7, while not scaling '
                     'the second BAM file (multiplication with 1).')

    # How the two (scaled) signals are combined per bin.
    grp.add_argument('--operation',
                     default='log2',
                     choices=['log2', 'ratio', 'subtract', 'add', 'mean',
                              'reciprocal_ratio', 'first', 'second'],
                     required=False,
                     help='The default is to output the log2 ratio of the '
                     'two samples. The reciprocal ratio returns the '
                     'the negative of the inverse of the ratio '
                     'if the ratio is less than 0. The resulting '
                     'values are interpreted as negative fold changes. '
                     'Instead of performing a computation using both files, the scaled signal can '
                     'alternatively be output for the first or second file using '
                     'the \'--operation first\' or \'--operation second\'. (Default: %(default)s)')

    # One or two pseudocounts (numerator[, denominator]) for ratio operations.
    grp.add_argument('--pseudocount',
                     default=[1],
                     type=float,
                     nargs='+',
                     action=parserCommon.requiredLength(1, 2),
                     required=False,
                     help='A small number to avoid x/0. Only useful '
                     'together with --operation log2 or --operation ratio. '
                     'You can specify different values as pseudocounts for '
                     'the numerator and the denominator by providing two '
                     'values (the first value is used as the numerator '
                     'pseudocount and the second the denominator pseudocount). (Default: %(default)s)')

    grp.add_argument('--skipZeroOverZero',
                     action='store_true',
                     help='Skip bins where BOTH BAM files lack coverage. '
                     'This is determined BEFORE any applicable pseudocount '
                     'is added.')

    return parser


def process_args(args=None):
    """Parse the command line and reconcile interdependent options.

    args: optional list of argument strings (defaults to sys.argv via
        argparse). Returns the post-processed namespace.
    """
    parsed = parseArguments().parse_args(args)

    # A smoothing window no wider than one bin cannot smooth anything.
    if parsed.smoothLength and parsed.smoothLength <= parsed.binSize:
        print("Warning: the smooth length given ({}) is smaller than the bin "
              "size ({}).\n\n No smoothing will be "
              "done".format(parsed.smoothLength,
                            parsed.binSize))
        parsed.smoothLength = None

    # Downstream code expects a list here, never None/empty-falsy.
    if not parsed.ignoreForNormalization:
        parsed.ignoreForNormalization = []

    # Normalize pseudocount into a two-element [numerator, denominator] list.
    if not isinstance(parsed.pseudocount, list):
        parsed.pseudocount = [parsed.pseudocount]
    if len(parsed.pseudocount) == 1:
        parsed.pseudocount *= 2

    return parsed

# get_scale_factors function is used for scaling in bamCompare
# while get_scale_factor is used for depth normalization


def get_scale_factors(args, statsList, mappedList):
    """Compute the two per-BAM-file scale factors for bamCompare.

    Precedence: an explicit --scaleFactors "f1:f2" string wins; otherwise
    the method named by --scaleFactorsMethod ('SES', 'readCount' or 'None')
    is applied.

    NOTE: args is mutated in the readCount branch (.scaleFactor and .bam
    are set) so that get_num_kept_reads can be reused per file.

    args: parsed argparse namespace.
    statsList: per-file mapping statistics — presumably as returned by
        bamHandler.openBam(..., returnStats=True); confirm against callers.
    mappedList: number of mapped reads per file.

    Returns a length-2 sequence of scale factors (list or numpy array),
    or None when --scaleFactorsMethod is 'None'.
    """

    if args.scaleFactors:
        # User-supplied "factor1:factor2" overrides any computed scaling.
        scale_factors = list(map(float, args.scaleFactors.split(":")))
    elif args.scaleFactorsMethod == 'SES':
        # Estimate size factors by sampling genomic regions (SES method,
        # Diaz et al. 2012).
        scalefactors_dict = estimateScaleFactor(
            [args.bamfile1, args.bamfile2],
            args.sampleLength, args.numberOfSamples,
            1,
            mappingStatsList=mappedList,
            blackListFileName=args.blackListFileName,
            numberOfProcessors=args.numberOfProcessors,
            verbose=args.verbose,
            chrsToSkip=args.ignoreForNormalization)

        scale_factors = scalefactors_dict['size_factors']

        if args.verbose:
            print("Size factors using SES: {}".format(scale_factors))
            print("%s regions of size %s where used " %
                  (scalefactors_dict['sites_sampled'],
                   args.sampleLength))

            # For comparison: what plain mapped-read-count scaling would give.
            print("ignoring filtering/blacklists, size factors if the number of mapped "
                  "reads would have been used:")
            print(tuple(
                float(min(mappedList)) / np.array(mappedList)))

    elif args.scaleFactorsMethod == 'readCount':
        # change the scaleFactor to 1.0
        args.scaleFactor = 1.0
        # get num of kept reads for bam file 1
        args.bam = args.bamfile1
        bam1_mapped, _ = get_num_kept_reads(args, statsList[0])
        # get num of kept reads for bam file 2
        args.bam = args.bamfile2
        bam2_mapped, _ = get_num_kept_reads(args, statsList[1])

        mapped_reads = [bam1_mapped, bam2_mapped]

        # new scale_factors (relative to min of two bams)
        scale_factors = float(min(bam1_mapped, bam2_mapped)) / np.array(mapped_reads)
        if args.verbose:
            print("Size factors using total number "
                  "of mapped reads: {}".format(scale_factors))

    elif args.scaleFactorsMethod == 'None':
        scale_factors = None

    return scale_factors


def main(args=None):
    """
    The algorithm is composed of two steps.


    1. Per-sample scaling / depth Normalization:
       + If scaling is used (using the SES or read counts method), appropriate scaling
         factors are determined to account for sequencing depth differences.
       + Optionally scaling can be turned off and individual samples could be depth normalized using
         RPKM, BPM or CPM methods

    2. Ratio calculation between two bam files:
       + The genome is transversed and computing
         the log ratio/ratio/difference etc. for bins of fixed width
         given by the user.

    """
    args = process_args(args)

    # The Rust backend expects a numeric effective genome size; 0 signals
    # "not provided".
    if not args.effectiveGenomeSize:
        args.effectiveGenomeSize = 0

    # Per-sample depth normalization and between-sample scaling are mutually
    # exclusive: requesting --normalizeUsing disables scaling, and both
    # values are passed to the backend as the string 'None' when unused.
    if args.normalizeUsing:
        print("Normalization of the bam files requested, turning off scaling.")
        args.scaleFactorsMethod = 'None'
    else:
        args.normalizeUsing = 'None'

    # All heavy lifting (counting, normalization, comparison, output) is
    # delegated to the Rust extension.
    r_bamcompare(
        args.bamfile1,             # bam file 1
        args.bamfile2,             # bam file 2
        args.outFileName,          # output file
        args.outFileFormat,        # output file format
        args.normalizeUsing,       # normalization method
        args.scaleFactorsMethod,   # scaling method
        args.effectiveGenomeSize,  # effective genome size
        args.numberOfProcessors,   # threads
        args.binSize,              # bin size
        [],                        # regions
        True                       # verbose
    )
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,6 @@ def main(args=None):
chrsToSkip=args.ignoreForNormalization,
verbose=args.verbose,
)

wr.run(writeBedGraph.scaleCoverage, func_args, args.outFileName,
blackListFileName=args.blackListFileName,
format=args.outFileFormat, smoothLength=args.smoothLength)
Expand Down
Loading
Loading