Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cram handling illumina #2

Merged
merged 10 commits into from
Sep 16, 2024
62 changes: 35 additions & 27 deletions bin/generate_cram_csv.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,36 +7,42 @@
# Author = yy5
# ><((((°> Y ><((((°> U ><((((°> M ><((((°> I ><((((°>

# NOTE: chunk_size is the number of containers per chunk (not records/reads)

# Function to process chunking of a CRAM file

# Chunk a CRAM file into container ranges for parallel alignment.
#
# Arguments:
#   $1 - path to the CRAM file (may be a symlink)
#   $2 - starting chunk number
#   $3 - output CSV path (rows are appended)
#   $4 - path to the matching .crai index (may be a symlink)
#   $5 - chunk size, in number of CRAM containers (NOT records/reads)
# Outputs:
#   appends one CSV row per chunk: cram,crai,from,to,base,chunkn,rgline
#   echoes the next unused chunk number on stdout
chunk_cram() {
    local cram=$1
    local chunkn=$2
    local outcsv=$3
    local crai=$4
    local chunk_size=$5
    local realcram realcrai
    realcram=$(readlink -f "${cram}")
    realcrai=$(readlink -f "${crai}")

    # @RG header line; escape tabs and strip quotes so it survives the CSV.
    local rgline
    rgline=$(samtools view -H "${realcram}" | grep "@RG" | sed 's/\t/\\t/g' | sed "s/'//g")
    # The .crai index is gzip-compressed with one line per container.
    local ncontainers
    ncontainers=$(zcat "${realcrai}" | wc -l)
    local base
    base=$(basename "${realcram}" .cram)

    # Clamp the chunk size to the file, and never let it drop below 1:
    # a chunk_size of 0 (single-container CRAM) would leave `to` stuck
    # and loop forever.
    if [ "$chunk_size" -ge "$ncontainers" ]; then
        chunk_size=$((ncontainers - 1))
    fi
    if [ "$chunk_size" -lt 1 ]; then
        chunk_size=1
    fi
    local from=0
    local to=$((chunk_size - 1))

    while [ "$to" -lt "$ncontainers" ]; do
        echo "${realcram},${realcrai},${from},${to},${base},${chunkn},${rgline}" >> "$outcsv"
        from=$((to + 1))
        to=$((to + chunk_size))
        ((chunkn++))
    done

    # Catch any remainder (partial final chunk).
    if [ "$from" -lt "$ncontainers" ]; then
        to=$((ncontainers - 1))
        echo "${realcram},${realcrai},${from},${to},${base},${chunkn},${rgline}" >> "$outcsv"
        ((chunkn++))
    fi

    echo "$chunkn"
}

# Function to process a CRAM file
# Process one CRAM file: split it by read group when necessary, then emit
# chunk rows into the CSV via chunk_cram.
#
# Arguments:
#   $1 - CRAM file path
#   $2 - starting chunk number
#   $3 - output CSV path
#   $4 - .crai index of the CRAM
#   $5 - chunk size in containers
# Outputs:
#   echoes the next unused chunk number on stdout
process_cram_file() {
    local cram=$1
    local chunkn=$2
    local outcsv=$3
    local crai=$4
    local chunk_size=$5

    # Read-group IDs: the ID:xxx tokens on @RG header lines.
    local read_groups
    read_groups=$(samtools view -H "$cram" | grep '@RG' | awk '{for(i=1;i<=NF;i++){if($i ~ /^ID:/){print substr($i,4)}}}')
    local num_read_groups
    num_read_groups=$(echo "$read_groups" | wc -w)

    if [ "$num_read_groups" -gt 1 ]; then
        # Multiple read groups: extract each into its own CRAM and chunk it.
        local rg output_cram
        for rg in $read_groups; do
            output_cram="$(basename "${cram%.cram}")_output_${rg}.cram"
            samtools view -h -r "$rg" -o "$output_cram" "$cram"
            samtools index "$output_cram"
            # Chunk against the index of the per-RG subset, not the original
            # CRAM's index: container counts differ after splitting. Capture
            # the returned counter so chunk numbers keep advancing.
            chunkn=$(chunk_cram "$output_cram" "$chunkn" "$outcsv" "${output_cram}.crai" "$chunk_size")
        done
    else
        # Single read group or no read groups: chunk the file as-is.
        chunkn=$(chunk_cram "$cram" "$chunkn" "$outcsv" "$crai" "$chunk_size")
    fi

    echo "$chunkn"
}

# /\_/\ /\_/\
# ( o.o ) main ( o.o )
# > ^ < > ^ <

# cram, output csv and crai are required; chunk_size is optional.
if [ $# -lt 3 ]; then
    echo "Usage: $0 <cram_path> <output_csv> <crai_file> [chunk_size]" >&2
    exit 1
fi

cram=$1
outcsv=$2
crai=$3
# Default chunk size: 10000 containers per chunk.
chunk_size=${4:-10000}
chunkn=0

# Operates on a single CRAM file. Resolve symlinks first (inputs are
# typically staged as symlinks by the workflow engine).
realcram=$(readlink -f "$cram")
realcrai=$(readlink -f "$crai")

# Start from a clean CSV: chunk_cram appends, so a stale file would
# accumulate rows across reruns.
if [ -f "$outcsv" ]; then
    rm "$outcsv"
fi
process_cram_file "$realcram" "$chunkn" "$outcsv" "$realcrai" "$chunk_size"
47 changes: 34 additions & 13 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,6 @@ process {
ext.args = '-F 0x200 -nt'
}

withName: '.*:.*:ALIGN_HIC:BWAMEM2_MEM' {
ext.args = { "-5SPCp -R ${meta.read_group}" }
}

withName: '.*:.*:ALIGN_ILLUMINA:BWAMEM2_MEM' {
ext.args = { "-R ${meta.read_group}" }
}

withName: SAMTOOLS_MERGE {
ext.args = { "-c -p" }
ext.prefix = { "${meta.id}.merge" }
Expand Down Expand Up @@ -50,23 +42,47 @@ process {
ext.args = "--output-fmt cram"
}

withName: ".*:.*:HIC_BWAMEM2:CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT" {
withName: '.*:.*:ALIGN_HIC:BWAMEM2_MEM' {
ext.args = { "-5SPCp -R ${meta.read_group}" }
}

withName: '.*:.*:ALIGN_ILLUMINA:BWAMEM2_MEM' {
ext.args = { "-p -R ${meta.read_group}" }
}

withName: ".*:ALIGN_ILLUMINA:.*:CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT" {
ext.args = ""
ext.args1 = { "-F 0x200 -nt -1 '${base}_${chunkid}_1.fastq.gz'" }
ext.args1 = { "-F 0x200 -nt" }
ext.args2 = { "-p -R '${rglines}'" }
ext.args3 = "-mpu"
ext.args4 = { "--write-index -l1" }
}

withName: ".*:ALIGN_ILLUMINA:.*:CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT" {
ext.args = ""
ext.args1 = { "-F 0x200 -nt" }
ext.args2 = { "-ax sr" }
ext.args3 = "-mpu"
ext.args4 = { "--write-index -l1" }
}

withName: ".*:ALIGN_HIC:.*:CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT" {
ext.args = ""
ext.args1 = { "-F 0x200 -nt" }
ext.args2 = { "-5SPCp -R '${rglines}'" }
ext.args3 = "-mpu"
ext.args4 = { "--write-index -l1" }
}

withName: ".*:.*:HIC_MINIMAP2:CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT" {
withName: ".*:ALIGN_HIC:.*:CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT" {
ext.args = ""
ext.args1 = { "-F 0x200 -nt -1 '${base}_${chunkid}_1.fastq.gz'" }
ext.args1 = { "" }
ext.args2 = { "-ax sr" }
ext.args3 = "-mpu"
ext.args4 = { "--write-index -l1" }
}

withName: ".*:.*:HIC_MINIMAP2:MINIMAP2_INDEX" {
withName: "MINIMAP2_INDEX" {
ext.args = { "${fasta.size() > 2.5e9 ? (" -I " + Math.ceil(fasta.size()/1e9)+"G") : ""} "}
}

Expand Down Expand Up @@ -105,6 +121,11 @@ process {
ext.prefix = { "${bam.baseName}" }
}

withName: SAMTOOLS_ADDREPLACERG {
ext.prefix = { "${input.baseName}_addRG" }
ext.args = { "-r ${meta.read_group} --no-PG" }
}

withName: SAMTOOLS_STATS {
ext.prefix = { "${input.baseName}" }
}
Expand Down
2 changes: 1 addition & 1 deletion conf/test.config
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ params {
outdir = "${projectDir}/results"

// Aligner
hic_aligner = "minimap2"
short_aligner = "minimap2"

// Fasta references
fasta = "https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/assembly/release/mMelMel3.1_paternal_haplotype/GCA_922984935.2.subset.fasta.gz"
Expand Down
2 changes: 1 addition & 1 deletion modules/local/cram_filter_align_bwamem2_fixmate_sort.nf
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
process CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT {
tag "$meta.id"
tag "$meta.chunk_id"
label "process_high"

container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
process CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT {
tag "$meta.id"
tag "$meta.chunk_id"
label "process_high"

container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
Expand Down
8 changes: 5 additions & 3 deletions modules/local/generate_cram_csv.nf
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
process GENERATE_CRAM_CSV {
tag "${meta.id}"
// label 'process_tiny'
label 'process_single'

container 'quay.io/sanger-tol/cramfilter_bwamem2_minimap2_samtools_perl:0.001-c1'
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://quay.io/sanger-tol/cramfilter_bwamem2_minimap2_samtools_perl:0.001-c1' :
'sanger-tol/cramfilter_bwamem2_minimap2_samtools_perl:0.001-c1' }"

input:
tuple val(meta), path(crampath), path(crai)
Expand All @@ -15,7 +17,7 @@ process GENERATE_CRAM_CSV {
script:
def prefix = task.ext.prefix ?: "${meta.id}"
"""
generate_cram_csv.sh $crampath ${prefix}_cram.csv $crai
generate_cram_csv.sh $crampath ${prefix}_cram.csv $crai ${params.chunk_size}

cat <<-END_VERSIONS > versions.yml
"${task.process}":
Expand Down
56 changes: 56 additions & 0 deletions modules/local/samtools_addreplacerg.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
// Add or replace read-group (@RG) information on an alignment file using
// `samtools addreplacerg` (the @RG line itself is supplied via task.ext.args).
// Output keeps the input's extension, so only the matching optional
// channel (bam/cram/sam) is populated.
process SAMTOOLS_ADDREPLACERG {
    tag "$meta.id"
    label 'process_low'

    conda "bioconda::samtools=1.20"
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/samtools:1.20--h50ea8bc_0' :
        'biocontainers/samtools:1.20--h50ea8bc_0' }"

    input:
    tuple val(meta), path(input)

    output:
    tuple val(meta), path("${prefix}.bam"),  emit: bam,  optional: true
    tuple val(meta), path("${prefix}.cram"), emit: cram, optional: true
    tuple val(meta), path("${prefix}.sam"),  emit: sam,  optional: true
    path "versions.yml", emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    def args = task.ext.args ?: ''
    // prefix/file_type are unscoped (no `def`) because the output: block reads them.
    prefix = task.ext.prefix ?: "${meta.id}"
    file_type = input.getExtension()
    if ("$input" == "${prefix}.${file_type}") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!"
    """
    samtools \\
        addreplacerg \\
        --threads $task.cpus \\
        $args \\
        -o ${prefix}.${file_type} \\
        $input

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//')
    END_VERSIONS
    """

    stub:
    def args = task.ext.args ?: ''
    prefix = task.ext.prefix ?: "${meta.id}"
    file_type = input.getExtension()
    if ("$input" == "${prefix}.${file_type}") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!"

    // FIX: removed stray `${index}` interpolation — the variable was never
    // defined in this process, so every stub run crashed on it.
    """
    touch ${prefix}.${file_type}

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//')
    END_VERSIONS
    """
}
5 changes: 4 additions & 1 deletion nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,11 @@ params {
bwamem2_index = null
fasta = null
header = null

// Aligner option
hic_aligner = "minimap2" // Can choose minimap2 and bwamem2
short_aligner = "minimap2" // Can choose minimap2 and bwamem2
chunk_size = 10000

// Output format options
outfmt = "cram"
compression = "crumble"
Expand Down
30 changes: 23 additions & 7 deletions nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -54,13 +54,6 @@
"fa_icon": "fas fa-envelope",
"help_text": "Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits. If set in your user config file (`~/.nextflow/config`) then you don't need to specify this on the command line for every run.",
"pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$"
},
"hic_aligner": {
"type": "string",
"description": "The aligner to be used for HiC reads.",
"enum": ["bwamem2", "minimap2"],
"default": "minimap2",
"fa_icon": "fas fa-code"
}
}
},
Expand Down Expand Up @@ -93,6 +86,26 @@
}
}
},
"aligner_options": {
"title": "Aligner options",
"type": "object",
"description": "",
"default": "",
"properties": {
"short_aligner": {
"type": "string",
"default": "minimap2",
"description": "Alignment method to use for short-reads (options are: \"minimap2\" or \"bwamem2\"",
"enum": ["minimap2", "bwamem2"]
},
"chunk_size": {
"type": "integer",
"default": 10000,
"description": "Chunk size for parallel processing of CRAM files, in number of containers",
"minimum": 1
}
}
},
"execution": {
"title": "Execution",
"type": "object",
Expand Down Expand Up @@ -292,6 +305,9 @@
{
"$ref": "#/definitions/reference_genome_options"
},
{
"$ref": "#/definitions/aligner_options"
},
{
"$ref": "#/definitions/execution"
},
Expand Down
Loading
Loading