From 9734e661df5bb8129bd0ca52e0e0be09e3a9cec4 Mon Sep 17 00:00:00 2001
From: reichan1998 <baochauduong1311@gmail.com>
Date: Fri, 30 Aug 2024 20:46:47 +0700
Subject: [PATCH 01/15] Adapting hic_mapping.nf from TreeVal to implement
 Minimap2 for HiC data and chunking

---
 bin/awk_filter_reads.sh                       |   1 +
 bin/filter_five_end.pl                        | 109 ++++++++++++++++
 bin/generate_cram_csv.sh                      |  86 +++++++++++++
 bin/grep_pg.sh                                |  10 ++
 conf/base.config                              |  21 ++++
 conf/modules.config                           |  28 +++++
 conf/test.config                              |   3 +
 modules.json                                  |   5 +
 .../cram_filter_align_bwamem2_fixmate_sort.nf |  55 +++++++++
 ...filter_minimap2_filter5end_fixmate_sort.nf |  59 +++++++++
 modules/local/generate_cram_csv.nf            |  35 ++++++
 .../nf-core/minimap2/index/environment.yml    |   7 ++
 modules/nf-core/minimap2/index/main.nf        |  44 +++++++
 modules/nf-core/minimap2/index/meta.yml       |  43 +++++++
 .../nf-core/minimap2/index/tests/main.nf.test |  32 +++++
 .../minimap2/index/tests/main.nf.test.snap    |  68 ++++++++++
 modules/nf-core/minimap2/index/tests/tags.yml |   2 +
 nextflow.config                               |   3 +-
 seq_cache_populate.pl                         |   0
 subworkflows/local/align_short_hic.nf         | 116 ++++++++++++++++++
 subworkflows/local/hic_bwamem2.nf             |  90 ++++++++++++++
 subworkflows/local/hic_minimap2.nf            | 101 +++++++++++++++
 workflows/readmapping.nf                      |   2 +-
 23 files changed, 918 insertions(+), 2 deletions(-)
 create mode 100755 bin/awk_filter_reads.sh
 create mode 100644 bin/filter_five_end.pl
 create mode 100755 bin/generate_cram_csv.sh
 create mode 100755 bin/grep_pg.sh
 create mode 100644 modules/local/cram_filter_align_bwamem2_fixmate_sort.nf
 create mode 100644 modules/local/cram_filter_minimap2_filter5end_fixmate_sort.nf
 create mode 100644 modules/local/generate_cram_csv.nf
 create mode 100644 modules/nf-core/minimap2/index/environment.yml
 create mode 100644 modules/nf-core/minimap2/index/main.nf
 create mode 100644 modules/nf-core/minimap2/index/meta.yml
 create mode 100644 modules/nf-core/minimap2/index/tests/main.nf.test
 create mode 100644 modules/nf-core/minimap2/index/tests/main.nf.test.snap
 create mode 100644 modules/nf-core/minimap2/index/tests/tags.yml
 create mode 100644 seq_cache_populate.pl
 create mode 100644 subworkflows/local/align_short_hic.nf
 create mode 100644 subworkflows/local/hic_bwamem2.nf
 create mode 100644 subworkflows/local/hic_minimap2.nf
diff --git a/bin/awk_filter_reads.sh b/bin/awk_filter_reads.sh
new file mode 100755
index 0000000..3066f11
--- /dev/null
+++ b/bin/awk_filter_reads.sh
@@ -0,0 +1 @@
+awk 'BEGIN{OFS="\t"}{if($1 ~ /^\@/) {print($0)} else {$2=and($2,compl(2048)); print(substr($0,2))}}'
\ No newline at end of file
diff --git a/bin/filter_five_end.pl b/bin/filter_five_end.pl
new file mode 100644
index 0000000..0beb2ac
--- /dev/null
+++ b/bin/filter_five_end.pl
@@ -0,0 +1,109 @@
+#!/usr/bin/perl
+use strict;
+use warnings;
+
+my $prev_id = "";
+my @five;
+my @three;
+my @unmap;
+my @mid;
+my @all;
+my $counter = 0;
+
+while (<STDIN>){
+    chomp;
+    if (/^@/){
+        print $_."\n";
+        next;
+    }
+    my ($id, $flag, $chr_from, $loc_from, $mapq, $cigar, $d1, $d2, $d3, $read, $read_qual, @rest) = split /\t/;
+    my $bin = reverse(dec2bin($flag));
+    my @binary = split(//,$bin);
+    if ($prev_id ne $id && $prev_id ne ""){
+        if ($counter == 1){
+            if (@five == 1){
+                print $five[0]."\n";
+            }
+            else{
+                my ($id_1, $flag_1, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1) = split /\t/, $all[0];
+                my $bin_1 = reverse(dec2bin($flag_1));
+                my @binary_1 = split(//,$bin_1);
+                $binary_1[2] = 1;
+                my $bin_1_new = reverse(join("",@binary_1));
+                my $flag_1_new =  bin2dec($bin_1_new);
+                print(join("\t",$id_1, $flag_1_new, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1)."\n");
+            }
+        }
+        elsif ($counter == 2 && @five == 1){
+            print $five[0]."\n";
+        }
+        else{
+            my ($id_1, $flag_1, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1) = split /\t/, $all[0];
+            my $bin_1 = reverse(dec2bin($flag_1));
+            my @binary_1 = split(//,$bin_1);
+            $binary_1[2] = 1;
+            my $bin_1_new = reverse(join("",@binary_1));
+            my $flag_1_new =  bin2dec($bin_1_new);
+            print(join("\t",$id_1, $flag_1_new, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1)."\n");
+        }
+
+        $counter = 0;
+        undef @unmap;
+        undef @five;
+        undef @three;
+        undef @mid;
+        undef @all;
+    }
+
+    $counter++;
+    $prev_id = $id;
+    push @all,$_;
+    if ($binary[2]==1){
+        push @unmap,$_;
+    }
+    elsif ($binary[4]==0 && $cigar =~ m/^[0-9]*M/ || $binary[4]==1 && $cigar =~ m/.*M$/){
+        push @five, $_;
+    }
+    elsif ($binary[4]==1 && $cigar =~ m/^[0-9]*M/ || $binary[4]==0 && $cigar =~ m/.*M$/){
+        push @three, $_;
+    }
+    elsif ($cigar =~ m/^[0-9]*[HS].*M.*[HS]$/){
+        push @mid, $_;
+    }
+}
+
+if ($counter == 1){
+    if (@five == 1){
+        print $five[0]."\n";
+    }
+    else{
+        my ($id_1, $flag_1, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1) = split /\t/, $all[0];
+        my $bin_1 = reverse(dec2bin($flag_1));
+        my @binary_1 = split(//,$bin_1);
+        $binary_1[2] = 1;
+        my $bin_1_new = reverse(join("",@binary_1));
+        my $flag_1_new =  bin2dec($bin_1_new);
+        print(join("\t",$id_1, $flag_1_new, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1)."\n");
+    }
+}
+elsif ($counter == 2 && @five == 1){
+    print $five[0]."\n";
+}
+else{
+    my ($id_1, $flag_1, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1) = split /\t/, $all[0];
+    my $bin_1 = reverse(dec2bin($flag_1));
+    my @binary_1 = split(//,$bin_1);
+    $binary_1[2] = 1;
+    my $bin_1_new = reverse(join("",@binary_1));
+    my $flag_1_new =  bin2dec($bin_1_new);
+    print(join("\t",$id_1, $flag_1_new, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1)."\n");
+}
+
+sub dec2bin {
+    my $str = unpack("B32", pack("N", shift));
+    return $str;
+}
+
+sub bin2dec {
+    return unpack("N", pack("B32", substr("0" x 32 . shift, -32)));
+}
\ No newline at end of file
diff --git a/bin/generate_cram_csv.sh b/bin/generate_cram_csv.sh
new file mode 100755
index 0000000..1a951da
--- /dev/null
+++ b/bin/generate_cram_csv.sh
@@ -0,0 +1,86 @@
+#!/bin/bash
+
+# generate_cram_csv.sh
+# -------------------
+# Generate a csv file describing the CRAM folder
+# ><((((°>    Y    ><((((°>    U     ><((((°>    M     ><((((°>     I     ><((((°>
+# Author = yy5
+# ><((((°>    Y    ><((((°>    U     ><((((°>    M     ><((((°>     I     ><((((°>
+
+# Function to process chunking of a CRAM file
+chunk_cram() {
+    local cram=$1
+    local chunkn=$2
+    local outcsv=$3
+    realcram=$(readlink -f ${cram})
+    realcrai=$(readlink -f ${cram}.crai)
+    local rgline=$(samtools view -H "${realcram}" | grep "@RG" | sed 's/\t/\\t/g' | sed "s/'//g")
+    # local ncontainers=$(zcat "${realcrai}" | wc -l)
+    local ncontainers=$(cat "${realcram}" | wc -l)
+    local base=$(basename "${realcram}" .cram)
+    local from=0
+    local to=10000
+
+
+    while [ $to -lt $ncontainers ]; do
+        echo "${realcram},${realcrai},${from},${to},${base},${chunkn},${rgline}" >> $outcsv
+        from=$((to + 1))
+        ((to += 10000))
+        ((chunkn++))
+    done
+
+    if [ $from -le $ncontainers ]; then
+        echo "${realcram},${realcrai},${from},${ncontainers},${base},${chunkn},${rgline}" >> $outcsv
+        ((chunkn++))
+    fi
+
+    echo $chunkn
+}
+
+# Function to process a CRAM file
+process_cram_file() {
+    local cram=$1
+    local chunkn=$2
+    local outcsv=$3
+
+    local read_groups=$(samtools view -H "$cram" | grep '@RG' | awk '{for(i=1;i<=NF;i++){if($i ~ /^ID:/){print substr($i,4)}}}')
+    local num_read_groups=$(echo "$read_groups" | wc -w)
+
+    if [ "$num_read_groups" -gt 1 ]; then
+        # Multiple read groups: process each separately
+        for rg in $read_groups; do
+            local output_cram="$(basename "${cram%.cram}")_output_${rg}.cram"
+            samtools view -h -r "$rg" -o "$output_cram" "$cram"
+            samtools index "$output_cram"
+            chunkn=$(chunk_cram "$output_cram" "$chunkn" "$outcsv")
+        done
+    else
+        # Single read group or no read groups
+        chunkn=$(chunk_cram "$cram" "$chunkn" "$outcsv")
+    fi
+
+    echo $chunkn
+}
+
+#  /\_/\        /\_/\
+# ( o.o ) main ( o.o )
+#  > ^ <        > ^ <
+
+# Check if cram_path is provided
+if [ -z "$1" ]; then
+    echo "Usage: $0 <cram_path>"
+    exit 1
+fi
+
+# cram_path=$1
+cram=$1
+chunkn=0
+outcsv=$2
+
+# # Loop through each CRAM file in the specified directory. cram cannot be the synlinked cram
+# for cram in ${cram_path}/*.cram; do
+#     realcram=$(readlink -f $cram)
+#     chunkn=$(process_cram_file $realcram $chunkn $outcsv)
+# done
+realcram=$(readlink -f $cram)
+chunkn=$(process_cram_file $realcram $chunkn $outcsv)
diff --git a/bin/grep_pg.sh b/bin/grep_pg.sh
new file mode 100755
index 0000000..8f76e23
--- /dev/null
+++ b/bin/grep_pg.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+# grep_pg.sh
+# -------------------
+# A shell script to exclude pg lines and label read 1 and read 2 from cram containers
+#
+# -------------------
+# Author = yy5
+
+grep -v "^\@PG" | awk '{if($1 ~ /^\@/) {print($0)} else {if(and($2,64)>0) {print(1$0)} else {print(2$0)}}}'
\ No newline at end of file
diff --git a/conf/base.config b/conf/base.config
index ca753f3..33cc764 100644
--- a/conf/base.config
+++ b/conf/base.config
@@ -52,6 +52,12 @@ process {
         time   = { check_max( 2.h * Math.ceil( meta.read_count / 100000000 ) * task.attempt / log_increase_cpus(2, 6*task.attempt, 1, 2), 'time' ) }
     }
 
+    withName: SAMTOOLS_INDEX {
+        cpus   = { log_increase_cpus(2, 6*task.attempt, 1, 2) }
+        memory = { check_max( 4.GB + 850.MB * log_increase_cpus(2, 6*task.attempt, 1, 2) * task.attempt + 0.6.GB * Math.ceil( meta.read_count / 100000000 ), 'memory' ) }
+        time   = { check_max( 2.h * Math.ceil( meta.read_count / 100000000 ) * task.attempt / log_increase_cpus(2, 6*task.attempt, 1, 2), 'time' ) }
+    }
+    
     withName: BLAST_BLASTN {
         time   = { check_max(          2.hour  * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'time'   ) }
         memory = { check_max( 100.MB + 20.MB   * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'memory' ) }
@@ -88,6 +94,21 @@ process {
         time   = { check_max(        1.h  * Math.ceil( meta.read_count   / 1000000   ) * task.attempt, 'time'   ) }
     }
 
+    withName: CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT {
+        cpus    = { check_max( 16       * 1                                                            , 'cpus'   ) }
+        memory  = { check_max( 1.GB     * ( reference.size() < 2e9 ? 50 : Math.ceil( ( reference.size() / 1e+9 ) * 20 ) * Math.ceil( task.attempt * 1 ) ) , 'memory') }
+    }
+
+    withName: CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT {
+        cpus    = { check_max( 16       * 1                                                 , 'cpus'   ) }
+        memory  = { check_max( 1.GB     * ( reference.size() < 2e9 ? 50 : Math.ceil( ( reference.size() / 1e+9 ) * 20 ) * Math.ceil( task.attempt * 1 ) ) , 'memory') }
+    }
+
+    withName: MINIMAP2_INDEX {
+        cpus    = { check_max( 2    * task.attempt, 'cpus'    ) }
+        memory  = { check_max( 1.GB * Math.ceil( 30 * fasta.size() / 1e+9 ) * task.attempt, 'memory' ) }
+    }
+
     withName: CRUMBLE {
         // No correlation between memory usage and the number of reads or the genome size.
         // Most genomes seem happy with 1 GB, then some with 2 GB, then some with 5 GB.
diff --git a/conf/modules.config b/conf/modules.config
index a2d4464..88180ed 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -46,6 +46,34 @@ process {
         ext.args = "-be '[rq]>=0.99' -x fi -x fp -x ri -x rp --write-index"
     }
 
+    withName: CONVERT_CRAM {
+        ext.args = "--output-fmt cram"
+    }
+
+    withName: SAMTOOLS_INDEX {
+        ext.args = "-C --output-fmt cram --write-index"
+    }
+
+    withName: ".*:.*:HIC_BWAMEM2:CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT" {
+        ext.args        = ""
+        ext.args1       = "-F0xB00 -nt"
+        ext.args2       = { "-5SPCp -H'${rglines}'" }
+        ext.args3       = "-mpu"
+        ext.args4       = { "--write-index -l1" }
+    }
+
+    withName: ".*:.*:HIC_MINIMAP2:CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT" {
+        ext.args        = ""
+        ext.args1       = ""
+        ext.args2       = { "-ax sr" }
+        ext.args3       = "-mpu"
+        ext.args4       = { "--write-index -l1" }
+    }
+
+    withName: ".*:.*:HIC_MINIMAP2:MINIMAP2_INDEX" {
+        ext.args        = { "${fasta.size() > 2.5e9 ? (" -I " + Math.ceil(fasta.size()/1e9)+"G") : ""} "}
+    }
+
         // minimap2 2.24 can only work with genomes up to 4 Gbp. For larger genomes, add the -I option with the genome size in Gbp.
         // In fact, we can also use -I to *decrease* the memory requirements for smaller genomes
         // NOTE: minimap2 uses the decimal system ! 1G = 1,000,000,000 bp
diff --git a/conf/test.config b/conf/test.config
index d78e008..e59b389 100644
--- a/conf/test.config
+++ b/conf/test.config
@@ -25,6 +25,9 @@ params {
     // Output directory
     outdir = "${projectDir}/results"
 
+    // Aligner
+    hic_aligner = "minimap2"
+
     // Fasta references
     fasta = "https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/assembly/release/mMelMel3.1_paternal_haplotype/GCA_922984935.2.subset.fasta.gz"
     header = "https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/assembly/release/mMelMel3.1_paternal_haplotype/GCA_922984935.2.subset.header.sam"
diff --git a/modules.json b/modules.json
index b7bb93d..8896416 100644
--- a/modules.json
+++ b/modules.json
@@ -40,6 +40,11 @@
                         "git_sha": "a33ef9475558c6b8da08c5f522ddaca1ec810306",
                         "installed_by": ["modules"]
                     },
+                    "minimap2/index": {
+                        "branch": "master",
+                        "git_sha": "72e277acfd9e61a9f1368eafb4a9e83f5bcaa9f5",
+                        "installed_by": ["modules"]
+                    },
                     "samtools/faidx": {
                         "branch": "master",
                         "git_sha": "04fbbc7c43cebc0b95d5b126f6d9fe4effa33519",
diff --git a/modules/local/cram_filter_align_bwamem2_fixmate_sort.nf b/modules/local/cram_filter_align_bwamem2_fixmate_sort.nf
new file mode 100644
index 0000000..666b8bd
--- /dev/null
+++ b/modules/local/cram_filter_align_bwamem2_fixmate_sort.nf
@@ -0,0 +1,55 @@
+process CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT {
+    tag "$meta.id"
+    label "process_high"
+
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/mulled-v2-1a6fe65bd6674daba65066aa796ed8f5e8b4687b:688e175eb0db54de17822ba7810cc9e20fa06dd5-0' :
+        'biocontainers/mulled-v2-1a6fe65bd6674daba65066aa796ed8f5e8b4687b:688e175eb0db54de17822ba7810cc9e20fa06dd5-0' }"
+
+    input:
+    tuple val(meta), path(cramfile), path(cramindex), val(from), val(to), val(base), val(chunkid), val(rglines), val(bwaprefix), path(reference)
+
+    output:
+    tuple val(meta), path("*.bam"), emit: mappedbam
+    path "versions.yml"           , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def args1 = task.ext.args1 ?: ''
+    def args2 = task.ext.args2 ?: ''
+    def args3 = task.ext.args3 ?: ''
+    def args4 = task.ext.args4 ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    // Please be aware one of the tools here required mem = 28 * reference size!!!
+    """
+    cram_filter -n ${from}-${to} ${cramfile} - | \\
+        samtools fastq ${args1} | \\
+        bwa-mem2 mem -p ${bwaprefix} -t${task.cpus} -5SPCp -H'${rglines}' - | \\
+        samtools fixmate ${args3} - - | \\
+        samtools sort ${args4} -@${task.cpus} -T ${base}_${chunkid}_sort_tmp -o ${prefix}_${base}_${chunkid}_mem.bam -
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//' )
+        bwa-mem2: \$(bwa-mem2 --version | sed 's/bwa-mem2 //g')
+    END_VERSIONS
+    """
+    // temp removal staden_io_lib: \$(echo \$(staden_io_lib --version 2>&1) | sed 's/^.*staden_io_lib //; s/Using.*\$//') CAUSES ERROR
+
+    stub:
+    def prefix  = task.ext.prefix ?: "${meta.id}"
+    def base    = "45022_3#2"
+    def chunkid = "1"
+    """
+    touch ${prefix}_${base}_${chunkid}_mem.bam
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//' )
+        bwamem2: \$(echo \$(bwa-mem2 version 2>&1) | sed 's/.* //')
+    END_VERSIONS
+    """
+}
\ No newline at end of file
diff --git a/modules/local/cram_filter_minimap2_filter5end_fixmate_sort.nf b/modules/local/cram_filter_minimap2_filter5end_fixmate_sort.nf
new file mode 100644
index 0000000..2127880
--- /dev/null
+++ b/modules/local/cram_filter_minimap2_filter5end_fixmate_sort.nf
@@ -0,0 +1,59 @@
+process CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT {
+    tag "$meta.id"
+    label "process_high"
+
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/mulled-v2-1a6fe65bd6674daba65066aa796ed8f5e8b4687b:688e175eb0db54de17822ba7810cc9e20fa06dd5-0' :
+        'biocontainers/mulled-v2-1a6fe65bd6674daba65066aa796ed8f5e8b4687b:688e175eb0db54de17822ba7810cc9e20fa06dd5-0' }"
+
+    input:
+    tuple val(meta), path(cramfile), path(cramindex), val(from), val(to), val(base), val(chunkid), val(rglines), val(ref), path(reference)
+
+    output:
+    tuple val(meta), path("*.bam"), emit: mappedbam
+    path "versions.yml"           , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def args1 = task.ext.args1 ?: ''
+    def args2 = task.ext.args2 ?: ''
+    def args3 = task.ext.args3 ?: ''
+    def args4 = task.ext.args4 ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def VERSION = "1.15" // Staden_io versions break the pipeline
+    """
+    cram_filter -n ${from}-${to} ${cramfile} - | \\
+        samtools fastq ${args1} - |  \\
+        minimap2 -t${task.cpus} -R '${rglines}' ${args2} ${ref} - |  \\
+        ${projectDir}/bin/grep_pg.sh |  \\
+        perl ${projectDir}/bin/filter_five_end.pl |  \\
+        ${projectDir}/bin/awk_filter_reads.sh |  \\
+        samtools fixmate ${args3} - - | \\
+        samtools sort ${args4} -@${task.cpus} -T ${base}_${chunkid}_sort_tmp -o ${prefix}_${base}_${chunkid}_mm.bam -
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//' )
+        minimap2: \$(minimap2 --version | sed 's/minimap2 //g')
+        staden_io: $VERSION
+    END_VERSIONS
+    """
+
+    stub:
+    def prefix  = task.ext.prefix ?: "${meta.id}"
+    def base    = "45022_3#2"
+    def chunkid = "1"
+    """
+    touch ${prefix}_${base}_${chunkid}_mm.bam
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//' )
+        minimap2: \$(echo \$(minimap2 version 2>&1) | sed 's/.* //')
+        staden_io: $VERSION
+    END_VERSIONS
+    """
+}
\ No newline at end of file
diff --git a/modules/local/generate_cram_csv.nf b/modules/local/generate_cram_csv.nf
new file mode 100644
index 0000000..3e6e224
--- /dev/null
+++ b/modules/local/generate_cram_csv.nf
@@ -0,0 +1,35 @@
+process GENERATE_CRAM_CSV {
+    tag "${meta.id}"
+    // label 'process_tiny'
+
+    container 'quay.io/sanger-tol/cramfilter_bwamem2_minimap2_samtools_perl:0.001-c1'
+
+    input:
+    tuple val(meta), path(crampath)
+
+
+    output:
+    tuple val(meta), path('*.csv'), emit: csv
+    path "versions.yml",            emit: versions
+
+    script:
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    generate_cram_csv.sh $crampath ${prefix}_cram.csv
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//' )
+    END_VERSIONS
+    """
+
+    stub:
+    """
+    touch ${meta.id}.csv
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//' )
+    END_VERSIONS
+    """
+}
\ No newline at end of file
diff --git a/modules/nf-core/minimap2/index/environment.yml b/modules/nf-core/minimap2/index/environment.yml
new file mode 100644
index 0000000..8a912a1
--- /dev/null
+++ b/modules/nf-core/minimap2/index/environment.yml
@@ -0,0 +1,7 @@
+name: minimap2_index
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - bioconda::minimap2=2.28
diff --git a/modules/nf-core/minimap2/index/main.nf b/modules/nf-core/minimap2/index/main.nf
new file mode 100644
index 0000000..3832021
--- /dev/null
+++ b/modules/nf-core/minimap2/index/main.nf
@@ -0,0 +1,44 @@
+process MINIMAP2_INDEX {
+    label 'process_low'
+
+    // Note: the versions here need to match the versions used in minimap2/align
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/minimap2:2.28--he4a0461_0' :
+        'biocontainers/minimap2:2.28--he4a0461_0' }"
+
+    input:
+    tuple val(meta), path(fasta)
+
+    output:
+    tuple val(meta), path("*.mmi"), emit: index
+    path "versions.yml"           , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    """
+    minimap2 \\
+        -t $task.cpus \\
+        -d ${fasta.baseName}.mmi \\
+        $args \\
+        $fasta
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        minimap2: \$(minimap2 --version 2>&1)
+    END_VERSIONS
+    """
+
+    stub:
+    """
+    touch ${fasta.baseName}.mmi
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        minimap2: \$(minimap2 --version 2>&1)
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/minimap2/index/meta.yml b/modules/nf-core/minimap2/index/meta.yml
new file mode 100644
index 0000000..1d29e3f
--- /dev/null
+++ b/modules/nf-core/minimap2/index/meta.yml
@@ -0,0 +1,43 @@
+name: minimap2_index
+description: Provides fasta index required by minimap2 alignment.
+keywords:
+  - index
+  - fasta
+  - reference
+tools:
+  - minimap2:
+      description: |
+        A versatile pairwise aligner for genomic and spliced nucleotide sequences.
+      homepage: https://github.com/lh3/minimap2
+      documentation: https://github.com/lh3/minimap2#uguide
+      licence: ["MIT"]
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - fasta:
+      type: file
+      description: |
+        Reference database in FASTA format.
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - index:
+      type: file
+      description: Minimap2 fasta index.
+      pattern: "*.mmi"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+authors:
+  - "@yuukiiwa"
+  - "@drpatelh"
+maintainers:
+  - "@yuukiiwa"
+  - "@drpatelh"
diff --git a/modules/nf-core/minimap2/index/tests/main.nf.test b/modules/nf-core/minimap2/index/tests/main.nf.test
new file mode 100644
index 0000000..97840ff
--- /dev/null
+++ b/modules/nf-core/minimap2/index/tests/main.nf.test
@@ -0,0 +1,32 @@
+nextflow_process {
+
+    name "Test Process MINIMAP2_INDEX"
+    script "../main.nf"
+    process "MINIMAP2_INDEX"
+
+    tag "modules"
+    tag "modules_nfcore"
+    tag "minimap2"
+    tag "minimap2/index"
+
+    test("minimap2 index") {
+
+        when {
+            process {
+                """
+                input[0] = [
+                    [ id:'test' ],
+                    file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true)
+                ]
+                """
+            }
+        }
+
+        then {
+            assert process.success
+            assert snapshot(process.out).match()
+        }
+
+    }
+
+}
\ No newline at end of file
diff --git a/modules/nf-core/minimap2/index/tests/main.nf.test.snap b/modules/nf-core/minimap2/index/tests/main.nf.test.snap
new file mode 100644
index 0000000..0b09882
--- /dev/null
+++ b/modules/nf-core/minimap2/index/tests/main.nf.test.snap
@@ -0,0 +1,68 @@
+{
+    "Should run without failures": {
+        "content": [
+            {
+                "0": [
+                    [
+                        {
+                            "id": "test_ref"
+                        },
+                        "genome.mmi:md5,72e450f12dc691e763c697463bdb1571"
+                    ]
+                ],
+                "1": [
+                    "versions.yml:md5,0fced0ee8015e7f50b82566e3db8f7b0"
+                ],
+                "index": [
+                    [
+                        {
+                            "id": "test_ref"
+                        },
+                        "genome.mmi:md5,72e450f12dc691e763c697463bdb1571"
+                    ]
+                ],
+                "versions": [
+                    "versions.yml:md5,0fced0ee8015e7f50b82566e3db8f7b0"
+                ]
+            }
+        ],
+        "meta": {
+            "nf-test": "0.8.4",
+            "nextflow": "23.10.1"
+        },
+        "timestamp": "2024-03-18T11:46:30.000058092"
+    },
+    "minimap2 index": {
+        "content": [
+            {
+                "0": [
+                    [
+                        {
+                            "id": "test"
+                        },
+                        "genome.mmi:md5,72e450f12dc691e763c697463bdb1571"
+                    ]
+                ],
+                "1": [
+                    "versions.yml:md5,2f8340380c6741e9261a284262a90bde"
+                ],
+                "index": [
+                    [
+                        {
+                            "id": "test"
+                        },
+                        "genome.mmi:md5,72e450f12dc691e763c697463bdb1571"
+                    ]
+                ],
+                "versions": [
+                    "versions.yml:md5,2f8340380c6741e9261a284262a90bde"
+                ]
+            }
+        ],
+        "meta": {
+            "nf-test": "0.8.4",
+            "nextflow": "23.10.1"
+        },
+        "timestamp": "2024-04-05T10:58:29.828187662"
+    }
+}
\ No newline at end of file
diff --git a/modules/nf-core/minimap2/index/tests/tags.yml b/modules/nf-core/minimap2/index/tests/tags.yml
new file mode 100644
index 0000000..e5ef8e1
--- /dev/null
+++ b/modules/nf-core/minimap2/index/tests/tags.yml
@@ -0,0 +1,2 @@
+minimap2/index:
+  - modules/nf-core/minimap2/index/**
diff --git a/nextflow.config b/nextflow.config
index 437ff82..8c3c4d0 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -15,7 +15,8 @@ params {
     bwamem2_index              = null
     fasta                      = null
     header                     = null
-
+    // Aligner option
+    hic_aligner                = "minimap2" // Can choose minimap2 and bwamem2
     // Output format options
     outfmt                     = "cram"
     compression                = "crumble"
diff --git a/seq_cache_populate.pl b/seq_cache_populate.pl
new file mode 100644
index 0000000..e69de29
diff --git a/subworkflows/local/align_short_hic.nf b/subworkflows/local/align_short_hic.nf
new file mode 100644
index 0000000..4a4a31d
--- /dev/null
+++ b/subworkflows/local/align_short_hic.nf
@@ -0,0 +1,116 @@
+//
+// Align short read (HiC and Illumina) data against the genome
+//
+
+include { SAMTOOLS_FASTQ                        } from '../../modules/nf-core/samtools/fastq/main'
+include { SAMTOOLS_MERGE                        } from '../../modules/nf-core/samtools/merge/main'
+include { SAMTOOLS_SORMADUP                     } from '../../modules/local/samtools_sormadup'
+include { SAMTOOLS_VIEW as SAMTOOLS_INDEX       } from '../../modules/nf-core/samtools/view/main'
+include { GENERATE_CRAM_CSV                     } from '../../modules/local/generate_cram_csv'
+include { SAMTOOLS_SORMADUP as CONVERT_CRAM     } from '../../modules/local/samtools_sormadup'
+include { HIC_MINIMAP2                          } from '../../subworkflows/local/hic_minimap2'
+include { HIC_BWAMEM2                           } from '../../subworkflows/local/hic_bwamem2'
+
+workflow ALIGN_SHORT_HIC {
+    take: 
+    fasta    // channel: [ val(meta), /path/to/fasta ] reference_tuple
+    index    // channel: [ val(meta), /path/to/bwamem2/ ]
+    reads    // channel: [ val(meta), /path/to/datafile ] hic_reads_path
+
+
+    main:
+    ch_versions = Channel.empty()
+    ch_merged_bam   = Channel.empty()
+
+    // Check file types and branch
+    reads
+    | branch {
+        meta, reads ->
+            fastq : reads.findAll { it.getName().toLowerCase() =~ /.*f.*\.gz/ }
+            cram : true
+    }
+    | set { ch_reads }
+
+
+    // Convert FASTQ to CRAM only if FASTQ were provided as input
+    CONVERT_CRAM ( ch_reads.fastq, fasta )
+    ch_versions = ch_versions.mix (    CONVERT_CRAM.out.versions.first() )
+
+    CONVERT_CRAM.out.bam
+    | mix ( ch_reads.cram )
+    | map { meta, cram -> [ meta, cram, [] ] }
+    | set { ch_reads_cram }
+
+    // Index the CRAM file
+    SAMTOOLS_INDEX ( ch_reads_cram, fasta, [] )
+    ch_versions         = ch_versions.mix( SAMTOOLS_INDEX.out.versions )
+
+    SAMTOOLS_INDEX.out.cram
+    | join ( SAMTOOLS_INDEX.out.crai )
+    | set { ch_reads_cram }
+
+
+    //
+    // MODULE: generate a CRAM CSV file containing the required parametres for CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT
+    //
+    GENERATE_CRAM_CSV( ch_reads_cram )
+    ch_versions         = ch_versions.mix( GENERATE_CRAM_CSV.out.versions )
+
+
+    //
+    // SUBWORKFLOW: mapping hic reads using minimap2 or bwamem2
+    //
+    if (params.hic_aligner == 'minimap2') {
+        HIC_MINIMAP2 (
+            fasta,
+            GENERATE_CRAM_CSV.out.csv,
+            index
+        )
+        ch_versions         = ch_versions.mix( HIC_MINIMAP2.out.versions )
+        ch_merged_bam           = ch_merged_bam.mix(HIC_MINIMAP2.out.mergedbam)
+    } else {
+        HIC_BWAMEM2 (
+            fasta,
+            GENERATE_CRAM_CSV.out.csv,
+            index
+        )
+        ch_versions         = ch_versions.mix( HIC_BWAMEM2.out.versions )
+        ch_merged_bam           = ch_merged_bam.mix(HIC_BWAMEM2.out.mergedbam)
+    }
+    
+    ch_merged_bam
+    | combine( ch_reads_cram )
+    | map { meta_bam, bam, meta_cram, cram, crai -> [ meta_cram, bam ] }
+    | set { ch_merged_bam }
+    
+    // // Collect all BWAMEM2 output by sample name
+    ch_merged_bam
+    | map { meta, bam -> [['id': meta.id.split('_')[0..-2].join('_'), 'datatype': meta.datatype], meta.read_count, bam] }
+    | groupTuple( by: [0] )
+    | map { meta, read_counts, bams -> [meta + [read_count: read_counts.sum()], bams] }
+    | branch {
+        meta, bams ->
+            single_bam: bams.size() == 1
+            multi_bams: true
+    }
+    | set { ch_bams }
+
+
+    // Merge, but only if there is more than 1 file
+    SAMTOOLS_MERGE ( ch_bams.multi_bams, [ [], [] ], [ [], [] ] )
+    ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions.first() )
+
+
+    SAMTOOLS_MERGE.out.bam
+    | mix ( ch_bams.single_bam )
+    | set { ch_bam }
+
+
+    // Mark duplicates
+    SAMTOOLS_SORMADUP ( ch_bam, fasta )
+    ch_versions = ch_versions.mix ( SAMTOOLS_SORMADUP.out.versions )
+
+    emit:
+    bam      = SAMTOOLS_SORMADUP.out.bam     // channel: [ val(meta), /path/to/bam ]
+    versions = ch_versions                   // channel: [ versions.yml ]
+}
diff --git a/subworkflows/local/hic_bwamem2.nf b/subworkflows/local/hic_bwamem2.nf
new file mode 100644
index 0000000..8a06f44
--- /dev/null
+++ b/subworkflows/local/hic_bwamem2.nf
@@ -0,0 +1,90 @@
+#!/usr/bin/env nextflow
+
+//
+// MODULE IMPORT BLOCK
+//
+include { BWAMEM2_INDEX                                   } from '../../modules/nf-core/bwamem2/index/main'
+include { CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT          } from '../../modules/local/cram_filter_align_bwamem2_fixmate_sort'
+include { SAMTOOLS_MERGE                                  } from '../../modules/nf-core/samtools/merge/main'
+
+workflow HIC_BWAMEM2 {
+    take:
+    fasta     // Channel: tuple [ val(meta), path( file )      ]
+    csv_ch
+    index
+
+
+    main:
+    ch_versions         = Channel.empty()
+    mappedbam_ch        = Channel.empty()
+
+    BWAMEM2_INDEX (
+        fasta
+        )
+    ch_versions         = ch_versions.mix( BWAMEM2_INDEX.out.versions )
+
+    csv_ch
+        .splitCsv()
+        .combine ( fasta )
+        .combine ( BWAMEM2_INDEX.out.index )
+        .map{ cram_id, cram_info, ref_id, ref_dir, bwa_id, bwa_path ->
+            tuple([
+                    id: cram_id.id
+                    ],
+                file(cram_info[0]),
+                cram_info[1],
+                cram_info[2],
+                cram_info[3],
+                cram_info[4],
+                cram_info[5],
+                cram_info[6],
+                bwa_path.toString() + '/' + ref_dir.toString().split('/')[-1],
+                ref_dir
+            )
+    }
+    .set { ch_filtering_input }
+
+    //
+    // MODULE: map hic reads by 10,000 container per time using bwamem2
+    //
+    CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT (
+        ch_filtering_input
+
+    )
+    ch_versions         = ch_versions.mix( CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT.out.versions )
+    mappedbam_ch        = CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT.out.mappedbam
+
+    //
+    // LOGIC: PREPARING BAMS FOR MERGE
+    //
+    mappedbam_ch
+        .map{ meta, file ->
+            tuple( file )
+        }
+        .collect()
+        .map { file ->
+            tuple (
+                [
+                id: file[0].toString().split('/')[-1].split('_')[0] + '_' + file[0].toString().split('/')[-1].split('_')[1]
+                ],
+                file
+            )
+        }
+        .set { collected_files_for_merge }
+
+    
+    //
+    // MODULE: MERGE POSITION SORTED BAM FILES AND MARK DUPLICATES
+    //
+    SAMTOOLS_MERGE (
+        collected_files_for_merge,
+        fasta,
+        index
+    )
+    ch_versions         = ch_versions.mix ( SAMTOOLS_MERGE.out.versions.first() )
+
+
+    emit:
+    mergedbam           = SAMTOOLS_MERGE.out.bam
+    versions            = ch_versions.ifEmpty(null)
+}
\ No newline at end of file
diff --git a/subworkflows/local/hic_minimap2.nf b/subworkflows/local/hic_minimap2.nf
new file mode 100644
index 0000000..ddda9ac
--- /dev/null
+++ b/subworkflows/local/hic_minimap2.nf
@@ -0,0 +1,101 @@
+#!/usr/bin/env nextflow
+
+//
+// MODULE IMPORT BLOCK
+//
+include { CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT    } from '../../modules/local/cram_filter_minimap2_filter5end_fixmate_sort'
+include { SAMTOOLS_MERGE                                  } from '../../modules/nf-core/samtools/merge/main'
+include { MINIMAP2_INDEX                                  } from '../../modules/nf-core/minimap2/index/main'
+
+
+workflow HIC_MINIMAP2 {
+    take:
+    fasta    // Channel: tuple [ val(meta), path( file )      ]
+    csv_ch
+    index
+
+
+    main:
+    ch_versions         = Channel.empty()
+    mappedbam_ch        = Channel.empty()
+
+
+    //
+    // MODULE: generate minimap2 mmi file 
+    //       
+    MINIMAP2_INDEX (
+        fasta
+        )
+    ch_versions         = ch_versions.mix( MINIMAP2_INDEX.out.versions )
+
+
+    //
+    // LOGIC: generate input channel for mapping
+    // 
+    csv_ch
+        .splitCsv()
+        .combine ( fasta )
+        .combine ( MINIMAP2_INDEX.out.index )
+        .map{ cram_id, cram_info, ref_id, ref_dir, mmi_id, mmi_path->
+            tuple([
+                    id: cram_id.id
+                    ],
+                file(cram_info[0]),
+                cram_info[1],
+                cram_info[2],
+                cram_info[3],
+                cram_info[4],
+                cram_info[5],
+                cram_info[6],
+                mmi_path.toString(),
+                ref_dir
+            )
+    }
+    .set { ch_filtering_input }
+
+
+    //
+    // MODULE: map hic reads by 10,000 container per time
+    // 
+    CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT (
+        ch_filtering_input
+
+    )
+    ch_versions         = ch_versions.mix( CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT.out.versions )
+    mappedbam_ch        = CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT.out.mappedbam
+
+
+    //
+    // LOGIC: PREPARING BAMS FOR MERGE
+    //
+    mappedbam_ch
+        .map{ meta, file ->
+            tuple( file )
+        }
+        .collect()
+        .map { file ->
+            tuple (
+                [
+                id: file[0].toString().split('/')[-1].split('_')[0] + '_' + file[0].toString().split('/')[-1].split('_')[1]
+                ],
+                file
+            )
+        }
+        .set { collected_files_for_merge }
+
+
+    //
+    // MODULE: MERGE POSITION SORTED BAM FILES AND MARK DUPLICATES
+    //
+    SAMTOOLS_MERGE (
+        collected_files_for_merge,
+        fasta,
+        index
+    )
+    ch_versions         = ch_versions.mix ( SAMTOOLS_MERGE.out.versions.first() )
+    
+    
+    emit:
+    mergedbam           = SAMTOOLS_MERGE.out.bam
+    versions            = ch_versions.ifEmpty(null)
+}
\ No newline at end of file
diff --git a/workflows/readmapping.nf b/workflows/readmapping.nf
index c70a406..bbc0d23 100644
--- a/workflows/readmapping.nf
+++ b/workflows/readmapping.nf
@@ -18,7 +18,7 @@ include { SAMTOOLS_REHEADER           } from '../modules/local/samtools_replaceh
 
 include { INPUT_CHECK                   } from '../subworkflows/local/input_check'
 include { PREPARE_GENOME                } from '../subworkflows/local/prepare_genome'
-include { ALIGN_SHORT as ALIGN_HIC      } from '../subworkflows/local/align_short'
+include { ALIGN_SHORT_HIC as ALIGN_HIC      } from '../subworkflows/local/align_short_hic'
 include { ALIGN_SHORT as ALIGN_ILLUMINA } from '../subworkflows/local/align_short'
 include { ALIGN_PACBIO as ALIGN_HIFI    } from '../subworkflows/local/align_pacbio'
 include { ALIGN_PACBIO as ALIGN_CLR     } from '../subworkflows/local/align_pacbio'

From 168566fc2e276c567b642c6c85e3a611f7f32189 Mon Sep 17 00:00:00 2001
From: Tyler Chafin <tc25@sanger.ac.uk>
Date: Mon, 2 Sep 2024 16:24:24 +0100
Subject: [PATCH 02/15] generate csv seems to be working now?

---
 assets/samplesheet_s3.csv                     |   3 -
 conf/modules.config                           |   4 -
 modules.json                                  |  93 +++++--
 modules/local/generate_cram_csv.nf            |   4 +-
 .../nf-core/samtools/index/environment.yml    |   8 +
 modules/nf-core/samtools/index/main.nf        |  49 ++++
 modules/nf-core/samtools/index/meta.yml       |  57 ++++
 .../samtools/index/tests/csi.nextflow.config  |   7 +
 .../nf-core/samtools/index/tests/main.nf.test | 140 ++++++++++
 .../samtools/index/tests/main.nf.test.snap    | 250 ++++++++++++++++++
 modules/nf-core/samtools/index/tests/tags.yml |   2 +
 subworkflows/local/align_short_hic.nf         | 148 ++++++-----
 12 files changed, 668 insertions(+), 97 deletions(-)
 create mode 100644 modules/nf-core/samtools/index/environment.yml
 create mode 100644 modules/nf-core/samtools/index/main.nf
 create mode 100644 modules/nf-core/samtools/index/meta.yml
 create mode 100644 modules/nf-core/samtools/index/tests/csi.nextflow.config
 create mode 100644 modules/nf-core/samtools/index/tests/main.nf.test
 create mode 100644 modules/nf-core/samtools/index/tests/main.nf.test.snap
 create mode 100644 modules/nf-core/samtools/index/tests/tags.yml

diff --git a/assets/samplesheet_s3.csv b/assets/samplesheet_s3.csv
index ff9641b..c059c97 100644
--- a/assets/samplesheet_s3.csv
+++ b/assets/samplesheet_s3.csv
@@ -2,6 +2,3 @@ sample,datatype,datafile,library
 mMelMel1,illumina,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/genomic_data/mMelMel1/illumina/31231_3%231.subset.cram,
 mMelMel2,illumina,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/genomic_data/mMelMel2/illumina/31231_4%231.subset.fastq.gz,
 mMelMel3,hic,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/genomic_data/mMelMel3/hic/35528_2%231.subset.cram,
-mMelMel3,ont,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/genomic_data/mMelMel3/ont/PAE35587_pass_1f1f0707_115.subset.fastq.gz,
-mMelMel3,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/genomic_data/mMelMel3/pacbio/m64094_200910_173211.ccs.bc1022_BAK8B_OA--bc1022_BAK8B_OA.subset.bam,
-mMelMel3,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/genomic_data/mMelMel3/pacbio/m64094_200911_174739.ccs.bc1022_BAK8B_OA--bc1022_BAK8B_OA.subset.fastq.gz,
diff --git a/conf/modules.config b/conf/modules.config
index 88180ed..e7ed388 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -50,10 +50,6 @@ process {
         ext.args = "--output-fmt cram"
     }
 
-    withName: SAMTOOLS_INDEX {
-        ext.args = "-C --output-fmt cram --write-index"
-    }
-
     withName: ".*:.*:HIC_BWAMEM2:CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT" {
         ext.args        = ""
         ext.args1       = "-F0xB00 -nt"
diff --git a/modules.json b/modules.json
index 8896416..05aa1cc 100644
--- a/modules.json
+++ b/modules.json
@@ -8,93 +8,136 @@
                     "blast/blastn": {
                         "branch": "master",
                         "git_sha": "583edaf97c9373a20df05a3b7be5a6677f9cd719",
-                        "installed_by": ["modules"]
+                        "installed_by": [
+                            "modules"
+                        ]
                     },
                     "bwamem2/index": {
                         "branch": "master",
                         "git_sha": "7081e04c18de9480948d34513a1c1e2d0fa9126d",
-                        "installed_by": ["modules"]
+                        "installed_by": [
+                            "modules"
+                        ]
                     },
                     "bwamem2/mem": {
                         "branch": "master",
                         "git_sha": "2201e21b09213f083832ac58e33353d410a6fde7",
-                        "installed_by": ["modules"]
+                        "installed_by": [
+                            "modules"
+                        ]
                     },
                     "crumble": {
                         "branch": "master",
                         "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5",
-                        "installed_by": ["modules"]
+                        "installed_by": [
+                            "modules"
+                        ]
                     },
                     "custom/dumpsoftwareversions": {
                         "branch": "master",
                         "git_sha": "82024cf6325d2ee194e7f056d841ecad2f6856e9",
-                        "installed_by": ["modules"]
+                        "installed_by": [
+                            "modules"
+                        ]
                     },
                     "gunzip": {
                         "branch": "master",
                         "git_sha": "4e5f4687318f24ba944a13609d3ea6ebd890737d",
-                        "installed_by": ["modules"]
+                        "installed_by": [
+                            "modules"
+                        ]
                     },
                     "minimap2/align": {
                         "branch": "master",
                         "git_sha": "a33ef9475558c6b8da08c5f522ddaca1ec810306",
-                        "installed_by": ["modules"]
+                        "installed_by": [
+                            "modules"
+                        ]
                     },
                     "minimap2/index": {
                         "branch": "master",
                         "git_sha": "72e277acfd9e61a9f1368eafb4a9e83f5bcaa9f5",
-                        "installed_by": ["modules"]
+                        "installed_by": [
+                            "modules"
+                        ]
                     },
                     "samtools/faidx": {
                         "branch": "master",
                         "git_sha": "04fbbc7c43cebc0b95d5b126f6d9fe4effa33519",
-                        "installed_by": ["modules"]
+                        "installed_by": [
+                            "modules"
+                        ]
                     },
                     "samtools/fastq": {
                         "branch": "master",
                         "git_sha": "04fbbc7c43cebc0b95d5b126f6d9fe4effa33519",
-                        "installed_by": ["modules"]
+                        "installed_by": [
+                            "modules"
+                        ]
                     },
                     "samtools/flagstat": {
                         "branch": "master",
                         "git_sha": "46eca555142d6e597729fcb682adcc791796f514",
-                        "installed_by": ["modules"]
+                        "installed_by": [
+                            "modules"
+                        ]
                     },
                     "samtools/idxstats": {
                         "branch": "master",
                         "git_sha": "46eca555142d6e597729fcb682adcc791796f514",
-                        "installed_by": ["modules"]
+                        "installed_by": [
+                            "modules"
+                        ]
+                    },
+                    "samtools/index": {
+                        "branch": "master",
+                        "git_sha": "46eca555142d6e597729fcb682adcc791796f514",
+                        "installed_by": [
+                            "modules"
+                        ]
                     },
                     "samtools/merge": {
                         "branch": "master",
                         "git_sha": "04fbbc7c43cebc0b95d5b126f6d9fe4effa33519",
-                        "installed_by": ["modules"],
+                        "installed_by": [
+                            "modules"
+                        ],
                         "patch": "modules/nf-core/samtools/merge/samtools-merge.diff"
                     },
                     "samtools/stats": {
                         "branch": "master",
                         "git_sha": "1fe379cf6e6c1e6fa5e32bcbeefea0f1e874dac6",
-                        "installed_by": ["modules"]
+                        "installed_by": [
+                            "modules"
+                        ]
                     },
                     "samtools/view": {
                         "branch": "master",
                         "git_sha": "6c2309aaec566c0d44a6cf14d4b2d0c51afe2e91",
-                        "installed_by": ["modules"]
+                        "installed_by": [
+                            "modules"
+                        ]
                     },
                     "seqkit/fq2fa": {
                         "branch": "master",
                         "git_sha": "03fbf6c89e551bd8d77f3b751fb5c955f75b34c5",
-                        "installed_by": ["modules"]
+                        "installed_by": [
+                            "modules"
+                        ]
                     },
                     "seqtk/subseq": {
                         "branch": "master",
                         "git_sha": "730f3aee80d5f8d0b5fc532202ac59361414d006",
-                        "installed_by": ["modules"]
+                        "installed_by": [
+                            "modules"
+                        ]
                     },
                     "untar": {
                         "branch": "master",
                         "git_sha": "4e5f4687318f24ba944a13609d3ea6ebd890737d",
-                        "installed_by": ["modules"]
+                        "installed_by": [
+                            "modules"
+                        ]
                     }
                 }
             },
@@ -103,20 +146,26 @@
                     "utils_nextflow_pipeline": {
                         "branch": "master",
                         "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa",
-                        "installed_by": ["subworkflows"]
+                        "installed_by": [
+                            "subworkflows"
+                        ]
                     },
                     "utils_nfcore_pipeline": {
                         "branch": "master",
                         "git_sha": "92de218a329bfc9a9033116eb5f65fd270e72ba3",
-                        "installed_by": ["subworkflows"]
+                        "installed_by": [
+                            "subworkflows"
+                        ]
                     },
                     "utils_nfvalidation_plugin": {
                         "branch": "master",
                         "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa",
-                        "installed_by": ["subworkflows"]
+                        "installed_by": [
+                            "subworkflows"
+                        ]
                     }
                 }
             }
         }
     }
-}
+}
\ No newline at end of file
diff --git a/modules/local/generate_cram_csv.nf b/modules/local/generate_cram_csv.nf
index 3e6e224..c1d8965 100644
--- a/modules/local/generate_cram_csv.nf
+++ b/modules/local/generate_cram_csv.nf
@@ -5,7 +5,7 @@ process GENERATE_CRAM_CSV {
     container 'quay.io/sanger-tol/cramfilter_bwamem2_minimap2_samtools_perl:0.001-c1'
 
     input:
-    tuple val(meta), path(crampath)
+    tuple val(meta), path(crampath), path(crai)
 
 
     output:
@@ -32,4 +32,4 @@ process GENERATE_CRAM_CSV {
         samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//' )
     END_VERSIONS
     """
-}
\ No newline at end of file
+}
diff --git a/modules/nf-core/samtools/index/environment.yml b/modules/nf-core/samtools/index/environment.yml
new file mode 100644
index 0000000..260d516
--- /dev/null
+++ b/modules/nf-core/samtools/index/environment.yml
@@ -0,0 +1,8 @@
+name: samtools_index
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - bioconda::samtools=1.20
+  - bioconda::htslib=1.20
diff --git a/modules/nf-core/samtools/index/main.nf b/modules/nf-core/samtools/index/main.nf
new file mode 100644
index 0000000..e002585
--- /dev/null
+++ b/modules/nf-core/samtools/index/main.nf
@@ -0,0 +1,49 @@
+process SAMTOOLS_INDEX {
+    tag "$meta.id"
+    label 'process_low'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/samtools:1.20--h50ea8bc_0' :
+        'biocontainers/samtools:1.20--h50ea8bc_0' }"
+
+    input:
+    tuple val(meta), path(input)
+
+    output:
+    tuple val(meta), path("*.bai") , optional:true, emit: bai
+    tuple val(meta), path("*.csi") , optional:true, emit: csi
+    tuple val(meta), path("*.crai"), optional:true, emit: crai
+    path  "versions.yml"           , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    """
+    samtools \\
+        index \\
+        -@ ${task.cpus-1} \\
+        $args \\
+        $input
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//')
+    END_VERSIONS
+    """
+
+    stub:
+    def args = task.ext.args ?: ''
+    def extension = file(input).getExtension() == 'cram' ?
+                    "crai" : args.contains("-c") ?  "csi" : "bai"
+    """
+    touch ${input}.${extension}
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//')
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/samtools/index/meta.yml b/modules/nf-core/samtools/index/meta.yml
new file mode 100644
index 0000000..01a4ee0
--- /dev/null
+++ b/modules/nf-core/samtools/index/meta.yml
@@ -0,0 +1,57 @@
+name: samtools_index
+description: Index SAM/BAM/CRAM file
+keywords:
+  - index
+  - bam
+  - sam
+  - cram
+tools:
+  - samtools:
+      description: |
+        SAMtools is a set of utilities for interacting with and post-processing
+        short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li.
+        These files are generated as output by short read aligners like BWA.
+      homepage: http://www.htslib.org/
+      documentation: http://www.htslib.org/doc/samtools.html
+      doi: 10.1093/bioinformatics/btp352
+      licence: ["MIT"]
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - bam:
+      type: file
+      description: BAM/CRAM/SAM file
+      pattern: "*.{bam,cram,sam}"
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - bai:
+      type: file
+      description: BAM/CRAM/SAM index file
+      pattern: "*.{bai,crai,sai}"
+  - crai:
+      type: file
+      description: BAM/CRAM/SAM index file
+      pattern: "*.{bai,crai,sai}"
+  - csi:
+      type: file
+      description: CSI index file
+      pattern: "*.{csi}"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+authors:
+  - "@drpatelh"
+  - "@ewels"
+  - "@maxulysse"
+maintainers:
+  - "@drpatelh"
+  - "@ewels"
+  - "@maxulysse"
diff --git a/modules/nf-core/samtools/index/tests/csi.nextflow.config b/modules/nf-core/samtools/index/tests/csi.nextflow.config
new file mode 100644
index 0000000..0ed260e
--- /dev/null
+++ b/modules/nf-core/samtools/index/tests/csi.nextflow.config
@@ -0,0 +1,7 @@
+process {
+
+    withName: SAMTOOLS_INDEX {
+        ext.args = '-c'
+    }
+
+}
diff --git a/modules/nf-core/samtools/index/tests/main.nf.test b/modules/nf-core/samtools/index/tests/main.nf.test
new file mode 100644
index 0000000..ca34fb5
--- /dev/null
+++ b/modules/nf-core/samtools/index/tests/main.nf.test
@@ -0,0 +1,140 @@
+nextflow_process {
+
+    name "Test Process SAMTOOLS_INDEX"
+    script "../main.nf"
+    process "SAMTOOLS_INDEX"
+    tag "modules"
+    tag "modules_nfcore"
+    tag "samtools"
+    tag "samtools/index"
+
+    test("bai") {
+        when {
+            process {
+                """
+                input[0] = Channel.of([
+                    [ id:'test', single_end:false ], // meta map
+                    file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true)
+                ])
+                """
+            }
+        }
+
+        then {
+            assertAll (
+                { assert process.success },
+                { assert snapshot(process.out).match() }
+            )
+        }
+    }
+
+    test("crai") {
+        when {
+            process {
+                """
+                input[0] = Channel.of([
+                    [ id:'test', single_end:false ], // meta map
+                    file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/cram/test.paired_end.recalibrated.sorted.cram', checkIfExists: true)
+                ])
+                """
+            }
+        }
+
+        then {
+            assertAll (
+                { assert process.success },
+                { assert snapshot(process.out).match() }
+            )
+        }
+    }
+
+    test("csi") {
+        config "./csi.nextflow.config"
+
+        when {
+            process {
+                """
+                input[0] = Channel.of([
+                    [ id:'test', single_end:false ], // meta map
+                    file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true)
+                ])
+                """
+            }
+        }
+
+        then {
+            assertAll (
+                { assert process.success },
+                { assert snapshot(
+                    file(process.out.csi[0][1]).name,
+                    process.out.versions
+                ).match() }
+            )
+        }
+    }
+
+    test("bai - stub") {
+        options "-stub"
+        when {
+            process {
+                """
+                input[0] = Channel.of([
+                    [ id:'test', single_end:false ], // meta map
+                    file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true)
+                ])
+                """
+            }
+        }
+
+        then {
+            assertAll (
+                { assert process.success },
+                { assert snapshot(process.out).match() }
+            )
+        }
+    }
+
+    test("crai - stub") {
+        options "-stub"
+        when {
+            process {
+                """
+                input[0] = Channel.of([
+                    [ id:'test', single_end:false ], // meta map
+                    file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/cram/test.paired_end.recalibrated.sorted.cram', checkIfExists: true)
+                ])
+                """
+            }
+        }
+
+        then {
+            assertAll (
+                { assert process.success },
+                { assert snapshot(process.out).match() }
+            )
+        }
+    }
+
+    test("csi - stub") {
+        options "-stub"
+        config "./csi.nextflow.config"
+
+        when {
+            process {
+                """
+                input[0] = Channel.of([
+                    [ id:'test', single_end:false ], // meta map
+                    file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true)
+                ])
+                """
+            }
+        }
+
+        then {
+            assertAll (
+                { assert process.success },
+                { assert snapshot(process.out).match() }
+            )
+        }
+    }
+}
diff --git a/modules/nf-core/samtools/index/tests/main.nf.test.snap b/modules/nf-core/samtools/index/tests/main.nf.test.snap
new file mode 100644
index 0000000..799d199
--- /dev/null
+++ b/modules/nf-core/samtools/index/tests/main.nf.test.snap
@@ -0,0 +1,250 @@
+{
+    "csi - stub": {
+        "content": [
+            {
+                "0": [
+                    
+                ],
+                "1": [
+                    [
+                        {
+                            "id": "test",
+                            "single_end": false
+                        },
+                        "test.paired_end.sorted.bam.csi:md5,d41d8cd98f00b204e9800998ecf8427e"
+                    ]
+                ],
+                "2": [
+                    
+                ],
+                "3": [
+                    "versions.yml:md5,802c9776d9c5e95314e888cf18e96d77"
+                ],
+                "bai": [
+                    
+                ],
+                "crai": [
+                    
+                ],
+                "csi": [
+                    [
+                        {
+                            "id": "test",
+                            "single_end": false
+                        },
+                        "test.paired_end.sorted.bam.csi:md5,d41d8cd98f00b204e9800998ecf8427e"
+                    ]
+                ],
+                "versions": [
+                    "versions.yml:md5,802c9776d9c5e95314e888cf18e96d77"
+                ]
+            }
+        ],
+        "meta": {
+            "nf-test": "0.9.0",
+            "nextflow": "24.04.3"
+        },
+        "timestamp": "2024-07-22T16:51:53.9057"
+    },
+    "crai - stub": {
+        "content": [
+            {
+                "0": [
+                    
+                ],
+                "1": [
+                    
+                ],
+                "2": [
+                    [
+                        {
+                            "id": "test",
+                            "single_end": false
+                        },
+                        "test.paired_end.recalibrated.sorted.cram.crai:md5,d41d8cd98f00b204e9800998ecf8427e"
+                    ]
+                ],
+                "3": [
+                    "versions.yml:md5,802c9776d9c5e95314e888cf18e96d77"
+                ],
+                "bai": [
+                    
+                ],
+                "crai": [
+                    [
+                        {
+                            "id": "test",
+                            "single_end": false
+                        },
+                        "test.paired_end.recalibrated.sorted.cram.crai:md5,d41d8cd98f00b204e9800998ecf8427e"
+                    ]
+                ],
+                "csi": [
+                    
+                ],
+                "versions": [
+                    "versions.yml:md5,802c9776d9c5e95314e888cf18e96d77"
+                ]
+            }
+        ],
+        "meta": {
+            "nf-test": "0.9.0",
+            "nextflow": "24.04.3"
+        },
+        "timestamp": "2024-07-22T16:51:45.931558"
+    },
+    "bai - stub": {
+        "content": [
+            {
+                "0": [
+                    [
+                        {
+                            "id": "test",
+                            "single_end": false
+                        },
+                        "test.paired_end.sorted.bam.bai:md5,d41d8cd98f00b204e9800998ecf8427e"
+                    ]
+                ],
+                "1": [
+                    
+                ],
+                "2": [
+                    
+                ],
+                "3": [
+                    "versions.yml:md5,802c9776d9c5e95314e888cf18e96d77"
+                ],
+                "bai": [
+                    [
+                        {
+                            "id": "test",
+                            "single_end": false
+                        },
+                        "test.paired_end.sorted.bam.bai:md5,d41d8cd98f00b204e9800998ecf8427e"
+                    ]
+                ],
+                "crai": [
+                    
+                ],
+                "csi": [
+                    
+                ],
+                "versions": [
+                    "versions.yml:md5,802c9776d9c5e95314e888cf18e96d77"
+                ]
+            }
+        ],
+        "meta": {
+            "nf-test": "0.9.0",
+            "nextflow": "24.04.3"
+        },
+        "timestamp": "2024-07-22T16:51:34.807525"
+    },
+    "csi": {
+        "content": [
+            "test.paired_end.sorted.bam.csi",
+            [
+                "versions.yml:md5,802c9776d9c5e95314e888cf18e96d77"
+            ]
+        ],
+        "meta": {
+            "nf-test": "0.9.0",
+            "nextflow": "24.04.3"
+        },
+        "timestamp": "2024-07-22T16:52:55.688799"
+    },
+    "crai": {
+        "content": [
+            {
+                "0": [
+                    
+                ],
+                "1": [
+                    
+                ],
+                "2": [
+                    [
+                        {
+                            "id": "test",
+                            "single_end": false
+                        },
+                        "test.paired_end.recalibrated.sorted.cram.crai:md5,14bc3bd5c89cacc8f4541f9062429029"
+                    ]
+                ],
+                "3": [
+                    "versions.yml:md5,802c9776d9c5e95314e888cf18e96d77"
+                ],
+                "bai": [
+                    
+                ],
+                "crai": [
+                    [
+                        {
+                            "id": "test",
+                            "single_end": false
+                        },
+                        "test.paired_end.recalibrated.sorted.cram.crai:md5,14bc3bd5c89cacc8f4541f9062429029"
+                    ]
+                ],
+                "csi": [
+                    
+                ],
+                "versions": [
+                    "versions.yml:md5,802c9776d9c5e95314e888cf18e96d77"
+                ]
+            }
+        ],
+        "meta": {
+            "nf-test": "0.9.0",
+            "nextflow": "24.04.3"
+        },
+        "timestamp": "2024-07-22T16:51:17.609533"
+    },
+    "bai": {
+        "content": [
+            {
+                "0": [
+                    [
+                        {
+                            "id": "test",
+                            "single_end": false
+                        },
+                        "test.paired_end.sorted.bam.bai:md5,704c10dd1326482448ca3073fdebc2f4"
+                    ]
+                ],
+                "1": [
+                    
+                ],
+                "2": [
+                    
+                ],
+                "3": [
+                    "versions.yml:md5,802c9776d9c5e95314e888cf18e96d77"
+                ],
+                "bai": [
+                    [
+                        {
+                            "id": "test",
+                            "single_end": false
+                        },
+                        "test.paired_end.sorted.bam.bai:md5,704c10dd1326482448ca3073fdebc2f4"
+                    ]
+                ],
+                "crai": [
+                    
+                ],
+                "csi": [
+                    
+                ],
+                "versions": [
+                    "versions.yml:md5,802c9776d9c5e95314e888cf18e96d77"
+                ]
+            }
+        ],
+        "meta": {
+            "nf-test": "0.9.0",
+            "nextflow": "24.04.3"
+        },
+        "timestamp": "2024-07-22T16:51:04.16585"
+    }
+}
\ No newline at end of file
diff --git a/modules/nf-core/samtools/index/tests/tags.yml b/modules/nf-core/samtools/index/tests/tags.yml
new file mode 100644
index 0000000..e0f58a7
--- /dev/null
+++ b/modules/nf-core/samtools/index/tests/tags.yml
@@ -0,0 +1,2 @@
+samtools/index:
+  - modules/nf-core/samtools/index/**
diff --git a/subworkflows/local/align_short_hic.nf b/subworkflows/local/align_short_hic.nf
index 4a4a31d..0675236 100644
--- a/subworkflows/local/align_short_hic.nf
+++ b/subworkflows/local/align_short_hic.nf
@@ -5,14 +5,14 @@
 include { SAMTOOLS_FASTQ                        } from '../../modules/nf-core/samtools/fastq/main'
 include { SAMTOOLS_MERGE                        } from '../../modules/nf-core/samtools/merge/main'
 include { SAMTOOLS_SORMADUP                     } from '../../modules/local/samtools_sormadup'
-include { SAMTOOLS_VIEW as SAMTOOLS_INDEX       } from '../../modules/nf-core/samtools/view/main'
+include { SAMTOOLS_INDEX                        } from '../../modules/nf-core/samtools/index/main'
 include { GENERATE_CRAM_CSV                     } from '../../modules/local/generate_cram_csv'
 include { SAMTOOLS_SORMADUP as CONVERT_CRAM     } from '../../modules/local/samtools_sormadup'
 include { HIC_MINIMAP2                          } from '../../subworkflows/local/hic_minimap2'
 include { HIC_BWAMEM2                           } from '../../subworkflows/local/hic_bwamem2'
 
 workflow ALIGN_SHORT_HIC {
-    take: 
+    take:
     fasta    // channel: [ val(meta), /path/to/fasta ] reference_tuple
     index    // channel: [ val(meta), /path/to/bwamem2/ ]
     reads    // channel: [ val(meta), /path/to/datafile ] hic_reads_path
@@ -31,86 +31,102 @@ workflow ALIGN_SHORT_HIC {
     }
     | set { ch_reads }
 
-
     // Convert FASTQ to CRAM only if FASTQ were provided as input
     CONVERT_CRAM ( ch_reads.fastq, fasta )
-    ch_versions = ch_versions.mix (    CONVERT_CRAM.out.versions.first() )
+    ch_versions = ch_versions.mix (    CONVERT_CRAM.out.versions )
 
     CONVERT_CRAM.out.bam
     | mix ( ch_reads.cram )
-    | map { meta, cram -> [ meta, cram, [] ] }
     | set { ch_reads_cram }
 
+    ch_reads_cram.view()
+
+    // fasta
+    // | combine( ch_reads.cram )
+    // | map { meta, ref, hic_meta, hic_reads_path ->
+    //         tuple(
+    //             [ id: meta.id, single_end: true],
+    //             hic_reads_path
+    //         )
+    // }
+    // .set { get_reads_input }
+
+    // get_reads_input.view()
+
     // Index the CRAM file
-    SAMTOOLS_INDEX ( ch_reads_cram, fasta, [] )
-    ch_versions         = ch_versions.mix( SAMTOOLS_INDEX.out.versions )
+    SAMTOOLS_INDEX ( ch_reads_cram )
+    ch_versions = ch_versions.mix( SAMTOOLS_INDEX.out.versions )
+
+    SAMTOOLS_INDEX.out.crai.view()
 
-    SAMTOOLS_INDEX.out.cram
+    ch_reads_cram
     | join ( SAMTOOLS_INDEX.out.crai )
-    | set { ch_reads_cram }
+    | set { ch_reads_cram_crai }
 
+    ch_reads_cram_crai.view()
 
     //
     // MODULE: generate a CRAM CSV file containing the required parametres for CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT
     //
-    GENERATE_CRAM_CSV( ch_reads_cram )
-    ch_versions         = ch_versions.mix( GENERATE_CRAM_CSV.out.versions )
-
-
-    //
-    // SUBWORKFLOW: mapping hic reads using minimap2 or bwamem2
-    //
-    if (params.hic_aligner == 'minimap2') {
-        HIC_MINIMAP2 (
-            fasta,
-            GENERATE_CRAM_CSV.out.csv,
-            index
-        )
-        ch_versions         = ch_versions.mix( HIC_MINIMAP2.out.versions )
-        ch_merged_bam           = ch_merged_bam.mix(HIC_MINIMAP2.out.mergedbam)
-    } else {
-        HIC_BWAMEM2 (
-            fasta,
-            GENERATE_CRAM_CSV.out.csv,
-            index
-        )
-        ch_versions         = ch_versions.mix( HIC_BWAMEM2.out.versions )
-        ch_merged_bam           = ch_merged_bam.mix(HIC_BWAMEM2.out.mergedbam)
-    }
-    
-    ch_merged_bam
-    | combine( ch_reads_cram )
-    | map { meta_bam, bam, meta_cram, cram, crai -> [ meta_cram, bam ] }
-    | set { ch_merged_bam }
-    
-    // // Collect all BWAMEM2 output by sample name
-    ch_merged_bam
-    | map { meta, bam -> [['id': meta.id.split('_')[0..-2].join('_'), 'datatype': meta.datatype], meta.read_count, bam] }
-    | groupTuple( by: [0] )
-    | map { meta, read_counts, bams -> [meta + [read_count: read_counts.sum()], bams] }
-    | branch {
-        meta, bams ->
-            single_bam: bams.size() == 1
-            multi_bams: true
-    }
-    | set { ch_bams }
-
-
-    // Merge, but only if there is more than 1 file
-    SAMTOOLS_MERGE ( ch_bams.multi_bams, [ [], [] ], [ [], [] ] )
-    ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions.first() )
-
-
-    SAMTOOLS_MERGE.out.bam
-    | mix ( ch_bams.single_bam )
-    | set { ch_bam }
-
-
-    // Mark duplicates
-    SAMTOOLS_SORMADUP ( ch_bam, fasta )
-    ch_versions = ch_versions.mix ( SAMTOOLS_SORMADUP.out.versions )
+    GENERATE_CRAM_CSV( ch_reads_cram_crai )
+    ch_versions = ch_versions.mix( GENERATE_CRAM_CSV.out.versions )
+
+    // GENERATE_CRAM_CSV.out.csv.view()
+
+    // //
+    // // SUBWORKFLOW: mapping hic reads using minimap2 or bwamem2
+    // //
+    // if (params.hic_aligner == 'minimap2') {
+    //     HIC_MINIMAP2 (
+    //         fasta,
+    //         GENERATE_CRAM_CSV.out.csv,
+    //         index
+    //     )
+    //     ch_versions         = ch_versions.mix( HIC_MINIMAP2.out.versions )
+    //     ch_merged_bam           = ch_merged_bam.mix(HIC_MINIMAP2.out.mergedbam)
+    // } else {
+    //     HIC_BWAMEM2 (
+    //         fasta,
+    //         GENERATE_CRAM_CSV.out.csv,
+    //         index
+    //     )
+    //     ch_versions         = ch_versions.mix( HIC_BWAMEM2.out.versions )
+    //     ch_merged_bam           = ch_merged_bam.mix(HIC_BWAMEM2.out.mergedbam)
+    // }
+
+    // ch_merged_bam
+    // | combine( ch_reads_cram )
+    // | map { meta_bam, bam, meta_cram, cram, crai -> [ meta_cram, bam ] }
+    // | set { ch_merged_bam }
+
+    // // // Collect all BWAMEM2 output by sample name
+    // ch_merged_bam
+    // | map { meta, bam -> [['id': meta.id.split('_')[0..-2].join('_'), 'datatype': meta.datatype], meta.read_count, bam] }
+    // | groupTuple( by: [0] )
+    // | map { meta, read_counts, bams -> [meta + [read_count: read_counts.sum()], bams] }
+    // | branch {
+    //     meta, bams ->
+    //         single_bam: bams.size() == 1
+    //         multi_bams: true
+    // }
+    // | set { ch_bams }
+
+
+    // // Merge, but only if there is more than 1 file
+    // SAMTOOLS_MERGE ( ch_bams.multi_bams, [ [], [] ], [ [], [] ] )
+    // ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions.first() )
+
+
+    // SAMTOOLS_MERGE.out.bam
+    // | mix ( ch_bams.single_bam )
+    // | set { ch_bam }
+
+
+    // // Mark duplicates
+    // SAMTOOLS_SORMADUP ( ch_bam, fasta )
+    // ch_versions = ch_versions.mix ( SAMTOOLS_SORMADUP.out.versions )
 
     emit:
-    bam      = SAMTOOLS_SORMADUP.out.bam     // channel: [ val(meta), /path/to/bam ]
+    bam      = ch_reads_cram_crai     // channel: [ val(meta), /path/to/bam ]
     versions = ch_versions                   // channel: [ versions.yml ]
 }

From d7e2761bacd136538838b819f826daf380ff85da Mon Sep 17 00:00:00 2001
From: reichan1998 <baochauduong1311@gmail.com>
Date: Tue, 3 Sep 2024 00:47:55 +0700
Subject: [PATCH 03/15] adapt generate_cram_csv.sh to handle crai file

---
 bin/generate_cram_csv.sh              |  17 ++--
 modules/local/generate_cram_csv.nf    |   2 +-
 subworkflows/local/align_short_hic.nf | 127 ++++++++++++--------------
 workflows/readmapping.nf              |   2 +-
 4 files changed, 70 insertions(+), 78 deletions(-)

diff --git a/bin/generate_cram_csv.sh b/bin/generate_cram_csv.sh
index 1a951da..b36fa5d 100755
--- a/bin/generate_cram_csv.sh
+++ b/bin/generate_cram_csv.sh
@@ -12,11 +12,13 @@ chunk_cram() {
     local cram=$1
     local chunkn=$2
     local outcsv=$3
+    local crai=$4
     realcram=$(readlink -f ${cram})
-    realcrai=$(readlink -f ${cram}.crai)
+    # realcrai=$(readlink -f ${cram}.crai)
+    realcrai=$(readlink -f ${crai})
     local rgline=$(samtools view -H "${realcram}" | grep "@RG" | sed 's/\t/\\t/g' | sed "s/'//g")
-    # local ncontainers=$(zcat "${realcrai}" | wc -l)
-    local ncontainers=$(cat "${realcram}" | wc -l)
+    local ncontainers=$(zcat "${realcrai}" | wc -l)
+    # local ncontainers=$(cat "${realcram}" | wc -l)
     local base=$(basename "${realcram}" .cram)
     local from=0
     local to=10000
@@ -42,6 +44,7 @@ process_cram_file() {
     local cram=$1
     local chunkn=$2
     local outcsv=$3
+    local crai=$4
 
     local read_groups=$(samtools view -H "$cram" | grep '@RG' | awk '{for(i=1;i<=NF;i++){if($i ~ /^ID:/){print substr($i,4)}}}')
     local num_read_groups=$(echo "$read_groups" | wc -w)
@@ -52,11 +55,11 @@ process_cram_file() {
             local output_cram="$(basename "${cram%.cram}")_output_${rg}.cram"
             samtools view -h -r "$rg" -o "$output_cram" "$cram"
             samtools index "$output_cram"
-            chunkn=$(chunk_cram "$output_cram" "$chunkn" "$outcsv")
+            chunkn=$(chunk_cram "$output_cram" "$chunkn" "$outcsv" "$crai")
         done
     else
         # Single read group or no read groups
-        chunkn=$(chunk_cram "$cram" "$chunkn" "$outcsv")
+        chunkn=$(chunk_cram "$cram" "$chunkn" "$outcsv" "$crai")
     fi
 
     echo $chunkn
@@ -76,6 +79,7 @@ fi
 cram=$1
 chunkn=0
 outcsv=$2
+crai=$3
 
 # # Loop through each CRAM file in the specified directory. cram cannot be the synlinked cram
 # for cram in ${cram_path}/*.cram; do
@@ -83,4 +87,5 @@ outcsv=$2
 #     chunkn=$(process_cram_file $realcram $chunkn $outcsv)
 # done
 realcram=$(readlink -f $cram)
-chunkn=$(process_cram_file $realcram $chunkn $outcsv)
+realcrai=$(readlink -f $crai)
+chunkn=$(process_cram_file $realcram $chunkn $outcsv $realcrai) 
diff --git a/modules/local/generate_cram_csv.nf b/modules/local/generate_cram_csv.nf
index c1d8965..6a3b120 100644
--- a/modules/local/generate_cram_csv.nf
+++ b/modules/local/generate_cram_csv.nf
@@ -15,7 +15,7 @@ process GENERATE_CRAM_CSV {
     script:
     def prefix = task.ext.prefix ?: "${meta.id}"
     """
-    generate_cram_csv.sh $crampath ${prefix}_cram.csv
+    generate_cram_csv.sh $crampath ${prefix}_cram.csv $crai
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
diff --git a/subworkflows/local/align_short_hic.nf b/subworkflows/local/align_short_hic.nf
index 0675236..1055b61 100644
--- a/subworkflows/local/align_short_hic.nf
+++ b/subworkflows/local/align_short_hic.nf
@@ -31,6 +31,7 @@ workflow ALIGN_SHORT_HIC {
     }
     | set { ch_reads }
 
+
     // Convert FASTQ to CRAM only if FASTQ were provided as input
     CONVERT_CRAM ( ch_reads.fastq, fasta )
     ch_versions = ch_versions.mix (    CONVERT_CRAM.out.versions )
@@ -39,94 +40,80 @@ workflow ALIGN_SHORT_HIC {
     | mix ( ch_reads.cram )
     | set { ch_reads_cram }
 
-    ch_reads_cram.view()
-
-    // fasta
-    // | combine( ch_reads.cram )
-    // | map { meta, ref, hic_meta, hic_reads_path ->
-    //         tuple(
-    //             [ id: meta.id, single_end: true],
-    //             hic_reads_path
-    //         )
-    // }
-    // .set { get_reads_input }
-
-    // get_reads_input.view()
 
     // Index the CRAM file
     SAMTOOLS_INDEX ( ch_reads_cram )
     ch_versions = ch_versions.mix( SAMTOOLS_INDEX.out.versions )
 
-    SAMTOOLS_INDEX.out.crai.view()
-
     ch_reads_cram
     | join ( SAMTOOLS_INDEX.out.crai )
     | set { ch_reads_cram_crai }
 
     ch_reads_cram_crai.view()
 
+
     //
     // MODULE: generate a CRAM CSV file containing the required parametres for CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT
     //
     GENERATE_CRAM_CSV( ch_reads_cram_crai )
     ch_versions = ch_versions.mix( GENERATE_CRAM_CSV.out.versions )
 
-    // GENERATE_CRAM_CSV.out.csv.view()
-
-    // //
-    // // SUBWORKFLOW: mapping hic reads using minimap2 or bwamem2
-    // //
-    // if (params.hic_aligner == 'minimap2') {
-    //     HIC_MINIMAP2 (
-    //         fasta,
-    //         GENERATE_CRAM_CSV.out.csv,
-    //         index
-    //     )
-    //     ch_versions         = ch_versions.mix( HIC_MINIMAP2.out.versions )
-    //     ch_merged_bam           = ch_merged_bam.mix(HIC_MINIMAP2.out.mergedbam)
-    // } else {
-    //     HIC_BWAMEM2 (
-    //         fasta,
-    //         GENERATE_CRAM_CSV.out.csv,
-    //         index
-    //     )
-    //     ch_versions         = ch_versions.mix( HIC_BWAMEM2.out.versions )
-    //     ch_merged_bam           = ch_merged_bam.mix(HIC_BWAMEM2.out.mergedbam)
-    // }
-
-    // ch_merged_bam
-    // | combine( ch_reads_cram )
-    // | map { meta_bam, bam, meta_cram, cram, crai -> [ meta_cram, bam ] }
-    // | set { ch_merged_bam }
-
-    // // // Collect all BWAMEM2 output by sample name
-    // ch_merged_bam
-    // | map { meta, bam -> [['id': meta.id.split('_')[0..-2].join('_'), 'datatype': meta.datatype], meta.read_count, bam] }
-    // | groupTuple( by: [0] )
-    // | map { meta, read_counts, bams -> [meta + [read_count: read_counts.sum()], bams] }
-    // | branch {
-    //     meta, bams ->
-    //         single_bam: bams.size() == 1
-    //         multi_bams: true
-    // }
-    // | set { ch_bams }
-
-
-    // // Merge, but only if there is more than 1 file
-    // SAMTOOLS_MERGE ( ch_bams.multi_bams, [ [], [] ], [ [], [] ] )
-    // ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions.first() )
-
-
-    // SAMTOOLS_MERGE.out.bam
-    // | mix ( ch_bams.single_bam )
-    // | set { ch_bam }
-
-
-    // // Mark duplicates
-    // SAMTOOLS_SORMADUP ( ch_bam, fasta )
-    // ch_versions = ch_versions.mix ( SAMTOOLS_SORMADUP.out.versions )
+
+    //
+    // SUBWORKFLOW: mapping hic reads using minimap2 or bwamem2
+    //
+    if (params.hic_aligner == 'minimap2') {
+        HIC_MINIMAP2 (
+            fasta,
+            GENERATE_CRAM_CSV.out.csv,
+            index
+        )
+        ch_versions         = ch_versions.mix( HIC_MINIMAP2.out.versions )
+        ch_merged_bam           = ch_merged_bam.mix(HIC_MINIMAP2.out.mergedbam)
+    } else {
+        HIC_BWAMEM2 (
+            fasta,
+            GENERATE_CRAM_CSV.out.csv,
+            index
+        )
+        ch_versions         = ch_versions.mix( HIC_BWAMEM2.out.versions )
+        ch_merged_bam           = ch_merged_bam.mix(HIC_BWAMEM2.out.mergedbam)
+    }
+
+    ch_merged_bam
+    | combine( ch_reads_cram_crai )
+    | map { meta_bam, bam, meta_cram, cram, crai -> [ meta_cram, bam ] }
+    | set { ch_merged_bam }
+
+    
+    // Collect all BAM output by sample name
+    ch_merged_bam
+    | map { meta, bam -> [['id': meta.id.split('_')[0..-2].join('_'), 'datatype': meta.datatype], meta.read_count, bam] }
+    | groupTuple( by: [0] )
+    | map { meta, read_counts, bams -> [meta + [read_count: read_counts.sum()], bams] }
+    | branch {
+        meta, bams ->
+            single_bam: bams.size() == 1
+            multi_bams: true
+    }
+    | set { ch_bams }
+
+
+    // Merge, but only if there is more than 1 file
+    SAMTOOLS_MERGE ( ch_bams.multi_bams, [ [], [] ], [ [], [] ] )
+    ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions.first() )
+
+
+    SAMTOOLS_MERGE.out.bam
+    | mix ( ch_bams.single_bam )
+    | set { ch_bam }
+
+
+    // Mark duplicates
+    SAMTOOLS_SORMADUP ( ch_bam, fasta )
+    ch_versions = ch_versions.mix ( SAMTOOLS_SORMADUP.out.versions )
 
     emit:
-    bam      = ch_reads_cram_crai     // channel: [ val(meta), /path/to/bam ]
+    bam      = SAMTOOLS_SORMADUP.out.bam     // channel: [ val(meta), /path/to/bam ]
     versions = ch_versions                   // channel: [ versions.yml ]
 }
diff --git a/workflows/readmapping.nf b/workflows/readmapping.nf
index bbc0d23..1a06b5e 100644
--- a/workflows/readmapping.nf
+++ b/workflows/readmapping.nf
@@ -18,7 +18,7 @@ include { SAMTOOLS_REHEADER           } from '../modules/local/samtools_replaceh
 
 include { INPUT_CHECK                   } from '../subworkflows/local/input_check'
 include { PREPARE_GENOME                } from '../subworkflows/local/prepare_genome'
-include { ALIGN_SHORT_HIC as ALIGN_HIC      } from '../subworkflows/local/align_short_hic'
+include { ALIGN_SHORT_HIC as ALIGN_HIC  } from '../subworkflows/local/align_short_hic'
 include { ALIGN_SHORT as ALIGN_ILLUMINA } from '../subworkflows/local/align_short'
 include { ALIGN_PACBIO as ALIGN_HIFI    } from '../subworkflows/local/align_pacbio'
 include { ALIGN_PACBIO as ALIGN_CLR     } from '../subworkflows/local/align_pacbio'

From 7c35dad4776d7ab5f32b82a70e11a40fc6837eef Mon Sep 17 00:00:00 2001
From: reichan1998 <baochauduong1311@gmail.com>
Date: Tue, 10 Sep 2024 16:11:16 +0700
Subject: [PATCH 04/15] ensure the sam cram output for HiC reads

---
 conf/modules.config                           |  6 +++---
 .../cram_filter_align_bwamem2_fixmate_sort.nf |  9 +++++++--
 subworkflows/local/align_short_hic.nf         |  5 +----
 subworkflows/local/hic_bwamem2.nf             | 19 ++++++-------------
 subworkflows/local/hic_minimap2.nf            |  3 +--
 5 files changed, 18 insertions(+), 24 deletions(-)

diff --git a/conf/modules.config b/conf/modules.config
index e7ed388..a7a59cb 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -52,15 +52,15 @@ process {
 
     withName: ".*:.*:HIC_BWAMEM2:CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT" {
         ext.args        = ""
-        ext.args1       = "-F0xB00 -nt"
-        ext.args2       = { "-5SPCp -H'${rglines}'" }
+        ext.args1       = { "-F 0x200 -nt -1 '${base}_${chunkid}_1.fastq.gz'" }
+        ext.args2       = { "-5SPCp -R '${rglines}'" }
         ext.args3       = "-mpu"
         ext.args4       = { "--write-index -l1" }
     }
 
     withName: ".*:.*:HIC_MINIMAP2:CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT" {
         ext.args        = ""
-        ext.args1       = ""
+        ext.args1       = { "-F 0x200 -nt -1 '${base}_${chunkid}_1.fastq.gz'" }
         ext.args2       = { "-ax sr" }
         ext.args3       = "-mpu"
         ext.args4       = { "--write-index -l1" }
diff --git a/modules/local/cram_filter_align_bwamem2_fixmate_sort.nf b/modules/local/cram_filter_align_bwamem2_fixmate_sort.nf
index 666b8bd..cf5241f 100644
--- a/modules/local/cram_filter_align_bwamem2_fixmate_sort.nf
+++ b/modules/local/cram_filter_align_bwamem2_fixmate_sort.nf
@@ -7,7 +7,9 @@ process CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT {
         'biocontainers/mulled-v2-1a6fe65bd6674daba65066aa796ed8f5e8b4687b:688e175eb0db54de17822ba7810cc9e20fa06dd5-0' }"
 
     input:
-    tuple val(meta), path(cramfile), path(cramindex), val(from), val(to), val(base), val(chunkid), val(rglines), val(bwaprefix), path(reference)
+    tuple val(meta), path(cramfile), path(cramindex), val(from), val(to), val(base), val(chunkid), val(rglines)
+    tuple val(meta2), path(fasta)
+    tuple val(meta3), path(index)
 
     output:
     tuple val(meta), path("*.bam"), emit: mappedbam
@@ -24,10 +26,13 @@ process CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT {
     def args4 = task.ext.args4 ?: ''
     def prefix = task.ext.prefix ?: "${meta.id}"
     // Please be aware one of the tools here required mem = 28 * reference size!!!
+    
     """
+    INDEX=`find -L ./ -name "*.amb" | sed 's/\\.amb\$//'`
+
     cram_filter -n ${from}-${to} ${cramfile} - | \\
         samtools fastq ${args1} | \\
-        bwa-mem2 mem -p ${bwaprefix} -t${task.cpus} -5SPCp -H'${rglines}' - | \\
+        bwa-mem2 mem \$INDEX -t ${task.cpus} ${args2} - | \\
         samtools fixmate ${args3} - - | \\
         samtools sort ${args4} -@${task.cpus} -T ${base}_${chunkid}_sort_tmp -o ${prefix}_${base}_${chunkid}_mem.bam -
 
diff --git a/subworkflows/local/align_short_hic.nf b/subworkflows/local/align_short_hic.nf
index 1055b61..ce50542 100644
--- a/subworkflows/local/align_short_hic.nf
+++ b/subworkflows/local/align_short_hic.nf
@@ -49,8 +49,6 @@ workflow ALIGN_SHORT_HIC {
     | join ( SAMTOOLS_INDEX.out.crai )
     | set { ch_reads_cram_crai }
 
-    ch_reads_cram_crai.view()
-
 
     //
     // MODULE: generate a CRAM CSV file containing the required parametres for CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT
@@ -65,8 +63,7 @@ workflow ALIGN_SHORT_HIC {
     if (params.hic_aligner == 'minimap2') {
         HIC_MINIMAP2 (
             fasta,
-            GENERATE_CRAM_CSV.out.csv,
-            index
+            GENERATE_CRAM_CSV.out.csv
         )
         ch_versions         = ch_versions.mix( HIC_MINIMAP2.out.versions )
         ch_merged_bam           = ch_merged_bam.mix(HIC_MINIMAP2.out.mergedbam)
diff --git a/subworkflows/local/hic_bwamem2.nf b/subworkflows/local/hic_bwamem2.nf
index 8a06f44..3c7176a 100644
--- a/subworkflows/local/hic_bwamem2.nf
+++ b/subworkflows/local/hic_bwamem2.nf
@@ -3,7 +3,6 @@
 //
 // MODULE IMPORT BLOCK
 //
-include { BWAMEM2_INDEX                                   } from '../../modules/nf-core/bwamem2/index/main'
 include { CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT          } from '../../modules/local/cram_filter_align_bwamem2_fixmate_sort'
 include { SAMTOOLS_MERGE                                  } from '../../modules/nf-core/samtools/merge/main'
 
@@ -18,16 +17,10 @@ workflow HIC_BWAMEM2 {
     ch_versions         = Channel.empty()
     mappedbam_ch        = Channel.empty()
 
-    BWAMEM2_INDEX (
-        fasta
-        )
-    ch_versions         = ch_versions.mix( BWAMEM2_INDEX.out.versions )
 
     csv_ch
         .splitCsv()
-        .combine ( fasta )
-        .combine ( BWAMEM2_INDEX.out.index )
-        .map{ cram_id, cram_info, ref_id, ref_dir, bwa_id, bwa_path ->
+        .map{ cram_id, cram_info ->
             tuple([
                     id: cram_id.id
                     ],
@@ -37,9 +30,7 @@ workflow HIC_BWAMEM2 {
                 cram_info[3],
                 cram_info[4],
                 cram_info[5],
-                cram_info[6],
-                bwa_path.toString() + '/' + ref_dir.toString().split('/')[-1],
-                ref_dir
+                cram_info[6]
             )
     }
     .set { ch_filtering_input }
@@ -48,7 +39,9 @@ workflow HIC_BWAMEM2 {
     // MODULE: map hic reads by 10,000 container per time using bwamem2
     //
     CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT (
-        ch_filtering_input
+        ch_filtering_input,
+        fasta,
+        index
 
     )
     ch_versions         = ch_versions.mix( CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT.out.versions )
@@ -79,7 +72,7 @@ workflow HIC_BWAMEM2 {
     SAMTOOLS_MERGE (
         collected_files_for_merge,
         fasta,
-        index
+        [ [], [] ]
     )
     ch_versions         = ch_versions.mix ( SAMTOOLS_MERGE.out.versions.first() )
 
diff --git a/subworkflows/local/hic_minimap2.nf b/subworkflows/local/hic_minimap2.nf
index ddda9ac..8637c83 100644
--- a/subworkflows/local/hic_minimap2.nf
+++ b/subworkflows/local/hic_minimap2.nf
@@ -12,7 +12,6 @@ workflow HIC_MINIMAP2 {
     take:
     fasta    // Channel: tuple [ val(meta), path( file )      ]
     csv_ch
-    index
 
 
     main:
@@ -90,7 +89,7 @@ workflow HIC_MINIMAP2 {
     SAMTOOLS_MERGE (
         collected_files_for_merge,
         fasta,
-        index
+        [ [], [] ]
     )
     ch_versions         = ch_versions.mix ( SAMTOOLS_MERGE.out.versions.first() )
     

From 842fb5d9bddb27f09df11999967acb90401f0157 Mon Sep 17 00:00:00 2001
From: reichan1998 <baochauduong1311@gmail.com>
Date: Tue, 10 Sep 2024 18:39:31 +0700
Subject: [PATCH 05/15] include hic_aligner in nextflow_schema.json

---
 assets/samplesheet_s3.csv |  3 +++
 nextflow_schema.json      | 16 ++++++++++++++++
 2 files changed, 19 insertions(+)

diff --git a/assets/samplesheet_s3.csv b/assets/samplesheet_s3.csv
index c059c97..ff9641b 100644
--- a/assets/samplesheet_s3.csv
+++ b/assets/samplesheet_s3.csv
@@ -2,3 +2,6 @@ sample,datatype,datafile,library
 mMelMel1,illumina,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/genomic_data/mMelMel1/illumina/31231_3%231.subset.cram,
 mMelMel2,illumina,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/genomic_data/mMelMel2/illumina/31231_4%231.subset.fastq.gz,
 mMelMel3,hic,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/genomic_data/mMelMel3/hic/35528_2%231.subset.cram,
+mMelMel3,ont,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/genomic_data/mMelMel3/ont/PAE35587_pass_1f1f0707_115.subset.fastq.gz,
+mMelMel3,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/genomic_data/mMelMel3/pacbio/m64094_200910_173211.ccs.bc1022_BAK8B_OA--bc1022_BAK8B_OA.subset.bam,
+mMelMel3,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/genomic_data/mMelMel3/pacbio/m64094_200911_174739.ccs.bc1022_BAK8B_OA--bc1022_BAK8B_OA.subset.fastq.gz,
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 2209d6b..5b5b4e5 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -57,6 +57,22 @@
                 }
             }
         },
+        "aligner_options": {
+            "title": "Aligner options",
+            "type": "object",
+            "fa_icon": "fas fa-code",
+            "description": "Options for the aligner used in the pipeline.",
+            "required": ["hic_aligner"],
+            "properties": {
+                "hic_aligner": {
+                    "type": "string",
+                    "description": "The aligner to be used for HiC reads.",
+                    "enum": ["bwamem2", "minimap2"],
+                    "default": "minimap2",
+                    "fa_icon": "fas fa-code"
+                }
+            }
+        },
         "reference_genome_options": {
             "title": "Reference genome options",
             "type": "object",

From 3fd90a66e1b3e616885d65a68274565557cc6b7a Mon Sep 17 00:00:00 2001
From: reichan1998 <baochauduong1311@gmail.com>
Date: Tue, 10 Sep 2024 22:21:43 +0700
Subject: [PATCH 06/15] fix linting

---
 bin/awk_filter_reads.sh                       |  2 +-
 bin/filter_five_end.pl                        |  2 +-
 bin/generate_cram_csv.sh                      |  2 +-
 bin/grep_pg.sh                                |  2 +-
 conf/base.config                              |  2 +-
 modules.json                                  | 90 +++++--------------
 .../cram_filter_align_bwamem2_fixmate_sort.nf |  4 +-
 ...filter_minimap2_filter5end_fixmate_sort.nf |  2 +-
 nextflow_schema.json                          | 11 +--
 subworkflows/local/align_short.nf             |  4 +-
 subworkflows/local/align_short_hic.nf         |  2 +-
 subworkflows/local/hic_bwamem2.nf             |  4 +-
 subworkflows/local/hic_minimap2.nf            | 14 +--
 workflows/readmapping.nf                      |  2 +-
 14 files changed, 45 insertions(+), 98 deletions(-)

diff --git a/bin/awk_filter_reads.sh b/bin/awk_filter_reads.sh
index 3066f11..d50aa9e 100755
--- a/bin/awk_filter_reads.sh
+++ b/bin/awk_filter_reads.sh
@@ -1 +1 @@
-awk 'BEGIN{OFS="\t"}{if($1 ~ /^\@/) {print($0)} else {$2=and($2,compl(2048)); print(substr($0,2))}}'
\ No newline at end of file
+awk 'BEGIN{OFS="\t"}{if($1 ~ /^\@/) {print($0)} else {$2=and($2,compl(2048)); print(substr($0,2))}}'
diff --git a/bin/filter_five_end.pl b/bin/filter_five_end.pl
index 0beb2ac..6f9b4d5 100644
--- a/bin/filter_five_end.pl
+++ b/bin/filter_five_end.pl
@@ -106,4 +106,4 @@ sub dec2bin {
 
 sub bin2dec {
     return unpack("N", pack("B32", substr("0" x 32 . shift, -32)));
-}
\ No newline at end of file
+}
diff --git a/bin/generate_cram_csv.sh b/bin/generate_cram_csv.sh
index b36fa5d..89920fd 100755
--- a/bin/generate_cram_csv.sh
+++ b/bin/generate_cram_csv.sh
@@ -88,4 +88,4 @@ crai=$3
 # done
 realcram=$(readlink -f $cram)
 realcrai=$(readlink -f $crai)
-chunkn=$(process_cram_file $realcram $chunkn $outcsv $realcrai) 
+chunkn=$(process_cram_file $realcram $chunkn $outcsv $realcrai)
diff --git a/bin/grep_pg.sh b/bin/grep_pg.sh
index 8f76e23..680b5ec 100755
--- a/bin/grep_pg.sh
+++ b/bin/grep_pg.sh
@@ -7,4 +7,4 @@
 # -------------------
 # Author = yy5
 
-grep -v "^\@PG" | awk '{if($1 ~ /^\@/) {print($0)} else {if(and($2,64)>0) {print(1$0)} else {print(2$0)}}}'
\ No newline at end of file
+grep -v "^\@PG" | awk '{if($1 ~ /^\@/) {print($0)} else {if(and($2,64)>0) {print(1$0)} else {print(2$0)}}}'
diff --git a/conf/base.config b/conf/base.config
index 33cc764..cd54c75 100644
--- a/conf/base.config
+++ b/conf/base.config
@@ -57,7 +57,7 @@ process {
         memory = { check_max( 4.GB + 850.MB * log_increase_cpus(2, 6*task.attempt, 1, 2) * task.attempt + 0.6.GB * Math.ceil( meta.read_count / 100000000 ), 'memory' ) }
         time   = { check_max( 2.h * Math.ceil( meta.read_count / 100000000 ) * task.attempt / log_increase_cpus(2, 6*task.attempt, 1, 2), 'time' ) }
     }
-    
+
     withName: BLAST_BLASTN {
         time   = { check_max(          2.hour  * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'time'   ) }
         memory = { check_max( 100.MB + 20.MB   * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'memory' ) }
diff --git a/modules.json b/modules.json
index 05aa1cc..8e24d3e 100644
--- a/modules.json
+++ b/modules.json
@@ -8,136 +8,98 @@
                     "blast/blastn": {
                         "branch": "master",
                         "git_sha": "583edaf97c9373a20df05a3b7be5a6677f9cd719",
-                        "installed_by": [
-                            "modules"
-                        ]
+                        "installed_by": ["modules"]
                     },
                     "bwamem2/index": {
                         "branch": "master",
                         "git_sha": "7081e04c18de9480948d34513a1c1e2d0fa9126d",
-                        "installed_by": [
-                            "modules"
-                        ]
+                        "installed_by": ["modules"]
                     },
                     "bwamem2/mem": {
                         "branch": "master",
                         "git_sha": "2201e21b09213f083832ac58e33353d410a6fde7",
-                        "installed_by": [
-                            "modules"
-                        ]
+                        "installed_by": ["modules"]
                     },
                     "crumble": {
                         "branch": "master",
                         "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5",
-                        "installed_by": [
-                            "modules"
-                        ]
+                        "installed_by": ["modules"]
                     },
                     "custom/dumpsoftwareversions": {
                         "branch": "master",
                         "git_sha": "82024cf6325d2ee194e7f056d841ecad2f6856e9",
-                        "installed_by": [
-                            "modules"
-                        ]
+                        "installed_by": ["modules"]
                     },
                     "gunzip": {
                         "branch": "master",
                         "git_sha": "4e5f4687318f24ba944a13609d3ea6ebd890737d",
-                        "installed_by": [
-                            "modules"
-                        ]
+                        "installed_by": ["modules"]
                     },
                     "minimap2/align": {
                         "branch": "master",
                         "git_sha": "a33ef9475558c6b8da08c5f522ddaca1ec810306",
-                        "installed_by": [
-                            "modules"
-                        ]
+                        "installed_by": ["modules"]
                     },
                     "minimap2/index": {
                         "branch": "master",
                         "git_sha": "72e277acfd9e61a9f1368eafb4a9e83f5bcaa9f5",
-                        "installed_by": [
-                            "modules"
-                        ]
+                        "installed_by": ["modules"]
                     },
                     "samtools/faidx": {
                         "branch": "master",
                         "git_sha": "04fbbc7c43cebc0b95d5b126f6d9fe4effa33519",
-                        "installed_by": [
-                            "modules"
-                        ]
+                        "installed_by": ["modules"]
                     },
                     "samtools/fastq": {
                         "branch": "master",
                         "git_sha": "04fbbc7c43cebc0b95d5b126f6d9fe4effa33519",
-                        "installed_by": [
-                            "modules"
-                        ]
+                        "installed_by": ["modules"]
                     },
                     "samtools/flagstat": {
                         "branch": "master",
                         "git_sha": "46eca555142d6e597729fcb682adcc791796f514",
-                        "installed_by": [
-                            "modules"
-                        ]
+                        "installed_by": ["modules"]
                     },
                     "samtools/idxstats": {
                         "branch": "master",
                         "git_sha": "46eca555142d6e597729fcb682adcc791796f514",
-                        "installed_by": [
-                            "modules"
-                        ]
+                        "installed_by": ["modules"]
                     },
                     "samtools/index": {
                         "branch": "master",
                         "git_sha": "46eca555142d6e597729fcb682adcc791796f514",
-                        "installed_by": [
-                            "modules"
-                        ]
+                        "installed_by": ["modules"]
                     },
                     "samtools/merge": {
                         "branch": "master",
                         "git_sha": "04fbbc7c43cebc0b95d5b126f6d9fe4effa33519",
-                        "installed_by": [
-                            "modules"
-                        ],
+                        "installed_by": ["modules"],
                         "patch": "modules/nf-core/samtools/merge/samtools-merge.diff"
                     },
                     "samtools/stats": {
                         "branch": "master",
                         "git_sha": "1fe379cf6e6c1e6fa5e32bcbeefea0f1e874dac6",
-                        "installed_by": [
-                            "modules"
-                        ]
+                        "installed_by": ["modules"]
                     },
                     "samtools/view": {
                         "branch": "master",
                         "git_sha": "6c2309aaec566c0d44a6cf14d4b2d0c51afe2e91",
-                        "installed_by": [
-                            "modules"
-                        ]
+                        "installed_by": ["modules"]
                     },
                     "seqkit/fq2fa": {
                         "branch": "master",
                         "git_sha": "03fbf6c89e551bd8d77f3b751fb5c955f75b34c5",
-                        "installed_by": [
-                            "modules"
-                        ]
+                        "installed_by": ["modules"]
                     },
                     "seqtk/subseq": {
                         "branch": "master",
                         "git_sha": "730f3aee80d5f8d0b5fc532202ac59361414d006",
-                        "installed_by": [
-                            "modules"
-                        ]
+                        "installed_by": ["modules"]
                     },
                     "untar": {
                         "branch": "master",
                         "git_sha": "4e5f4687318f24ba944a13609d3ea6ebd890737d",
-                        "installed_by": [
-                            "modules"
-                        ]
+                        "installed_by": ["modules"]
                     }
                 }
             },
@@ -146,26 +108,20 @@
                     "utils_nextflow_pipeline": {
                         "branch": "master",
                         "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa",
-                        "installed_by": [
-                            "subworkflows"
-                        ]
+                        "installed_by": ["subworkflows"]
                     },
                     "utils_nfcore_pipeline": {
                         "branch": "master",
                         "git_sha": "92de218a329bfc9a9033116eb5f65fd270e72ba3",
-                        "installed_by": [
-                            "subworkflows"
-                        ]
+                        "installed_by": ["subworkflows"]
                     },
                     "utils_nfvalidation_plugin": {
                         "branch": "master",
                         "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa",
-                        "installed_by": [
-                            "subworkflows"
-                        ]
+                        "installed_by": ["subworkflows"]
                     }
                 }
             }
         }
     }
-}
\ No newline at end of file
+}
diff --git a/modules/local/cram_filter_align_bwamem2_fixmate_sort.nf b/modules/local/cram_filter_align_bwamem2_fixmate_sort.nf
index cf5241f..9b5ca29 100644
--- a/modules/local/cram_filter_align_bwamem2_fixmate_sort.nf
+++ b/modules/local/cram_filter_align_bwamem2_fixmate_sort.nf
@@ -26,7 +26,7 @@ process CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT {
     def args4 = task.ext.args4 ?: ''
     def prefix = task.ext.prefix ?: "${meta.id}"
     // Please be aware one of the tools here required mem = 28 * reference size!!!
-    
+
     """
     INDEX=`find -L ./ -name "*.amb" | sed 's/\\.amb\$//'`
 
@@ -57,4 +57,4 @@ process CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT {
         bwamem2: \$(echo \$(bwa-mem2 version 2>&1) | sed 's/.* //')
     END_VERSIONS
     """
-}
\ No newline at end of file
+}
diff --git a/modules/local/cram_filter_minimap2_filter5end_fixmate_sort.nf b/modules/local/cram_filter_minimap2_filter5end_fixmate_sort.nf
index 2127880..8d8d69e 100644
--- a/modules/local/cram_filter_minimap2_filter5end_fixmate_sort.nf
+++ b/modules/local/cram_filter_minimap2_filter5end_fixmate_sort.nf
@@ -56,4 +56,4 @@ process CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT {
         staden_io: $VERSION
     END_VERSIONS
     """
-}
\ No newline at end of file
+}
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 5b5b4e5..63a6999 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -54,16 +54,7 @@
                     "fa_icon": "fas fa-envelope",
                     "help_text": "Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits. If set in your user config file (`~/.nextflow/config`) then you don't need to specify this on the command line for every run.",
                     "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$"
-                }
-            }
-        },
-        "aligner_options": {
-            "title": "Aligner options",
-            "type": "object",
-            "fa_icon": "fas fa-code",
-            "description": "Options for the aligner used in the pipeline.",
-            "required": ["hic_aligner"],
-            "properties": {
+                },
                 "hic_aligner": {
                     "type": "string",
                     "description": "The aligner to be used for HiC reads.",
diff --git a/subworkflows/local/align_short.nf b/subworkflows/local/align_short.nf
index e74b480..cebc194 100644
--- a/subworkflows/local/align_short.nf
+++ b/subworkflows/local/align_short.nf
@@ -31,8 +31,8 @@ workflow ALIGN_SHORT {
     // Convert from CRAM to FASTQ only if CRAM files were provided as input
     SAMTOOLS_FASTQ ( ch_reads.cram, false )
     ch_versions = ch_versions.mix ( SAMTOOLS_FASTQ.out.versions.first() )
-    
-    
+
+
     SAMTOOLS_FASTQ.out.fastq
     | mix ( ch_reads.fastq )
     | set { ch_reads_fastq }
diff --git a/subworkflows/local/align_short_hic.nf b/subworkflows/local/align_short_hic.nf
index ce50542..39e0cab 100644
--- a/subworkflows/local/align_short_hic.nf
+++ b/subworkflows/local/align_short_hic.nf
@@ -82,7 +82,7 @@ workflow ALIGN_SHORT_HIC {
     | map { meta_bam, bam, meta_cram, cram, crai -> [ meta_cram, bam ] }
     | set { ch_merged_bam }
 
-    
+
     // Collect all BAM output by sample name
     ch_merged_bam
     | map { meta, bam -> [['id': meta.id.split('_')[0..-2].join('_'), 'datatype': meta.datatype], meta.read_count, bam] }
diff --git a/subworkflows/local/hic_bwamem2.nf b/subworkflows/local/hic_bwamem2.nf
index 3c7176a..edf4691 100644
--- a/subworkflows/local/hic_bwamem2.nf
+++ b/subworkflows/local/hic_bwamem2.nf
@@ -65,7 +65,7 @@ workflow HIC_BWAMEM2 {
         }
         .set { collected_files_for_merge }
 
-    
+
     //
     // MODULE: MERGE POSITION SORTED BAM FILES AND MARK DUPLICATES
     //
@@ -80,4 +80,4 @@ workflow HIC_BWAMEM2 {
     emit:
     mergedbam           = SAMTOOLS_MERGE.out.bam
     versions            = ch_versions.ifEmpty(null)
-}
\ No newline at end of file
+}
diff --git a/subworkflows/local/hic_minimap2.nf b/subworkflows/local/hic_minimap2.nf
index 8637c83..23ab866 100644
--- a/subworkflows/local/hic_minimap2.nf
+++ b/subworkflows/local/hic_minimap2.nf
@@ -20,8 +20,8 @@ workflow HIC_MINIMAP2 {
 
 
     //
-    // MODULE: generate minimap2 mmi file 
-    //       
+    // MODULE: generate minimap2 mmi file
+    //
     MINIMAP2_INDEX (
         fasta
         )
@@ -30,7 +30,7 @@ workflow HIC_MINIMAP2 {
 
     //
     // LOGIC: generate input channel for mapping
-    // 
+    //
     csv_ch
         .splitCsv()
         .combine ( fasta )
@@ -55,7 +55,7 @@ workflow HIC_MINIMAP2 {
 
     //
     // MODULE: map hic reads by 10,000 container per time
-    // 
+    //
     CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT (
         ch_filtering_input
 
@@ -92,9 +92,9 @@ workflow HIC_MINIMAP2 {
         [ [], [] ]
     )
     ch_versions         = ch_versions.mix ( SAMTOOLS_MERGE.out.versions.first() )
-    
-    
+
+
     emit:
     mergedbam           = SAMTOOLS_MERGE.out.bam
     versions            = ch_versions.ifEmpty(null)
-}
\ No newline at end of file
+}
diff --git a/workflows/readmapping.nf b/workflows/readmapping.nf
index 1a06b5e..a5e9575 100644
--- a/workflows/readmapping.nf
+++ b/workflows/readmapping.nf
@@ -104,7 +104,7 @@ workflow READMAPPING {
     //
     ALIGN_HIC ( PREPARE_GENOME.out.fasta, PREPARE_GENOME.out.bwaidx, ch_reads.hic )
     ch_versions = ch_versions.mix ( ALIGN_HIC.out.versions )
-    
+
     ALIGN_ILLUMINA ( PREPARE_GENOME.out.fasta, PREPARE_GENOME.out.bwaidx, ch_reads.illumina )
     ch_versions = ch_versions.mix ( ALIGN_ILLUMINA.out.versions )
 

From 69693036fc70779742071a061f6778cf4ebc96c8 Mon Sep 17 00:00:00 2001
From: Tyler Chafin <tc25@sanger.ac.uk>
Date: Wed, 11 Sep 2024 15:12:23 +0100
Subject: [PATCH 07/15] schema update; hic_ to short_aligner and added
 chunksize

---
 nextflow.config      |  4 +++-
 nextflow_schema.json | 30 +++++++++++++++++++++++-------
 2 files changed, 26 insertions(+), 8 deletions(-)

diff --git a/nextflow.config b/nextflow.config
index 8c3c4d0..9e1b306 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -16,7 +16,9 @@ params {
     fasta                      = null
     header                     = null
     // Aligner option
-    hic_aligner                = "minimap2" // Can choose minimap2 and bwamem2
+    short_aligner              = "minimap2" // Can choose minimap2 and bwamem2
+    chunk_size                 = 10000
+
     // Output format options
     outfmt                     = "cram"
     compression                = "crumble"
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 63a6999..91eb9f6 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -54,13 +54,6 @@
                     "fa_icon": "fas fa-envelope",
                     "help_text": "Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits. If set in your user config file (`~/.nextflow/config`) then you don't need to specify this on the command line for every run.",
                     "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$"
-                },
-                "hic_aligner": {
-                    "type": "string",
-                    "description": "The aligner to be used for HiC reads.",
-                    "enum": ["bwamem2", "minimap2"],
-                    "default": "minimap2",
-                    "fa_icon": "fas fa-code"
                 }
             }
         },
@@ -93,6 +86,26 @@
                 }
             }
         },
+        "aligner_options": {
+            "title": "Aligner options",
+            "type": "object",
+            "description": "",
+            "default": "",
+            "properties": {
+                "short_aligner": {
+                    "type": "string",
+                    "default": "minimap2",
+                    "description": "Alignment method to use for short-reads (options are: \"minimap2\" or \"bwamem2\"",
+                    "enum": ["minimap2", "bwamem2"]
+                },
+                "chunk_size": {
+                    "type": "integer",
+                    "default": 10000,
+                    "description": "Chunk size for parallel processing of CRAM files, in number of containers",
+                    "minimum": 1
+                }
+            }
+        },
         "execution": {
             "title": "Execution",
             "type": "object",
@@ -292,6 +305,9 @@
         {
             "$ref": "#/definitions/reference_genome_options"
         },
+        {
+            "$ref": "#/definitions/aligner_options"
+        },
         {
             "$ref": "#/definitions/execution"
         },

From 6ee5f51e5e953f552ad3a8a73d9d23b076d8cde5 Mon Sep 17 00:00:00 2001
From: Tyler Chafin <tc25@sanger.ac.uk>
Date: Wed, 11 Sep 2024 15:12:42 +0100
Subject: [PATCH 08/15] made chunk_size an argument and fix off-by-1 error

---
 bin/generate_cram_csv.sh | 58 +++++++++++++++++++++-------------------
 1 file changed, 31 insertions(+), 27 deletions(-)

diff --git a/bin/generate_cram_csv.sh b/bin/generate_cram_csv.sh
index 89920fd..71af968 100755
--- a/bin/generate_cram_csv.sh
+++ b/bin/generate_cram_csv.sh
@@ -7,36 +7,38 @@
 # Author = yy5
 # ><((((°>    Y    ><((((°>    U     ><((((°>    M     ><((((°>     I     ><((((°>
 
+# NOTE: chunk_size is the number of containers per chunk (not records/reads)
+
 # Function to process chunking of a CRAM file
+
 chunk_cram() {
     local cram=$1
     local chunkn=$2
     local outcsv=$3
     local crai=$4
-    realcram=$(readlink -f ${cram})
-    # realcrai=$(readlink -f ${cram}.crai)
-    realcrai=$(readlink -f ${crai})
+    local chunk_size=$5
+
     local rgline=$(samtools view -H "${realcram}" | grep "@RG" | sed 's/\t/\\t/g' | sed "s/'//g")
     local ncontainers=$(zcat "${realcrai}" | wc -l)
-    # local ncontainers=$(cat "${realcram}" | wc -l)
     local base=$(basename "${realcram}" .cram)
     local from=0
-    local to=10000
-
+    local to=$((chunk_size - 1))
 
     while [ $to -lt $ncontainers ]; do
+        echo "chunk $chunkn: $from - $to"
         echo "${realcram},${realcrai},${from},${to},${base},${chunkn},${rgline}" >> $outcsv
         from=$((to + 1))
-        ((to += 10000))
+        to=$((to + chunk_size))
         ((chunkn++))
     done
 
-    if [ $from -le $ncontainers ]; then
-        echo "${realcram},${realcrai},${from},${ncontainers},${base},${chunkn},${rgline}" >> $outcsv
+    # Catch any remainder
+    if [ $from -lt $ncontainers ]; then
+        to=$((ncontainers - 1))
+        echo "chunk $chunkn: $from - $to"
+        echo "${realcram},${realcrai},${from},${to},${base},${chunkn},${rgline}" >> $outcsv
         ((chunkn++))
     fi
-
-    echo $chunkn
 }
 
 # Function to process a CRAM file
@@ -45,24 +47,23 @@ process_cram_file() {
     local chunkn=$2
     local outcsv=$3
     local crai=$4
+    local chunk_size=$5
 
     local read_groups=$(samtools view -H "$cram" | grep '@RG' | awk '{for(i=1;i<=NF;i++){if($i ~ /^ID:/){print substr($i,4)}}}')
     local num_read_groups=$(echo "$read_groups" | wc -w)
-
     if [ "$num_read_groups" -gt 1 ]; then
         # Multiple read groups: process each separately
         for rg in $read_groups; do
             local output_cram="$(basename "${cram%.cram}")_output_${rg}.cram"
             samtools view -h -r "$rg" -o "$output_cram" "$cram"
-            samtools index "$output_cram"
-            chunkn=$(chunk_cram "$output_cram" "$chunkn" "$outcsv" "$crai")
+            #chunkn=$(chunk_cram "$output_cram" "$chunkn" "$outcsv" "$crai" "$chunk_size")
+            chunk_cram "$output_cram" "$chunkn" "$outcsv" "$crai" "$chunk_size"
         done
     else
         # Single read group or no read groups
-        chunkn=$(chunk_cram "$cram" "$chunkn" "$outcsv" "$crai")
+        #chunkn=$(chunk_cram "$cram" "$chunkn" "$outcsv" "$crai" "$chunk_size")
+        chunk_cram "$cram" "$chunkn" "$outcsv" "$crai" "$chunk_size"
     fi
-
-    echo $chunkn
 }
 
 #  /\_/\        /\_/\
@@ -70,22 +71,25 @@ process_cram_file() {
 #  > ^ <        > ^ <
 
 # Check if cram_path is provided
-if [ -z "$1" ]; then
-    echo "Usage: $0 <cram_path>"
+if [ $# -lt 3 ]; then
+    echo "Usage: $0 <cram_path> <output_csv> <crai_file> <chunk_size>"
     exit 1
 fi
 
-# cram_path=$1
 cram=$1
-chunkn=0
 outcsv=$2
 crai=$3
+if [ -z "$4" ]; then
+    chunk_size=10000
+else
+    chunk_size=$4
+fi
+chunkn=0
 
-# # Loop through each CRAM file in the specified directory. cram cannot be the synlinked cram
-# for cram in ${cram_path}/*.cram; do
-#     realcram=$(readlink -f $cram)
-#     chunkn=$(process_cram_file $realcram $chunkn $outcsv)
-# done
+# Operates on a single CRAM file
 realcram=$(readlink -f $cram)
 realcrai=$(readlink -f $crai)
-chunkn=$(process_cram_file $realcram $chunkn $outcsv $realcrai)
+if [ -f "$outcsv" ]; then
+    rm "$outcsv"
+fi
+process_cram_file $realcram $chunkn $outcsv $realcrai $chunk_size

From 513229d22e413f6ebafdd337d66df5c3547e4964 Mon Sep 17 00:00:00 2001
From: Tyler Chafin <tc25@sanger.ac.uk>
Date: Wed, 11 Sep 2024 15:14:32 +0100
Subject: [PATCH 09/15] change hic_ to _mapreduce subworkflows

---
 subworkflows/local/align_short_hic.nf    |   6 +-
 subworkflows/local/bwamem2_mapreduce.nf  |  84 +++++++++++++++++++
 subworkflows/local/minimap2_mapreduce.nf | 101 +++++++++++++++++++++++
 3 files changed, 188 insertions(+), 3 deletions(-)
 create mode 100644 subworkflows/local/bwamem2_mapreduce.nf
 create mode 100644 subworkflows/local/minimap2_mapreduce.nf

diff --git a/subworkflows/local/align_short_hic.nf b/subworkflows/local/align_short_hic.nf
index 39e0cab..92c2010 100644
--- a/subworkflows/local/align_short_hic.nf
+++ b/subworkflows/local/align_short_hic.nf
@@ -8,8 +8,8 @@ include { SAMTOOLS_SORMADUP                     } from '../../modules/local/samt
 include { SAMTOOLS_INDEX                        } from '../../modules/nf-core/samtools/index/main'
 include { GENERATE_CRAM_CSV                     } from '../../modules/local/generate_cram_csv'
 include { SAMTOOLS_SORMADUP as CONVERT_CRAM     } from '../../modules/local/samtools_sormadup'
-include { HIC_MINIMAP2                          } from '../../subworkflows/local/hic_minimap2'
-include { HIC_BWAMEM2                           } from '../../subworkflows/local/hic_bwamem2'
+include { MINIMAP2_MAPREDUCE as HIC_MINIMAP2    } from '../../subworkflows/local/minimap2_mapreduce'
+include { BWAMEM2_MAPREDUCE as HIC_BWAMEM2      } from '../../subworkflows/local/bwamem2_mapreduce'
 
 workflow ALIGN_SHORT_HIC {
     take:
@@ -60,7 +60,7 @@ workflow ALIGN_SHORT_HIC {
     //
     // SUBWORKFLOW: mapping hic reads using minimap2 or bwamem2
     //
-    if (params.hic_aligner == 'minimap2') {
+    if (params.short_aligner.startsWith("minimap")) {
         HIC_MINIMAP2 (
             fasta,
             GENERATE_CRAM_CSV.out.csv
diff --git a/subworkflows/local/bwamem2_mapreduce.nf b/subworkflows/local/bwamem2_mapreduce.nf
new file mode 100644
index 0000000..ceaff1b
--- /dev/null
+++ b/subworkflows/local/bwamem2_mapreduce.nf
@@ -0,0 +1,84 @@
+#!/usr/bin/env nextflow
+
+//
+// MODULE IMPORT BLOCK
+//
+include { CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT          } from '../../modules/local/cram_filter_align_bwamem2_fixmate_sort'
+include { SAMTOOLS_MERGE                                  } from '../../modules/nf-core/samtools/merge/main'
+
+workflow BWAMEM2_MAPREDUCE {
+    take:
+    fasta     // Channel: tuple [ val(meta), path( file )      ]
+    csv_ch
+    index
+
+
+    main:
+    ch_versions         = Channel.empty()
+    mappedbam_ch        = Channel.empty()
+
+
+    csv_ch
+        .splitCsv()
+        .map{ cram_id, cram_info ->
+            tuple([
+                    id: cram_id.id,
+                    chunk_id: cram_id.id + "_" + cram_info[5]
+                    ],
+                file(cram_info[0]),
+                cram_info[1],
+                cram_info[2],
+                cram_info[3],
+                cram_info[4],
+                cram_info[5],
+                cram_info[6]
+            )
+    }
+    .set { ch_filtering_input }
+
+    //
+    // MODULE: map hic reads by 10,000 container per time using bwamem2
+    //
+    CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT (
+        ch_filtering_input,
+        fasta,
+        index
+
+    )
+    ch_versions         = ch_versions.mix( CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT.out.versions )
+    mappedbam_ch        = CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT.out.mappedbam
+
+    //
+    // LOGIC: PREPARING BAMS FOR MERGE
+    //
+    mappedbam_ch
+        .map{ meta, file ->
+            tuple( file )
+        }
+        .collect()
+        .map { file ->
+            tuple (
+                [
+                id: file[0].toString().split('/')[-1].split('_')[0] + '_' + file[0].toString().split('/')[-1].split('_')[1]
+                ],
+                file
+            )
+        }
+        .set { collected_files_for_merge }
+
+
+    //
+    // MODULE: MERGE POSITION SORTED BAM FILES AND MARK DUPLICATES
+    //
+    SAMTOOLS_MERGE (
+        collected_files_for_merge,
+        fasta,
+        [ [], [] ]
+    )
+    ch_versions         = ch_versions.mix ( SAMTOOLS_MERGE.out.versions.first() )
+
+
+    emit:
+    mergedbam           = SAMTOOLS_MERGE.out.bam
+    versions            = ch_versions.ifEmpty(null)
+}
diff --git a/subworkflows/local/minimap2_mapreduce.nf b/subworkflows/local/minimap2_mapreduce.nf
new file mode 100644
index 0000000..35b5aae
--- /dev/null
+++ b/subworkflows/local/minimap2_mapreduce.nf
@@ -0,0 +1,101 @@
+#!/usr/bin/env nextflow
+
+//
+// MODULE IMPORT BLOCK
+//
+include { CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT    } from '../../modules/local/cram_filter_minimap2_filter5end_fixmate_sort'
+include { SAMTOOLS_MERGE                                  } from '../../modules/nf-core/samtools/merge/main'
+include { MINIMAP2_INDEX                                  } from '../../modules/nf-core/minimap2/index/main'
+
+
+workflow MINIMAP2_MAPREDUCE {
+    take:
+    fasta    // Channel: tuple [ val(meta), path( file )      ]
+    csv_ch
+
+
+    main:
+    ch_versions         = Channel.empty()
+    mappedbam_ch        = Channel.empty()
+
+
+    //
+    // MODULE: generate minimap2 mmi file
+    //
+    MINIMAP2_INDEX (
+        fasta
+        )
+    ch_versions         = ch_versions.mix( MINIMAP2_INDEX.out.versions )
+
+
+    //
+    // LOGIC: generate input channel for mapping
+    //
+    csv_ch
+        .splitCsv()
+        .combine ( fasta )
+        .combine ( MINIMAP2_INDEX.out.index )
+        .map{ cram_id, cram_info, ref_id, ref_dir, mmi_id, mmi_path->
+            tuple([
+                    id: cram_id.id,
+                    chunk_id: cram_id.id + "_" + cram_info[5]
+                    ],
+                file(cram_info[0]),
+                cram_info[1],
+                cram_info[2],
+                cram_info[3],
+                cram_info[4],
+                cram_info[5],
+                cram_info[6],
+                mmi_path.toString(),
+                ref_dir
+            )
+    }
+    .set { ch_filtering_input }
+
+
+    //
+    // MODULE: map hic reads by 10,000 container per time
+    //
+    CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT (
+        ch_filtering_input
+
+    )
+    ch_versions         = ch_versions.mix( CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT.out.versions )
+    mappedbam_ch        = CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT.out.mappedbam
+
+
+    //
+    // LOGIC: PREPARING BAMS FOR MERGE
+    //
+    mappedbam_ch
+        .map{ meta, file ->
+            tuple( file )
+        }
+        .collect()
+        .map { file ->
+            tuple (
+                [
+                id: file[0].toString().split('/')[-1].split('_')[0] + '_' + file[0].toString().split('/')[-1].split('_')[1]
+                ],
+                file
+            )
+        }
+        .set { collected_files_for_merge }
+
+
+    //
+    // MODULE: MERGE POSITION SORTED BAM FILES AND MARK DUPLICATES
+    //
+    SAMTOOLS_MERGE (
+        collected_files_for_merge,
+        fasta,
+        [ [], [] ]
+    )
+    ch_versions         = ch_versions.mix ( SAMTOOLS_MERGE.out.versions.first() )
+
+
+    emit:
+    mergedbam           = SAMTOOLS_MERGE.out.bam
+    versions            = ch_versions.ifEmpty(null)
+}

From 11c3c314c1857f3648f5fe8cc51e8e599f6d0f1f Mon Sep 17 00:00:00 2001
From: Tyler Chafin <tc25@sanger.ac.uk>
Date: Wed, 11 Sep 2024 15:15:23 +0100
Subject: [PATCH 10/15] change hic_ to _mapreduce subworkflows

---
 subworkflows/local/hic_bwamem2.nf  |  83 ------------------------
 subworkflows/local/hic_minimap2.nf | 100 -----------------------------
 2 files changed, 183 deletions(-)
 delete mode 100644 subworkflows/local/hic_bwamem2.nf
 delete mode 100644 subworkflows/local/hic_minimap2.nf

diff --git a/subworkflows/local/hic_bwamem2.nf b/subworkflows/local/hic_bwamem2.nf
deleted file mode 100644
index edf4691..0000000
--- a/subworkflows/local/hic_bwamem2.nf
+++ /dev/null
@@ -1,83 +0,0 @@
-#!/usr/bin/env nextflow
-
-//
-// MODULE IMPORT BLOCK
-//
-include { CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT          } from '../../modules/local/cram_filter_align_bwamem2_fixmate_sort'
-include { SAMTOOLS_MERGE                                  } from '../../modules/nf-core/samtools/merge/main'
-
-workflow HIC_BWAMEM2 {
-    take:
-    fasta     // Channel: tuple [ val(meta), path( file )      ]
-    csv_ch
-    index
-
-
-    main:
-    ch_versions         = Channel.empty()
-    mappedbam_ch        = Channel.empty()
-
-
-    csv_ch
-        .splitCsv()
-        .map{ cram_id, cram_info ->
-            tuple([
-                    id: cram_id.id
-                    ],
-                file(cram_info[0]),
-                cram_info[1],
-                cram_info[2],
-                cram_info[3],
-                cram_info[4],
-                cram_info[5],
-                cram_info[6]
-            )
-    }
-    .set { ch_filtering_input }
-
-    //
-    // MODULE: map hic reads by 10,000 container per time using bwamem2
-    //
-    CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT (
-        ch_filtering_input,
-        fasta,
-        index
-
-    )
-    ch_versions         = ch_versions.mix( CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT.out.versions )
-    mappedbam_ch        = CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT.out.mappedbam
-
-    //
-    // LOGIC: PREPARING BAMS FOR MERGE
-    //
-    mappedbam_ch
-        .map{ meta, file ->
-            tuple( file )
-        }
-        .collect()
-        .map { file ->
-            tuple (
-                [
-                id: file[0].toString().split('/')[-1].split('_')[0] + '_' + file[0].toString().split('/')[-1].split('_')[1]
-                ],
-                file
-            )
-        }
-        .set { collected_files_for_merge }
-
-
-    //
-    // MODULE: MERGE POSITION SORTED BAM FILES AND MARK DUPLICATES
-    //
-    SAMTOOLS_MERGE (
-        collected_files_for_merge,
-        fasta,
-        [ [], [] ]
-    )
-    ch_versions         = ch_versions.mix ( SAMTOOLS_MERGE.out.versions.first() )
-
-
-    emit:
-    mergedbam           = SAMTOOLS_MERGE.out.bam
-    versions            = ch_versions.ifEmpty(null)
-}
diff --git a/subworkflows/local/hic_minimap2.nf b/subworkflows/local/hic_minimap2.nf
deleted file mode 100644
index 23ab866..0000000
--- a/subworkflows/local/hic_minimap2.nf
+++ /dev/null
@@ -1,100 +0,0 @@
-#!/usr/bin/env nextflow
-
-//
-// MODULE IMPORT BLOCK
-//
-include { CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT    } from '../../modules/local/cram_filter_minimap2_filter5end_fixmate_sort'
-include { SAMTOOLS_MERGE                                  } from '../../modules/nf-core/samtools/merge/main'
-include { MINIMAP2_INDEX                                  } from '../../modules/nf-core/minimap2/index/main'
-
-
-workflow HIC_MINIMAP2 {
-    take:
-    fasta    // Channel: tuple [ val(meta), path( file )      ]
-    csv_ch
-
-
-    main:
-    ch_versions         = Channel.empty()
-    mappedbam_ch        = Channel.empty()
-
-
-    //
-    // MODULE: generate minimap2 mmi file
-    //
-    MINIMAP2_INDEX (
-        fasta
-        )
-    ch_versions         = ch_versions.mix( MINIMAP2_INDEX.out.versions )
-
-
-    //
-    // LOGIC: generate input channel for mapping
-    //
-    csv_ch
-        .splitCsv()
-        .combine ( fasta )
-        .combine ( MINIMAP2_INDEX.out.index )
-        .map{ cram_id, cram_info, ref_id, ref_dir, mmi_id, mmi_path->
-            tuple([
-                    id: cram_id.id
-                    ],
-                file(cram_info[0]),
-                cram_info[1],
-                cram_info[2],
-                cram_info[3],
-                cram_info[4],
-                cram_info[5],
-                cram_info[6],
-                mmi_path.toString(),
-                ref_dir
-            )
-    }
-    .set { ch_filtering_input }
-
-
-    //
-    // MODULE: map hic reads by 10,000 container per time
-    //
-    CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT (
-        ch_filtering_input
-
-    )
-    ch_versions         = ch_versions.mix( CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT.out.versions )
-    mappedbam_ch        = CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT.out.mappedbam
-
-
-    //
-    // LOGIC: PREPARING BAMS FOR MERGE
-    //
-    mappedbam_ch
-        .map{ meta, file ->
-            tuple( file )
-        }
-        .collect()
-        .map { file ->
-            tuple (
-                [
-                id: file[0].toString().split('/')[-1].split('_')[0] + '_' + file[0].toString().split('/')[-1].split('_')[1]
-                ],
-                file
-            )
-        }
-        .set { collected_files_for_merge }
-
-
-    //
-    // MODULE: MERGE POSITION SORTED BAM FILES AND MARK DUPLICATES
-    //
-    SAMTOOLS_MERGE (
-        collected_files_for_merge,
-        fasta,
-        [ [], [] ]
-    )
-    ch_versions         = ch_versions.mix ( SAMTOOLS_MERGE.out.versions.first() )
-
-
-    emit:
-    mergedbam           = SAMTOOLS_MERGE.out.bam
-    versions            = ch_versions.ifEmpty(null)
-}

From fe7dd7a9bffa9215e98fbc74fd3a8ab28e0a5854 Mon Sep 17 00:00:00 2001
From: Tyler Chafin <tc25@sanger.ac.uk>
Date: Wed, 11 Sep 2024 15:19:45 +0100
Subject: [PATCH 11/15] now passes params.chunk_size

---
 modules/local/generate_cram_csv.nf | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/modules/local/generate_cram_csv.nf b/modules/local/generate_cram_csv.nf
index 6a3b120..d64933f 100644
--- a/modules/local/generate_cram_csv.nf
+++ b/modules/local/generate_cram_csv.nf
@@ -1,8 +1,10 @@
 process GENERATE_CRAM_CSV {
     tag "${meta.id}"
-    // label 'process_tiny'
+    label 'process_single'
 
-    container 'quay.io/sanger-tol/cramfilter_bwamem2_minimap2_samtools_perl:0.001-c1'
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'quay.io/sanger-tol/cramfilter_bwamem2_minimap2_samtools_perl:0.001-c1' :
+        'sanger-tol/cramfilter_bwamem2_minimap2_samtools_perl:0.001-c1' }"
 
     input:
     tuple val(meta), path(crampath), path(crai)
@@ -15,7 +17,7 @@ process GENERATE_CRAM_CSV {
     script:
     def prefix = task.ext.prefix ?: "${meta.id}"
     """
-    generate_cram_csv.sh $crampath ${prefix}_cram.csv $crai
+    generate_cram_csv.sh $crampath ${prefix}_cram.csv $crai ${params.chunk_size}
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":

From b26d3403daa79b87eb2a46255fee74e3c22c7fd5 Mon Sep 17 00:00:00 2001
From: Tyler Chafin <tc25@sanger.ac.uk>
Date: Wed, 11 Sep 2024 15:20:10 +0100
Subject: [PATCH 12/15] tag with chunk_id so process tracking is accurate

---
 modules/local/cram_filter_align_bwamem2_fixmate_sort.nf       | 2 +-
 modules/local/cram_filter_minimap2_filter5end_fixmate_sort.nf | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/local/cram_filter_align_bwamem2_fixmate_sort.nf b/modules/local/cram_filter_align_bwamem2_fixmate_sort.nf
index 9b5ca29..667e49b 100644
--- a/modules/local/cram_filter_align_bwamem2_fixmate_sort.nf
+++ b/modules/local/cram_filter_align_bwamem2_fixmate_sort.nf
@@ -1,5 +1,5 @@
 process CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT {
-    tag "$meta.id"
+    tag "$meta.chunk_id"
     label "process_high"
 
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
diff --git a/modules/local/cram_filter_minimap2_filter5end_fixmate_sort.nf b/modules/local/cram_filter_minimap2_filter5end_fixmate_sort.nf
index 8d8d69e..ceafb86 100644
--- a/modules/local/cram_filter_minimap2_filter5end_fixmate_sort.nf
+++ b/modules/local/cram_filter_minimap2_filter5end_fixmate_sort.nf
@@ -1,5 +1,5 @@
 process CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT {
-    tag "$meta.id"
+    tag "$meta.chunk_id"
     label "process_high"
 
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?

From fe4d66081acf993c6fcc2a11073c1c0ca95cede3 Mon Sep 17 00:00:00 2001
From: Tyler Chafin <tc25@sanger.ac.uk>
Date: Wed, 11 Sep 2024 16:18:19 +0100
Subject: [PATCH 13/15] samtools/addreplacerg module

---
 conf/modules.config                    | 21 ++++++++++
 modules/local/samtools_addreplacerg.nf | 56 ++++++++++++++++++++++++++
 2 files changed, 77 insertions(+)
 create mode 100644 modules/local/samtools_addreplacerg.nf

diff --git a/conf/modules.config b/conf/modules.config
index a7a59cb..3fea793 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -50,6 +50,22 @@ process {
         ext.args = "--output-fmt cram"
     }
 
+    withName: ".*:.*:ILLUMINA_BWAMEM2:CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT" {
+        ext.args        = ""
+        ext.args1       = { "-F 0x200 -nt -1 '${base}_${chunkid}_1.fastq.gz'" }
+        ext.args2       = { "-R '${rglines}'" }
+        ext.args3       = "-mpu"
+        ext.args4       = { "--write-index -l1" }
+    }
+
+    withName: ".*:.*:ILLUMINA_MINIMAP2:CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT" {
+        ext.args        = ""
+        ext.args1       = { "-F 0x200 -nt -1 '${base}_${chunkid}_1.fastq.gz'" }
+        ext.args2       = { "-ax sr" }
+        ext.args3       = "-mpu"
+        ext.args4       = { "--write-index -l1" }
+    }
+
     withName: ".*:.*:HIC_BWAMEM2:CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT" {
         ext.args        = ""
         ext.args1       = { "-F 0x200 -nt -1 '${base}_${chunkid}_1.fastq.gz'" }
@@ -105,6 +121,11 @@ process {
         ext.prefix = { "${bam.baseName}" }
     }
 
+    withName: SAMTOOLS_ADDREPLACERG {
+        ext.prefix = { "${input.baseName}_addRG" }
+        ext.args = { "-r ${meta.read_group}" }
+    }
+
     withName: SAMTOOLS_STATS {
         ext.prefix = { "${input.baseName}" }
     }
diff --git a/modules/local/samtools_addreplacerg.nf b/modules/local/samtools_addreplacerg.nf
new file mode 100644
index 0000000..5b59500
--- /dev/null
+++ b/modules/local/samtools_addreplacerg.nf
@@ -0,0 +1,56 @@
+process SAMTOOLS_ADDREPLACERG {
+    tag "$meta.id"
+    label 'process_low'
+
+    conda "bioconda::samtools=1.20"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/samtools:1.20--h50ea8bc_0' :
+        'biocontainers/samtools:1.20--h50ea8bc_0' }"
+
+    input:
+    tuple val(meta), path(input)
+
+    output:
+    tuple val(meta), path("${prefix}.bam"),                                    emit: bam,              optional: true
+    tuple val(meta), path("${prefix}.cram"),                                   emit: cram,             optional: true
+    tuple val(meta), path("${prefix}.sam"),                                    emit: sam,              optional: true
+    path  "versions.yml",                                                      emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    prefix = task.ext.prefix ?: "${meta.id}"
+    file_type = input.getExtension()
+    if ("$input" == "${prefix}.${file_type}") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!"
+    """
+    samtools \\
+        addreplacerg \\
+        --threads ${task.cpus-1} \\
+        $args \\
+        -o ${prefix}.${file_type} \\
+        $input
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//')
+    END_VERSIONS
+    """
+
+    stub:
+    def args = task.ext.args ?: ''
+    prefix = task.ext.prefix ?: "${meta.id}"
+    file_type = input.getExtension()
+    if ("$input" == "${prefix}.${file_type}") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!"
+
+    """
+    touch ${prefix}.${file_type}
+    ${index}
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//')
+    END_VERSIONS
+    """
+}

From 17bbbc747974bb8a6831720ad62c8687b9efc959 Mon Sep 17 00:00:00 2001
From: Tyler Chafin <tc25@sanger.ac.uk>
Date: Thu, 12 Sep 2024 15:05:56 +0100
Subject: [PATCH 14/15] minor changes... currently working

---
 bin/generate_cram_csv.sh | 8 ++++++--
 conf/modules.config      | 8 ++++----
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/bin/generate_cram_csv.sh b/bin/generate_cram_csv.sh
index 71af968..9e56831 100755
--- a/bin/generate_cram_csv.sh
+++ b/bin/generate_cram_csv.sh
@@ -21,11 +21,15 @@ chunk_cram() {
     local rgline=$(samtools view -H "${realcram}" | grep "@RG" | sed 's/\t/\\t/g' | sed "s/'//g")
     local ncontainers=$(zcat "${realcrai}" | wc -l)
     local base=$(basename "${realcram}" .cram)
+
+    if [ $chunk_size -gt $ncontainers ]; then
+        chunk_size=$((ncontainers - 1))
+    fi
     local from=0
     local to=$((chunk_size - 1))
 
     while [ $to -lt $ncontainers ]; do
-        echo "chunk $chunkn: $from - $to"
+        #echo "chunk $chunkn: $from - $to"
         echo "${realcram},${realcrai},${from},${to},${base},${chunkn},${rgline}" >> $outcsv
         from=$((to + 1))
         to=$((to + chunk_size))
@@ -35,7 +39,7 @@ chunk_cram() {
     # Catch any remainder
     if [ $from -lt $ncontainers ]; then
         to=$((ncontainers - 1))
-        echo "chunk $chunkn: $from - $to"
+        #echo "chunk $chunkn: $from - $to"
         echo "${realcram},${realcrai},${from},${to},${base},${chunkn},${rgline}" >> $outcsv
         ((chunkn++))
     fi
diff --git a/conf/modules.config b/conf/modules.config
index 3fea793..028ef95 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -52,7 +52,7 @@ process {
 
     withName: ".*:.*:ILLUMINA_BWAMEM2:CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT" {
         ext.args        = ""
-        ext.args1       = { "-F 0x200 -nt -1 '${base}_${chunkid}_1.fastq.gz'" }
+        ext.args1       = { "" }
         ext.args2       = { "-R '${rglines}'" }
         ext.args3       = "-mpu"
         ext.args4       = { "--write-index -l1" }
@@ -60,7 +60,7 @@ process {
 
     withName: ".*:.*:ILLUMINA_MINIMAP2:CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT" {
         ext.args        = ""
-        ext.args1       = { "-F 0x200 -nt -1 '${base}_${chunkid}_1.fastq.gz'" }
+        ext.args1       = { "" }
         ext.args2       = { "-ax sr" }
         ext.args3       = "-mpu"
         ext.args4       = { "--write-index -l1" }
@@ -68,7 +68,7 @@ process {
 
     withName: ".*:.*:HIC_BWAMEM2:CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT" {
         ext.args        = ""
-        ext.args1       = { "-F 0x200 -nt -1 '${base}_${chunkid}_1.fastq.gz'" }
+        ext.args1       = { "" }
         ext.args2       = { "-5SPCp -R '${rglines}'" }
         ext.args3       = "-mpu"
         ext.args4       = { "--write-index -l1" }
@@ -76,7 +76,7 @@ process {
 
     withName: ".*:.*:HIC_MINIMAP2:CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT" {
         ext.args        = ""
-        ext.args1       = { "-F 0x200 -nt -1 '${base}_${chunkid}_1.fastq.gz'" }
+        ext.args1       = { "" }
         ext.args2       = { "-ax sr" }
         ext.args3       = "-mpu"
         ext.args4       = { "--write-index -l1" }

From a7db73770e216868862f856082bc028b9a0fdf72 Mon Sep 17 00:00:00 2001
From: Tyler Chafin <tc25@sanger.ac.uk>
Date: Mon, 16 Sep 2024 12:02:56 +0100
Subject: [PATCH 15/15] generalise mapreduce to illumina

---
 conf/modules.config                     |  36 ++++----
 conf/test.config                        |   2 +-
 modules/local/generate_cram_csv.nf      |   2 +-
 modules/local/samtools_addreplacerg.nf  |   2 +-
 nextflow.config                         |   1 +
 subworkflows/local/align_short.nf       |  86 +++++++++++++-----
 subworkflows/local/align_short_hic.nf   | 116 ------------------------
 subworkflows/local/bwamem2_mapreduce.nf |   3 +-
 subworkflows/local/prepare_genome.nf    |   6 +-
 workflows/readmapping.nf                |   2 +-
 10 files changed, 92 insertions(+), 164 deletions(-)
 delete mode 100644 subworkflows/local/align_short_hic.nf

diff --git a/conf/modules.config b/conf/modules.config
index 028ef95..ebf3fb0 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -15,14 +15,6 @@ process {
         ext.args = '-F 0x200 -nt'
     }
 
-    withName: '.*:.*:ALIGN_HIC:BWAMEM2_MEM' {
-        ext.args = { "-5SPCp -R ${meta.read_group}" }
-    }
-
-    withName: '.*:.*:ALIGN_ILLUMINA:BWAMEM2_MEM' {
-        ext.args = { "-R ${meta.read_group}" }
-    }
-
     withName: SAMTOOLS_MERGE {
         ext.args = { "-c -p" }
         ext.prefix = { "${meta.id}.merge" }
@@ -50,31 +42,39 @@ process {
         ext.args = "--output-fmt cram"
     }
 
-    withName: ".*:.*:ILLUMINA_BWAMEM2:CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT" {
+    withName: '.*:.*:ALIGN_HIC:BWAMEM2_MEM' {
+        ext.args = { "-5SPCp -R ${meta.read_group}" }
+    }
+
+    withName: '.*:.*:ALIGN_ILLUMINA:BWAMEM2_MEM' {
+        ext.args = { "-p -R ${meta.read_group}" }
+    }
+
+    withName: ".*:ALIGN_ILLUMINA:.*:CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT" {
         ext.args        = ""
-        ext.args1       = { "" }
-        ext.args2       = { "-R '${rglines}'" }
+        ext.args1       = { "-F 0x200 -nt" }
+        ext.args2       = { "-p -R '${rglines}'" }
         ext.args3       = "-mpu"
         ext.args4       = { "--write-index -l1" }
     }
 
-    withName: ".*:.*:ILLUMINA_MINIMAP2:CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT" {
+    withName: ".*:ALIGN_ILLUMINA:.*:CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT" {
         ext.args        = ""
-        ext.args1       = { "" }
+        ext.args1       = { "-F 0x200 -nt" }
         ext.args2       = { "-ax sr" }
         ext.args3       = "-mpu"
         ext.args4       = { "--write-index -l1" }
     }
 
-    withName: ".*:.*:HIC_BWAMEM2:CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT" {
+    withName: ".*:ALIGN_HIC:.*:CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT" {
         ext.args        = ""
-        ext.args1       = { "" }
+        ext.args1       = { "-F 0x200 -nt" }
         ext.args2       = { "-5SPCp -R '${rglines}'" }
         ext.args3       = "-mpu"
         ext.args4       = { "--write-index -l1" }
     }
 
-    withName: ".*:.*:HIC_MINIMAP2:CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT" {
+    withName: ".*:ALIGN_HIC:.*:CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT" {
         ext.args        = ""
         ext.args1       = { "" }
         ext.args2       = { "-ax sr" }
@@ -82,7 +82,7 @@ process {
         ext.args4       = { "--write-index -l1" }
     }
 
-    withName: ".*:.*:HIC_MINIMAP2:MINIMAP2_INDEX" {
+    withName: "MINIMAP2_INDEX" {
         ext.args        = { "${fasta.size() > 2.5e9 ? (" -I " + Math.ceil(fasta.size()/1e9)+"G") : ""} "}
     }
 
@@ -123,7 +123,7 @@ process {
 
     withName: SAMTOOLS_ADDREPLACERG {
         ext.prefix = { "${input.baseName}_addRG" }
-        ext.args = { "-r ${meta.read_group}" }
+        ext.args = { "-r ${meta.read_group} --no-PG" }
     }
 
     withName: SAMTOOLS_STATS {
diff --git a/conf/test.config b/conf/test.config
index e59b389..add9178 100644
--- a/conf/test.config
+++ b/conf/test.config
@@ -26,7 +26,7 @@ params {
     outdir = "${projectDir}/results"
 
     // Aligner
-    hic_aligner = "minimap2"
+    short_aligner = "minimap2"
 
     // Fasta references
     fasta = "https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/assembly/release/mMelMel3.1_paternal_haplotype/GCA_922984935.2.subset.fasta.gz"
diff --git a/modules/local/generate_cram_csv.nf b/modules/local/generate_cram_csv.nf
index d64933f..f3cfe96 100644
--- a/modules/local/generate_cram_csv.nf
+++ b/modules/local/generate_cram_csv.nf
@@ -3,7 +3,7 @@ process GENERATE_CRAM_CSV {
     label 'process_single'
 
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'quay.io/sanger-tol/cramfilter_bwamem2_minimap2_samtools_perl:0.001-c1' :
+        'https://quay.io/sanger-tol/cramfilter_bwamem2_minimap2_samtools_perl:0.001-c1' :
         'sanger-tol/cramfilter_bwamem2_minimap2_samtools_perl:0.001-c1' }"
 
     input:
diff --git a/modules/local/samtools_addreplacerg.nf b/modules/local/samtools_addreplacerg.nf
index 5b59500..6515c3e 100644
--- a/modules/local/samtools_addreplacerg.nf
+++ b/modules/local/samtools_addreplacerg.nf
@@ -27,7 +27,7 @@ process SAMTOOLS_ADDREPLACERG {
     """
     samtools \\
         addreplacerg \\
-        --threads ${task.cpus-1} \\
+        --threads $task.cpus \\
         $args \\
         -o ${prefix}.${file_type} \\
         $input
diff --git a/nextflow.config b/nextflow.config
index 9e1b306..d143247 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -15,6 +15,7 @@ params {
     bwamem2_index              = null
     fasta                      = null
     header                     = null
+
     // Aligner option
     short_aligner              = "minimap2" // Can choose minimap2 and bwamem2
     chunk_size                 = 10000
diff --git a/subworkflows/local/align_short.nf b/subworkflows/local/align_short.nf
index cebc194..4f49cdc 100644
--- a/subworkflows/local/align_short.nf
+++ b/subworkflows/local/align_short.nf
@@ -2,21 +2,26 @@
 // Align short read (HiC and Illumina) data against the genome
 //
 
-include { SAMTOOLS_FASTQ    } from '../../modules/nf-core/samtools/fastq/main'
-include { BWAMEM2_MEM       } from '../../modules/nf-core/bwamem2/mem/main'
-include { SAMTOOLS_MERGE    } from '../../modules/nf-core/samtools/merge/main'
-include { SAMTOOLS_SORMADUP } from '../../modules/local/samtools_sormadup'
-
+include { SAMTOOLS_FASTQ                        } from '../../modules/nf-core/samtools/fastq/main'
+include { SAMTOOLS_MERGE                        } from '../../modules/nf-core/samtools/merge/main'
+include { SAMTOOLS_SORMADUP                     } from '../../modules/local/samtools_sormadup'
+include { SAMTOOLS_INDEX                        } from '../../modules/nf-core/samtools/index/main'
+include { GENERATE_CRAM_CSV                     } from '../../modules/local/generate_cram_csv'
+include { SAMTOOLS_SORMADUP as CONVERT_CRAM     } from '../../modules/local/samtools_sormadup'
+include { SAMTOOLS_ADDREPLACERG                 } from '../../modules/local/samtools_addreplacerg'
+include { MINIMAP2_MAPREDUCE                    } from '../../subworkflows/local/minimap2_mapreduce'
+include { BWAMEM2_MAPREDUCE                     } from '../../subworkflows/local/bwamem2_mapreduce'
 
 workflow ALIGN_SHORT {
     take:
-    fasta    // channel: [ val(meta), /path/to/fasta ]
+    fasta    // channel: [ val(meta), /path/to/fasta ] reference_tuple
     index    // channel: [ val(meta), /path/to/bwamem2/ ]
-    reads    // channel: [ val(meta), /path/to/datafile ]
+    reads    // channel: [ val(meta), /path/to/datafile ] hic_reads_path
 
 
     main:
     ch_versions = Channel.empty()
+    ch_merged_bam   = Channel.empty()
 
     // Check file types and branch
     reads
@@ -28,23 +33,60 @@ workflow ALIGN_SHORT {
     | set { ch_reads }
 
 
-    // Convert from CRAM to FASTQ only if CRAM files were provided as input
-    SAMTOOLS_FASTQ ( ch_reads.cram, false )
-    ch_versions = ch_versions.mix ( SAMTOOLS_FASTQ.out.versions.first() )
-
-
-    SAMTOOLS_FASTQ.out.fastq
-    | mix ( ch_reads.fastq )
-    | set { ch_reads_fastq }
-
+    // Convert FASTQ to CRAM only if FASTQ were provided as input
+    CONVERT_CRAM ( ch_reads.fastq, fasta )
+    ch_versions = ch_versions.mix ( CONVERT_CRAM.out.versions )
+
+    SAMTOOLS_ADDREPLACERG ( CONVERT_CRAM.out.bam )
+    ch_versions = ch_versions.mix ( SAMTOOLS_ADDREPLACERG.out.versions )
+
+    SAMTOOLS_ADDREPLACERG.out.cram
+    | mix ( ch_reads.cram )
+    | set { ch_reads_cram }
+
+    // Index the CRAM file
+    SAMTOOLS_INDEX ( ch_reads_cram )
+    ch_versions = ch_versions.mix( SAMTOOLS_INDEX.out.versions )
+
+    ch_reads_cram
+    | join ( SAMTOOLS_INDEX.out.crai )
+    | set { ch_reads_cram_crai }
+
+
+    //
+    // MODULE: generate a CRAM CSV file containing the required parametres for CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT
+    //
+    GENERATE_CRAM_CSV( ch_reads_cram_crai )
+    ch_versions = ch_versions.mix( GENERATE_CRAM_CSV.out.versions )
+
+    //
+    // SUBWORKFLOW: mapping hic reads using minimap2 or bwamem2
+    //
+    if (params.short_aligner.startsWith("minimap")) {
+        MINIMAP2_MAPREDUCE (
+            fasta,
+            GENERATE_CRAM_CSV.out.csv
+        )
+        ch_versions         = ch_versions.mix( MINIMAP2_MAPREDUCE.out.versions )
+        ch_merged_bam           = ch_merged_bam.mix(MINIMAP2_MAPREDUCE.out.mergedbam)
+    } else {
+        BWAMEM2_MAPREDUCE (
+            fasta,
+            GENERATE_CRAM_CSV.out.csv,
+            index
+        )
+        ch_versions         = ch_versions.mix( BWAMEM2_MAPREDUCE.out.versions )
+        ch_merged_bam           = ch_merged_bam.mix(BWAMEM2_MAPREDUCE.out.mergedbam)
+    }
 
-    // Align Fastq to Genome and output sorted BAM
-    BWAMEM2_MEM ( ch_reads_fastq, index, fasta, true )
-    ch_versions = ch_versions.mix ( BWAMEM2_MEM.out.versions.first() )
+    ch_merged_bam
+    | combine( ch_reads_cram_crai )
+    | map { meta_bam, bam, meta_cram, cram, crai -> [ meta_cram, bam ] }
+    | set { ch_merged_bam }
 
 
-    // Collect all BWAMEM2 output by sample name
-    BWAMEM2_MEM.out.bam
+    // Collect all BAM output by sample name
+    ch_merged_bam
     | map { meta, bam -> [['id': meta.id.split('_')[0..-2].join('_'), 'datatype': meta.datatype], meta.read_count, bam] }
     | groupTuple( by: [0] )
     | map { meta, read_counts, bams -> [meta + [read_count: read_counts.sum()], bams] }
@@ -58,7 +100,7 @@ workflow ALIGN_SHORT {
 
     // Merge, but only if there is more than 1 file
     SAMTOOLS_MERGE ( ch_bams.multi_bams, [ [], [] ], [ [], [] ] )
-    ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions.first() )
+    ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions )
 
 
     SAMTOOLS_MERGE.out.bam
diff --git a/subworkflows/local/align_short_hic.nf b/subworkflows/local/align_short_hic.nf
deleted file mode 100644
index 92c2010..0000000
--- a/subworkflows/local/align_short_hic.nf
+++ /dev/null
@@ -1,116 +0,0 @@
-//
-// Align short read (HiC and Illumina) data against the genome
-//
-
-include { SAMTOOLS_FASTQ                        } from '../../modules/nf-core/samtools/fastq/main'
-include { SAMTOOLS_MERGE                        } from '../../modules/nf-core/samtools/merge/main'
-include { SAMTOOLS_SORMADUP                     } from '../../modules/local/samtools_sormadup'
-include { SAMTOOLS_INDEX                        } from '../../modules/nf-core/samtools/index/main'
-include { GENERATE_CRAM_CSV                     } from '../../modules/local/generate_cram_csv'
-include { SAMTOOLS_SORMADUP as CONVERT_CRAM     } from '../../modules/local/samtools_sormadup'
-include { MINIMAP2_MAPREDUCE as HIC_MINIMAP2    } from '../../subworkflows/local/minimap2_mapreduce'
-include { BWAMEM2_MAPREDUCE as HIC_BWAMEM2      } from '../../subworkflows/local/bwamem2_mapreduce'
-
-workflow ALIGN_SHORT_HIC {
-    take:
-    fasta    // channel: [ val(meta), /path/to/fasta ] reference_tuple
-    index    // channel: [ val(meta), /path/to/bwamem2/ ]
-    reads    // channel: [ val(meta), /path/to/datafile ] hic_reads_path
-
-
-    main:
-    ch_versions = Channel.empty()
-    ch_merged_bam   = Channel.empty()
-
-    // Check file types and branch
-    reads
-    | branch {
-        meta, reads ->
-            fastq : reads.findAll { it.getName().toLowerCase() =~ /.*f.*\.gz/ }
-            cram : true
-    }
-    | set { ch_reads }
-
-
-    // Convert FASTQ to CRAM only if FASTQ were provided as input
-    CONVERT_CRAM ( ch_reads.fastq, fasta )
-    ch_versions = ch_versions.mix (    CONVERT_CRAM.out.versions )
-
-    CONVERT_CRAM.out.bam
-    | mix ( ch_reads.cram )
-    | set { ch_reads_cram }
-
-
-    // Index the CRAM file
-    SAMTOOLS_INDEX ( ch_reads_cram )
-    ch_versions = ch_versions.mix( SAMTOOLS_INDEX.out.versions )
-
-    ch_reads_cram
-    | join ( SAMTOOLS_INDEX.out.crai )
-    | set { ch_reads_cram_crai }
-
-
-    //
-    // MODULE: generate a CRAM CSV file containing the required parametres for CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT
-    //
-    GENERATE_CRAM_CSV( ch_reads_cram_crai )
-    ch_versions = ch_versions.mix( GENERATE_CRAM_CSV.out.versions )
-
-
-    //
-    // SUBWORKFLOW: mapping hic reads using minimap2 or bwamem2
-    //
-    if (params.short_aligner.startsWith("minimap")) {
-        HIC_MINIMAP2 (
-            fasta,
-            GENERATE_CRAM_CSV.out.csv
-        )
-        ch_versions         = ch_versions.mix( HIC_MINIMAP2.out.versions )
-        ch_merged_bam           = ch_merged_bam.mix(HIC_MINIMAP2.out.mergedbam)
-    } else {
-        HIC_BWAMEM2 (
-            fasta,
-            GENERATE_CRAM_CSV.out.csv,
-            index
-        )
-        ch_versions         = ch_versions.mix( HIC_BWAMEM2.out.versions )
-        ch_merged_bam           = ch_merged_bam.mix(HIC_BWAMEM2.out.mergedbam)
-    }
-
-    ch_merged_bam
-    | combine( ch_reads_cram_crai )
-    | map { meta_bam, bam, meta_cram, cram, crai -> [ meta_cram, bam ] }
-    | set { ch_merged_bam }
-
-
-    // Collect all BAM output by sample name
-    ch_merged_bam
-    | map { meta, bam -> [['id': meta.id.split('_')[0..-2].join('_'), 'datatype': meta.datatype], meta.read_count, bam] }
-    | groupTuple( by: [0] )
-    | map { meta, read_counts, bams -> [meta + [read_count: read_counts.sum()], bams] }
-    | branch {
-        meta, bams ->
-            single_bam: bams.size() == 1
-            multi_bams: true
-    }
-    | set { ch_bams }
-
-
-    // Merge, but only if there is more than 1 file
-    SAMTOOLS_MERGE ( ch_bams.multi_bams, [ [], [] ], [ [], [] ] )
-    ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions.first() )
-
-
-    SAMTOOLS_MERGE.out.bam
-    | mix ( ch_bams.single_bam )
-    | set { ch_bam }
-
-
-    // Mark duplicates
-    SAMTOOLS_SORMADUP ( ch_bam, fasta )
-    ch_versions = ch_versions.mix ( SAMTOOLS_SORMADUP.out.versions )
-
-    emit:
-    bam      = SAMTOOLS_SORMADUP.out.bam     // channel: [ val(meta), /path/to/bam ]
-    versions = ch_versions                   // channel: [ versions.yml ]
-}
diff --git a/subworkflows/local/bwamem2_mapreduce.nf b/subworkflows/local/bwamem2_mapreduce.nf
index ceaff1b..13711fb 100644
--- a/subworkflows/local/bwamem2_mapreduce.nf
+++ b/subworkflows/local/bwamem2_mapreduce.nf
@@ -36,8 +36,9 @@ workflow BWAMEM2_MAPREDUCE {
     }
     .set { ch_filtering_input }
 
+
     //
-    // MODULE: map hic reads by 10,000 container per time using bwamem2
+    // MODULE: map hic reads in each chunk using bwamem2
     //
     CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT (
         ch_filtering_input,
diff --git a/subworkflows/local/prepare_genome.nf b/subworkflows/local/prepare_genome.nf
index aa951b6..f7a4754 100644
--- a/subworkflows/local/prepare_genome.nf
+++ b/subworkflows/local/prepare_genome.nf
@@ -30,7 +30,7 @@ workflow PREPARE_GENOME {
 
     // Unmask genome fasta
     UNMASK ( ch_fasta )
-    ch_versions = ch_versions.mix ( UNMASK.out.versions.first() )
+    ch_versions = ch_versions.mix ( UNMASK.out.versions )
 
     // Generate BWA index
     if ( checkShortReads( params.input ) ) {
@@ -42,14 +42,14 @@ workflow PREPARE_GENOME {
 
             if ( params.bwamem2_index.endsWith('.tar.gz') ) {
                 ch_bwamem2_index = UNTAR ( ch_bwamem ).untar
-                ch_versions      = ch_versions.mix ( UNTAR.out.versions.first() )
+                ch_versions      = ch_versions.mix ( UNTAR.out.versions )
             } else {
                 ch_bwamem2_index = ch_bwamem
             }
 
         } else {
             ch_bwamem2_index = BWAMEM2_INDEX ( UNMASK.out.fasta ).index
-            ch_versions      = ch_versions.mix ( BWAMEM2_INDEX.out.versions.first() )
+            ch_versions      = ch_versions.mix ( BWAMEM2_INDEX.out.versions )
         }
     } else {
         ch_bwamem2_index = Channel.empty()
diff --git a/workflows/readmapping.nf b/workflows/readmapping.nf
index a5e9575..a71c77e 100644
--- a/workflows/readmapping.nf
+++ b/workflows/readmapping.nf
@@ -18,7 +18,7 @@ include { SAMTOOLS_REHEADER           } from '../modules/local/samtools_replaceh
 
 include { INPUT_CHECK                   } from '../subworkflows/local/input_check'
 include { PREPARE_GENOME                } from '../subworkflows/local/prepare_genome'
-include { ALIGN_SHORT_HIC as ALIGN_HIC  } from '../subworkflows/local/align_short_hic'
+include { ALIGN_SHORT as ALIGN_HIC      } from '../subworkflows/local/align_short'
 include { ALIGN_SHORT as ALIGN_ILLUMINA } from '../subworkflows/local/align_short'
 include { ALIGN_PACBIO as ALIGN_HIFI    } from '../subworkflows/local/align_pacbio'
 include { ALIGN_PACBIO as ALIGN_CLR     } from '../subworkflows/local/align_pacbio'