review comments

sanger-tol · Sep 27, 2023 · 021565a · 021565a
1 parent f09fa15
commit 021565a
Show file tree

Hide file tree

Showing 12 changed files with 80 additions and 55 deletions.
diff --git a/README.md b/README.md
@@ -60,7 +60,7 @@ mMelMel1,illumina,GCA_922984935.2.illumina.mMelMel1.cram
 mMelMel3,ont,GCA_922984935.2.ont.mMelMel3.cram
 ```
 
-Each row represents an aligned file. Rows with the same sample identifier are considered technical replicates. The datatype refers to the sequencing technology used to generate the underlying raw data and follows a controlled vocabulary (ont, hic, pacbio, illumina). The aligned read files can be generated using the [sanger-tol/readmapping](https://github.com/sanger-tol/readmapping) pipeline.
+Each row represents an aligned file. Rows with the same sample identifier are considered technical replicates. The datatype refers to the sequencing technology used to generate the underlying raw data and follows a controlled vocabulary (ont, hic, pacbio, pacbio_clr, illumina). The aligned read files can be generated using the [sanger-tol/readmapping](https://github.com/sanger-tol/readmapping) pipeline.
 
 Now, you can run the pipeline using:
 

diff --git a/assets/schema_input.json b/assets/schema_input.json
@@ -16,7 +16,7 @@
             "datatype": {
                 "type": "string",
                 "pattern": "^\\S+$",
-                "enum": ["hic", "illumina", "ont", "pacbio"],
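+                "enum": ["hic", "illumina", "ont", "pacbio", "pacbio_clr"],
-                "errorMessage": "Data type, and must be one of: 'hic' or 'illumina' or 'ont' or 'pacbio'"
+                "errorMessage": "Data type, and must be one of: 'hic' or 'illumina' or 'ont' or 'pacbio' or 'pacbio_clr'"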
             },
             "datafile": {

diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py
@@ -33,6 +33,7 @@ class RowChecker:
         "hic",
         "illumina",
         "pacbio",
+        "pacbio_clr",
         "ont",
     )
 

diff --git a/conf/modules.config b/conf/modules.config
@@ -45,7 +45,7 @@ process {
     }
 
     withName: "GOAT_TAXONSEARCH" {
-        ext.args = "-l -b"
+        ext.args = "--lineage --busco"
     }
 
     withName: "SAMTOOLS_VIEW" {

diff --git a/conf/test.config b/conf/test.config
@@ -12,7 +12,7 @@
 
 params {
     config_profile_name        = 'Test profile'
-    config_profile_description = 'Minimal test dataset to check pipeline function'
+    config_profile_description = 'Minimal aligned test dataset to check pipeline function'
 
     // Limit resources so that this can run on GitHub Actions
     max_cpus   = 2

diff --git a/conf/test_full.config b/conf/test_full.config
@@ -22,7 +22,7 @@ params {
     input     = "${projectDir}/assets/test_full/full_samplesheet.csv"
 
     // Fasta references
-    fasta     = "/lustre/scratch123/tol/resources/nextflow/test-data/Laetiporus_sulphureus/assembly/release/gfLaeSulp1.1/insdc/GCA_927399515.1.fasta.gz"
+    fasta     = "https://tolit.cog.sanger.ac.uk/test-data/Laetiporus_sulphureus/assembly/release/gfLaeSulp1.1/insdc/GCA_927399515.1.fasta.gz"
     accession = "GCA_927399515.1"
     taxon     = "Laetiporus sulphureus"
 

diff --git a/conf/test_raw.config b/conf/test_raw.config
@@ -11,8 +11,8 @@
 */
 
 params {
-    config_profile_name        = 'Test profile'
-    config_profile_description = 'Minimal test dataset to check pipeline function'
+    config_profile_name        = 'Raw test profile'
+    config_profile_description = 'Minimal raw test dataset to check pipeline function'
 
     // Limit resources so that this can run on GitHub Actions
     max_cpus   = 2
@@ -23,14 +23,15 @@ params {
     // Specify the paths to your test data
     // Give any required params for the test so that command line flags are not needed
     input     = "${projectDir}/assets/test/samplesheet_raw.csv"
+    align     = true
 
     // Fasta references
-    fasta     = "/lustre/scratch123/tol/resources/nextflow/test-data/Meles_meles/assembly/release/mMelMel3.1_paternal_haplotype/GCA_922984935.2.subset.fasta.gz"
+    fasta     = "https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/assembly/release/mMelMel3.1_paternal_haplotype/GCA_922984935.2.subset.fasta.gz"
     accession = "GCA_922984935.2"
     taxon     = "Meles meles"
 
     // Databases
     taxdump   = "/lustre/scratch123/tol/teams/grit/geval_pipeline/btk_databases/taxdump"
-    busco     = "/lustre/scratch123/tol/resources/nextflow/busco_2021_06_reduced/"
-    uniprot   = "${projectDir}/assets/test/mCerEla1.1.buscogenes.dmnd"
+    busco     = "/lustre/scratch123/tol/resources/nextflow/busco/blobtoolkit.GCA_922984935.2.2023-08-03"
+    uniprot   = "https://tolit.cog.sanger.ac.uk/test-data/resources/diamond/mCerEla1.1.buscogenes.dmnd"
 }
diff --git a/docs/usage.md b/docs/usage.md
@@ -43,7 +43,7 @@ sample3,ont,ont.cram
 | Column     | Description                                                                                                                                                                           |
 | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `sample`   | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (\_). |
-| `datatype` | Type of sequencing data. Must be one of `hic`, `illumina`, `pacbio`, or `ont`.                                                                                                        |
+| `datatype` | Type of sequencing data. Must be one of `hic`, `illumina`, `pacbio`, `pacbio_clr`, or `ont`.                                                                                          |
 | `datafile` | Full path to read data file.                                                                                                                                                          |
 
 An [example samplesheet](https://raw.githubusercontent.com/sanger-tol/blobtoolkit/main/assets/test/samplesheet.csv) has been provided with the pipeline.

diff --git a/modules.json b/modules.json
@@ -8,63 +8,87 @@
                     "busco": {
                         "branch": "master",
                         "git_sha": "6d6552cb582f56b6101c452e16ee7c23073f91de",
-                        "installed_by": ["modules"],
+                        "installed_by": [
+                            "modules"
+                        ],
                         "patch": "modules/nf-core/busco/busco.diff"
                     },
                     "custom/dumpsoftwareversions": {
                         "branch": "master",
                         "git_sha": "05c280924b6c768d484c7c443dad5e605c4ff4b4",
-                        "installed_by": ["modules"]
+                        "installed_by": [
+                            "modules"
+                        ]
                     },
                     "diamond/blastp": {
                         "branch": "master",
                         "git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
-                        "installed_by": ["modules"]
+                        "installed_by": [
+                            "modules"
+                        ]
                     },
                     "fastawindows": {
                         "branch": "master",
                         "git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
-                        "installed_by": ["modules"]
+                        "installed_by": [
+                            "modules"
+                        ]
                     },
                     "goat/taxonsearch": {
                         "branch": "master",
                         "git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
-                        "installed_by": ["modules"]
+                        "installed_by": [
+                            "modules"
+                        ]
                     },
                     "gunzip": {
                         "branch": "master",
                         "git_sha": "e06548bfa36ee31869b81041879dd6b3a83b1d57",
-                        "installed_by": ["modules"]
+                        "installed_by": [
+                            "modules"
+                        ]
                     },
                     "minimap2/align": {
                         "branch": "master",
                         "git_sha": "603ecbd9f45300c9788f197d2a15a005685b4220",
-                        "installed_by": ["modules"]
+                        "installed_by": [
+                            "modules"
+                        ]
                     },
                     "mosdepth": {
                         "branch": "master",
                         "git_sha": "ebb27711cd5f4de921244bfa81c676504072d31c",
-                        "installed_by": ["modules"]
+                        "installed_by": [
+                            "modules"
+                        ]
                     },
                     "multiqc": {
                         "branch": "master",
                         "git_sha": "a6e11ac655e744f7ebc724be669dd568ffdc0e80",
-                        "installed_by": ["modules"]
+                        "installed_by": [
+                            "modules"
+                        ]
                     },
                     "samtools/fasta": {
                         "branch": "master",
                         "git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
-                        "installed_by": ["modules"]
+                        "installed_by": [
+                            "modules"
+                        ]
                     },
                     "samtools/index": {
                         "branch": "master",
                         "git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
-                        "installed_by": ["modules"]
+                        "installed_by": [
+                            "modules"
+                        ]
                     },
                     "samtools/view": {
                         "branch": "master",
                         "git_sha": "3ffae3598260a99e8db3207dead9f73f87f90d1f",
-                        "installed_by": ["modules"]
+                        "installed_by": [
+                            "modules"
+                        ]
                     }
                 }
             },
@@ -73,4 +97,4 @@
             }
         }
     }
-}
+}
diff --git a/subworkflows/local/coverage_stats.nf b/subworkflows/local/coverage_stats.nf
@@ -2,15 +2,16 @@
 // Calculate genome coverage and statistics
 //
 
-include { SAMTOOLS_VIEW } from '../../modules/nf-core/samtools/view/main'
-include { MOSDEPTH      } from '../../modules/nf-core/mosdepth/main'
-include { FASTAWINDOWS  } from '../../modules/nf-core/fastawindows/main'
-include { CREATE_BED    } from '../../modules/local/create_bed'
+include { SAMTOOLS_VIEW  } from '../../modules/nf-core/samtools/view/main'
+include { SAMTOOLS_INDEX } from '../../modules/nf-core/samtools/index/main'
+include { MOSDEPTH       } from '../../modules/nf-core/mosdepth/main'
+include { FASTAWINDOWS   } from '../../modules/nf-core/fastawindows/main'
+include { CREATE_BED     } from '../../modules/local/create_bed'
 
 
 workflow COVERAGE_STATS {
     take: 
-    input    // channel: [ val(meta), path(aligned) or path(aligned), path(index) ] 
+    input    // channel: [ val(meta), path(aln) ] 
     fasta    // channel: [ val(meta), path(fasta) ]
 
 
@@ -19,24 +20,32 @@ workflow COVERAGE_STATS {
 
 
     // Create aligned BAM and index CSI channel
-    if (params.align) {
-
-        ch_bam_csi = input
+    input
+    | branch { meta, aln ->
+        bam : aln.toString().endsWith("bam") == true
+            return [ meta, aln ]
+        cram : aln.toString().endsWith("cram") == true
+            return [ meta, aln, [] ]
+    }
+    | set { ch_aln_idx}
 
-    } else {
+    SAMTOOLS_VIEW ( ch_aln_idx.cram, fasta, [] )
+    ch_versions = ch_versions.mix ( SAMTOOLS_VIEW.out.versions.first() )
 
-        input
-        | map { meta, cram -> [ meta, cram, [] ] }
-        | set { ch_cram_crai}
+    SAMTOOLS_VIEW.out.bam
+    | join ( SAMTOOLS_VIEW.out.csi )
+    | set { ch_view }
 
-        SAMTOOLS_VIEW ( ch_cram_crai, fasta, [] )
-        ch_versions = ch_versions.mix ( SAMTOOLS_VIEW.out.versions.first() )
+    SAMTOOLS_INDEX ( ch_aln_idx.bam )
+    ch_versions = ch_versions.mix ( SAMTOOLS_INDEX.out.versions.first() )
 
-        SAMTOOLS_VIEW.out.bam
-        | join ( SAMTOOLS_VIEW.out.csi )
-        | set { ch_bam_csi }
+    ch_aln_idx.bam
+    | join ( SAMTOOLS_INDEX.out.csi )
+    | set { ch_index }
 
-    }
+    ch_view
+    | mix ( ch_index )
+    | set { ch_bam_csi }
 
 
     // Calculate genome statistics

diff --git a/subworkflows/local/minimap_alignment.nf b/subworkflows/local/minimap_alignment.nf
@@ -8,7 +8,6 @@ include { MINIMAP2_ALIGN as MINIMAP2_ILMN } from '../../modules/nf-core/minimap2
 include { MINIMAP2_ALIGN as MINIMAP2_CCS  } from '../../modules/nf-core/minimap2/align/main'
 include { MINIMAP2_ALIGN as MINIMAP2_CLR  } from '../../modules/nf-core/minimap2/align/main'
 include { MINIMAP2_ALIGN as MINIMAP2_ONT  } from '../../modules/nf-core/minimap2/align/main'
-include { SAMTOOLS_INDEX                  } from '../../modules/nf-core/samtools/index/main'
 
 
 workflow MINIMAP2_ALIGNMENT {
@@ -60,7 +59,7 @@ workflow MINIMAP2_ALIGNMENT {
     ch_versions = ch_versions.mix(MINIMAP2_ONT.out.versions.first())
 
 
-    // Index aligned reads
+    // Combine aligned reads
     Channel.empty()
     | mix ( MINIMAP2_HIC.out.bam )
     | mix ( MINIMAP2_ILMN.out.bam )
@@ -69,17 +68,8 @@ workflow MINIMAP2_ALIGNMENT {
     | mix ( MINIMAP2_ONT.out.bam )
     | set { ch_aligned }
 
-    SAMTOOLS_INDEX ( ch_aligned )
-    ch_versions = ch_versions.mix(SAMTOOLS_INDEX.out.versions.first())
-
-
-    // Combine aligned reads and indices
-    ch_aligned
-    | join ( SAMTOOLS_INDEX.out.csi )
-    | set { bam_csi }
-
 
     emit:
-    bam_csi                      // channel: [ val(meta), bam, csi ]
+    aln      = ch_aligned        // channel: [ val(meta), bam ]
     versions = ch_versions       // channel: [ versions.yml ]
 }
diff --git a/workflows/blobtoolkit.nf b/workflows/blobtoolkit.nf
@@ -106,7 +106,7 @@ workflow BLOBTOOLKIT {
     if ( params.align ) {
         MINIMAP2_ALIGNMENT ( INPUT_CHECK.out.aln, ch_genome )
         ch_versions = ch_versions.mix ( MINIMAP2_ALIGNMENT.out.versions )
-        ch_aligned = MINIMAP2_ALIGNMENT.out.bam_csi
+        ch_aligned = MINIMAP2_ALIGNMENT.out.aln
     } else {
         ch_aligned = INPUT_CHECK.out.aln
     }
-Original file line number
+Diff line change
@@ Expand Up / @@ -33,6 +33,7 @@ class RowChecker: @@
             "hic",
             "illumina",
             "pacbio",
+            "pacbio_clr",
             "ont",
         )
@@ Expand Down @@