From b16c1130f2e75549bd686989b346653cd9bd3308 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Wed, 19 Oct 2022 01:50:26 +0100 Subject: [PATCH 01/72] Version bump on dev --- nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index 6ea7bca..e1b49a3 100644 --- a/nextflow.config +++ b/nextflow.config @@ -173,7 +173,7 @@ manifest { description = 'Pipeline that downloads repeats annotations from Ensembl into the Tree of Life directory structure' mainScript = 'main.nf' nextflowVersion = '!>=22.04.0' - version = '1.0.0' + version = '1.1dev' } // Load modules.config for DSL2 module specific options From d64deb430cd090741772f8b222b010b840c7fd41 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Tue, 20 Dec 2022 10:43:26 +0000 Subject: [PATCH 02/72] Added support for the annotation method --- assets/samplesheet.csv | 10 ++-- assets/schema_input.json | 9 +++- bin/check_samplesheet.py | 17 ++++++- conf/test.config | 3 +- docs/usage.md | 61 +++++++++++------------- lib/WorkflowEnsemblrepeatdownload.groovy | 4 +- nextflow.config | 1 + nextflow_schema.json | 8 +++- subworkflows/local/download.nf | 7 +-- subworkflows/local/params_check.nf | 5 +- workflows/ensemblrepeatdownload.nf | 1 + 11 files changed, 74 insertions(+), 52 deletions(-) diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv index a0049cd..c902d01 100644 --- a/assets/samplesheet.csv +++ b/assets/samplesheet.csv @@ -1,5 +1,5 @@ -species_dir,assembly_name,assembly_accession,ensembl_species_name -25g/data/echinoderms/Asterias_rubens,eAstRub1.3,GCA_902459465.3,Asterias_rubens -25g/data/insects/Osmia_bicornis,iOsmBic2.1,GCA_907164935.1,Osmia_bicornis_bicornis -25g/data/insects/Osmia_bicornis,iOsmBic2.1_alternate_haplotype,GCA_907164925.1,Osmia_bicornis_bicornis -darwin/data/insects/Noctua_fimbriata,ilNocFimb1.1,GCA_905163415.1,Noctua_fimbriata +species_dir,assembly_name,assembly_accession,ensembl_species_name,annotation_method 
+25g/data/echinoderms/Asterias_rubens,eAstRub1.3,GCA_902459465.3,Asterias_rubens,refseq +25g/data/insects/Osmia_bicornis,iOsmBic2.1,GCA_907164935.1,Osmia_bicornis_bicornis,ensembl +25g/data/insects/Osmia_bicornis,iOsmBic2.1_alternate_haplotype,GCA_907164925.1,Osmia_bicornis_bicornis,ensembl +darwin/data/insects/Noctua_fimbriata,ilNocFimb1.1,GCA_905163415.1,Noctua_fimbriata,braker diff --git a/assets/schema_input.json b/assets/schema_input.json index e2302e3..05ce4ca 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -25,9 +25,14 @@ "ensembl_species_name": { "type": "string", "pattern": "^\\S+$", - "errorMessage": "Name of the species, as used in the Ensembl FTP" + "errorMessage": "The (Ensembl) species name must be provided and cannot contain spaces" + }, + "annotation_method": { + "type": "string", + "pattern": "^\\S+$", + "errorMessage": "The annotation method must be provided and cannot contain spaces" } }, - "required": ["species_dir", "assembly_name", "ensembl_species_name"] + "required": ["species_dir", "assembly_name", "ensembl_species_name", "annotation_method"] } } diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py index e123e02..1ea1063 100755 --- a/bin/check_samplesheet.py +++ b/bin/check_samplesheet.py @@ -32,6 +32,7 @@ def __init__( name_col="assembly_name", accession_col="assembly_accession", ensembl_name_col="ensembl_species_name", + method_col="annotation_method", **kwargs, ): """ @@ -46,6 +47,8 @@ def __init__( number (default "assembly_accession"). ensembl_name_col(str): The name of the column that contains the Ensembl species name (default "ensembl_species_name"). + annotation_method (str): The name of the column that contains the annotation method + (default "annotation_method"). 
""" super().__init__(**kwargs) @@ -53,6 +56,7 @@ def __init__( self._name_col = name_col self._accession_col = accession_col self._ensembl_name_col = ensembl_name_col + self._method_col = method_col self._seen = set() self.modified = [] self._regex_accession = re.compile(r"^GCA_[0-9]{9}\.[0-9]+$") @@ -70,6 +74,7 @@ def validate_and_transform(self, row): self._validate_name(row) self._validate_accession(row) self._validate_ensembl_name(row) + self._validate_method(row) self._seen.add(row[self._name_col]) self.modified.append(row) @@ -103,6 +108,13 @@ def _validate_ensembl_name(self, row): if " " in row[self._ensembl_name_col]: raise AssertionError("Ensembl names must not contain whitespace.") + def _validate_method(self, row): + """Assert that the annotation method is non-empty and has no space.""" + if not row[self._method_col]: + raise AssertionError("Annotation method is required.") + if " " in row[self._method_col]: + raise AssertionError("Annotation methods must not contain whitespace.") + def validate_unique_assemblies(self): """ Assert that the assembly parameters are unique. @@ -163,14 +175,15 @@ def check_samplesheet(file_in, file_out): Example: This function checks that the samplesheet follows the following structure:: - species_dir,assembly_name,ensembl_species_name - 25g/data/echinoderms/Asterias_rubens,eAstRub1.3,Asterias_rubens + species_dir,assembly_name,ensembl_species_name,annotation_method + 25g/data/echinoderms/Asterias_rubens,eAstRub1.3,Asterias_rubens,ensembl """ required_columns = { "species_dir", "assembly_name", "ensembl_species_name", + "annotation_method", } # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`. 
with file_in.open(newline="") as in_handle: diff --git a/conf/test.config b/conf/test.config index 97382d4..777e1e2 100644 --- a/conf/test.config +++ b/conf/test.config @@ -20,5 +20,6 @@ params { max_time = '6.h' ensembl_species_name = "Osmia_bicornis_bicornis" - assembly_accession = "GCA_907164935.1" + assembly_accession = "GCA_907164925.1" + annotation_method = "ensembl" } diff --git a/docs/usage.md b/docs/usage.md index 7797b59..be9553d 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -2,60 +2,54 @@ ## Introduction -The pipeline downloads Enembl gene and/or repeat annotations for one of multiple assemblied. +The pipeline downloads Enembl repeat annotations for one of multiple assemblies. It also builds a set of common indices (such as `samtools faidx`, `tabix`). ## One-off downloads The pipeline accepts command-one line arguments to specify a single genome to download: -- `--ensembl_species_name` (mandatory): How Ensembl name the species (as it can be different from Tree of Life), -- `--assembly_accession` (mandatory): The accession number of the assembly, -- `--outdir` (mandatory): Where to download the data. +- `--ensembl_species_name`: How Ensembl name the species (as it can be different from Tree of Life), +- `--assembly_accession`: The accession number of the assembly, +- `--annotation_method`: The annotation method of the geneset related to the repeat annotation (requirement of Ensembl's data-model), +- `--outdir`: Where to download the data. ```console -nextflow run sanger-tol/ensemblrepeatdownload -profile singularity --ensembl_species_name Noctua_fimbriata --assembly_accession GCA_905163415.1 --outdir ens1 +nextflow run sanger-tol/ensemblrepeatdownload -profile singularity --ensembl_species_name Noctua_fimbriata --assembly_accession GCA_905163415.1 --annotation_method braker --outdir results ``` -The pipeline downloads the repeat-masked genome to which the annotation is attached. 
- ## Bulk download To download multiple datasets at once, descrbe these in a "samplesheet": a comma-separated files that lists the command-line arguments. - -```bash ---input '[path to samplesheet file]' -``` - -The file must have four columns, although the last one (`geneset_version`) can have empty values, as in the [example samplesheet](../assets/samplesheet.csv) provided with the pipeline and pasted here: +The file must have four columns, but accepts five as in the [example samplesheet](../assets/samplesheet.csv) provided with the pipeline and pasted here: ```console -analysis_dir,ensembl_species_name,assembly_accession,geneset_version -darwin/data/insects/Noctua_fimbriata/analysis/ilNocFimb1.1,Noctua_fimbriata,GCA_905163415.1,2022_03 -25g/data/insects/Osmia_bicornis/analysis/iOsmBic2.1,Osmia_bicornis_bicornis,GCA_907164935.1, -25g/data/insects/Osmia_bicornis/analysis/iOsmBic2.1,Osmia_bicornis_bicornis,GCA_907164935.1,2021_11 -25g/data/insects/Osmia_bicornis/analysis/iOsmBic2.1_alternate_haplotype,Osmia_bicornis_bicornis,GCA_907164925.1,2022_02 -25g/data/insects/Osmia_bicornis/analysis/iOsmBic2.1_alternate_haplotype,Osmia_bicornis_bicornis,GCA_907164925.1, -25g/data/echinoderms/Asterias_rubens/analysis/eAstRub1.3,Asterias_rubens,GCA_902459465.3, -25g/data/echinoderms/Asterias_rubens/analysis/eAstRub1.3,Asterias_rubens,GCA_902459465.3,2020_11 -25g/data/echinoderms/Asterias_rubens/analysis/eAstRub1.3,Asterias_rubens,GCA_902459465.3,2022_03 +species_dir,assembly_name,assembly_accession,ensembl_species_name,annotation_method +25g/data/echinoderms/Asterias_rubens,eAstRub1.3,GCA_902459465.3,Asterias_rubens,refseq +25g/data/insects/Osmia_bicornis,iOsmBic2.1,GCA_907164935.1,Osmia_bicornis_bicornis,ensembl +25g/data/insects/Osmia_bicornis,iOsmBic2.1_alternate_haplotype,GCA_907164925.1,Osmia_bicornis_bicornis,ensembl +darwin/data/insects/Noctua_fimbriata,ilNocFimb1.1,GCA_905163415.1,Noctua_fimbriata,braker ``` -| Column | Description | -| ---------------------- | 
------------------------------------------------------------------------------------------------------------- | -| `analysis_dir` | Output analysis directory for this assembly. Must be a relative path, which will be evaluated from `--outdir` | -| `ensembl_species_name` | Name of the species, _as used by Ensembl_. Note: it may differ from Tree of Life's | -| `assembly_accession` | Accession number of the assembly to download. Typically of the form `GCA_*.*`. | +| Column | Description | +| ---------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `species_dir` | Output directory for this species (evaluated from the current directory if a relative path). Analysis results are deposited in `analysis/$assembly_name/`. | +| `assembly_name` | Name of the assembly. Used to build the actual output directory. | +| `assembly_accession` | (Optional). Accession number of the assembly to download. Typically of the form `GCA_*.*`. If missing, the pipeline will infer it from the ACCESSION file. | +| `ensembl_species_name` | Name of the species, _as used by Ensembl_. Note: it may differ from Tree of Life's | +| `annotation_method` | Name of the method of the geneset that holds the repeat annotation. | -## Running the pipeline +A samplesheet may only: -The typical command for running the pipeline is as follows: +- multiple datasets of the same species +- only one dataset per assembly +- multiple datasets in the same output directory ```bash -nextflow run sanger-tol/ensemblrepeatdownload --input samplesheet.csv --outdir --genome GRCh37 -profile docker +nextflow run sanger-tol/ensemblrepeatdownload -profile singularity --input samplesheet.csv --outdir results ``` -This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles. 
+## Running the pipeline Note that the pipeline will create the following files in your working directory: @@ -63,7 +57,7 @@ Note that the pipeline will create the following files in your working directory work # Directory containing the nextflow working files # Finished results in specified location (defined with --outdir) .nextflow_log # Log file from Nextflow -# Other nextflow hidden files, eg. history of pipeline runs and old logs. +.nextflow # Directory where Nextflow keeps track of jobs ``` ### Updating the pipeline @@ -116,10 +110,9 @@ If `-profile` is not specified, the pipeline will run locally and expect all sof - `test` - A profile with a minimal configuration for automated testing - Corresponds to defining the assembly to download as command-line parameters so needs no other parameters - - Includes links to test data so needs no other parameters - `test_full` - A profile with a complete configuration for automated testing - - Includes links to test data so needs no other parameters + - Corresponds to defining the assembly to download as a CSV file so needs no other parameters ### `-resume` diff --git a/lib/WorkflowEnsemblrepeatdownload.groovy b/lib/WorkflowEnsemblrepeatdownload.groovy index 77e0dd7..3f88764 100755 --- a/lib/WorkflowEnsemblrepeatdownload.groovy +++ b/lib/WorkflowEnsemblrepeatdownload.groovy @@ -17,8 +17,8 @@ class WorkflowEnsemblrepeatdownload { System.exit(1) } } else { - if (!params.assembly_accession || !params.ensembl_species_name || !params.outdir) { - log.error "Either --input, or --assembly_accession, --assembly_name, and --outdir must be provided" + if (!params.assembly_accession || !params.ensembl_species_name || !params.annotation_method || !params.outdir) { + log.error "Either --input, or --assembly_accession, --assembly_name, --annotation_method, and --outdir must be provided" System.exit(1) } } diff --git a/nextflow.config b/nextflow.config index e1b49a3..e31fb0b 100644 --- a/nextflow.config +++ b/nextflow.config @@ -13,6 
+13,7 @@ params { input = null assembly_accession = null ensembl_species_name = null + annotation_method = null ftp_root = "https://ftp.ensembl.org/pub/rapid-release/species" // Boilerplate options diff --git a/nextflow_schema.json b/nextflow_schema.json index 3c4e832..ef43cce 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -24,6 +24,12 @@ "pattern": "^\\S+$", "fa_icon": "fas fa-italic" }, + "annotation_method": { + "type": "string", + "description": "Method used to annotate the genome. Typically `ensembl`, `braker`, etc.", + "pattern": "^\\S+$", + "fa_icon": "fas fa-book" + }, "outdir": { "type": "string", "format": "directory-path", @@ -37,7 +43,7 @@ "pattern": "^\\S+\\.csv$", "schema": "assets/schema_input.json", "description": "Path to comma-separated file containing information about the assemblies to download. Used for bulk download of many assemblies.", - "help_text": "The file has to be a comma-separated file with three columns, and a header row. The columns names must be `species_dir`, `assembly_name`, and `ensembl_species_name`. An additional `assembly_accession` column can be provided too.", + "help_text": "The file has to be a comma-separated file with five columns, and a header row. The columns names must be `species_dir`, `assembly_name`, `ensembl_species_name`, and `annotation_method`. 
An additional `assembly_accession` column can be provided too.", "fa_icon": "fas fa-file-csv" }, "ftp_root": { diff --git a/subworkflows/local/download.nf b/subworkflows/local/download.nf index bccbc7d..34b410e 100644 --- a/subworkflows/local/download.nf +++ b/subworkflows/local/download.nf @@ -8,7 +8,7 @@ include { ENSEMBL_GENOME_DOWNLOAD } from '../../modules/local/ensembl_geno workflow DOWNLOAD { take: - repeat_params // tuple(analysis_dir, ensembl_species_name, assembly_accession) + repeat_params // tuple(analysis_dir, ensembl_species_name, assembly_accession, annotation_method) main: @@ -19,11 +19,12 @@ workflow DOWNLOAD { // meta [ id: it[2] + ".masked.ensembl", + method: it[3], outdir: it[0], ], - // e.g. https://ftp.ensembl.org/pub/rapid-release/species/Agriopis_aurantiaria/GCA_914767915.1/genome/Agriopis_aurantiaria-GCA_914767915.1-softmasked.fa.gz + // e.g. https://ftp.ensembl.org/pub/rapid-release/species/Agriopis_aurantiaria/GCA_914767915.1/braker/genome/Agriopis_aurantiaria-GCA_914767915.1-softmasked.fa.gz // ftp_path - params.ftp_root + "/" + it[1] + "/" + it[2] + "/genome", + params.ftp_root + "/" + it[1] + "/" + it[2] + "/" + it[3] + "/genome", // remote_filename_stem it[1] + "-" + it[2], ] }, diff --git a/subworkflows/local/params_check.nf b/subworkflows/local/params_check.nf index 97ce891..bb7b769 100644 --- a/subworkflows/local/params_check.nf +++ b/subworkflows/local/params_check.nf @@ -18,7 +18,7 @@ workflow PARAMS_CHECK { if (samplesheet) { SAMPLESHEET_CHECK ( file(samplesheet, checkIfExists: true) ) .csv - // Provides species_dir, assembly_name, assembly_accession (optional), and ensembl_species_name + // Provides species_dir, assembly_name, assembly_accession (optional), ensembl_species_name, and annotation_method .splitCsv ( header:true, sep:',' ) .map { // If assembly_accession is missing, load the accession number from file, following the Tree of Life directory structure @@ -31,6 +31,7 @@ workflow PARAMS_CHECK { 
"${it["species_dir"]}/analysis/${it["assembly_name"]}", it["ensembl_species_name"], it["assembly_accession"], + it["annotation_method"], ] } .set { ch_inputs } @@ -42,7 +43,7 @@ workflow PARAMS_CHECK { } emit: - ensembl_params = ch_inputs // tuple(analysis_dir, ensembl_species_name, assembly_accession) + ensembl_params = ch_inputs // tuple(analysis_dir, ensembl_species_name, assembly_accession, annotation_method) versions = ch_versions // channel: versions.yml } diff --git a/workflows/ensemblrepeatdownload.nf b/workflows/ensemblrepeatdownload.nf index c622917..4eb6047 100644 --- a/workflows/ensemblrepeatdownload.nf +++ b/workflows/ensemblrepeatdownload.nf @@ -51,6 +51,7 @@ workflow ENSEMBLREPEATDOWNLOAD { params.outdir, params.ensembl_species_name, params.assembly_accession, + params.annotation_method, ] ), ) From bb741a472f1bc0ea094d986d272e6a95ef6611e3 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Tue, 20 Dec 2022 11:16:32 +0000 Subject: [PATCH 03/72] Evaluate relative paths from `--outdir` --- docs/usage.md | 2 +- subworkflows/local/params_check.nf | 3 ++- workflows/ensemblrepeatdownload.nf | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index be9553d..e83b4df 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -33,7 +33,7 @@ darwin/data/insects/Noctua_fimbriata,ilNocFimb1.1,GCA_905163415.1,Noctua_fimbria | Column | Description | | ---------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `species_dir` | Output directory for this species (evaluated from the current directory if a relative path). Analysis results are deposited in `analysis/$assembly_name/`. | +| `species_dir` | Output directory for this species (evaluated from `--outdir` if a relative path). Analysis results are deposited in `analysis/$assembly_name/`. | | `assembly_name` | Name of the assembly. 
Used to build the actual output directory. | | `assembly_accession` | (Optional). Accession number of the assembly to download. Typically of the form `GCA_*.*`. If missing, the pipeline will infer it from the ACCESSION file. | | `ensembl_species_name` | Name of the species, _as used by Ensembl_. Note: it may differ from Tree of Life's | diff --git a/subworkflows/local/params_check.nf b/subworkflows/local/params_check.nf index bb7b769..3e6742e 100644 --- a/subworkflows/local/params_check.nf +++ b/subworkflows/local/params_check.nf @@ -9,6 +9,7 @@ workflow PARAMS_CHECK { take: samplesheet // file cli_params // tuple, see below + outdir // file output directory main: @@ -28,7 +29,7 @@ workflow PARAMS_CHECK { } // Convert to tuple, as required by the download subworkflow .map { [ - "${it["species_dir"]}/analysis/${it["assembly_name"]}", + (it["species_dir"].startsWith("/") ? "" : outdir + "/") + "${it["species_dir"]}/analysis/${it["assembly_name"]}", it["ensembl_species_name"], it["assembly_accession"], it["annotation_method"], diff --git a/workflows/ensemblrepeatdownload.nf b/workflows/ensemblrepeatdownload.nf index 4eb6047..b4c3a1b 100644 --- a/workflows/ensemblrepeatdownload.nf +++ b/workflows/ensemblrepeatdownload.nf @@ -48,12 +48,12 @@ workflow ENSEMBLREPEATDOWNLOAD { params.input, Channel.of( [ - params.outdir, params.ensembl_species_name, params.assembly_accession, params.annotation_method, ] ), + params.outdir, ) ch_versions = ch_versions.mix(PARAMS_CHECK.out.versions) From f0de6e0c0fd1488a06e2954b2fd479e7b0eacd38 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Mon, 9 Jan 2023 17:38:59 +0000 Subject: [PATCH 04/72] We currently expect the extra sanger-tol lines in this file --- .nf-core.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.nf-core.yml b/.nf-core.yml index 0969bd3..1a3053f 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -12,6 +12,7 @@ lint: - .github/workflows/awsfulltest.yml files_unchanged: - LICENSE + - .gitattributes - 
.github/CONTRIBUTING.md - .github/ISSUE_TEMPLATE/bug_report.yml - .github/workflows/linting.yml From d74ec900b78f1895927cc0b83540843ac5310e60 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Mon, 9 Jan 2023 17:39:54 +0000 Subject: [PATCH 05/72] Pinning the version of nf-core as long as this is a 2.5 pipeline --- .github/workflows/linting.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 8a5ce69..3a91d8e 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -82,7 +82,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install nf-core + pip install nf-core==2.5.0 - name: Run nf-core lint env: From 4429a4bdd01e8765dff998d79d1f8872679d44a7 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Mon, 9 Jan 2023 19:01:39 +0000 Subject: [PATCH 06/72] --outdir should actually be considered a mandatory parameter --- lib/WorkflowEnsemblrepeatdownload.groovy | 8 ++++++-- nextflow.config | 2 +- nextflow_schema.json | 3 ++- subworkflows/local/params_check.nf | 2 +- 4 files changed, 10 insertions(+), 5 deletions(-) diff --git a/lib/WorkflowEnsemblrepeatdownload.groovy b/lib/WorkflowEnsemblrepeatdownload.groovy index 3f88764..a7d4b98 100755 --- a/lib/WorkflowEnsemblrepeatdownload.groovy +++ b/lib/WorkflowEnsemblrepeatdownload.groovy @@ -17,11 +17,15 @@ class WorkflowEnsemblrepeatdownload { System.exit(1) } } else { - if (!params.assembly_accession || !params.ensembl_species_name || !params.annotation_method || !params.outdir) { - log.error "Either --input, or --assembly_accession, --assembly_name, --annotation_method, and --outdir must be provided" + if (!params.assembly_accession || !params.ensembl_species_name || !params.annotation_method) { + log.error "Either --input, or --assembly_accession, --assembly_name, and --annotation_method must be provided" System.exit(1) } } + if (!params.outdir) { + log.error "--outdir is 
mandatory" + System.exit(1) + } } } diff --git a/nextflow.config b/nextflow.config index e31fb0b..a810d26 100644 --- a/nextflow.config +++ b/nextflow.config @@ -17,7 +17,7 @@ params { ftp_root = "https://ftp.ensembl.org/pub/rapid-release/species" // Boilerplate options - outdir = null + outdir = 'results' tracedir = "${params.outdir}/pipeline_info" publish_dir_mode = 'copy' email = null diff --git a/nextflow_schema.json b/nextflow_schema.json index ef43cce..defd572 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -33,7 +33,8 @@ "outdir": { "type": "string", "format": "directory-path", - "description": "The output directory where the results will be saved. Not considered when running the pipeline with a .csv file as input.", + "description": "The output directory where the results will be saved. Not considered for sample-sheet entries that have an absolute path.", + "default": "results", "fa_icon": "fas fa-folder-open" }, "input": { diff --git a/subworkflows/local/params_check.nf b/subworkflows/local/params_check.nf index 3e6742e..2adb7bd 100644 --- a/subworkflows/local/params_check.nf +++ b/subworkflows/local/params_check.nf @@ -40,7 +40,7 @@ workflow PARAMS_CHECK { } else { // Add the other input channel in, as it's expected to have all the parameters in the right order - ch_inputs = ch_inputs.mix(cli_params) + ch_inputs = ch_inputs.mix(cli_params.map { [outdir] + it } ) } emit: From 9609f434c698508f39ab2b1a7056217758acdd59 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Fri, 13 Jan 2023 18:18:26 +0000 Subject: [PATCH 07/72] Removed extra } --- lib/NfcoreTemplate.groovy | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/NfcoreTemplate.groovy b/lib/NfcoreTemplate.groovy index 78966e9..6a85110 100755 --- a/lib/NfcoreTemplate.groovy +++ b/lib/NfcoreTemplate.groovy @@ -228,7 +228,7 @@ class NfcoreTemplate { ${colors.blue} | (___ __ _ _ __ __ _ ___ _ __ ${colors.reset}______${colors.green}| |${colors.yellow} ___ ${colors.red}| 
|${colors.reset} ${colors.blue} \\___ \\ / _` | '_ \\ / _` |/ _ \\ '__|${colors.reset}______${colors.green}| |${colors.yellow}/ _ \\${colors.red}| |${colors.reset} ${colors.blue} ____) | (_| | | | | (_| | __/ | ${colors.green}| |${colors.yellow} (_) ${colors.red}| |____${colors.reset} - ${colors.blue} |_____/ \\__,_|_| |_|\\__, |\\___|_| ${colors.green}|_|${colors.yellow}\\___/${colors.red}}|______|${colors.reset} + ${colors.blue} |_____/ \\__,_|_| |_|\\__, |\\___|_| ${colors.green}|_|${colors.yellow}\\___/${colors.red}|______|${colors.reset} ${colors.blue} __/ |${colors.reset} ${colors.blue} |___/${colors.reset} ${colors.purple} ${workflow.manifest.name} v${workflow.manifest.version}${colors.reset} From 5fb68bf8f46c9f17ea3ac2c865f614576f5d2570 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Fri, 13 Jan 2023 18:19:13 +0000 Subject: [PATCH 08/72] Converted most "it" iterators to explicitly-named variables Except the iterators that were groovy maps --- subworkflows/local/download.nf | 44 ++++++++++++++++++++++++---------- 1 file changed, 31 insertions(+), 13 deletions(-) diff --git a/subworkflows/local/download.nf b/subworkflows/local/download.nf index 34b410e..0ac37d5 100644 --- a/subworkflows/local/download.nf +++ b/subworkflows/local/download.nf @@ -15,19 +15,37 @@ workflow DOWNLOAD { ch_versions = Channel.empty() ch_genome_fasta = ENSEMBL_GENOME_DOWNLOAD ( - repeat_params.map { [ - // meta - [ - id: it[2] + ".masked.ensembl", - method: it[3], - outdir: it[0], - ], - // e.g. 
https://ftp.ensembl.org/pub/rapid-release/species/Agriopis_aurantiaria/GCA_914767915.1/braker/genome/Agriopis_aurantiaria-GCA_914767915.1-softmasked.fa.gz - // ftp_path - params.ftp_root + "/" + it[1] + "/" + it[2] + "/" + it[3] + "/genome", - // remote_filename_stem - it[1] + "-" + it[2], - ] }, + repeat_params.map { + species_dir, + ensembl_species_name, + assembly_accession, + annotation_method + + -> [ + // meta + [ + id: assembly_accession + ".masked.ensembl", + method: annotation_method, + outdir: species_dir, + ], + + // e.g. https://ftp.ensembl.org/pub/rapid-release/species/Agriopis_aurantiaria/GCA_914767915.1/braker/genome/Agriopis_aurantiaria-GCA_914767915.1-softmasked.fa.gz + // ftp_path + [ + params.ftp_root, + ensembl_species_name, + assembly_accession, + annotation_method, + "genome", + ].join("/"), + + // remote_filename_stem + [ + ensembl_species_name, + assembly_accession, + ].join("-"), + ] + }, ).fasta ch_versions = ch_versions.mix(ENSEMBL_GENOME_DOWNLOAD.out.versions.first()) From e652647dcaf6e07dd5bff3f71262751f27c5c6a8 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Mon, 16 Jan 2023 16:27:25 +0000 Subject: [PATCH 09/72] Updated logo --- lib/NfcoreTemplate.groovy | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/NfcoreTemplate.groovy b/lib/NfcoreTemplate.groovy index 6a85110..835aecc 100755 --- a/lib/NfcoreTemplate.groovy +++ b/lib/NfcoreTemplate.groovy @@ -223,12 +223,12 @@ class NfcoreTemplate { String.format( """\n ${dashedLine(monochrome_logs)} - ${colors.blue} _____ ${colors.green} _______ ${colors.red} _${colors.reset} - ${colors.blue} / ____| ${colors.green}|__ __| ${colors.red}| |${colors.reset} - ${colors.blue} | (___ __ _ _ __ __ _ ___ _ __ ${colors.reset}______${colors.green}| |${colors.yellow} ___ ${colors.red}| |${colors.reset} - ${colors.blue} \\___ \\ / _` | '_ \\ / _` |/ _ \\ '__|${colors.reset}______${colors.green}| |${colors.yellow}/ _ \\${colors.red}| |${colors.reset} - ${colors.blue} 
____) | (_| | | | | (_| | __/ | ${colors.green}| |${colors.yellow} (_) ${colors.red}| |____${colors.reset} - ${colors.blue} |_____/ \\__,_|_| |_|\\__, |\\___|_| ${colors.green}|_|${colors.yellow}\\___/${colors.red}|______|${colors.reset} + ${colors.blue} _____ ${colors.green} _______ ${colors.red} _${colors.reset} + ${colors.blue} / ____| ${colors.green}|__ __| ${colors.red}| |${colors.reset} + ${colors.blue} | (___ __ _ _ __ __ _ ___ _ __ ${colors.reset} ___ ${colors.green}| |${colors.yellow} ___ ${colors.red}| |${colors.reset} + ${colors.blue} \\___ \\ / _` | '_ \\ / _` |/ _ \\ '__|${colors.reset}|___|${colors.green}| |${colors.yellow}/ _ \\${colors.red}| |${colors.reset} + ${colors.blue} ____) | (_| | | | | (_| | __/ | ${colors.green}| |${colors.yellow} (_) ${colors.red}| |____${colors.reset} + ${colors.blue} |_____/ \\__,_|_| |_|\\__, |\\___|_| ${colors.green}|_|${colors.yellow}\\___/${colors.red}|______|${colors.reset} ${colors.blue} __/ |${colors.reset} ${colors.blue} |___/${colors.reset} ${colors.purple} ${workflow.manifest.name} v${workflow.manifest.version}${colors.reset} From e40fd5430c0a53528b465a81d1b59b6734f9e542 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Sat, 14 Jan 2023 16:31:05 +0000 Subject: [PATCH 10/72] Typo: assembly_name is not a recognised command-line parameter --- lib/WorkflowEnsemblrepeatdownload.groovy | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/WorkflowEnsemblrepeatdownload.groovy b/lib/WorkflowEnsemblrepeatdownload.groovy index a7d4b98..b30e8a6 100755 --- a/lib/WorkflowEnsemblrepeatdownload.groovy +++ b/lib/WorkflowEnsemblrepeatdownload.groovy @@ -18,7 +18,7 @@ class WorkflowEnsemblrepeatdownload { } } else { if (!params.assembly_accession || !params.ensembl_species_name || !params.annotation_method) { - log.error "Either --input, or --assembly_accession, --assembly_name, and --annotation_method must be provided" + log.error "Either --input, or --assembly_accession, --ensembl_species_name, and 
--annotation_method must be provided" System.exit(1) } } From 76316f4c3fae073916ee6d1ac83dc295d5508f8f Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Tue, 17 Jan 2023 13:34:34 +0000 Subject: [PATCH 11/72] Added 130 for MEMLIMIT --- conf/base.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/base.config b/conf/base.config index 1ac3b01..2cd9888 100644 --- a/conf/base.config +++ b/conf/base.config @@ -6,7 +6,7 @@ process { - errorStrategy = { task.exitStatus in [143,137,104,134,139] ? 'retry' : 'finish' } + errorStrategy = { task.exitStatus in [130,143,137,104,134,139] ? 'retry' : 'finish' } maxRetries = 2 maxErrors = '-1' From 91e59ade9e71da23319e28be639aa422962b9bff Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Tue, 17 Jan 2023 13:35:40 +0000 Subject: [PATCH 12/72] Don't complain about missing files on the Ensembl FTP --- conf/base.config | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/conf/base.config b/conf/base.config index 2cd9888..cc5e5a7 100644 --- a/conf/base.config +++ b/conf/base.config @@ -10,6 +10,12 @@ process { maxRetries = 2 maxErrors = '-1' + // Files are often missing and added later by Ensembl. Since the pipeline won't + // create partial disks, no need to complain + withName: 'ENSEMBL_GENOME_DOWNLOAD' { + errorStrategy = { task.exitStatus in [130,143,137,104,134,139] ? 
'retry' : 'ignore' } + } + // Most of the pipeline requires very little resources cpus = 1 // but still gradually increase the resources to allow the pipeline to self-heal From 46ef3e606782afd1eaf6f325ab12abf3cb05457d Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Tue, 17 Jan 2023 13:37:57 +0000 Subject: [PATCH 13/72] Read properties of the fasta file, and augment meta --- subworkflows/sanger-tol/prepare_fasta.nf | 47 +++++++++++++++++++++--- 1 file changed, 42 insertions(+), 5 deletions(-) diff --git a/subworkflows/sanger-tol/prepare_fasta.nf b/subworkflows/sanger-tol/prepare_fasta.nf index c2c576d..13b6859 100644 --- a/subworkflows/sanger-tol/prepare_fasta.nf +++ b/subworkflows/sanger-tol/prepare_fasta.nf @@ -24,16 +24,53 @@ workflow PREPARE_FASTA { ch_samtools_faidx = CUSTOM_GETCHROMSIZES (ch_compressed_fasta).fai ch_versions = ch_versions.mix(CUSTOM_GETCHROMSIZES.out.versions) + // Read the .fai file, extract sequence statistics, and make an extended meta map + sequence_map = ch_samtools_faidx.map { + meta, fai -> [meta, meta + get_sequence_map(fai)] + } + // Update all channels to use the extended meta map + fasta_gz = ch_compressed_fasta.join(sequence_map).map { [it[2], it[1]]} + faidx = ch_samtools_faidx.join(sequence_map).map { [it[2], it[1]]} + gzi = CUSTOM_GETCHROMSIZES.out.gzi.join(sequence_map).map { [it[2], it[1]]} + sizes = CUSTOM_GETCHROMSIZES.out.sizes.join(sequence_map).map { [it[2], it[1]]} + expanded_fasta = fasta.join(sequence_map).map { [it[2], it[1]]} + // Generate Samtools dictionary - ch_samtools_dict = SAMTOOLS_DICT (fasta).dict + ch_samtools_dict = SAMTOOLS_DICT (expanded_fasta).dict ch_versions = ch_versions.mix(SAMTOOLS_DICT.out.versions) emit: - fasta_gz = ch_compressed_fasta // path: genome.fa.gz - faidx = ch_samtools_faidx // path: genome.fa.gz.fai + fasta_gz = fasta_gz // path: genome.fa.gz + faidx = faidx // path: genome.fa.gz.fai dict = ch_samtools_dict // path: genome.fa.dict - gzi = CUSTOM_GETCHROMSIZES.out.gzi // path: 
genome.fa.gz.gzi - sizes = CUSTOM_GETCHROMSIZES.out.sizes // path: genome.fa.gz.sizes + gzi = gzi // path: genome.fa.gz.gzi + sizes = sizes // path: genome.fa.gz.sizes versions = ch_versions.ifEmpty(null) // channel: [ versions.yml ] } + +// Read the .fai file to extract the number of sequences, the maximum and total sequence length +// Inspired from https://github.com/nf-core/rnaseq/blob/3.10.1/lib/WorkflowRnaseq.groovy +def get_sequence_map(fai_file) { + def n_sequences = 0 + def max_length = 0 + def total_length = 0 + fai_file.eachLine { line -> + def lspl = line.split('\t') + def chrom = lspl[0] + def length = lspl[1].toInteger() + n_sequences ++ + total_length += length + if (length > max_length) { + max_length = length + } + } + + def sequence_map = [:] + sequence_map.n_sequences = n_sequences + sequence_map.total_length = total_length + if (n_sequences) { + sequence_map.max_length = max_length + } + return sequence_map +} From 3328f85dfbdfe9d91ff2c6784c6a420841f36075 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Tue, 17 Jan 2023 13:38:28 +0000 Subject: [PATCH 14/72] samtools dict needs an amount of RAM proportional to the length of the longest sequence --- conf/base.config | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/conf/base.config b/conf/base.config index cc5e5a7..8227ea4 100644 --- a/conf/base.config +++ b/conf/base.config @@ -22,10 +22,10 @@ process { memory = { check_max( 50.MB * task.attempt, 'memory' ) } time = { check_max( 30.min * task.attempt, 'time' ) } - // samtools dict takes more memory on larger genomes + // samtools dict loads entire sequences in memory withName: 'SAMTOOLS_DICT' { - // 50 MB per 500 Mbp - memory = { check_max( 50.MB + 50.MB * task.attempt * Math.ceil(fasta.size() / 500000000), 'memory' ) } + // 50 MB per 50 Mbp + memory = { check_max( 50.MB + 50.MB * task.attempt * Math.ceil(meta.max_length / 50000000), 'memory' ) } } withName:CUSTOM_DUMPSOFTWAREVERSIONS { From 
eec91549ea9e3014cbdc781d1f23d4f68ab5a11f Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Tue, 17 Jan 2023 13:40:52 +0000 Subject: [PATCH 15/72] Make prepare_repeats select the right tabix indexing thanks to the new meta keys --- bin/repeats_bed.py | 5 +++-- subworkflows/sanger-tol/prepare_repeats.nf | 13 ++++++++++--- workflows/ensemblrepeatdownload.nf | 2 +- 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/bin/repeats_bed.py b/bin/repeats_bed.py index 71043ea..d4ff4b7 100755 --- a/bin/repeats_bed.py +++ b/bin/repeats_bed.py @@ -2,6 +2,7 @@ # This script was originally conceived by @muffato import argparse +import gzip import sys __doc__ = "This script prints a BED file of the masked regions a fasta file." @@ -10,7 +11,7 @@ def fasta_to_bed(fasta): in_gap = None - with open(sys.argv[1]) as fh: + with gzip.open(fasta, "rt") if fasta.endswith(".gz") else open(fasta) as fh: for line in fh: line = line[:-1] if line.startswith(">"): @@ -41,7 +42,7 @@ def fasta_to_bed(fasta): parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("fasta", help="Input Fasta file.") - parser.add_argument("--version", action="version", version="%(prog)s 1.0") + parser.add_argument("--version", action="version", version="%(prog)s 1.1") args = parser.parse_args() fasta_to_bed(args.fasta) diff --git a/subworkflows/sanger-tol/prepare_repeats.nf b/subworkflows/sanger-tol/prepare_repeats.nf index 7b25e03..6006bc1 100644 --- a/subworkflows/sanger-tol/prepare_repeats.nf +++ b/subworkflows/sanger-tol/prepare_repeats.nf @@ -26,10 +26,17 @@ workflow PREPARE_REPEATS { ch_compressed_bed = TABIX_BGZIP ( ch_bed ).output ch_versions = ch_versions.mix(TABIX_BGZIP.out.versions) - // Index the BED file in two formats for maximum compatibility - ch_indexed_bed_csi = TABIX_TABIX_CSI ( ch_compressed_bed ).csi + // Try indexing the BED file in two formats for maximum compatibility + // but each has its own limitations + tabix_selector = ch_compressed_bed.branch { meta, bed 
-> + tbi_and_csi: meta.max_length < 2**29 + only_csi: meta.max_length < 2**32 + } + + // Do the indexing on the compatible Fasta files + ch_indexed_bed_csi = TABIX_TABIX_CSI ( tabix_selector.tbi_and_csi.mix(tabix_selector.only_csi) ).csi ch_versions = ch_versions.mix(TABIX_TABIX_CSI.out.versions) - ch_indexed_bed_tbi = TABIX_TABIX_TBI ( ch_compressed_bed ).tbi + ch_indexed_bed_tbi = TABIX_TABIX_TBI ( tabix_selector.tbi_and_csi ).tbi ch_versions = ch_versions.mix(TABIX_TABIX_TBI.out.versions) diff --git a/workflows/ensemblrepeatdownload.nf b/workflows/ensemblrepeatdownload.nf index b4c3a1b..437327b 100644 --- a/workflows/ensemblrepeatdownload.nf +++ b/workflows/ensemblrepeatdownload.nf @@ -69,7 +69,7 @@ workflow ENSEMBLREPEATDOWNLOAD { ) ch_versions = ch_versions.mix(PREPARE_FASTA.out.versions) PREPARE_REPEATS ( - DOWNLOAD.out.genome + PREPARE_FASTA.out.fasta_gz ) ch_versions = ch_versions.mix(PREPARE_REPEATS.out.versions) From ee4eb3854db70162a27dc4688035432beb399fad Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Tue, 17 Jan 2023 13:41:48 +0000 Subject: [PATCH 16/72] For comprehensiveness, provide channels for the unavailable indices --- subworkflows/sanger-tol/prepare_repeats.nf | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/subworkflows/sanger-tol/prepare_repeats.nf b/subworkflows/sanger-tol/prepare_repeats.nf index 6006bc1..64d23f1 100644 --- a/subworkflows/sanger-tol/prepare_repeats.nf +++ b/subworkflows/sanger-tol/prepare_repeats.nf @@ -31,8 +31,14 @@ workflow PREPARE_REPEATS { tabix_selector = ch_compressed_bed.branch { meta, bed -> tbi_and_csi: meta.max_length < 2**29 only_csi: meta.max_length < 2**32 + no_tabix: true } + // Output channels to tell the downstream subworkflows which indexes are missing + // (therefore, only meta is available) + no_csi = tabix_selector.no_tabix.map {it[0]} + no_tbi = tabix_selector.only_csi.mix(tabix_selector.no_tabix).map {it[0]} + // Do the indexing on the compatible Fasta files ch_indexed_bed_csi = 
TABIX_TABIX_CSI ( tabix_selector.tbi_and_csi.mix(tabix_selector.only_csi) ).csi ch_versions = ch_versions.mix(TABIX_TABIX_CSI.out.versions) @@ -44,5 +50,7 @@ workflow PREPARE_REPEATS { bed_gz = ch_compressed_bed // path: genome.bed.gz bed_csi = ch_indexed_bed_csi // path: genome.bed.gz.csi bed_tbi = ch_indexed_bed_tbi // path: genome.bed.gz.tbi + no_csi = no_csi // (only meta) + no_tbi = no_tbi // (only meta) versions = ch_versions.ifEmpty(null) // channel: [ versions.yml ] } From 9050bae99588d8a9e94c0ca06110b97f84a68c43 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Tue, 17 Jan 2023 13:05:55 +0000 Subject: [PATCH 17/72] My convention is to have two blank lines there --- subworkflows/local/params_check.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/subworkflows/local/params_check.nf b/subworkflows/local/params_check.nf index 2adb7bd..9337087 100644 --- a/subworkflows/local/params_check.nf +++ b/subworkflows/local/params_check.nf @@ -43,6 +43,7 @@ workflow PARAMS_CHECK { ch_inputs = ch_inputs.mix(cli_params.map { [outdir] + it } ) } + emit: ensembl_params = ch_inputs // tuple(analysis_dir, ensembl_species_name, assembly_accession, annotation_method) versions = ch_versions // channel: versions.yml From 4fdbaa34f04ed66172c4f220655a606d7861c29d Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Fri, 20 Jan 2023 18:18:54 +0000 Subject: [PATCH 18/72] Doc update --- docs/output.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/output.md b/docs/output.md index afda43f..8227afa 100644 --- a/docs/output.md +++ b/docs/output.md @@ -20,7 +20,7 @@ All data files are compressed (and indexed) with `bgzip`. All Fasta files are indexed with `samtools faidx`, which allows accessing any region of the assembly in constant time, and `samtools dict`, which allows identifying a sequence by its MD5 checksum. -All BED files are indexed with tabixin CSI mode, allowing large sequences. 
+All BED files are indexed with tabix in both TBI and CSI modes, unless the sequences are too large. ### Gene annotation files From 091bc8552bf077da6cc72ddf8fc34554416662a5 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Fri, 20 Jan 2023 18:25:13 +0000 Subject: [PATCH 19/72] Updated the change-log --- CHANGELOG.md | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bb55124..6cdaf21 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,21 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## v1.0.0 - [date] +## v1.1.0 - [date] + +### `Fixed` + +- Support for the updated directory structure of the Ensembl FTP +- Relative paths in the sample-sheet are now evaluated from the `--outdir` parameter +- Memory usage rules for `samtools dict` +- Appropriate use of `tabix`'s TBI and CSI indexing, depending on the sequence lengths + +### `Added` + +- New command-line parameter (`--annotation_method`): required for accessing the files on the Ensembl FTP +- `--outdir` is a _mandatory_ parameter + +## v1.0.0 - [2022-10-19] Initial release of sanger-tol/ensemblrepeatdownload, created with the [nf-core](https://nf-co.re/) template. 
@@ -13,7 +27,7 @@ Initial release of sanger-tol/ensemblrepeatdownload, created with the [nf-core]( - `samtools faidx` and `samtools dict` indices for the masked genome - BED file with the coordinates of the masked region - `samtools faidx` and `samtools dict` indices for the annotation fastas -- tavix index for the GFF3 file +- tabix index for the GFF3 file ### `Dependencies` From e9441a50e50cf34e4361bad1d83350d5796e2292 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Wed, 1 Feb 2023 08:11:00 +0000 Subject: [PATCH 20/72] Optimised resource usage for tabix too --- conf/base.config | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/conf/base.config b/conf/base.config index 8227ea4..9445efc 100644 --- a/conf/base.config +++ b/conf/base.config @@ -22,6 +22,12 @@ process { memory = { check_max( 50.MB * task.attempt, 'memory' ) } time = { check_max( 30.min * task.attempt, 'time' ) } + // tabix needs pointers to the sequences in memory + withName: '.*:.*:PREPARE_REPEATS:TABIX_TABIX_.*' { + // 50 MB per 25,000 sequences + memory = { check_max( 50.MB * task.attempt * Math.ceil(meta.n_sequences / 25000), 'memory' ) } + } + // samtools dict loads entire sequences in memory withName: 'SAMTOOLS_DICT' { // 50 MB per 50 Mbp From 61ceda62eb829c50d30c42be36a726f156e77b3b Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Wed, 1 Feb 2023 10:53:19 +0000 Subject: [PATCH 21/72] Nextflow version bump due to the way Conda is enabled/disabled --- .github/workflows/ci.yml | 2 +- README.md | 4 ++-- nextflow.config | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c74ffeb..8f6cf53 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -20,7 +20,7 @@ jobs: strategy: matrix: NXF_VER: - - "22.04.0" + - "22.10.0" - "latest-everything" steps: - name: Check out pipeline code diff --git a/README.md b/README.md index 1d9f9a6..9c26fc8 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ 
[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.7183380-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.7183380) -[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A522.04.0-23aa62.svg)](https://www.nextflow.io/) +[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A522.10.0-23aa62.svg)](https://www.nextflow.io/) [![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/) [![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/) [![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/) @@ -40,7 +40,7 @@ Steps involved: ## Quick Start -1. Install [`Nextflow`](https://www.nextflow.io/docs/latest/getstarted.html#installation) (`>=22.04.0`) +1. Install [`Nextflow`](https://www.nextflow.io/docs/latest/getstarted.html#installation) (`>=22.10.0`) 2. Install any of [`Docker`](https://docs.docker.com/engine/installation/), [`Singularity`](https://www.sylabs.io/guides/3.0/user-guide/) (you can follow [this tutorial](https://singularity-tutorial.github.io/01-installation/)), [`Podman`](https://podman.io/), [`Shifter`](https://nersc.gitlab.io/development/shifter/how-to-use/) or [`Charliecloud`](https://hpc.github.io/charliecloud/) for full pipeline reproducibility _(you can use [`Conda`](https://conda.io/miniconda.html) both to install Nextflow itself and also to manage software within pipelines. Please only use it within pipelines as a last resort; see [docs](https://nf-co.re/usage/configuration#basic-configuration-profiles))_. 
diff --git a/nextflow.config b/nextflow.config index a810d26..1e0c61a 100644 --- a/nextflow.config +++ b/nextflow.config @@ -173,7 +173,7 @@ manifest { homePage = 'https://github.com/sanger-tol/ensemblrepeatdownload' description = 'Pipeline that downloads repeats annotations from Ensembl into the Tree of Life directory structure' mainScript = 'main.nf' - nextflowVersion = '!>=22.04.0' + nextflowVersion = '!>=22.10.0' version = '1.1dev' } From a1a6200cb0a27ed2a097a76c571da0582f0ead4f Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Wed, 1 Feb 2023 10:55:29 +0000 Subject: [PATCH 22/72] Documentation update --- docs/output.md | 3 ++- docs/usage.md | 6 +++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/docs/output.md b/docs/output.md index 8227afa..355a464 100644 --- a/docs/output.md +++ b/docs/output.md @@ -4,7 +4,8 @@ This document describes the output produced by the pipeline. -The directories listed below will be created in the results directory after the pipeline has finished. All paths are relative to the top-level results directory. +The directories listed below will be created in the results directory (or `species_dir` when using a samplesheet) after the pipeline has finished. +All paths are relative to the top-level results directory. The directories comply with Tree of Life's canonical directory structure. diff --git a/docs/usage.md b/docs/usage.md index e83b4df..3a4ad22 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -49,7 +49,10 @@ A samplesheet may only: nextflow run sanger-tol/ensemblrepeatdownload -profile singularity --input samplesheet.csv --outdir results ``` -## Running the pipeline +`species_dir` is used to fit the output of this pipeline into a directory structure compatible with the other pipelines +from Sanger Tree of Life. 
+ +## Nextflow outputs Note that the pipeline will create the following files in your working directory: @@ -58,6 +61,7 @@ work # Directory containing the nextflow working files # Finished results in specified location (defined with --outdir) .nextflow_log # Log file from Nextflow .nextflow # Directory where Nextflow keeps track of jobs +# Other nextflow hidden files, eg. history of pipeline runs and old logs. ``` ### Updating the pipeline From 28af129bb94c8077171a372b6621dbc529edcfac Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Wed, 1 Feb 2023 11:24:19 +0000 Subject: [PATCH 23/72] Documentation update --- .nf-core.yml | 1 + docs/README.md | 2 ++ docs/output.md | 2 -- docs/parameters.md | 70 ++++++++++++++++++++++++++++++++++++++++++++++ docs/usage.md | 29 +++++++++++-------- 5 files changed, 91 insertions(+), 13 deletions(-) create mode 100644 docs/parameters.md diff --git a/.nf-core.yml b/.nf-core.yml index 1a3053f..574d9ad 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -18,6 +18,7 @@ lint: - .github/workflows/linting.yml - assets/email_template.html - assets/sendmail_template.txt + - docs/README.md - lib/NfcoreTemplate.groovy nextflow_config: - manifest.name diff --git a/docs/README.md b/docs/README.md index b3effd1..8b32bab 100644 --- a/docs/README.md +++ b/docs/README.md @@ -4,5 +4,7 @@ The sanger-tol/ensemblrepeatdownload documentation is split into the following p - [Usage](usage.md) - An overview of how the pipeline works, how to run it and a description of all of the different command-line flags. +- [Parameters](parameters.md) + - An overview of the different pipeline options and how to use them. - [Output](output.md) - An overview of the different results produced by the pipeline and how to interpret them. diff --git a/docs/output.md b/docs/output.md index 355a464..555b7d2 100644 --- a/docs/output.md +++ b/docs/output.md @@ -198,6 +198,4 @@ the directory structure includes the assembly name, e.g. 
`gfLaeSulp1.1`, and all - Reports generated by the pipeline: `pipeline_report.html`, `pipeline_report.txt` and `software_versions.yml`. The `pipeline_report*` files will only be present if the `--email` / `--email_on_fail` parameter's are used when running the pipeline. - Reformatted samplesheet files used as input to the pipeline: `samplesheet.valid.csv`. - - [Nextflow](https://www.nextflow.io/docs/latest/tracing.html) provides excellent functionality for generating various reports relevant to the running and execution of the pipeline. This will allow you to troubleshoot errors with the running of the pipeline, and also provide you with other information such as launch commands, run times and resource usage. diff --git a/docs/parameters.md b/docs/parameters.md new file mode 100644 index 0000000..3b8a1a0 --- /dev/null +++ b/docs/parameters.md @@ -0,0 +1,70 @@ +# sanger-tol/ensemblrepeatdownload pipeline parameters + +Pipeline that downloads repeats annotations from Ensembl into the Tree of Life directory structure + +## Input/output options + +Define where the pipeline should find input data and save output data. + +| Parameter | Description | Type | Default | Required | Hidden | +| ---------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------- | ------- | -------- | ------ | +| `assembly_accession` | Accession number of the assembly to download. Typically of the form `GCA_*.*` | `string` | | | | +| `ensembl_species_name` | Name of the species, _as used by Ensembl_. Note: it may differ from Tree of Life's | `string` | | | | +| `annotation_method` | Method used to annotate the genome. Typically `ensembl`, `braker`, etc. | `string` | | | | +| `outdir` | The output directory where the results will be saved. Not considered for sample-sheet entries that have an absolute path. 
| `string` | results | | | +| `input` | Path to comma-separated file containing information about the assemblies to download. Used for bulk download of many assemblies.
HelpThe file has to be a | + +comma-separated file with five columns, and a header row. The columns names must be `species_dir`, `assembly_name`, `ensembl_species_name`, and `annotation_method`. An additional `assembly_accession` column can +be provided too.
| `string` | | | | +| `ftp_root` | Root location of the Ensembl FTP, in which all annotations can be found. Access protocol is actually not limited to FTP, and we use HTTPS by default. | `string` | +https://ftp.ensembl.org/pub/rapid-release/species | | True | +| `email` | Email address for completion summary.
HelpSet this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow +exits. If set in your user config file (`~/.nextflow/config`) then you don't need to specify this on the command line for every run.
| `string` | | | True | + +## Institutional config options + +Parameters used to describe centralised config profiles. These should not be edited. + +| Parameter | Description | Type | Default | Required | Hidden | +| ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------- | ------- | -------- | ------ | +| `custom_config_version` | Git commit id for Institutional configs. | `string` | master | | True | +| `custom_config_base` | Base directory for Institutional configs.
HelpIf you're running offline, Nextflow will not be able to fetch the institutional config files from the | +| internet. If you don't need them, then this is not a problem. If you do need them, you should download the files from the repo and tell Nextflow where to find them with this parameter.
| +| `string` | https://raw.githubusercontent.com/nf-core/configs/master | | True | +| `config_profile_name` | Institutional config name. | `string` | | | True | +| `config_profile_description` | Institutional config description. | `string` | | | True | +| `config_profile_contact` | Institutional config contact information. | `string` | | | True | +| `config_profile_url` | Institutional config URL link. | `string` | | | True | + +## Max job request options + +Set the top limit for requested resources for any single job. + +| Parameter | Description | Type | Default | Required | Hidden | +| ------------------------------------------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------ | ------- | -------- | ------ | +| `max_cpus` | Maximum number of CPUs that can be requested for any single job.
HelpUse to set an upper-limit for the CPU requirement for each process. Should be an integer | +| e.g. `--max_cpus 1`
| `integer` | 16 | | True | +| `max_memory` | Maximum amount of memory that can be requested for any single job.
HelpUse to set an upper-limit for the memory requirement for each process. Should be a | +| string in the format integer-unit e.g. `--max_memory '8.GB'`
| `string` | 128.GB | | True | +| `max_time` | Maximum amount of time that can be requested for any single job.
HelpUse to set an upper-limit for the time requirement for each process. Should be a string in | +| the format integer-unit e.g. `--max_time '2.h'`
| `string` | 240.h | | True | + +## Generic options + +Less common options for the pipeline, typically set in a config file. + +| Parameter | Description | Type | Default | Required | Hidden | +| ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------- | ------------------------------ | -------- | ------ | +| `help` | Display help text. | `boolean` | | | True | +| `publish_dir_mode` | Method used to save pipeline results to output directory.
HelpThe Nextflow `publishDir` option specifies which intermediate files should be saved to the | +| output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.
| +| `string` | copy | | True | +| `email_on_fail` | Email address for completion summary, only when pipeline fails.
HelpAn email address to send a summary email to when the pipeline is completed - ONLY sent | +| if the pipeline does not exit successfully.
| `string` | | | True | +| `plaintext_email` | Send plain-text email instead of HTML. | `boolean` | | | True | +| `monochrome_logs` | Do not use coloured log outputs. | `boolean` | | | True | +| `tracedir` | Directory to keep pipeline Nextflow logs and reports. | `string` | ${params.outdir}/pipeline_info | | True | +| `validate_params` | Boolean whether to validate parameters against the schema at runtime | `boolean` | True | | True | +| `show_hidden_params` | Show all params when using `--help`
HelpBy default, parameters set as _hidden_ in the schema are not shown on the command line when a user runs with | +| `--help`. Specifying this option will tell the pipeline to show all parameters.
| `boolean` | | | True | +| `enable_conda` | Run this workflow with Conda. You can also use '-profile conda' instead of providing this parameter. | `boolean` | | | True | diff --git a/docs/usage.md b/docs/usage.md index 3a4ad22..fdb3223 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -15,13 +15,16 @@ The pipeline accepts command-one line arguments to specify a single genome to do - `--outdir`: Where to download the data. ```console -nextflow run sanger-tol/ensemblrepeatdownload -profile singularity --ensembl_species_name Noctua_fimbriata --assembly_accession GCA_905163415.1 --annotation_method braker --outdir results +nextflow run sanger-tol/ensemblrepeatdownload -profile singularity --ensembl_species_name Noctua_fimbriata --assembly_accession GCA_905163415.1 --annotation_method braker --outdir Noctua_fimbriata_repeats ``` +This will launch the pipeline and download the assembly of `Noctua_fimbriata` accession `GCA_905163415.1` into the `Noctua_fimbriata_repeats/` directory, +which will be created if needed. + ## Bulk download -To download multiple datasets at once, descrbe these in a "samplesheet": a comma-separated files that lists the command-line arguments. -The file must have four columns, but accepts five as in the [example samplesheet](../assets/samplesheet.csv) provided with the pipeline and pasted here: +The pipeline can download multiple assemblies at once, by providing them in a `.csv` file through the `--input` parameter. +It has to be a comma-separated file with four or five columns, and a header row as shown in the examples below. ```console species_dir,assembly_name,assembly_accession,ensembl_species_name,annotation_method @@ -39,19 +42,23 @@ darwin/data/insects/Noctua_fimbriata,ilNocFimb1.1,GCA_905163415.1,Noctua_fimbria | `ensembl_species_name` | Name of the species, _as used by Ensembl_. Note: it may differ from Tree of Life's | | `annotation_method` | Name of the method of the geneset that holds the repeat annotation. 
| -A samplesheet may only: - -- multiple datasets of the same species -- only one dataset per assembly -- multiple datasets in the same output directory +A samplesheet may contain: -```bash -nextflow run sanger-tol/ensemblrepeatdownload -profile singularity --input samplesheet.csv --outdir results -``` +- multiple assemblies of the same species +- multiple assemblies in the same output directory +- only one row per assembly +All samplesheet columns correspond exactly to their corresponding command-line parameter, +except `species_dir` which overrides or complements `--oudir`. `species_dir` is used to fit the output of this pipeline into a directory structure compatible with the other pipelines from Sanger Tree of Life. +An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. + +```bash +nextflow run sanger-tol/ensemblrepeatdownload -profile singularity --input /path/to/samplesheet.csv --outdir /path/to/results +``` + ## Nextflow outputs Note that the pipeline will create the following files in your working directory: From c9224e182192042bf6cdb9153843893351618ec9 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Wed, 1 Feb 2023 12:07:57 +0000 Subject: [PATCH 24/72] Brought some consistency --- .nf-core.yml | 1 - assets/email_template.html | 2 +- nextflow.config | 2 +- nextflow_schema.json | 2 +- 4 files changed, 3 insertions(+), 4 deletions(-) diff --git a/.nf-core.yml b/.nf-core.yml index 574d9ad..d4e1bfc 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -16,7 +16,6 @@ lint: - .github/CONTRIBUTING.md - .github/ISSUE_TEMPLATE/bug_report.yml - .github/workflows/linting.yml - - assets/email_template.html - assets/sendmail_template.txt - docs/README.md - lib/NfcoreTemplate.groovy diff --git a/assets/email_template.html b/assets/email_template.html index 275fbfb..9f35acc 100644 --- a/assets/email_template.html +++ b/assets/email_template.html @@ -4,7 +4,7 @@ - + sanger-tol/ensemblrepeatdownload Pipeline Report diff --git 
a/nextflow.config b/nextflow.config index 1e0c61a..3762fdd 100644 --- a/nextflow.config +++ b/nextflow.config @@ -171,7 +171,7 @@ manifest { name = 'sanger-tol/ensemblrepeatdownload' author = '@muffato' homePage = 'https://github.com/sanger-tol/ensemblrepeatdownload' - description = 'Pipeline that downloads repeats annotations from Ensembl into the Tree of Life directory structure' + description = 'Pipeline that downloads repeat annotations from Ensembl into a Tree of Life directory structure' mainScript = 'main.nf' nextflowVersion = '!>=22.10.0' version = '1.1dev' diff --git a/nextflow_schema.json b/nextflow_schema.json index defd572..4dba7ec 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -2,7 +2,7 @@ "$schema": "http://json-schema.org/draft-07/schema", "$id": "https://raw.githubusercontent.com/sanger-tol/ensemblrepeatdownload/master/nextflow_schema.json", "title": "sanger-tol/ensemblrepeatdownload pipeline parameters", - "description": "Pipeline that downloads repeats annotations from Ensembl into the Tree of Life directory structure", + "description": "Pipeline that downloads repeat annotations from Ensembl into a Tree of Life directory structure", "type": "object", "definitions": { "input_output_options": { From 19b9f6876abcb3796c6539bb4a414d0c5d3466d0 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Sun, 16 Jul 2023 09:23:32 +0100 Subject: [PATCH 25/72] Fixed the pipeline name --- .github/CONTRIBUTING.md | 14 +++++++------- .github/ISSUE_TEMPLATE/bug_report.yml | 2 +- .github/ISSUE_TEMPLATE/feature_request.yml | 2 +- .github/PULL_REQUEST_TEMPLATE.md | 8 ++++---- .github/workflows/branch.yml | 4 ++-- .github/workflows/ci.yml | 2 +- .github/workflows/fix-linting.yml | 2 +- .nf-core.yml | 6 +++--- CHANGELOG.md | 4 ++-- CITATIONS.md | 2 +- README.md | 12 ++++++------ assets/email_template.html | 14 +++++++------- assets/email_template.txt | 8 ++++---- assets/multiqc_config.yml | 4 ++-- assets/schema_input.json | 4 ++-- 
assets/sendmail_template.txt | 6 +++--- conf/base.config | 2 +- conf/test.config | 2 +- conf/test_full.config | 2 +- docs/README.md | 4 ++-- docs/output.md | 2 +- docs/usage.md | 8 ++++---- ...groovy => WorkflowEnsemblrepeatdownload.groovy} | 4 ++-- lib/WorkflowMain.groovy | 2 +- main.nf | 14 +++++++------- modules.json | 4 ++-- modules/local/samplesheet_check.nf | 2 +- nextflow.config | 12 ++++++------ nextflow_schema.json | 4 ++-- ...ensembldownload.nf => ensemblrepeatdownload.nf} | 6 +++--- 30 files changed, 81 insertions(+), 81 deletions(-) rename lib/{WorkflowEnsembldownload.groovy => WorkflowEnsemblrepeatdownload.groovy} (94%) rename workflows/{ensembldownload.nf => ensemblrepeatdownload.nf} (95%) diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index bbe64d0..6a2b446 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -1,9 +1,9 @@ -# sanger-tol/ensembldownload: Contributing Guidelines +# sanger-tol/ensemblrepeatdownload: Contributing Guidelines Hi there! -Many thanks for taking an interest in improving sanger-tol/ensembldownload. +Many thanks for taking an interest in improving sanger-tol/ensemblrepeatdownload. -We try to manage the required tasks for sanger-tol/ensembldownload using GitHub issues, you probably came to this page when creating one. +We try to manage the required tasks for sanger-tol/ensemblrepeatdownload using GitHub issues, you probably came to this page when creating one. Please use the pre-filled template to save time. However, don't be put off by this template - other more general issues and suggestions are welcome! @@ -11,10 +11,10 @@ Contributions to the code are even more welcome ;) ## Contribution workflow -If you'd like to write some code for sanger-tol/ensembldownload, the standard workflow is as follows: +If you'd like to write some code for sanger-tol/ensemblrepeatdownload, the standard workflow is as follows: -1. 
Check that there isn't already an issue about your idea in the [sanger-tol/ensembldownload issues](https://github.com/sanger-tol/ensembldownload/issues) to avoid duplicating work. If there isn't one already, please create one so that others know you're working on this -2. [Fork](https://help.github.com/en/github/getting-started-with-github/fork-a-repo) the [sanger-tol/ensembldownload repository](https://github.com/sanger-tol/ensembldownload) to your GitHub account +1. Check that there isn't already an issue about your idea in the [sanger-tol/ensemblrepeatdownload issues](https://github.com/sanger-tol/ensemblrepeatdownload/issues) to avoid duplicating work. If there isn't one already, please create one so that others know you're working on this +2. [Fork](https://help.github.com/en/github/getting-started-with-github/fork-a-repo) the [sanger-tol/ensemblrepeatdownload repository](https://github.com/sanger-tol/ensemblrepeatdownload) to your GitHub account 3. Make the necessary changes / additions within your forked repository following [Pipeline conventions](#pipeline-contribution-conventions) 4. Use `nf-core schema build` and add any new parameters to the pipeline JSON schema (requires [nf-core tools](https://github.com/nf-core/tools) >= 1.10). 5. Submit a Pull Request against the `dev` branch and wait for the code to be reviewed and merged @@ -52,7 +52,7 @@ These tests are run both with the latest available version of `Nextflow` and als ## Pipeline contribution conventions -To make the sanger-tol/ensembldownload code and processing logic more understandable for new contributors and to ensure quality, we semi-standardise the way the code and other contributions are written. +To make the sanger-tol/ensemblrepeatdownload code and processing logic more understandable for new contributors and to ensure quality, we semi-standardise the way the code and other contributions are written. 
### Adding a new step diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml index 704210f..28dcec8 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.yml +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -48,6 +48,6 @@ body: * OS _(eg. CentOS Linux, macOS, Linux Mint)_ - * Version of sanger-tol/ensembldownload _(eg. 1.1, 1.5, 1.8.2)_ + * Version of sanger-tol/ensemblrepeatdownload _(eg. 1.1, 1.5, 1.8.2)_ " diff --git a/.github/ISSUE_TEMPLATE/feature_request.yml b/.github/ISSUE_TEMPLATE/feature_request.yml index e7996f9..9d4bedf 100644 --- a/.github/ISSUE_TEMPLATE/feature_request.yml +++ b/.github/ISSUE_TEMPLATE/feature_request.yml @@ -1,5 +1,5 @@ name: Feature request -description: Suggest an idea for the sanger-tol/ensembldownload pipeline +description: Suggest an idea for the sanger-tol/ensemblrepeatdownload pipeline labels: enhancement body: - type: textarea diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 4f1152f..937b548 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,21 +1,21 @@ ## PR checklist - [ ] This comment contains a description of changes (with reason). - [ ] If you've fixed a bug or added code that should be tested, add tests! -- [ ] If you've added a new tool - have you followed the pipeline conventions in the [contribution docs](https://github.com/sanger-tol/ensembldownload/tree/master/.github/CONTRIBUTING.md) +- [ ] If you've added a new tool - have you followed the pipeline conventions in the [contribution docs](https://github.com/sanger-tol/ensemblrepeatdownload/tree/master/.github/CONTRIBUTING.md) - [ ] Make sure your code lints (`nf-core lint`). - [ ] Ensure the test suite passes (`nextflow run . -profile test,docker --outdir `). - [ ] Usage Documentation in `docs/usage.md` is updated. 
diff --git a/.github/workflows/branch.yml b/.github/workflows/branch.yml index a8d1a06..aba3fa2 100644 --- a/.github/workflows/branch.yml +++ b/.github/workflows/branch.yml @@ -11,9 +11,9 @@ jobs: steps: # PRs to the nf-core repo master branch are only ok if coming from the nf-core repo `dev` or any `patch` branches - name: Check PRs - if: github.repository == 'sanger-tol/ensembldownload' + if: github.repository == 'sanger-tol/ensemblrepeatdownload' run: | - { [[ ${{github.event.pull_request.head.repo.full_name }} == sanger-tol/ensembldownload ]] && [[ $GITHUB_HEAD_REF = "dev" ]]; } || [[ $GITHUB_HEAD_REF == "patch" ]] + { [[ ${{github.event.pull_request.head.repo.full_name }} == sanger-tol/ensemblrepeatdownload ]] && [[ $GITHUB_HEAD_REF = "dev" ]]; } || [[ $GITHUB_HEAD_REF == "patch" ]] # If the above check failed, post a comment on the PR explaining the failure # NOTE - this doesn't currently work if the PR is coming from a fork, due to limitations in GitHub actions secrets diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index cb567c3..ee8fd64 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -15,7 +15,7 @@ jobs: test: name: Run pipeline with test data # Only run on push if this is the nf-core dev branch (merged PRs) - if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'sanger-tol/ensembldownload') }}" + if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'sanger-tol/ensemblrepeatdownload') }}" runs-on: ubuntu-latest strategy: matrix: diff --git a/.github/workflows/fix-linting.yml b/.github/workflows/fix-linting.yml index cce2ec5..a29fcf2 100644 --- a/.github/workflows/fix-linting.yml +++ b/.github/workflows/fix-linting.yml @@ -9,7 +9,7 @@ jobs: if: > contains(github.event.comment.html_url, '/pull/') && contains(github.event.comment.body, '@nf-core-bot fix linting') && - github.repository == 'sanger-tol/ensembldownload' + github.repository == 
'sanger-tol/ensemblrepeatdownload' runs-on: ubuntu-latest steps: # Use the @nf-core-bot token to check out so we can push later diff --git a/.nf-core.yml b/.nf-core.yml index 937bab7..a948ca8 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -2,9 +2,9 @@ repository_type: pipeline lint: files_exist: - CODE_OF_CONDUCT.md - - assets/nf-core-ensembldownload_logo_light.png - - docs/images/nf-core-ensembldownload_logo_light.png - - docs/images/nf-core-ensembldownload_logo_dark.png + - assets/nf-core-ensemblrepeatdownload_logo_light.png + - docs/images/nf-core-ensemblrepeatdownload_logo_light.png + - docs/images/nf-core-ensemblrepeatdownload_logo_dark.png - .github/ISSUE_TEMPLATE/config.yml - .github/workflows/awstest.yml - .github/workflows/awsfulltest.yml diff --git a/CHANGELOG.md b/CHANGELOG.md index 304b4f7..a272efb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,11 +1,11 @@ -# sanger-tol/ensembldownload: Changelog +# sanger-tol/ensemblrepeatdownload: Changelog The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## v1.0dev - [date] -Initial release of sanger-tol/ensembldownload, created with the [nf-core](https://nf-co.re/) template. +Initial release of sanger-tol/ensemblrepeatdownload, created with the [nf-core](https://nf-co.re/) template. 
### `Added` diff --git a/CITATIONS.md b/CITATIONS.md index 064df6f..25f925c 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -1,4 +1,4 @@ -# sanger-tol/ensembldownload: Citations +# sanger-tol/ensemblrepeatdownload: Citations ## [nf-core](https://pubmed.ncbi.nlm.nih.gov/32055031/) diff --git a/README.md b/README.md index f3445f6..3e1a7b3 100644 --- a/README.md +++ b/README.md @@ -4,13 +4,13 @@ [![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/) [![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/) [![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/) -[![Launch on Nextflow Tower](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Nextflow%20Tower-%234256e7)](https://tower.nf/launch?pipeline=https://github.com/sanger-tol/ensembldownload) +[![Launch on Nextflow Tower](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Nextflow%20Tower-%234256e7)](https://tower.nf/launch?pipeline=https://github.com/sanger-tol/ensemblrepeatdownload) ## Introduction -**sanger-tol/ensembldownload** is a bioinformatics best-practice analysis pipeline for Pipeline to download annotations from Ensembl onto the Tree of Life directory structure. +**sanger-tol/ensemblrepeatdownload** is a bioinformatics best-practice analysis pipeline for Pipeline to download annotations from Ensembl onto the Tree of Life directory structure. The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It uses Docker/Singularity containers making installation trivial and results highly reproducible. 
The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. Where possible, these processes have been submitted to and installed from [nf-core/modules](https://github.com/nf-core/modules) in order to make them available to all nf-core pipelines, and to everyone within the Nextflow community! @@ -34,7 +34,7 @@ On release, automated continuous integration tests run the pipeline on a full-si 3. Download the pipeline and test it on a minimal dataset with a single command: ```bash - nextflow run sanger-tol/ensembldownload -profile test,YOURPROFILE --outdir + nextflow run sanger-tol/ensemblrepeatdownload -profile test,YOURPROFILE --outdir ``` Note that some form of configuration will be needed so that Nextflow knows how to fetch the required software. This is usually done in the form of a config profile (`YOURPROFILE` in the example command above). You can chain multiple config profiles in a comma-separated string. @@ -49,12 +49,12 @@ On release, automated continuous integration tests run the pipeline on a full-si ```bash - nextflow run sanger-tol/ensembldownload --input samplesheet.csv --outdir --genome GRCh37 -profile + nextflow run sanger-tol/ensemblrepeatdownload --input samplesheet.csv --outdir --genome GRCh37 -profile ``` ## Credits -sanger-tol/ensembldownload was originally written by @muffato. +sanger-tol/ensemblrepeatdownload was originally written by @muffato. 
We thank the following people for their extensive assistance in the development of this pipeline: @@ -67,7 +67,7 @@ If you would like to contribute to this pipeline, please see the [contributing g ## Citations - + diff --git a/assets/email_template.html b/assets/email_template.html index 2fc58b2..8e7a3a4 100644 --- a/assets/email_template.html +++ b/assets/email_template.html @@ -4,21 +4,21 @@ - - sanger-tol/ensembldownload Pipeline Report + + sanger-tol/ensemblrepeatdownload Pipeline Report
-

sanger-tol/ensembldownload v${version}

+

sanger-tol/ensemblrepeatdownload v${version}

Run Name: $runName

<% if (!success){ out << """
-

sanger-tol/ensembldownload execution completed unsuccessfully!

+

sanger-tol/ensemblrepeatdownload execution completed unsuccessfully!

The exit status of the task that caused the workflow execution to fail was: $exitStatus.

The full error message was:

${errorReport}
@@ -27,7 +27,7 @@

sanger-tol/ensembldownload execution c } else { out << """
- sanger-tol/ensembldownload execution completed successfully! + sanger-tol/ensemblrepeatdownload execution completed successfully!
""" } @@ -44,8 +44,8 @@

Pipeline Configuration:

-

sanger-tol/ensembldownload

-

https://github.com/sanger-tol/ensembldownload

+

sanger-tol/ensemblrepeatdownload

+

https://github.com/sanger-tol/ensemblrepeatdownload

diff --git a/assets/email_template.txt b/assets/email_template.txt index 12f61af..d5a1d0c 100644 --- a/assets/email_template.txt +++ b/assets/email_template.txt @@ -1,10 +1,10 @@ Run Name: $runName <% if (success){ - out << "## sanger-tol/ensembldownload execution completed successfully! ##" + out << "## sanger-tol/ensemblrepeatdownload execution completed successfully! ##" } else { out << """#################################################### -## sanger-tol/ensembldownload execution completed unsuccessfully! ## +## sanger-tol/ensemblrepeatdownload execution completed unsuccessfully! ## #################################################### The exit status of the task that caused the workflow execution to fail was: $exitStatus. The full error message was: @@ -27,5 +27,5 @@ Pipeline Configuration: <% out << summary.collect{ k,v -> " - $k: $v" }.join("\n") %> -- -sanger-tol/ensembldownload -https://github.com/sanger-tol/ensembldownload +sanger-tol/ensemblrepeatdownload +https://github.com/sanger-tol/ensemblrepeatdownload diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml index 8712fd1..c0f5598 100644 --- a/assets/multiqc_config.yml +++ b/assets/multiqc_config.yml @@ -1,10 +1,10 @@ report_comment: > - This report has been generated by the sanger-tol/ensembldownload + This report has been generated by the sanger-tol/ensemblrepeatdownload analysis pipeline. 
report_section_order: software_versions: order: -1000 - "sanger-tol-ensembldownload-summary": + "sanger-tol-ensemblrepeatdownload-summary": order: -1001 export_plots: true diff --git a/assets/schema_input.json b/assets/schema_input.json index abb4320..7e00997 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -1,7 +1,7 @@ { "$schema": "http://json-schema.org/draft-07/schema", - "$id": "https://raw.githubusercontent.com/sanger-tol/ensembldownload/master/assets/schema_input.json", - "title": "sanger-tol/ensembldownload pipeline - params.input schema", + "$id": "https://raw.githubusercontent.com/sanger-tol/ensemblrepeatdownload/master/assets/schema_input.json", + "title": "sanger-tol/ensemblrepeatdownload pipeline - params.input schema", "description": "Schema for the file provided with params.input", "type": "array", "items": { diff --git a/assets/sendmail_template.txt b/assets/sendmail_template.txt index 1eeb9f4..cf4079e 100644 --- a/assets/sendmail_template.txt +++ b/assets/sendmail_template.txt @@ -9,12 +9,12 @@ Content-Type: text/html; charset=utf-8 $email_html --nfcoremimeboundary -Content-Type: image/png;name="sanger-tol-ensembldownload_logo.png" +Content-Type: image/png;name="sanger-tol-ensemblrepeatdownload_logo.png" Content-Transfer-Encoding: base64 Content-ID: -Content-Disposition: inline; filename="sanger-tol-ensembldownload_logo_light.png" +Content-Disposition: inline; filename="sanger-tol-ensemblrepeatdownload_logo_light.png" -<% out << new File("$projectDir/assets/sanger-tol-ensembldownload_logo_light.png"). +<% out << new File("$projectDir/assets/sanger-tol-ensemblrepeatdownload_logo_light.png"). bytes. encodeBase64(). toString(). 
diff --git a/conf/base.config b/conf/base.config index bf9e3c7..cffb6a7 100644 --- a/conf/base.config +++ b/conf/base.config @@ -1,6 +1,6 @@ /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - sanger-tol/ensembldownload Nextflow base config file + sanger-tol/ensemblrepeatdownload Nextflow base config file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ A 'blank slate' config file, appropriate for general use on most high performance compute environments. Assumes that all software is installed and available on diff --git a/conf/test.config b/conf/test.config index e71e569..aa1b96b 100644 --- a/conf/test.config +++ b/conf/test.config @@ -5,7 +5,7 @@ Defines input files and everything required to run a fast and simple pipeline test. Use as follows: - nextflow run sanger-tol/ensembldownload -profile test, --outdir + nextflow run sanger-tol/ensemblrepeatdownload -profile test, --outdir ---------------------------------------------------------------------------------------- */ diff --git a/conf/test_full.config b/conf/test_full.config index e8a6bb6..f56734b 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -5,7 +5,7 @@ Defines input files and everything required to run a full size pipeline test. 
Use as follows: - nextflow run sanger-tol/ensembldownload -profile test_full, --outdir + nextflow run sanger-tol/ensemblrepeatdownload -profile test_full, --outdir ---------------------------------------------------------------------------------------- */ diff --git a/docs/README.md b/docs/README.md index 170b51f..b3effd1 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,6 +1,6 @@ -# sanger-tol/ensembldownload: Documentation +# sanger-tol/ensemblrepeatdownload: Documentation -The sanger-tol/ensembldownload documentation is split into the following pages: +The sanger-tol/ensemblrepeatdownload documentation is split into the following pages: - [Usage](usage.md) - An overview of how the pipeline works, how to run it and a description of all of the different command-line flags. diff --git a/docs/output.md b/docs/output.md index bc4a79d..b8f90a4 100644 --- a/docs/output.md +++ b/docs/output.md @@ -1,4 +1,4 @@ -# sanger-tol/ensembldownload: Output +# sanger-tol/ensemblrepeatdownload: Output ## Introduction diff --git a/docs/usage.md b/docs/usage.md index bc39a4d..3b3a925 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -1,4 +1,4 @@ -# sanger-tol/ensembldownload: Usage +# sanger-tol/ensemblrepeatdownload: Usage > _Documentation of pipeline parameters is generated automatically from the pipeline schema and can no longer be found in markdown files._ @@ -55,7 +55,7 @@ An [example samplesheet](../assets/samplesheet.csv) has been provided with the p The typical command for running the pipeline is as follows: ```bash -nextflow run sanger-tol/ensembldownload --input samplesheet.csv --outdir --genome GRCh37 -profile docker +nextflow run sanger-tol/ensemblrepeatdownload --input samplesheet.csv --outdir --genome GRCh37 -profile docker ``` This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles. 
@@ -74,14 +74,14 @@ work # Directory containing the nextflow working files When you run the above command, Nextflow automatically pulls the pipeline code from GitHub and stores it as a cached version. When running the pipeline after this, it will always use the cached version if available - even if the pipeline has been updated since. To make sure that you're running the latest version of the pipeline, make sure that you regularly update the cached version of the pipeline: ```bash -nextflow pull sanger-tol/ensembldownload +nextflow pull sanger-tol/ensemblrepeatdownload ``` ### Reproducibility It is a good idea to specify a pipeline version when running the pipeline on your data. This ensures that a specific version of the pipeline code and software are used when you run your pipeline. If you keep using the same tag, you'll be running the same version of the pipeline, even if there have been changes to the code since. -First, go to the [sanger-tol/ensembldownload releases page](https://github.com/sanger-tol/ensembldownload/releases) and find the latest version number - numeric only (eg. `1.3.1`). Then specify this when running the pipeline with `-r` (one hyphen) - eg. `-r 1.3.1`. +First, go to the [sanger-tol/ensemblrepeatdownload releases page](https://github.com/sanger-tol/ensemblrepeatdownload/releases) and find the latest version number - numeric only (eg. `1.3.1`). Then specify this when running the pipeline with `-r` (one hyphen) - eg. `-r 1.3.1`. This version number will be logged in reports when you run the pipeline, so that you'll know what you used when you look back in the future. 
diff --git a/lib/WorkflowEnsembldownload.groovy b/lib/WorkflowEnsemblrepeatdownload.groovy similarity index 94% rename from lib/WorkflowEnsembldownload.groovy rename to lib/WorkflowEnsemblrepeatdownload.groovy index 4ada576..596925c 100755 --- a/lib/WorkflowEnsembldownload.groovy +++ b/lib/WorkflowEnsemblrepeatdownload.groovy @@ -1,8 +1,8 @@ // -// This file holds several functions specific to the workflow/ensembldownload.nf in the sanger-tol/ensembldownload pipeline +// This file holds several functions specific to the workflow/ensemblrepeatdownload.nf in the sanger-tol/ensemblrepeatdownload pipeline // -class WorkflowEnsembldownload { +class WorkflowEnsemblrepeatdownload { // // Check and validate parameters diff --git a/lib/WorkflowMain.groovy b/lib/WorkflowMain.groovy index 4ab1ea4..284a160 100755 --- a/lib/WorkflowMain.groovy +++ b/lib/WorkflowMain.groovy @@ -1,5 +1,5 @@ // -// This file holds several functions specific to the main.nf workflow in the sanger-tol/ensembldownload pipeline +// This file holds several functions specific to the main.nf workflow in the sanger-tol/ensemblrepeatdownload pipeline // class WorkflowMain { diff --git a/main.nf b/main.nf index 8196c5f..cadcd03 100644 --- a/main.nf +++ b/main.nf @@ -1,9 +1,9 @@ #!/usr/bin/env nextflow /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - sanger-tol/ensembldownload + sanger-tol/ensemblrepeatdownload ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Github : https://github.com/sanger-tol/ensembldownload + Github : https://github.com/sanger-tol/ensemblrepeatdownload ---------------------------------------------------------------------------------------- */ @@ -23,13 +23,13 @@ WorkflowMain.initialise(workflow, params, log) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -include { ENSEMBLDOWNLOAD } from './workflows/ensembldownload' +include { ENSEMBLREPEATDOWNLOAD } from 
'./workflows/ensemblrepeatdownload' // -// WORKFLOW: Run main sanger-tol/ensembldownload analysis pipeline +// WORKFLOW: Run main sanger-tol/ensemblrepeatdownload analysis pipeline // -workflow SANGERTOL_ENSEMBLDOWNLOAD { - ENSEMBLDOWNLOAD () +workflow SANGERTOL_ENSEMBLREPEATDOWNLOAD { + ENSEMBLREPEATDOWNLOAD () } /* @@ -43,7 +43,7 @@ workflow SANGERTOL_ENSEMBLDOWNLOAD { // See: https://github.com/nf-core/rnaseq/issues/619 // workflow { - SANGERTOL_ENSEMBLDOWNLOAD () + SANGERTOL_ENSEMBLREPEATDOWNLOAD () } /* diff --git a/modules.json b/modules.json index 9f003ac..c67eb62 100644 --- a/modules.json +++ b/modules.json @@ -1,6 +1,6 @@ { - "name": "sanger-tol/ensembldownload", - "homePage": "https://github.com/sanger-tol/ensembldownload", + "name": "sanger-tol/ensemblrepeatdownload", + "homePage": "https://github.com/sanger-tol/ensemblrepeatdownload", "repos": { "nf-core/modules": { "git_url": "https://github.com/nf-core/modules.git", diff --git a/modules/local/samplesheet_check.nf b/modules/local/samplesheet_check.nf index 5bba3d5..3e3e08e 100644 --- a/modules/local/samplesheet_check.nf +++ b/modules/local/samplesheet_check.nf @@ -13,7 +13,7 @@ process SAMPLESHEET_CHECK { path '*.csv' , emit: csv path "versions.yml", emit: versions - script: // This script is bundled with the pipeline, in sanger-tol/ensembldownload/bin/ + script: // This script is bundled with the pipeline, in sanger-tol/ensemblrepeatdownload/bin/ """ check_samplesheet.py \\ $samplesheet \\ diff --git a/nextflow.config b/nextflow.config index 10626e4..9e9d452 100644 --- a/nextflow.config +++ b/nextflow.config @@ -1,6 +1,6 @@ /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - sanger-tol/ensembldownload Nextflow config file + sanger-tol/ensemblrepeatdownload Nextflow config file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Default config options for all compute environments 
---------------------------------------------------------------------------------------- @@ -60,12 +60,12 @@ try { System.err.println("WARNING: Could not load nf-core/config profiles: ${params.custom_config_base}/nfcore_custom.config") } -// Load sanger-tol/ensembldownload custom profiles from different institutions. +// Load sanger-tol/ensemblrepeatdownload custom profiles from different institutions. // Warning: Uncomment only if a pipeline-specific instititutional config already exists on nf-core/configs! // try { -// includeConfig "${params.custom_config_base}/pipeline/ensembldownload.config" +// includeConfig "${params.custom_config_base}/pipeline/ensemblrepeatdownload.config" // } catch (Exception e) { -// System.err.println("WARNING: Could not load nf-core/config/ensembldownload profiles: ${params.custom_config_base}/pipeline/ensembldownload.config") +// System.err.println("WARNING: Could not load nf-core/config/ensemblrepeatdownload profiles: ${params.custom_config_base}/pipeline/ensemblrepeatdownload.config") // } @@ -170,9 +170,9 @@ dag { } manifest { - name = 'sanger-tol/ensembldownload' + name = 'sanger-tol/ensemblrepeatdownload' author = '@muffato' - homePage = 'https://github.com/sanger-tol/ensembldownload' + homePage = 'https://github.com/sanger-tol/ensemblrepeatdownload' description = 'Pipeline to download annotations from Ensembl onto the Tree of Life directory structure' mainScript = 'main.nf' nextflowVersion = '!>=21.10.3' diff --git a/nextflow_schema.json b/nextflow_schema.json index b7059c6..12b2c59 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -1,7 +1,7 @@ { "$schema": "http://json-schema.org/draft-07/schema", - "$id": "https://raw.githubusercontent.com/sanger-tol/ensembldownload/master/nextflow_schema.json", - "title": "sanger-tol/ensembldownload pipeline parameters", + "$id": "https://raw.githubusercontent.com/sanger-tol/ensemblrepeatdownload/master/nextflow_schema.json", + "title": "sanger-tol/ensemblrepeatdownload 
pipeline parameters", "description": "Pipeline to download annotations from Ensembl onto the Tree of Life directory structure", "type": "object", "definitions": { diff --git a/workflows/ensembldownload.nf b/workflows/ensemblrepeatdownload.nf similarity index 95% rename from workflows/ensembldownload.nf rename to workflows/ensemblrepeatdownload.nf index 91e32f2..1e696c7 100644 --- a/workflows/ensembldownload.nf +++ b/workflows/ensemblrepeatdownload.nf @@ -7,7 +7,7 @@ def summary_params = NfcoreSchema.paramsSummaryMap(workflow, params) // Validate input parameters -WorkflowEnsembldownload.initialise(params, log) +WorkflowEnsemblrepeatdownload.initialise(params, log) // TODO nf-core: Add all file path parameters for the pipeline to the list below // Check input path parameters to see if they exist @@ -59,7 +59,7 @@ include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/modules/custom/ // Info required for completion email and summary def multiqc_report = [] -workflow ENSEMBLDOWNLOAD { +workflow ENSEMBLREPEATDOWNLOAD { ch_versions = Channel.empty() @@ -86,7 +86,7 @@ workflow ENSEMBLDOWNLOAD { // // MODULE: MultiQC // - workflow_summary = WorkflowEnsembldownload.paramsSummaryMultiqc(workflow, summary_params) + workflow_summary = WorkflowEnsemblrepeatdownload.paramsSummaryMultiqc(workflow, summary_params) ch_workflow_summary = Channel.value(workflow_summary) ch_multiqc_files = Channel.empty() From c8e3b8056cf10bcd81b7ed25b20001255ed5e59f Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Sun, 16 Jul 2023 09:55:26 +0100 Subject: [PATCH 26/72] Template update for nf-core/tools version 2.6 --- .prettierignore | 1 + CITATION.cff | 8 +-- assets/adaptivecard.json | 67 +++++++++++++++++++ assets/methods_description_template.yml | 25 +++++++ assets/multiqc_config.yml | 6 +- bin/check_samplesheet.py | 9 ++- docs/usage.md | 8 +++ lib/NfcoreTemplate.groovy | 55 +++++++++++++++ lib/Utils.groovy | 21 ++++-- lib/WorkflowEnsemblrepeatdownload.groovy | 19 ++++++ 
modules.json | 27 ++++---- .../custom/dumpsoftwareversions/main.nf | 8 +-- .../custom/dumpsoftwareversions/meta.yml | 0 .../templates/dumpsoftwareversions.py | 0 modules/nf-core/{modules => }/fastqc/main.nf | 12 ++++ modules/nf-core/{modules => }/fastqc/meta.yml | 0 modules/nf-core/modules/multiqc/main.nf | 31 --------- modules/nf-core/multiqc/main.nf | 53 +++++++++++++++ .../nf-core/{modules => }/multiqc/meta.yml | 15 +++++ nextflow.config | 5 +- nextflow_schema.json | 18 +++++ pyproject.toml | 10 +++ workflows/ensemblrepeatdownload.nf | 26 ++++--- 23 files changed, 351 insertions(+), 73 deletions(-) create mode 100644 assets/adaptivecard.json create mode 100644 assets/methods_description_template.yml mode change 100755 => 100644 lib/Utils.groovy rename modules/nf-core/{modules => }/custom/dumpsoftwareversions/main.nf (79%) rename modules/nf-core/{modules => }/custom/dumpsoftwareversions/meta.yml (100%) rename modules/nf-core/{modules => }/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py (100%) rename modules/nf-core/{modules => }/fastqc/main.nf (85%) rename modules/nf-core/{modules => }/fastqc/meta.yml (100%) delete mode 100644 modules/nf-core/modules/multiqc/main.nf create mode 100644 modules/nf-core/multiqc/main.nf rename modules/nf-core/{modules => }/multiqc/meta.yml (73%) create mode 100644 pyproject.toml diff --git a/.prettierignore b/.prettierignore index d0e7ae5..eb74a57 100644 --- a/.prettierignore +++ b/.prettierignore @@ -1,4 +1,5 @@ email_template.html +adaptivecard.json .nextflow* work/ data/ diff --git a/CITATION.cff b/CITATION.cff index 4533e2f..017666c 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -13,8 +13,8 @@ authors: given-names: Johannes - family-names: Wilm given-names: Andreas - - family-names: Ulysse Garcia - given-names: Maxime + - family-names: Garcia + given-names: Maxime Ulysse - family-names: Di Tommaso given-names: Paolo - family-names: Nahnsen @@ -39,8 +39,8 @@ prefered-citation: given-names: Johannes - family-names: 
Wilm given-names: Andreas - - family-names: Ulysse Garcia - given-names: Maxime + - family-names: Garcia + given-names: Maxime Ulysse - family-names: Di Tommaso given-names: Paolo - family-names: Nahnsen diff --git a/assets/adaptivecard.json b/assets/adaptivecard.json new file mode 100644 index 0000000..f0dc193 --- /dev/null +++ b/assets/adaptivecard.json @@ -0,0 +1,67 @@ +{ + "type": "message", + "attachments": [ + { + "contentType": "application/vnd.microsoft.card.adaptive", + "contentUrl": null, + "content": { + "\$schema": "http://adaptivecards.io/schemas/adaptive-card.json", + "msteams": { + "width": "Full" + }, + "type": "AdaptiveCard", + "version": "1.2", + "body": [ + { + "type": "TextBlock", + "size": "Large", + "weight": "Bolder", + "color": "<% if (success) { %>Good<% } else { %>Attention<%} %>", + "text": "sanger-tol/ensemblrepeatdownload v${version} - ${runName}", + "wrap": true + }, + { + "type": "TextBlock", + "spacing": "None", + "text": "Completed at ${dateComplete} (duration: ${duration})", + "isSubtle": true, + "wrap": true + }, + { + "type": "TextBlock", + "text": "<% if (success) { %>Pipeline completed successfully!<% } else { %>Pipeline completed with errors. 
The full error message was: ${errorReport}.<% } %>", + "wrap": true + }, + { + "type": "TextBlock", + "text": "The command used to launch the workflow was as follows:", + "wrap": true + }, + { + "type": "TextBlock", + "text": "${commandLine}", + "isSubtle": true, + "wrap": true + } + ], + "actions": [ + { + "type": "Action.ShowCard", + "title": "Pipeline Configuration", + "card": { + "type": "AdaptiveCard", + "\$schema": "http://adaptivecards.io/schemas/adaptive-card.json", + "body": [ + { + "type": "FactSet", + "facts": [<% out << summary.collect{ k,v -> "{\"title\": \"$k\", \"value\" : \"$v\"}"}.join(",\n") %> + ] + } + ] + } + } + ] + } + } + ] +} diff --git a/assets/methods_description_template.yml b/assets/methods_description_template.yml new file mode 100644 index 0000000..34ef135 --- /dev/null +++ b/assets/methods_description_template.yml @@ -0,0 +1,25 @@ +id: "sanger-tol-ensemblrepeatdownload-methods-description" +description: "Suggested text and references to use when describing pipeline usage within the methods section of a publication." +section_name: "sanger-tol/ensemblrepeatdownload Methods Description" +section_href: "https://github.com/sanger-tol/ensemblrepeatdownload" +plot_type: "html" +## TODO nf-core: Update the HTML below to your prefered methods description, e.g. add publication citation for this pipeline +## You inject any metadata in the Nextflow '${workflow}' object +data: | +

Methods

+

Data was processed using sanger-tol/ensemblrepeatdownload v${workflow.manifest.version} ${doi_text} of the nf-core collection of workflows (Ewels et al., 2020).

+

The pipeline was executed with Nextflow v${workflow.nextflow.version} (Di Tommaso et al., 2017) with the following command:

+
${workflow.commandLine}
+

References

+
    +
  • Di Tommaso, P., Chatzou, M., Floden, E. W., Barja, P. P., Palumbo, E., & Notredame, C. (2017). Nextflow enables reproducible computational workflows. Nature Biotechnology, 35(4), 316-319. https://doi.org/10.1038/nbt.3820
  • +
  • Ewels, P. A., Peltzer, A., Fillinger, S., Patel, H., Alneberg, J., Wilm, A., Garcia, M. U., Di Tommaso, P., & Nahnsen, S. (2020). The nf-core framework for community-curated bioinformatics pipelines. Nature Biotechnology, 38(3), 276-278. https://doi.org/10.1038/s41587-020-0439-x
  • +
+
+
Notes:
+
    + ${nodoi_text} +
  • The command above does not include parameters contained in any configs or profiles that may have been used. Ensure the config file is also uploaded with your publication!
  • +
  • You should also cite all software used within this run. Check the "Software Versions" of this report to get version information.
  • +
+
diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml index c0f5598..98b77ee 100644 --- a/assets/multiqc_config.yml +++ b/assets/multiqc_config.yml @@ -2,9 +2,11 @@ report_comment: > This report has been generated by the sanger-tol/ensemblrepeatdownload analysis pipeline. report_section_order: - software_versions: + "sanger-tol-ensemblrepeatdownload-methods-description": order: -1000 - "sanger-tol-ensemblrepeatdownload-summary": + software_versions: order: -1001 + "sanger-tol-ensemblrepeatdownload-summary": + order: -1002 export_plots: true diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py index 9a8b896..11b1557 100755 --- a/bin/check_samplesheet.py +++ b/bin/check_samplesheet.py @@ -98,7 +98,9 @@ def _validate_pair(self, row): """Assert that read pairs have the same file extension. Report pair status.""" if row[self._first_col] and row[self._second_col]: row[self._single_col] = False - if Path(row[self._first_col]).suffixes[-2:] != Path(row[self._second_col]).suffixes[-2:]: + first_col_suffix = Path(row[self._first_col]).suffixes[-2:] + second_col_suffix = Path(row[self._second_col]).suffixes[-2:] + if first_col_suffix != second_col_suffix: raise AssertionError("FASTQ pairs must have the same file extensions.") else: row[self._single_col] = True @@ -157,7 +159,7 @@ def sniff_format(handle): handle.seek(0) sniffer = csv.Sniffer() if not sniffer.has_header(peek): - logger.critical(f"The given sample sheet does not appear to contain a header.") + logger.critical("The given sample sheet does not appear to contain a header.") sys.exit(1) dialect = sniffer.sniff(peek) return dialect @@ -195,7 +197,8 @@ def check_samplesheet(file_in, file_out): reader = csv.DictReader(in_handle, dialect=sniff_format(in_handle)) # Validate the existence of the expected header columns. 
if not required_columns.issubset(reader.fieldnames): - logger.critical(f"The sample sheet **must** contain the column headers: {', '.join(required_columns)}.") + req_cols = ", ".join(required_columns) + logger.critical(f"The sample sheet **must** contain these column headers: {req_cols}.") sys.exit(1) # Validate each row. checker = RowChecker() diff --git a/docs/usage.md b/docs/usage.md index 3b3a925..1c82c04 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -235,6 +235,14 @@ See the main [Nextflow documentation](https://www.nextflow.io/docs/latest/config If you have any questions or issues please send us a message on [Slack](https://nf-co.re/join/slack) on the [`#configs` channel](https://nfcore.slack.com/channels/configs). +## Azure Resource Requests + +To be used with the `azurebatch` profile by specifying the `-profile azurebatch`. +We recommend providing a compute `params.vm_type` of `Standard_D16_v3` VMs by default but these options can be changed if required. + +Note that the choice of VM size depends on your quota and the overall workload during the analysis. +For a thorough list, please refer the [Azure Sizes for virtual machines in Azure](https://docs.microsoft.com/en-us/azure/virtual-machines/sizes). + ## Running in the background Nextflow handles job submissions and supervises the running jobs. The Nextflow process must run until the pipeline is finished. 
diff --git a/lib/NfcoreTemplate.groovy b/lib/NfcoreTemplate.groovy index 8f85736..45992fa 100755 --- a/lib/NfcoreTemplate.groovy +++ b/lib/NfcoreTemplate.groovy @@ -145,6 +145,61 @@ class NfcoreTemplate { output_tf.withWriter { w -> w << email_txt } } + // + // Construct and send adaptive card + // https://adaptivecards.io + // + public static void adaptivecard(workflow, params, summary_params, projectDir, log) { + def hook_url = params.hook_url + + def summary = [:] + for (group in summary_params.keySet()) { + summary << summary_params[group] + } + + def misc_fields = [:] + misc_fields['start'] = workflow.start + misc_fields['complete'] = workflow.complete + misc_fields['scriptfile'] = workflow.scriptFile + misc_fields['scriptid'] = workflow.scriptId + if (workflow.repository) misc_fields['repository'] = workflow.repository + if (workflow.commitId) misc_fields['commitid'] = workflow.commitId + if (workflow.revision) misc_fields['revision'] = workflow.revision + misc_fields['nxf_version'] = workflow.nextflow.version + misc_fields['nxf_build'] = workflow.nextflow.build + misc_fields['nxf_timestamp'] = workflow.nextflow.timestamp + + def msg_fields = [:] + msg_fields['version'] = workflow.manifest.version + msg_fields['runName'] = workflow.runName + msg_fields['success'] = workflow.success + msg_fields['dateComplete'] = workflow.complete + msg_fields['duration'] = workflow.duration + msg_fields['exitStatus'] = workflow.exitStatus + msg_fields['errorMessage'] = (workflow.errorMessage ?: 'None') + msg_fields['errorReport'] = (workflow.errorReport ?: 'None') + msg_fields['commandLine'] = workflow.commandLine + msg_fields['projectDir'] = workflow.projectDir + msg_fields['summary'] = summary << misc_fields + + // Render the JSON template + def engine = new groovy.text.GStringTemplateEngine() + def hf = new File("$projectDir/assets/adaptivecard.json") + def json_template = engine.createTemplate(hf).make(msg_fields) + def json_message = json_template.toString() + + // POST 
+ def post = new URL(hook_url).openConnection(); + post.setRequestMethod("POST") + post.setDoOutput(true) + post.setRequestProperty("Content-Type", "application/json") + post.getOutputStream().write(json_message.getBytes("UTF-8")); + def postRC = post.getResponseCode(); + if (! postRC.equals(200)) { + log.warn(post.getErrorStream().getText()); + } + } + // // Print pipeline summary on completion // diff --git a/lib/Utils.groovy b/lib/Utils.groovy old mode 100755 new mode 100644 index 28567bd..8d030f4 --- a/lib/Utils.groovy +++ b/lib/Utils.groovy @@ -21,19 +21,26 @@ class Utils { } // Check that all channels are present - def required_channels = ['conda-forge', 'bioconda', 'defaults'] - def conda_check_failed = !required_channels.every { ch -> ch in channels } + // This channel list is ordered by required channel priority. + def required_channels_in_order = ['conda-forge', 'bioconda', 'defaults'] + def channels_missing = ((required_channels_in_order as Set) - (channels as Set)) as Boolean // Check that they are in the right order - conda_check_failed |= !(channels.indexOf('conda-forge') < channels.indexOf('bioconda')) - conda_check_failed |= !(channels.indexOf('bioconda') < channels.indexOf('defaults')) + def channel_priority_violation = false + def n = required_channels_in_order.size() + for (int i = 0; i < n - 1; i++) { + channel_priority_violation |= !(channels.indexOf(required_channels_in_order[i]) < channels.indexOf(required_channels_in_order[i+1])) + } - if (conda_check_failed) { + if (channels_missing | channel_priority_violation) { log.warn "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" + " There is a problem with your Conda configuration!\n\n" + " You will need to set-up the conda-forge and bioconda channels correctly.\n" + - " Please refer to https://bioconda.github.io/user/install.html#set-up-channels\n" + - " NB: The order of the channels matters!\n" + + " Please refer to https://bioconda.github.io/\n" + + " The 
observed channel order is \n" + + " ${channels}\n" + + " but the following channel order is required:\n" + + " ${required_channels_in_order}\n" + "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" } } diff --git a/lib/WorkflowEnsemblrepeatdownload.groovy b/lib/WorkflowEnsemblrepeatdownload.groovy index 596925c..b42def0 100755 --- a/lib/WorkflowEnsemblrepeatdownload.groovy +++ b/lib/WorkflowEnsemblrepeatdownload.groovy @@ -2,6 +2,8 @@ // This file holds several functions specific to the workflow/ensemblrepeatdownload.nf in the sanger-tol/ensemblrepeatdownload pipeline // +import groovy.text.SimpleTemplateEngine + class WorkflowEnsemblrepeatdownload { // @@ -41,4 +43,21 @@ class WorkflowEnsemblrepeatdownload { yaml_file_text += "data: |\n" yaml_file_text += "${summary_section}" return yaml_file_text + } + + public static String methodsDescriptionText(run_workflow, mqc_methods_yaml) { + // Convert to a named map so can be used as with familar NXF ${workflow} variable syntax in the MultiQC YML file + def meta = [:] + meta.workflow = run_workflow.toMap() + meta["manifest_map"] = run_workflow.manifest.toMap() + + meta["doi_text"] = meta.manifest_map.doi ? "(doi: ${meta.manifest_map.doi})" : "" + meta["nodoi_text"] = meta.manifest_map.doi ? "": "
  • If available, make sure to update the text to include the Zenodo DOI of version of the pipeline used.
  • " + + def methods_text = mqc_methods_yaml.text + + def engine = new SimpleTemplateEngine() + def description_html = engine.createTemplate(methods_text).make(meta) + + return description_html }} diff --git a/modules.json b/modules.json index c67eb62..24edaba 100644 --- a/modules.json +++ b/modules.json @@ -2,20 +2,21 @@ "name": "sanger-tol/ensemblrepeatdownload", "homePage": "https://github.com/sanger-tol/ensemblrepeatdownload", "repos": { - "nf-core/modules": { - "git_url": "https://github.com/nf-core/modules.git", + "https://github.com/nf-core/modules.git": { "modules": { - "custom/dumpsoftwareversions": { - "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d", - "branch": "master" - }, - "fastqc": { - "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d", - "branch": "master" - }, - "multiqc": { - "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d", - "branch": "master" + "nf-core": { + "custom/dumpsoftwareversions": { + "branch": "master", + "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905" + }, + "fastqc": { + "branch": "master", + "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905" + }, + "multiqc": { + "branch": "master", + "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905" + } } } } diff --git a/modules/nf-core/modules/custom/dumpsoftwareversions/main.nf b/modules/nf-core/custom/dumpsoftwareversions/main.nf similarity index 79% rename from modules/nf-core/modules/custom/dumpsoftwareversions/main.nf rename to modules/nf-core/custom/dumpsoftwareversions/main.nf index 327d510..cebb6e0 100644 --- a/modules/nf-core/modules/custom/dumpsoftwareversions/main.nf +++ b/modules/nf-core/custom/dumpsoftwareversions/main.nf @@ -1,11 +1,11 @@ process CUSTOM_DUMPSOFTWAREVERSIONS { - label 'process_low' + label 'process_single' // Requires `pyyaml` which does not have a dedicated container but is in the MultiQC container - conda (params.enable_conda ? "bioconda::multiqc=1.11" : null) + conda (params.enable_conda ? 
'bioconda::multiqc=1.13' : null) container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/multiqc:1.11--pyhdfd78af_0' : - 'quay.io/biocontainers/multiqc:1.11--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/multiqc:1.13--pyhdfd78af_0' : + 'quay.io/biocontainers/multiqc:1.13--pyhdfd78af_0' }" input: path versions diff --git a/modules/nf-core/modules/custom/dumpsoftwareversions/meta.yml b/modules/nf-core/custom/dumpsoftwareversions/meta.yml similarity index 100% rename from modules/nf-core/modules/custom/dumpsoftwareversions/meta.yml rename to modules/nf-core/custom/dumpsoftwareversions/meta.yml diff --git a/modules/nf-core/modules/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py b/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py similarity index 100% rename from modules/nf-core/modules/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py rename to modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py diff --git a/modules/nf-core/modules/fastqc/main.nf b/modules/nf-core/fastqc/main.nf similarity index 85% rename from modules/nf-core/modules/fastqc/main.nf rename to modules/nf-core/fastqc/main.nf index ed6b8c5..0573036 100644 --- a/modules/nf-core/modules/fastqc/main.nf +++ b/modules/nf-core/fastqc/main.nf @@ -44,4 +44,16 @@ process FASTQC { END_VERSIONS """ } + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.html + touch ${prefix}.zip + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fastqc: \$( fastqc --version | sed -e "s/FastQC v//g" ) + END_VERSIONS + """ } diff --git a/modules/nf-core/modules/fastqc/meta.yml b/modules/nf-core/fastqc/meta.yml similarity index 100% rename from modules/nf-core/modules/fastqc/meta.yml rename to modules/nf-core/fastqc/meta.yml diff --git a/modules/nf-core/modules/multiqc/main.nf 
b/modules/nf-core/modules/multiqc/main.nf deleted file mode 100644 index 1264aac..0000000 --- a/modules/nf-core/modules/multiqc/main.nf +++ /dev/null @@ -1,31 +0,0 @@ -process MULTIQC { - label 'process_medium' - - conda (params.enable_conda ? 'bioconda::multiqc=1.12' : null) - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/multiqc:1.12--pyhdfd78af_0' : - 'quay.io/biocontainers/multiqc:1.12--pyhdfd78af_0' }" - - input: - path multiqc_files - - output: - path "*multiqc_report.html", emit: report - path "*_data" , emit: data - path "*_plots" , optional:true, emit: plots - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - """ - multiqc -f $args . - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - multiqc: \$( multiqc --version | sed -e "s/multiqc, version //g" ) - END_VERSIONS - """ -} diff --git a/modules/nf-core/multiqc/main.nf b/modules/nf-core/multiqc/main.nf new file mode 100644 index 0000000..a8159a5 --- /dev/null +++ b/modules/nf-core/multiqc/main.nf @@ -0,0 +1,53 @@ +process MULTIQC { + label 'process_single' + + conda (params.enable_conda ? 'bioconda::multiqc=1.13' : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/multiqc:1.13--pyhdfd78af_0' : + 'quay.io/biocontainers/multiqc:1.13--pyhdfd78af_0' }" + + input: + path multiqc_files, stageAs: "?/*" + path(multiqc_config) + path(extra_multiqc_config) + path(multiqc_logo) + + output: + path "*multiqc_report.html", emit: report + path "*_data" , emit: data + path "*_plots" , optional:true, emit: plots + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def config = multiqc_config ? 
"--config $multiqc_config" : '' + def extra_config = extra_multiqc_config ? "--config $extra_multiqc_config" : '' + """ + multiqc \\ + --force \\ + $args \\ + $config \\ + $extra_config \\ + . + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + multiqc: \$( multiqc --version | sed -e "s/multiqc, version //g" ) + END_VERSIONS + """ + + stub: + """ + touch multiqc_data + touch multiqc_plots + touch multiqc_report.html + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + multiqc: \$( multiqc --version | sed -e "s/multiqc, version //g" ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/modules/multiqc/meta.yml b/modules/nf-core/multiqc/meta.yml similarity index 73% rename from modules/nf-core/modules/multiqc/meta.yml rename to modules/nf-core/multiqc/meta.yml index 6fa891e..ebc29b2 100644 --- a/modules/nf-core/modules/multiqc/meta.yml +++ b/modules/nf-core/multiqc/meta.yml @@ -12,11 +12,25 @@ tools: homepage: https://multiqc.info/ documentation: https://multiqc.info/docs/ licence: ["GPL-3.0-or-later"] + input: - multiqc_files: type: file description: | List of reports / files recognised by MultiQC, for example the html and zip output of FastQC + - multiqc_config: + type: file + description: Optional config yml for MultiQC + pattern: "*.{yml,yaml}" + - extra_multiqc_config: + type: file + description: Second optional config yml for MultiQC. Will override common sections in multiqc_config. 
+ pattern: "*.{yml,yaml}" + - multiqc_logo: + type: file + description: Optional logo file for MultiQC + pattern: "*.{png}" + output: - report: type: file @@ -38,3 +52,4 @@ authors: - "@abhi18av" - "@bunop" - "@drpatelh" + - "@jfy133" diff --git a/nextflow.config b/nextflow.config index 9e9d452..74ae549 100644 --- a/nextflow.config +++ b/nextflow.config @@ -16,7 +16,9 @@ params { // MultiQC options multiqc_config = null multiqc_title = null + multiqc_logo = null max_multiqc_email_size = '25.MB' + multiqc_methods_description = null // Boilerplate options outdir = null @@ -26,6 +28,7 @@ params { email_on_fail = null plaintext_email = false monochrome_logs = false + hook_url = null help = false validate_params = true show_hidden_params = false @@ -69,7 +72,6 @@ try { // } - profiles { debug { process.beforeScript = 'echo $HOSTNAME' } conda { @@ -177,6 +179,7 @@ manifest { mainScript = 'main.nf' nextflowVersion = '!>=21.10.3' version = '1.0dev' + doi = '' } // Load modules.config for DSL2 module specific options diff --git a/nextflow_schema.json b/nextflow_schema.json index 12b2c59..e8b6f8b 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -182,12 +182,30 @@ "fa_icon": "fas fa-palette", "hidden": true }, + "hook_url": { + "type": "string", + "description": "Incoming hook URL for messaging service", + "fa_icon": "fas fa-people-group", + "help_text": "Incoming hook URL for messaging service. Currently, only MS Teams is supported.", + "hidden": true + }, "multiqc_config": { "type": "string", "description": "Custom config file to supply to MultiQC.", "fa_icon": "fas fa-cog", "hidden": true }, + "multiqc_logo": { + "type": "string", + "description": "Custom logo file to supply to MultiQC. 
File name must also be set in the MultiQC config file", + "fa_icon": "fas fa-image", + "hidden": true + }, + "multiqc_methods_description": { + "type": "string", + "description": "Custom MultiQC yaml file containing HTML including a methods description.", + "fa_icon": "fas fa-cog" + }, "tracedir": { "type": "string", "description": "Directory to keep pipeline Nextflow logs and reports.", diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..0d62beb --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,10 @@ +# Config file for Python. Mostly used to configure linting of bin/check_samplesheet.py with Black. +# Should be kept the same as nf-core/tools to avoid fighting with template synchronisation. +[tool.black] +line-length = 120 +target_version = ["py37", "py38", "py39", "py310"] + +[tool.isort] +profile = "black" +known_first_party = ["nf_core"] +multi_line_output = 3 diff --git a/workflows/ensemblrepeatdownload.nf b/workflows/ensemblrepeatdownload.nf index 1e696c7..986b08b 100644 --- a/workflows/ensemblrepeatdownload.nf +++ b/workflows/ensemblrepeatdownload.nf @@ -23,8 +23,10 @@ if (params.input) { ch_input = file(params.input) } else { exit 1, 'Input sample ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -ch_multiqc_config = file("$projectDir/assets/multiqc_config.yml", checkIfExists: true) -ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath(params.multiqc_config) : Channel.empty() +ch_multiqc_config = Channel.fromPath("$projectDir/assets/multiqc_config.yml", checkIfExists: true) +ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath( params.multiqc_config, checkIfExists: true ) : Channel.empty() +ch_multiqc_logo = params.multiqc_logo ? Channel.fromPath( params.multiqc_logo, checkIfExists: true ) : Channel.empty() +ch_multiqc_custom_methods_description = params.multiqc_methods_description ? 
file(params.multiqc_methods_description, checkIfExists: true) : file("$projectDir/assets/methods_description_template.yml", checkIfExists: true) /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -46,9 +48,9 @@ include { INPUT_CHECK } from '../subworkflows/local/input_check' // // MODULE: Installed directly from nf-core/modules // -include { FASTQC } from '../modules/nf-core/modules/fastqc/main' -include { MULTIQC } from '../modules/nf-core/modules/multiqc/main' -include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/modules/custom/dumpsoftwareversions/main' +include { FASTQC } from '../modules/nf-core/fastqc/main' +include { MULTIQC } from '../modules/nf-core/multiqc/main' +include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -89,15 +91,20 @@ workflow ENSEMBLREPEATDOWNLOAD { workflow_summary = WorkflowEnsemblrepeatdownload.paramsSummaryMultiqc(workflow, summary_params) ch_workflow_summary = Channel.value(workflow_summary) + methods_description = WorkflowEnsemblrepeatdownload.methodsDescriptionText(workflow, ch_multiqc_custom_methods_description) + ch_methods_description = Channel.value(methods_description) + ch_multiqc_files = Channel.empty() - ch_multiqc_files = ch_multiqc_files.mix(Channel.from(ch_multiqc_config)) - ch_multiqc_files = ch_multiqc_files.mix(ch_multiqc_custom_config.collect().ifEmpty([])) ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) + ch_multiqc_files = ch_multiqc_files.mix(ch_methods_description.collectFile(name: 'methods_description_mqc.yaml')) ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect()) ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([])) MULTIQC ( - ch_multiqc_files.collect() + ch_multiqc_files.collect(), + 
ch_multiqc_config.collect().ifEmpty([]), + ch_multiqc_custom_config.collect().ifEmpty([]), + ch_multiqc_logo.collect().ifEmpty([]) ) multiqc_report = MULTIQC.out.report.toList() ch_versions = ch_versions.mix(MULTIQC.out.versions) @@ -114,6 +121,9 @@ workflow.onComplete { NfcoreTemplate.email(workflow, params, summary_params, projectDir, log, multiqc_report) } NfcoreTemplate.summary(workflow, params, log) + if (params.hook_url) { + NfcoreTemplate.adaptivecard(workflow, params, summary_params, projectDir, log) + } } /* From 5321874ba458c69d55e061db88d7febd592daeb2 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Sun, 16 Jul 2023 10:09:30 +0100 Subject: [PATCH 27/72] Template update for nf-core/tools version 2.7 --- .devcontainer/devcontainer.json | 27 +++++ .gitattributes | 1 + .github/CONTRIBUTING.md | 16 +++ .github/ISSUE_TEMPLATE/bug_report.yml | 98 +++++++++--------- .github/workflows/ci.yml | 8 +- .github/workflows/fix-linting.yml | 6 +- .github/workflows/linting.yml | 18 ++-- .github/workflows/linting_comment.yml | 2 +- .nf-core.yml | 20 ++-- .prettierignore | 2 + CITATION.cff | 56 ----------- README.md | 4 +- assets/slackreport.json | 34 +++++++ docs/usage.md | 24 +++-- lib/NfcoreSchema.groovy | 1 - lib/NfcoreTemplate.groovy | 41 ++++++-- lib/WorkflowMain.groovy | 20 ++-- modules.json | 9 +- modules/local/samplesheet_check.nf | 6 +- .../custom/dumpsoftwareversions/main.nf | 2 +- .../templates/dumpsoftwareversions.py | 99 ++++++++++--------- modules/nf-core/fastqc/main.nf | 40 +++----- modules/nf-core/multiqc/main.nf | 2 +- nextflow.config | 15 +-- nextflow_schema.json | 30 ++++-- workflows/ensemblrepeatdownload.nf | 9 +- 26 files changed, 338 insertions(+), 252 deletions(-) create mode 100644 .devcontainer/devcontainer.json delete mode 100644 CITATION.cff create mode 100644 assets/slackreport.json mode change 100644 => 100755 modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py diff --git a/.devcontainer/devcontainer.json 
b/.devcontainer/devcontainer.json new file mode 100644 index 0000000..ea27a58 --- /dev/null +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,27 @@ +{ + "name": "nfcore", + "image": "nfcore/gitpod:latest", + "remoteUser": "gitpod", + + // Configure tool-specific properties. + "customizations": { + // Configure properties specific to VS Code. + "vscode": { + // Set *default* container specific settings.json values on container create. + "settings": { + "python.defaultInterpreterPath": "/opt/conda/bin/python", + "python.linting.enabled": true, + "python.linting.pylintEnabled": true, + "python.formatting.autopep8Path": "/opt/conda/bin/autopep8", + "python.formatting.yapfPath": "/opt/conda/bin/yapf", + "python.linting.flake8Path": "/opt/conda/bin/flake8", + "python.linting.pycodestylePath": "/opt/conda/bin/pycodestyle", + "python.linting.pydocstylePath": "/opt/conda/bin/pydocstyle", + "python.linting.pylintPath": "/opt/conda/bin/pylint" + }, + + // Add the IDs of extensions you want installed when the container is created. + "extensions": ["ms-python.python", "ms-python.vscode-pylance", "nf-core.nf-core-extensionpack"] + } + } +} diff --git a/.gitattributes b/.gitattributes index 050bb12..7a2dabc 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,3 +1,4 @@ *.config linguist-language=nextflow +*.nf.test linguist-language=nextflow modules/nf-core/** linguist-generated subworkflows/nf-core/** linguist-generated diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index 6a2b446..b2a7acb 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -95,3 +95,19 @@ If you are using a new feature from core Nextflow, you may bump the minimum requ ### Images and figures For overview images and other documents we follow the nf-core [style guidelines and examples](https://nf-co.re/developers/design_guidelines). + +## GitHub Codespaces + +This repo includes a devcontainer configuration which will create a GitHub Codespaces for Nextflow development! 
This is an online developer environment that runs in your browser, complete with VSCode and a terminal. + +To get started: + +- Open the repo in [Codespaces](https://github.com/sanger-tol/ensemblrepeatdownload/codespaces) +- Tools installed + - nf-core + - Nextflow + +Devcontainer specs: + +- [DevContainer config](.devcontainer/devcontainer.json) +- [Dockerfile](.devcontainer/Dockerfile) diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml index 28dcec8..e523572 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.yml +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -2,52 +2,52 @@ name: Bug report description: Report something that is broken or incorrect labels: bug body: - - type: textarea - id: description - attributes: - label: Description of the bug - description: A clear and concise description of what the bug is. - validations: - required: true - - type: textarea - id: command_used - attributes: - label: Command used and terminal output - description: Steps to reproduce the behaviour. Please paste the command you used - to launch the pipeline and the output from your terminal. - render: console - placeholder: "$ nextflow run ... - - - Some output where something broke - - " - - type: textarea - id: files - attributes: - label: Relevant files - description: "Please drag and drop the relevant files here. Create a `.zip` archive - if the extension is not allowed. - - Your verbose log file `.nextflow.log` is often useful _(this is a hidden file - in the directory where you launched the pipeline)_ as well as custom Nextflow - configuration files. - - " - - type: textarea - id: system - attributes: - label: System information - description: "* Nextflow version _(eg. 21.10.3)_ - - * Hardware _(eg. HPC, Desktop, Cloud)_ - - * Executor _(eg. slurm, local, awsbatch)_ - - * Container engine: _(e.g. Docker, Singularity, Conda, Podman, Shifter or Charliecloud)_ - - * OS _(eg. 
CentOS Linux, macOS, Linux Mint)_ - - * Version of sanger-tol/ensemblrepeatdownload _(eg. 1.1, 1.5, 1.8.2)_ - - " +- type: textarea + id: description + attributes: + label: Description of the bug + description: A clear and concise description of what the bug is. + validations: + required: true +- type: textarea + id: command_used + attributes: + label: Command used and terminal output + description: Steps to reproduce the behaviour. Please paste the command you used + to launch the pipeline and the output from your terminal. + render: console + placeholder: '$ nextflow run ... + + + Some output where something broke + + ' +- type: textarea + id: files + attributes: + label: Relevant files + description: 'Please drag and drop the relevant files here. Create a `.zip` archive + if the extension is not allowed. + + Your verbose log file `.nextflow.log` is often useful _(this is a hidden file + in the directory where you launched the pipeline)_ as well as custom Nextflow + configuration files. + + ' +- type: textarea + id: system + attributes: + label: System information + description: '* Nextflow version _(eg. 22.10.1)_ + + * Hardware _(eg. HPC, Desktop, Cloud)_ + + * Executor _(eg. slurm, local, awsbatch)_ + + * Container engine: _(e.g. Docker, Singularity, Conda, Podman, Shifter or Charliecloud)_ + + * OS _(eg. CentOS Linux, macOS, Linux Mint)_ + + * Version of sanger-tol/ensemblrepeatdownload _(eg. 
1.1, 1.5, 1.8.2)_ + + ' diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ee8fd64..81b2082 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -11,6 +11,10 @@ on: env: NXF_ANSI_LOG: false +concurrency: + group: "${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}" + cancel-in-progress: true + jobs: test: name: Run pipeline with test data @@ -20,11 +24,11 @@ jobs: strategy: matrix: NXF_VER: - - "21.10.3" + - "22.10.1" - "latest-everything" steps: - name: Check out pipeline code - uses: actions/checkout@v2 + uses: actions/checkout@v3 - name: Install Nextflow uses: nf-core/setup-nextflow@v1 diff --git a/.github/workflows/fix-linting.yml b/.github/workflows/fix-linting.yml index a29fcf2..e06cc54 100644 --- a/.github/workflows/fix-linting.yml +++ b/.github/workflows/fix-linting.yml @@ -24,7 +24,7 @@ jobs: env: GITHUB_TOKEN: ${{ secrets.nf_core_bot_auth_token }} - - uses: actions/setup-node@v2 + - uses: actions/setup-node@v3 - name: Install Prettier run: npm install -g prettier @prettier/plugin-php @@ -34,9 +34,9 @@ jobs: id: prettier_status run: | if prettier --check ${GITHUB_WORKSPACE}; then - echo "::set-output name=result::pass" + echo "result=pass" >> $GITHUB_OUTPUT else - echo "::set-output name=result::fail" + echo "result=fail" >> $GITHUB_OUTPUT fi - name: Run 'prettier --write' diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 8a5ce69..858d622 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -4,6 +4,8 @@ name: nf-core linting # that the code meets the nf-core guidelines. 
on: push: + branches: + - dev pull_request: release: types: [published] @@ -12,9 +14,9 @@ jobs: EditorConfig: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - - uses: actions/setup-node@v2 + - uses: actions/setup-node@v3 - name: Install editorconfig-checker run: npm install -g editorconfig-checker @@ -25,9 +27,9 @@ jobs: Prettier: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - - uses: actions/setup-node@v2 + - uses: actions/setup-node@v3 - name: Install Prettier run: npm install -g prettier @@ -38,7 +40,7 @@ jobs: PythonBlack: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Check code lints with Black uses: psf/black@stable @@ -69,12 +71,12 @@ jobs: runs-on: ubuntu-latest steps: - name: Check out pipeline code - uses: actions/checkout@v2 + uses: actions/checkout@v3 - name: Install Nextflow uses: nf-core/setup-nextflow@v1 - - uses: actions/setup-python@v3 + - uses: actions/setup-python@v4 with: python-version: "3.7" architecture: "x64" @@ -97,7 +99,7 @@ jobs: - name: Upload linting log file artifact if: ${{ always() }} - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v3 with: name: linting-logs path: | diff --git a/.github/workflows/linting_comment.yml b/.github/workflows/linting_comment.yml index 04758f6..0bbcd30 100644 --- a/.github/workflows/linting_comment.yml +++ b/.github/workflows/linting_comment.yml @@ -18,7 +18,7 @@ jobs: - name: Get PR number id: pr_number - run: echo "::set-output name=pr_number::$(cat linting-logs/PR_number.txt)" + run: echo "pr_number=$(cat linting-logs/PR_number.txt)" >> $GITHUB_OUTPUT - name: Post PR comment uses: marocchino/sticky-pull-request-comment@v2 diff --git a/.nf-core.yml b/.nf-core.yml index a948ca8..1f21935 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -1,15 +1,15 @@ repository_type: pipeline lint: files_exist: - - CODE_OF_CONDUCT.md - - 
assets/nf-core-ensemblrepeatdownload_logo_light.png - - docs/images/nf-core-ensemblrepeatdownload_logo_light.png - - docs/images/nf-core-ensemblrepeatdownload_logo_dark.png - - .github/ISSUE_TEMPLATE/config.yml - - .github/workflows/awstest.yml - - .github/workflows/awsfulltest.yml + - CODE_OF_CONDUCT.md + - assets/nf-core-ensemblrepeatdownload_logo_light.png + - docs/images/nf-core-ensemblrepeatdownload_logo_light.png + - docs/images/nf-core-ensemblrepeatdownload_logo_dark.png + - .github/ISSUE_TEMPLATE/config.yml + - .github/workflows/awstest.yml + - .github/workflows/awsfulltest.yml nextflow_config: - - manifest.name - - manifest.homePage + - manifest.name + - manifest.homePage multiqc_config: - - report_comment + - report_comment diff --git a/.prettierignore b/.prettierignore index eb74a57..437d763 100644 --- a/.prettierignore +++ b/.prettierignore @@ -1,5 +1,6 @@ email_template.html adaptivecard.json +slackreport.json .nextflow* work/ data/ @@ -8,3 +9,4 @@ results/ testing/ testing* *.pyc +bin/ diff --git a/CITATION.cff b/CITATION.cff deleted file mode 100644 index 017666c..0000000 --- a/CITATION.cff +++ /dev/null @@ -1,56 +0,0 @@ -cff-version: 1.2.0 -message: "If you use `nf-core tools` in your work, please cite the `nf-core` publication" -authors: - - family-names: Ewels - given-names: Philip - - family-names: Peltzer - given-names: Alexander - - family-names: Fillinger - given-names: Sven - - family-names: Patel - given-names: Harshil - - family-names: Alneberg - given-names: Johannes - - family-names: Wilm - given-names: Andreas - - family-names: Garcia - given-names: Maxime Ulysse - - family-names: Di Tommaso - given-names: Paolo - - family-names: Nahnsen - given-names: Sven -title: "The nf-core framework for community-curated bioinformatics pipelines." 
-version: 2.4.1 -doi: 10.1038/s41587-020-0439-x -date-released: 2022-05-16 -url: https://github.com/nf-core/tools -prefered-citation: - type: article - authors: - - family-names: Ewels - given-names: Philip - - family-names: Peltzer - given-names: Alexander - - family-names: Fillinger - given-names: Sven - - family-names: Patel - given-names: Harshil - - family-names: Alneberg - given-names: Johannes - - family-names: Wilm - given-names: Andreas - - family-names: Garcia - given-names: Maxime Ulysse - - family-names: Di Tommaso - given-names: Paolo - - family-names: Nahnsen - given-names: Sven - doi: 10.1038/s41587-020-0439-x - journal: nature biotechnology - start: 276 - end: 278 - title: "The nf-core framework for community-curated bioinformatics pipelines." - issue: 3 - volume: 38 - year: 2020 - url: https://dx.doi.org/10.1038/s41587-020-0439-x diff --git a/README.md b/README.md index 3e1a7b3..8b7f640 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ [![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.XXXXXXX-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.XXXXXXX) -[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A521.10.3-23aa62.svg)](https://www.nextflow.io/) +[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A522.10.1-23aa62.svg)](https://www.nextflow.io/) [![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/) [![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/) [![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/) @@ -27,7 +27,7 @@ On release, automated continuous integration tests run the pipeline on a full-si ## Quick Start -1. Install [`Nextflow`](https://www.nextflow.io/docs/latest/getstarted.html#installation) (`>=21.10.3`) +1. 
Install [`Nextflow`](https://www.nextflow.io/docs/latest/getstarted.html#installation) (`>=22.10.1`) 2. Install any of [`Docker`](https://docs.docker.com/engine/installation/), [`Singularity`](https://www.sylabs.io/guides/3.0/user-guide/) (you can follow [this tutorial](https://singularity-tutorial.github.io/01-installation/)), [`Podman`](https://podman.io/), [`Shifter`](https://nersc.gitlab.io/development/shifter/how-to-use/) or [`Charliecloud`](https://hpc.github.io/charliecloud/) for full pipeline reproducibility _(you can use [`Conda`](https://conda.io/miniconda.html) both to install Nextflow itself and also to manage software within pipelines. Please only use it within pipelines as a last resort; see [docs](https://nf-co.re/usage/configuration#basic-configuration-profiles))_. diff --git a/assets/slackreport.json b/assets/slackreport.json new file mode 100644 index 0000000..043d02f --- /dev/null +++ b/assets/slackreport.json @@ -0,0 +1,34 @@ +{ + "attachments": [ + { + "fallback": "Plain-text summary of the attachment.", + "color": "<% if (success) { %>good<% } else { %>danger<%} %>", + "author_name": "sanger-tol/readmapping v${version} - ${runName}", + "author_icon": "https://www.nextflow.io/docs/latest/_static/favicon.ico", + "text": "<% if (success) { %>Pipeline completed successfully!<% } else { %>Pipeline completed with errors<% } %>", + "fields": [ + { + "title": "Command used to launch the workflow", + "value": "```${commandLine}```", + "short": false + } + <% + if (!success) { %> + , + { + "title": "Full error message", + "value": "```${errorReport}```", + "short": false + }, + { + "title": "Pipeline configuration", + "value": "<% out << summary.collect{ k,v -> k == "hook_url" ? "_${k}_: (_hidden_)" : ( ( v.class.toString().contains('Path') || ( v.class.toString().contains('String') && v.contains('/') ) ) ? "_${k}_: `${v}`" : (v.class.toString().contains('DateTime') ? 
("_${k}_: " + v.format(java.time.format.DateTimeFormatter.ofLocalizedDateTime(java.time.format.FormatStyle.MEDIUM))) : "_${k}_: ${v}") ) }.join(",\n") %>", + "short": false + } + <% } + %> + ], + "footer": "Completed at <% out << dateComplete.format(java.time.format.DateTimeFormatter.ofLocalizedDateTime(java.time.format.FormatStyle.MEDIUM)) %> (duration: ${duration})" + } + ] +} diff --git a/docs/usage.md b/docs/usage.md index 1c82c04..51ab651 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -81,9 +81,9 @@ nextflow pull sanger-tol/ensemblrepeatdownload It is a good idea to specify a pipeline version when running the pipeline on your data. This ensures that a specific version of the pipeline code and software are used when you run your pipeline. If you keep using the same tag, you'll be running the same version of the pipeline, even if there have been changes to the code since. -First, go to the [sanger-tol/ensemblrepeatdownload releases page](https://github.com/sanger-tol/ensemblrepeatdownload/releases) and find the latest version number - numeric only (eg. `1.3.1`). Then specify this when running the pipeline with `-r` (one hyphen) - eg. `-r 1.3.1`. +First, go to the [sanger-tol/ensemblrepeatdownload releases page](https://github.com/sanger-tol/ensemblrepeatdownload/releases) and find the latest pipeline version - numeric only (eg. `1.3.1`). Then specify this when running the pipeline with `-r` (one hyphen) - eg. `-r 1.3.1`. Of course, you can switch to another version by changing the number after the `-r` flag. -This version number will be logged in reports when you run the pipeline, so that you'll know what you used when you look back in the future. +This version number will be logged in reports when you run the pipeline, so that you'll know what you used when you look back in the future. For example, at the bottom of the MultiQC reports. 
## Core Nextflow arguments @@ -93,7 +93,7 @@ This version number will be logged in reports when you run the pipeline, so that Use this parameter to choose a configuration profile. Profiles can give configuration presets for different compute environments. -Several generic profiles are bundled with the pipeline which instruct the pipeline to use software packaged using different methods (Docker, Singularity, Podman, Shifter, Charliecloud, Conda) - see below. When using Biocontainers, most of these software packaging methods pull Docker containers from quay.io e.g [FastQC](https://quay.io/repository/biocontainers/fastqc) except for Singularity which directly downloads Singularity images via https hosted by the [Galaxy project](https://depot.galaxyproject.org/singularity/) and Conda which downloads and installs software locally from [Bioconda](https://bioconda.github.io/). +Several generic profiles are bundled with the pipeline which instruct the pipeline to use software packaged using different methods (Docker, Singularity, Podman, Shifter, Charliecloud, Conda) - see below. > We highly recommend the use of Docker or Singularity containers for full pipeline reproducibility, however when this is not possible, Conda is also supported. @@ -102,8 +102,11 @@ The pipeline also dynamically loads configurations from [https://github.com/nf-c Note that multiple profiles can be loaded, for example: `-profile test,docker` - the order of arguments is important! They are loaded in sequence, so later profiles can overwrite earlier profiles. -If `-profile` is not specified, the pipeline will run locally and expect all software to be installed and available on the `PATH`. This is _not_ recommended. +If `-profile` is not specified, the pipeline will run locally and expect all software to be installed and available on the `PATH`. This is _not_ recommended, since it can lead to different results on different machines dependent on the computer enviroment. 
+- `test` + - A profile with a complete configuration for automated testing + - Includes links to test data so needs no other parameters - `docker` - A generic configuration profile to be used with [Docker](https://docker.com/) - `singularity` @@ -116,9 +119,6 @@ If `-profile` is not specified, the pipeline will run locally and expect all sof - A generic configuration profile to be used with [Charliecloud](https://hpc.github.io/charliecloud/) - `conda` - A generic configuration profile to be used with [Conda](https://conda.io/docs/). Please only use Conda as a last resort i.e. when it's not possible to run the pipeline with Docker, Singularity, Podman, Shifter or Charliecloud. -- `test` - - A profile with a complete configuration for automated testing - - Includes links to test data so needs no other parameters ### `-resume` @@ -167,8 +167,14 @@ Work dir: Tip: you can replicate the issue by changing to the process work dir and entering the command `bash .command.run` ``` +#### For beginners + +A first step to bypass this error, you could try to increase the amount of CPUs, memory, and time for the whole pipeline. Therefor you can try to increase the resource for the parameters `--max_cpus`, `--max_memory`, and `--max_time`. Based on the error above, you have to increase the amount of memory. Therefore you can go to the [parameter documentation of rnaseq](https://nf-co.re/rnaseq/3.9/parameters) and scroll down to the `show hidden parameter` button to get the default value for `--max_memory`. In this case 128GB, you than can try to run your pipeline again with `--max_memory 200GB -resume` to skip all process, that were already calculated. If you can not increase the resource of the complete pipeline, you can try to adapt the resource for a single process as mentioned below. + +#### Advanced option on process level + To bypass this error you would need to find exactly which resources are set by the `STAR_ALIGN` process. 
The quickest way is to search for `process STAR_ALIGN` in the [nf-core/rnaseq Github repo](https://github.com/nf-core/rnaseq/search?q=process+STAR_ALIGN). -We have standardised the structure of Nextflow DSL2 pipelines such that all module files will be present in the `modules/` directory and so, based on the search results, the file we want is `modules/nf-core/software/star/align/main.nf`. +We have standardised the structure of Nextflow DSL2 pipelines such that all module files will be present in the `modules/` directory and so, based on the search results, the file we want is `modules/nf-core/star/align/main.nf`. If you click on the link to that file you will notice that there is a `label` directive at the top of the module that is set to [`label process_high`](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/modules/nf-core/software/star/align/main.nf#L9). The [Nextflow `label`](https://www.nextflow.io/docs/latest/process.html#label) directive allows us to organise workflow processes in separate groups which can be referenced in a configuration file to select and configure subset of processes having similar computing requirements. The default values for the `process_high` label are set in the pipeline's [`base.config`](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/conf/base.config#L33-L37) which in this case is defined as 72GB. @@ -187,7 +193,7 @@ process { > > If you get a warning suggesting that the process selector isn't recognised check that the process name has been specified correctly. -### Updating containers +### Updating containers (advanced users) The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. 
If for some reason you need to use a different version of a particular tool with the pipeline then you just need to identify the `process` name and override the Nextflow `container` definition for that process using the `withName` declaration. For example, in the [nf-core/viralrecon](https://nf-co.re/viralrecon) pipeline a tool called [Pangolin](https://github.com/cov-lineages/pangolin) has been used during the COVID-19 pandemic to assign lineages to SARS-CoV-2 genome sequenced samples. Given that the lineage assignments change quite frequently it doesn't make sense to re-release the nf-core/viralrecon everytime a new version of Pangolin has been released. However, you can override the default container used by the pipeline by creating a custom config file and passing it as a command-line argument via `-c custom.config`. diff --git a/lib/NfcoreSchema.groovy b/lib/NfcoreSchema.groovy index b3d092f..33cd4f6 100755 --- a/lib/NfcoreSchema.groovy +++ b/lib/NfcoreSchema.groovy @@ -46,7 +46,6 @@ class NfcoreSchema { 'quiet', 'syslog', 'v', - 'version', // Options for `nextflow run` command 'ansi', diff --git a/lib/NfcoreTemplate.groovy b/lib/NfcoreTemplate.groovy index 45992fa..2023f45 100755 --- a/lib/NfcoreTemplate.groovy +++ b/lib/NfcoreTemplate.groovy @@ -32,6 +32,25 @@ class NfcoreTemplate { } } + // + // Generate version string + // + public static String version(workflow) { + String version_string = "" + + if (workflow.manifest.version) { + def prefix_v = workflow.manifest.version[0] != 'v' ? 
'v' : '' + version_string += "${prefix_v}${workflow.manifest.version}" + } + + if (workflow.commitId) { + def git_shortsha = workflow.commitId.substring(0, 7) + version_string += "-g${git_shortsha}" + } + + return version_string + } + // // Construct and send completion email // @@ -61,7 +80,7 @@ class NfcoreTemplate { misc_fields['Nextflow Compile Timestamp'] = workflow.nextflow.timestamp def email_fields = [:] - email_fields['version'] = workflow.manifest.version + email_fields['version'] = NfcoreTemplate.version(workflow) email_fields['runName'] = workflow.runName email_fields['success'] = workflow.success email_fields['dateComplete'] = workflow.complete @@ -146,10 +165,10 @@ class NfcoreTemplate { } // - // Construct and send adaptive card - // https://adaptivecards.io + // Construct and send a notification to a web server as JSON + // e.g. Microsoft Teams and Slack // - public static void adaptivecard(workflow, params, summary_params, projectDir, log) { + public static void IM_notification(workflow, params, summary_params, projectDir, log) { def hook_url = params.hook_url def summary = [:] @@ -170,7 +189,7 @@ class NfcoreTemplate { misc_fields['nxf_timestamp'] = workflow.nextflow.timestamp def msg_fields = [:] - msg_fields['version'] = workflow.manifest.version + msg_fields['version'] = NfcoreTemplate.version(workflow) msg_fields['runName'] = workflow.runName msg_fields['success'] = workflow.success msg_fields['dateComplete'] = workflow.complete @@ -178,13 +197,16 @@ class NfcoreTemplate { msg_fields['exitStatus'] = workflow.exitStatus msg_fields['errorMessage'] = (workflow.errorMessage ?: 'None') msg_fields['errorReport'] = (workflow.errorReport ?: 'None') - msg_fields['commandLine'] = workflow.commandLine + msg_fields['commandLine'] = workflow.commandLine.replaceFirst(/ +--hook_url +[^ ]+/, "") msg_fields['projectDir'] = workflow.projectDir msg_fields['summary'] = summary << misc_fields // Render the JSON template def engine = new 
groovy.text.GStringTemplateEngine() - def hf = new File("$projectDir/assets/adaptivecard.json") + // Different JSON depending on the service provider + // Defaults to "Adaptive Cards" (https://adaptivecards.io), except Slack which has its own format + def json_path = hook_url.contains("hooks.slack.com") ? "slackreport.json" : "adaptivecard.json" + def hf = new File("$projectDir/assets/${json_path}") def json_template = engine.createTemplate(hf).make(msg_fields) def json_message = json_template.toString() @@ -209,7 +231,7 @@ class NfcoreTemplate { if (workflow.stats.ignoredCount == 0) { log.info "-${colors.purple}[$workflow.manifest.name]${colors.green} Pipeline completed successfully${colors.reset}-" } else { - log.info "-${colors.purple}[$workflow.manifest.name]${colors.red} Pipeline completed successfully, but with errored process(es) ${colors.reset}-" + log.info "-${colors.purple}[$workflow.manifest.name]${colors.yellow} Pipeline completed successfully, but with errored process(es) ${colors.reset}-" } } else { log.info "-${colors.purple}[$workflow.manifest.name]${colors.red} Pipeline completed with errors${colors.reset}-" @@ -297,10 +319,11 @@ class NfcoreTemplate { // public static String logo(workflow, monochrome_logs) { Map colors = logColours(monochrome_logs) + String workflow_version = NfcoreTemplate.version(workflow) String.format( """\n ${dashedLine(monochrome_logs)} - ${colors.purple} ${workflow.manifest.name} v${workflow.manifest.version}${colors.reset} + ${colors.purple} ${workflow.manifest.name} ${workflow_version}${colors.reset} ${dashedLine(monochrome_logs)} """.stripIndent() ) diff --git a/lib/WorkflowMain.groovy b/lib/WorkflowMain.groovy index 284a160..1b3a42f 100755 --- a/lib/WorkflowMain.groovy +++ b/lib/WorkflowMain.groovy @@ -19,7 +19,7 @@ class WorkflowMain { } // - // Print help to screen if required + // Generate help string // public static String help(workflow, params, log) { def command = "nextflow run ${workflow.manifest.name} --input 
samplesheet.csv --fasta reference.fa -profile docker" @@ -32,7 +32,7 @@ class WorkflowMain { } // - // Print parameter summary log to screen + // Generate parameter summary log string // public static String paramsSummaryLog(workflow, params, log) { def summary_log = '' @@ -53,20 +53,26 @@ class WorkflowMain { System.exit(0) } - // Validate workflow parameters via the JSON schema - if (params.validate_params) { - NfcoreSchema.validateParameters(workflow, params, log) + // Print workflow version and exit on --version + if (params.version) { + String workflow_version = NfcoreTemplate.version(workflow) + log.info "${workflow.manifest.name} ${workflow_version}" + System.exit(0) } // Print parameter summary log to screen - log.info paramsSummaryLog(workflow, params, log) + // Validate workflow parameters via the JSON schema + if (params.validate_params) { + NfcoreSchema.validateParameters(workflow, params, log) + } + // Check that a -profile or Nextflow config has been provided to run the pipeline NfcoreTemplate.checkConfigProvided(workflow, log) // Check that conda channels are set-up correctly - if (params.enable_conda) { + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { Utils.checkCondaChannels(log) } diff --git a/modules.json b/modules.json index 24edaba..888a59f 100644 --- a/modules.json +++ b/modules.json @@ -7,15 +7,18 @@ "nf-core": { "custom/dumpsoftwareversions": { "branch": "master", - "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905" + "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "installed_by": ["modules"] }, "fastqc": { "branch": "master", - "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905" + "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "installed_by": ["modules"] }, "multiqc": { "branch": "master", - "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905" + "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "installed_by": ["modules"] } } } diff --git a/modules/local/samplesheet_check.nf 
b/modules/local/samplesheet_check.nf index 3e3e08e..9619c23 100644 --- a/modules/local/samplesheet_check.nf +++ b/modules/local/samplesheet_check.nf @@ -1,7 +1,8 @@ process SAMPLESHEET_CHECK { tag "$samplesheet" + label 'process_single' - conda (params.enable_conda ? "conda-forge::python=3.8.3" : null) + conda "conda-forge::python=3.8.3" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/python:3.8.3' : 'quay.io/biocontainers/python:3.8.3' }" @@ -13,6 +14,9 @@ process SAMPLESHEET_CHECK { path '*.csv' , emit: csv path "versions.yml", emit: versions + when: + task.ext.when == null || task.ext.when + script: // This script is bundled with the pipeline, in sanger-tol/ensemblrepeatdownload/bin/ """ check_samplesheet.py \\ diff --git a/modules/nf-core/custom/dumpsoftwareversions/main.nf b/modules/nf-core/custom/dumpsoftwareversions/main.nf index cebb6e0..3df2176 100644 --- a/modules/nf-core/custom/dumpsoftwareversions/main.nf +++ b/modules/nf-core/custom/dumpsoftwareversions/main.nf @@ -2,7 +2,7 @@ process CUSTOM_DUMPSOFTWAREVERSIONS { label 'process_single' // Requires `pyyaml` which does not have a dedicated container but is in the MultiQC container - conda (params.enable_conda ? 'bioconda::multiqc=1.13' : null) + conda "bioconda::multiqc=1.13" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/multiqc:1.13--pyhdfd78af_0' : 'quay.io/biocontainers/multiqc:1.13--pyhdfd78af_0' }" diff --git a/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py b/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py old mode 100644 new mode 100755 index 787bdb7..e55b8d4 --- a/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py +++ b/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py @@ -1,5 +1,9 @@ #!/usr/bin/env python + +"""Provide functions to merge multiple versions.yml files.""" + + import platform from textwrap import dedent @@ -7,6 +11,7 @@ def _make_versions_html(versions): + """Generate a tabular HTML output of all versions for MultiQC.""" html = [ dedent( """\\ @@ -45,47 +50,53 @@ def _make_versions_html(versions): return "\\n".join(html) -versions_this_module = {} -versions_this_module["${task.process}"] = { - "python": platform.python_version(), - "yaml": yaml.__version__, -} - -with open("$versions") as f: - versions_by_process = yaml.load(f, Loader=yaml.BaseLoader) | versions_this_module - -# aggregate versions by the module name (derived from fully-qualified process name) -versions_by_module = {} -for process, process_versions in versions_by_process.items(): - module = process.split(":")[-1] - try: - if versions_by_module[module] != process_versions: - raise AssertionError( - "We assume that software versions are the same between all modules. " - "If you see this error-message it means you discovered an edge-case " - "and should open an issue in nf-core/tools. 
" - ) - except KeyError: - versions_by_module[module] = process_versions - -versions_by_module["Workflow"] = { - "Nextflow": "$workflow.nextflow.version", - "$workflow.manifest.name": "$workflow.manifest.version", -} - -versions_mqc = { - "id": "software_versions", - "section_name": "${workflow.manifest.name} Software Versions", - "section_href": "https://github.com/${workflow.manifest.name}", - "plot_type": "html", - "description": "are collected at run time from the software output.", - "data": _make_versions_html(versions_by_module), -} - -with open("software_versions.yml", "w") as f: - yaml.dump(versions_by_module, f, default_flow_style=False) -with open("software_versions_mqc.yml", "w") as f: - yaml.dump(versions_mqc, f, default_flow_style=False) - -with open("versions.yml", "w") as f: - yaml.dump(versions_this_module, f, default_flow_style=False) +def main(): + """Load all version files and generate merged output.""" + versions_this_module = {} + versions_this_module["${task.process}"] = { + "python": platform.python_version(), + "yaml": yaml.__version__, + } + + with open("$versions") as f: + versions_by_process = yaml.load(f, Loader=yaml.BaseLoader) | versions_this_module + + # aggregate versions by the module name (derived from fully-qualified process name) + versions_by_module = {} + for process, process_versions in versions_by_process.items(): + module = process.split(":")[-1] + try: + if versions_by_module[module] != process_versions: + raise AssertionError( + "We assume that software versions are the same between all modules. " + "If you see this error-message it means you discovered an edge-case " + "and should open an issue in nf-core/tools. 
" + ) + except KeyError: + versions_by_module[module] = process_versions + + versions_by_module["Workflow"] = { + "Nextflow": "$workflow.nextflow.version", + "$workflow.manifest.name": "$workflow.manifest.version", + } + + versions_mqc = { + "id": "software_versions", + "section_name": "${workflow.manifest.name} Software Versions", + "section_href": "https://github.com/${workflow.manifest.name}", + "plot_type": "html", + "description": "are collected at run time from the software output.", + "data": _make_versions_html(versions_by_module), + } + + with open("software_versions.yml", "w") as f: + yaml.dump(versions_by_module, f, default_flow_style=False) + with open("software_versions_mqc.yml", "w") as f: + yaml.dump(versions_mqc, f, default_flow_style=False) + + with open("versions.yml", "w") as f: + yaml.dump(versions_this_module, f, default_flow_style=False) + + +if __name__ == "__main__": + main() diff --git a/modules/nf-core/fastqc/main.nf b/modules/nf-core/fastqc/main.nf index 0573036..9ae5838 100644 --- a/modules/nf-core/fastqc/main.nf +++ b/modules/nf-core/fastqc/main.nf @@ -2,7 +2,7 @@ process FASTQC { tag "$meta.id" label 'process_medium' - conda (params.enable_conda ? "bioconda::fastqc=0.11.9" : null) + conda "bioconda::fastqc=0.11.9" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/fastqc:0.11.9--0' : 'quay.io/biocontainers/fastqc:0.11.9--0' }" @@ -20,30 +20,22 @@ process FASTQC { script: def args = task.ext.args ?: '' - // Add soft-links to original FastQs for consistent naming in pipeline def prefix = task.ext.prefix ?: "${meta.id}" - if (meta.single_end) { - """ - [ ! -f ${prefix}.fastq.gz ] && ln -s $reads ${prefix}.fastq.gz - fastqc $args --threads $task.cpus ${prefix}.fastq.gz - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - fastqc: \$( fastqc --version | sed -e "s/FastQC v//g" ) - END_VERSIONS - """ - } else { - """ - [ ! 
-f ${prefix}_1.fastq.gz ] && ln -s ${reads[0]} ${prefix}_1.fastq.gz - [ ! -f ${prefix}_2.fastq.gz ] && ln -s ${reads[1]} ${prefix}_2.fastq.gz - fastqc $args --threads $task.cpus ${prefix}_1.fastq.gz ${prefix}_2.fastq.gz - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - fastqc: \$( fastqc --version | sed -e "s/FastQC v//g" ) - END_VERSIONS - """ - } + // Make list of old name and new name pairs to use for renaming in the bash while loop + def old_new_pairs = reads instanceof Path || reads.size() == 1 ? [[ reads, "${prefix}.${reads.extension}" ]] : reads.withIndex().collect { entry, index -> [ entry, "${prefix}_${index + 1}.${entry.extension}" ] } + def rename_to = old_new_pairs*.join(' ').join(' ') + def renamed_files = old_new_pairs.collect{ old_name, new_name -> new_name }.join(' ') + """ + printf "%s %s\\n" $rename_to | while read old_name new_name; do + [ -f "\${new_name}" ] || ln -s \$old_name \$new_name + done + fastqc $args --threads $task.cpus $renamed_files + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fastqc: \$( fastqc --version | sed -e "s/FastQC v//g" ) + END_VERSIONS + """ stub: def prefix = task.ext.prefix ?: "${meta.id}" diff --git a/modules/nf-core/multiqc/main.nf b/modules/nf-core/multiqc/main.nf index a8159a5..68f66be 100644 --- a/modules/nf-core/multiqc/main.nf +++ b/modules/nf-core/multiqc/main.nf @@ -1,7 +1,7 @@ process MULTIQC { label 'process_single' - conda (params.enable_conda ? 'bioconda::multiqc=1.13' : null) + conda "bioconda::multiqc=1.13" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/multiqc:1.13--pyhdfd78af_0' : 'quay.io/biocontainers/multiqc:1.13--pyhdfd78af_0' }" diff --git a/nextflow.config b/nextflow.config index 74ae549..a09fd4b 100644 --- a/nextflow.config +++ b/nextflow.config @@ -30,10 +30,10 @@ params { monochrome_logs = false hook_url = null help = false + version = false validate_params = true show_hidden_params = false schema_ignore_params = 'genomes' - enable_conda = false // Config options @@ -75,7 +75,7 @@ try { profiles { debug { process.beforeScript = 'echo $HOSTNAME' } conda { - params.enable_conda = true + conda.enabled = true docker.enabled = false singularity.enabled = false podman.enabled = false @@ -83,7 +83,7 @@ profiles { charliecloud.enabled = false } mamba { - params.enable_conda = true + conda.enabled = true conda.useMamba = true docker.enabled = false singularity.enabled = false @@ -99,6 +99,9 @@ profiles { shifter.enabled = false charliecloud.enabled = false } + arm { + docker.runOptions = '-u $(id -u):$(id -g) --platform=linux/amd64' + } singularity { singularity.enabled = true singularity.autoMounts = true @@ -173,11 +176,11 @@ dag { manifest { name = 'sanger-tol/ensemblrepeatdownload' - author = '@muffato' + author = """@muffato""" homePage = 'https://github.com/sanger-tol/ensemblrepeatdownload' - description = 'Pipeline to download annotations from Ensembl onto the Tree of Life directory structure' + description = """Pipeline to download annotations from Ensembl onto the Tree of Life directory structure""" mainScript = 'main.nf' - nextflowVersion = '!>=21.10.3' + nextflowVersion = '!>=22.10.1' version = '1.0dev' doi = '' } diff --git a/nextflow_schema.json b/nextflow_schema.json index e8b6f8b..5bf347d 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,7 +10,10 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": ["input", "outdir"], + "required": [ + 
"input", + "outdir" + ], "properties": { "input": { "type": "string", @@ -145,13 +148,26 @@ "fa_icon": "fas fa-question-circle", "hidden": true }, + "version": { + "type": "boolean", + "description": "Display version and exit.", + "fa_icon": "fas fa-question-circle", + "hidden": true + }, "publish_dir_mode": { "type": "string", "default": "copy", "description": "Method used to save pipeline results to output directory.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], + "enum": [ + "symlink", + "rellink", + "link", + "copy", + "copyNoFollow", + "move" + ], "hidden": true }, "email_on_fail": { @@ -186,7 +202,7 @@ "type": "string", "description": "Incoming hook URL for messaging service", "fa_icon": "fas fa-people-group", - "help_text": "Incoming hook URL for messaging service. Currently, only MS Teams is supported.", + "help_text": "Incoming hook URL for messaging service. Currently, MS Teams and Slack are supported.", "hidden": true }, "multiqc_config": { @@ -226,12 +242,6 @@ "description": "Show all params when using `--help`", "hidden": true, "help_text": "By default, parameters set as _hidden_ in the schema are not shown on the command line when a user runs with `--help`. Specifying this option will tell the pipeline to show all parameters." - }, - "enable_conda": { - "type": "boolean", - "description": "Run this workflow with Conda. 
You can also use '-profile conda' instead of providing this parameter.", - "hidden": true, - "fa_icon": "fas fa-bacon" } } } @@ -253,4 +263,4 @@ "$ref": "#/definitions/generic_options" } ] -} +} \ No newline at end of file diff --git a/workflows/ensemblrepeatdownload.nf b/workflows/ensemblrepeatdownload.nf index 986b08b..f4daa03 100644 --- a/workflows/ensemblrepeatdownload.nf +++ b/workflows/ensemblrepeatdownload.nf @@ -102,12 +102,11 @@ workflow ENSEMBLREPEATDOWNLOAD { MULTIQC ( ch_multiqc_files.collect(), - ch_multiqc_config.collect().ifEmpty([]), - ch_multiqc_custom_config.collect().ifEmpty([]), - ch_multiqc_logo.collect().ifEmpty([]) + ch_multiqc_config.toList(), + ch_multiqc_custom_config.toList(), + ch_multiqc_logo.toList() ) multiqc_report = MULTIQC.out.report.toList() - ch_versions = ch_versions.mix(MULTIQC.out.versions) } /* @@ -122,7 +121,7 @@ workflow.onComplete { } NfcoreTemplate.summary(workflow, params, log) if (params.hook_url) { - NfcoreTemplate.adaptivecard(workflow, params, summary_params, projectDir, log) + NfcoreTemplate.IM_notification(workflow, params, summary_params, projectDir, log) } } From 5f4a6d1bf4cd173dea30c795c4cb92b85ebde312 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Sun, 16 Jul 2023 10:33:33 +0100 Subject: [PATCH 28/72] Template update for nf-core/tools version 2.8 --- .editorconfig | 2 +- .github/ISSUE_TEMPLATE/bug_report.yml | 3 +- .github/workflows/branch.yml | 2 +- .github/workflows/clean-up.yml | 24 ++++ .github/workflows/linting.yml | 2 +- .pre-commit-config.yaml | 5 + README.md | 64 +++++---- bin/check_samplesheet.py | 3 - conf/base.config | 2 +- conf/test_full.config | 2 + docs/usage.md | 130 +++++------------- lib/NfcoreSchema.groovy | 4 +- lib/WorkflowEnsemblrepeatdownload.groovy | 4 +- lib/WorkflowMain.groovy | 13 +- modules.json | 4 +- modules/local/samplesheet_check.nf | 2 +- .../custom/dumpsoftwareversions/main.nf | 6 +- .../custom/dumpsoftwareversions/meta.yml | 2 + modules/nf-core/multiqc/main.nf | 6 +- 
modules/nf-core/multiqc/meta.yml | 3 +- nextflow.config | 29 +++- pipeline_template.yml | 3 + tower.yml | 5 + 23 files changed, 173 insertions(+), 147 deletions(-) create mode 100644 .github/workflows/clean-up.yml create mode 100644 .pre-commit-config.yaml create mode 100644 pipeline_template.yml create mode 100644 tower.yml diff --git a/.editorconfig b/.editorconfig index b78de6e..b6b3190 100644 --- a/.editorconfig +++ b/.editorconfig @@ -8,7 +8,7 @@ trim_trailing_whitespace = true indent_size = 4 indent_style = space -[*.{md,yml,yaml,html,css,scss,js,cff}] +[*.{md,yml,yaml,html,css,scss,js}] indent_size = 2 # These files are edited and tested upstream in nf-core/modules diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml index e523572..53e2794 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.yml +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -44,7 +44,8 @@ body: * Executor _(eg. slurm, local, awsbatch)_ - * Container engine: _(e.g. Docker, Singularity, Conda, Podman, Shifter or Charliecloud)_ + * Container engine: _(e.g. Docker, Singularity, Conda, Podman, Shifter, Charliecloud, + or Apptainer)_ * OS _(eg. 
CentOS Linux, macOS, Linux Mint)_ diff --git a/.github/workflows/branch.yml b/.github/workflows/branch.yml index aba3fa2..d31fdc0 100644 --- a/.github/workflows/branch.yml +++ b/.github/workflows/branch.yml @@ -13,7 +13,7 @@ jobs: - name: Check PRs if: github.repository == 'sanger-tol/ensemblrepeatdownload' run: | - { [[ ${{github.event.pull_request.head.repo.full_name }} == sanger-tol/ensemblrepeatdownload ]] && [[ $GITHUB_HEAD_REF = "dev" ]]; } || [[ $GITHUB_HEAD_REF == "patch" ]] + { [[ ${{github.event.pull_request.head.repo.full_name }} == sanger-tol/ensemblrepeatdownload ]] && [[ $GITHUB_HEAD_REF == "dev" ]]; } || [[ $GITHUB_HEAD_REF == "patch" ]] # If the above check failed, post a comment on the PR explaining the failure # NOTE - this doesn't currently work if the PR is coming from a fork, due to limitations in GitHub actions secrets diff --git a/.github/workflows/clean-up.yml b/.github/workflows/clean-up.yml new file mode 100644 index 0000000..694e90e --- /dev/null +++ b/.github/workflows/clean-up.yml @@ -0,0 +1,24 @@ +name: "Close user-tagged issues and PRs" +on: + schedule: + - cron: "0 0 * * 0" # Once a week + +jobs: + clean-up: + runs-on: ubuntu-latest + permissions: + issues: write + pull-requests: write + steps: + - uses: actions/stale@v7 + with: + stale-issue-message: "This issue has been tagged as awaiting-changes or awaiting-feedback by an nf-core contributor. Remove stale label or add a comment otherwise this issue will be closed in 20 days." + stale-pr-message: "This PR has been tagged as awaiting-changes or awaiting-feedback by an nf-core contributor. Remove stale label or add a comment if it is still useful." + close-issue-message: "This issue was closed because it has been tagged as awaiting-changes or awaiting-feedback by an nf-core contributor and then staled for 20 days with no activity." 
+ days-before-stale: 30 + days-before-close: 20 + days-before-pr-close: -1 + any-of-labels: "awaiting-changes,awaiting-feedback" + exempt-issue-labels: "WIP" + exempt-pr-labels: "WIP" + repo-token: "${{ secrets.GITHUB_TOKEN }}" diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 858d622..888cb4b 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -78,7 +78,7 @@ jobs: - uses: actions/setup-python@v4 with: - python-version: "3.7" + python-version: "3.8" architecture: "x64" - name: Install dependencies diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..0c31cdb --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,5 @@ +repos: + - repo: https://github.com/pre-commit/mirrors-prettier + rev: "v2.7.1" + hooks: + - id: prettier diff --git a/README.md b/README.md index 8b7f640..1bec487 100644 --- a/README.md +++ b/README.md @@ -8,49 +8,59 @@ ## Introduction - +**sanger-tol/ensemblrepeatdownload** is a bioinformatics pipeline that ... -**sanger-tol/ensemblrepeatdownload** is a bioinformatics best-practice analysis pipeline for Pipeline to download annotations from Ensembl onto the Tree of Life directory structure. - -The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It uses Docker/Singularity containers making installation trivial and results highly reproducible. The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. Where possible, these processes have been submitted to and installed from [nf-core/modules](https://github.com/nf-core/modules) in order to make them available to all nf-core pipelines, and to everyone within the Nextflow community! 
- - - -On release, automated continuous integration tests run the pipeline on a full-sized dataset on the AWS cloud infrastructure. This ensures that the pipeline runs on AWS, has sensible resource allocation defaults set to run on real-world datasets, and permits the persistent storage of results to benchmark between pipeline releases and other analysis sources. - -## Pipeline summary + + 1. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)) 2. Present QC for raw reads ([`MultiQC`](http://multiqc.info/)) -## Quick Start +## Usage + +> **Note** +> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how +> to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) +> with `-profile test` before running the workflow on actual data. + + - > - The pipeline comes with config profiles called `docker`, `singularity`, `podman`, `shifter`, `charliecloud` and `conda` which instruct the pipeline to use the named tool for software management. For example, `-profile test,docker`. - > - Please check [nf-core/configs](https://github.com/nf-core/configs#documentation) to see if a custom config file to run nf-core pipelines already exists for your Institute. If so, you can simply use `-profile ` in your command. This will enable either `docker` or `singularity` and set the appropriate execution settings for your local compute environment. - > - If you are using `singularity`, please use the [`nf-core download`](https://nf-co.re/tools/#downloading-pipelines-for-offline-use) command to download images first, before running the pipeline. Setting the [`NXF_SINGULARITY_CACHEDIR` or `singularity.cacheDir`](https://www.nextflow.io/docs/latest/singularity.html?#singularity-docker-hub) Nextflow options enables you to store and re-use the images from a central location for future pipeline runs. 
- > - If you are using `conda`, it is highly recommended to use the [`NXF_CONDA_CACHEDIR` or `conda.cacheDir`](https://www.nextflow.io/docs/latest/conda.html) settings to store the environments in a central location for future pipeline runs. +Now, you can run the pipeline using: -4. Start running your own analysis! + - +```bash +nextflow run sanger-tol/ensemblrepeatdownload \ + -profile \ + --input samplesheet.csv \ + --outdir +``` - ```bash - nextflow run sanger-tol/ensemblrepeatdownload --input samplesheet.csv --outdir --genome GRCh37 -profile - ``` +> **Warning:** +> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. Custom config files including those +> provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; +> see [docs](https://nf-co.re/usage/configuration#custom-configuration-files). ## Credits diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py index 11b1557..4a758fe 100755 --- a/bin/check_samplesheet.py +++ b/bin/check_samplesheet.py @@ -158,9 +158,6 @@ def sniff_format(handle): peek = read_head(handle) handle.seek(0) sniffer = csv.Sniffer() - if not sniffer.has_header(peek): - logger.critical("The given sample sheet does not appear to contain a header.") - sys.exit(1) dialect = sniffer.sniff(peek) return dialect diff --git a/conf/base.config b/conf/base.config index cffb6a7..38fe1df 100644 --- a/conf/base.config +++ b/conf/base.config @@ -15,7 +15,7 @@ process { memory = { check_max( 6.GB * task.attempt, 'memory' ) } time = { check_max( 4.h * task.attempt, 'time' ) } - errorStrategy = { task.exitStatus in [143,137,104,134,139] ? 'retry' : 'finish' } + errorStrategy = { task.exitStatus in ((130..145) + 104) ? 
'retry' : 'finish' } maxRetries = 1 maxErrors = '-1' diff --git a/conf/test_full.config b/conf/test_full.config index f56734b..85f54dc 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -10,6 +10,8 @@ ---------------------------------------------------------------------------------------- */ +cleanup = true + params { config_profile_name = 'Full test profile' config_profile_description = 'Full test dataset to check pipeline function' diff --git a/docs/usage.md b/docs/usage.md index 51ab651..1aa780e 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -69,6 +69,29 @@ work # Directory containing the nextflow working files # Other nextflow hidden files, eg. history of pipeline runs and old logs. ``` +If you wish to repeatedly use the same parameters for multiple runs, rather than specifying each flag in the command, you can specify these in a params file. + +Pipeline settings can be provided in a `yaml` or `json` file via `-params-file `. + +> ⚠️ Do not use `-c ` to specify parameters as this will result in errors. Custom config files specified with `-c` must only be used for [tuning process resource specifications](https://nf-co.re/docs/usage/configuration#tuning-workflow-resources), other infrastructural tweaks (such as output directories), or module arguments (args). +> The above pipeline run specified with a params file in yaml format: + +```bash +nextflow run sanger-tol/ensemblrepeatdownload -profile docker -params-file params.yaml +``` + +with `params.yaml` containing: + +```yaml +input: './samplesheet.csv' +outdir: './results/' +genome: 'GRCh37' +input: 'data' +<...> +``` + +You can also generate such `YAML`/`JSON` files via [nf-core/launch](https://nf-co.re/launch). + ### Updating the pipeline When you run the above command, Nextflow automatically pulls the pipeline code from GitHub and stores it as a cached version. 
When running the pipeline after this, it will always use the cached version if available - even if the pipeline has been updated since. To make sure that you're running the latest version of the pipeline, make sure that you regularly update the cached version of the pipeline: @@ -85,6 +108,10 @@ First, go to the [sanger-tol/ensemblrepeatdownload releases page](https://github This version number will be logged in reports when you run the pipeline, so that you'll know what you used when you look back in the future. For example, at the bottom of the MultiQC reports. +To further assist in reproducbility, you can use share and re-use [parameter files](#running-the-pipeline) to repeat pipeline runs with the same settings without having to write out a command with every single parameter. + +> 💡 If you wish to share such profile (such as upload as supplementary material for academic publications), make sure to NOT include cluster specific paths to files, nor institutional specific profiles. + ## Core Nextflow arguments > **NB:** These options are part of Nextflow and use a _single_ hyphen (pipeline parameters use a double-hyphen). @@ -93,7 +120,7 @@ This version number will be logged in reports when you run the pipeline, so that Use this parameter to choose a configuration profile. Profiles can give configuration presets for different compute environments. -Several generic profiles are bundled with the pipeline which instruct the pipeline to use software packaged using different methods (Docker, Singularity, Podman, Shifter, Charliecloud, Conda) - see below. +Several generic profiles are bundled with the pipeline which instruct the pipeline to use software packaged using different methods (Docker, Singularity, Podman, Shifter, Charliecloud, Apptainer, Conda) - see below. > We highly recommend the use of Docker or Singularity containers for full pipeline reproducibility, however when this is not possible, Conda is also supported. 
@@ -117,8 +144,10 @@ If `-profile` is not specified, the pipeline will run locally and expect all sof - A generic configuration profile to be used with [Shifter](https://nersc.gitlab.io/development/shifter/how-to-use/) - `charliecloud` - A generic configuration profile to be used with [Charliecloud](https://hpc.github.io/charliecloud/) +- `apptainer` + - A generic configuration profile to be used with [Apptainer](https://apptainer.org/) - `conda` - - A generic configuration profile to be used with [Conda](https://conda.io/docs/). Please only use Conda as a last resort i.e. when it's not possible to run the pipeline with Docker, Singularity, Podman, Shifter or Charliecloud. + - A generic configuration profile to be used with [Conda](https://conda.io/docs/). Please only use Conda as a last resort i.e. when it's not possible to run the pipeline with Docker, Singularity, Podman, Shifter, Charliecloud, or Apptainer. ### `-resume` @@ -136,102 +165,19 @@ Specify the path to a specific config file (this is a core Nextflow command). Se Whilst the default requirements set within the pipeline will hopefully work for most people and with most input data, you may find that you want to customise the compute resources that the pipeline requests. Each step in the pipeline has a default set of requirements for number of CPUs, memory and time. For most of the steps in the pipeline, if the job exits with any of the error codes specified [here](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/conf/base.config#L18) it will automatically be resubmitted with higher requests (2 x original, then 3 x original). If it still fails after the third attempt then the pipeline execution is stopped. 
-For example, if the nf-core/rnaseq pipeline is failing after multiple re-submissions of the `STAR_ALIGN` process due to an exit code of `137` this would indicate that there is an out of memory issue: - -```console -[62/149eb0] NOTE: Process `NFCORE_RNASEQ:RNASEQ:ALIGN_STAR:STAR_ALIGN (WT_REP1)` terminated with an error exit status (137) -- Execution is retried (1) -Error executing process > 'NFCORE_RNASEQ:RNASEQ:ALIGN_STAR:STAR_ALIGN (WT_REP1)' - -Caused by: - Process `NFCORE_RNASEQ:RNASEQ:ALIGN_STAR:STAR_ALIGN (WT_REP1)` terminated with an error exit status (137) - -Command executed: - STAR \ - --genomeDir star \ - --readFilesIn WT_REP1_trimmed.fq.gz \ - --runThreadN 2 \ - --outFileNamePrefix WT_REP1. \ - - -Command exit status: - 137 - -Command output: - (empty) - -Command error: - .command.sh: line 9: 30 Killed STAR --genomeDir star --readFilesIn WT_REP1_trimmed.fq.gz --runThreadN 2 --outFileNamePrefix WT_REP1. -Work dir: - /home/pipelinetest/work/9d/172ca5881234073e8d76f2a19c88fb - -Tip: you can replicate the issue by changing to the process work dir and entering the command `bash .command.run` -``` - -#### For beginners - -A first step to bypass this error, you could try to increase the amount of CPUs, memory, and time for the whole pipeline. Therefor you can try to increase the resource for the parameters `--max_cpus`, `--max_memory`, and `--max_time`. Based on the error above, you have to increase the amount of memory. Therefore you can go to the [parameter documentation of rnaseq](https://nf-co.re/rnaseq/3.9/parameters) and scroll down to the `show hidden parameter` button to get the default value for `--max_memory`. In this case 128GB, you than can try to run your pipeline again with `--max_memory 200GB -resume` to skip all process, that were already calculated. If you can not increase the resource of the complete pipeline, you can try to adapt the resource for a single process as mentioned below. 
- -#### Advanced option on process level - -To bypass this error you would need to find exactly which resources are set by the `STAR_ALIGN` process. The quickest way is to search for `process STAR_ALIGN` in the [nf-core/rnaseq Github repo](https://github.com/nf-core/rnaseq/search?q=process+STAR_ALIGN). -We have standardised the structure of Nextflow DSL2 pipelines such that all module files will be present in the `modules/` directory and so, based on the search results, the file we want is `modules/nf-core/star/align/main.nf`. -If you click on the link to that file you will notice that there is a `label` directive at the top of the module that is set to [`label process_high`](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/modules/nf-core/software/star/align/main.nf#L9). -The [Nextflow `label`](https://www.nextflow.io/docs/latest/process.html#label) directive allows us to organise workflow processes in separate groups which can be referenced in a configuration file to select and configure subset of processes having similar computing requirements. -The default values for the `process_high` label are set in the pipeline's [`base.config`](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/conf/base.config#L33-L37) which in this case is defined as 72GB. -Providing you haven't set any other standard nf-core parameters to **cap** the [maximum resources](https://nf-co.re/usage/configuration#max-resources) used by the pipeline then we can try and bypass the `STAR_ALIGN` process failure by creating a custom config file that sets at least 72GB of memory, in this case increased to 100GB. -The custom config below can then be provided to the pipeline via the [`-c`](#-c) parameter as highlighted in previous sections. - -```nextflow -process { - withName: 'NFCORE_RNASEQ:RNASEQ:ALIGN_STAR:STAR_ALIGN' { - memory = 100.GB - } -} -``` - -> **NB:** We specify the full process name i.e. 
`NFCORE_RNASEQ:RNASEQ:ALIGN_STAR:STAR_ALIGN` in the config file because this takes priority over the short name (`STAR_ALIGN`) and allows existing configuration using the full process name to be correctly overridden. -> -> If you get a warning suggesting that the process selector isn't recognised check that the process name has been specified correctly. - -### Updating containers (advanced users) - -The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. If for some reason you need to use a different version of a particular tool with the pipeline then you just need to identify the `process` name and override the Nextflow `container` definition for that process using the `withName` declaration. For example, in the [nf-core/viralrecon](https://nf-co.re/viralrecon) pipeline a tool called [Pangolin](https://github.com/cov-lineages/pangolin) has been used during the COVID-19 pandemic to assign lineages to SARS-CoV-2 genome sequenced samples. Given that the lineage assignments change quite frequently it doesn't make sense to re-release the nf-core/viralrecon everytime a new version of Pangolin has been released. However, you can override the default container used by the pipeline by creating a custom config file and passing it as a command-line argument via `-c custom.config`. - -1. Check the default version used by the pipeline in the module file for [Pangolin](https://github.com/nf-core/viralrecon/blob/a85d5969f9025409e3618d6c280ef15ce417df65/modules/nf-core/software/pangolin/main.nf#L14-L19) -2. Find the latest version of the Biocontainer available on [Quay.io](https://quay.io/repository/biocontainers/pangolin?tag=latest&tab=tags) -3. 
Create the custom config accordingly: - - - For Docker: +To change the resource requests, please see the [max resources](https://nf-co.re/docs/usage/configuration#max-resources) and [tuning workflow resources](https://nf-co.re/docs/usage/configuration#tuning-workflow-resources) section of the nf-core website. - ```nextflow - process { - withName: PANGOLIN { - container = 'quay.io/biocontainers/pangolin:3.0.5--pyhdfd78af_0' - } - } - ``` +### Custom Containers - - For Singularity: +In some cases you may wish to change which container or conda environment a step of the pipeline uses for a particular tool. By default nf-core pipelines use containers and software from the [biocontainers](https://biocontainers.pro/) or [bioconda](https://bioconda.github.io/) projects. However in some cases the pipeline specified version maybe out of date. - ```nextflow - process { - withName: PANGOLIN { - container = 'https://depot.galaxyproject.org/singularity/pangolin:3.0.5--pyhdfd78af_0' - } - } - ``` +To use a different container from the default container or conda environment specified in a pipeline, please see the [updating tool versions](https://nf-co.re/docs/usage/configuration#updating-tool-versions) section of the nf-core website. - - For Conda: +### Custom Tool Arguments - ```nextflow - process { - withName: PANGOLIN { - conda = 'bioconda::pangolin=3.0.5' - } - } - ``` +A pipeline might not always support every possible argument or option of a particular tool used in pipeline. Fortunately, nf-core pipelines provide some freedom to users to insert additional parameters that the pipeline does not include by default. -> **NB:** If you wish to periodically update individual tool-specific results (e.g. Pangolin) generated by the pipeline then you must ensure to keep the `work/` directory otherwise the `-resume` ability of the pipeline will be compromised and it will restart from scratch. 
+To learn how to provide additional arguments to a particular tool of the pipeline, please see the [customising tool arguments](https://nf-co.re/docs/usage/configuration#customising-tool-arguments) section of the nf-core website. ### nf-core/configs diff --git a/lib/NfcoreSchema.groovy b/lib/NfcoreSchema.groovy index 33cd4f6..9b34804 100755 --- a/lib/NfcoreSchema.groovy +++ b/lib/NfcoreSchema.groovy @@ -2,6 +2,7 @@ // This file holds several functions used to perform JSON parameter validation, help and summary rendering for the nf-core pipeline template. // +import nextflow.Nextflow import org.everit.json.schema.Schema import org.everit.json.schema.loader.SchemaLoader import org.everit.json.schema.ValidationException @@ -83,6 +84,7 @@ class NfcoreSchema { 'stub-run', 'test', 'w', + 'with-apptainer', 'with-charliecloud', 'with-conda', 'with-dag', @@ -177,7 +179,7 @@ class NfcoreSchema { } if (has_error) { - System.exit(1) + Nextflow.error('Exiting!') } } diff --git a/lib/WorkflowEnsemblrepeatdownload.groovy b/lib/WorkflowEnsemblrepeatdownload.groovy index b42def0..f127105 100755 --- a/lib/WorkflowEnsemblrepeatdownload.groovy +++ b/lib/WorkflowEnsemblrepeatdownload.groovy @@ -2,6 +2,7 @@ // This file holds several functions specific to the workflow/ensemblrepeatdownload.nf in the sanger-tol/ensemblrepeatdownload pipeline // +import nextflow.Nextflow import groovy.text.SimpleTemplateEngine class WorkflowEnsemblrepeatdownload { @@ -13,8 +14,7 @@ class WorkflowEnsemblrepeatdownload { if (!params.fasta) { - log.error "Genome fasta file not specified with e.g. '--fasta genome.fa' or via a detectable config file." - System.exit(1) + Nextflow.error "Genome fasta file not specified with e.g. '--fasta genome.fa' or via a detectable config file." 
} } diff --git a/lib/WorkflowMain.groovy b/lib/WorkflowMain.groovy index 1b3a42f..505a0f5 100755 --- a/lib/WorkflowMain.groovy +++ b/lib/WorkflowMain.groovy @@ -2,6 +2,8 @@ // This file holds several functions specific to the main.nf workflow in the sanger-tol/ensemblrepeatdownload pipeline // +import nextflow.Nextflow + class WorkflowMain { // @@ -21,7 +23,7 @@ class WorkflowMain { // // Generate help string // - public static String help(workflow, params, log) { + public static String help(workflow, params) { def command = "nextflow run ${workflow.manifest.name} --input samplesheet.csv --fasta reference.fa -profile docker" def help_string = '' help_string += NfcoreTemplate.logo(workflow, params.monochrome_logs) @@ -34,7 +36,7 @@ class WorkflowMain { // // Generate parameter summary log string // - public static String paramsSummaryLog(workflow, params, log) { + public static String paramsSummaryLog(workflow, params) { def summary_log = '' summary_log += NfcoreTemplate.logo(workflow, params.monochrome_logs) summary_log += NfcoreSchema.paramsSummaryLog(workflow, params) @@ -49,7 +51,7 @@ class WorkflowMain { public static void initialise(workflow, params, log) { // Print help to screen if required if (params.help) { - log.info help(workflow, params, log) + log.info help(workflow, params) System.exit(0) } @@ -61,7 +63,7 @@ class WorkflowMain { } // Print parameter summary log to screen - log.info paramsSummaryLog(workflow, params, log) + log.info paramsSummaryLog(workflow, params) // Validate workflow parameters via the JSON schema if (params.validate_params) { @@ -81,8 +83,7 @@ class WorkflowMain { // Check input has been provided if (!params.input) { - log.error "Please provide an input samplesheet to the pipeline e.g. '--input samplesheet.csv'" - System.exit(1) + Nextflow.error("Please provide an input samplesheet to the pipeline e.g. 
'--input samplesheet.csv'") } } } diff --git a/modules.json b/modules.json index 888a59f..617af59 100644 --- a/modules.json +++ b/modules.json @@ -7,7 +7,7 @@ "nf-core": { "custom/dumpsoftwareversions": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "76cc4938c1f6ea5c7d83fed1eeffc146787f9543", "installed_by": ["modules"] }, "fastqc": { @@ -17,7 +17,7 @@ }, "multiqc": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "f2d63bd5b68925f98f572eed70993d205cc694b7", "installed_by": ["modules"] } } diff --git a/modules/local/samplesheet_check.nf b/modules/local/samplesheet_check.nf index 9619c23..6cdadd6 100644 --- a/modules/local/samplesheet_check.nf +++ b/modules/local/samplesheet_check.nf @@ -5,7 +5,7 @@ process SAMPLESHEET_CHECK { conda "conda-forge::python=3.8.3" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/python:3.8.3' : - 'quay.io/biocontainers/python:3.8.3' }" + 'biocontainers/python:3.8.3' }" input: path samplesheet diff --git a/modules/nf-core/custom/dumpsoftwareversions/main.nf b/modules/nf-core/custom/dumpsoftwareversions/main.nf index 3df2176..800a609 100644 --- a/modules/nf-core/custom/dumpsoftwareversions/main.nf +++ b/modules/nf-core/custom/dumpsoftwareversions/main.nf @@ -2,10 +2,10 @@ process CUSTOM_DUMPSOFTWAREVERSIONS { label 'process_single' // Requires `pyyaml` which does not have a dedicated container but is in the MultiQC container - conda "bioconda::multiqc=1.13" + conda "bioconda::multiqc=1.14" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/multiqc:1.13--pyhdfd78af_0' : - 'quay.io/biocontainers/multiqc:1.13--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/multiqc:1.14--pyhdfd78af_0' : + 'quay.io/biocontainers/multiqc:1.14--pyhdfd78af_0' }" input: path versions diff --git a/modules/nf-core/custom/dumpsoftwareversions/meta.yml b/modules/nf-core/custom/dumpsoftwareversions/meta.yml index 60b546a..c32657d 100644 --- a/modules/nf-core/custom/dumpsoftwareversions/meta.yml +++ b/modules/nf-core/custom/dumpsoftwareversions/meta.yml @@ -1,7 +1,9 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json name: custom_dumpsoftwareversions description: Custom module used to dump software versions within the nf-core pipeline template keywords: - custom + - dump - version tools: - custom: diff --git a/modules/nf-core/multiqc/main.nf b/modules/nf-core/multiqc/main.nf index 68f66be..4b60474 100644 --- a/modules/nf-core/multiqc/main.nf +++ b/modules/nf-core/multiqc/main.nf @@ -1,10 +1,10 @@ process MULTIQC { label 'process_single' - conda "bioconda::multiqc=1.13" + conda "bioconda::multiqc=1.14" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/multiqc:1.13--pyhdfd78af_0' : - 'quay.io/biocontainers/multiqc:1.13--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/multiqc:1.14--pyhdfd78af_0' : + 'quay.io/biocontainers/multiqc:1.14--pyhdfd78af_0' }" input: path multiqc_files, stageAs: "?/*" diff --git a/modules/nf-core/multiqc/meta.yml b/modules/nf-core/multiqc/meta.yml index ebc29b2..f93b5ee 100644 --- a/modules/nf-core/multiqc/meta.yml +++ b/modules/nf-core/multiqc/meta.yml @@ -1,3 +1,4 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json name: MultiQC description: Aggregate results from bioinformatics analyses across many samples into a single report keywords: @@ -37,7 +38,7 @@ output: description: MultiQC report file pattern: "multiqc_report.html" - data: - type: dir + type: directory description: MultiQC data dir pattern: "multiqc_data" - plots: diff --git a/nextflow.config b/nextflow.config index a09fd4b..0778239 100644 --- a/nextflow.config +++ b/nextflow.config @@ -73,7 +73,11 @@ try { profiles { - debug { process.beforeScript = 'echo $HOSTNAME' } + debug { + dumpHashes = true + process.beforeScript = 'echo $HOSTNAME' + cleanup = false + } conda { conda.enabled = true docker.enabled = false @@ -81,6 +85,7 @@ profiles { podman.enabled = false shifter.enabled = false charliecloud.enabled = false + apptainer.enabled = false } mamba { conda.enabled = true @@ -90,14 +95,18 @@ profiles { podman.enabled = false shifter.enabled = false charliecloud.enabled = false + apptainer.enabled = false } docker { docker.enabled = true + docker.registry = 'quay.io' docker.userEmulation = true + conda.enabled = false singularity.enabled = false podman.enabled = false shifter.enabled = false charliecloud.enabled = false + apptainer.enabled = false } arm { docker.runOptions = '-u $(id -u):$(id -g) --platform=linux/amd64' @@ -105,31 +114,49 @@ profiles { singularity { singularity.enabled = true 
singularity.autoMounts = true + conda.enabled = false docker.enabled = false podman.enabled = false shifter.enabled = false charliecloud.enabled = false + apptainer.enabled = false } podman { podman.enabled = true + podman.registry = 'quay.io' + conda.enabled = false docker.enabled = false singularity.enabled = false shifter.enabled = false charliecloud.enabled = false + apptainer.enabled = false } shifter { shifter.enabled = true + conda.enabled = false docker.enabled = false singularity.enabled = false podman.enabled = false charliecloud.enabled = false + apptainer.enabled = false } charliecloud { charliecloud.enabled = true + conda.enabled = false docker.enabled = false singularity.enabled = false podman.enabled = false shifter.enabled = false + apptainer.enabled = false + } + apptainer { + apptainer.enabled = true + conda.enabled = false + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false } gitpod { executor.name = 'local' diff --git a/pipeline_template.yml b/pipeline_template.yml new file mode 100644 index 0000000..09d0664 --- /dev/null +++ b/pipeline_template.yml @@ -0,0 +1,3 @@ +prefix: sanger-tol +skip: +- igenomes diff --git a/tower.yml b/tower.yml new file mode 100644 index 0000000..787aedf --- /dev/null +++ b/tower.yml @@ -0,0 +1,5 @@ +reports: + multiqc_report.html: + display: "MultiQC HTML report" + samplesheet.csv: + display: "Auto-created samplesheet with collated metadata and FASTQ paths" From efd3e8dbd4b7ba95932026280dc4b076795b3985 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Sun, 16 Jul 2023 10:50:22 +0100 Subject: [PATCH 29/72] Fixed the pipeline description --- assets/email_template.html | 2 +- nextflow.config | 2 +- nextflow_schema.json | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/assets/email_template.html b/assets/email_template.html index 8e7a3a4..0147b9c 100644 --- a/assets/email_template.html +++ 
b/assets/email_template.html @@ -4,7 +4,7 @@ - + sanger-tol/ensemblrepeatdownload Pipeline Report diff --git a/nextflow.config b/nextflow.config index 0778239..de20744 100644 --- a/nextflow.config +++ b/nextflow.config @@ -205,7 +205,7 @@ manifest { name = 'sanger-tol/ensemblrepeatdownload' author = """@muffato""" homePage = 'https://github.com/sanger-tol/ensemblrepeatdownload' - description = """Pipeline to download annotations from Ensembl onto the Tree of Life directory structure""" + description = """Pipeline to download annotations from Ensembl into a Tree of Life directory structure""" mainScript = 'main.nf' nextflowVersion = '!>=22.10.1' version = '1.0dev' diff --git a/nextflow_schema.json b/nextflow_schema.json index 5bf347d..45270ec 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -2,7 +2,7 @@ "$schema": "http://json-schema.org/draft-07/schema", "$id": "https://raw.githubusercontent.com/sanger-tol/ensemblrepeatdownload/master/nextflow_schema.json", "title": "sanger-tol/ensemblrepeatdownload pipeline parameters", - "description": "Pipeline to download annotations from Ensembl onto the Tree of Life directory structure", + "description": "Pipeline to download annotations from Ensembl into a Tree of Life directory structure", "type": "object", "definitions": { "input_output_options": { @@ -263,4 +263,4 @@ "$ref": "#/definitions/generic_options" } ] -} \ No newline at end of file +} From 8f3174b7daadf6aef022aae0a8959809c2d2c9a0 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Sun, 16 Jul 2023 10:52:15 +0100 Subject: [PATCH 30/72] Fixed the pipeline description --- assets/email_template.html | 2 +- nextflow.config | 2 +- nextflow_schema.json | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/assets/email_template.html b/assets/email_template.html index 0147b9c..0ffb3f2 100644 --- a/assets/email_template.html +++ b/assets/email_template.html @@ -4,7 +4,7 @@ - + sanger-tol/ensemblrepeatdownload Pipeline Report diff --git 
a/nextflow.config b/nextflow.config index de20744..b6bcf4f 100644 --- a/nextflow.config +++ b/nextflow.config @@ -205,7 +205,7 @@ manifest { name = 'sanger-tol/ensemblrepeatdownload' author = """@muffato""" homePage = 'https://github.com/sanger-tol/ensemblrepeatdownload' - description = """Pipeline to download annotations from Ensembl into a Tree of Life directory structure""" + description = """Pipeline to download repeat annotations from Ensembl into a Tree of Life directory structure""" mainScript = 'main.nf' nextflowVersion = '!>=22.10.1' version = '1.0dev' diff --git a/nextflow_schema.json b/nextflow_schema.json index 45270ec..a2e3056 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -2,7 +2,7 @@ "$schema": "http://json-schema.org/draft-07/schema", "$id": "https://raw.githubusercontent.com/sanger-tol/ensemblrepeatdownload/master/nextflow_schema.json", "title": "sanger-tol/ensemblrepeatdownload pipeline parameters", - "description": "Pipeline to download annotations from Ensembl into a Tree of Life directory structure", + "description": "Pipeline to download repeat annotations from Ensembl into a Tree of Life directory structure", "type": "object", "definitions": { "input_output_options": { From fbb69932f34fb176186f26b8a3e763855ae4af42 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Sun, 16 Jul 2023 10:53:51 +0100 Subject: [PATCH 31/72] Fixed the example params.yaml --- docs/usage.md | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index ef9a233..51b47e9 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -85,11 +85,10 @@ nextflow run sanger-tol/ensemblrepeatdownload -profile docker -params-file param with `params.yaml` containing: ```yaml -input: './samplesheet.csv' -outdir: './results/' -genome: 'GRCh37' -input: 'data' -<...> +ensembl_species_name: "Noctua_fimbriata" +assembly_accession: "GCA_905163415.1" +annotation_method: "braker" +outdir: "./results/" ``` You can also generate such 
`YAML`/`JSON` files via [nf-core/launch](https://nf-co.re/launch). From 855419b39309664410a08901f8852404ac7993f9 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Sun, 16 Jul 2023 10:54:08 +0100 Subject: [PATCH 32/72] We do this through a profile --- conf/test_full.config | 2 -- 1 file changed, 2 deletions(-) diff --git a/conf/test_full.config b/conf/test_full.config index 964abfb..141efad 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -10,8 +10,6 @@ ---------------------------------------------------------------------------------------- */ -cleanup = true - params { config_profile_name = 'Full test profile' config_profile_description = 'Full test dataset to check pipeline function' From 7509bacdfba7a68ed1cda16101d328d3e989d37a Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Sun, 16 Jul 2023 10:54:25 +0100 Subject: [PATCH 33/72] [linting] prettier --- .github/ISSUE_TEMPLATE/bug_report.yml | 100 +++++++++++++------------- CITATION.cff | 24 +++---- nextflow_schema.json | 9 +-- pipeline_template.yml | 2 +- 4 files changed, 64 insertions(+), 71 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml index 53e2794..b518362 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.yml +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -2,53 +2,53 @@ name: Bug report description: Report something that is broken or incorrect labels: bug body: -- type: textarea - id: description - attributes: - label: Description of the bug - description: A clear and concise description of what the bug is. - validations: - required: true -- type: textarea - id: command_used - attributes: - label: Command used and terminal output - description: Steps to reproduce the behaviour. Please paste the command you used - to launch the pipeline and the output from your terminal. - render: console - placeholder: '$ nextflow run ... 
- - - Some output where something broke - - ' -- type: textarea - id: files - attributes: - label: Relevant files - description: 'Please drag and drop the relevant files here. Create a `.zip` archive - if the extension is not allowed. - - Your verbose log file `.nextflow.log` is often useful _(this is a hidden file - in the directory where you launched the pipeline)_ as well as custom Nextflow - configuration files. - - ' -- type: textarea - id: system - attributes: - label: System information - description: '* Nextflow version _(eg. 22.10.1)_ - - * Hardware _(eg. HPC, Desktop, Cloud)_ - - * Executor _(eg. slurm, local, awsbatch)_ - - * Container engine: _(e.g. Docker, Singularity, Conda, Podman, Shifter, Charliecloud, - or Apptainer)_ - - * OS _(eg. CentOS Linux, macOS, Linux Mint)_ - - * Version of sanger-tol/ensemblrepeatdownload _(eg. 1.1, 1.5, 1.8.2)_ - - ' + - type: textarea + id: description + attributes: + label: Description of the bug + description: A clear and concise description of what the bug is. + validations: + required: true + - type: textarea + id: command_used + attributes: + label: Command used and terminal output + description: Steps to reproduce the behaviour. Please paste the command you used + to launch the pipeline and the output from your terminal. + render: console + placeholder: "$ nextflow run ... + + + Some output where something broke + + " + - type: textarea + id: files + attributes: + label: Relevant files + description: "Please drag and drop the relevant files here. Create a `.zip` archive + if the extension is not allowed. + + Your verbose log file `.nextflow.log` is often useful _(this is a hidden file + in the directory where you launched the pipeline)_ as well as custom Nextflow + configuration files. + + " + - type: textarea + id: system + attributes: + label: System information + description: "* Nextflow version _(eg. 22.10.1)_ + + * Hardware _(eg. HPC, Desktop, Cloud)_ + + * Executor _(eg. 
slurm, local, awsbatch)_ + + * Container engine: _(e.g. Docker, Singularity, Conda, Podman, Shifter, Charliecloud, + or Apptainer)_ + + * OS _(eg. CentOS Linux, macOS, Linux Mint)_ + + * Version of sanger-tol/ensemblrepeatdownload _(eg. 1.1, 1.5, 1.8.2)_ + + " diff --git a/CITATION.cff b/CITATION.cff index a81ccf0..d607c6c 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -4,21 +4,21 @@ cff-version: 1.2.0 title: sanger-tol/ensemblrepeatdownload v1.0.0 - Gwaihir the Windlord message: >- - If you use this software, please cite it using the - metadata from this file. + If you use this software, please cite it using the + metadata from this file. type: software authors: - - given-names: Matthieu - family-names: Muffato - affiliation: Wellcome Sanger Institute - orcid: "https://orcid.org/0000-0002-7860-3560" - - given-names: Priyanka - family-names: Surana - orcid: "https://orcid.org/0000-0002-7167-0875" - affiliation: Wellcome Sanger Institute + - given-names: Matthieu + family-names: Muffato + affiliation: Wellcome Sanger Institute + orcid: "https://orcid.org/0000-0002-7860-3560" + - given-names: Priyanka + family-names: Surana + orcid: "https://orcid.org/0000-0002-7167-0875" + affiliation: Wellcome Sanger Institute identifiers: - - type: doi - value: 10.5281/zenodo.7183380 + - type: doi + value: 10.5281/zenodo.7183380 repository-code: "https://github.com/sanger-tol/ensemblrepeatdownload" license: MIT commit: TODO diff --git a/nextflow_schema.json b/nextflow_schema.json index 8c11ef9..7ed8fae 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -172,14 +172,7 @@ "description": "Method used to save pipeline results to output directory.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. 
See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": [ - "symlink", - "rellink", - "link", - "copy", - "copyNoFollow", - "move" - ], + "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], "hidden": true }, "email_on_fail": { diff --git a/pipeline_template.yml b/pipeline_template.yml index 09d0664..0aa7398 100644 --- a/pipeline_template.yml +++ b/pipeline_template.yml @@ -1,3 +1,3 @@ prefix: sanger-tol skip: -- igenomes + - igenomes From dc0df2cb833d8f21f4fa4edd75e6d33fc0ee5141 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Sun, 16 Jul 2023 10:54:38 +0100 Subject: [PATCH 34/72] [linting] black --- bin/check_samplesheet.py | 8 ++------ bin/repeats_bed.py | 2 -- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py index 8ecffd4..dc47094 100755 --- a/bin/check_samplesheet.py +++ b/bin/check_samplesheet.py @@ -90,9 +90,7 @@ def _validate_accession(self, row): and row[self._accession_col] and not self._regex_accession.match(row[self._accession_col]) ): - raise AssertionError( - "Accession numbers must match %s." % self._regex_accession - ) + raise AssertionError("Accession numbers must match %s." % self._regex_accession) def _validate_name(self, row): """Assert that the assembly name is non-empty and has no space.""" @@ -120,9 +118,7 @@ def validate_unique_assemblies(self): Assert that the assembly parameters are unique. """ if len(self._seen) != len(self.modified): - raise AssertionError( - "The pair of species directories and assembly names must be unique." 
- ) + raise AssertionError("The pair of species directories and assembly names must be unique.") def read_head(handle, num_lines=10): diff --git a/bin/repeats_bed.py b/bin/repeats_bed.py index d4ff4b7..e4bd8cd 100755 --- a/bin/repeats_bed.py +++ b/bin/repeats_bed.py @@ -9,7 +9,6 @@ def fasta_to_bed(fasta): - in_gap = None with gzip.open(fasta, "rt") if fasta.endswith(".gz") else open(fasta) as fh: for line in fh: @@ -39,7 +38,6 @@ def fasta_to_bed(fasta): if __name__ == "__main__": - parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("fasta", help="Input Fasta file.") parser.add_argument("--version", action="version", version="%(prog)s 1.1") From 841e44db1822e5a5473e5c4d35f089d480cb3ce7 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Sun, 16 Jul 2023 10:56:46 +0100 Subject: [PATCH 35/72] modules update --- modules.json | 12 ++++----- .../custom/dumpsoftwareversions/main.nf | 2 +- .../templates/dumpsoftwareversions.py | 1 + .../custom/getchromsizes/main.nf | 6 ++--- .../custom/getchromsizes/meta.yml | 0 .../{modules => }/samtools/dict/main.nf | 6 ++--- .../{modules => }/samtools/dict/meta.yml | 0 .../{modules => }/samtools/faidx/main.nf | 22 ++++++++++------ .../{modules => }/samtools/faidx/meta.yml | 14 ++++++++-- .../nf-core/{modules => }/tabix/bgzip/main.nf | 26 ++++++++++++++----- .../{modules => }/tabix/bgzip/meta.yml | 1 + .../nf-core/{modules => }/tabix/tabix/main.nf | 4 +-- .../{modules => }/tabix/tabix/meta.yml | 0 13 files changed, 63 insertions(+), 31 deletions(-) rename modules/nf-core/{modules => }/custom/getchromsizes/main.nf (88%) rename modules/nf-core/{modules => }/custom/getchromsizes/meta.yml (100%) rename modules/nf-core/{modules => }/samtools/dict/main.nf (86%) rename modules/nf-core/{modules => }/samtools/dict/meta.yml (100%) rename modules/nf-core/{modules => }/samtools/faidx/main.nf (59%) rename modules/nf-core/{modules => }/samtools/faidx/meta.yml (79%) rename modules/nf-core/{modules => }/tabix/bgzip/main.nf 
(65%) rename modules/nf-core/{modules => }/tabix/bgzip/meta.yml (98%) rename modules/nf-core/{modules => }/tabix/tabix/main.nf (89%) rename modules/nf-core/{modules => }/tabix/tabix/meta.yml (100%) diff --git a/modules.json b/modules.json index ca61128..92ef1de 100644 --- a/modules.json +++ b/modules.json @@ -7,32 +7,32 @@ "nf-core": { "custom/dumpsoftwareversions": { "branch": "master", - "git_sha": "76cc4938c1f6ea5c7d83fed1eeffc146787f9543", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, "custom/getchromsizes": { "branch": "master", - "git_sha": "d75b37fef175f241230ee25c485bd574c768e282", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, "samtools/dict": { "branch": "master", - "git_sha": "31409f5e727ec932f0b3a399c7a3847d70b21374", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, "samtools/faidx": { "branch": "master", - "git_sha": "3eb99152cedbb7280258858e5df08478a4670696", + "git_sha": "fd742419940e01ba1c5ecb172c3e32ec840662fe", "installed_by": ["modules"] }, "tabix/bgzip": { "branch": "master", - "git_sha": "31c0b49f6527ef196e89eca49a36af2de71711f8", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, "tabix/tabix": { "branch": "master", - "git_sha": "5e7b1ef9a5a2d9258635bcbf70fcf37dacd1b247", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] } } diff --git a/modules/nf-core/custom/dumpsoftwareversions/main.nf b/modules/nf-core/custom/dumpsoftwareversions/main.nf index 800a609..ebc8727 100644 --- a/modules/nf-core/custom/dumpsoftwareversions/main.nf +++ b/modules/nf-core/custom/dumpsoftwareversions/main.nf @@ -5,7 +5,7 @@ process CUSTOM_DUMPSOFTWAREVERSIONS { conda "bioconda::multiqc=1.14" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/multiqc:1.14--pyhdfd78af_0' : - 'quay.io/biocontainers/multiqc:1.14--pyhdfd78af_0' }" + 'biocontainers/multiqc:1.14--pyhdfd78af_0' }" input: path versions diff --git a/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py b/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py index b10e0a5..da03340 100755 --- a/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py +++ b/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py @@ -4,6 +4,7 @@ """Provide functions to merge multiple versions.yml files.""" +import yaml import platform from textwrap import dedent diff --git a/modules/nf-core/modules/custom/getchromsizes/main.nf b/modules/nf-core/custom/getchromsizes/main.nf similarity index 88% rename from modules/nf-core/modules/custom/getchromsizes/main.nf rename to modules/nf-core/custom/getchromsizes/main.nf index 8e1693d..060a2e8 100644 --- a/modules/nf-core/modules/custom/getchromsizes/main.nf +++ b/modules/nf-core/custom/getchromsizes/main.nf @@ -2,10 +2,10 @@ process CUSTOM_GETCHROMSIZES { tag "$fasta" label 'process_single' - conda (params.enable_conda ? "bioconda::samtools=1.15.1" : null) + conda "bioconda::samtools=1.16.1" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/samtools:1.15.1--h1170115_0' : - 'quay.io/biocontainers/samtools:1.15.1--h1170115_0' }" + 'https://depot.galaxyproject.org/singularity/samtools:1.16.1--h6899075_1' : + 'biocontainers/samtools:1.16.1--h6899075_1' }" input: tuple val(meta), path(fasta) diff --git a/modules/nf-core/modules/custom/getchromsizes/meta.yml b/modules/nf-core/custom/getchromsizes/meta.yml similarity index 100% rename from modules/nf-core/modules/custom/getchromsizes/meta.yml rename to modules/nf-core/custom/getchromsizes/meta.yml diff --git a/modules/nf-core/modules/samtools/dict/main.nf b/modules/nf-core/samtools/dict/main.nf similarity index 86% rename from modules/nf-core/modules/samtools/dict/main.nf rename to modules/nf-core/samtools/dict/main.nf index 91f782b..f5b469b 100644 --- a/modules/nf-core/modules/samtools/dict/main.nf +++ b/modules/nf-core/samtools/dict/main.nf @@ -2,10 +2,10 @@ process SAMTOOLS_DICT { tag "$fasta" label 'process_single' - conda (params.enable_conda ? "bioconda::samtools=1.15.1" : null) + conda "bioconda::samtools=1.17" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/samtools:1.15.1--h1170115_0' : - 'quay.io/biocontainers/samtools:1.15.1--h1170115_0' }" + 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : + 'biocontainers/samtools:1.17--h00cdaf9_0' }" input: tuple val(meta), path(fasta) diff --git a/modules/nf-core/modules/samtools/dict/meta.yml b/modules/nf-core/samtools/dict/meta.yml similarity index 100% rename from modules/nf-core/modules/samtools/dict/meta.yml rename to modules/nf-core/samtools/dict/meta.yml diff --git a/modules/nf-core/modules/samtools/faidx/main.nf b/modules/nf-core/samtools/faidx/main.nf similarity index 59% rename from modules/nf-core/modules/samtools/faidx/main.nf rename to modules/nf-core/samtools/faidx/main.nf index ef940db..59ed308 100644 --- a/modules/nf-core/modules/samtools/faidx/main.nf +++ b/modules/nf-core/samtools/faidx/main.nf @@ -2,18 +2,20 @@ process SAMTOOLS_FAIDX { tag "$fasta" label 'process_single' - conda (params.enable_conda ? "bioconda::samtools=1.15.1" : null) + conda "bioconda::samtools=1.17" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/samtools:1.15.1--h1170115_0' : - 'quay.io/biocontainers/samtools:1.15.1--h1170115_0' }" + 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : + 'biocontainers/samtools:1.17--h00cdaf9_0' }" input: tuple val(meta), path(fasta) + tuple val(meta2), path(fai) output: - tuple val(meta), path ("*.fai"), emit: fai - tuple val(meta), path ("*.gzi"), emit: gzi, optional: true - path "versions.yml" , emit: versions + tuple val(meta), path ("*.{fa,fasta}") , emit: fa , optional: true + tuple val(meta), path ("*.fai") , emit: fai, optional: true + tuple val(meta), path ("*.gzi") , emit: gzi, optional: true + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when @@ -23,8 +25,8 @@ process SAMTOOLS_FAIDX { """ samtools \\ faidx \\ - $args \\ - $fasta + $fasta \\ + $args cat <<-END_VERSIONS > versions.yml "${task.process}": @@ -33,8 +35,12 @@ process SAMTOOLS_FAIDX { """ stub: + def match = (task.ext.args =~ /-o(?:utput)?\s(.*)\s?/).findAll() + def fastacmd = match[0] ? "touch ${match[0][1]}" : '' """ + ${fastacmd} touch ${fasta}.fai + cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/modules/samtools/faidx/meta.yml b/modules/nf-core/samtools/faidx/meta.yml similarity index 79% rename from modules/nf-core/modules/samtools/faidx/meta.yml rename to modules/nf-core/samtools/faidx/meta.yml index fe2fe9a..957b25e 100644 --- a/modules/nf-core/modules/samtools/faidx/meta.yml +++ b/modules/nf-core/samtools/faidx/meta.yml @@ -3,6 +3,7 @@ description: Index FASTA file keywords: - index - fasta + - faidx tools: - samtools: description: | @@ -17,12 +18,21 @@ input: - meta: type: map description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] + Groovy Map containing reference information + e.g. 
[ id:'test' ] - fasta: type: file description: FASTA file pattern: "*.{fa,fasta}" + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'test' ] + - fai: + type: file + description: FASTA index file + pattern: "*.{fai}" output: - meta: type: map diff --git a/modules/nf-core/modules/tabix/bgzip/main.nf b/modules/nf-core/tabix/bgzip/main.nf similarity index 65% rename from modules/nf-core/modules/tabix/bgzip/main.nf rename to modules/nf-core/tabix/bgzip/main.nf index aaef785..8c47d9e 100644 --- a/modules/nf-core/modules/tabix/bgzip/main.nf +++ b/modules/nf-core/tabix/bgzip/main.nf @@ -2,10 +2,10 @@ process TABIX_BGZIP { tag "$meta.id" label 'process_single' - conda (params.enable_conda ? 'bioconda::tabix=1.11' : null) + conda "bioconda::tabix=1.11" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/tabix:1.11--hdfd78af_0' : - 'quay.io/biocontainers/tabix:1.11--hdfd78af_0' }" + 'biocontainers/tabix:1.11--hdfd78af_0' }" input: tuple val(meta), path(input) @@ -22,15 +22,29 @@ process TABIX_BGZIP { def args = task.ext.args ?: '' prefix = task.ext.prefix ?: "${meta.id}" in_bgzip = ["gz", "bgz", "bgzf"].contains(input.getExtension()) - output = in_bgzip ? input.getBaseName() : "${prefix}.${input.getExtension()}.gz" - command1 = in_bgzip ? '-d' : '-c' - command2 = in_bgzip ? '' : " > ${output}" + extension = in_bgzip ? input.getBaseName().tokenize(".")[-1] : input.getExtension() + output = in_bgzip ? "${prefix}.${extension}" : "${prefix}.${extension}.gz" + command = in_bgzip ? 
'-d' : '' // Name the index according to $prefix, unless a name has been requested if ((args.matches("(^| )-i\\b") || args.matches("(^| )--index(\$| )")) && !args.matches("(^| )-I\\b") && !args.matches("(^| )--index-name\\b")) { args = args + " -I ${output}.gzi" } """ - bgzip $command1 $args -@${task.cpus} $input $command2 + bgzip $command -c $args -@${task.cpus} $input > ${output} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + tabix: \$(echo \$(tabix -h 2>&1) | sed 's/^.*Version: //; s/ .*\$//') + END_VERSIONS + """ + + stub: + prefix = task.ext.prefix ?: "${meta.id}" + in_bgzip = ["gz", "bgz", "bgzf"].contains(input.getExtension()) + output = in_bgzip ? input.getBaseName() : "${prefix}.${input.getExtension()}.gz" + + """ + touch ${output} cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/modules/tabix/bgzip/meta.yml b/modules/nf-core/tabix/bgzip/meta.yml similarity index 98% rename from modules/nf-core/modules/tabix/bgzip/meta.yml rename to modules/nf-core/tabix/bgzip/meta.yml index 72f0abc..c3ea210 100644 --- a/modules/nf-core/modules/tabix/bgzip/meta.yml +++ b/modules/nf-core/tabix/bgzip/meta.yml @@ -44,3 +44,4 @@ authors: - "@joseespinosa" - "@drpatelh" - "@maxulysse" + - "@nvnieuwk" diff --git a/modules/nf-core/modules/tabix/tabix/main.nf b/modules/nf-core/tabix/tabix/main.nf similarity index 89% rename from modules/nf-core/modules/tabix/tabix/main.nf rename to modules/nf-core/tabix/tabix/main.nf index 21b2e79..5bf332e 100644 --- a/modules/nf-core/modules/tabix/tabix/main.nf +++ b/modules/nf-core/tabix/tabix/main.nf @@ -2,10 +2,10 @@ process TABIX_TABIX { tag "$meta.id" label 'process_single' - conda (params.enable_conda ? 'bioconda::tabix=1.11' : null) + conda "bioconda::tabix=1.11" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/tabix:1.11--hdfd78af_0' : - 'quay.io/biocontainers/tabix:1.11--hdfd78af_0' }" + 'biocontainers/tabix:1.11--hdfd78af_0' }" input: tuple val(meta), path(tab) diff --git a/modules/nf-core/modules/tabix/tabix/meta.yml b/modules/nf-core/tabix/tabix/meta.yml similarity index 100% rename from modules/nf-core/modules/tabix/tabix/meta.yml rename to modules/nf-core/tabix/tabix/meta.yml From 88210f75177f352336dc130bd0ff2d266e8169d7 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Sun, 16 Jul 2023 10:57:45 +0100 Subject: [PATCH 36/72] Fixed the Conda directives --- modules/local/ensembl_genome_download.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/ensembl_genome_download.nf b/modules/local/ensembl_genome_download.nf index 765ecb1..c47e7e0 100644 --- a/modules/local/ensembl_genome_download.nf +++ b/modules/local/ensembl_genome_download.nf @@ -5,7 +5,7 @@ process ENSEMBL_GENOME_DOWNLOAD { tag "${meta.id}" label 'process_single' - conda (params.enable_conda ? "bioconda::wget=1.18" : null) + conda "bioconda::wget=1.18" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/gnu-wget:1.18--h7132678_6' : 'quay.io/biocontainers/gnu-wget:1.18--h7132678_6' }" From 89ff640f37e1e7dfdbfd7aecb66a48d31e92e1fa Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Sun, 16 Jul 2023 10:57:58 +0100 Subject: [PATCH 37/72] Fixed the wget conda package --- modules/local/ensembl_genome_download.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/ensembl_genome_download.nf b/modules/local/ensembl_genome_download.nf index c47e7e0..8f28900 100644 --- a/modules/local/ensembl_genome_download.nf +++ b/modules/local/ensembl_genome_download.nf @@ -5,7 +5,7 @@ process ENSEMBL_GENOME_DOWNLOAD { tag "${meta.id}" label 'process_single' - conda "bioconda::wget=1.18" + conda "bioconda::gnu-wget=1.18" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/gnu-wget:1.18--h7132678_6' : 'quay.io/biocontainers/gnu-wget:1.18--h7132678_6' }" From 9bc2ce57b7c879957c67726210a05414a290d460 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Sun, 16 Jul 2023 10:58:17 +0100 Subject: [PATCH 38/72] The quay.io prefix should now be skipped --- modules/local/ensembl_genome_download.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/ensembl_genome_download.nf b/modules/local/ensembl_genome_download.nf index 8f28900..53d4e86 100644 --- a/modules/local/ensembl_genome_download.nf +++ b/modules/local/ensembl_genome_download.nf @@ -8,7 +8,7 @@ process ENSEMBL_GENOME_DOWNLOAD { conda "bioconda::gnu-wget=1.18" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/gnu-wget:1.18--h7132678_6' : - 'quay.io/biocontainers/gnu-wget:1.18--h7132678_6' }" + 'biocontainers/gnu-wget:1.18--h7132678_6' }" input: tuple val(meta), val(ftp_path), val(remote_filename_stem) From 417f6026c3b6df5a10fdcf1431cdba1d684b529f Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Sun, 16 Jul 2023 11:00:30 +0100 Subject: [PATCH 39/72] Not needed --- assets/methods_description_template.yml | 25 ------------------------- 1 file changed, 25 deletions(-) delete mode 100644 assets/methods_description_template.yml diff --git a/assets/methods_description_template.yml b/assets/methods_description_template.yml deleted file mode 100644 index 34ef135..0000000 --- a/assets/methods_description_template.yml +++ /dev/null @@ -1,25 +0,0 @@ -id: "sanger-tol-ensemblrepeatdownload-methods-description" -description: "Suggested text and references to use when describing pipeline usage within the methods section of a publication." -section_name: "sanger-tol/ensemblrepeatdownload Methods Description" -section_href: "https://github.com/sanger-tol/ensemblrepeatdownload" -plot_type: "html" -## TODO nf-core: Update the HTML below to your prefered methods description, e.g. add publication citation for this pipeline -## You inject any metadata in the Nextflow '${workflow}' object -data: | -

    Methods

    -

    Data was processed using sanger-tol/ensemblrepeatdownload v${workflow.manifest.version} ${doi_text} of the nf-core collection of workflows (Ewels et al., 2020).

    -

    The pipeline was executed with Nextflow v${workflow.nextflow.version} (Di Tommaso et al., 2017) with the following command:

    -
    ${workflow.commandLine}
    -

    References

    -
      -
    • Di Tommaso, P., Chatzou, M., Floden, E. W., Barja, P. P., Palumbo, E., & Notredame, C. (2017). Nextflow enables reproducible computational workflows. Nature Biotechnology, 35(4), 316-319. https://doi.org/10.1038/nbt.3820
    • -
    • Ewels, P. A., Peltzer, A., Fillinger, S., Patel, H., Alneberg, J., Wilm, A., Garcia, M. U., Di Tommaso, P., & Nahnsen, S. (2020). The nf-core framework for community-curated bioinformatics pipelines. Nature Biotechnology, 38(3), 276-278. https://doi.org/10.1038/s41587-020-0439-x
    • -
    -
    -
    Notes:
    -
      - ${nodoi_text} -
    • The command above does not include parameters contained in any configs or profiles that may have been used. Ensure the config file is also uploaded with your publication!
    • -
    • You should also cite all software used within this run. Check the "Software Versions" of this report to get version information.
    • -
    -
    From ca307b2df597398f4a48ff2d46e189e450a06a90 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Sun, 16 Jul 2023 11:00:01 +0100 Subject: [PATCH 40/72] This file is now identical --- .gitattributes | 2 -- .nf-core.yml | 1 - 2 files changed, 3 deletions(-) diff --git a/.gitattributes b/.gitattributes index 2d80efd..7a2dabc 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,6 +1,4 @@ *.config linguist-language=nextflow *.nf.test linguist-language=nextflow modules/nf-core/** linguist-generated -modules/sanger-tol/** linguist-generated subworkflows/nf-core/** linguist-generated -subworkflows/sanger-tol/** linguist-generated diff --git a/.nf-core.yml b/.nf-core.yml index d4e1bfc..fa0802b 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -12,7 +12,6 @@ lint: - .github/workflows/awsfulltest.yml files_unchanged: - LICENSE - - .gitattributes - .github/CONTRIBUTING.md - .github/ISSUE_TEMPLATE/bug_report.yml - .github/workflows/linting.yml From dc02dc1f4236c0014c6ca757194609ffb607ba0d Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Sun, 16 Jul 2023 11:07:04 +0100 Subject: [PATCH 41/72] we use prettier for the cff files --- .github/workflows/linting.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index a19f141..0f93aaa 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -22,7 +22,7 @@ jobs: run: npm install -g editorconfig-checker - name: Run ECLint check - run: editorconfig-checker -exclude README.md $(find .* -type f | grep -v '.git\|.py\|.md\|json\|yml\|yaml\|html\|css\|work\|.nextflow\|build\|nf_core.egg-info\|log.txt\|Makefile') + run: editorconfig-checker -exclude README.md $(find .* -type f | grep -v '.git\|.py\|.md\|cff\|json\|yml\|yaml\|html\|css\|work\|.nextflow\|build\|nf_core.egg-info\|log.txt\|Makefile') Prettier: runs-on: ubuntu-latest From f7b0b23b9e2389b9a499a18185bfb569a4a6a128 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: 
Sun, 16 Jul 2023 11:07:18 +0100 Subject: [PATCH 42/72] Now need nf-core 2.8 --- .github/workflows/linting.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 0f93aaa..7ebc310 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -84,7 +84,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install nf-core==2.5.0 + pip install nf-core==2.8 - name: Run nf-core lint env: From e66c079ff270789d768e646c23a7d9f05742a422 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Sun, 16 Jul 2023 11:13:15 +0100 Subject: [PATCH 43/72] Moved from sanger-tol to local --- .../{sanger-tol/nf-core-pipeline => local}/repeats_bed.nf | 0 subworkflows/{sanger-tol => local}/prepare_fasta.nf | 6 +++--- subworkflows/{sanger-tol => local}/prepare_repeats.nf | 8 ++++---- workflows/ensemblrepeatdownload.nf | 4 ++-- 4 files changed, 9 insertions(+), 9 deletions(-) rename modules/{sanger-tol/nf-core-pipeline => local}/repeats_bed.nf (100%) rename subworkflows/{sanger-tol => local}/prepare_fasta.nf (96%) rename subworkflows/{sanger-tol => local}/prepare_repeats.nf (93%) diff --git a/modules/sanger-tol/nf-core-pipeline/repeats_bed.nf b/modules/local/repeats_bed.nf similarity index 100% rename from modules/sanger-tol/nf-core-pipeline/repeats_bed.nf rename to modules/local/repeats_bed.nf diff --git a/subworkflows/sanger-tol/prepare_fasta.nf b/subworkflows/local/prepare_fasta.nf similarity index 96% rename from subworkflows/sanger-tol/prepare_fasta.nf rename to subworkflows/local/prepare_fasta.nf index 13b6859..c615e63 100644 --- a/subworkflows/sanger-tol/prepare_fasta.nf +++ b/subworkflows/local/prepare_fasta.nf @@ -2,9 +2,9 @@ // Prepare all the indexes for a Fasta file // -include { CUSTOM_GETCHROMSIZES } from '../../modules/nf-core/modules/custom/getchromsizes/main' -include { SAMTOOLS_DICT } from '../../modules/nf-core/modules/samtools/dict/main' 
-include { TABIX_BGZIP } from '../../modules/nf-core/modules/tabix/bgzip/main' +include { CUSTOM_GETCHROMSIZES } from '../../modules/nf-core/custom/getchromsizes/main' +include { SAMTOOLS_DICT } from '../../modules/nf-core/samtools/dict/main' +include { TABIX_BGZIP } from '../../modules/nf-core/tabix/bgzip/main' workflow PREPARE_FASTA { diff --git a/subworkflows/sanger-tol/prepare_repeats.nf b/subworkflows/local/prepare_repeats.nf similarity index 93% rename from subworkflows/sanger-tol/prepare_repeats.nf rename to subworkflows/local/prepare_repeats.nf index 64d23f1..df42c52 100644 --- a/subworkflows/sanger-tol/prepare_repeats.nf +++ b/subworkflows/local/prepare_repeats.nf @@ -3,10 +3,10 @@ // and prepare indexes for it // -include { REPEATS_BED } from '../../modules/sanger-tol/nf-core-pipeline/repeats_bed' -include { TABIX_BGZIP } from '../../modules/nf-core/modules/tabix/bgzip/main' -include { TABIX_TABIX as TABIX_TABIX_CSI } from '../../modules/nf-core/modules/tabix/tabix/main' -include { TABIX_TABIX as TABIX_TABIX_TBI } from '../../modules/nf-core/modules/tabix/tabix/main' +include { REPEATS_BED } from '../../modules/local/repeats_bed' +include { TABIX_BGZIP } from '../../modules/nf-core/tabix/bgzip/main' +include { TABIX_TABIX as TABIX_TABIX_CSI } from '../../modules/nf-core/tabix/tabix/main' +include { TABIX_TABIX as TABIX_TABIX_TBI } from '../../modules/nf-core/tabix/tabix/main' workflow PREPARE_REPEATS { diff --git a/workflows/ensemblrepeatdownload.nf b/workflows/ensemblrepeatdownload.nf index 11a43ae..80729a1 100644 --- a/workflows/ensemblrepeatdownload.nf +++ b/workflows/ensemblrepeatdownload.nf @@ -20,8 +20,8 @@ WorkflowEnsemblrepeatdownload.initialise(params, log) // include { DOWNLOAD } from '../subworkflows/local/download' include { PARAMS_CHECK } from '../subworkflows/local/params_check' -include { PREPARE_FASTA } from '../subworkflows/sanger-tol/prepare_fasta' -include { PREPARE_REPEATS } from '../subworkflows/sanger-tol/prepare_repeats' +include { 
PREPARE_FASTA } from '../subworkflows/local/prepare_fasta' +include { PREPARE_REPEATS } from '../subworkflows/local/prepare_repeats' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From 35b1251983179a2c0a521db0d7e5698c7d8a70e6 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Sun, 16 Jul 2023 11:22:20 +0100 Subject: [PATCH 44/72] We don't use MultiQC --- tower.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/tower.yml b/tower.yml index 787aedf..c61323c 100644 --- a/tower.yml +++ b/tower.yml @@ -1,5 +1,3 @@ reports: - multiqc_report.html: - display: "MultiQC HTML report" samplesheet.csv: display: "Auto-created samplesheet with collated metadata and FASTQ paths" From 0d89fe279b05dc266f6efb4f177a172ce251c9b6 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Mon, 31 Jul 2023 21:39:37 +0000 Subject: [PATCH 45/72] The template was erroneously hardcoding sanger-tol/readmapping --- assets/slackreport.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/assets/slackreport.json b/assets/slackreport.json index 043d02f..7fb3dbd 100644 --- a/assets/slackreport.json +++ b/assets/slackreport.json @@ -3,7 +3,7 @@ { "fallback": "Plain-text summary of the attachment.", "color": "<% if (success) { %>good<% } else { %>danger<%} %>", - "author_name": "sanger-tol/readmapping v${version} - ${runName}", + "author_name": "sanger-tol/ensemblrepeatdownload v${version} - ${runName}", "author_icon": "https://www.nextflow.io/docs/latest/_static/favicon.ico", "text": "<% if (success) { %>Pipeline completed successfully!<% } else { %>Pipeline completed with errors<% } %>", "fields": [ From 2fb30e0bd452c6e65b1dc5bd661638cd4e63efe8 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Mon, 31 Jul 2023 21:45:37 +0000 Subject: [PATCH 46/72] Added a redirection notice to the pipeline website --- docs/usage.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/usage.md b/docs/usage.md index 51b47e9..9ae7870 100644 --- 
a/docs/usage.md +++ b/docs/usage.md @@ -1,5 +1,9 @@ # sanger-tol/ensemblrepeatdownload: Usage +## :warning: Please read this documentation on the sanger-tol website: [https://pipelines.tol.sanger.ac.uk/ensemblrepeatdownload/usage](https://pipelines.tol.sanger.ac.uk/ensemblrepeatdownload/usage) + +> _Documentation of pipeline parameters is generated automatically from the pipeline schema and can no longer be found in markdown files._ + ## Introduction The pipeline downloads Ensembl repeat annotations for one of multiple assemblies. From 04fbd9175aa719689a6d11ab0285d04b5a624c27 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Mon, 31 Jul 2023 21:55:46 +0000 Subject: [PATCH 47/72] The convention is now to put all the pipeline_info files in per-pipeline sub-directories --- docs/output.md | 2 +- lib/NfcoreTemplate.groovy | 2 +- nextflow.config | 2 +- nextflow_schema.json | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/output.md b/docs/output.md index 555b7d2..a3deef9 100644 --- a/docs/output.md +++ b/docs/output.md @@ -193,7 +193,7 @@ the directory structure includes the assembly name, e.g. `gfLaeSulp1.1`, and all ### Pipeline information -- `pipeline_info/` +- `pipeline_info/ensemblrepeatdownload/` - Reports generated by Nextflow: `execution_report.html`, `execution_timeline.html`, `execution_trace.txt` and `pipeline_dag.dot`/`pipeline_dag.svg`. - Reports generated by the pipeline: `pipeline_report.html`, `pipeline_report.txt` and `software_versions.yml`. The `pipeline_report*` files will only be present if the `--email` / `--email_on_fail` parameters are used when running the pipeline. - Reformatted samplesheet files used as input to the pipeline: `samplesheet.valid.csv`. 
diff --git a/lib/NfcoreTemplate.groovy b/lib/NfcoreTemplate.groovy index c534916..85785ef 100755 --- a/lib/NfcoreTemplate.groovy +++ b/lib/NfcoreTemplate.groovy @@ -132,7 +132,7 @@ class NfcoreTemplate { } // Write summary e-mail HTML to a file - def output_d = new File("${params.outdir}/pipeline_info/") + def output_d = new File("${params.tracedir}/") if (!output_d.exists()) { output_d.mkdirs() } diff --git a/nextflow.config b/nextflow.config index 24d7f1e..b5eb902 100644 --- a/nextflow.config +++ b/nextflow.config @@ -18,7 +18,7 @@ params { // Boilerplate options outdir = 'results' - tracedir = "${params.outdir}/pipeline_info" + tracedir = "${params.outdir}/pipeline_info/ensemblrepeatdownload" publish_dir_mode = 'copy' email = null email_on_fail = null diff --git a/nextflow_schema.json b/nextflow_schema.json index 7ed8fae..5df12d5 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -205,7 +205,7 @@ "tracedir": { "type": "string", "description": "Directory to keep pipeline Nextflow logs and reports.", - "default": "${params.outdir}/pipeline_info", + "default": "${params.outdir}/pipeline_info/ensemblrepeatdownload", "fa_icon": "fas fa-cogs", "hidden": true }, From c1bb5e080401ab38b0918a4bf0a41591e91c8220 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Fri, 4 Aug 2023 10:21:19 +0000 Subject: [PATCH 48/72] Updated the DOI --- CITATION.cff | 2 +- README.md | 4 ++-- lib/WorkflowMain.groovy | 2 +- nextflow.config | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/CITATION.cff b/CITATION.cff index d607c6c..9494493 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -18,7 +18,7 @@ authors: affiliation: Wellcome Sanger Institute identifiers: - type: doi - value: 10.5281/zenodo.7183380 + value: 10.5281/zenodo.7183379 repository-code: "https://github.com/sanger-tol/ensemblrepeatdownload" license: MIT commit: TODO diff --git a/README.md b/README.md index 146cc6f..4fc0f35 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ -[![Cite with 
Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.7183380-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.7183380) +[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.7183379-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.7183379) [![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A522.10.1-23aa62.svg)](https://www.nextflow.io/) [![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/) @@ -72,7 +72,7 @@ For further information or help, don't hesitate to get in touch on the [Slack `# ## Citations -If you use sanger-tol/ensemblrepeatdownload for your analysis, please cite it using the following doi: [10.5281/zenodo.7183380](https://doi.org/10.5281/zenodo.7183380) +If you use sanger-tol/ensemblrepeatdownload for your analysis, please cite it using the following doi: [10.5281/zenodo.7183379](https://doi.org/10.5281/zenodo.7183379) An extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file. 
diff --git a/lib/WorkflowMain.groovy b/lib/WorkflowMain.groovy index a694c53..72851a4 100755 --- a/lib/WorkflowMain.groovy +++ b/lib/WorkflowMain.groovy @@ -12,7 +12,7 @@ class WorkflowMain { public static String citation(workflow) { return "If you use ${workflow.manifest.name} for your analysis please cite:\n\n" + "* The pipeline\n" + - " https://doi.org/10.5281/zenodo.7183380\n\n" + + " https://doi.org/10.5281/zenodo.7183379\n\n" + "* The nf-core framework\n" + " https://doi.org/10.1038/s41587-020-0439-x\n\n" + "* Software dependencies\n" + diff --git a/nextflow.config b/nextflow.config index b5eb902..f3a71b2 100644 --- a/nextflow.config +++ b/nextflow.config @@ -205,7 +205,7 @@ manifest { mainScript = 'main.nf' nextflowVersion = '!>=22.10.1' version = '1.1dev' - doi = '' + doi = '10.5281/zenodo.7183379' } // Load modules.config for DSL2 module specific options From 46d52134686b675a9d1cab732aeb7ae4c95b4246 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Fri, 4 Aug 2023 10:21:39 +0000 Subject: [PATCH 49/72] Fixed the README --- README.md | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 4fc0f35..a01384c 100644 --- a/README.md +++ b/README.md @@ -19,14 +19,6 @@ **sanger-tol/ensemblrepeatdownload** is a pipeline that downloads repeat annotations from Ensembl into a Tree of Life directory structure. -The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It uses Docker/Singularity containers making installation trivial and results highly reproducible. The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. 
Where possible, these processes have been submitted to and installed from [nf-core/modules](https://github.com/nf-core/modules) in order to make them available to all nf-core pipelines, and to everyone within the Nextflow community! - -On release, automated continuous integration tests run the pipeline on a full-sized dataset on the GitHub CI infrastructure. This ensures that the pipeline runs in a third-party environment, and has sensible resource allocation defaults set to run on real-world datasets. - -## Pipeline summary - -## Overview - The pipeline takes a CSV file that contains assembly accession number, Ensembl species names (as they may differ from Tree of Life ones !), output directories. Assembly accession numbers are optional too. If missing, the pipeline assumes it can be retrieved from files named `ACCESSION` in the standard location on disk. The pipeline downloads the repeat annotation as the masked Fasta file and a BED file. @@ -56,13 +48,16 @@ nextflow run sanger-tol/ensemblrepeatdownload --input $PWD/assets/samplesheet.cs > provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; > see [docs](https://nf-co.re/usage/configuration#custom-configuration-files). -## Documentation - -The sanger-tol/ensemblrepeatdownload pipeline comes with documentation about the pipeline [usage](docs/usage.md) and [output](docs/output.md). +The pipeline also supports bulk downloads through a sample-sheet. +More information about this mode on our [pipeline website](https://pipelines.tol.sanger.ac.uk/ensemblrepeatdownload/usage). ## Credits -sanger-tol/ensemblrepeatdownload was originally written by @muffato. +sanger-tol/ensemblrepeatdownload was originally written by [Matthieu Muffato](https://github.com/muffato). + +We thank the following people for their assistance in the development of this pipeline: + +- [Priyanka Surana](https://github.com/priyanka-surana) for providing reviews. 
## Contributions and Support From e45b9b1bbb620b66690ce9b4783bf701bd933935 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Fri, 4 Aug 2023 10:29:54 +0000 Subject: [PATCH 50/72] Added the code of conduct --- .nf-core.yml | 1 - CODE_OF_CONDUCT.md | 111 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 111 insertions(+), 1 deletion(-) create mode 100644 CODE_OF_CONDUCT.md diff --git a/.nf-core.yml b/.nf-core.yml index fa0802b..2fac3e0 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -1,7 +1,6 @@ repository_type: pipeline lint: files_exist: - - CODE_OF_CONDUCT.md - assets/multiqc_config.yml - assets/nf-core-ensemblrepeatdownload_logo_light.png - conf/igenomes.config diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..f4fd052 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,111 @@ +# Code of Conduct at nf-core (v1.0) + +## Our Pledge + +In the interest of fostering an open, collaborative, and welcoming environment, we as contributors and maintainers of nf-core, pledge to making participation in our projects and community a harassment-free experience for everyone, regardless of: + +- Age +- Body size +- Familial status +- Gender identity and expression +- Geographical location +- Level of experience +- Nationality and national origins +- Native language +- Physical and neurological ability +- Race or ethnicity +- Religion +- Sexual identity and orientation +- Socioeconomic status + +Please note that the list above is alphabetised and is therefore not ranked in any order of preference or importance. + +## Preamble + +> Note: This Code of Conduct (CoC) has been drafted by the nf-core Safety Officer and been edited after input from members of the nf-core team and others. "We", in this document, refers to the Safety Officer and members of the nf-core core team, both of whom are deemed to be members of the nf-core community and are therefore required to abide by this Code of Conduct. 
This document will be amended periodically to keep it up-to-date, and in case of any dispute, the most current version will apply. + +An up-to-date list of members of the nf-core core team can be found [here](https://nf-co.re/about). Our current safety officer is Renuka Kudva. + +nf-core is a young and growing community that welcomes contributions from anyone with a shared vision for [Open Science Policies](https://www.fosteropenscience.eu/taxonomy/term/8). Open science policies encompass inclusive behaviours and we strive to build and maintain a safe and inclusive environment for all individuals. + +We have therefore adopted this code of conduct (CoC), which we require all members of our community and attendees in nf-core events to adhere to in all our workspaces at all times. Workspaces include but are not limited to Slack, meetings on Zoom, Jitsi, YouTube live etc. + +Our CoC will be strictly enforced and the nf-core team reserve the right to exclude participants who do not comply with our guidelines from our workspaces and future nf-core activities. + +We ask all members of our community to help maintain a supportive and productive workspace and to avoid behaviours that can make individuals feel unsafe or unwelcome. Please help us maintain and uphold this CoC. + +Questions, concerns or ideas on what we can include? Contact safety [at] nf-co [dot] re + +## Our Responsibilities + +The safety officer is responsible for clarifying the standards of acceptable behavior and is expected to take appropriate and fair corrective action in response to any instances of unacceptable behaviour. + +The safety officer in consultation with the nf-core core team has the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. 
+ +Members of the core team or the safety officer who violate the CoC will be required to recuse themselves pending investigation. They will not have access to any reports of the violations and be subject to the same actions as others in violation of the CoC. + +## When and where does this Code of Conduct apply? + +Participation in the nf-core community is contingent on following these guidelines in all our workspaces and events. This includes but is not limited to the following listed alphabetically and therefore in no order of preference: + +- Communicating with an official project email address. +- Communicating with community members within the nf-core Slack channel. +- Participating in hackathons organised by nf-core (both online and in-person events). +- Participating in collaborative work on GitHub, Google Suite, community calls, mentorship meetings, email correspondence. +- Participating in workshops, training, and seminar series organised by nf-core (both online and in-person events). This applies to events hosted on web-based platforms such as Zoom, Jitsi, YouTube live etc. +- Representing nf-core on social media. This includes both official and personal accounts. + +## nf-core cares 😊 + +nf-core's CoC and expectations of respectful behaviours for all participants (including organisers and the nf-core team) include but are not limited to the following (listed in alphabetical order): + +- Ask for consent before sharing another community member’s personal information (including photographs) on social media. +- Be respectful of differing viewpoints and experiences. We are all here to learn from one another and a difference in opinion can present a good learning opportunity. +- Celebrate your accomplishments at events! (Get creative with your use of emojis 🎉 🥳 💯 🙌 !) +- Demonstrate empathy towards other community members. (We don’t all have the same amount of time to dedicate to nf-core. 
If tasks are pending, don’t hesitate to gently remind members of your team. If you are leading a task, ask for help if you feel overwhelmed.) +- Engage with and enquire after others. (This is especially important given the geographically remote nature of the nf-core community, so let’s do this the best we can) +- Focus on what is best for the team and the community. (When in doubt, ask) +- Graciously accept constructive criticism, yet be unafraid to question, deliberate, and learn. +- Introduce yourself to members of the community. (We’ve all been outsiders and we know that talking to strangers can be hard for some, but remember we’re interested in getting to know you and your visions for open science!) +- Show appreciation and **provide clear feedback**. (This is especially important because we don’t see each other in person and it can be harder to interpret subtleties. Also remember that not everyone understands a certain language to the same extent as you do, so **be clear in your communications to be kind.**) +- Take breaks when you feel like you need them. +- Using welcoming and inclusive language. (Participants are encouraged to display their chosen pronouns on Zoom or in communication on Slack.) + +## nf-core frowns on 😕 + +The following behaviours from any participants within the nf-core community (including the organisers) will be considered unacceptable under this code of conduct. Engaging or advocating for any of the following could result in expulsion from nf-core workspaces. + +- Deliberate intimidation, stalking or following and sustained disruption of communication among participants of the community. This includes hijacking shared screens through actions such as using the annotate tool in conferencing software such as Zoom. +- “Doxing” i.e. posting (or threatening to post) another person’s personal identifying information online. +- Spamming or trolling of individuals on social media. 
+- Use of sexual or discriminatory imagery, comments, or jokes and unwelcome sexual attention. +- Verbal and text comments that reinforce social structures of domination related to gender, gender identity and expression, sexual orientation, ability, physical appearance, body size, race, age, religion or work experience. + +### Online Trolling + +The majority of nf-core interactions and events are held online. Unfortunately, holding events online comes with the added issue of online trolling. This is unacceptable; reports of such behaviour will be taken very seriously, and perpetrators will be excluded from activities immediately. + +All community members are required to ask members of the group they are working within for explicit consent prior to taking screenshots of individuals during video calls. + +## Procedures for Reporting CoC violations + +If someone makes you feel uncomfortable through their behaviours or actions, report it as soon as possible. + +You can reach out to members of the [nf-core core team](https://nf-co.re/about) and they will forward your concerns to the safety officer(s). + +Issues directly concerning members of the core team will be dealt with by other members of the core team and the safety manager, and possible conflicts of interest will be taken into account. nf-core is also in discussions about having an ombudsperson, and details will be shared in due course. + +All reports will be handled with utmost discretion and confidentiality. 
+ +## Attribution and Acknowledgements + +- The [Contributor Covenant, version 1.4](http://contributor-covenant.org/version/1/4) +- The [OpenCon 2017 Code of Conduct](http://www.opencon2017.org/code_of_conduct) (CC BY 4.0 OpenCon organisers, SPARC and Right to Research Coalition) +- The [eLife innovation sprint 2020 Code of Conduct](https://sprint.elifesciences.org/code-of-conduct/) +- The [Mozilla Community Participation Guidelines v3.1](https://www.mozilla.org/en-US/about/governance/policies/participation/) (version 3.1, CC BY-SA 3.0 Mozilla) + +## Changelog + +### v1.0 - March 12th, 2021 + +- Complete rewrite from original [Contributor Covenant](http://contributor-covenant.org/) CoC. From e9f36b50ddf01fb43e9b4e66373c60177fef35e1 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Fri, 4 Aug 2023 10:30:16 +0000 Subject: [PATCH 51/72] Aligned with the template --- .nf-core.yml | 1 - docs/README.md | 2 -- docs/parameters.md | 70 ---------------------------------------------- 3 files changed, 73 deletions(-) delete mode 100644 docs/parameters.md diff --git a/.nf-core.yml b/.nf-core.yml index 2fac3e0..fb2371a 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -15,7 +15,6 @@ lint: - .github/ISSUE_TEMPLATE/bug_report.yml - .github/workflows/linting.yml - assets/sendmail_template.txt - - docs/README.md - lib/NfcoreTemplate.groovy nextflow_config: - manifest.name diff --git a/docs/README.md b/docs/README.md index 8b32bab..b3effd1 100644 --- a/docs/README.md +++ b/docs/README.md @@ -4,7 +4,5 @@ The sanger-tol/ensemblrepeatdownload documentation is split into the following p - [Usage](usage.md) - An overview of how the pipeline works, how to run it and a description of all of the different command-line flags. -- [Parameters](parameters.md) - - An overview of the different pipeline options and how to use them. - [Output](output.md) - An overview of the different results produced by the pipeline and how to interpret them. 
diff --git a/docs/parameters.md b/docs/parameters.md deleted file mode 100644 index 3b8a1a0..0000000 --- a/docs/parameters.md +++ /dev/null @@ -1,70 +0,0 @@ -# sanger-tol/ensemblrepeatdownload pipeline parameters - -Pipeline that downloads repeats annotations from Ensembl into the Tree of Life directory structure - -## Input/output options - -Define where the pipeline should find input data and save output data. - -| Parameter | Description | Type | Default | Required | Hidden | -| ---------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------- | ------- | -------- | ------ | -| `assembly_accession` | Accession number of the assembly to download. Typically of the form `GCA_*.*` | `string` | | | | -| `ensembl_species_name` | Name of the species, _as used by Ensembl_. Note: it may differ from Tree of Life's | `string` | | | | -| `annotation_method` | Method used to annotate the genome. Typically `ensembl`, `braker`, etc. | `string` | | | | -| `outdir` | The output directory where the results will be saved. Not considered for sample-sheet entries that have an absolute path. | `string` | results | | | -| `input` | Path to comma-separated file containing information about the assemblies to download. Used for bulk download of many assemblies.
    HelpThe file has to be a | - -comma-separated file with five columns, and a header row. The columns names must be `species_dir`, `assembly_name`, `ensembl_species_name`, and `annotation_method`. An additional `assembly_accession` column can -be provided too.
    | `string` | | | | -| `ftp_root` | Root location of the Ensembl FTP, in which all annotations can be found. Access protocol is actually not limited to FTP, and we use HTTPS by default. | `string` | -https://ftp.ensembl.org/pub/rapid-release/species | | True | -| `email` | Email address for completion summary.
    HelpSet this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow -exits. If set in your user config file (`~/.nextflow/config`) then you don't need to specify this on the command line for every run.
    | `string` | | | True | - -## Institutional config options - -Parameters used to describe centralised config profiles. These should not be edited. - -| Parameter | Description | Type | Default | Required | Hidden | -| ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------- | ------- | -------- | ------ | -| `custom_config_version` | Git commit id for Institutional configs. | `string` | master | | True | -| `custom_config_base` | Base directory for Institutional configs.
    HelpIf you're running offline, Nextflow will not be able to fetch the institutional config files from the | -| internet. If you don't need them, then this is not a problem. If you do need them, you should download the files from the repo and tell Nextflow where to find them with this parameter.
    | -| `string` | https://raw.githubusercontent.com/nf-core/configs/master | | True | -| `config_profile_name` | Institutional config name. | `string` | | | True | -| `config_profile_description` | Institutional config description. | `string` | | | True | -| `config_profile_contact` | Institutional config contact information. | `string` | | | True | -| `config_profile_url` | Institutional config URL link. | `string` | | | True | - -## Max job request options - -Set the top limit for requested resources for any single job. - -| Parameter | Description | Type | Default | Required | Hidden | -| ------------------------------------------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------ | ------- | -------- | ------ | -| `max_cpus` | Maximum number of CPUs that can be requested for any single job.
    HelpUse to set an upper-limit for the CPU requirement for each process. Should be an integer | -| e.g. `--max_cpus 1`
    | `integer` | 16 | | True | -| `max_memory` | Maximum amount of memory that can be requested for any single job.
    HelpUse to set an upper-limit for the memory requirement for each process. Should be a | -| string in the format integer-unit e.g. `--max_memory '8.GB'`
    | `string` | 128.GB | | True | -| `max_time` | Maximum amount of time that can be requested for any single job.
    HelpUse to set an upper-limit for the time requirement for each process. Should be a string in | -| the format integer-unit e.g. `--max_time '2.h'`
    | `string` | 240.h | | True | - -## Generic options - -Less common options for the pipeline, typically set in a config file. - -| Parameter | Description | Type | Default | Required | Hidden | -| ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------- | ------------------------------ | -------- | ------ | -| `help` | Display help text. | `boolean` | | | True | -| `publish_dir_mode` | Method used to save pipeline results to output directory.
    HelpThe Nextflow `publishDir` option specifies which intermediate files should be saved to the | -| output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.
    | -| `string` | copy | | True | -| `email_on_fail` | Email address for completion summary, only when pipeline fails.
    HelpAn email address to send a summary email to when the pipeline is completed - ONLY sent | -| if the pipeline does not exit successfully.
    | `string` | | | True | -| `plaintext_email` | Send plain-text email instead of HTML. | `boolean` | | | True | -| `monochrome_logs` | Do not use coloured log outputs. | `boolean` | | | True | -| `tracedir` | Directory to keep pipeline Nextflow logs and reports. | `string` | ${params.outdir}/pipeline_info | | True | -| `validate_params` | Boolean whether to validate parameters against the schema at runtime | `boolean` | True | | True | -| `show_hidden_params` | Show all params when using `--help`
    HelpBy default, parameters set as _hidden_ in the schema are not shown on the command line when a user runs with | -| `--help`. Specifying this option will tell the pipeline to show all parameters.
    | `boolean` | | | True | -| `enable_conda` | Run this workflow with Conda. You can also use '-profile conda' instead of providing this parameter. | `boolean` | | | True | From 4f560658e0fe7879b637b4584c3cf0d16de10ab3 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Fri, 4 Aug 2023 11:19:31 +0000 Subject: [PATCH 52/72] Added some instructions about fetching the Ensembl parameters --- docs/usage.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/docs/usage.md b/docs/usage.md index 9ae7870..316fa04 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -25,6 +25,22 @@ nextflow run sanger-tol/ensemblrepeatdownload -profile singularity --ensembl_spe This will launch the pipeline and download the assembly of `Noctua_fimbriata` accession `GCA_905163415.1` into the `Noctua_fimbriata_repeats/` directory, which will be created if needed. +Those parameters can be retrieved by browsing the [Ensembl Rapid Release](https://rapid.ensembl.org/) site. + +- Go to the [species list](https://rapid.ensembl.org/info/about/species.html) and click on the + annotation link of your species of interest. +- From the URL, e.g. `https://ftp.ensembl.org/pub/rapid-release/species/Noctua_fimbriata/GCA_905163425.1/braker/geneset/`, + extract the species name, the assembly accession, and the annotation method. + +> [!WARNING] +> Only the _Rapid Release_ site is currently supported, not the other sites. + +Current annotation methods include: + +- `ensembl` for Ensembl's own annotation pipeline +- `braker` for [BRAKER2](https://academic.oup.com/nargab/article/3/1/lqaa108/6066535) +- `refseq` for [RefSeq](https://academic.oup.com/nar/article/49/D1/D1020/6018440) + ## Bulk download The pipeline can download multiple assemblies at once, by providing them in a `.csv` file through the `--input` parameter. 
From c6d887479409c3b73cf007b0e21390c1dc7d0080 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Fri, 4 Aug 2023 11:22:07 +0000 Subject: [PATCH 53/72] Added the farm tests (through Tower) --- .github/workflows/sangerfulltest.yml | 37 ++++++++++++++++++++++++++++ .github/workflows/sangertest.yml | 23 +++++++++++++++++ nextflow.config | 1 + 3 files changed, 61 insertions(+) create mode 100644 .github/workflows/sangerfulltest.yml create mode 100644 .github/workflows/sangertest.yml diff --git a/.github/workflows/sangerfulltest.yml b/.github/workflows/sangerfulltest.yml new file mode 100644 index 0000000..850d4ae --- /dev/null +++ b/.github/workflows/sangerfulltest.yml @@ -0,0 +1,37 @@ +name: sanger-tol LSF full size tests + +on: + push: + branches: + - main + - dev + workflow_dispatch: +jobs: + run-tower: + name: Run LSF full size tests + runs-on: ubuntu-latest + steps: + - name: Sets env vars for push + run: | + echo "REVISION=${{ github.sha }}" >> $GITHUB_ENV + if: github.event_name == 'push' + + - name: Sets env vars for workflow_dispatch + run: | + echo "REVISION=${{ github.sha }}" >> $GITHUB_ENV + if: github.event_name == 'workflow_dispatch' + + - name: Launch workflow via tower + uses: nf-core/tower-action@v2 + with: + workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} + access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} + compute_env: ${{ secrets.TOWER_COMPUTE_ENV }} + pipeline: ${{ github.repository }} + revision: ${{ env.REVISION }} + workdir: ${{ secrets.TOWER_WORKDIR_PARENT }}/work/${{ github.repository }}/work-${{ env.REVISION }} + parameters: | + { + "outdir": "${{ secrets.TOWER_WORKDIR_PARENT }}/results/${{ github.repository }}/results-${{ env.REVISION }}", + } + profiles: test_full,sanger,singularity,cleanup diff --git a/.github/workflows/sangertest.yml b/.github/workflows/sangertest.yml new file mode 100644 index 0000000..d6d7be5 --- /dev/null +++ b/.github/workflows/sangertest.yml @@ -0,0 +1,23 @@ +name: sanger-tol LSF tests + +on: + workflow_dispatch: 
+jobs: + run-tower: + name: Run LSF tests + runs-on: ubuntu-latest + steps: + - name: Launch workflow via tower + uses: nf-core/tower-action@v2 + with: + workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} + access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} + compute_env: ${{ secrets.TOWER_COMPUTE_ENV }} + pipeline: ${{ github.repository }} + revision: ${{ github.sha }} + workdir: ${{ secrets.TOWER_WORKDIR_PARENT }}/work/${{ github.repository }}/work-${{ github.sha }} + parameters: | + { + "outdir": "${{ secrets.TOWER_WORKDIR_PARENT }}/results/${{ github.repository }}/results-${{ github.sha }}", + } + profiles: test,sanger,singularity,cleanup diff --git a/nextflow.config b/nextflow.config index f3a71b2..6ab0b46 100644 --- a/nextflow.config +++ b/nextflow.config @@ -69,6 +69,7 @@ try { profiles { + cleanup { cleanup = true } debug { dumpHashes = true process.beforeScript = 'echo $HOSTNAME' From 44126ff359a742d4d32f9e995bb826567cb687d1 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Fri, 4 Aug 2023 12:30:09 +0000 Subject: [PATCH 54/72] Conda syntax --- modules/local/repeats_bed.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/repeats_bed.nf b/modules/local/repeats_bed.nf index 8f66714..205c59e 100644 --- a/modules/local/repeats_bed.nf +++ b/modules/local/repeats_bed.nf @@ -4,7 +4,7 @@ process REPEATS_BED { tag "$genome" label 'process_single' - conda (params.enable_conda ? "conda-forge::python=3.9.1" : null) + conda "conda-forge::python=3.9.1" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/python:3.9--1' : 'quay.io/biocontainers/python:3.9--1' }" From bee8e2d02ce41d2ea9ad8b4339f4a85740ff4ee0 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Fri, 4 Aug 2023 12:39:53 +0000 Subject: [PATCH 55/72] Fixed the example --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index a01384c..6192016 100644 --- a/README.md +++ b/README.md @@ -39,8 +39,8 @@ Steps involved: Now, you can run the pipeline using: -```bash -nextflow run sanger-tol/ensemblrepeatdownload --input $PWD/assets/samplesheet.csv --outdir -profile +```console +nextflow run sanger-tol/ensemblrepeatdownload --ensembl_species_name Noctua_fimbriata --assembly_accession GCA_905163415.1 --annotation_method braker ``` > **Warning:** From f986da9fd9dc7f590871cb7f10280dab89a1dbab Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Fri, 4 Aug 2023 12:40:38 +0000 Subject: [PATCH 56/72] Fixed the example --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 6192016..a7f0a36 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ Steps involved: > to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) > with `-profile test` before running the workflow on actual data. 
-Now, you can run the pipeline using: +The easiest is to provide the exact species name and versions of the assembly and annotations like this: ```console nextflow run sanger-tol/ensemblrepeatdownload --ensembl_species_name Noctua_fimbriata --assembly_accession GCA_905163415.1 --annotation_method braker From ff4794646e1147ca4c29756033b675523052387c Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Fri, 11 Aug 2023 02:35:24 +0000 Subject: [PATCH 57/72] The warning should be on the README too --- README.md | 3 +++ docs/usage.md | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index a7f0a36..fd94389 100644 --- a/README.md +++ b/README.md @@ -30,6 +30,9 @@ Steps involved: - Extract the coordinates of the masked regions into a BED file. - Compress and index the BED file with `bgzip` and `tabix`. +> [!WARNING] +> Only the _Rapid Release_ site is currently supported, not the other Ensembl sites. + ## Usage > **Note** diff --git a/docs/usage.md b/docs/usage.md index 316fa04..b487b49 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -33,7 +33,7 @@ Those parameters can be retrieved by browsing the [Ensembl Rapid Release](https: extract the species name, the assembly accession, and the annotation method. > [!WARNING] -> Only the _Rapid Release_ site is currently supported, not the other sites. +> Only the _Rapid Release_ site is currently supported, not the other Ensembl sites. 
Current annotation methods include: From b177af0ebd6396e9bae7bc17c1b9c451884d6a2d Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Fri, 11 Aug 2023 09:44:28 +0000 Subject: [PATCH 58/72] Latest nomenclature --- .github/workflows/{sangertest.yml => sanger_test.yml} | 0 .github/workflows/{sangerfulltest.yml => sanger_test_full.yml} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename .github/workflows/{sangertest.yml => sanger_test.yml} (100%) rename .github/workflows/{sangerfulltest.yml => sanger_test_full.yml} (100%) diff --git a/.github/workflows/sangertest.yml b/.github/workflows/sanger_test.yml similarity index 100% rename from .github/workflows/sangertest.yml rename to .github/workflows/sanger_test.yml diff --git a/.github/workflows/sangerfulltest.yml b/.github/workflows/sanger_test_full.yml similarity index 100% rename from .github/workflows/sangerfulltest.yml rename to .github/workflows/sanger_test_full.yml From d649e8174ebbca5619dbd35cda455d7ccbcacb85 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Fri, 4 Aug 2023 12:30:01 +0000 Subject: [PATCH 59/72] Now use "outdir", which is equivalent to a ToL analysis directory --- assets/samplesheet.csv | 10 ++++----- assets/schema_input.json | 9 ++------ bin/check_samplesheet.py | 34 +++++++++--------------------- docs/usage.md | 31 ++++++++++++--------------- nextflow_schema.json | 2 +- subworkflows/local/download.nf | 9 ++++---- subworkflows/local/params_check.nf | 14 ++++-------- workflows/ensemblrepeatdownload.nf | 2 +- 8 files changed, 42 insertions(+), 69 deletions(-) diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv index c902d01..0ef842f 100644 --- a/assets/samplesheet.csv +++ b/assets/samplesheet.csv @@ -1,5 +1,5 @@ -species_dir,assembly_name,assembly_accession,ensembl_species_name,annotation_method -25g/data/echinoderms/Asterias_rubens,eAstRub1.3,GCA_902459465.3,Asterias_rubens,refseq -25g/data/insects/Osmia_bicornis,iOsmBic2.1,GCA_907164935.1,Osmia_bicornis_bicornis,ensembl 
-25g/data/insects/Osmia_bicornis,iOsmBic2.1_alternate_haplotype,GCA_907164925.1,Osmia_bicornis_bicornis,ensembl -darwin/data/insects/Noctua_fimbriata,ilNocFimb1.1,GCA_905163415.1,Noctua_fimbriata,braker +outdir,assembly_accession,ensembl_species_name,annotation_method +Asterias_rubens/eAstRub1.3,GCA_902459465.3,Asterias_rubens,refseq +Osmia_bicornis/iOsmBic2.1,GCA_907164935.1,Osmia_bicornis_bicornis,ensembl +Osmia_bicornis/iOsmBic2.1_alternate_haplotype,GCA_907164925.1,Osmia_bicornis_bicornis,ensembl +Noctua_fimbriata/ilNocFimb1.1,GCA_905163415.1,Noctua_fimbriata,braker diff --git a/assets/schema_input.json b/assets/schema_input.json index 05ce4ca..f79333b 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -7,16 +7,11 @@ "items": { "type": "object", "properties": { - "species_dir": { + "outdir": { "type": "string", "pattern": "^\\S+$", "errorMessage": "Species directory must be provided and exist" }, - "assembly_name": { - "type": "string", - "pattern": "^\\S+$", - "errorMessage": "Assembly name must be provided and cannot contain spaces" - }, "assembly_accession": { "type": "string", "pattern": "^GCA_[0-9]{9}\\.[0-9]+$", @@ -33,6 +28,6 @@ "errorMessage": "The annotation method must be provided and cannot contain spaces" } }, - "required": ["species_dir", "assembly_name", "ensembl_species_name", "annotation_method"] + "required": ["outdir", "assembly_accession", "ensembl_species_name", "annotation_method"] } } diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py index dc47094..76609c7 100755 --- a/bin/check_samplesheet.py +++ b/bin/check_samplesheet.py @@ -28,8 +28,7 @@ class RowChecker: def __init__( self, - dir_col="species_dir", - name_col="assembly_name", + dir_col="outdir", accession_col="assembly_accession", ensembl_name_col="ensembl_species_name", method_col="annotation_method", @@ -40,9 +39,7 @@ def __init__( Args: dir_col (str): The name of the column that contains the species directory - (default "species_dir"). 
- name_col (str): The name of the column that contains the assembly name - (default "assembly_name"). + (default "outdir"). accession_col (str): The name of the column that contains the accession number (default "assembly_accession"). ensembl_name_col(str): The name of the column that contains the Ensembl species name @@ -53,7 +50,6 @@ def __init__( """ super().__init__(**kwargs) self._dir_col = dir_col - self._name_col = name_col self._accession_col = accession_col self._ensembl_name_col = ensembl_name_col self._method_col = method_col @@ -71,11 +67,10 @@ def validate_and_transform(self, row): """ self._validate_dir(row) - self._validate_name(row) self._validate_accession(row) self._validate_ensembl_name(row) self._validate_method(row) - self._seen.add(row[self._name_col]) + self._seen.add(row[self._accession_col]) self.modified.append(row) def _validate_dir(self, row): @@ -85,20 +80,11 @@ def _validate_dir(self, row): def _validate_accession(self, row): """Assert that the accession number exists and matches the expected nomenclature.""" - if ( - self._accession_col in row - and row[self._accession_col] - and not self._regex_accession.match(row[self._accession_col]) - ): + if not row[self._accession_col]: + raise AssertionError("Assembly accession is required.") + if not self._regex_accession.match(row[self._accession_col]): raise AssertionError("Accession numbers must match %s." 
% self._regex_accession) - def _validate_name(self, row): - """Assert that the assembly name is non-empty and has no space.""" - if not row[self._name_col]: - raise AssertionError("Assembly name is required.") - if " " in row[self._name_col]: - raise AssertionError("Accession names must not contain whitespace.") - def _validate_ensembl_name(self, row): """Assert that the Ensembl name is non-empty and has no space.""" if not row[self._ensembl_name_col]: @@ -168,13 +154,13 @@ def check_samplesheet(file_in, file_out): Example: This function checks that the samplesheet follows the following structure:: - species_dir,assembly_name,ensembl_species_name,annotation_method - 25g/data/echinoderms/Asterias_rubens,eAstRub1.3,Asterias_rubens,ensembl + outdir,assembly_accession,ensembl_species_name,annotation_method + Asterias_rubens/eAstRub1.3,GCA_902459465.3,Asterias_rubens,refseq """ required_columns = { - "species_dir", - "assembly_name", + "outdir", + "assembly_accession", "ensembl_species_name", "annotation_method", } diff --git a/docs/usage.md b/docs/usage.md index b487b49..dfcbe30 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -44,23 +44,22 @@ Current annotation methods include: ## Bulk download The pipeline can download multiple assemblies at once, by providing them in a `.csv` file through the `--input` parameter. -It has to be a comma-separated file with four or five columns, and a header row as shown in the examples below. +It has to be a comma-separated file with four columns, and a header row as shown in the examples below. 
```console -species_dir,assembly_name,assembly_accession,ensembl_species_name,annotation_method -25g/data/echinoderms/Asterias_rubens,eAstRub1.3,GCA_902459465.3,Asterias_rubens,refseq -25g/data/insects/Osmia_bicornis,iOsmBic2.1,GCA_907164935.1,Osmia_bicornis_bicornis,ensembl -25g/data/insects/Osmia_bicornis,iOsmBic2.1_alternate_haplotype,GCA_907164925.1,Osmia_bicornis_bicornis,ensembl -darwin/data/insects/Noctua_fimbriata,ilNocFimb1.1,GCA_905163415.1,Noctua_fimbriata,braker +outdir,assembly_accession,ensembl_species_name,annotation_method +Asterias_rubens/eAstRub1.3,GCA_902459465.3,Asterias_rubens,refseq +Osmia_bicornis/iOsmBic2.1,GCA_907164935.1,Osmia_bicornis_bicornis,ensembl +Osmia_bicornis/iOsmBic2.1_alternate_haplotype,GCA_907164925.1,Osmia_bicornis_bicornis,ensembl +Noctua_fimbriata/ilNocFimb1.1,GCA_905163415.1,Noctua_fimbriata,braker ``` -| Column | Description | -| ---------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `species_dir` | Output directory for this species (evaluated from `--outdir` if a relative path). Analysis results are deposited in `analysis/$assembly_name/`. | -| `assembly_name` | Name of the assembly. Used to build the actual output directory. | -| `assembly_accession` | (Optional). Accession number of the assembly to download. Typically of the form `GCA_*.*`. If missing, the pipeline will infer it from the ACCESSION file. | -| `ensembl_species_name` | Name of the species, _as used by Ensembl_. Note: it may differ from Tree of Life's | -| `annotation_method` | Name of the method of the geneset that holds the repeat annotation. 
| +| Column | Description | +| ---------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------- | +| `outdir` | Output directory for this annotation (evaluated from `--outdir` if a relative path). Analysis results are in a sub-directory `repeats/ensembl`. | +| `assembly_accession` | Accession number of the assembly to download. Typically of the form `GCA_*.*`. If missing, the pipeline will infer it from the ACCESSION file. | +| `ensembl_species_name` | Name of the species, _as used by Ensembl_. Note: it may differ from Tree of Life's | +| `annotation_method` | Name of the method of the geneset that holds the repeat annotation. | A samplesheet may contain: @@ -69,9 +68,7 @@ A samplesheet may contain: - only one row per assembly All samplesheet columns correspond exactly to their corresponding command-line parameter, -except `species_dir` which overrides or complements `--oudir`. -`species_dir` is used to fit the output of this pipeline into a directory structure compatible with the other pipelines -from Sanger Tree of Life. +except `outdir` which, if a relative path, is interpreted under `--oudir`. An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. @@ -105,8 +102,8 @@ nextflow run sanger-tol/ensemblrepeatdownload -profile docker -params-file param with `params.yaml` containing: ```yaml -ensembl_species_name: "Noctua_fimbriata" assembly_accession: "GCA_905163415.1" +ensembl_species_name: "Noctua_fimbriata" annotation_method: "braker" outdir: "./results/" ``` diff --git a/nextflow_schema.json b/nextflow_schema.json index 5df12d5..a68ea51 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -44,7 +44,7 @@ "pattern": "^\\S+\\.csv$", "schema": "assets/schema_input.json", "description": "Path to comma-separated file containing information about the assemblies to download. 
Used for bulk download of many assemblies.", - "help_text": "The file has to be a comma-separated file with five columns, and a header row. The columns names must be `species_dir`, `assembly_name`, `ensembl_species_name`, and `annotation_method`. An additional `assembly_accession` column can be provided too.", + "help_text": "The file has to be a comma-separated file with four columns, and a header row. The columns names must be `outdir`, `assembly_accession`, `ensembl_species_name`, and `annotation_method`.", "fa_icon": "fas fa-file-csv" }, "ftp_root": { diff --git a/subworkflows/local/download.nf b/subworkflows/local/download.nf index 0ac37d5..1d16f58 100644 --- a/subworkflows/local/download.nf +++ b/subworkflows/local/download.nf @@ -8,7 +8,7 @@ include { ENSEMBL_GENOME_DOWNLOAD } from '../../modules/local/ensembl_geno workflow DOWNLOAD { take: - repeat_params // tuple(analysis_dir, ensembl_species_name, assembly_accession, annotation_method) + repeat_params // tuple(outdir, assembly_accession, ensembl_species_name, annotation_method) main: @@ -16,9 +16,10 @@ workflow DOWNLOAD { ch_genome_fasta = ENSEMBL_GENOME_DOWNLOAD ( repeat_params.map { - species_dir, - ensembl_species_name, + + outdir, assembly_accession, + ensembl_species_name, annotation_method -> [ @@ -26,7 +27,7 @@ workflow DOWNLOAD { [ id: assembly_accession + ".masked.ensembl", method: annotation_method, - outdir: species_dir, + outdir: outdir, ], // e.g. 
https://ftp.ensembl.org/pub/rapid-release/species/Agriopis_aurantiaria/GCA_914767915.1/braker/genome/Agriopis_aurantiaria-GCA_914767915.1-softmasked.fa.gz diff --git a/subworkflows/local/params_check.nf b/subworkflows/local/params_check.nf index 9337087..2be381e 100644 --- a/subworkflows/local/params_check.nf +++ b/subworkflows/local/params_check.nf @@ -19,19 +19,13 @@ workflow PARAMS_CHECK { if (samplesheet) { SAMPLESHEET_CHECK ( file(samplesheet, checkIfExists: true) ) .csv - // Provides species_dir, assembly_name, assembly_accession (optional), ensembl_species_name, and annotation_method + // Provides outdir, assembly_accession, ensembl_species_name, and annotation_method .splitCsv ( header:true, sep:',' ) - .map { - // If assembly_accession is missing, load the accession number from file, following the Tree of Life directory structure - it["assembly_accession"] ? it : it + [ - assembly_accession: file("${it["species_dir"]}/assembly/release/${it["assembly_name"]}/insdc/ACCESSION", checkIfExists: true).text.trim(), - ] - } // Convert to tuple, as required by the download subworkflow .map { [ - (it["species_dir"].startsWith("/") ? "" : outdir + "/") + "${it["species_dir"]}/analysis/${it["assembly_name"]}", - it["ensembl_species_name"], + (it["outdir"].startsWith("/") ? 
"" : outdir + "/") + it["outdir"], it["assembly_accession"], + it["ensembl_species_name"], it["annotation_method"], ] } .set { ch_inputs } @@ -45,7 +39,7 @@ workflow PARAMS_CHECK { emit: - ensembl_params = ch_inputs // tuple(analysis_dir, ensembl_species_name, assembly_accession, annotation_method) + ensembl_params = ch_inputs // tuple(outdir, ensembl_species_name, assembly_accession, annotation_method) versions = ch_versions // channel: versions.yml } diff --git a/workflows/ensemblrepeatdownload.nf b/workflows/ensemblrepeatdownload.nf index 80729a1..685ce63 100644 --- a/workflows/ensemblrepeatdownload.nf +++ b/workflows/ensemblrepeatdownload.nf @@ -48,8 +48,8 @@ workflow ENSEMBLREPEATDOWNLOAD { params.input, Channel.of( [ - params.ensembl_species_name, params.assembly_accession, + params.ensembl_species_name, params.annotation_method, ] ), From ec56e9b2788769cbdd0d71d31b1b33d0ac0ff710 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Mon, 14 Aug 2023 13:30:43 +0000 Subject: [PATCH 60/72] Use the latest version of the Tower action --- .github/workflows/sanger_test.yml | 2 +- .github/workflows/sanger_test_full.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/sanger_test.yml b/.github/workflows/sanger_test.yml index d6d7be5..e7007c0 100644 --- a/.github/workflows/sanger_test.yml +++ b/.github/workflows/sanger_test.yml @@ -8,7 +8,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Launch workflow via tower - uses: nf-core/tower-action@v2 + uses: seqeralabs/action-tower-launch@v2 with: workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} diff --git a/.github/workflows/sanger_test_full.yml b/.github/workflows/sanger_test_full.yml index 850d4ae..fc11b07 100644 --- a/.github/workflows/sanger_test_full.yml +++ b/.github/workflows/sanger_test_full.yml @@ -22,7 +22,7 @@ jobs: if: github.event_name == 'workflow_dispatch' - name: Launch workflow via tower - uses: nf-core/tower-action@v2 + 
uses: seqeralabs/action-tower-launch@v2 with: workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} From 6b9a55d630e427fbc24e707a671d459bc8d8b425 Mon Sep 17 00:00:00 2001 From: Guoying Qi <729395+gq1@users.noreply.github.com> Date: Wed, 16 Aug 2023 14:23:02 +0100 Subject: [PATCH 61/72] Update sanger_test.yml --- .github/workflows/sanger_test.yml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/workflows/sanger_test.yml b/.github/workflows/sanger_test.yml index e7007c0..e69af1e 100644 --- a/.github/workflows/sanger_test.yml +++ b/.github/workflows/sanger_test.yml @@ -13,7 +13,6 @@ jobs: workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} compute_env: ${{ secrets.TOWER_COMPUTE_ENV }} - pipeline: ${{ github.repository }} revision: ${{ github.sha }} workdir: ${{ secrets.TOWER_WORKDIR_PARENT }}/work/${{ github.repository }}/work-${{ github.sha }} parameters: | @@ -21,3 +20,10 @@ jobs: "outdir": "${{ secrets.TOWER_WORKDIR_PARENT }}/results/${{ github.repository }}/results-${{ github.sha }}", } profiles: test,sanger,singularity,cleanup + + - uses: actions/upload-artifact@v3 + with: + name: Tower debug log file + path: | + tower_action_*.log + tower_action_*.json From 0a98259fb39653d442400ce15bf23199c991c71f Mon Sep 17 00:00:00 2001 From: Guoying Qi <729395+gq1@users.noreply.github.com> Date: Wed, 16 Aug 2023 14:23:36 +0100 Subject: [PATCH 62/72] Update sanger_test_full.yml --- .github/workflows/sanger_test_full.yml | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/.github/workflows/sanger_test_full.yml b/.github/workflows/sanger_test_full.yml index fc11b07..267081d 100644 --- a/.github/workflows/sanger_test_full.yml +++ b/.github/workflows/sanger_test_full.yml @@ -27,7 +27,6 @@ jobs: workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} compute_env: ${{ secrets.TOWER_COMPUTE_ENV }} - 
pipeline: ${{ github.repository }} revision: ${{ env.REVISION }} workdir: ${{ secrets.TOWER_WORKDIR_PARENT }}/work/${{ github.repository }}/work-${{ env.REVISION }} parameters: | @@ -35,3 +34,11 @@ jobs: "outdir": "${{ secrets.TOWER_WORKDIR_PARENT }}/results/${{ github.repository }}/results-${{ env.REVISION }}", } profiles: test_full,sanger,singularity,cleanup + + - uses: actions/upload-artifact@v3 + with: + name: Tower debug log file + path: | + tower_action_*.log + tower_action_*.json + From fbdb59f2eb6c02ae54e86c85b69564ed58b1176a Mon Sep 17 00:00:00 2001 From: Guoying Qi <729395+gq1@users.noreply.github.com> Date: Wed, 16 Aug 2023 14:58:41 +0100 Subject: [PATCH 63/72] Update sanger_test_full.yml --- .github/workflows/sanger_test_full.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/sanger_test_full.yml b/.github/workflows/sanger_test_full.yml index 267081d..e028c6b 100644 --- a/.github/workflows/sanger_test_full.yml +++ b/.github/workflows/sanger_test_full.yml @@ -41,4 +41,3 @@ jobs: path: | tower_action_*.log tower_action_*.json - From fd3553185a60dc679567e4fa963870485f0a4c4e Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Fri, 18 Aug 2023 15:17:54 +0000 Subject: [PATCH 64/72] New tolsoft details --- .github/workflows/fix-linting.yml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/fix-linting.yml b/.github/workflows/fix-linting.yml index e06cc54..010b9f1 100644 --- a/.github/workflows/fix-linting.yml +++ b/.github/workflows/fix-linting.yml @@ -8,21 +8,21 @@ jobs: # Only run if comment is on a PR with the main repo, and if it contains the magic keywords if: > contains(github.event.comment.html_url, '/pull/') && - contains(github.event.comment.body, '@nf-core-bot fix linting') && - github.repository == 'sanger-tol/ensemblrepeatdownload' + contains(github.event.comment.body, '@sanger-tolsoft fix linting') && + github.repository == 'sanger-tol/variantcalling' runs-on: ubuntu-latest 
steps: - # Use the @nf-core-bot token to check out so we can push later + # Use the @sanger-tolsoft token to check out so we can push later - uses: actions/checkout@v3 with: - token: ${{ secrets.nf_core_bot_auth_token }} + token: ${{ secrets.sangertolsoft_access_token }} # Action runs on the issue comment, so we don't get the PR by default # Use the gh cli to check out the PR - name: Checkout Pull Request run: gh pr checkout ${{ github.event.issue.number }} env: - GITHUB_TOKEN: ${{ secrets.nf_core_bot_auth_token }} + GITHUB_TOKEN: ${{ secrets.sangertolsoft_access_token }} - uses: actions/setup-node@v3 @@ -46,8 +46,8 @@ jobs: - name: Commit & push changes if: steps.prettier_status.outputs.result == 'fail' run: | - git config user.email "core@nf-co.re" - git config user.name "nf-core-bot" + git config user.email "105875386+sanger-tolsoft@users.noreply.github.com" + git config user.name "sanger-tolsoft" git config push.default upstream git add . git status From e6c2599e164535134d7cf0d7e88a943b6e0eee0f Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Fri, 18 Aug 2023 15:29:12 +0000 Subject: [PATCH 65/72] Fixed the repository name --- .github/workflows/fix-linting.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/fix-linting.yml b/.github/workflows/fix-linting.yml index 010b9f1..9e23ae4 100644 --- a/.github/workflows/fix-linting.yml +++ b/.github/workflows/fix-linting.yml @@ -9,7 +9,7 @@ jobs: if: > contains(github.event.comment.html_url, '/pull/') && contains(github.event.comment.body, '@sanger-tolsoft fix linting') && - github.repository == 'sanger-tol/variantcalling' + github.repository == 'sanger-tol/ensemblrepeatdownload' runs-on: ubuntu-latest steps: # Use the @sanger-tolsoft token to check out so we can push later From ccd9211aecf5a18e132f834159425badaa60fede Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Sat, 23 Sep 2023 11:17:36 +0000 Subject: [PATCH 66/72] Made the relation between --outdir and the samplesheet 
outdir clearer --- docs/usage.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/usage.md b/docs/usage.md index dfcbe30..234b6f3 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -16,7 +16,7 @@ The pipeline accepts command-one line arguments to specify a single genome to do - `--ensembl_species_name`: How Ensembl name the species (as it can be different from Tree of Life), - `--assembly_accession`: The accession number of the assembly, - `--annotation_method`: The annotation method of the geneset related to the repeat annotation (requirement of Ensembl's data-model), -- `--outdir`: Where to download the data. +- `--outdir`: Where the pipeline runtime information will be stored, and where data will be downloaded (except if absolute paths are given in the samplesheet). ```console nextflow run sanger-tol/ensemblrepeatdownload -profile singularity --ensembl_species_name Noctua_fimbriata --assembly_accession GCA_905163415.1 --annotation_method braker --outdir Noctua_fimbriata_repeats From eb9f6bff5f519f3930ccdb06ba591d00710e45d9 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Fri, 24 May 2024 09:34:37 +0000 Subject: [PATCH 67/72] Version bump --- CHANGELOG.md | 4 ++-- CITATION.cff | 4 ++-- nextflow.config | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6cdaf21..cd33d3b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
-## v1.1.0 - [date] +## [[2.0.0](https://github.com/sanger-tol/ensemblrepeatdownload/releases/tag/2.0.0)] – Shadowfax the Planerider – [2024-05-24] ### `Fixed` @@ -17,7 +17,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - New command-line parameter (`--annotation_method`): required for accessing the files on the Ensembl FTP - `--outdir` is a _mandatory_ parameter -## v1.0.0 - [2022-10-19] +## [[1.0.0](https://github.com/sanger-tol/ensemblrepeatdownload/releases/tag/1.0.0)] – Gwaihir the Windlord – [2022-10-19] Initial release of sanger-tol/ensemblrepeatdownload, created with the [nf-core](https://nf-co.re/) template. diff --git a/CITATION.cff b/CITATION.cff index 9494493..a4299dd 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -2,7 +2,7 @@ # Visit https://bit.ly/cffinit to generate yours today! cff-version: 1.2.0 -title: sanger-tol/ensemblrepeatdownload v1.0.0 - Gwaihir the Windlord +title: sanger-tol/ensemblrepeatdownload message: >- If you use this software, please cite it using the metadata from this file. 
@@ -22,5 +22,5 @@ identifiers: repository-code: "https://github.com/sanger-tol/ensemblrepeatdownload" license: MIT commit: TODO -version: 1.0.0 +version: 2.0.0 date-released: "2022-10-19" diff --git a/nextflow.config b/nextflow.config index 6ab0b46..2393036 100644 --- a/nextflow.config +++ b/nextflow.config @@ -205,7 +205,7 @@ manifest { description = """Pipeline to download repeat annotations from Ensembl into a Tree of Life directory structure""" mainScript = 'main.nf' nextflowVersion = '!>=22.10.1' - version = '1.1dev' + version = '2.0.0' doi = '10.5281/zenodo.7183379' } From 97516bdc85781aa7bccdb25664a841f095564257 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Fri, 24 May 2024 09:34:45 +0000 Subject: [PATCH 68/72] Updated the output documentation --- docs/output.md | 175 +++++-------------------------------------------- 1 file changed, 15 insertions(+), 160 deletions(-) diff --git a/docs/output.md b/docs/output.md index a3deef9..2eae20c 100644 --- a/docs/output.md +++ b/docs/output.md @@ -13,8 +13,7 @@ The directories comply with Tree of Life's canonical directory structure. The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: -- [Gene annotation files](#gene-annotation-files) - Assembly files, either straight from the NCBI FTP, or indices built on them -- [Repeat annotation files](#repeat-annotation-files) - Files corresponding to analyses run (by the NCBI) on the original assembly, e.g repeat masking +- [Repeat annotation files](#repeat-annotation-files) - Files corresponding to repeat annotation produced by Ensembl - [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution All data files are compressed (and indexed) with `bgzip`. @@ -23,170 +22,26 @@ All Fasta files are indexed with `samtools faidx`, which allows accessing any re All BED files are indexed with tabix in both TBI and CSI modes, unless the sequences are too large. 
-### Gene annotation files - -Here are the files you can expect in the `gene/` sub-directory. - -```text -/lustre/scratch124/tol/projects/darwin/data/insects/Noctua_fimbriata/ -└── analysis - └── ilNocFimb1.1 - └── gene - └── braker2 - ├── GCA_905163415.1.braker2.2022_03.cdna.fa.gz - ├── GCA_905163415.1.braker2.2022_03.cdna.fa.gz.dict - ├── GCA_905163415.1.braker2.2022_03.cdna.fa.gz.fai - ├── GCA_905163415.1.braker2.2022_03.cdna.fa.gz.gzi - ├── GCA_905163415.1.braker2.2022_03.cdna.seq_length.tsv - ├── GCA_905163415.1.braker2.2022_03.cds.fa.gz - ├── GCA_905163415.1.braker2.2022_03.cds.fa.gz.dict - ├── GCA_905163415.1.braker2.2022_03.cds.fa.gz.fai - ├── GCA_905163415.1.braker2.2022_03.cds.fa.gz.gzi - ├── GCA_905163415.1.braker2.2022_03.cds.seq_length.tsv - ├── GCA_905163415.1.braker2.2022_03.gff3.gz - ├── GCA_905163415.1.braker2.2022_03.gff3.gz.csi - ├── GCA_905163415.1.braker2.2022_03.gff3.gz.gzi - ├── GCA_905163415.1.braker2.2022_03.pep.fa.gz - ├── GCA_905163415.1.braker2.2022_03.pep.fa.gz.dict - ├── GCA_905163415.1.braker2.2022_03.pep.fa.gz.fai - ├── GCA_905163415.1.braker2.2022_03.pep.fa.gz.gzi - └── GCA_905163415.1.braker2.2022_03.pep.seq_length.tsv -``` - -The directory structure includes the assembly name, e.g. `fParRan2.2`, and all files are named after the assembly accession, e.g. `GCA_900634625.2`. -The file name (and the directory name) includes the annotation method and date. Current methods are: - -- `braker2` for [BRAKER2](https://academic.oup.com/nargab/article/3/1/lqaa108/6066535) -- `ensembl` for Ensembl's own annotation pipeline - -The `.seq_length.tsv` files are tabular analogous to the common `chrom.sizes`. They contain the sequence names and their lengths. - -_The following documentation is copied from Ensembl's FTP_ - -#### Fasta files - -Ensembl provide gene sequences in FASTA format in three files. The 'cdna' file contains -transcript sequences for all types of gene (including, for example, -pseudogenes and RNA genes). 
The 'cds' file contains the DNA sequences -of the coding regions of protein-coding genes. The 'pep' file contains -the amino acid sequences of protein-coding genes. - -The headers in the 'cdna' FASTA files have the format: - -```text -> :::: gene: gene_biotype: transcript_biotype: [gene_symbol:] [description:] -``` - -Example 'cdna' header: - -```text ->ENSZVIT00000000002.1 cdna UG_Zviv_1:LG1:3600:22235:-1 gene:ENSZVIG00000000002.1 gene_biotype:protein_coding transcript_biotype:protein_coding -``` - -The headers in the 'cds' FASTA files have the format: - -```text -> :::: gene: gene_biotype: transcript_biotype: [gene_symbol:] [description:] -``` - -Example 'cds' header: - -```text ->ENSZVIT00000000002.1 cds UG_Zviv_1:LG1:5289:19862:-1 gene:ENSZVIG00000000002.1 gene_biotype:protein_coding transcript_biotype:protein_coding -``` - -The headers in the 'pep' FASTA files have the format: - -```text -> :::: gene: transcript: gene_biotype: transcript_biotype: [gene_symbol:] [description:] -``` - -Example 'pep' header: - -```text ->ENSZVIP00000000002.1 pep UG_Zviv_1:LG1:5289:19862:-1 gene:ENSZVIG00000000002.1 transcript:ENSZVIT00000000002.1 gene_biotype:protein_coding transcript_biotype:protein_coding -``` - -Stable IDs for genes, transcripts, and proteins include a version -suffix. Gene symbols and descriptions are not available for all genes. - -#### GFF3 file - -A GFF3 ([specification](https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md)) file is also provided. -GFF3 files are validated using [GenomeTools](http://genometools.org). - -The 'type' of gene features is: - -- `gene` for protein-coding genes -- `ncRNA_gene` for RNA genes -- `pseudogene` for pseudogenes - -The 'type' of transcript features is: - -- `mRNA` for protein-coding transcripts -- a specific type or RNA transcript such as `snoRNA` or `lnc_RNA` -- `pseudogenic_transcript` for pseudogenes - -All transcripts are linked to `exon` features. 
-Protein-coding transcripts are linked to `CDS`, `five_prime_UTR`, and -`three_prime_UTR` features. - -Attributes for feature types: -(italics indicate data which is not available for all features) - -- region types: - - `ID`: Unique identifier, format `:` - - _`Alias`_: A comma-separated list of aliases, usually including the - `INSDC` accession - - _`Is_circular`_: Flag to indicate circular regions -- gene types: - - `ID`: Unique identifier, format `gene:` - - `biotype`: Ensembl biotype, e.g. `protein_coding`, `pseudogene` - - `gene_id`: Ensembl gene stable ID - - `version`: Ensembl gene version - - _`Name`_: Gene name - - _`description`_: Gene description -- transcript types: - - `ID`: Unique identifier, format `transcript:` - - `Parent`: Gene identifier, format `gene:` - - `biotype`: Ensembl biotype, e.g. `protein_coding`, `pseudogene` - - `transcript_id`: Ensembl transcript stable ID - - `version`: Ensembl transcript version - - _`Note`_: If the transcript sequence has been edited (i.e. differs - from the genomic sequence), the edits are described in a note. -- exon - - `Parent`: Transcript identifier, format `transcript:` - - `exon_id`: Ensembl exon stable ID - - `version`: Ensembl exon version - - `constitutive`: Flag to indicate if exon is present in all - transcripts - - `rank`: Integer that show the 5'->3' ordering of exons -- CDS - - `ID`: Unique identifier, format `CDS:` - - `Parent`: Transcript identifier, format `transcript:` - - `protein_id`: Ensembl protein stable ID - - `version`: Ensembl protein version - ### Repeat annotation files -Here are the files you can expect in the `repeats/` sub-directory. +Here are the files you can expect in the results directory. 
```text -analysis -└── gfLaeSulp1.1 - └── repeats - └── ncbi - ├── GCA_927399515.1.masked.ncbi.bed.gz - ├── GCA_927399515.1.masked.ncbi.bed.gz.gzi - ├── GCA_927399515.1.masked.ncbi.bed.gz.tbi - ├── GCA_927399515.1.masked.ncbi.fasta.dict - ├── GCA_927399515.1.masked.ncbi.fasta.gz - ├── GCA_927399515.1.masked.ncbi.fasta.gz.fai - └── GCA_927399515.1.masked.ncbi.fasta.gz.gzi +└── repeats + └── ensembl + ├── GCA_907164925.1.masked.ensembl.bed.gz + ├── GCA_907164925.1.masked.ensembl.bed.gz.csi + ├── GCA_907164925.1.masked.ensembl.bed.gz.gzi + ├── GCA_907164925.1.masked.ensembl.bed.gz.tbi + ├── GCA_907164925.1.masked.ensembl.fa.dict + ├── GCA_907164925.1.masked.ensembl.fa.gz + ├── GCA_907164925.1.masked.ensembl.fa.gz.fai + ├── GCA_907164925.1.masked.ensembl.fa.gz.gzi + └── GCA_907164925.1.masked.ensembl.fa.gz.sizes ``` -They all correspond to the repeat-masking analysis run by Ensembl themselves. Like for the `assembly/` sub-directory, -the directory structure includes the assembly name, e.g. `gfLaeSulp1.1`, and all files are named after the assembly accession, e.g. `GCA_927399515.1`. +They all correspond to the repeat-masking analysis run by Ensembl themselves. +All files are named after the assembly accession, e.g. `GCA_907164925.1`. 
- `GCA_*.masked.ncbi.fasta.gz`: Masked assembly in Fasta format - `GCA_*.masked.ncbi.bed.gz`: BED file with the coordinates of the regions masked by the Ensembl pipeline From 9b0138b2e28ae92dc2a75c961a5a08d1bc0975cb Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Fri, 24 May 2024 09:43:03 +0000 Subject: [PATCH 69/72] Changelog updated --- CHANGELOG.md | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cd33d3b..53ace04 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,18 +5,41 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [[2.0.0](https://github.com/sanger-tol/ensemblrepeatdownload/releases/tag/2.0.0)] – Shadowfax the Planerider – [2024-05-24] -### `Fixed` +This version supports the new FTP structure of Ensembl + +### Enhancements & fixes - Support for the updated directory structure of the Ensembl FTP - Relative paths in the sample-sheet are now evaluated from the `--outdir` parameter - Memory usage rules for `samtools dict` - Appropriate use of `tabix`'s TBI and CSI indexing, depending on the sequence lengths - -### `Added` - - New command-line parameter (`--annotation_method`): required for accessing the files on the Ensembl FTP - `--outdir` is a _mandatory_ parameter +### Parameters + +| Old parameter | New parameter | +| ------------- | ------------------- | +| | --annotation_method | + +_In the samplesheet_ + +| Old parameter | New parameter | +| ------------- | ----------------- | +| species_dir | outdir | +| | annotation_method | +| assembly_name | | + +> **NB:** Parameter has been **updated** if both old and new parameter information is present.
    **NB:** Parameter has been **added** if just the new parameter information is present.
    **NB:** Parameter has been **removed** if new parameter information isn't present. + +### Software dependencies + +Note, since the pipeline is using Nextflow DSL2, each process will be run with its own [Biocontainer](https://biocontainers.pro/#/registry). This means that on occasion it is entirely possible for the pipeline to be using different versions of the same tool. However, the overall software dependency changes compared to the last release have been listed below for reference. Only `Docker` or `Singularity` containers are supported, `conda` is not supported. + +| Dependency | Old version | New version | +| ---------- | ----------- | ----------- | +| multiqc | 1.13 | 1.14 | + ## [[1.0.0](https://github.com/sanger-tol/ensemblrepeatdownload/releases/tag/1.0.0)] – Gwaihir the Windlord – [2022-10-19] Initial release of sanger-tol/ensemblrepeatdownload, created with the [nf-core](https://nf-co.re/) template. From dd8c9ca3e3e38e95ea80214994418ccb01b4562e Mon Sep 17 00:00:00 2001 From: Guoying Qi <729395+gq1@users.noreply.github.com> Date: Mon, 3 Jun 2024 14:49:56 +0100 Subject: [PATCH 70/72] change default branch to main for PR branch checking --- .github/workflows/branch.yml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/branch.yml b/.github/workflows/branch.yml index d31fdc0..6d9b05a 100644 --- a/.github/workflows/branch.yml +++ b/.github/workflows/branch.yml @@ -1,15 +1,15 @@ name: nf-core branch protection -# This workflow is triggered on PRs to master branch on the repository -# It fails when someone tries to make a PR against the nf-core `master` branch instead of `dev` +# This workflow is triggered on PRs to main branch on the repository +# It fails when someone tries to make a PR against the nf-core `main` branch instead of `dev` on: pull_request_target: - branches: [master] + branches: [main] jobs: test: runs-on: ubuntu-latest steps: - # PRs to the nf-core repo master branch are only ok if coming 
from the nf-core repo `dev` or any `patch` branches + # PRs to the nf-core repo main branch are only ok if coming from the nf-core repo `dev` or any `patch` branches - name: Check PRs if: github.repository == 'sanger-tol/ensemblrepeatdownload' run: | @@ -22,7 +22,7 @@ jobs: uses: mshick/add-pr-comment@v1 with: message: | - ## This PR is against the `master` branch :x: + ## This PR is against the `main` branch :x: * Do not close this PR * Click _Edit_ and change the `base` to `dev` @@ -32,9 +32,9 @@ jobs: Hi @${{ github.event.pull_request.user.login }}, - It looks like this pull-request is has been made against the [${{github.event.pull_request.head.repo.full_name }}](https://github.com/${{github.event.pull_request.head.repo.full_name }}) `master` branch. - The `master` branch on nf-core repositories should always contain code from the latest release. - Because of this, PRs to `master` are only allowed if they come from the [${{github.event.pull_request.head.repo.full_name }}](https://github.com/${{github.event.pull_request.head.repo.full_name }}) `dev` branch. + It looks like this pull-request has been made against the [${{github.event.pull_request.head.repo.full_name }}](https://github.com/${{github.event.pull_request.head.repo.full_name }}) `main` branch. + The `main` branch on nf-core repositories should always contain code from the latest release. + Because of this, PRs to `main` are only allowed if they come from the [${{github.event.pull_request.head.repo.full_name }}](https://github.com/${{github.event.pull_request.head.repo.full_name }}) `dev` branch. You do not need to close this PR, you can change the target branch to `dev` by clicking the _"Edit"_ button at the top of this page. Note that even after this, the test will continue to show as failing until you push a new commit. 
From a6b54faf05673684b4e0d216dbc7a0d9a821a917 Mon Sep 17 00:00:00 2001 From: Guoying Qi <729395+gq1@users.noreply.github.com> Date: Mon, 3 Jun 2024 14:51:08 +0100 Subject: [PATCH 71/72] add one more unchanged file for nf-core linting --- .nf-core.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.nf-core.yml b/.nf-core.yml index fb2371a..ebd2195 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -16,6 +16,7 @@ lint: - .github/workflows/linting.yml - assets/sendmail_template.txt - lib/NfcoreTemplate.groovy + - .github/workflows/branch.yml nextflow_config: - manifest.name - manifest.homePage From b0f7829b01b3aa5ff5d882224e861164d28d877c Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Tue, 4 Jun 2024 08:43:08 +0000 Subject: [PATCH 72/72] Updated the release date --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 53ace04..5d6238f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [[2.0.0](https://github.com/sanger-tol/ensemblrepeatdownload/releases/tag/2.0.0)] – Shadowfax the Planerider – [2024-05-24] +## [[2.0.0](https://github.com/sanger-tol/ensemblrepeatdownload/releases/tag/2.0.0)] – Shadowfax the Planerider – [2024-06-04] This version supports the new FTP structure of Ensembl