diff --git a/.nf-core.yml b/.nf-core.yml index b1a2402..c0b070c 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -1,27 +1,27 @@ lint: files_exist: - - CODE_OF_CONDUCT.md - - assets/nf-core-ascc_logo_light.png - - docs/images/nf-core-ascc_logo_light.png - - docs/images/nf-core-ascc_logo_dark.png - - .github/ISSUE_TEMPLATE/config.yml - - .github/workflows/awstest.yml - - .github/workflows/awsfulltest.yml - - conf/igenomes.config + - CODE_OF_CONDUCT.md + - assets/nf-core-ascc_logo_light.png + - docs/images/nf-core-ascc_logo_light.png + - docs/images/nf-core-ascc_logo_dark.png + - .github/ISSUE_TEMPLATE/config.yml + - .github/workflows/awstest.yml + - .github/workflows/awsfulltest.yml + - conf/igenomes.config files_unchanged: - - CODE_OF_CONDUCT.md - - assets/nf-core-ascc_logo_light.png - - docs/images/nf-core-ascc_logo_light.png - - docs/images/nf-core-ascc_logo_dark.png - - .github/ISSUE_TEMPLATE/bug_report.yml + - CODE_OF_CONDUCT.md + - assets/nf-core-ascc_logo_light.png + - docs/images/nf-core-ascc_logo_light.png + - docs/images/nf-core-ascc_logo_dark.png + - .github/ISSUE_TEMPLATE/bug_report.yml multiqc_config: - - report_comment + - report_comment nextflow_config: - - manifest.name - - manifest.homePage + - manifest.name + - manifest.homePage nf_core_version: 2.14.1 repository_type: pipeline template: prefix: sanger-tol skip: - - igenomes + - igenomes diff --git a/CITATIONS.md b/CITATIONS.md index 53e1fe4..33fedf9 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -11,32 +11,38 @@ ## Pipeline tools - [FCS-adaptor](https://github.com/ncbi/fcs/wiki/FCS-adaptor-quickstart.) - > Astashyn, Alexander, Eric S. Tvedte, Deacon Sweeney, Victor Sapojnikov, Nathan Bouk, Victor Joukov, Eyal Mozes, et al. 2023. “FCS-Adaptor.” FCS-Adaptor. June 6, 2023. -———. 2024. “Rapid and Sensitive Detection of Genome Contamination at Scale with FCS-GX.” Genome Biology 25 (1): 60. + + > Astashyn, Alexander, Eric S. Tvedte, Deacon Sweeney, Victor Sapojnikov, Nathan Bouk, Victor Joukov, Eyal Mozes, et al. 2023. “FCS-Adaptor.” FCS-Adaptor. June 6, 2023. + > ———. 2024. “Rapid and Sensitive Detection of Genome Contamination at Scale with FCS-GX.” Genome Biology 25 (1): 60. - [Kcounter](https://github.com/apcamargo/kcounter). + > Buchfink, Benjamin, Klaus Reuter, and Hajk-Georg Drost. 2021. “Sensitive Protein Alignments at Tree-of-Life Scale Using DIAMOND.” Nature Methods 18 (4): 366–68. - Camargo, Antônio. 2020. “Kcounter.” Kcounter. February 17, 2020. https://github.com/apcamargo/kcounter. + > Camargo, Antônio. 2020. “Kcounter.” Kcounter. February 17, 2020. https://github.com/apcamargo/kcounter. - [BlobToolKit](https://github.com/sanger-tol/blobtoolkit). - > Challis, Richard, Edward Richards, Jeena Rajan, Guy Cochrane, and Mark Blaxter. 2020. “BlobToolKit - Interactive Quality Assessment of Genome Assemblies.” G3 10 (4): 1361–74. Diaz, Alexander Ramos, Zaynab Butt, Priyanka Surana, Richard Challis, Sujai Kumar, and Matthieu Muffato. 2023. “BlobToolKit Pipeline.” BlobToolKit Pipeline. May 18, 2023. + + > Challis, Richard, Edward Richards, Jeena Rajan, Guy Cochrane, and Mark Blaxter. 2020. “BlobToolKit - Interactive Quality Assessment of Genome Assemblies.” G3 10 (4): 1361–74. Diaz, Alexander Ramos, Zaynab Butt, Priyanka Surana, Richard Challis, Sujai Kumar, and Matthieu Muffato. 2023. “BlobToolKit Pipeline.” BlobToolKit Pipeline. May 18, 2023. - [Tiara](https://github.com/ibe-uw/tiara). - > Karlicki, Michał, Stanisław Antonowicz, and Anna Karnkowska. 2022. “Tiara: Deep Learning-Based Classification System for Eukaryotic Sequences.” Bioinformatics 38 (2): 344–50. + + > Karlicki, Michał, Stanisław Antonowicz, and Anna Karnkowska. 2022. “Tiara: Deep Learning-Based Classification System for Eukaryotic Sequences.” Bioinformatics 38 (2): 344–50. - [Minimap2](https://github.com/lh3/minimap2). - > Li, Heng. 2018. “Minimap2: Pairwise Alignment for Nucleotide Sequences.” Bioinformatics 34 (18): 3094–3100. + + > Li, Heng. 2018. “Minimap2: Pairwise Alignment for Nucleotide Sequences.” Bioinformatics 34 (18): 3094–3100. - [TensorFlow](https://www.tensorflow.org/) + > Martín Abadi, Ashish Agarwal, Paul Barham, Eugene Brevdo, Zhifeng Chen, Craig Citro, Greg S. Corrado, et al. 2015. “TensorFlow: Large-Scale Machine Learning on Heterogeneous Systems.” - [VecScreen](https://manpages.debian.org/testing/ncbi-tools-bin/vecscreen.1.en.html). + > NCBI. 2001. “NCBI VecScreen.” NCBI VecScreen. October 5, 2001. - [Scikit-Learn] Pedregosa, F., G. Varoquaux, A. Gramfort, and V. Michel. 2011. “Scikit-Learn: Machine Learning in Python. JMLR 12, 2825–2830 (2011).” Journal of Machine Learning Research 12 (October): 2825–30. - ## Software packaging/containerisation tools - [Anaconda](https://anaconda.com) diff --git a/README.md b/README.md index 590ed8c..1e0bb18 100644 --- a/README.md +++ b/README.md @@ -74,7 +74,6 @@ Each row represents a an assembled haplotype or organelle of the sample. The params-input yaml will need to contain the following data will be detailed [here](./docs/usage.md) - Now, you can run the pipeline using: @@ -115,4 +114,4 @@ If you would like to contribute to this pipeline, please see the [contributing g An extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file. -This pipeline uses code and infrastructure developed and maintained by the [nf-core](https://nf-co.re) community, reused here under the [MIT license](https://github.com/nf-core/tools/blob/master/LICENSE). \ No newline at end of file +This pipeline uses code and infrastructure developed and maintained by the [nf-core](https://nf-co.re) community, reused here under the [MIT license](https://github.com/nf-core/tools/blob/master/LICENSE). diff --git a/assets/btk_draft.yaml b/assets/btk_draft.yaml index 34b1b3f..0e02351 100644 --- a/assets/btk_draft.yaml +++ b/assets/btk_draft.yaml @@ -14,4 +14,4 @@ taxon: order: order_name phylum: phylum_name superkingdom: superkingdom_name - taxid: 0 \ No newline at end of file + taxid: 0 diff --git a/assets/pacbio_adaptors.fa b/assets/pacbio_adaptors.fa index 5461b9c..34826e8 100644 --- a/assets/pacbio_adaptors.fa +++ b/assets/pacbio_adaptors.fa @@ -221,4 +221,4 @@ TAGATACAGCGAGTAT >bc2095 CTACTCATACGAGTAT >bc2096 -ATGTACTAGTGAGTAT \ No newline at end of file +ATGTACTAGTGAGTAT diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv index c1c8677..c788117 100644 --- a/assets/samplesheet.csv +++ b/assets/samplesheet.csv @@ -2,4 +2,4 @@ sample,assembly_type,assembly_file asccTinyTest_V2,PRIMARY,/lustre/scratch123/tol/teams/tola/users/ea10/pipeline_testing/20240319_ascc_test_file_sizes/download_test2/asccTinyTest/assembly/Pyoeliiyoelii17XNL_assembly.fa asccTinyTest_V2,HAPLO,/lustre/scratch123/tol/resources/treeval/sanger-tol-asccthesecond/test_hap2.fa asccTinyTest_V2,MITO,/nfs/treeoflife-01/teams/tola/users/dp24/ascc/asccTinyTest_V2/organellar/Pyoeliiyoelii17XNL_mitochondrion_ncbi.fa -asccTinyTest_V2,PLASTID,/nfs/treeoflife-01/teams/tola/users/dp24/ascc/asccTinyTest_V2/organellar/Pyoeliiyoelii17XNL_apicoplast_ncbi.fa \ No newline at end of file +asccTinyTest_V2,PLASTID,/nfs/treeoflife-01/teams/tola/users/dp24/ascc/asccTinyTest_V2/organellar/Pyoeliiyoelii17XNL_apicoplast_ncbi.fa diff --git a/assets/test.yaml b/assets/test.yaml index 15f9f5c..d5e3872 100644 --- a/assets/test.yaml +++ b/assets/test.yaml @@ -5,7 +5,7 @@ pacbio_barcode_names: "bc2008,bc2009" scientific_name: "Plasmodium yoelii yoelii 17XNL" taxid: 352914 kmer_length: 7 -dimensionality_reduction_methods: ["pca","random_trees"] +dimensionality_reduction_methods: ["pca", "random_trees"] # all available methods # "pca,umap,t-sne,isomap,lle_standard,lle_hessian,lle_modified,mds,se,random_trees,kernel_pca,pca_svd,autoencoder_sigmoid,autoencoder_linear,autoencoder_selu,autoencoder_relu,nmf" nt_database_path: /lustre/scratch123/tol/teams/tola/users/ea10/pipeline_testing/20240704_blast_tiny_testdb/blastdb/ diff --git a/bin/kmer_count_dim_reduction.py b/bin/kmer_count_dim_reduction.py index fc0996c..8479c5e 100755 --- a/bin/kmer_count_dim_reduction.py +++ b/bin/kmer_count_dim_reduction.py @@ -302,7 +302,7 @@ def main(kmer_counts_file, out_folder, selected_methods, n_neighbors_setting, au "Skipping the dimensionality reduction of kmer counts, as the kmer counts table has only one row" ) # Generate an empty file to satisfy nextflow expecting a file from script finishing with no file with small output - with open(f"EMPTY_{selected_methods}_kmers_dim_reduction_embeddings.csv", 'w') as empty_file: + with open(f"EMPTY_{selected_methods}_kmers_dim_reduction_embeddings.csv", "w") as empty_file: empty_file.write("FILE TO SMALL FOR ANALYSIS") sys.exit(0) diff --git a/conf/test.config b/conf/test.config index 7124d84..cf6f1f7 100644 --- a/conf/test.config +++ b/conf/test.config @@ -25,7 +25,7 @@ params { input = "./assets/samplesheet.csv" params-file = "./assets/test.yaml" outdir = "ASCC-TEST" - include = "ALL" + include = "ALL" exclude = "btk_busco" - + } diff --git a/modules/local/run/main.nf b/modules/local/run/main.nf index cc522bc..af6ba65 100644 --- a/modules/local/run/main.nf +++ b/modules/local/run/main.nf @@ -35,4 +35,4 @@ process NEXTFLOW_RUN { output: path "results" , emit: output val process.text, emit: log -} \ No newline at end of file +} diff --git a/nextflow_schema.json b/nextflow_schema.json index 029b347..a8701dc 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,10 +10,7 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": [ - "input", - "outdir" - ], + "required": ["input", "outdir"], "properties": { "input": { "type": "string", @@ -26,27 +23,27 @@ "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row.", "fa_icon": "fas fa-file-csv" }, - "scientific_name":{ + "scientific_name": { "type": "string", "description": "Scientific name of sample according to NCBI", "fa_icon": "fas fa-input-text" }, - "taxid":{ + "taxid": { "type": "integer", "description": "TaxID according to NCBI", "fa_icon": "fas fa-hastag" }, - "reads_path":{ + "reads_path": { "type": "string", "description": "folder containing long read data files in FASTA.GZ format", "fa_icon": "fas fa-file-lines" }, - "reads_type":{ + "reads_type": { "type": "string", "description": "Type of read (hifi)", "fa_icon": "fas fa-input-text" }, - "pacbio_barcode_file":{ + "pacbio_barcode_file": { "type": "string", "format": "file-path", "exists": true, @@ -55,47 +52,47 @@ "description": "Fasta format file containing barcode data", "fa_icon": "fas fa-file-lines" }, - "pacbio_barcode_names":{ + "pacbio_barcode_names": { "type": "string", "description": "barcodes to use in run", "fa_icon": "fas fa-input-text" }, - "kmer_length":{ + "kmer_length": { "type": "number", "description": "Length of kmer to use in run", "fa_icon": "fas fa-hashtag" }, - "dimensionality_reduction_methods":{ + "dimensionality_reduction_methods": { "type": "array", "description": "dimensionality_reduction_methods to use in runs", "fa_icon": "fas fa-input-text" }, - "nt_database_path":{ + "nt_database_path": { "type": "string", "description": "folder containing nt database", "fa_icon": "fas fa-folder" }, - "nt_database_prefix":{ + "nt_database_prefix": { "type": "string", "description": "Prefix of the NT database", "fa_icon": "fas fa-input-text" }, - "nt_kraken_database_path":{ + "nt_kraken_database_path": { "type": "string", "description": "folder containing Kraken2 database", "fa_icon": "fas fa-folder" }, - "ncbi_accession_ids_folder":{ + "ncbi_accession_ids_folder": { "type": "string", "description": "folder containing accession2taxid file", "fa_icon": "fas fa-folder" }, - "ncbi_taxonomy_path":{ + "ncbi_taxonomy_path": { "type": "string", "description": "folder containing dmp files", "fa_icon": "fas fa-folder" }, - "ncbi_ranked_lineage_path":{ + "ncbi_ranked_lineage_path": { "type": "string", "format": "file-path", "exists": true, @@ -103,27 +100,27 @@ "description": "Path to rankedlineage.dmp", "fa_icon": "fas fa-file-lines" }, - "busco_lineages_folder":{ + "busco_lineages_folder": { "type": "string", "description": "BUSCO folder containing busco Lineages folder", "fa_icon": "fas fa-folder" }, - "busco_lineages":{ + "busco_lineages": { "type": "string", "description": "CSV list of lineages to use in run", "fa_icon": "fas fa-input-text" }, - "fcs_gx_database_path":{ + "fcs_gx_database_path": { "type": "string", "description": "FCS_GX folder containing database", "fa_icon": "fas fa-folder" }, - "vecscreen_database_path":{ + "vecscreen_database_path": { "type": "string", "description": "Vecscreen folder containing database", "fa_icon": "fas fa-folder" }, - "diamond_uniprot_database_path":{ + "diamond_uniprot_database_path": { "type": "string", "format": "file-path", "exists": true, @@ -131,7 +128,7 @@ "description": "Path to Uniprot Diamond DB", "fa_icon": "fas fa-file-lines" }, - "diamond_nr_database_path":{ + "diamond_nr_database_path": { "type": "string", "format": "file-path", "exists": true, @@ -139,22 +136,22 @@ "description": "Path to NR Diamond DB", "fa_icon": "fas fa-file-lines" }, - "seqkit_sliding":{ + "seqkit_sliding": { "type": "number", "description": "Length of slide for Seqkit", "fa_icon": "fas fa-hashtag" }, - "seqkit_window":{ + "seqkit_window": { "type": "number", "description": "Length of the window for Seqkit", "fa_icon": "fas fa-hashtag" }, - "n_neighbours":{ + "n_neighbours": { "type": "number", "description": "Nearest Neighbour value", "fa_icon": "fas fa-hashtag" }, - "btk_yaml":{ + "btk_yaml": { "type": "string", "format": "file-path", "exists": true, @@ -290,14 +287,7 @@ "description": "Method used to save pipeline results to output directory.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": [ - "symlink", - "rellink", - "link", - "copy", - "copyNoFollow", - "move" - ], + "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], "hidden": true }, "email_on_fail": { @@ -405,4 +395,4 @@ "$ref": "#/definitions/generic_options" } ] -} \ No newline at end of file +}