From 55d2c1596ec5c84caa74663c5f37c96e1339a371 Mon Sep 17 00:00:00 2001 From: YSims Date: Wed, 11 Oct 2023 14:13:34 +0100 Subject: [PATCH 1/7] Update ci.yml --- .github/workflows/ci.yml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9f49a14..574a38b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -68,7 +68,13 @@ jobs: run: | mkdir NT_database curl -L https://ftp.ncbi.nlm.nih.gov/blast/db/18S_fungal_sequences.tar.gz | tar -C NT_database -xzf - - + + - name: Download the subset of Diamond database + run: | + mkdir diamond + wget -c https://tolit.cog.sanger.ac.uk/test-data/resources/diamond/UP000000212_1234679_tax.dmnd -O diamond/UP000000212_1234679_tax.dmnd + + - name: Run pipeline with test data - name: Run pipeline with test data # TODO nf-core: You can customise CI pipeline run tests as required # For example: adding multiple test runs with different parameters From 7712ccd3c2df7e651c04099e399f53de768d823c Mon Sep 17 00:00:00 2001 From: YSims Date: Wed, 11 Oct 2023 15:00:39 +0100 Subject: [PATCH 2/7] Update test.yaml --- assets/github_testing/test.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/assets/github_testing/test.yaml b/assets/github_testing/test.yaml index 6d61de3..e50a84b 100755 --- a/assets/github_testing/test.yaml +++ b/assets/github_testing/test.yaml @@ -14,6 +14,8 @@ ncbi_taxonomy_path: /home/runner/work/ascc/ascc/ncbi_taxdump/ ncbi_rankedlineage_path: /home/runner/work/ascc/ascc/ncbi_taxdump/rankedlineage.dmp busco_lineages_folder: /home/runner/work/ascc/ascc/busco_database/lineages fcs_gx_database_path: /home/runner/work/ascc/ascc/FCS_gx/ +diamond_uniprot_database_path: /home/runner/work/ascc/ascc/diamond/UP000000212_1234679_tax.dmnd +diamond_nr_database_path: /home/runner/work/ascc/ascc/diamond/UP000000212_1234679_tax.dmnd seqkit: sliding: 6000 window: 100000 From 91ddd8e9a467fcdf19e51a09837149bed6bc4d05 Mon Sep 17 00:00:00 2001 From: YSims Date: Wed, 11 Oct 2023 15:05:22 +0100 Subject: [PATCH 3/7] Update yaml_input.nf --- subworkflows/local/yaml_input.nf | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/subworkflows/local/yaml_input.nf b/subworkflows/local/yaml_input.nf index fad111b..d4b4fe3 100644 --- a/subworkflows/local/yaml_input.nf +++ b/subworkflows/local/yaml_input.nf @@ -36,6 +36,8 @@ workflow YAML_INPUT { ncbi_rankedlineage_path: ( data.ncbi_rankedlineage_path ) busco_lineages_folder: ( data.busco_lineages_folder ) seqkit_values : ( data.seqkit ) + diamond_uniprot_database_path: ( data.diamond_uniprot_database_path ) + diamond_nr_database_path: ( data.diamond_nr_database_path ) } .set{ group } @@ -78,6 +80,8 @@ workflow YAML_INPUT { ncbi_rankedlineage_path = group.ncbi_rankedlineage_path busco_lineages_folder = group.busco_lineages_folder fcs_gx_database_path = group.fcs_gx_database_path + diamond_uniprot_database_path = group.diamond_uniprot_database_path + diamond_nr_database_path = group.diamond_nr_database_path seqkit_sliding = seqkit.sliding_value seqkit_window = seqkit.window_value versions = ch_versions.ifEmpty(null) From 5d87195a7d22c100525e37bd531d3bef6dd93b95 Mon Sep 17 00:00:00 2001 From: yumisims Date: Wed, 11 Oct 2023 17:11:54 +0100 Subject: [PATCH 4/7] fixed ci --- .github/workflows/ci.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 574a38b..4a3f0c5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -68,12 +68,12 @@ jobs: run: | mkdir NT_database curl -L https://ftp.ncbi.nlm.nih.gov/blast/db/18S_fungal_sequences.tar.gz | tar -C NT_database -xzf - - + - name: Download the subset of Diamond database run: | mkdir diamond - wget -c https://tolit.cog.sanger.ac.uk/test-data/resources/diamond/UP000000212_1234679_tax.dmnd -O diamond/UP000000212_1234679_tax.dmnd - + wget -c https://tolit.cog.sanger.ac.uk/test-data/resources/diamond/UP000000212_1234679_tax.dmnd -O diamond/UP000000212_1234679_tax.dmnd + - name: Run pipeline with test data - name: Run pipeline with test data # TODO nf-core: You can customise CI pipeline run tests as required From ccc1a7e3f07ed036f761475e22f8985b579582db Mon Sep 17 00:00:00 2001 From: yumisims Date: Wed, 11 Oct 2023 17:13:10 +0100 Subject: [PATCH 5/7] fixed ci --- .github/workflows/ci.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4a3f0c5..e1dbb4b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -74,7 +74,6 @@ jobs: mkdir diamond wget -c https://tolit.cog.sanger.ac.uk/test-data/resources/diamond/UP000000212_1234679_tax.dmnd -O diamond/UP000000212_1234679_tax.dmnd - - name: Run pipeline with test data - name: Run pipeline with test data # TODO nf-core: You can customise CI pipeline run tests as required # For example: adding multiple test runs with different parameters From 4b14a9bd05331bceaec7558aa26d868d70f25a5b Mon Sep 17 00:00:00 2001 From: yumisims Date: Wed, 11 Oct 2023 21:03:19 +0100 Subject: [PATCH 6/7] add vecscreen test set on ci, checkin coverage related scripts, updated yaml_input --- .github/workflows/ci.yml | 5 +++ assets/github_testing/test.yaml | 1 + assets/test.yaml | 3 ++ bin/sam_to_sorted_indexed_bam.py | 60 ++++++++++++++++++++++++++ bin/samtools_depth_average_coverage.py | 27 ++++++++++++ subworkflows/local/yaml_input.nf | 4 +- 6 files changed, 99 insertions(+), 1 deletion(-) create mode 100644 bin/sam_to_sorted_indexed_bam.py create mode 100644 bin/samtools_depth_average_coverage.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e1dbb4b..63a80f5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -74,6 +74,11 @@ jobs: mkdir diamond wget -c https://tolit.cog.sanger.ac.uk/test-data/resources/diamond/UP000000212_1234679_tax.dmnd -O diamond/UP000000212_1234679_tax.dmnd + - name: Download the vecscreen test data + run: | + mkdir vecscreen + wget -c https://tolit.cog.sanger.ac.uk/test-data/resources/vecscreen/GCA_900155405.1_PRJEB18760_genomic.fna -O vecscreen/GCA_900155405.1_PRJEB18760_genomic.fna + - name: Run pipeline with test data # TODO nf-core: You can customise CI pipeline run tests as required # For example: adding multiple test runs with different parameters diff --git a/assets/github_testing/test.yaml b/assets/github_testing/test.yaml index e50a84b..022e9f6 100755 --- a/assets/github_testing/test.yaml +++ b/assets/github_testing/test.yaml @@ -16,6 +16,7 @@ busco_lineages_folder: /home/runner/work/ascc/ascc/busco_database/lineages fcs_gx_database_path: /home/runner/work/ascc/ascc/FCS_gx/ diamond_uniprot_database_path: /home/runner/work/ascc/ascc/diamond/UP000000212_1234679_tax.dmnd diamond_nr_database_path: /home/runner/work/ascc/ascc/diamond/UP000000212_1234679_tax.dmnd +vecscreen_database_path: /home/runner/work/ascc/ascc/vecscreen/GCA_900155405.1_PRJEB18760_genomic.fna seqkit: sliding: 6000 window: 100000 diff --git a/assets/test.yaml b/assets/test.yaml index 46ba412..6e5b822 100755 --- a/assets/test.yaml +++ b/assets/test.yaml @@ -13,6 +13,9 @@ ncbi_taxonomy_path: /lustre/scratch123/tol/teams/tola/users/ea10/databases/taxdu ncbi_rankedlineage_path: /lustre/scratch123/tol/teams/tola/users/ea10/databases/taxdump/rankedlineage.dmp busco_lineages_folder: /lustre/scratch123/tol/resources/busco/data/v5/2021-03-14/lineages fcs_gx_database_path: /lustre/scratch124/tol/projects/asg/sub_projects/ncbi_decon/0.4.0/gxdb +vecscreen_database_path: /lustre/scratch123/tol/teams/tola/users/ea10/ascc_databases/vecscreen_database/adaptors_for_screening_euks.fa +diamond_uniprot_database_path: /lustre/scratch123/tol/teams/tola/users/ea10/ascc_databases/uniprot/uniprot_reference_proteomes_with_taxonnames.dmnd +diamond_nr_database_path: /lustre/scratch123/tol/teams/tola/users/ea10/ascc_databases/proteins2/nr.dmnd seqkit: sliding: 6000 window: 100000 diff --git a/bin/sam_to_sorted_indexed_bam.py b/bin/sam_to_sorted_indexed_bam.py new file mode 100644 index 0000000..7be6d85 --- /dev/null +++ b/bin/sam_to_sorted_indexed_bam.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 +""" +Script conversion of .sam file with mapped reads to sorted and indexed .bam file +""" +# MIT License +# +# Copyright (c) 2020-2021 Genome Research Ltd. +# +# Author: Eerik Aunin (ea10@sanger.ac.uk) +# +# This file is a part of the Genome Decomposition Analysis (GDA) pipeline. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import general_purpose_functions as gpf +import argparse + +def main(sam_file, threads, fasta_path): + sam_file_extensionless_path = sam_file.split(".sam")[0] + bam_file = sam_file_extensionless_path + ".bam" + sorted_bam_file = sam_file_extensionless_path + "_sorted.bam" + sam_to_bam_command = None + if fasta_path == "": + sam_to_bam_command = "samtools view -Sb -@ {} {} > {}".format(threads, sam_file, bam_file) + else: + sam_to_bam_command = "samtools view -T {} -Sb -@ {} {} > {}".format(fasta_path, threads, sam_file, bam_file) + gpf.run_system_command(sam_to_bam_command) + sort_bam_command = "samtools sort -@ {} {} -o {}".format(threads, bam_file, sorted_bam_file) + gpf.run_system_command(sort_bam_command) + index_bam_command = "samtools index " + sorted_bam_file + gpf.run_system_command(index_bam_command) + remove_sam_command = "rm " + sam_file + gpf.run_system_command(remove_sam_command) + remove_unsorted_bam_command = "rm " + bam_file + gpf.run_system_command(remove_unsorted_bam_command) + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("sam_file", type=str, help="Path to input SAM file") + parser.add_argument("--threads", type=int, help="Number of threads (default: 1)", default=1) + parser.add_argument("--fasta_path", type=str, help="Optional (needed when the reads have been mapped to a genome larger than 4Gb with minimap2): path the FASTA file to which the reads were mapped", default="") + args = parser.parse_args() + main(args.sam_file, args.threads, args.fasta_path) + diff --git a/bin/samtools_depth_average_coverage.py b/bin/samtools_depth_average_coverage.py new file mode 100644 index 0000000..e7763a6 --- /dev/null +++ b/bin/samtools_depth_average_coverage.py @@ -0,0 +1,27 @@ +#!/usr/bin/env python3 +""" +Script for finding the average coverage of each scaffold in a SAMtools depth output file +""" + +import sys +import general_purpose_functions as gpf + +in_path = sys.argv[1] +in_data = gpf.ll(in_path) +scaffs_dict = dict() + +for line in in_data: + split_line = line.split() + scaff_name = split_line[0] + coverage = int(split_line[2]) + if scaff_name in scaffs_dict: + scaffs_dict[scaff_name]["coverage_sum"] += coverage + scaffs_dict[scaff_name]["scaff_len"] += 1 + else: + scaffs_dict[scaff_name] = dict() + scaffs_dict[scaff_name]["coverage_sum"] = coverage + scaffs_dict[scaff_name]["scaff_len"] = 1 + +for scaff_name in scaffs_dict: + average_coverage = scaffs_dict[scaff_name]["coverage_sum"] / scaffs_dict[scaff_name]["scaff_len"] + print(scaff_name + "," + str(average_coverage)) diff --git a/subworkflows/local/yaml_input.nf b/subworkflows/local/yaml_input.nf index d4b4fe3..dd70d91 100644 --- a/subworkflows/local/yaml_input.nf +++ b/subworkflows/local/yaml_input.nf @@ -35,9 +35,10 @@ workflow YAML_INPUT { ncbi_taxonomy_path: ( data.ncbi_taxonomy_path ) ncbi_rankedlineage_path: ( data.ncbi_rankedlineage_path ) busco_lineages_folder: ( data.busco_lineages_folder ) - seqkit_values : ( data.seqkit ) + seqkit_values: ( data.seqkit ) diamond_uniprot_database_path: ( data.diamond_uniprot_database_path ) diamond_nr_database_path: ( data.diamond_nr_database_path ) + vecscreen_database_path: ( data.vecscreen_database_path ) } .set{ group } @@ -82,6 +83,7 @@ workflow YAML_INPUT { fcs_gx_database_path = group.fcs_gx_database_path diamond_uniprot_database_path = group.diamond_uniprot_database_path diamond_nr_database_path = group.diamond_nr_database_path + vecscreen_database_path = group.vecscreen_database_path seqkit_sliding = seqkit.sliding_value seqkit_window = seqkit.window_value versions = ch_versions.ifEmpty(null) From b994daea8b538506c9c14ca61cc95e296860f3c6 Mon Sep 17 00:00:00 2001 From: yumisims Date: Wed, 11 Oct 2023 21:07:56 +0100 Subject: [PATCH 7/7] python black --- bin/sam_to_sorted_indexed_bam.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/bin/sam_to_sorted_indexed_bam.py b/bin/sam_to_sorted_indexed_bam.py index 7be6d85..a1b5a45 100644 --- a/bin/sam_to_sorted_indexed_bam.py +++ b/bin/sam_to_sorted_indexed_bam.py @@ -3,23 +3,23 @@ Script conversion of .sam file with mapped reads to sorted and indexed .bam file """ # MIT License -# +# # Copyright (c) 2020-2021 Genome Research Ltd. -# +# # Author: Eerik Aunin (ea10@sanger.ac.uk) -# +# # This file is a part of the Genome Decomposition Analysis (GDA) pipeline. -# +# # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: -# +# # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. -# +# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -31,6 +31,7 @@ import general_purpose_functions as gpf import argparse + def main(sam_file, threads, fasta_path): sam_file_extensionless_path = sam_file.split(".sam")[0] bam_file = sam_file_extensionless_path + ".bam" @@ -50,11 +51,16 @@ def main(sam_file, threads, fasta_path): remove_unsorted_bam_command = "rm " + bam_file gpf.run_system_command(remove_unsorted_bam_command) + if __name__ == "__main__": parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("sam_file", type=str, help="Path to input SAM file") parser.add_argument("--threads", type=int, help="Number of threads (default: 1)", default=1) - parser.add_argument("--fasta_path", type=str, help="Optional (needed when the reads have been mapped to a genome larger than 4Gb with minimap2): path the FASTA file to which the reads were mapped", default="") + parser.add_argument( + "--fasta_path", + type=str, + help="Optional (needed when the reads have been mapped to a genome larger than 4Gb with minimap2): path the FASTA file to which the reads were mapped", + default="", + ) args = parser.parse_args() main(args.sam_file, args.threads, args.fasta_path) -