From 4b14a9bd05331bceaec7558aa26d868d70f25a5b Mon Sep 17 00:00:00 2001
From: yumisims <yy5@sanger.ac.uk>
Date: Wed, 11 Oct 2023 21:03:19 +0100
Subject: [PATCH 1/2] add vecscreen test set on ci, checkin coverage related
 scripts, updated yaml_input

---
 .github/workflows/ci.yml               |  5 +++
 assets/github_testing/test.yaml        |  1 +
 assets/test.yaml                       |  3 ++
 bin/sam_to_sorted_indexed_bam.py       | 60 ++++++++++++++++++++++++++
 bin/samtools_depth_average_coverage.py | 27 ++++++++++++
 subworkflows/local/yaml_input.nf       |  4 +-
 6 files changed, 99 insertions(+), 1 deletion(-)
 create mode 100644 bin/sam_to_sorted_indexed_bam.py
 create mode 100644 bin/samtools_depth_average_coverage.py

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index e1dbb4b..63a80f5 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -74,6 +74,11 @@ jobs:
           mkdir diamond
           wget -c https://tolit.cog.sanger.ac.uk/test-data/resources/diamond/UP000000212_1234679_tax.dmnd -O diamond/UP000000212_1234679_tax.dmnd
 
+      - name: Download the vecscreen test data
+        run: |
+          mkdir vecscreen
+          wget -c https://tolit.cog.sanger.ac.uk/test-data/resources/vecscreen/GCA_900155405.1_PRJEB18760_genomic.fna -O vecscreen/GCA_900155405.1_PRJEB18760_genomic.fna
+
       - name: Run pipeline with test data
         # TODO nf-core: You can customise CI pipeline run tests as required
         # For example: adding multiple test runs with different parameters
diff --git a/assets/github_testing/test.yaml b/assets/github_testing/test.yaml
index e50a84b..022e9f6 100755
--- a/assets/github_testing/test.yaml
+++ b/assets/github_testing/test.yaml
@@ -16,6 +16,7 @@ busco_lineages_folder: /home/runner/work/ascc/ascc/busco_database/lineages
 fcs_gx_database_path: /home/runner/work/ascc/ascc/FCS_gx/
 diamond_uniprot_database_path: /home/runner/work/ascc/ascc/diamond/UP000000212_1234679_tax.dmnd
 diamond_nr_database_path: /home/runner/work/ascc/ascc/diamond/UP000000212_1234679_tax.dmnd
+vecscreen_database_path: /home/runner/work/ascc/ascc/vecscreen/GCA_900155405.1_PRJEB18760_genomic.fna
 seqkit:
   sliding: 6000
   window: 100000
diff --git a/assets/test.yaml b/assets/test.yaml
index 46ba412..6e5b822 100755
--- a/assets/test.yaml
+++ b/assets/test.yaml
@@ -13,6 +13,9 @@ ncbi_taxonomy_path: /lustre/scratch123/tol/teams/tola/users/ea10/databases/taxdu
 ncbi_rankedlineage_path: /lustre/scratch123/tol/teams/tola/users/ea10/databases/taxdump/rankedlineage.dmp
 busco_lineages_folder: /lustre/scratch123/tol/resources/busco/data/v5/2021-03-14/lineages
 fcs_gx_database_path: /lustre/scratch124/tol/projects/asg/sub_projects/ncbi_decon/0.4.0/gxdb
+vecscreen_database_path: /lustre/scratch123/tol/teams/tola/users/ea10/ascc_databases/vecscreen_database/adaptors_for_screening_euks.fa
+diamond_uniprot_database_path: /lustre/scratch123/tol/teams/tola/users/ea10/ascc_databases/uniprot/uniprot_reference_proteomes_with_taxonnames.dmnd
+diamond_nr_database_path: /lustre/scratch123/tol/teams/tola/users/ea10/ascc_databases/proteins2/nr.dmnd
 seqkit:
   sliding: 6000
   window: 100000
diff --git a/bin/sam_to_sorted_indexed_bam.py b/bin/sam_to_sorted_indexed_bam.py
new file mode 100644
index 0000000..7be6d85
--- /dev/null
+++ b/bin/sam_to_sorted_indexed_bam.py
@@ -0,0 +1,60 @@
+#!/usr/bin/env python3
+"""
+Script conversion of .sam file with mapped reads to sorted and indexed .bam file
+"""
+# MIT License
+# 
+# Copyright (c) 2020-2021 Genome Research Ltd.
+# 
+# Author: Eerik Aunin (ea10@sanger.ac.uk)
+# 
+# This file is a part of the Genome Decomposition Analysis (GDA) pipeline.
+# 
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+# 
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+# 
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+import general_purpose_functions as gpf
+import argparse
+
+def main(sam_file, threads, fasta_path):
+    sam_file_extensionless_path = sam_file.split(".sam")[0]
+    bam_file = sam_file_extensionless_path + ".bam"
+    sorted_bam_file = sam_file_extensionless_path + "_sorted.bam"
+    sam_to_bam_command = None
+    if fasta_path == "":
+        sam_to_bam_command = "samtools view -Sb -@ {} {} > {}".format(threads, sam_file, bam_file)
+    else:
+        sam_to_bam_command = "samtools view -T {} -Sb -@ {} {} > {}".format(fasta_path, threads, sam_file, bam_file)
+    gpf.run_system_command(sam_to_bam_command)
+    sort_bam_command = "samtools sort -@ {} {} -o {}".format(threads, bam_file, sorted_bam_file)
+    gpf.run_system_command(sort_bam_command)
+    index_bam_command = "samtools index " + sorted_bam_file
+    gpf.run_system_command(index_bam_command)
+    remove_sam_command = "rm " + sam_file
+    gpf.run_system_command(remove_sam_command)
+    remove_unsorted_bam_command = "rm " + bam_file
+    gpf.run_system_command(remove_unsorted_bam_command)
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("sam_file", type=str, help="Path to input SAM file")
+    parser.add_argument("--threads", type=int, help="Number of threads (default: 1)", default=1)
+    parser.add_argument("--fasta_path", type=str, help="Optional (needed when the reads have been mapped to a genome larger than 4Gb with minimap2): path the FASTA file to which the reads were mapped", default="")
+    args = parser.parse_args()
+    main(args.sam_file, args.threads, args.fasta_path)
+
diff --git a/bin/samtools_depth_average_coverage.py b/bin/samtools_depth_average_coverage.py
new file mode 100644
index 0000000..e7763a6
--- /dev/null
+++ b/bin/samtools_depth_average_coverage.py
@@ -0,0 +1,27 @@
+#!/usr/bin/env python3
+"""
+Script for finding the average coverage of each scaffold in a SAMtools depth output file
+"""
+
+import sys
+import general_purpose_functions as gpf
+
+in_path = sys.argv[1]
+in_data = gpf.ll(in_path)
+scaffs_dict = dict()
+
+for line in in_data:
+    split_line = line.split()
+    scaff_name = split_line[0]
+    coverage = int(split_line[2])
+    if scaff_name in scaffs_dict:
+        scaffs_dict[scaff_name]["coverage_sum"] += coverage
+        scaffs_dict[scaff_name]["scaff_len"] += 1
+    else:
+        scaffs_dict[scaff_name] = dict()
+        scaffs_dict[scaff_name]["coverage_sum"] = coverage
+        scaffs_dict[scaff_name]["scaff_len"] = 1
+
+for scaff_name in scaffs_dict:
+    average_coverage = scaffs_dict[scaff_name]["coverage_sum"] / scaffs_dict[scaff_name]["scaff_len"]
+    print(scaff_name + "," + str(average_coverage))
diff --git a/subworkflows/local/yaml_input.nf b/subworkflows/local/yaml_input.nf
index d4b4fe3..dd70d91 100644
--- a/subworkflows/local/yaml_input.nf
+++ b/subworkflows/local/yaml_input.nf
@@ -35,9 +35,10 @@ workflow YAML_INPUT {
                 ncbi_taxonomy_path:                             ( data.ncbi_taxonomy_path               )
                 ncbi_rankedlineage_path:                        ( data.ncbi_rankedlineage_path          )
                 busco_lineages_folder:                          ( data.busco_lineages_folder            )
-                seqkit_values                       :           ( data.seqkit                           )
+                seqkit_values:                                  ( data.seqkit                           )
                 diamond_uniprot_database_path:                  ( data.diamond_uniprot_database_path    )
                 diamond_nr_database_path:                       ( data.diamond_nr_database_path         )
+                vecscreen_database_path:                        ( data.vecscreen_database_path          )
 
         }
         .set{ group }
@@ -82,6 +83,7 @@ workflow YAML_INPUT {
     fcs_gx_database_path             = group.fcs_gx_database_path
     diamond_uniprot_database_path    = group.diamond_uniprot_database_path
     diamond_nr_database_path         = group.diamond_nr_database_path
+    vecscreen_database_path          = group.vecscreen_database_path
     seqkit_sliding                   = seqkit.sliding_value
     seqkit_window                    = seqkit.window_value
     versions                         = ch_versions.ifEmpty(null)

From b994daea8b538506c9c14ca61cc95e296860f3c6 Mon Sep 17 00:00:00 2001
From: yumisims <yy5@sanger.ac.uk>
Date: Wed, 11 Oct 2023 21:07:56 +0100
Subject: [PATCH 2/2] python black

---
 bin/sam_to_sorted_indexed_bam.py | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/bin/sam_to_sorted_indexed_bam.py b/bin/sam_to_sorted_indexed_bam.py
index 7be6d85..a1b5a45 100644
--- a/bin/sam_to_sorted_indexed_bam.py
+++ b/bin/sam_to_sorted_indexed_bam.py
@@ -3,23 +3,23 @@
 Script conversion of .sam file with mapped reads to sorted and indexed .bam file
 """
 # MIT License
-# 
+#
 # Copyright (c) 2020-2021 Genome Research Ltd.
-# 
+#
 # Author: Eerik Aunin (ea10@sanger.ac.uk)
-# 
+#
 # This file is a part of the Genome Decomposition Analysis (GDA) pipeline.
-# 
+#
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
 # in the Software without restriction, including without limitation the rights
 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 # copies of the Software, and to permit persons to whom the Software is
 # furnished to do so, subject to the following conditions:
-# 
+#
 # The above copyright notice and this permission notice shall be included in all
 # copies or substantial portions of the Software.
-# 
+#
 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -31,6 +31,7 @@
 import general_purpose_functions as gpf
 import argparse
 
+
 def main(sam_file, threads, fasta_path):
     sam_file_extensionless_path = sam_file.split(".sam")[0]
     bam_file = sam_file_extensionless_path + ".bam"
@@ -50,11 +51,16 @@ def main(sam_file, threads, fasta_path):
     remove_unsorted_bam_command = "rm " + bam_file
     gpf.run_system_command(remove_unsorted_bam_command)
 
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description=__doc__)
     parser.add_argument("sam_file", type=str, help="Path to input SAM file")
     parser.add_argument("--threads", type=int, help="Number of threads (default: 1)", default=1)
-    parser.add_argument("--fasta_path", type=str, help="Optional (needed when the reads have been mapped to a genome larger than 4Gb with minimap2): path the FASTA file to which the reads were mapped", default="")
+    parser.add_argument(
+        "--fasta_path",
+        type=str,
+        help="Optional (needed when the reads have been mapped to a genome larger than 4Gb with minimap2): path the FASTA file to which the reads were mapped",
+        default="",
+    )
     args = parser.parse_args()
     main(args.sam_file, args.threads, args.fasta_path)
-