Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add vecscreen test set on ci, checkin coverage related scripts, updat… #32

Merged
merged 2 commits into from
Oct 12, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,11 @@ jobs:
mkdir diamond
wget -c https://tolit.cog.sanger.ac.uk/test-data/resources/diamond/UP000000212_1234679_tax.dmnd -O diamond/UP000000212_1234679_tax.dmnd

- name: Download the vecscreen test data
run: |
mkdir vecscreen
wget -c https://tolit.cog.sanger.ac.uk/test-data/resources/vecscreen/GCA_900155405.1_PRJEB18760_genomic.fna -O vecscreen/GCA_900155405.1_PRJEB18760_genomic.fna

- name: Run pipeline with test data
# TODO nf-core: You can customise CI pipeline run tests as required
# For example: adding multiple test runs with different parameters
Expand Down
1 change: 1 addition & 0 deletions assets/github_testing/test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ busco_lineages_folder: /home/runner/work/ascc/ascc/busco_database/lineages
fcs_gx_database_path: /home/runner/work/ascc/ascc/FCS_gx/
diamond_uniprot_database_path: /home/runner/work/ascc/ascc/diamond/UP000000212_1234679_tax.dmnd
diamond_nr_database_path: /home/runner/work/ascc/ascc/diamond/UP000000212_1234679_tax.dmnd
vecscreen_database_path: /home/runner/work/ascc/ascc/vecscreen/GCA_900155405.1_PRJEB18760_genomic.fna
seqkit:
sliding: 6000
window: 100000
3 changes: 3 additions & 0 deletions assets/test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ ncbi_taxonomy_path: /lustre/scratch123/tol/teams/tola/users/ea10/databases/taxdu
ncbi_rankedlineage_path: /lustre/scratch123/tol/teams/tola/users/ea10/databases/taxdump/rankedlineage.dmp
busco_lineages_folder: /lustre/scratch123/tol/resources/busco/data/v5/2021-03-14/lineages
fcs_gx_database_path: /lustre/scratch124/tol/projects/asg/sub_projects/ncbi_decon/0.4.0/gxdb
vecscreen_database_path: /lustre/scratch123/tol/teams/tola/users/ea10/ascc_databases/vecscreen_database/adaptors_for_screening_euks.fa
diamond_uniprot_database_path: /lustre/scratch123/tol/teams/tola/users/ea10/ascc_databases/uniprot/uniprot_reference_proteomes_with_taxonnames.dmnd
diamond_nr_database_path: /lustre/scratch123/tol/teams/tola/users/ea10/ascc_databases/proteins2/nr.dmnd
seqkit:
sliding: 6000
window: 100000
66 changes: 66 additions & 0 deletions bin/sam_to_sorted_indexed_bam.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
#!/usr/bin/env python3
"""
Script conversion of .sam file with mapped reads to sorted and indexed .bam file
"""
# MIT License
#
# Copyright (c) 2020-2021 Genome Research Ltd.
#
# Author: Eerik Aunin ([email protected])
#
# This file is a part of the Genome Decomposition Analysis (GDA) pipeline.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import general_purpose_functions as gpf
import argparse


def main(sam_file, threads, fasta_path):
sam_file_extensionless_path = sam_file.split(".sam")[0]
bam_file = sam_file_extensionless_path + ".bam"
sorted_bam_file = sam_file_extensionless_path + "_sorted.bam"
sam_to_bam_command = None
if fasta_path == "":
sam_to_bam_command = "samtools view -Sb -@ {} {} > {}".format(threads, sam_file, bam_file)
else:
sam_to_bam_command = "samtools view -T {} -Sb -@ {} {} > {}".format(fasta_path, threads, sam_file, bam_file)
gpf.run_system_command(sam_to_bam_command)
sort_bam_command = "samtools sort -@ {} {} -o {}".format(threads, bam_file, sorted_bam_file)
gpf.run_system_command(sort_bam_command)
index_bam_command = "samtools index " + sorted_bam_file
gpf.run_system_command(index_bam_command)
remove_sam_command = "rm " + sam_file
gpf.run_system_command(remove_sam_command)
remove_unsorted_bam_command = "rm " + bam_file
gpf.run_system_command(remove_unsorted_bam_command)


if __name__ == "__main__":
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("sam_file", type=str, help="Path to input SAM file")
parser.add_argument("--threads", type=int, help="Number of threads (default: 1)", default=1)
parser.add_argument(
"--fasta_path",
type=str,
help="Optional (needed when the reads have been mapped to a genome larger than 4Gb with minimap2): path the FASTA file to which the reads were mapped",
default="",
)
args = parser.parse_args()
main(args.sam_file, args.threads, args.fasta_path)
27 changes: 27 additions & 0 deletions bin/samtools_depth_average_coverage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#!/usr/bin/env python3
"""
Script for finding the average coverage of each scaffold in a SAMtools depth output file
"""

import sys
import general_purpose_functions as gpf

in_path = sys.argv[1]
in_data = gpf.ll(in_path)
scaffs_dict = dict()

for line in in_data:
split_line = line.split()
scaff_name = split_line[0]
coverage = int(split_line[2])
if scaff_name in scaffs_dict:
scaffs_dict[scaff_name]["coverage_sum"] += coverage
scaffs_dict[scaff_name]["scaff_len"] += 1
else:
scaffs_dict[scaff_name] = dict()
scaffs_dict[scaff_name]["coverage_sum"] = coverage
scaffs_dict[scaff_name]["scaff_len"] = 1

for scaff_name in scaffs_dict:
average_coverage = scaffs_dict[scaff_name]["coverage_sum"] / scaffs_dict[scaff_name]["scaff_len"]
print(scaff_name + "," + str(average_coverage))
4 changes: 3 additions & 1 deletion subworkflows/local/yaml_input.nf
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,10 @@ workflow YAML_INPUT {
ncbi_taxonomy_path: ( data.ncbi_taxonomy_path )
ncbi_rankedlineage_path: ( data.ncbi_rankedlineage_path )
busco_lineages_folder: ( data.busco_lineages_folder )
seqkit_values : ( data.seqkit )
seqkit_values: ( data.seqkit )
diamond_uniprot_database_path: ( data.diamond_uniprot_database_path )
diamond_nr_database_path: ( data.diamond_nr_database_path )
vecscreen_database_path: ( data.vecscreen_database_path )

}
.set{ group }
Expand Down Expand Up @@ -82,6 +83,7 @@ workflow YAML_INPUT {
fcs_gx_database_path = group.fcs_gx_database_path
diamond_uniprot_database_path = group.diamond_uniprot_database_path
diamond_nr_database_path = group.diamond_nr_database_path
vecscreen_database_path = group.vecscreen_database_path
seqkit_sliding = seqkit.sliding_value
seqkit_window = seqkit.window_value
versions = ch_versions.ifEmpty(null)
Expand Down