From dba1a156ae64d53bcd7753ba936eee168224056b Mon Sep 17 00:00:00 2001 From: cimendes Date: Tue, 27 Aug 2019 11:50:02 +0100 Subject: [PATCH] add process to compile mash hits into a multifasta for guided assembly --- main.nf | 20 +++++++- params.config | 2 +- templates/mash_hits_compiler.py | 91 +++++++++++++++++++++++++++++++++ 3 files changed, 111 insertions(+), 2 deletions(-) create mode 100644 templates/mash_hits_compiler.py diff --git a/main.nf b/main.nf index e0772ef..3623630 100644 --- a/main.nf +++ b/main.nf @@ -185,13 +185,15 @@ if (params.mode == "magicblast") { IN_reference = Channel.fromPath("${params.reference}") + IN_reference.into{ IN_reference_1 ; IN_reference_2 } + process mash_sketch { tag {amr_reference} storeDir 'mash_sketch/' input: - file(amr_reference) from IN_reference + file(amr_reference) from IN_reference_1 output: file("*.msh") into OUT_mash_sketch @@ -221,6 +223,22 @@ if (params.mode == "magicblast") { """ } + process mash_hits_compiler { + + tag {sample_id} + + input: + set sample_id, file(mash_results) from OUT_mash_screen + file(amr_reference) from IN_reference_2 + + output: + set sample_id, file("*.fasta") into OUT_baits + + script: + template "mash_hits_compiler.py" + } + + } else if (params.mode == "hmmer") { diff --git a/params.config b/params.config index 2ded827..38cf3a6 100644 --- a/params.config +++ b/params.config @@ -14,7 +14,7 @@ params { compress_fastq = true // options: magicblast, mash, hmmer - mode = "hmmer" + mode = "mash" /* Component 'magicBLAST' diff --git a/templates/mash_hits_compiler.py b/templates/mash_hits_compiler.py new file mode 100644 index 0000000..75e1b63 --- /dev/null +++ b/templates/mash_hits_compiler.py @@ -0,0 +1,91 @@ +#!/usr/bin/env python3 + +""" +Purpose +------- + +This module intends to retrieve the AMR fasta sequences from mash. + +Expected input +-------------- + +The following variables are expected whether using NextFlow or the +:py:func:`main` executor. 
+
+- ``sample_id`` : Sample Identification string.
+    - e.g.: ``'SampleA'``
+- ``amr_reference`` : A fasta file path.
+    - e.g.: ``'AMR_CDS.fasta'``
+- ``mash_results`` : mash screen output, tab-separated.
+    - e.g.: ``'.screen'``
+
+Generated output
+----------------
+
+- ``[sample_id]_amr.fasta`` : reference sequences reported as mash hits.
+"""
+
+__version__ = "0.0.1"
+__build__ = "27082019"
+__process__ = "mash_hits_compiler-nf"
+
+import os
+from itertools import groupby
+import csv
+
+# Set only when run as a rendered Nextflow template (.command.sh).
+if __file__.endswith(".command.sh"):
+    SAMPLE_ID = '$sample_id'
+    REFERENCE = '$amr_reference'
+    MASH_RESULTS = '$mash_results'
+    print("Running {} with parameters:".format(
+        os.path.basename(__file__)))
+    print("SAMPLE_ID: {}".format(SAMPLE_ID))
+    print("REFERENCE: {}".format(REFERENCE))
+    print("MASH_RESULTS: {}".format(MASH_RESULTS))
+
+
+def main(sample_id, reference, mash_results):
+    """Write the reference sequences hit by mash to ``[sample_id]_amr.fasta``.
+
+    Parameters
+    ----------
+    sample_id : str
+        Sample Identification string, used as output file prefix.
+    reference : str
+        Path to the fasta file holding the reference sequences.
+    mash_results : str
+        Path to the tab-separated mash results; column 5 is the hit id.
+    """
+    print("Starting script")
+
+    amr_hits = set()  # set: constant-time membership test per fasta header
+    with open(mash_results, "r") as mash:
+        for line in mash:
+            # NOTE(review): doubled backslash, presumably Nextflow escaping.
+            fields = line.split('\\t')
+            if len(fields) > 4:  # skip blank or truncated lines
+                amr_hits.add(fields[4].strip())
+    print(len(amr_hits))
+
+    out_path = sample_id + '_amr.fasta'
+    with open(reference, "r") as fasta, open(out_path, 'w') as results:
+        keep = False  # True while inside a record whose id is a hit
+        for line in fasta:
+            if line.startswith(">"):
+                if keep:
+                    results.write("\\n")  # terminate the previous sequence
+                header_str = line[1:].strip()
+                seq_id = header_str.split(' ')[0]
+                print(seq_id)
+                keep = seq_id in amr_hits
+                if keep:
+                    results.write(">" + header_str + "\\n")
+            elif keep:
+                results.write(line.strip())  # sequence kept on one line
+        if keep:
+            results.write("\\n")
+
+
+if __name__ == '__main__':
+    main(SAMPLE_ID, REFERENCE, MASH_RESULTS)