diff --git a/.github/workflows/ecoli.yml b/.github/workflows/ecoli.yml
index b610388..f66c2f0 100644
--- a/.github/workflows/ecoli.yml
+++ b/.github/workflows/ecoli.yml
@@ -38,5 +38,12 @@ jobs:
           nextflow run . -profile docker --maxcpus 2 --medcpus 2
           cat grandeur/grandeur_summary.tsv
-          cat grandeur/shigatyper/shigatyper_results.txt
-          cat grandeur/serotypefinder/serotypefinder_results.txt
\ No newline at end of file
+
+      - name: Check E. coli file
+        run: |
+          for file in grandeur/shigatyper/shigatyper_results.txt grandeur/serotypefinder/serotypefinder_results.txt
+          do
+            head $file
+            wc -l $file
+          done
+
\ No newline at end of file
diff --git a/.github/workflows/just_msa.yml b/.github/workflows/just_msa.yml
index c0cb407..6910866 100644
--- a/.github/workflows/just_msa.yml
+++ b/.github/workflows/just_msa.yml
@@ -24,13 +24,21 @@ jobs:
         run: |
           docker --version
-          wget https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/904/864/595/GCA_904864595.1_INF333/GCA_904864595.1_INF333_genomic.fna.gz && gzip -d GCA_904864595.1_INF333_genomic.fna.gz
-          wget https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/013/783/245/GCA_013783245.1_ASM1378324v1/GCA_013783245.1_ASM1378324v1_genomic.fna.gz && gzip -d GCA_013783245.1_ASM1378324v1_genomic.fna.gz
-          wget https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/026/626/185/GCA_026626185.1_ASM2662618v1/GCA_026626185.1_ASM2662618v1_genomic.fna.gz && gzip -d GCA_026626185.1_ASM2662618v1_genomic.fna.gz
-          wget https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/020/808/985/GCA_020808985.1_ASM2080898v1/GCA_020808985.1_ASM2080898v1_genomic.fna.gz && gzip -d GCA_020808985.1_ASM2080898v1_genomic.fna.gz
-          wget https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/904/863/225/GCA_904863225.1_KSB1_6J/GCA_904863225.1_KSB1_6J_genomic.fna.gz && gzip -d GCA_904863225.1_KSB1_6J_genomic.fna.gz
+          wget -q https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/904/864/595/GCA_904864595.1_INF333/GCA_904864595.1_INF333_genomic.fna.gz && gzip -d GCA_904864595.1_INF333_genomic.fna.gz
+          wget -q https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/013/783/245/GCA_013783245.1_ASM1378324v1/GCA_013783245.1_ASM1378324v1_genomic.fna.gz && gzip -d GCA_013783245.1_ASM1378324v1_genomic.fna.gz
+          wget -q https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/026/626/185/GCA_026626185.1_ASM2662618v1/GCA_026626185.1_ASM2662618v1_genomic.fna.gz && gzip -d GCA_026626185.1_ASM2662618v1_genomic.fna.gz
+          wget -q https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/020/808/985/GCA_020808985.1_ASM2080898v1/GCA_020808985.1_ASM2080898v1_genomic.fna.gz && gzip -d GCA_020808985.1_ASM2080898v1_genomic.fna.gz
+          wget -q https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/904/863/225/GCA_904863225.1_KSB1_6J/GCA_904863225.1_KSB1_6J_genomic.fna.gz && gzip -d GCA_904863225.1_KSB1_6J_genomic.fna.gz
           mkdir fastas
           mv *fna fastas/.
           nextflow run . -profile docker,just_msa --maxcpus 2 --medcpus 2
+
+      - name: Check MSA files
+        run: |
+          for file in grandeur/roary/summary_statistics.txt grandeur/iqtree2/iqtree.treefile.nwk grandeur/snp-dists/snp_matrix_with_qc.txt
+          do
+            head $file
+            wc -l $file
+          done
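Note: the new "Check …" steps only preview each expected output with `head` and `wc -l`, so a missing file surfaces as a shell error rather than an explicit assertion. A minimal sketch of a stricter gate, written as a hypothetical `check_outputs.py` helper that is not part of this PR, which exits non-zero when an expected file is missing or empty:

```python
#!/usr/bin/env python3
# check_outputs.py (hypothetical helper, not in this PR):
# fail the CI job when an expected pipeline output is missing or empty.
import itertools
import sys
from pathlib import Path

def check(paths):
    ok = True
    for path in map(Path, paths):
        if not path.is_file() or path.stat().st_size == 0:
            print(f"MISSING OR EMPTY: {path}")
            ok = False
            continue
        with path.open() as fh:
            # mimic `head`: show the first ten lines
            for line in itertools.islice(fh, 10):
                print(line.rstrip("\n"))
        with path.open() as fh:
            # mimic `wc -l`: report the line count
            print(f"{sum(1 for _ in fh)} {path}")
    return ok

if __name__ == "__main__":
    sys.exit(0 if check(sys.argv[1:]) else 1)
```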
diff --git a/.github/workflows/legionella.yml b/.github/workflows/legionella.yml
index 3a27757..5d28836 100644
--- a/.github/workflows/legionella.yml
+++ b/.github/workflows/legionella.yml
@@ -40,4 +40,11 @@ jobs:
           cat grandeur/grandeur_summary.tsv
-          cat grandeur/legsta/legsta_summary.csv
\ No newline at end of file
+
+      - name: Check Legionella file
+        run: |
+          for file in grandeur/legsta/legsta_summary.csv
+          do
+            head $file
+            wc -l $file
+          done
+
\ No newline at end of file
diff --git a/.github/workflows/run_workflow.yml b/.github/workflows/run_workflow.yml
index ea607bb..6eeea18 100644
--- a/.github/workflows/run_workflow.yml
+++ b/.github/workflows/run_workflow.yml
@@ -31,4 +31,11 @@ jobs:
           mv *fastq.gz reads/.
           nextflow run . -profile docker --maxcpus 2 --medcpus 2
           cat grandeur/grandeur_summary.tsv
-
+
+      - name: Check summary files
+        run: |
+          for file in grandeur/mlst/mlst_summary.tsv
+          do
+            head $file
+            wc -l $file
+          done
diff --git a/.github/workflows/salmonella.yml b/.github/workflows/salmonella.yml
index 2b93058..73087ed 100644
--- a/.github/workflows/salmonella.yml
+++ b/.github/workflows/salmonella.yml
@@ -37,5 +37,14 @@ jobs:
           done
           nextflow run . -profile docker --maxcpus 2 --medcpus 2
+
           cat grandeur/grandeur_summary.tsv
-          cat grandeur/seqsero2/seqsero2_results.txt
\ No newline at end of file
+
+      - name: Check Salmonella file
+        run: |
+          for file in grandeur/seqsero2/seqsero2_results.txt
+          do
+            head $file
+            wc -l $file
+          done
+
\ No newline at end of file
diff --git a/.github/workflows/strepA.yml b/.github/workflows/strepA.yml
index 3ed8a18..5a4e1ed 100644
--- a/.github/workflows/strepA.yml
+++ b/.github/workflows/strepA.yml
@@ -38,4 +38,11 @@ jobs:
           nextflow run . -profile docker --maxcpus 2 --medcpus 2
           cat grandeur/grandeur_summary.tsv
-          cat grandeur/emmtyper/emmtyper_summary.tsv
+
+      - name: Check Strep A file
+        run: |
+          for file in grandeur/emmtyper/emmtyper_summary.tsv
+          do
+            head $file
+            wc -l $file
+          done
diff --git a/.github/workflows/strep_pneumo.yml b/.github/workflows/strep_pneumo.yml
index da2f3bf..4d4ff70 100644
--- a/.github/workflows/strep_pneumo.yml
+++ b/.github/workflows/strep_pneumo.yml
@@ -38,4 +38,12 @@ jobs:
           nextflow run . -profile docker --maxcpus 2 --medcpus 2
           cat grandeur/grandeur_summary.tsv
-          cat grandeur/pbptyper/pbptyper_summary.tsv
\ No newline at end of file
+
+      - name: Check Strep pneumo file
+        run: |
+          for file in grandeur/pbptyper/pbptyper_summary.tsv
+          do
+            head $file
+            wc -l $file
+          done
+
\ No newline at end of file
diff --git a/.github/workflows/vibrio.yml b/.github/workflows/vibrio.yml
index 7194679..a160a90 100644
--- a/.github/workflows/vibrio.yml
+++ b/.github/workflows/vibrio.yml
@@ -38,4 +38,6 @@ jobs:
           nextflow run . -profile docker --maxcpus 2 --medcpus 2
           cat grandeur/grandeur_summary.tsv
-          grep -i vibrio grandeur/fastani/fastani_summary.csv
\ No newline at end of file
+
+      - name: Check Vibrio species
+        run: grep -i vibrio grandeur/fastani/fastani_summary.csv
diff --git a/bin/datasets_download.py b/bin/datasets_download.py
new file mode 100644
index 0000000..48dd79b
--- /dev/null
+++ b/bin/datasets_download.py
@@ -0,0 +1,48 @@
+#!/usr/bin/env python3
+
+'''
+Author: Erin Young
+
+Description:
+
+This script gets genome accessions for a taxon from NCBI datasets.
+
+EXAMPLE:
+python3 datasets_download.py taxon hits
+'''
+
+import os
+import subprocess
+import sys
+
+taxon = sys.argv[1]
+genus, species = taxon.replace('[', '').replace(']', '').split('_')
+print("Looking for accessions for " + genus + " " + species)
+
+# the Nextflow process normally creates this directory; created here so the EXAMPLE above also works standalone
+os.makedirs('datasets', exist_ok=True)
+outfile = open('datasets/' + genus + '_' + species + '_genomes.csv', 'w')
+
+try:
+    hits = sys.argv[2]
+except IndexError:
+    hits = '5'
+
+# putting in the header
+outfile.write('accession,assminfo-refseq-category,assminfo-level,organism-name,assmstats-total-ungapped-len\n')
+
+# Getting representative genomes
+# (list-form subprocess args are passed verbatim, so the taxon must not be wrapped in literal quotes)
+rep = subprocess.Popen(['datasets', 'summary', 'genome', 'taxon', genus + ' ' + species, '--reference', '--limit', hits, '--as-json-lines'], stdout=subprocess.PIPE)
+dft = subprocess.check_output(['dataformat', 'tsv', 'genome', '--fields', 'accession,assminfo-refseq-category,assminfo-level,organism-name,assmstats-total-ungapped-len'], stdin=rep.stdout, text=True)
+rep.wait()
+for line in dft.split('\n'):
+    # skip the repeated header line and anything over 15 Mb (too large for a bacterial genome)
+    if 'Ungapped Length' not in line and line:
+        if int(line.split('\t')[4]) < 15000000:
+            outfile.write(line.replace('\t', ',') + '\n')
+
+# Getting additional genomes
+oth = subprocess.Popen(['datasets', 'summary', 'genome', 'taxon', genus + ' ' + species, '--limit', hits, '--as-json-lines'], stdout=subprocess.PIPE)
+df2 = subprocess.check_output(['dataformat', 'tsv', 'genome', '--fields', 'accession,assminfo-refseq-category,assminfo-level,organism-name,assmstats-total-ungapped-len'], stdin=oth.stdout, text=True)
+oth.wait()
+for line in df2.split('\n'):
+    if 'Ungapped Length' not in line and line:
+        if int(line.split('\t')[4]) < 15000000:
+            outfile.write(line.replace('\t', ',') + '\n')
+
+outfile.close()
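Note: the script above chains `datasets summary` into `dataformat tsv` with a `Popen`/`check_output` pipe. The same two-stage pipeline can be written without managing `Popen` objects by capturing stdout and feeding it to the second command as input. A sketch, not what the PR ships; the taxon and limit are illustrative, and the NCBI `datasets`/`dataformat` CLIs are assumed to be on PATH:

```python
# Sketch: the datasets -> dataformat pipe via subprocess.run with captured output.
import subprocess

fields = 'accession,assminfo-refseq-category,assminfo-level,organism-name,assmstats-total-ungapped-len'
summary = subprocess.run(
    ['datasets', 'summary', 'genome', 'taxon', 'Listeria monocytogenes',
     '--reference', '--limit', '5', '--as-json-lines'],
    capture_output=True, text=True, check=True)
table = subprocess.run(
    ['dataformat', 'tsv', 'genome', '--fields', fields],
    input=summary.stdout, capture_output=True, text=True, check=True)
print(table.stdout)
```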
Exiting.") + exit() + +coms = 0 +tabs = 0 +with open(sys.argv[1]) as file: + first_line = file.readline() + coms = first_line.count('\t') + tabs = first_line.count('\t') + +if tabs > coms: + delim = '\t' + print("Predicting tab delimited") +else: + delim = ',' + print("Predicting comma delimited") + +with open(sys.argv[1], 'r') as file: + lines = file.readlines() + for line in lines: + print(line) + print(line.split(delim)) + +outfile = open(sys.argv[2], "w") + +final_delim = ',' +header = 'shouldntexist' + +# TODO: turn this into a dict +if sys.argv[3] == 'mlst': + final_delim = '\t' + header = 'PubMLST' + outfile.write('sample\tfilename\tmatching PubMLST scheme\tST\tID1\tID2\tID3\tID4\tID5\tID6\tID7\tID8\tID9\tID10\tID11\tID12\tID13\tID14\tID15\n') +elif sys.argv[3] == 'shigatyper': + final_delim = '\t' + header = 'Number' +elif sys.argv[3] == 'kleborate': + final_delim = '\t' + header = 'largest_contig' +elif sys.argv[3] == 'plasmidfinder' : + final_delim = '\t' + header = 'Accession number' +elif sys.argv[3] == 'emmtyper': + outfile.write('sample\tIsolate name\tNumber of BLAST hits\tNumber of clusters\tPredicted emm-type\tPosition(s)\tPossible emm-like alleles\temm-like position(s)\tEMM cluster\n') + final_delim = '\t' + header = 'Number of BLAST hits' +elif sys.argv[3] == 'serotypefinder': + final_delim = '\t' + header = 'HSP length' + +print("Using final delim " + final_delim + " with sample " + spl + " for " + sys.argv[3]) + +with open(sys.argv[1]) as file: + lines = file.readlines() + for line in lines: + if header in line: + replace = line.replace(delim, final_delim) + outfile.write('sample' + final_delim + replace) + else: + replace = line.replace(delim, final_delim) + outfile.write(spl + final_delim + replace) diff --git a/grandeur.nf b/grandeur.nf index 516371d..5bd9b5a 100644 --- a/grandeur.nf +++ b/grandeur.nf @@ -146,8 +146,10 @@ include { test } from "./subworkflows/test" // ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### // Creating the summary files -summary_script = Channel.fromPath(workflow.projectDir + "/bin/summary.py", type: "file") -snpmtrx_script = Channel.fromPath(workflow.projectDir + "/bin/HeatCluster.py", type: "file") +dataset_script = Channel.fromPath(workflow.projectDir + "/bin/datasets_download.py", type: "file") +snpmtrx_script = Channel.fromPath(workflow.projectDir + "/bin/HeatCluster.py", type: "file") +summary_script = Channel.fromPath(workflow.projectDir + "/bin/summary.py", type: "file") +summfle_script = Channel.fromPath(workflow.projectDir + "/bin/summary_file.py", type: "file") // ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### @@ -318,7 +320,8 @@ workflow { ch_for_summary.collect(), ch_contigs, ch_fastani_genomes, - ch_genome_ref) + ch_genome_ref, + dataset_script) ch_for_summary = ch_for_summary.mix(average_nucleotide_identity.out.for_summary) ch_for_flag = ch_for_flag.mix(average_nucleotide_identity.out.for_flag) @@ -341,7 +344,8 @@ workflow { ch_raw_reads, ch_contigs, ch_for_flag, - ch_size) + ch_size, + summfle_script) ch_for_summary = ch_for_summary.mix(information.out.for_summary) ch_for_multiqc = ch_for_multiqc.mix(information.out.for_multiqc) diff --git a/modules/blobtools.nf b/modules/blobtools.nf index c3c836f..5b1d5fd 100644 --- a/modules/blobtools.nf +++ b/modules/blobtools.nf @@ -125,4 +125,4 @@ process blobtools_plot { grep -vw all blobtools/!{sample}_summary.txt > blobtools/!{sample}_blobtools.txt ''' -} +} \ No newline at end of file diff --git a/modules/datasets.nf b/modules/datasets.nf index 
diff --git a/grandeur.nf b/grandeur.nf
index 516371d..5bd9b5a 100644
--- a/grandeur.nf
+++ b/grandeur.nf
@@ -146,8 +146,10 @@ include { test } from "./subworkflows/test"
 // ##### ##### ##### ##### ##### ##### ##### ##### ##### #####
 
 // Creating the summary files
-summary_script = Channel.fromPath(workflow.projectDir + "/bin/summary.py", type: "file")
-snpmtrx_script = Channel.fromPath(workflow.projectDir + "/bin/HeatCluster.py", type: "file")
+dataset_script = Channel.fromPath(workflow.projectDir + "/bin/datasets_download.py", type: "file")
+snpmtrx_script = Channel.fromPath(workflow.projectDir + "/bin/HeatCluster.py", type: "file")
+summary_script = Channel.fromPath(workflow.projectDir + "/bin/summary.py", type: "file")
+summfle_script = Channel.fromPath(workflow.projectDir + "/bin/summary_file.py", type: "file")
 
 // ##### ##### ##### ##### ##### ##### ##### ##### ##### #####
@@ -318,7 +320,8 @@ workflow {
                                       ch_for_summary.collect(),
                                       ch_contigs,
                                       ch_fastani_genomes,
-                                      ch_genome_ref)
+                                      ch_genome_ref,
+                                      dataset_script)
 
     ch_for_summary = ch_for_summary.mix(average_nucleotide_identity.out.for_summary)
     ch_for_flag    = ch_for_flag.mix(average_nucleotide_identity.out.for_flag)
@@ -341,7 +344,8 @@ workflow {
                 ch_raw_reads,
                 ch_contigs,
                 ch_for_flag,
-                ch_size)
+                ch_size,
+                summfle_script)
 
     ch_for_summary = ch_for_summary.mix(information.out.for_summary)
     ch_for_multiqc = ch_for_multiqc.mix(information.out.for_multiqc)
diff --git a/modules/blobtools.nf b/modules/blobtools.nf
index c3c836f..5b1d5fd 100644
--- a/modules/blobtools.nf
+++ b/modules/blobtools.nf
@@ -125,4 +125,4 @@ process blobtools_plot {
     grep -vw all blobtools/!{sample}_summary.txt > blobtools/!{sample}_blobtools.txt
   '''
-}
+}
\ No newline at end of file
diff --git a/modules/datasets.nf b/modules/datasets.nf
index b80a287..43de113 100644
--- a/modules/datasets.nf
+++ b/modules/datasets.nf
@@ -1,7 +1,7 @@
 process datasets_summary {
   tag "${taxon}"
   publishDir params.outdir, mode: 'copy'
-  container 'staphb/ncbi-datasets:15.2.0'
+  container 'quay.io/uphl/datasets:15.12.0'
   maxForks 10
   //#UPHLICA errorStrategy { task.attempt < 2 ? 'retry' : 'ignore'}
   //#UPHLICA pod annotation: 'scheduler.illumina.com/presetSize', value: 'standard-medium'
@@ -10,10 +10,10 @@ process datasets_summary {
   //#UPHLICA time '10m'
 
   input:
-  val(taxon)
+  tuple val(taxon), file(script)
 
   output:
-  path "datasets/*_genomes.csv"                          , emit: genomes
+  path "datasets/*_genomes.csv"                          , emit: genomes, optional: true
   path "logs/${task.process}/*.${workflow.sessionId}.log", emit: log
 
   shell:
@@ -28,32 +28,18 @@ process datasets_summary {
    echo "Nextflow command : " >> $log_file
    cat .command.sh >> $log_file
 
-   taxon="$(echo !{taxon} | tr '_' ' ' | sed 's/[//g' | sed 's/]//g' )"
-   echo "the taxon is now $taxon"
-
-   datasets summary genome taxon "$taxon" --reference --limit !{params.datasets_max_genomes} --as-json-lines | \
-     dataformat tsv genome --fields accession,assminfo-refseq-category,assminfo-level,organism-name,assmstats-total-ungapped-len | \
-     grep -v Homo | \
-     tr '\\t' ',' \
-     > datasets/!{taxon}_genomes.csv
-
-   datasets summary genome taxon "$taxon" --limit !{params.datasets_max_genomes} --as-json-lines | \
-     dataformat tsv genome --fields accession,assminfo-refseq-category,assminfo-level,organism-name,assmstats-total-ungapped-len | \
-     grep -v Homo | \
-     grep -v "Assembly Accession" | \
-     tr '\\t' ',' \
-     >> datasets/!{taxon}_genomes.csv
+   python3 !{script} !{taxon} !{params.datasets_max_genomes}
   '''
}
 
-// It is faster if datasets can download the entire list at a time, but there is a timeout for downloading that is about 20 minutes.
+// It is faster if datasets can download the entire list at a time, but there is a 20 minute timeout for downloading.
 // The '||' is to allow each genome to be downloaded on its own, which is longer overall but each genome should be less than 20 minutes.
 process datasets_download {
   tag "Downloading Genomes"
   // because there's no way to specify threads
   label "medcpus"
   publishDir = [ path: "${params.outdir}", mode: 'copy', pattern: "{logs/*/*log,datasets/fastani_refs.tar.gz}" ]
-  container 'staphb/ncbi-datasets:15.2.0'
+  container 'quay.io/uphl/datasets:15.12.0'
   maxForks 10
   //#UPHLICA errorStrategy { task.attempt < 2 ? 'retry' : 'ignore'}
   //#UPHLICA pod annotation: 'scheduler.illumina.com/presetSize', value: 'standard-medium'
@@ -83,7 +69,7 @@ process datasets_download {
    cat .command.sh >> $log_file
 
    cut -f 1 !{genomes} > all_runs.txt
-   grep -h -v Accession !{ids} | cut -f 1 -d , | sort | uniq > this_run.txt
+   grep -h -v accession !{ids} | cut -f 1 -d , | sort | uniq > this_run.txt
 
    cat all_runs.txt this_run.txt | sort | uniq > id_list.txt
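Note: the comment above `datasets_download` describes the download strategy, batch first for speed, then one genome at a time so each transfer stays under the ~20 minute timeout (the module implements this in bash with `||`). The same shape in Python, with a hypothetical `download()` stand-in for `datasets download genome accession ...`:

```python
# Sketch of the batch-first / per-item-fallback pattern described above.
import subprocess

def download(accessions):
    # hypothetical stand-in for the module's datasets CLI call
    subprocess.run(['datasets', 'download', 'genome', 'accession', *accessions],
                   check=True, timeout=20 * 60)

def download_with_fallback(accessions):
    try:
        download(accessions)           # fastest: one batched request
    except (subprocess.CalledProcessError, subprocess.TimeoutExpired):
        for acc in accessions:         # slower overall, but each item stays under the timeout
            download([acc])
```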
diff --git a/modules/emmtyper.nf b/modules/emmtyper.nf
index 3bc7f22..b1aa1ec 100644
--- a/modules/emmtyper.nf
+++ b/modules/emmtyper.nf
@@ -10,11 +10,11 @@ process emmtyper {
   //#UPHLICA cpus 3
   //#UPHLICA time '24h'
 
-  when:
-  flag =~ 'found'
+  when:
+  flag =~ 'found'
 
   input:
-  tuple val(sample), file(contigs), val(flag)
+  tuple val(sample), file(contigs), val(flag), file(script)
 
   output:
   path "emmtyper/${sample}_emmtyper.txt"                         , emit: collect
@@ -34,12 +34,12 @@ process emmtyper {
    echo "Nextflow command : " >> $log_file
    cat .command.sh >> $log_file
 
-   echo -e "sample\\tIsolate name\\tNumber of BLAST hits\\tNumber of clusters\\tPredicted emm-type\\tPosition(s)\\tPossible emm-like alleles\\temm-like position(s)\\tEMM cluster" > emmtyper/!{sample}_emmtyper.txt
-
    emmtyper !{params.emmtyper_options} \
      --output-format 'verbose' \
      !{contigs} \
      | tee -a $log_file \
-     | awk -v sample=!{sample} '{ print sample "\\t" $0 }' >> emmtyper/!{sample}_emmtyper.txt
+     > !{sample}_emmtyper.txt
+
+   python3 !{script} !{sample}_emmtyper.txt emmtyper/!{sample}_emmtyper.txt emmtyper !{sample}
   '''
}
diff --git a/modules/fastani.nf b/modules/fastani.nf
index 145b415..174306c 100644
--- a/modules/fastani.nf
+++ b/modules/fastani.nf
@@ -16,10 +16,10 @@ process fastani {
   tuple val(sample), file(contigs), path(genomes)
 
   output:
-  tuple val(sample), file("fastani/${sample}_fastani.csv")       , emit: results
-  tuple val(sample), env(top_hit), file("top_hit/*"), optional: true, emit: top_hit
-  path "fastani/*"                                               , emit: everything
-  path "logs/${task.process}/${sample}.${workflow.sessionId}.log" , emit: log
+  tuple val(sample), file("fastani/${sample}_fastani.csv")       , emit: results, optional: true
+  tuple val(sample), env(top_hit), file("top_hit/*")             , emit: top_hit, optional: true
+  path "fastani/*"                                               , emit: everything
+  path "logs/${task.process}/${sample}.${workflow.sessionId}.log", emit: log
 
   shell:
   '''
diff --git a/modules/kleborate.nf b/modules/kleborate.nf
index 301eef4..93c6322 100644
--- a/modules/kleborate.nf
+++ b/modules/kleborate.nf
@@ -13,10 +13,10 @@ process kleborate {
   flag =~ 'found'
 
   input:
-  tuple val(sample), file(contig), val(flag)
+  tuple val(sample), file(contig), val(flag), file(script)
 
   output:
-  path "kleborate/${sample}_results.tsv"                         , emit: collect
+  path "kleborate/${sample}_results.tsv"                         , emit: collect, optional: true
   path "kleborate/${sample}_results.txt"                         , emit: result
   path "logs/${task.process}/${sample}.${workflow.sessionId}.log", emit: log
 
@@ -37,7 +37,6 @@ process kleborate {
      -a !{contig} \
      | tee -a $log_file
 
-   head -n 1 kleborate/!{sample}_results.txt | awk '{print "sample\\t" $0}' > kleborate/!{sample}_results.tsv
-   tail -n 1 kleborate/!{sample}_results.txt | awk -v sample=!{sample} '{print sample "\\t" $0}' >> kleborate/!{sample}_results.tsv
+   python3 !{script} kleborate/!{sample}_results.txt kleborate/!{sample}_results.tsv kleborate !{sample}
   '''
}
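Note: emmtyper, kleborate, and the modules below (mlst, plasmidfinder, serotypefinder, shigatyper) all replace their per-module `echo`/`awk` header handling with the same call shape, `python3 summary_file.py <tool output> <summary tsv> <tool name> <sample>`. A minimal demonstration of the transformation the script performs, with illustrative values:

```python
# summary_file.py prefixes a 'sample' column: the literal word 'sample' on the
# header row (recognized by a tool-specific keyword) and the sample id elsewhere.
sample = 'sample01'
header_keyword = 'largest_contig'   # the marker summary_file.py looks for with kleborate

rows = ['strain\tlargest_contig\tST', 'isolate_A\t523456\tST258']
out = [('sample\t' + r) if header_keyword in r else (sample + '\t' + r) for r in rows]
print('\n'.join(out))
# sample	strain	largest_contig	ST
# sample01	isolate_A	523456	ST258
```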
$1 "," $2 "," $3 "," $4 "," $5 "," $6 "_" $7 }' | \ sort >> kraken2/!{sample}_contigs_summary_kraken2.csv ''' -} +} \ No newline at end of file diff --git a/modules/mlst.nf b/modules/mlst.nf index d802ba1..f2fd328 100644 --- a/modules/mlst.nf +++ b/modules/mlst.nf @@ -10,7 +10,7 @@ process mlst { //#UPHLICA time '10m' input: - tuple val(sample), file(contig) + tuple val(sample), file(contig), file(script) output: path "mlst/${sample}_mlst.tsv" , emit: collect @@ -28,11 +28,11 @@ process mlst { echo "Nextflow command : " >> $log_file cat .command.sh >> $log_file - echo -e "sample\\tfilename\\tmatching PubMLST scheme\\tST\\tID1\\tID2\\tID3\\tID4\\tID5\\tID6\\tID7\\tID8\\tID9\\tID10\\tID11\\tID12\\tID13\\tID14\\tID15" > mlst/!{sample}_mlst.tsv - mlst !{params.mlst_options} \ --threads !{task.cpus} \ - !{contig} | \ - awk -v sample=!{sample} '{print sample "\\t" $1 "\\t" $2 "\\t" $3 "\\t" $4 "\\t" $5 "\\t" $6 "\\t" $7 "\\t" $8 "\\t" $9 "\\t" $10 "\\t" $11 "\\t" $12 "\\t" $13 "\\t" $14 "\\t" $15 "\\t" $16 "\\t" $17 "\\t" $18}' >> mlst/!{sample}_mlst.tsv + !{contig} \ + > !{sample}_mlst.txt + + python3 !{script} !{sample}_mlst.txt mlst/!{sample}_mlst.tsv mlst !{sample} ''' } diff --git a/modules/plasmidfinder.nf b/modules/plasmidfinder.nf index 944cdbf..afad63d 100644 --- a/modules/plasmidfinder.nf +++ b/modules/plasmidfinder.nf @@ -10,11 +10,11 @@ process plasmidfinder { //#UPHLICA time '10m' input: - tuple val(sample), file(file) + tuple val(sample), file(file), file(script) output: path "plasmidfinder/${sample}/*" , emit: files - path "plasmidfinder/${sample}_plasmidfinder.tsv" , emit: collect + path "plasmidfinder/${sample}_plasmidfinder.tsv" , emit: collect, optional: true path "logs/${task.process}/${sample}.${workflow.sessionId}.log" , emit: log shell: @@ -35,7 +35,6 @@ process plasmidfinder { --extented_output \ | tee -a $log_file - head -n 1 plasmidfinder/!{sample}/results_tab.tsv | awk '{print "sample\\t" $0 }' > plasmidfinder/!{sample}_plasmidfinder.tsv - tail -n +2 plasmidfinder/!{sample}/results_tab.tsv | awk -v sample=!{sample} '{print sample "\\t" $0 }' >> plasmidfinder/!{sample}_plasmidfinder.tsv + python3 !{script} plasmidfinder/!{sample}/results_tab.tsv plasmidfinder/!{sample}_plasmidfinder.tsv plasmidfinder !{sample} ''' } diff --git a/modules/seqsero2.nf b/modules/seqsero2.nf index c7562fa..c6a460c 100644 --- a/modules/seqsero2.nf +++ b/modules/seqsero2.nf @@ -42,22 +42,25 @@ process seqsero2 { -n !{sample} \ | tee -a $log_file - enteritidis_check=$(grep "Enteritidis" seqsero2/!{sample}/SeqSero_result.tsv | head -n 1) - sdf_check=$(grep "Detected Sdf" seqsero2/!{sample}/SeqSero_result.tsv | head -n 1 ) - - if [ -n "$enteritidis_check" ] && [ -n "$sdf_check" ] - then - head -n 1 seqsero2/!{sample}/SeqSero_result.tsv > SeqSero_result.tsv.tmp - tail -n 1 seqsero2/!{sample}/SeqSero_result.tsv | awk -F "\\t" -v OFS='\t' '{($9 = $9 " (Sdf+)") ; print $0}' >> SeqSero_result.tsv.tmp - mv SeqSero_result.tsv.tmp seqsero2/!{sample}/SeqSero_result.tsv - elif [ -n "$enteritidis_check" ] && [ -z "$sdf_check" ] + if [ -f "seqsero2/!{sample}/SeqSero_result.tsv" ] then - head -n 1 seqsero2/!{sample}/SeqSero_result.tsv > SeqSero_result.tsv.tmp - tail -n 1 seqsero2/!{sample}/SeqSero_result.tsv | awk -F "\\t" -v OFS='\t' '{($9 = $9 " (Sdf-)") ; print $0}' >> SeqSero_result.tsv.tmp - mv SeqSero_result.tsv.tmp seqsero2/!{sample}/SeqSero_result.tsv - fi + enteritidis_check=$(grep "Enteritidis" seqsero2/!{sample}/SeqSero_result.tsv | head -n 1) + sdf_check=$(grep "Detected Sdf" 
-
-   if [ -n "$enteritidis_check" ] && [ -n "$sdf_check" ]
-   then
-     head -n 1 seqsero2/!{sample}/SeqSero_result.tsv > SeqSero_result.tsv.tmp
-     tail -n 1 seqsero2/!{sample}/SeqSero_result.tsv | awk -F "\\t" -v OFS='\t' '{($9 = $9 " (Sdf+)") ; print $0}' >> SeqSero_result.tsv.tmp
-     mv SeqSero_result.tsv.tmp seqsero2/!{sample}/SeqSero_result.tsv
-   elif [ -n "$enteritidis_check" ] && [ -z "$sdf_check" ]
+   if [ -f "seqsero2/!{sample}/SeqSero_result.tsv" ]
    then
-     head -n 1 seqsero2/!{sample}/SeqSero_result.tsv > SeqSero_result.tsv.tmp
-     tail -n 1 seqsero2/!{sample}/SeqSero_result.tsv | awk -F "\\t" -v OFS='\t' '{($9 = $9 " (Sdf-)") ; print $0}' >> SeqSero_result.tsv.tmp
-     mv SeqSero_result.tsv.tmp seqsero2/!{sample}/SeqSero_result.tsv
-   fi
+     enteritidis_check=$(grep "Enteritidis" seqsero2/!{sample}/SeqSero_result.tsv | head -n 1)
+     sdf_check=$(grep "Detected Sdf" seqsero2/!{sample}/SeqSero_result.tsv | head -n 1 )
 
-   cat seqsero2/!{sample}/SeqSero_result.tsv | sed 's/Sample name/sample/g' > seqsero2/!{sample}/SeqSero_result.tsv.tmp
-   mv seqsero2/!{sample}/SeqSero_result.tsv.tmp seqsero2/!{sample}/SeqSero_result.tsv
+     if [ -n "$enteritidis_check" ] && [ -n "$sdf_check" ]
+     then
+       head -n 1 seqsero2/!{sample}/SeqSero_result.tsv > SeqSero_result.tsv.tmp
+       tail -n 1 seqsero2/!{sample}/SeqSero_result.tsv | awk -F "\\t" -v OFS='\t' '{($9 = $9 " (Sdf+)") ; print $0}' >> SeqSero_result.tsv.tmp
+       mv SeqSero_result.tsv.tmp seqsero2/!{sample}/SeqSero_result.tsv
+     elif [ -n "$enteritidis_check" ] && [ -z "$sdf_check" ]
+     then
+       head -n 1 seqsero2/!{sample}/SeqSero_result.tsv > SeqSero_result.tsv.tmp
+       tail -n 1 seqsero2/!{sample}/SeqSero_result.tsv | awk -F "\\t" -v OFS='\t' '{($9 = $9 " (Sdf-)") ; print $0}' >> SeqSero_result.tsv.tmp
+       mv SeqSero_result.tsv.tmp seqsero2/!{sample}/SeqSero_result.tsv
+     fi
+
+     cat seqsero2/!{sample}/SeqSero_result.tsv | sed 's/Sample name/sample/g' > seqsero2/!{sample}/SeqSero_result.tsv.tmp
+     mv seqsero2/!{sample}/SeqSero_result.tsv.tmp seqsero2/!{sample}/SeqSero_result.tsv
+   fi
   '''
}
diff --git a/modules/serotypefinder.nf b/modules/serotypefinder.nf
index 4d58cb1..af3f37c 100644
--- a/modules/serotypefinder.nf
+++ b/modules/serotypefinder.nf
@@ -14,11 +14,11 @@ process serotypefinder {
   flag =~ 'found'
 
   input:
-  tuple val(sample), file(file), val(flag)
+  tuple val(sample), file(file), val(flag), file(script)
 
   output:
   path "serotypefinder/${sample}/*"                              , emit: files
-  path "serotypefinder/${sample}_serotypefinder.tsv"             , emit: collect
+  path "serotypefinder/${sample}_serotypefinder.tsv"             , emit: collect, optional: true
   path "logs/${task.process}/${sample}.${workflow.sessionId}.log", emit: log
 
   shell:
@@ -40,7 +40,6 @@ process serotypefinder {
 
    cp serotypefinder/!{sample}/results_tab.tsv serotypefinder/!{sample}_serotypefinder.tsv
 
-   head -n 1 serotypefinder/!{sample}/results_tab.tsv | awk '{print "sample\\t" $0 }' > serotypefinder/!{sample}_serotypefinder.tsv
-   tail -n +2 serotypefinder/!{sample}/results_tab.tsv | awk -v sample=!{sample} '{print sample "\\t" $0 }' >> serotypefinder/!{sample}_serotypefinder.tsv
+   python3 !{script} serotypefinder/!{sample}/results_tab.tsv serotypefinder/!{sample}_serotypefinder.tsv serotypefinder !{sample}
   '''
}
diff --git a/modules/shigatyper.nf b/modules/shigatyper.nf
index 92b9183..18e667d 100644
--- a/modules/shigatyper.nf
+++ b/modules/shigatyper.nf
@@ -15,10 +15,10 @@ process shigatyper {
   flag =~ 'found'
 
   input:
-  tuple val(sample), file(input), val(flag)
+  tuple val(sample), file(input), val(flag), file(script)
 
   output:
-  path "shigatyper/${sample}_shigatyper.tsv"     , emit: files
+  path "shigatyper/${sample}_shigatyper.tsv"     , optional: true, emit: files
   path "shigatyper/${sample}_shigatyper-hits.tsv", optional: true, emit: collect
   path "logs/${task.process}/${sample}.${workflow.sessionId}.log", emit: log
 
@@ -39,14 +39,8 @@ process shigatyper {
      --name !{sample} \
      | tee -a $log_file
 
-   hits=$(find . -iname "*hits.tsv" | head -n 1)
-iname "*hits.tsv" | head -n 1) - if [ -f "$hits" ] - then - head -n 1 $hits | awk '{print "sample\\t" $0}' > shigatyper/!{sample}_shigatyper-hits.tsv - tail -n +2 $hits | awk -v sample=!{sample} '{print sample "\\t" $0}' >> shigatyper/!{sample}_shigatyper-hits.tsv - rm $hits - fi + python3 !{script} !{sample}-hits.tsv shigatyper/!{sample}_shigatyper-hits.tsv shigatyper !{sample} - cat *tsv > shigatyper/!{sample}_shigatyper.tsv + if [ -f "!{sample}.tsv" ] ; then cp !{sample}.tsv shigatyper/!{sample}_shigatyper.tsv ; fi ''' } diff --git a/modules/snp-dists.nf b/modules/snp-dists.nf index 3728298..fd7edf5 100644 --- a/modules/snp-dists.nf +++ b/modules/snp-dists.nf @@ -36,9 +36,9 @@ process snp_dists { genome_length=$(cat !{contigs} | tr "\n" ";" | sed 's/>[^>]*//2g' | tr ";" "\n" | grep -v ">" | wc -c ) - sed '0,/,/s/,/num_samples=!{num_samples};num_core_genes=!{num_core_genes};core_genome_length=$genome_length,/' snp-dists/snp_matrix.txt > snp-dists/snp_matrix_with_qc.txt + sed '0,/,/s/,/num_samples=!{num_samples};num_core_genes=!{num_core_genes},/' snp-dists/snp_matrix.txt > snp-dists/snp_matrix_with_qc.txt - echo "num_samples,num_core_genes,core_genome_length" > snp-dists/roary_metrics_mqc.csv - echo "!{num_samples},!{num_core_genes},$genome_length" >> snp-dists/roary_metrics_mqc.csv + echo "num_samples,num_core_genes,core_genome_length" > snp-dists/roary_metrics_mqc.csv + echo "!{num_samples},!{num_core_genes},${genome_length}" >> snp-dists/roary_metrics_mqc.csv ''' } diff --git a/nextflow.config b/nextflow.config index 96c7826..f1ed574 100644 --- a/nextflow.config +++ b/nextflow.config @@ -3,7 +3,7 @@ manifest { author = 'Erin Young' homePage = 'https://github.com/UPHL-BioNGS/Grandeur' mainScript = 'grandeur.nf' - version = '3.2.20230718' + version = '3.2.20230803' defaultBranch = 'main' description = 'Grandeur is short-read de novo assembly pipeline with serotyping.' 
diff --git a/subworkflows/average_nucleotide_identity.nf b/subworkflows/average_nucleotide_identity.nf
index 3372363..160cf61 100644
--- a/subworkflows/average_nucleotide_identity.nf
+++ b/subworkflows/average_nucleotide_identity.nf
@@ -10,6 +10,7 @@ workflow average_nucleotide_identity {
    ch_contigs
    ch_static_fastani_genomes
    ch_genome_ref
+   dataset_script
 
  main:
    if ( params.current_datasets ) {
@@ -20,7 +21,7 @@ workflow average_nucleotide_identity {
        .map(it -> it.trim())
        .set{ ch_species_list }
 
-     datasets_summary(ch_species_list)
+     datasets_summary(ch_species_list.combine(dataset_script))
      datasets_download(datasets_summary.out.genomes.collect(), ch_genome_ref)
 
      ch_fastani_db = datasets_download.out.genomes
diff --git a/subworkflows/information.nf b/subworkflows/information.nf
index 7502c08..180d1e7 100644
--- a/subworkflows/information.nf
+++ b/subworkflows/information.nf
@@ -20,6 +20,7 @@ workflow information {
    ch_contigs
    ch_flag
    ch_size
+   summfle_script
 
  main:
    // fastq files
@@ -27,9 +28,9 @@ workflow information {
    fastqc(ch_reads)
 
    // contigs
-   mlst(ch_contigs)
+   mlst(ch_contigs.combine(summfle_script))
    quast(ch_contigs)
-   plasmidfinder(ch_contigs)
+   plasmidfinder(ch_contigs.combine(summfle_script))
 
    // estimating size of genome for the organism
    size(ch_size.join(quast.out.results, by: 0, remainder: true).map{ it -> tuple(it[0], [ it[1], it[2], it[3], it[4], it[5], it[6], it[7], it[8]])})
 
@@ -38,13 +39,13 @@ workflow information {
    flag(ch_flag.groupTuple())
 
    amrfinderplus(ch_contigs.join(flag.out.organism, by:0))
-   emmtyper(ch_contigs.join(flag.out.strepa_flag, by:0))
+   emmtyper(ch_contigs.join(flag.out.strepa_flag, by:0).combine(summfle_script))
    //kaptive(ch_contigs.join(flag.out.klebacin_flag, by:0))
-   kleborate(ch_contigs.join(flag.out.klebsiella_flag, by:0))
+   kleborate(ch_contigs.join(flag.out.klebsiella_flag, by:0).combine(summfle_script))
    legsta(ch_contigs.join(flag.out.legionella_flag, by:0))
    seqsero2(ch_contigs.join(flag.out.salmonella_flag, by:0))
-   serotypefinder(ch_contigs.join(flag.out.ecoli_flag, by:0))
-   shigatyper(ch_contigs.join(flag.out.ecoli_flag, by:0))
+   serotypefinder(ch_contigs.join(flag.out.ecoli_flag, by:0).combine(summfle_script))
+   shigatyper(ch_contigs.join(flag.out.ecoli_flag, by:0).combine(summfle_script))
    pbptyper(ch_contigs.join(flag.out.streppneu_flag, by:0))
 
    emmtyper.out.collect
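Note: the subworkflow changes all follow one pattern, `.combine(script_channel)`, which crosses each sample tuple with the single script path so every task stages its own copy of the helper script. Nextflow's `combine` is a Cartesian product; a rough Python analogy with illustrative data, not pipeline code:

```python
# Rough analogy for ch_contigs.combine(summfle_script): a Cartesian product
# that appends the lone script path to every sample tuple.
from itertools import product

ch_contigs = [('sample1', 'sample1.fa'), ('sample2', 'sample2.fa')]
summfle_script = ['bin/summary_file.py']

combined = [(*sample, script) for sample, script in product(ch_contigs, summfle_script)]
print(combined)
# [('sample1', 'sample1.fa', 'bin/summary_file.py'),
#  ('sample2', 'sample2.fa', 'bin/summary_file.py')]
```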