From 83a9e2a8c6f0ae79a6b2eded1e70dc6b5bfee28f Mon Sep 17 00:00:00 2001 From: Victoria Carr Date: Tue, 16 Aug 2022 14:06:50 +0100 Subject: [PATCH 1/5] Remove intermediate files --- main.nf | 4 ---- modules/mlst.nf | 12 ++++++------ modules/pbp_typer.nf | 4 ++++ modules/res_alignments.nf | 12 +++++++++--- modules/res_typer.nf | 2 +- modules/serotyping.nf | 2 +- modules/surface_typer.nf | 3 ++- 7 files changed, 23 insertions(+), 16 deletions(-) diff --git a/main.nf b/main.nf index f53c4c1..cb7d6e7 100644 --- a/main.nf +++ b/main.nf @@ -189,14 +189,12 @@ workflow MLST { // Collect outputs new_alleles = get_mlst_allele_and_pileup.out.new_alleles pileup = get_mlst_allele_and_pileup.out.pileup - existing_alleles = get_mlst_allele_and_pileup.out.existing_alleles status = get_mlst_allele_and_pileup.out.new_alleles_status srst2_results = srst2_for_mlst.out.srst2_results emit: new_alleles pileup - existing_alleles status srst2_results } @@ -325,8 +323,6 @@ workflow { MLST.out.pileup.subscribe { it -> it.copyTo(file("${results_dir}")) } - MLST.out.existing_alleles - .collectFile(name: file("${results_dir}/${params.existing_mlst_alleles_out}"), keepHeader: true, sort: true) MLST.out.status .collectFile(name: file("${results_dir}/${params.new_mlst_alleles_status}"), keepHeader: false, sort: true) diff --git a/modules/mlst.nf b/modules/mlst.nf index 84008d7..1a3f297 100644 --- a/modules/mlst.nf +++ b/modules/mlst.nf @@ -15,10 +15,12 @@ process srst2_for_mlst { mlst_name="Streptococcus_agalactiae" """ - set +e + getmlst.py --species 'Streptococcus agalactiae' srst2 --samtools_args '\\-A' --input_pe ${reads[0]} ${reads[1]} --output ${pair_id} --save_scores --mlst_db ${mlst_db} --mlst_definitions profiles_csv --mlst_delimiter '_' --min_coverage ${min_coverage} + touch ${pair_id}__mlst__${mlst_name}__results.txt + find . \\! -type f \\( -name "${pair_id}*.bam" -o -name "${pair_id}__mlst__${mlst_name}__results.txt" -o -name ${mlst_db} \\) -delete """ } @@ -31,17 +33,15 @@ process get_mlst_allele_and_pileup { output: path(output_new_mlst_alleles_fasta), emit: new_alleles, optional: true path(output_new_mlst_pileup), emit: pileup, optional: true - path(output_existing_mlst_alleles), emit: existing_alleles, optional: true path(output_new_mlst_alleles_log), emit: new_alleles_status script: output_new_mlst_alleles_fasta="${pair_id}_new_mlst_alleles.fasta" output_new_mlst_pileup="${pair_id}_new_mlst_pileup.txt" - output_existing_mlst_alleles="${pair_id}_existing_mlst_alleles.txt" output_new_mlst_alleles_log="${pair_id}_new_mlst_alleles.log" """ - set +e + # Get alleles from mismatches in SRST2 MLST results file samtools index ${bam_file} get_alleles_from_srst2_mlst.py --mlst_results_file ${results_file} --min_read_depth ${min_read_depth} --output_prefix ${pair_id} @@ -81,11 +81,11 @@ process get_mlst_allele_and_pileup { echo "${pair_id}: No new MLST alleles found." > tmp.log fi - - touch ${pair_id}_new_mlst_alleles.log mv tmp.fasta ${output_new_mlst_alleles_fasta} mv tmp_pileup.txt ${output_new_mlst_pileup} mv tmp.log ${output_new_mlst_alleles_log} + + find . \\! -type f \\( -name "${pair_id}_new_mlst_alleles.log" -o -name ${output_new_mlst_alleles_fasta} -o -name ${output_new_mlst_pileup} \\) -delete """ } diff --git a/modules/pbp_typer.nf b/modules/pbp_typer.nf index 6ffb353..8eca3c2 100644 --- a/modules/pbp_typer.nf +++ b/modules/pbp_typer.nf @@ -17,6 +17,8 @@ process get_pbp_genes { # Get BED file of PBP fragments get_pbp_genes_from_contigs.py --blast_out_file ${pair_id}_blast_blactam.out --query_fasta ${blactam_ref} --frac_align_len_threshold ${frac_align_len_threshold} --frac_identity_threshold ${frac_identity_len_threshold} --output_prefix ${pair_id}_ + + find . \\! -type f \\( -name "${pair_id}_*bed" -o -name ${contigs} \\) -delete """ } @@ -46,5 +48,7 @@ process get_pbp_alleles { # Get identical or imperfect hits get_pbp_alleles.py --blast_out_file ${pair_id}_blast_${pbp_type}.out --query_fasta ${pair_id}_${pbp_type}.faa --output_prefix ${pair_id}_${pbp_type}_PBP fi + + find . \\! -type f \\( -name "${pair_id}_${pbp_type}_PBP_new_allele.faa" -o -name "${pair_id}_${pbp_type}_PBP_existing_allele.txt" \\) -delete """ } diff --git a/modules/res_alignments.nf b/modules/res_alignments.nf index cd25cef..d9bc362 100644 --- a/modules/res_alignments.nf +++ b/modules/res_alignments.nf @@ -13,9 +13,11 @@ process srst2_for_res_typing { script: db_name=db.getSimpleName() """ - set +e + srst2 --samtools_args '\\-A' --input_pe ${reads[0]} ${reads[1]} --output ${pair_id} --log --save_scores --min_coverage ${min_coverage} --max_divergence ${max_divergence} --gene_db ${db} + touch ${pair_id}__fullgenes__${db_name}__results.txt + find . \\! -type f \\( -name "${pair_id}*.bam" -o -name "${pair_id}__fullgenes__${db_name}__results.txt" \\) -delete """ } @@ -45,15 +47,17 @@ process split_target_RES_seq_from_sam_file { file("*_*_${pair_id}*.bai") """ - set +e + samtools view -h ${bam_file} > \$(basename ${bam_file} .bam).sam get_targets_from_samfile.py -s \$(basename ${bam_file} .bam).sam -t ${targets_file} -i ${pair_id} -o CHECK_ for check_sam_file in CHECK_*_${pair_id}*.sam; do samtools view -bS \${check_sam_file} > \$(basename \${check_sam_file} .sam).bam samtools index \$(basename \${check_sam_file} .sam).bam \$(basename \${check_sam_file} .sam).bai done + touch dummy_dummy_${pair_id}_dummy.bam touch dummy_dummy_${pair_id}_dummy.bai + find . \\! -type f \\( -name "*_*_${pair_id}*.bam" -o -name "*_*_${pair_id}*.bai" \\) -delete """ } @@ -69,7 +73,7 @@ process freebayes { tuple val(pair_id), file("${pair_id}_consensus_seq.fna"), emit: consensus """ - set +e + for check_bam_file in CHECK_*_${pair_id}*.bam; do target=\$(echo \${check_bam_file} | sed 's/CHECK_//g' | sed 's/_${pair_id}.*//g') freebayes -q 20 -p 1 -f CHECK_\${target}_ref.fna \${check_bam_file} -v CHECK_\${target}_${pair_id}_seq.vcf @@ -77,6 +81,8 @@ process freebayes { tabix -p vcf CHECK_\${target}_${pair_id}_seq.vcf.gz cat CHECK_\${target}_ref.fna | vcf-consensus CHECK_\${target}_${pair_id}_seq.vcf.gz >> ${pair_id}_consensus_seq.fna done + touch ${pair_id}_consensus_seq.fna + find . \\! -name "${pair_id}_consensus_seq.fna" -delete """ } diff --git a/modules/res_typer.nf b/modules/res_typer.nf index a0c8edc..d499e37 100644 --- a/modules/res_typer.nf +++ b/modules/res_typer.nf @@ -15,7 +15,7 @@ process res_typer { variants_output_file="${pair_id}_res_gbs_variants.txt" alleles_accessions_file="${pair_id}_res_alleles_accessions.txt" """ - set +e + process_res_typer_results.py \ --srst2_gbs_fullgenes ${gbs_fullgenes} \ --srst2_gbs_consensus ${gbs_consensus} \ diff --git a/modules/serotyping.nf b/modules/serotyping.nf index 7458a6f..a2abac3 100644 --- a/modules/serotyping.nf +++ b/modules/serotyping.nf @@ -12,7 +12,6 @@ process serotyping { sero_gene_db="GBS-SBG.fasta" """ - set +e # Get latest version of GBS Serotype Database git clone https://github.com/swainechen/GBS-SBG @@ -22,5 +21,6 @@ process serotyping { process_serotyper_results.py --srst2_output SERO_${pair_id} --sero_db ${sero_gene_db} --output ${pair_id}_SeroType_Results.txt --min_read_depth ${min_read_depth} touch ${output_file} + find . \\! -name ${output_file} -delete """ } diff --git a/modules/surface_typer.nf b/modules/surface_typer.nf index 64a48ff..7230898 100644 --- a/modules/surface_typer.nf +++ b/modules/surface_typer.nf @@ -14,11 +14,12 @@ process surface_typer { inc_output_file="${pair_id}_surface_protein_incidence_sample.txt" variants_output_file="${pair_id}_surface_protein_variants_sample.txt" """ - set +e + srst2 --samtools_args '\\-A' --input_pe ${reads[0]} ${reads[1]} --output ${pair_id}_SURFACE --log --save_scores --min_coverage ${min_coverage} --max_divergence ${max_divergence} --gene_db ${surface_protein_db} process_surface_typer_results.py --srst2_gbs_fullgenes ${pair_id}_SURFACE --surface_db ${surface_protein_db} --output_prefix ${pair_id} --min_read_depth ${min_read_depth} touch ${inc_output_file} touch ${variants_output_file} + find . \\! -type f \\( -name ${inc_output_file} -o -name ${variants_output_file} \\) -delete """ } From b5fd9a1d7553eac9c3f1f7eac26dbf16514cb58e Mon Sep 17 00:00:00 2001 From: Victoria Carr Date: Tue, 16 Aug 2022 14:28:02 +0100 Subject: [PATCH 2/5] Fix missing mlst outputs --- modules/mlst.nf | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/modules/mlst.nf b/modules/mlst.nf index 1a3f297..3fee020 100644 --- a/modules/mlst.nf +++ b/modules/mlst.nf @@ -81,8 +81,16 @@ process get_mlst_allele_and_pileup { echo "${pair_id}: No new MLST alleles found." > tmp.log fi - mv tmp.fasta ${output_new_mlst_alleles_fasta} - mv tmp_pileup.txt ${output_new_mlst_pileup} + if [ -f tmp.fasta ] + then + mv tmp.fasta ${output_new_mlst_alleles_fasta} + fi + + if [ -f tmp_pileup.txt ] + then + mv tmp_pileup.txt ${output_new_mlst_pileup} + fi + mv tmp.log ${output_new_mlst_alleles_log} find . \\! -type f \\( -name "${pair_id}_new_mlst_alleles.log" -o -name ${output_new_mlst_alleles_fasta} -o -name ${output_new_mlst_pileup} \\) -delete From 6f5aa1464d658f58748a05880c6955e07ed74a8d Mon Sep 17 00:00:00 2001 From: Victoria Carr Date: Tue, 16 Aug 2022 14:34:21 +0100 Subject: [PATCH 3/5] Remove delete files in pbp typer --- modules/pbp_typer.nf | 4 ---- 1 file changed, 4 deletions(-) diff --git a/modules/pbp_typer.nf b/modules/pbp_typer.nf index 8eca3c2..6ffb353 100644 --- a/modules/pbp_typer.nf +++ b/modules/pbp_typer.nf @@ -17,8 +17,6 @@ process get_pbp_genes { # Get BED file of PBP fragments get_pbp_genes_from_contigs.py --blast_out_file ${pair_id}_blast_blactam.out --query_fasta ${blactam_ref} --frac_align_len_threshold ${frac_align_len_threshold} --frac_identity_threshold ${frac_identity_len_threshold} --output_prefix ${pair_id}_ - - find . \\! -type f \\( -name "${pair_id}_*bed" -o -name ${contigs} \\) -delete """ } @@ -48,7 +46,5 @@ process get_pbp_alleles { # Get identical or imperfect hits get_pbp_alleles.py --blast_out_file ${pair_id}_blast_${pbp_type}.out --query_fasta ${pair_id}_${pbp_type}.faa --output_prefix ${pair_id}_${pbp_type}_PBP fi - - find . \\! -type f \\( -name "${pair_id}_${pbp_type}_PBP_new_allele.faa" -o -name "${pair_id}_${pbp_type}_PBP_existing_allele.txt" \\) -delete """ } From b236b47bd7fa580293aae050a49daae6aa1477e1 Mon Sep 17 00:00:00 2001 From: Victoria Carr Date: Tue, 16 Aug 2022 14:56:17 +0100 Subject: [PATCH 4/5] Undo remove optional existing alleles file --- main.nf | 4 ++++ modules/mlst.nf | 4 +++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/main.nf b/main.nf index cb7d6e7..f53c4c1 100644 --- a/main.nf +++ b/main.nf @@ -189,12 +189,14 @@ workflow MLST { // Collect outputs new_alleles = get_mlst_allele_and_pileup.out.new_alleles pileup = get_mlst_allele_and_pileup.out.pileup + existing_alleles = get_mlst_allele_and_pileup.out.existing_alleles status = get_mlst_allele_and_pileup.out.new_alleles_status srst2_results = srst2_for_mlst.out.srst2_results emit: new_alleles pileup + existing_alleles status srst2_results } @@ -323,6 +325,8 @@ workflow { MLST.out.pileup.subscribe { it -> it.copyTo(file("${results_dir}")) } + MLST.out.existing_alleles + .collectFile(name: file("${results_dir}/${params.existing_mlst_alleles_out}"), keepHeader: true, sort: true) MLST.out.status .collectFile(name: file("${results_dir}/${params.new_mlst_alleles_status}"), keepHeader: false, sort: true) diff --git a/modules/mlst.nf b/modules/mlst.nf index 3fee020..e3d39e6 100644 --- a/modules/mlst.nf +++ b/modules/mlst.nf @@ -33,11 +33,13 @@ process get_mlst_allele_and_pileup { output: path(output_new_mlst_alleles_fasta), emit: new_alleles, optional: true path(output_new_mlst_pileup), emit: pileup, optional: true + path(output_existing_mlst_alleles), emit: existing_alleles, optional: true path(output_new_mlst_alleles_log), emit: new_alleles_status script: output_new_mlst_alleles_fasta="${pair_id}_new_mlst_alleles.fasta" output_new_mlst_pileup="${pair_id}_new_mlst_pileup.txt" + output_existing_mlst_alleles="${pair_id}_existing_mlst_alleles.txt" output_new_mlst_alleles_log="${pair_id}_new_mlst_alleles.log" """ @@ -90,7 +92,7 @@ process get_mlst_allele_and_pileup { then mv tmp_pileup.txt ${output_new_mlst_pileup} fi - + mv tmp.log ${output_new_mlst_alleles_log} find . \\! -type f \\( -name "${pair_id}_new_mlst_alleles.log" -o -name ${output_new_mlst_alleles_fasta} -o -name ${output_new_mlst_pileup} \\) -delete From c0d2d3e3174308320d4769560425dd66ef926cdc Mon Sep 17 00:00:00 2001 From: Victoria Carr Date: Wed, 17 Aug 2022 16:34:58 +0100 Subject: [PATCH 5/5] Unlink and remove unnecessary files --- modules/combine.nf | 9 +++++++++ modules/mlst.nf | 19 ++++++++++++++++-- modules/pbp_typer.nf | 15 ++++++++++++++ modules/res_alignments.nf | 41 +++++++++++++++++++++++++++++++++++---- modules/serotyping.nf | 10 +++++++++- modules/surface_typer.nf | 11 ++++++++++- 6 files changed, 97 insertions(+), 8 deletions(-) diff --git a/modules/combine.nf b/modules/combine.nf index e9f908d..6bac8d1 100644 --- a/modules/combine.nf +++ b/modules/combine.nf @@ -19,6 +19,15 @@ process combine_results { -x "${surface_protein_incidence}" \ -n "${version}" \ -o ${pair_id} + + unlink ${sero_results} + unlink ${res_incidence} + unlink ${res_alleles} + unlink ${surface_protein_incidence} + unlink ${surface_protein_variants} + unlink ${mlst_allelic_frequency} + unlink ${version} + unlink ${config} """ } diff --git a/modules/mlst.nf b/modules/mlst.nf index e3d39e6..cb3d68b 100644 --- a/modules/mlst.nf +++ b/modules/mlst.nf @@ -20,7 +20,19 @@ process srst2_for_mlst { srst2 --samtools_args '\\-A' --input_pe ${reads[0]} ${reads[1]} --output ${pair_id} --save_scores --mlst_db ${mlst_db} --mlst_definitions profiles_csv --mlst_delimiter '_' --min_coverage ${min_coverage} touch ${pair_id}__mlst__${mlst_name}__results.txt - find . \\! -type f \\( -name "${pair_id}*.bam" -o -name "${pair_id}__mlst__${mlst_name}__results.txt" -o -name ${mlst_db} \\) -delete + + # Clean directory + mkdir output + mv ${pair_id}*.bam output + mv ${pair_id}__mlst__${mlst_name}__results.txt output + mv ${mlst_db} output + find . -maxdepth 1 -type f -delete + unlink ${reads[0]} + unlink ${reads[1]} + mv output/${pair_id}*.bam . + mv output/${pair_id}__mlst__${mlst_name}__results.txt . + mv output/${mlst_db} . + rm -d output """ } @@ -95,7 +107,10 @@ process get_mlst_allele_and_pileup { mv tmp.log ${output_new_mlst_alleles_log} - find . \\! -type f \\( -name "${pair_id}_new_mlst_alleles.log" -o -name ${output_new_mlst_alleles_fasta} -o -name ${output_new_mlst_pileup} \\) -delete + # Clean + unlink ${bam_file} + unlink ${results_file} + unlink ${mlst_alleles} """ } diff --git a/modules/pbp_typer.nf b/modules/pbp_typer.nf index 6ffb353..8839f4c 100644 --- a/modules/pbp_typer.nf +++ b/modules/pbp_typer.nf @@ -17,6 +17,16 @@ process get_pbp_genes { # Get BED file of PBP fragments get_pbp_genes_from_contigs.py --blast_out_file ${pair_id}_blast_blactam.out --query_fasta ${blactam_ref} --frac_align_len_threshold ${frac_align_len_threshold} --frac_identity_threshold ${frac_identity_len_threshold} --output_prefix ${pair_id}_ + + # Clean directory + mkdir output + mv ${pair_id}_*bed output + mv ${contigs} output + find . -maxdepth 1 -type f -delete + unlink ${blactam_ref} + mv output/${pair_id}_*bed . + mv output/${contigs} . + rm -d output """ } @@ -45,6 +55,11 @@ process get_pbp_alleles { # Get identical or imperfect hits get_pbp_alleles.py --blast_out_file ${pair_id}_blast_${pbp_type}.out --query_fasta ${pair_id}_${pbp_type}.faa --output_prefix ${pair_id}_${pbp_type}_PBP + + unlink ${pair_id}_${pbp_type}.bed fi + + unlink ${contigs} + unlink ${gbs_blactam_db} """ } diff --git a/modules/res_alignments.nf b/modules/res_alignments.nf index d9bc362..1296753 100644 --- a/modules/res_alignments.nf +++ b/modules/res_alignments.nf @@ -17,7 +17,18 @@ process srst2_for_res_typing { srst2 --samtools_args '\\-A' --input_pe ${reads[0]} ${reads[1]} --output ${pair_id} --log --save_scores --min_coverage ${min_coverage} --max_divergence ${max_divergence} --gene_db ${db} touch ${pair_id}__fullgenes__${db_name}__results.txt - find . \\! -type f \\( -name "${pair_id}*.bam" -o -name "${pair_id}__fullgenes__${db_name}__results.txt" \\) -delete + + # Clean directory + mkdir output + mv ${pair_id}*.bam output + mv ${pair_id}__fullgenes__${db_name}__results.txt output + find . -maxdepth 1 -type f -delete + unlink ${reads[0]} + unlink ${reads[1]} + unlink ${db} + mv output/${pair_id}*.bam . + mv output/${pair_id}__fullgenes__${db_name}__results.txt . + rm -d output """ } @@ -32,6 +43,10 @@ process split_target_RES_sequences { """ get_targets_from_db.py -f ${fasta_file} -t ${targets_file} -o CHECK_ + + # Clean + unlink ${fasta_file} + unlink ${targets_file} """ } @@ -57,7 +72,10 @@ process split_target_RES_seq_from_sam_file { touch dummy_dummy_${pair_id}_dummy.bam touch dummy_dummy_${pair_id}_dummy.bai - find . \\! -type f \\( -name "*_*_${pair_id}*.bam" -o -name "*_*_${pair_id}*.bai" \\) -delete + + # Clean directory + unlink ${bam_file} + unlink ${targets_file} """ } @@ -73,16 +91,31 @@ process freebayes { tuple val(pair_id), file("${pair_id}_consensus_seq.fna"), emit: consensus """ - for check_bam_file in CHECK_*_${pair_id}*.bam; do target=\$(echo \${check_bam_file} | sed 's/CHECK_//g' | sed 's/_${pair_id}.*//g') freebayes -q 20 -p 1 -f CHECK_\${target}_ref.fna \${check_bam_file} -v CHECK_\${target}_${pair_id}_seq.vcf bgzip CHECK_\${target}_${pair_id}_seq.vcf tabix -p vcf CHECK_\${target}_${pair_id}_seq.vcf.gz cat CHECK_\${target}_ref.fna | vcf-consensus CHECK_\${target}_${pair_id}_seq.vcf.gz >> ${pair_id}_consensus_seq.fna + rm CHECK_\${target}_${pair_id}_seq.vcf.gz + rm CHECK_\${target}_${pair_id}_seq.vcf.gz.tbi + rm CHECK_\${target}_ref.fna.fai done touch ${pair_id}_consensus_seq.fna - find . \\! -name "${pair_id}_consensus_seq.fna" -delete + + # Clean directory + for check_bam_file in CHECK_*_${pair_id}*.bam; do + target=\$(echo \${check_bam_file} | sed 's/CHECK_//g' | sed 's/_${pair_id}.*//g') + unlink \${check_bam_file} + unlink CHECK_\${target}_${pair_id}_seq.bai + unlink CHECK_\${target}_ref.fna + done + + mkdir output + mv ${pair_id}_consensus_seq.fna output + find . -maxdepth 1 -type f -delete + mv output/${pair_id}_consensus_seq.fna . + rm -d output """ } diff --git a/modules/serotyping.nf b/modules/serotyping.nf index a2abac3..bad9e95 100644 --- a/modules/serotyping.nf +++ b/modules/serotyping.nf @@ -21,6 +21,14 @@ process serotyping { process_serotyper_results.py --srst2_output SERO_${pair_id} --sero_db ${sero_gene_db} --output ${pair_id}_SeroType_Results.txt --min_read_depth ${min_read_depth} touch ${output_file} - find . \\! -name ${output_file} -delete + + # Clean directory + mkdir output + mv ${output_file} output + find . -maxdepth 1 -type f -delete + unlink ${reads[0]} + unlink ${reads[1]} + mv output/${output_file} . + rm -d output """ } diff --git a/modules/surface_typer.nf b/modules/surface_typer.nf index 7230898..cb74ee6 100644 --- a/modules/surface_typer.nf +++ b/modules/surface_typer.nf @@ -20,6 +20,15 @@ process surface_typer { touch ${inc_output_file} touch ${variants_output_file} - find . \\! -type f \\( -name ${inc_output_file} -o -name ${variants_output_file} \\) -delete + + # Clean directory + mkdir output + mv ${inc_output_file} output + mv ${variants_output_file} output + find . -maxdepth 1 -type f -delete + unlink ${surface_protein_db} + mv output/${inc_output_file} . + mv output/${variants_output_file} . + rm -d output """ }