Merge pull request #71 from UPHL-BioNGS/update-20230523
Update 20230613
erinyoung authored Jun 12, 2023
2 parents 24782d5 + 4e1d4d5 commit c6ce94b
Showing 8 changed files with 68 additions and 37 deletions.
16 changes: 8 additions & 8 deletions assets/genomes.txt
@@ -13,7 +13,7 @@ GCF_010918895.1 Acinetobacter_schindleri
GCF_016064815.1 Acinetobacter_seifertii
GCF_018409485.1 Acinetobacter_variabilis
GCF_000967305.2 Alcaligenes_faecalis
GCF_008011635.1 Campylobacter_coli
GCA_008011635.1 Campylobacter_coli
GCF_009730395.1 Campylobacter_coli
GCF_000015085.1 Campylobacter_fetus
GCF_000495505.1 Campylobacter_fetus
@@ -22,13 +22,13 @@ GCF_011600945.2 Campylobacter_fetus
GCF_001643955.1 Campylobacter_hyointestinalis
GCF_013372165.1 Campylobacter_hyointestinalis_subsp._lawsonii
GCF_000017485.1 Campylobacter_jejuni
GCF_008011525.1 Campylobacter_jejuni
GCA_008011525.1 Campylobacter_jejuni
GCF_000009085.1 Campylobacter_jejuni_subsp._jejuni_NCTC_11168_=_ATCC_700819
GCF_000816225.1 Campylobacter_lari
GCF_000019205.1 Campylobacter_lari_RM2100
GCF_014931075.1 Campylobacter_peloridis
GCF_000816305.1 Campylobacter_subantarcticus_LMG_24377
GCF_008011615.1 Campylobacter_upsaliensis
GCA_008011615.1 Campylobacter_upsaliensis
GCF_916098265.1 Campylobacter_upsaliensis
GCF_001558935.2 Citrobacter_amalonaticus
GCF_009648935.1 Citrobacter_braakii
@@ -111,7 +111,7 @@ GCF_014526935.1 Listeria_monocytogenes
GCF_000196035.1 Listeria_monocytogenes_EGD-e
GCF_017363605.1 Listeria_seeligeri
GCF_002489005.1 Listeria_welshimeri
GCF_007681265.1 Mixta_calida
GCA_007681265.1 Mixta_calida
GCF_902387845.1 Morganella_morganii
GCF_013030075.1 Neisseria_gonorrhoeae
GCF_000233595.1 Pantoea_ananatis_PA13
@@ -136,9 +136,9 @@ GCF_000412675.1 Pseudomonas_putida_NBRC_14164
GCF_902374465.1 Ralstonia_pickettii
GCF_901421005.1 Raoultella_ornithinolytica
GCF_022637595.1 Raoultella_planticola
GCF_013588055.1 Salmonella_bongori
GCA_013588055.1 Salmonella_bongori
GCF_000439255.1 Salmonella_bongori_N268-08
GCF_011388235.1 Salmonella_enterica
GCA_011388235.1 Salmonella_enterica
GCF_000006945.2 Salmonella_enterica_subsp._enterica_serovar_Typhimurium_str._LT2
GCF_003516165.1 Serratia_marcescens
GCF_004768745.1 Serratia_nematodiphila
@@ -155,9 +155,9 @@ GCF_000221985.1 Streptococcus_pseudopneumoniae
GCF_900475035.1 Streptococcus_pyogenes
GCF_009665435.1 Vibrio_alginolyticus
GCA_023650915.1 Vibrio_alginolyticus
GCF_009665515.2 Vibrio_cholerae
GCA_009665515.2 Vibrio_cholerae
GCF_008369605.1 Vibrio_cholerae
GCF_009665415.1 Vibrio_cidicii
GCA_009665415.1 Vibrio_cidicii
GCF_009665395.1 Vibrio_cincinnatiensis
GCF_009665355.1 Vibrio_fluvialis
GCF_009665335.1 Vibrio_furnissii
28 changes: 22 additions & 6 deletions bin/summary.py
@@ -48,6 +48,22 @@
tsv_files = [ quast, seqsero2, kleborate, mlst, emmtyper , pbptyper]
top_hit = [ fastani ]

##########################################
# exiting if no input files #
##########################################

if not exists(names) :
print("No analyses to report on for this run!")
with open(extended + '.tsv', 'w') as fp:
pass
with open(extended + '.txt', 'w') as fp:
pass
with open(final + '.tsv', 'w') as fp:
pass
with open(final + '.txt', 'w') as fp:
pass
quit()

##########################################
# creating the summary dataframe #
##########################################
@@ -139,9 +155,9 @@
print("Adding results for " + file)
analysis = "fastqc"
new_df = pd.read_csv(file, dtype = str, index_col= False)
R1_df = new_df.drop_duplicates(subset='sample', keep="first")
R1_df = new_df.drop_duplicates(subset='sample', keep="first").copy()
R1_df = R1_df.add_prefix('R1_')
R2_df = new_df.drop_duplicates(subset='sample', keep="last")
R2_df = new_df.drop_duplicates(subset='sample', keep="last").copy()
R2_df = R2_df.add_prefix('R2_')
new_df = pd.merge(R1_df, R2_df, left_on="R1_sample", right_on="R2_sample", how = 'left')
new_df['sample'] = new_df['R1_sample']
@@ -202,9 +218,9 @@
new_df = new_df.sort_values(by='Identity', ascending=False)
new_df = new_df.drop_duplicates(subset=['sample', 'Database'], keep="first")
new_df = new_df.add_prefix(analysis + '_')
H_df = new_df[new_df[analysis + '_Database' ] == 'H_type']
H_df = new_df[new_df[analysis + '_Database' ] == 'H_type'].copy()
H_df = H_df.add_suffix('_H')
O_df = new_df[new_df[analysis + '_Database' ] == 'O_type']
O_df = new_df[new_df[analysis + '_Database' ] == 'O_type'].copy()
O_df = O_df.add_suffix('_O')
summary_df = pd.merge(summary_df, O_df, left_on="sample", right_on=analysis + "_sample_O", how = 'left')
summary_df.drop(analysis + "_sample_O", axis=1, inplace=True)
@@ -255,8 +271,8 @@
print("Adding analysis parsed via multiqc in " + file)
new_df = pd.read_table(file, dtype = str, index_col= False)
if "FastQC_mqc-generalstats-fastqc-avg_sequence_length" in new_df.columns :
tmp_df = new_df[["Sample","FastQC_mqc-generalstats-fastqc-avg_sequence_length"]]
tmp_df["fastqc_avg_length"]= tmp_df["FastQC_mqc-generalstats-fastqc-avg_sequence_length"]
tmp_df = new_df[["Sample","FastQC_mqc-generalstats-fastqc-avg_sequence_length"]].copy()
tmp_df["fastqc_avg_length"] = tmp_df["FastQC_mqc-generalstats-fastqc-avg_sequence_length"]
tmp_df.drop("FastQC_mqc-generalstats-fastqc-avg_sequence_length", axis=1, inplace=True)

summary_df["possible_fastqc_name"] = summary_df['file'].str.split(" ").str[0].str.split(".").str[0]
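Why the .copy() calls added throughout summary.py: drop_duplicates() and column subsetting can return a frame that pandas still treats as derived from the original, so assigning a new column to it raises SettingWithCopyWarning. A minimal standalone sketch of the pattern, with sample data invented for illustration:

import pandas as pd

df = pd.DataFrame({"sample": ["s1", "s1", "s2"], "avg_length": ["150", "151", "148"]})

# Without .copy(), pandas may flag the deduplicated frame as a view of df,
# and the column assignment below would raise SettingWithCopyWarning.
r1_df = df.drop_duplicates(subset="sample", keep="first").copy()
r1_df["fastqc_avg_length"] = r1_df["avg_length"]  # safe: r1_df is an independent copy
print(r1_df)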
2 changes: 1 addition & 1 deletion modules/blast.nf
@@ -2,7 +2,7 @@ process blastn {
tag "${sample}"
label "medcpus"
publishDir params.outdir, mode: 'copy'
container 'staphb/blast:2.13.0'
container 'staphb/blast:2.14.0'
maxForks 10
//#UPHLICA errorStrategy { task.attempt < 2 ? 'retry' : 'ignore'}
//#UPHLICA pod annotation: 'scheduler.illumina.com/presetSize' , value: 'himem-medium'
35 changes: 23 additions & 12 deletions modules/datasets.nf
@@ -1,7 +1,7 @@
process datasets_summary {
tag "${taxon}"
publishDir params.outdir, mode: 'copy'
container 'staphb/ncbi-datasets:14.20.0'
container 'staphb/ncbi-datasets:15.2.0'
maxForks 10
//#UPHLICA errorStrategy { task.attempt < 2 ? 'retry' : 'ignore'}
//#UPHLICA pod annotation: 'scheduler.illumina.com/presetSize', value: 'standard-medium'
@@ -30,20 +30,29 @@ process datasets_summary
taxon=$(echo !{taxon} | tr "_" " ")
datasets summary genome taxon "$taxon" --reference --limit !{params.datasets_max_genomes} --as-json-lines | \
datasets summary genome taxon "$taxon" --reference --limit !{params.datasets_max_genomes} --as-json-lines | \
dataformat tsv genome --fields accession,assminfo-refseq-category,assminfo-level,organism-name,assmstats-total-ungapped-len | \
grep -v Homo | \
tr '\\t' ',' \
> datasets/!{taxon}_genomes.csv
datasets summary genome taxon "$taxon" --limit !{params.datasets_max_genomes} --as-json-lines | \
dataformat tsv genome --fields accession,assminfo-refseq-category,assminfo-level,organism-name,assmstats-total-ungapped-len | \
grep -v Homo | \
grep -v "Assembly Accession" | \
tr '\\t' ',' \
>> datasets/!{taxon}_genomes.csv
'''
}

// It is faster if datasets can download the entire list at a time, but there is a timeout for downloading that is about 20 minutes.
// The '||' is to allow each genome to be downloaded on its own, which is longer overall but each genome should be less than 20 minutes.
process datasets_download {
tag "Downloading Genomes"
// because there's no way to specify threads
label "medcpus"
publishDir params.outdir, mode: 'copy'
container 'staphb/ncbi-datasets:14.20.0'
publishDir = [ path: "${params.outdir}", mode: 'copy', pattern: "{logs/*/*log,datasets/fastani_refs.tar.gz}" ]
container 'staphb/ncbi-datasets:15.2.0'
maxForks 10
//#UPHLICA errorStrategy { task.attempt < 2 ? 'retry' : 'ignore'}
//#UPHLICA pod annotation: 'scheduler.illumina.com/presetSize', value: 'standard-medium'
@@ -77,18 +86,20 @@ process datasets_download
cat all_runs.txt this_run.txt | sort | uniq > id_list.txt
datasets download genome accession --inputfile id_list.txt --filename ncbi_dataset.zip
( datasets download genome accession --inputfile id_list.txt --filename ncbi_dataset.zip ; unzip -o ncbi_dataset.zip ) || \
( while read line ; do echo "Downloading $line" ; datasets download genome accession $line --filename dataset.zip ; unzip -o dataset.zip ; done < id_list.txt )
fastas=$(ls ncbi_dataset/data/*/*.fna )
unzip -o ncbi_dataset.zip
fastas=$(ls ncbi_dataset/data/*/*.fna)
for fasta in ${fastas[@]}
do
accession=$(echo $fasta | cut -f 3 -d / )
organism=$(head -n 1 $fasta | awk '{print $2 "_" $3 }' )
echo "Copying $fasta to genomes"
accession=$(echo $fasta | cut -f 4 -d / | cut -f 1,2 -d _ )
organism=$(head -n 1 $fasta | awk '{print $2 "_" $3 }' | sed 's/,//g' )
cat $fasta | sed 's/ /_/g' | sed 's/,//g' > genomes/${organism}_${accession}.fna
done
done
rm -rf genomes/*:_*
tar -czvf datasets/fastani_refs.tar.gz genomes/
'''
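The comment above datasets_download describes the retry strategy: one bulk download is fastest, but the service times out around 20 minutes, so on failure each accession is fetched on its own. The same pattern in a rough Python sketch; the subprocess command lines are illustrative stand-ins for the process's datasets calls:

import subprocess

def download_genomes(accessions):
    """Try one bulk fetch; on any failure, fall back to per-accession fetches."""
    try:
        # Fast path: a single request for the whole list.
        subprocess.run(
            ["datasets", "download", "genome", "accession", *accessions,
             "--filename", "ncbi_dataset.zip"],
            check=True, timeout=20 * 60)  # mirror the ~20-minute service timeout
    except (subprocess.CalledProcessError, subprocess.TimeoutExpired):
        # Slow path: each genome alone should finish well under the timeout.
        for acc in accessions:
            subprocess.run(
                ["datasets", "download", "genome", "accession", acc,
                 "--filename", acc + ".zip"],
                check=True, timeout=20 * 60)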
2 changes: 1 addition & 1 deletion modules/fastp.nf
@@ -1,7 +1,7 @@
process fastp {
tag "${sample}"
publishDir params.outdir, mode: 'copy'
container 'staphb/fastp:0.23.2'
container 'staphb/fastp:0.23.4'
maxForks 10
//#UPHLICA errorStrategy { task.attempt < 2 ? 'retry' : 'ignore'}
//#UPHLICA pod annotation: 'scheduler.illumina.com/presetSize', value: 'standard-medium'
15 changes: 9 additions & 6 deletions modules/grandeur.nf
@@ -39,7 +39,7 @@ process species {
if [ -f "mash_summary.csv" ]
then
cut -f 7 -d , mash_summary.csv >> species.txt
cut -f 7 -d , mash_summary.csv | tail -n+2 >> species.txt
fi
grep -v no-hit species.txt | grep -v undef | grep -v name | sort | uniq > datasets/species_list.txt
@@ -223,12 +223,15 @@ process size {
if [ -f "!{sample}_fastani.csv" ]
then
genus=$(head -n 2 !{sample}_fastani.csv | tail -n 1 | cut -f 3 -d "," | cut -f 1 -d "_" )
species=$(head -n 2 !{sample}_fastani.csv | tail -n 1 | cut -f 3 -d "," | cut -f 2 -d "_" )
accession=$(head -n 2 !{sample}_fastani.csv | tail -n 1 | sed 's/.*_GC/GC/g' | cut -f 1,2 -d '.' )
if [ "$(wc -l !{sample}_fastani.csv | awk '{print $1}' )" -gt 1 ]
then
genus=$(head -n 2 !{sample}_fastani.csv | tail -n 1 | cut -f 3 -d "," | cut -f 1 -d "_" )
species=$(head -n 2 !{sample}_fastani.csv | tail -n 1 | cut -f 3 -d "," | cut -f 2 -d "_" )
accession=$(head -n 2 !{sample}_fastani.csv | tail -n 1 | sed 's/.*_GC/GC/g' | cut -f 1,2 -d '.' )
fi
fi
if [ -z "$genus" ] && [ -f "!{sample}.summary.mash.csv"]
if [ -z "$genus" ] && [ -f "!{sample}.summary.mash.csv" ]
then
genus=$(head -n 2 !{sample}.summary.mash.csv | tail -n 1 | cut -f 7 -d "," | cut -f 1 -d "_" )
species=$(head -n 2 !{sample}.summary.mash.csv | tail -n 1 | cut -f 7 -d "," | cut -f 2 -d "_" )
@@ -311,7 +314,7 @@ process size {
then
echo "Using size from genomes file : $datasets_size" | tee -a $log_file
size=$datasets_size
elif [ -n "$quast_size"] && [ -z "$datasets_size" ] && [ -z "$expected_size" ]
elif [ -n "$quast_size" ] && [ -z "$datasets_size" ] && [ -z "$expected_size" ]
then
echo "Using size from quast : $quast_size" | tee -a $log_file
size=$quast_size
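The new wc -l guard in the size process matters because head -n 2 | tail -n 1 on a header-only CSV re-emits the header line, silently filling genus, species, and accession with column names. The same guard in a hedged Python sketch; the function name and column layout are assumptions for illustration:

import csv

def top_fastani_hit(path):
    """Return (genus, species) from the first data row, or None when the
    CSV contains only its header -- the case the wc -l check guards against."""
    with open(path) as handle:
        rows = list(csv.reader(handle))
    if len(rows) < 2:  # header only: nothing to parse
        return None
    reference = rows[1][2]  # third column, e.g. "Salmonella_enterica_GCF_000006945.2"
    genus, species = reference.split("_")[:2]
    return genus, species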
2 changes: 1 addition & 1 deletion nextflow.config
@@ -3,7 +3,7 @@ manifest {
author = 'Erin Young'
homePage = 'https://github.com/UPHL-BioNGS/Grandeur'
mainScript = 'grandeur.nf'
version = '3.2.20230523'
version = '3.2.20230613'
defaultBranch = 'main'
description = 'Grandeur is short-read de novo assembly pipeline with serotyping.'
}
5 changes: 3 additions & 2 deletions subworkflows/phylogenetic_analysis.nf
@@ -33,10 +33,11 @@ workflow phylogenetic_analysis {

ch_contig_organism = ch_contigs.join( ch_organism, by: 0, remainder: true)

for_prokka = for_prokka.mix(ch_contig_organism)
for_prokka = for_prokka.mix(ch_contig_organism).unique()

prokka( for_prokka )
roary(prokka.out.gffs.concat(ch_gff).collect())

roary(prokka.out.gffs.concat(ch_gff).unique().collect())

roary.out.core_gene_alignment
.filter ({ it[1] as int >= 4 })
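The added unique() calls deduplicate channel contents before annotation: mixing ch_contig_organism into for_prokka can emit the same sample twice, and roary receives one collected list of GFF files in which duplicates would at best waste compute. The idea in a small Python sketch, with tuples invented for illustration:

# Order-preserving deduplication, analogous to Nextflow's unique() operator.
items = [("sampleA", "a.gff"), ("sampleB", "b.gff"), ("sampleA", "a.gff")]
unique_items = list(dict.fromkeys(items))
assert unique_items == [("sampleA", "a.gff"), ("sampleB", "b.gff")]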

