Merge pull request #71 from UPHL-BioNGS/update-20230523
Update 20230613
erinyoung authored Jun 12, 2023
2 parents 24782d5 + 4e1d4d5 commit c6ce94b
Showing 8 changed files with 68 additions and 37 deletions.
16 changes: 8 additions & 8 deletions assets/genomes.txt
@@ -13,7 +13,7 @@ GCF_010918895.1 Acinetobacter_schindleri
GCF_016064815.1 Acinetobacter_seifertii
GCF_018409485.1 Acinetobacter_variabilis
GCF_000967305.2 Alcaligenes_faecalis
GCF_008011635.1 Campylobacter_coli
GCA_008011635.1 Campylobacter_coli
GCF_009730395.1 Campylobacter_coli
GCF_000015085.1 Campylobacter_fetus
GCF_000495505.1 Campylobacter_fetus
@@ -22,13 +22,13 @@ GCF_011600945.2 Campylobacter_fetus
GCF_001643955.1 Campylobacter_hyointestinalis
GCF_013372165.1 Campylobacter_hyointestinalis_subsp._lawsonii
GCF_000017485.1 Campylobacter_jejuni
GCF_008011525.1 Campylobacter_jejuni
GCA_008011525.1 Campylobacter_jejuni
GCF_000009085.1 Campylobacter_jejuni_subsp._jejuni_NCTC_11168_=_ATCC_700819
GCF_000816225.1 Campylobacter_lari
GCF_000019205.1 Campylobacter_lari_RM2100
GCF_014931075.1 Campylobacter_peloridis
GCF_000816305.1 Campylobacter_subantarcticus_LMG_24377
GCF_008011615.1 Campylobacter_upsaliensis
GCA_008011615.1 Campylobacter_upsaliensis
GCF_916098265.1 Campylobacter_upsaliensis
GCF_001558935.2 Citrobacter_amalonaticus
GCF_009648935.1 Citrobacter_braakii
@@ -111,7 +111,7 @@ GCF_014526935.1 Listeria_monocytogenes
GCF_000196035.1 Listeria_monocytogenes_EGD-e
GCF_017363605.1 Listeria_seeligeri
GCF_002489005.1 Listeria_welshimeri
GCF_007681265.1 Mixta_calida
GCA_007681265.1 Mixta_calida
GCF_902387845.1 Morganella_morganii
GCF_013030075.1 Neisseria_gonorrhoeae
GCF_000233595.1 Pantoea_ananatis_PA13
@@ -136,9 +136,9 @@ GCF_000412675.1 Pseudomonas_putida_NBRC_14164
GCF_902374465.1 Ralstonia_pickettii
GCF_901421005.1 Raoultella_ornithinolytica
GCF_022637595.1 Raoultella_planticola
GCF_013588055.1 Salmonella_bongori
GCA_013588055.1 Salmonella_bongori
GCF_000439255.1 Salmonella_bongori_N268-08
GCF_011388235.1 Salmonella_enterica
GCA_011388235.1 Salmonella_enterica
GCF_000006945.2 Salmonella_enterica_subsp._enterica_serovar_Typhimurium_str._LT2
GCF_003516165.1 Serratia_marcescens
GCF_004768745.1 Serratia_nematodiphila
@@ -155,9 +155,9 @@ GCF_000221985.1 Streptococcus_pseudopneumoniae
GCF_900475035.1 Streptococcus_pyogenes
GCF_009665435.1 Vibrio_alginolyticus
GCA_023650915.1 Vibrio_alginolyticus
GCF_009665515.2 Vibrio_cholerae
GCA_009665515.2 Vibrio_cholerae
GCF_008369605.1 Vibrio_cholerae
GCF_009665415.1 Vibrio_cidicii
GCA_009665415.1 Vibrio_cidicii
GCF_009665395.1 Vibrio_cincinnatiensis
GCF_009665355.1 Vibrio_fluvialis
GCF_009665335.1 Vibrio_furnissii
28 changes: 22 additions & 6 deletions bin/summary.py
@@ -48,6 +48,22 @@
tsv_files = [ quast, seqsero2, kleborate, mlst, emmtyper , pbptyper]
top_hit = [ fastani ]

##########################################
# exiting if no input files #
##########################################

if not exists(names) :
print("No analyses to report on for this run!")
with open(extended + '.tsv', 'w') as fp:
pass
with open(extended + '.txt', 'w') as fp:
pass
with open(final + '.tsv', 'w') as fp:
pass
with open(final + '.txt', 'w') as fp:
pass
quit()

##########################################
# creating the summary dataframe #
##########################################
@@ -139,9 +155,9 @@
print("Adding results for " + file)
analysis = "fastqc"
new_df = pd.read_csv(file, dtype = str, index_col= False)
R1_df = new_df.drop_duplicates(subset='sample', keep="first")
R1_df = new_df.drop_duplicates(subset='sample', keep="first").copy()
R1_df = R1_df.add_prefix('R1_')
R2_df = new_df.drop_duplicates(subset='sample', keep="last")
R2_df = new_df.drop_duplicates(subset='sample', keep="last").copy()
R2_df = R2_df.add_prefix('R2_')
new_df = pd.merge(R1_df, R2_df, left_on="R1_sample", right_on="R2_sample", how = 'left')
new_df['sample'] = new_df['R1_sample']
@@ -202,9 +218,9 @@
new_df = new_df.sort_values(by='Identity', ascending=False)
new_df = new_df.drop_duplicates(subset=['sample', 'Database'], keep="first")
new_df = new_df.add_prefix(analysis + '_')
H_df = new_df[new_df[analysis + '_Database' ] == 'H_type']
H_df = new_df[new_df[analysis + '_Database' ] == 'H_type'].copy()
H_df = H_df.add_suffix('_H')
O_df = new_df[new_df[analysis + '_Database' ] == 'O_type']
O_df = new_df[new_df[analysis + '_Database' ] == 'O_type'].copy()
O_df = O_df.add_suffix('_O')
summary_df = pd.merge(summary_df, O_df, left_on="sample", right_on=analysis + "_sample_O", how = 'left')
summary_df.drop(analysis + "_sample_O", axis=1, inplace=True)
@@ -255,8 +271,8 @@
print("Adding analysis parsed via multiqc in " + file)
new_df = pd.read_table(file, dtype = str, index_col= False)
if "FastQC_mqc-generalstats-fastqc-avg_sequence_length" in new_df.columns :
tmp_df = new_df[["Sample","FastQC_mqc-generalstats-fastqc-avg_sequence_length"]]
tmp_df["fastqc_avg_length"]= tmp_df["FastQC_mqc-generalstats-fastqc-avg_sequence_length"]
tmp_df = new_df[["Sample","FastQC_mqc-generalstats-fastqc-avg_sequence_length"]].copy()
tmp_df["fastqc_avg_length"] = tmp_df["FastQC_mqc-generalstats-fastqc-avg_sequence_length"]
tmp_df.drop("FastQC_mqc-generalstats-fastqc-avg_sequence_length", axis=1, inplace=True)

summary_df["possible_fastqc_name"] = summary_df['file'].str.split(" ").str[0].str.split(".").str[0]
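Why the .copy() calls added throughout summary.py: drop_duplicates() and column subsetting can return a frame that pandas still treats as derived from the original, so assigning a new column to it raises SettingWithCopyWarning. A minimal standalone sketch of the pattern, with sample data invented for illustration:

import pandas as pd

df = pd.DataFrame({"sample": ["s1", "s1", "s2"], "avg_length": ["150", "151", "148"]})

# Without .copy(), pandas may flag the deduplicated frame as a view of df,
# and the column assignment below would raise SettingWithCopyWarning.
r1_df = df.drop_duplicates(subset="sample", keep="first").copy()
r1_df["fastqc_avg_length"] = r1_df["avg_length"]  # safe: r1_df is an independent copy
print(r1_df)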
2 changes: 1 addition & 1 deletion modules/blast.nf
@@ -2,7 +2,7 @@ process blastn {
tag "${sample}"
label "medcpus"
publishDir params.outdir, mode: 'copy'
container 'staphb/blast:2.13.0'
container 'staphb/blast:2.14.0'
maxForks 10
//#UPHLICA errorStrategy { task.attempt < 2 ? 'retry' : 'ignore'}
//#UPHLICA pod annotation: 'scheduler.illumina.com/presetSize' , value: 'himem-medium'
35 changes: 23 additions & 12 deletions modules/datasets.nf
@@ -1,7 +1,7 @@
process datasets_summary {
tag "${taxon}"
publishDir params.outdir, mode: 'copy'
container 'staphb/ncbi-datasets:14.20.0'
container 'staphb/ncbi-datasets:15.2.0'
maxForks 10
//#UPHLICA errorStrategy { task.attempt < 2 ? 'retry' : 'ignore'}
//#UPHLICA pod annotation: 'scheduler.illumina.com/presetSize', value: 'standard-medium'
@@ -30,20 +30,29 @@ process datasets_summary
taxon=$(echo !{taxon} | tr "_" " ")
datasets summary genome taxon "$taxon" --reference --limit !{params.datasets_max_genomes} --as-json-lines | \
datasets summary genome taxon "$taxon" --reference --limit !{params.datasets_max_genomes} --as-json-lines | \
dataformat tsv genome --fields accession,assminfo-refseq-category,assminfo-level,organism-name,assmstats-total-ungapped-len | \
grep -v Homo | \
tr '\\t' ',' \
> datasets/!{taxon}_genomes.csv
datasets summary genome taxon "$taxon" --limit !{params.datasets_max_genomes} --as-json-lines | \
dataformat tsv genome --fields accession,assminfo-refseq-category,assminfo-level,organism-name,assmstats-total-ungapped-len | \
grep -v Homo | \
grep -v "Assembly Accession" | \
tr '\\t' ',' \
>> datasets/!{taxon}_genomes.csv
'''
}

// It is faster if datasets can download the entire list at a time, but there is a timeout for downloading that is about 20 minutes.
// The '||' is to allow each genome to be downloaded on its own, which is longer overall but each genome should be less than 20 minutes.
process datasets_download {
tag "Downloading Genomes"
// because there's no way to specify threads
label "medcpus"
publishDir params.outdir, mode: 'copy'
container 'staphb/ncbi-datasets:14.20.0'
publishDir = [ path: "${params.outdir}", mode: 'copy', pattern: "{logs/*/*log,datasets/fastani_refs.tar.gz}" ]
container 'staphb/ncbi-datasets:15.2.0'
maxForks 10
//#UPHLICA errorStrategy { task.attempt < 2 ? 'retry' : 'ignore'}
//#UPHLICA pod annotation: 'scheduler.illumina.com/presetSize', value: 'standard-medium'
@@ -77,18 +86,20 @@ process datasets_download
cat all_runs.txt this_run.txt | sort | uniq > id_list.txt
datasets download genome accession --inputfile id_list.txt --filename ncbi_dataset.zip
( datasets download genome accession --inputfile id_list.txt --filename ncbi_dataset.zip ; unzip -o ncbi_dataset.zip ) || \
( while read line ; do echo "Downloading $line" ; datasets download genome accession $line --filename dataset.zip ; unzip -o dataset.zip ; done < id_list.txt )
fastas=$(ls ncbi_dataset/data/*/*.fna )
unzip -o ncbi_dataset.zip
fastas=$(ls ncbi_dataset/data/*/*.fna)
for fasta in ${fastas[@]}
do
accession=$(echo $fasta | cut -f 3 -d / )
organism=$(head -n 1 $fasta | awk '{print $2 "_" $3 }' )
echo "Copying $fasta to genomes"
accession=$(echo $fasta | cut -f 4 -d / | cut -f 1,2 -d _ )
organism=$(head -n 1 $fasta | awk '{print $2 "_" $3 }' | sed 's/,//g' )
cat $fasta | sed 's/ /_/g' | sed 's/,//g' > genomes/${organism}_${accession}.fna
done
done
rm -rf genomes/*:_*
tar -czvf datasets/fastani_refs.tar.gz genomes/
'''
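The comment above datasets_download describes the retry strategy: one bulk download is fastest, but the service times out around 20 minutes, so on failure each accession is fetched on its own. The same pattern in a rough Python sketch; the subprocess command lines are illustrative stand-ins for the process's datasets calls:

import subprocess

def download_genomes(accessions):
    """Try one bulk fetch; on any failure, fall back to per-accession fetches."""
    try:
        # Fast path: a single request for the whole list.
        subprocess.run(
            ["datasets", "download", "genome", "accession", *accessions,
             "--filename", "ncbi_dataset.zip"],
            check=True, timeout=20 * 60)  # mirror the ~20-minute service timeout
    except (subprocess.CalledProcessError, subprocess.TimeoutExpired):
        # Slow path: each genome alone should finish well under the timeout.
        for acc in accessions:
            subprocess.run(
                ["datasets", "download", "genome", "accession", acc,
                 "--filename", acc + ".zip"],
                check=True, timeout=20 * 60)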
2 changes: 1 addition & 1 deletion modules/fastp.nf
@@ -1,7 +1,7 @@
process fastp {
tag "${sample}"
publishDir params.outdir, mode: 'copy'
container 'staphb/fastp:0.23.2'
container 'staphb/fastp:0.23.4'
maxForks 10
//#UPHLICA errorStrategy { task.attempt < 2 ? 'retry' : 'ignore'}
//#UPHLICA pod annotation: 'scheduler.illumina.com/presetSize', value: 'standard-medium'
15 changes: 9 additions & 6 deletions modules/grandeur.nf
@@ -39,7 +39,7 @@ process species {
if [ -f "mash_summary.csv" ]
then
cut -f 7 -d , mash_summary.csv >> species.txt
cut -f 7 -d , mash_summary.csv | tail -n+2 >> species.txt
fi
grep -v no-hit species.txt | grep -v undef | grep -v name | sort | uniq > datasets/species_list.txt
@@ -223,12 +223,15 @@ process size {
if [ -f "!{sample}_fastani.csv" ]
then
genus=$(head -n 2 !{sample}_fastani.csv | tail -n 1 | cut -f 3 -d "," | cut -f 1 -d "_" )
species=$(head -n 2 !{sample}_fastani.csv | tail -n 1 | cut -f 3 -d "," | cut -f 2 -d "_" )
accession=$(head -n 2 !{sample}_fastani.csv | tail -n 1 | sed 's/.*_GC/GC/g' | cut -f 1,2 -d '.' )
if [ "$(wc -l !{sample}_fastani.csv | awk '{print $1}' )" -gt 1 ]
then
genus=$(head -n 2 !{sample}_fastani.csv | tail -n 1 | cut -f 3 -d "," | cut -f 1 -d "_" )
species=$(head -n 2 !{sample}_fastani.csv | tail -n 1 | cut -f 3 -d "," | cut -f 2 -d "_" )
accession=$(head -n 2 !{sample}_fastani.csv | tail -n 1 | sed 's/.*_GC/GC/g' | cut -f 1,2 -d '.' )
fi
fi
if [ -z "$genus" ] && [ -f "!{sample}.summary.mash.csv"]
if [ -z "$genus" ] && [ -f "!{sample}.summary.mash.csv" ]
then
genus=$(head -n 2 !{sample}.summary.mash.csv | tail -n 1 | cut -f 7 -d "," | cut -f 1 -d "_" )
species=$(head -n 2 !{sample}.summary.mash.csv | tail -n 1 | cut -f 7 -d "," | cut -f 2 -d "_" )
@@ -311,7 +314,7 @@ process size {
then
echo "Using size from genomes file : $datasets_size" | tee -a $log_file
size=$datasets_size
elif [ -n "$quast_size"] && [ -z "$datasets_size" ] && [ -z "$expected_size" ]
elif [ -n "$quast_size" ] && [ -z "$datasets_size" ] && [ -z "$expected_size" ]
then
echo "Using size from quast : $quast_size" | tee -a $log_file
size=$quast_size
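The new wc -l guard in the size process matters because head -n 2 | tail -n 1 on a header-only CSV re-emits the header line, silently filling genus, species, and accession with column names. The same guard in a hedged Python sketch; the function name and column layout are assumptions for illustration:

import csv

def top_fastani_hit(path):
    """Return (genus, species) from the first data row, or None when the
    CSV contains only its header -- the case the wc -l check guards against."""
    with open(path) as handle:
        rows = list(csv.reader(handle))
    if len(rows) < 2:  # header only: nothing to parse
        return None
    reference = rows[1][2]  # third column, e.g. "Salmonella_enterica_GCF_000006945.2"
    genus, species = reference.split("_")[:2]
    return genus, species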
2 changes: 1 addition & 1 deletion nextflow.config
@@ -3,7 +3,7 @@ manifest {
author = 'Erin Young'
homePage = 'https://github.com/UPHL-BioNGS/Grandeur'
mainScript = 'grandeur.nf'
version = '3.2.20230523'
version = '3.2.20230613'
defaultBranch = 'main'
description = 'Grandeur is short-read de novo assembly pipeline with serotyping.'
}
5 changes: 3 additions & 2 deletions subworkflows/phylogenetic_analysis.nf
@@ -33,10 +33,11 @@ workflow phylogenetic_analysis {

ch_contig_organism = ch_contigs.join( ch_organism, by: 0, remainder: true)

for_prokka = for_prokka.mix(ch_contig_organism)
for_prokka = for_prokka.mix(ch_contig_organism).unique()

prokka( for_prokka )
roary(prokka.out.gffs.concat(ch_gff).collect())

roary(prokka.out.gffs.concat(ch_gff).unique().collect())

roary.out.core_gene_alignment
.filter ({ it[1] as int >= 4 })
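The added unique() calls deduplicate channel contents before annotation: mixing ch_contig_organism into for_prokka can emit the same sample twice, and roary receives one collected list of GFF files in which duplicates would at best waste compute. The idea in a small Python sketch, with tuples invented for illustration:

# Order-preserving deduplication, analogous to Nextflow's unique() operator.
items = [("sampleA", "a.gff"), ("sampleB", "b.gff"), ("sampleA", "a.gff")]
unique_items = list(dict.fromkeys(items))
assert unique_items == [("sampleA", "a.gff"), ("sampleB", "b.gff")]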

