From 358941a58b82a4a5f894aa69fa54047242c038aa Mon Sep 17 00:00:00 2001 From: Sage Wright <40403716+sage-wright@users.noreply.github.com> Date: Wed, 22 Nov 2023 15:44:03 -0500 Subject: [PATCH] TheiaCoV_FASTA_Batch: TheiaCoV_FASTA, for many samples at once (#238) * skellieton * continuation of skeleton workflow * very alpha iteration of the theiacov fasta wrangling and upload task * progress * newline issue * fix inputs * follow methods through * solve newline issue * to-dos * to-dos * fix organism logic * fix syntax * revert to wdl 1.0 * split pango and nextclade files * add to dockstore * enable table upload * fix old 1.1 syntax * not sure what went wrong with disk size before * disk size change for real this time * add slash to path * remove excess outputs * add check for nextclade json and pango lineage report existance before splitting and upload to google bucket * fix dastardly typo --------- Co-authored-by: cimendes --- .dockstore.yml | 5 + tasks/utilities/task_theiacov_fasta_batch.wdl | 225 ++++++++++++++++++ .../theiacov/wf_theiacov_fasta_batch.wdl | 88 +++++++ 3 files changed, 318 insertions(+) create mode 100644 tasks/utilities/task_theiacov_fasta_batch.wdl create mode 100644 workflows/theiacov/wf_theiacov_fasta_batch.wdl diff --git a/.dockstore.yml b/.dockstore.yml index 86bb712e5..78a9464da 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -50,6 +50,11 @@ workflows: primaryDescriptorPath: /workflows/theiacov/wf_theiacov_fasta.wdl testParameterFiles: - empty.json + - name: TheiaCoV_FASTA_Batch_PHB + subclass: WDL + primaryDescriptorPath: /workflows/theiacov/wf_theiacov_fasta_batch.wdl + testParameterFiles: + - empty.json - name: Mercury_Prep_N_Batch_PHB subclass: WDL primaryDescriptorPath: /workflows/submission/wf_mercury_prep_n_batch.wdl diff --git a/tasks/utilities/task_theiacov_fasta_batch.wdl b/tasks/utilities/task_theiacov_fasta_batch.wdl new file mode 100644 index 000000000..3ec438d5f --- /dev/null +++ 
b/tasks/utilities/task_theiacov_fasta_batch.wdl
@@ -0,0 +1,225 @@
+version 1.0
+
+# Splits the batch-level Nextclade JSON and Pangolin lineage report into one file per
+# sample, copies those files to the workspace bucket, and declares
+# terra-table-to-upload.tsv as the task output.
+#
+# nextclade_json and pango_lineage_report are optional; each split/upload section is
+# skipped when its file is not provided.
+task sm_theiacov_fasta_wrangling { # the sm stands for supermassive
+  input {
+    String table_name
+    String workspace_name
+    String project_name
+    String bucket_name
+
+    Array[Pair[String, File]] sample_to_fasta
+    String organism = "sars-cov-2"
+
+    File? nextclade_tsv
+    File? nextclade_json
+    String? nextclade_docker
+    String? nextclade_version
+    String? nextclade_ds_tag
+
+    File? pango_lineage_report
+    String? pangolin_docker
+
+    String seq_platform
+    String assembly_method
+    String theiacov_fasta_analysis_date
+    String theiacov_fasta_version
+
+    Int disk_size = 100
+  }
+  command <<<
+    # convert the map into a JSON file for use in the python block
+    # example map: {ERR4439752.test: /mnt/miniwdl_task_container/work/_miniwdl_inputs/0/ERR4439752.ivar.consensus.fasta}
+    cp -v ~{write_json(sample_to_fasta)} sample_to_fasta.json
+
+    # check if nextclade json file exists
+    # The path is quoted on purpose: an undefined optional interpolates to nothing, and an
+    # unquoted `[ -f ]` is a one-argument test that always succeeds.
+    if [ -f "~{nextclade_json}" ]; then
+      # this line splits into individual json files (one result per outN.json)
+      # the redirection target is parenthesised because an unparenthesised concatenation
+      # after `>` is not portable across awk implementations
+      jq -c '.results = (.results[] | [.]) ' "~{nextclade_json}" | awk '{ print > ("out" NR ".json") }'
+
+      # rename each individual json file with the sample name
+      for file in out*.json; do
+        samplename=$(jq -r '.results[].seqName' "${file}")
+        cp -v "${file}" "${samplename}.nextclade.json"
+      done
+
+      # transfer all the json files to the bucket for access in Terra -- not sure if this works on Terra
+      gcloud storage cp -v *.nextclade.json gs://~{bucket_name}/theiacov_fasta_batch-~{theiacov_fasta_analysis_date}/nextclade_json/
+    fi
+
+    # check if pangolin lineage report file exists (quoted for the same reason as above)
+    if [ -f "~{pango_lineage_report}" ]; then
+      # split the pangolin lineage report into individual csv files named by the taxon
+      # (first column); every per-taxon file gets the header row followed by its data row
+      awk 'BEGIN {FS=","} NR==1 {header=$0; next} {out = $1 ".pangolin_report.csv"; print header > out; print $0 >> out}' "~{pango_lineage_report}"
+
+      # transfer all pangolin lineage report files to the bucket for access in Terra
+      gcloud storage cp -v *pangolin_report.csv gs://~{bucket_name}/theiacov_fasta_batch-~{theiacov_fasta_analysis_date}/pangolin_report/
+    fi
+
+    echo "DEBUG: Now entering Python block to perform parsing of data for populating sample-level table"
+
+    # NOTE(review): the Python heredoc payload and the end of the command section are
+    # missing from this copy of the patch (only the fragment below survives); restore
+    # them from the upstream commit before applying.
+    python3 <>>
+  output {
+    File terra_table = "terra-table-to-upload.tsv"
+  }
+  runtime {
+    docker: "us-docker.pkg.dev/general-theiagen/theiagen/terra-tools:2023-08-28-v4"
+    memory: "8 GB"
+    cpu: 4
+    disks: "local-disk " + disk_size + " SSD"
+    disk: disk_size + " GB"
+    preemptible: 0
+  }
+}
\ No newline at end of file
diff --git a/workflows/theiacov/wf_theiacov_fasta_batch.wdl b/workflows/theiacov/wf_theiacov_fasta_batch.wdl
new file mode 100644
index 000000000..5934c93cc
--- /dev/null
+++ b/workflows/theiacov/wf_theiacov_fasta_batch.wdl
@@ -0,0 +1,88 @@
+version 1.0
+
+import "../../tasks/species_typing/task_pangolin.wdl" as pangolin_task
+import "../../tasks/task_versioning.wdl" as versioning
+import "../../tasks/taxon_id/task_nextclade.wdl" as nextclade_task
+import "../../tasks/utilities/task_file_handling.wdl" as concatenate
+import "../../tasks/utilities/task_theiacov_fasta_batch.wdl" as theiacov_fasta_wrangling_task
+
+workflow theiacov_fasta_batch { # concatenates all assemblies, types them once, then wrangles per-sample results
+  meta {
+    description: "TheiaCoV_FASTA for multiple samples"
+  }
+  input {
+    Array[String] samplenames
+    Array[File] assembly_fastas
+    String organism = "sars-cov-2"
+    # sequencing values (forwarded to the wrangling task as seq_platform and assembly_method)
+    String seq_method
+    String input_assembly_method
+    # nextclade inputs (the dataset name falls back to organism when not provided)
+    String nextclade_dataset_reference = "MN908947"
+    String nextclade_dataset_tag = "2023-09-21T12:00:00Z"
+    String? nextclade_dataset_name
+    # workspace values (forwarded unchanged to the wrangling task)
+    String table_name
+    String workspace_name
+    String project_name
+    String bucket_name
+  }
+  call versioning.version_capture{
+    input:
+  }
+  call concatenate.cat_files {
+    input:
+      files_to_cat = assembly_fastas,
+      concatenated_file_name = "concatenated_assemblies.fasta"
+  }
+  if (organism == "sars-cov-2") {
+    # sars-cov-2 specific tasks: pangolin runs once on the concatenated assemblies
+    call pangolin_task.pangolin4 {
+      input:
+        samplename = "concatenated_assemblies",
+        fasta = cat_files.concatenated_files
+    }
+  }
+  if (organism == "MPXV" || organism == "sars-cov-2"){
+    # tasks specific to either MPXV or sars-cov-2: nextclade runs once on the concatenated assemblies
+    call nextclade_task.nextclade {
+      input:
+        genome_fasta = cat_files.concatenated_files,
+        dataset_name = select_first([nextclade_dataset_name, organism]),
+        dataset_reference = nextclade_dataset_reference,
+        dataset_tag = nextclade_dataset_tag
+    }
+  }
+  call theiacov_fasta_wrangling_task.sm_theiacov_fasta_wrangling {
+    input:
+      table_name = table_name,
+      workspace_name = workspace_name,
+      project_name = project_name,
+      bucket_name = bucket_name,
+      sample_to_fasta = zip(samplenames, assembly_fastas), # pairs by position; zip requires equal-length arrays
+      organism = organism,
+      nextclade_tsv = nextclade.nextclade_tsv, # nextclade.* values are undefined when the nextclade conditional is skipped
+      nextclade_docker = nextclade.nextclade_docker,
+      nextclade_version = nextclade.nextclade_version,
+      nextclade_ds_tag = nextclade_dataset_tag,
+      nextclade_json = nextclade.nextclade_json,
+      pango_lineage_report = pangolin4.pango_lineage_report, # pangolin4.* values are undefined unless organism is sars-cov-2
+      pangolin_docker = pangolin4.pangolin_docker,
+      seq_platform = seq_method,
+      assembly_method = input_assembly_method,
+      theiacov_fasta_analysis_date = version_capture.date,
+      theiacov_fasta_version = version_capture.phb_version
+  }
+  output {
+    # Version Capture
+    String theiacov_fasta_batch_version = version_capture.phb_version
+    String theiacov_fasta_batch_analysis_date = version_capture.date
+    # Pangolin outputs (optional: only produced for sars-cov-2)
+    File? pango_lineage_report = pangolin4.pango_lineage_report
+    # Nextclade outputs (optional: only produced for MPXV or sars-cov-2)
+    File? nextclade_json = nextclade.nextclade_json
+    File? nextclade_tsv = nextclade.nextclade_tsv
+    # Wrangling outputs
+    File datatable = sm_theiacov_fasta_wrangling.terra_table
+  }
+}
\ No newline at end of file