From def7883632f9011d385a34f80130217b9d94d612 Mon Sep 17 00:00:00 2001
From: jrotieno <james.otieno@theiagen.com>
Date: Tue, 25 Jul 2023 08:39:48 +0000
Subject: [PATCH 01/36] created new task set_mpxv_defaults

---
 tasks/utilities/task_augur_utilities.wdl | 38 ++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

diff --git a/tasks/utilities/task_augur_utilities.wdl b/tasks/utilities/task_augur_utilities.wdl
index c305222db..5ab8f68ac 100644
--- a/tasks/utilities/task_augur_utilities.wdl
+++ b/tasks/utilities/task_augur_utilities.wdl
@@ -324,6 +324,44 @@ task set_flu_defaults { # establish flu default values for augur
   }
 }
 
+task set_mpxv_defaults { # establish mpxv default values for augur
+  input {
+    # in the future we will wget from the repo directly
+    File mpxv_lat_longs_tsv = "gs://theiagen-public-files-rp/terra/flu-references/lat_longs.tsv"
+    File mpxv_clades_tsv = "gs://theiagen-public-files-rp/terra/augur-mpox-references/mpox_clades.tsv"
+    File mpxv_reference_fasta = "gs://theiagen-public-files-rp/terra/augur-mpox-references/NC_063383.1.reference.fasta"
+    File mpxv_reference_genbank = "gs://theiagen-public-files-rp/terra/augur-mpox-references/NC_063383.1_reference.gb"
+    File mpxv_auspice_config = "gs://theiagen-public-files-rp/terra/augur-mpox-references/mpox_auspice_config_mpxv.json"
+
+    Int disk_size = 50
+  }
+  command <<<
+    # nothing to do here for now
+    echo "working...very hard"
+
+  >>>
+  output {
+    Int min_num_unambig = 150000
+    File? clades_tsv = mpxv_clades_tsv
+    File lat_longs_tsv = mpxv_lat_longs_tsv
+    File reference_fasta = mpxv_reference_fasta
+    File reference_genbank = mpxv_reference_genbank
+    File auspice_config = mpxv_auspice_config
+    # inherited from flu defaults
+    Float min_date = 2020.0
+    Int pivot_interval = 1
+    Float narrow_bandwidth = 0.1666667
+    Float proportion_wide = 0.0
+  }
+  runtime {
+    docker: "us-docker.pkg.dev/general-theiagen/biocontainers/augur:22.0.2--pyhdfd78af_0"
+    memory: "1 GB"
+    cpu: 1
+    disks: "local-disk " + disk_size + " HDD"
+    disk: disk_size + " GB"
+  }
+}
+
 task prep_augur_metadata {
   input {
     File assembly

From 1b70ce9f6512aea79c7114091718b3a9b2afed26 Mon Sep 17 00:00:00 2001
From: jrotieno <james.otieno@theiagen.com>
Date: Tue, 25 Jul 2023 08:50:30 +0000
Subject: [PATCH 02/36] added mpxv inputs to augur workflow

---
 tasks/utilities/task_augur_utilities.wdl |  2 +-
 workflows/phylogenetics/wf_augur.wdl     | 21 +++++++++++++--------
 2 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/tasks/utilities/task_augur_utilities.wdl b/tasks/utilities/task_augur_utilities.wdl
index 5ab8f68ac..4f05553f4 100644
--- a/tasks/utilities/task_augur_utilities.wdl
+++ b/tasks/utilities/task_augur_utilities.wdl
@@ -342,7 +342,7 @@ task set_mpxv_defaults { # establish mpxv default values for augur
   >>>
   output {
     Int min_num_unambig = 150000
-    File? clades_tsv = mpxv_clades_tsv
+    File clades_tsv = mpxv_clades_tsv
     File lat_longs_tsv = mpxv_lat_longs_tsv
     File reference_fasta = mpxv_reference_fasta
     File reference_genbank = mpxv_reference_genbank
diff --git a/workflows/phylogenetics/wf_augur.wdl b/workflows/phylogenetics/wf_augur.wdl
index 8ed6948d9..0d3dd7c51 100644
--- a/workflows/phylogenetics/wf_augur.wdl
+++ b/workflows/phylogenetics/wf_augur.wdl
@@ -52,15 +52,20 @@ workflow augur {
         flu_subtype = flu_subtype
     }
   }
+  if (organism == "MPXV") {
+    call augur_utils.set_mpxv_defaults as mpxv_defaults { # establish default parameters for mpxv
+      input:
+    }
+  }
   call augur_utils.filter_sequences_by_length { # remove any sequences that do not meet the quality threshold
     input:
       sequences_fasta = cat_files.concatenated_files,
-      min_non_N = select_first([min_num_unambig, sc2_defaults.min_num_unambig, flu_defaults.min_num_unambig])
+      min_non_N = select_first([min_num_unambig, sc2_defaults.min_num_unambig, flu_defaults.min_num_unambig, mpxv_defaults.min_num_unambig]),
   }
   call align_task.augur_align { # perform mafft alignment on the sequences
     input: 
       assembly_fasta = filter_sequences_by_length.filtered_fasta,
-      reference_fasta = select_first([reference_fasta, sc2_defaults.reference_fasta, flu_defaults.reference_fasta])
+      reference_fasta = select_first([reference_fasta, sc2_defaults.reference_fasta, flu_defaults.reference_fasta, mpxv_defaults.reference_fasta]),
   }
   call augur_utils.tsv_join { # merge the metadata files
     input:
@@ -95,19 +100,19 @@ workflow augur {
       input:
         refined_tree = augur_refine.refined_tree,
         ancestral_nt_muts_json = augur_ancestral.ancestral_nt_muts_json,
-        reference_genbank = select_first([reference_genbank, sc2_defaults.reference_genbank, flu_defaults.reference_genbank]),
+        reference_genbank = select_first([reference_genbank, sc2_defaults.reference_genbank, flu_defaults.reference_genbank, mpxv_defaults.reference_genbank]),
         build_name = build_name
     }
     if (flu_segment == "HA") { # we only have clade information for HA segments (but SC2 defaults will be selected first)
-      if (defined(clades_tsv) || defined(sc2_defaults.clades_tsv) || defined(flu_defaults.clades_tsv)) { # one of these must be present
+      if (defined(clades_tsv) || defined(sc2_defaults.clades_tsv) || defined(flu_defaults.clades_tsv) || defined(mpxv_defaults.clades_tsv) ) { # one of these must be present
         call clades_task.augur_clades { # assign clades to nodes based on amino-acid or nucleotide signatures
           input: 
             refined_tree = augur_refine.refined_tree,
             ancestral_nt_muts_json = augur_ancestral.ancestral_nt_muts_json,
             translated_aa_muts_json = augur_translate.translated_aa_muts_json,
-            reference_fasta = select_first([reference_fasta, sc2_defaults.reference_fasta, flu_defaults.reference_fasta]),
+            reference_fasta = select_first([reference_fasta, sc2_defaults.reference_fasta, flu_defaults.reference_fasta, mpxv_defaults.reference_fasta]),
             build_name = build_name,
-            clades_tsv = select_first([clades_tsv, sc2_defaults.clades_tsv, flu_defaults.clades_tsv])
+            clades_tsv = select_first([clades_tsv, sc2_defaults.clades_tsv, flu_defaults.clades_tsv, mpxv_defaults.clades_tsv])
         }
       }
     }
@@ -121,8 +126,8 @@ workflow augur {
                             augur_translate.translated_aa_muts_json,
                             augur_clades.clade_assignments_json]),
         build_name = build_name,
-        lat_longs_tsv = select_first([sc2_defaults.lat_longs_tsv, flu_defaults.lat_longs_tsv, lat_longs_tsv]),
-        auspice_config = select_first([sc2_defaults.auspice_config, flu_defaults.auspice_config, auspice_config])
+        lat_longs_tsv = select_first([sc2_defaults.lat_longs_tsv, flu_defaults.lat_longs_tsv, mpxv_defaults.lat_longs_tsv, lat_longs_tsv]),
+        auspice_config = select_first([sc2_defaults.auspice_config, flu_defaults.auspice_config, mpxv_defaults.auspice_config, auspice_config])
     }
   }
   call snp_dists_task.snp_dists { # create a snp matrix from the alignment

From 2dc86e6ecd78a2c4c957f1a43f2dbd1fe077a9bc Mon Sep 17 00:00:00 2001
From: jrotieno <james.otieno@theiagen.com>
Date: Tue, 25 Jul 2023 11:23:30 +0000
Subject: [PATCH 03/36] updated reference fasta to gb file

---
 tasks/utilities/task_augur_utilities.wdl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tasks/utilities/task_augur_utilities.wdl b/tasks/utilities/task_augur_utilities.wdl
index 4f05553f4..71a0be0b0 100644
--- a/tasks/utilities/task_augur_utilities.wdl
+++ b/tasks/utilities/task_augur_utilities.wdl
@@ -329,7 +329,7 @@ task set_mpxv_defaults { # establish mpxv default values for augur
     # in the future we will wget from the repo directly
     File mpxv_lat_longs_tsv = "gs://theiagen-public-files-rp/terra/flu-references/lat_longs.tsv"
     File mpxv_clades_tsv = "gs://theiagen-public-files-rp/terra/augur-mpox-references/mpox_clades.tsv"
-    File mpxv_reference_fasta = "gs://theiagen-public-files-rp/terra/augur-mpox-references/NC_063383.1.reference.fasta"
+    File mpxv_reference_fasta = "gs://theiagen-public-files-rp/terra/augur-mpox-references/NC_063383.1_reference.gb"
     File mpxv_reference_genbank = "gs://theiagen-public-files-rp/terra/augur-mpox-references/NC_063383.1_reference.gb"
     File mpxv_auspice_config = "gs://theiagen-public-files-rp/terra/augur-mpox-references/mpox_auspice_config_mpxv.json"
 

From fce14c8f7342d8a6854c8a0d6ba624257156e897 Mon Sep 17 00:00:00 2001
From: jrotieno <james.otieno@theiagen.com>
Date: Tue, 25 Jul 2023 11:52:43 +0000
Subject: [PATCH 04/36] new workflow for adding samples to a ref tree

---
 .../wf_nextclade_addToRefTree.wdl             | 50 +++++++++++++++++++
 1 file changed, 50 insertions(+)
 create mode 100644 workflows/phylogenetics/wf_nextclade_addToRefTree.wdl

diff --git a/workflows/phylogenetics/wf_nextclade_addToRefTree.wdl b/workflows/phylogenetics/wf_nextclade_addToRefTree.wdl
new file mode 100644
index 000000000..bc26b277b
--- /dev/null
+++ b/workflows/phylogenetics/wf_nextclade_addToRefTree.wdl
@@ -0,0 +1,50 @@
+version 1.0
+
+import "../../tasks/utilities/task_file_handling.wdl" as file_handling
+import "../../tasks/taxon_id/task_nextclade.wdl" as nextclade
+
+workflow nextclade_addToRefTree {
+    meta {
+      description: "Nextclade workflow that adds samples to a curated JSON tree from Augur."
+    }
+    input {
+      Array[File]+ assembly_fastas
+      String build_name
+      File root_sequence_fasta
+      File reference_tree_json
+      File? qc_config_json
+      File? gene_annotations_gff
+      File? pcr_primers_csv
+      File? virus_properties
+      String docker = "nextstrain/nextclade:2.13.0"
+      String dataset_name = "MPXV"
+      String dataset_reference = "ancestral"
+      String dataset_tag = "2023-01-26T12:00:00Z"
+    }
+    call file_handling.cat_files { # concatenate all of the input fasta files together
+      input:
+        files_to_cat = assembly_fastas,
+        concatenated_file_name = "~{build_name}_concatenated.fasta"
+    }
+    call nextclade.nextclade { # nextclade analysis
+      input:
+        genome_fasta = cat_files.concatenated_files,
+        root_sequence = root_sequence_fasta,
+        auspice_reference_tree_json = reference_tree_json,
+        qc_config_json = qc_config_json,
+        gene_annotations_json = gene_annotations_gff,
+        pcr_primers_csv = pcr_primers_csv,
+        virus_properties = virus_properties,
+        docker = docker,
+        dataset_name = dataset_name,
+        dataset_reference = dataset_reference,
+        dataset_tag
+    }
+    output {
+      String treeUpdate_nextclade_version = select_first([nextclade.nextclade_version, ""])
+      File treeUpdate_nextclade_json = select_first([nextclade.nextclade_json, ""])
+      File treeUpdate_auspice_json = select_first([nextclade.auspice_json, ""])
+      File treeUpdate_nextclade_tsv = select_first([nextclade.nextclade_tsv, ""])
+      String treeUpdate_nextclade_docker = select_first([nextclade.nextclade_docker, ""])
+    }
+}
\ No newline at end of file

From c152d9f93a2eb9706a400b16868d8b04da498fe4 Mon Sep 17 00:00:00 2001
From: jrotieno <james.otieno@theiagen.com>
Date: Tue, 25 Jul 2023 11:56:22 +0000
Subject: [PATCH 05/36] pass dataset_tag explicitly in nextclade call inputs

---
 workflows/phylogenetics/wf_nextclade_addToRefTree.wdl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflows/phylogenetics/wf_nextclade_addToRefTree.wdl b/workflows/phylogenetics/wf_nextclade_addToRefTree.wdl
index bc26b277b..96b991e92 100644
--- a/workflows/phylogenetics/wf_nextclade_addToRefTree.wdl
+++ b/workflows/phylogenetics/wf_nextclade_addToRefTree.wdl
@@ -38,7 +38,7 @@ workflow nextclade_addToRefTree {
         docker = docker,
         dataset_name = dataset_name,
         dataset_reference = dataset_reference,
-        dataset_tag
+        dataset_tag = dataset_tag
     }
     output {
       String treeUpdate_nextclade_version = select_first([nextclade.nextclade_version, ""])

From 9aff5ba6e5eb30a6a05c571d267e12d0bac088cc Mon Sep 17 00:00:00 2001
From: jrotieno <james.otieno@theiagen.com>
Date: Tue, 25 Jul 2023 12:14:04 +0000
Subject: [PATCH 06/36] alias nextclade import; disable gene_annotations_gff input

---
 workflows/phylogenetics/wf_nextclade_addToRefTree.wdl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/workflows/phylogenetics/wf_nextclade_addToRefTree.wdl b/workflows/phylogenetics/wf_nextclade_addToRefTree.wdl
index 96b991e92..12da50433 100644
--- a/workflows/phylogenetics/wf_nextclade_addToRefTree.wdl
+++ b/workflows/phylogenetics/wf_nextclade_addToRefTree.wdl
@@ -1,7 +1,7 @@
 version 1.0
 
 import "../../tasks/utilities/task_file_handling.wdl" as file_handling
-import "../../tasks/taxon_id/task_nextclade.wdl" as nextclade
+import "../../tasks/taxon_id/task_nextclade.wdl" as nextclade_analysis
 
 workflow nextclade_addToRefTree {
     meta {
@@ -11,9 +11,9 @@ workflow nextclade_addToRefTree {
       Array[File]+ assembly_fastas
       String build_name
       File root_sequence_fasta
+      #File? gene_annotations_gff
       File reference_tree_json
       File? qc_config_json
-      File? gene_annotations_gff
       File? pcr_primers_csv
       File? virus_properties
       String docker = "nextstrain/nextclade:2.13.0"
@@ -26,13 +26,13 @@ workflow nextclade_addToRefTree {
         files_to_cat = assembly_fastas,
         concatenated_file_name = "~{build_name}_concatenated.fasta"
     }
-    call nextclade.nextclade { # nextclade analysis
+    call nextclade_analysis.nextclade { # nextclade analysis
       input:
         genome_fasta = cat_files.concatenated_files,
         root_sequence = root_sequence_fasta,
         auspice_reference_tree_json = reference_tree_json,
         qc_config_json = qc_config_json,
-        gene_annotations_json = gene_annotations_gff,
+        #gene_annotations_json = gene_annotations_gff,
         pcr_primers_csv = pcr_primers_csv,
         virus_properties = virus_properties,
         docker = docker,

From e33db476da1b26a035e84784fd51121de98ed452 Mon Sep 17 00:00:00 2001
From: jrotieno <james.otieno@theiagen.com>
Date: Tue, 25 Jul 2023 12:20:09 +0000
Subject: [PATCH 07/36] added Samples_to_Ref_Tree_PHB workflow to .dockstore.yml

---
 .dockstore.yml | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/.dockstore.yml b/.dockstore.yml
index 004bf9955..584123e81 100644
--- a/.dockstore.yml
+++ b/.dockstore.yml
@@ -220,3 +220,8 @@ workflows:
    primaryDescriptorPath: /workflows/utilities/wf_theiavalidate.wdl
    testParameterFiles:
     - empty.json
+ - name: Samples_to_Ref_Tree_PHB
+   subclass: WDL
+   primaryDescriptorPath: /workflows/phylogenetics/wf_nextclade_addToRef.wdl
+   testParameterFiles:
+    - empty.json

From aba4cb561dc9229c56fc9b769fff72b8768411e8 Mon Sep 17 00:00:00 2001
From: jrotieno <james.otieno@theiagen.com>
Date: Tue, 25 Jul 2023 12:41:46 +0000
Subject: [PATCH 08/36] fix typo in Samples_to_Ref_Tree_PHB descriptor path

---
 .dockstore.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.dockstore.yml b/.dockstore.yml
index 584123e81..182cdd987 100644
--- a/.dockstore.yml
+++ b/.dockstore.yml
@@ -222,6 +222,6 @@ workflows:
     - empty.json
  - name: Samples_to_Ref_Tree_PHB
    subclass: WDL
-   primaryDescriptorPath: /workflows/phylogenetics/wf_nextclade_addToRef.wdl
+   primaryDescriptorPath: /workflows/phylogenetics/wf_nextclade_addToRefTree.wdl
    testParameterFiles:
     - empty.json

From 2a2636e1734223fb90a60f43b374603503a42ae2 Mon Sep 17 00:00:00 2001
From: jrotieno <james.otieno@theiagen.com>
Date: Tue, 25 Jul 2023 13:37:50 +0000
Subject: [PATCH 09/36] removed concatenate task

---
 workflows/phylogenetics/wf_nextclade_addToRefTree.wdl | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/workflows/phylogenetics/wf_nextclade_addToRefTree.wdl b/workflows/phylogenetics/wf_nextclade_addToRefTree.wdl
index 12da50433..abbc43072 100644
--- a/workflows/phylogenetics/wf_nextclade_addToRefTree.wdl
+++ b/workflows/phylogenetics/wf_nextclade_addToRefTree.wdl
@@ -1,6 +1,5 @@
 version 1.0
 
-import "../../tasks/utilities/task_file_handling.wdl" as file_handling
 import "../../tasks/taxon_id/task_nextclade.wdl" as nextclade_analysis
 
 workflow nextclade_addToRefTree {
@@ -8,7 +7,7 @@ workflow nextclade_addToRefTree {
       description: "Nextclade workflow that adds samples to a curated JSON tree from Augur."
     }
     input {
-      Array[File]+ assembly_fastas
+      File assembly_fastas
       String build_name
       File root_sequence_fasta
       #File? gene_annotations_gff
@@ -21,14 +20,9 @@ workflow nextclade_addToRefTree {
       String dataset_reference = "ancestral"
       String dataset_tag = "2023-01-26T12:00:00Z"
     }
-    call file_handling.cat_files { # concatenate all of the input fasta files together
-      input:
-        files_to_cat = assembly_fastas,
-        concatenated_file_name = "~{build_name}_concatenated.fasta"
-    }
     call nextclade_analysis.nextclade { # nextclade analysis
       input:
-        genome_fasta = cat_files.concatenated_files,
+        genome_fasta = assembly_fastas,
         root_sequence = root_sequence_fasta,
         auspice_reference_tree_json = reference_tree_json,
         qc_config_json = qc_config_json,

From bb551898402db46a7e766866fa8b49f13b29ea42 Mon Sep 17 00:00:00 2001
From: jrotieno <james.otieno@theiagen.com>
Date: Wed, 26 Jul 2023 07:57:28 +0000
Subject: [PATCH 10/36] updated augur mpxv ref files

---
 tasks/utilities/task_augur_utilities.wdl | 6 +++++-
 workflows/phylogenetics/wf_augur.wdl     | 6 +++++-
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/tasks/utilities/task_augur_utilities.wdl b/tasks/utilities/task_augur_utilities.wdl
index 71a0be0b0..15da6bb51 100644
--- a/tasks/utilities/task_augur_utilities.wdl
+++ b/tasks/utilities/task_augur_utilities.wdl
@@ -327,11 +327,13 @@ task set_flu_defaults { # establish flu default values for augur
 task set_mpxv_defaults { # establish mpxv default values for augur
   input {
     # in the future we will wget from the repo directly
-    File mpxv_lat_longs_tsv = "gs://theiagen-public-files-rp/terra/flu-references/lat_longs.tsv"
+    File mpxv_lat_longs_tsv = "gs://theiagen-public-files-rp/terra/flu-references/lat_longs.tsv" #more comprehensive
     File mpxv_clades_tsv = "gs://theiagen-public-files-rp/terra/augur-mpox-references/mpox_clades.tsv"
     File mpxv_reference_fasta = "gs://theiagen-public-files-rp/terra/augur-mpox-references/NC_063383.1_reference.gb"
     File mpxv_reference_genbank = "gs://theiagen-public-files-rp/terra/augur-mpox-references/NC_063383.1_reference.gb"
     File mpxv_auspice_config = "gs://theiagen-public-files-rp/terra/augur-mpox-references/mpox_auspice_config_mpxv.json"
+    File mpxv_gene_annotations_gff = "gs://theiagen-public-files-rp/terra/augur-mpox-references/genemap.gff"
+    File mpxv_colors = "gs://theiagen-public-files-rp/terra/augur-mpox-references/colors_mpxv.tsv"
 
     Int disk_size = 50
   }
@@ -347,6 +349,8 @@ task set_mpxv_defaults { # establish mpxv default values for augur
     File reference_fasta = mpxv_reference_fasta
     File reference_genbank = mpxv_reference_genbank
     File auspice_config = mpxv_auspice_config
+    File genes = mpxv_gene_annotations_gff
+    File colors = mpxv_colors
     # inherited from flu defaults
     Float min_date = 2020.0
     Int pivot_interval = 1
diff --git a/workflows/phylogenetics/wf_augur.wdl b/workflows/phylogenetics/wf_augur.wdl
index 0d3dd7c51..189820905 100644
--- a/workflows/phylogenetics/wf_augur.wdl
+++ b/workflows/phylogenetics/wf_augur.wdl
@@ -23,6 +23,8 @@ workflow augur {
     String build_name
     File? reference_fasta
     File? reference_genbank
+    File? genes
+    File? colors
     Int? min_num_unambig
     String organism = "sars-cov-2" # options: sars-cov-2 or flu
     String flu_segment = "HA" # options: HA or NA
@@ -52,7 +54,7 @@ workflow augur {
         flu_subtype = flu_subtype
     }
   }
-  if (organism == "MPXV") {
+  if (organism == "MPXV" || organism == "mpxv" || organism == "monkeypox") {
     call augur_utils.set_mpxv_defaults as mpxv_defaults { # establish default parameters for mpxv
       input:
     }
@@ -101,6 +103,7 @@ workflow augur {
         refined_tree = augur_refine.refined_tree,
         ancestral_nt_muts_json = augur_ancestral.ancestral_nt_muts_json,
         reference_genbank = select_first([reference_genbank, sc2_defaults.reference_genbank, flu_defaults.reference_genbank, mpxv_defaults.reference_genbank]),
+        genes = select_first([genes, mpxv_defaults.genes]),
         build_name = build_name
     }
     if (flu_segment == "HA") { # we only have clade information for HA segments (but SC2 defaults will be selected first)
@@ -126,6 +129,7 @@ workflow augur {
                             augur_translate.translated_aa_muts_json,
                             augur_clades.clade_assignments_json]),
         build_name = build_name,
+        colors_tsv = select_first([colors, mpxv_defaults.colors]),
         lat_longs_tsv = select_first([sc2_defaults.lat_longs_tsv, flu_defaults.lat_longs_tsv, mpxv_defaults.lat_longs_tsv, lat_longs_tsv]),
         auspice_config = select_first([sc2_defaults.auspice_config, flu_defaults.auspice_config, mpxv_defaults.auspice_config, auspice_config])
     }

From 109220ae80c77ef8163e0ccafd8ccad74154876d Mon Sep 17 00:00:00 2001
From: jrotieno <james.otieno@theiagen.com>
Date: Wed, 26 Jul 2023 14:47:05 +0000
Subject: [PATCH 11/36] ancestral reference instead of NC_063383.fasta

---
 tasks/utilities/task_augur_utilities.wdl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tasks/utilities/task_augur_utilities.wdl b/tasks/utilities/task_augur_utilities.wdl
index 15da6bb51..d7b42d350 100644
--- a/tasks/utilities/task_augur_utilities.wdl
+++ b/tasks/utilities/task_augur_utilities.wdl
@@ -329,7 +329,7 @@ task set_mpxv_defaults { # establish mpxv default values for augur
     # in the future we will wget from the repo directly
     File mpxv_lat_longs_tsv = "gs://theiagen-public-files-rp/terra/flu-references/lat_longs.tsv" #more comprehensive
     File mpxv_clades_tsv = "gs://theiagen-public-files-rp/terra/augur-mpox-references/mpox_clades.tsv"
-    File mpxv_reference_fasta = "gs://theiagen-public-files-rp/terra/augur-mpox-references/NC_063383.1_reference.gb"
+    File mpxv_reference_fasta = "gs://theiagen-public-files-rp/terra/augur-mpox-references/reconstructed_ancestral_mpox.fasta"
     File mpxv_reference_genbank = "gs://theiagen-public-files-rp/terra/augur-mpox-references/NC_063383.1_reference.gb"
     File mpxv_auspice_config = "gs://theiagen-public-files-rp/terra/augur-mpox-references/mpox_auspice_config_mpxv.json"
     File mpxv_gene_annotations_gff = "gs://theiagen-public-files-rp/terra/augur-mpox-references/genemap.gff"

From 61f6f49abc615419b86691be2b8e15900a7f83e5 Mon Sep 17 00:00:00 2001
From: jrotieno <james.otieno@theiagen.com>
Date: Tue, 1 Aug 2023 10:40:45 +0000
Subject: [PATCH 12/36] adding traits task to infer ancestral traits when
 clades task isn't working

---
 .../augur/task_augur_traits.wdl               | 39 +++++++++++++++++++
 workflows/phylogenetics/wf_augur.wdl          | 15 ++++++-
 2 files changed, 53 insertions(+), 1 deletion(-)
 create mode 100644 tasks/phylogenetic_inference/augur/task_augur_traits.wdl

diff --git a/tasks/phylogenetic_inference/augur/task_augur_traits.wdl b/tasks/phylogenetic_inference/augur/task_augur_traits.wdl
new file mode 100644
index 000000000..ec84eceb6
--- /dev/null
+++ b/tasks/phylogenetic_inference/augur/task_augur_traits.wdl
@@ -0,0 +1,39 @@
+version 1.0
+
+task augur_traits {
+  input {
+    File refined_tree
+    File metadata
+    File? weights
+    Boolean confidence = true
+    String? metadata_id_columns
+    String columns
+    String build_name
+
+    Int mem_size = 30
+    Int disk_size = 100
+  }
+  command <<<
+    AUGUR_RECURSION_LIMIT=10000 augur traits \
+      --tree "~{refined_tree}" \
+      --metadata "~{metadata}" \
+      --columns "~{columns}" \
+      --confidence "~{confidence}" \
+      ~{'--metadata-id-columns ' + metadata_id_columns} \
+      ~{'--weights ' + weights}
+      --output-node-data "~{build_name}_traits.json"
+  >>>
+  output {
+    File traits_assignments_json = "~{build_name}_traits.json"
+  }
+  runtime {
+    docker: "us-docker.pkg.dev/general-theiagen/biocontainers/augur:22.0.2--pyhdfd78af_0"
+    memory: mem_size + " GB"
+    cpu: 4
+    disks:  "local-disk " + disk_size + " HDD"
+    disk: disk_size + " GB" 
+    dx_instance_type: "mem3_ssd2_x4"
+    preemptible: 0
+    maxRetries: 3
+  }
+}
\ No newline at end of file
diff --git a/workflows/phylogenetics/wf_augur.wdl b/workflows/phylogenetics/wf_augur.wdl
index 189820905..f0484e96f 100644
--- a/workflows/phylogenetics/wf_augur.wdl
+++ b/workflows/phylogenetics/wf_augur.wdl
@@ -5,6 +5,7 @@ import "../../tasks/utilities/task_augur_utilities.wdl" as augur_utils
 
 import "../../tasks/phylogenetic_inference/augur/task_augur_align.wdl" as align_task
 import "../../tasks/phylogenetic_inference/augur/task_augur_ancestral.wdl" as ancestral_task
+import "../../tasks/phylogenetic_inference/augur/task_augur_traits.wdl" as traits_task
 import "../../tasks/phylogenetic_inference/augur/task_augur_clades.wdl" as clades_task
 import "../../tasks/phylogenetic_inference/augur/task_augur_export.wdl" as export_task
 import "../../tasks/phylogenetic_inference/augur/task_augur_refine.wdl" as refine_task
@@ -31,6 +32,8 @@ workflow augur {
     String? flu_subtype # options: "Victoria" "Yamagata" "H3N2" "H1N1"
 
     File? clades_tsv
+    Boolean run_traits = false # by default, do not run traits
+    String? augur_trait_columns # comma-separated list of columns to use for traits
     # these are very minimal files that hopefully will prevent workflow failure but will not provide any useful information
     File lat_longs_tsv = "gs://theiagen-public-files-rp/terra/augur-defaults/minimal-lat-longs.tsv"
     File auspice_config = "gs://theiagen-public-files-rp/terra/augur-defaults/minimal-auspice-config.json"
@@ -107,9 +110,18 @@ workflow augur {
         build_name = build_name
     }
     if (flu_segment == "HA") { # we only have clade information for HA segments (but SC2 defaults will be selected first)
+      if (run_traits) { # by default do not run traits and clades will be assigned based on the clades_tsv
+        call traits_task.augur_traits {
+          input:
+            refined_tree = augur_refine.refined_tree,
+            metadata = tsv_join.out_tsv,
+            columns = select_first([augur_trait_columns, "lineage, clade, clade_membership"]),
+            build_name = build_name
+        }
+      }
       if (defined(clades_tsv) || defined(sc2_defaults.clades_tsv) || defined(flu_defaults.clades_tsv) || defined(mpxv_defaults.clades_tsv) ) { # one of these must be present
         call clades_task.augur_clades { # assign clades to nodes based on amino-acid or nucleotide signatures
-          input: 
+          input:
             refined_tree = augur_refine.refined_tree,
             ancestral_nt_muts_json = augur_ancestral.ancestral_nt_muts_json,
             translated_aa_muts_json = augur_translate.translated_aa_muts_json,
@@ -161,6 +173,7 @@ workflow augur {
     File aligned_fastas = augur_align.aligned_fasta
     File combined_assemblies = filter_sequences_by_length.filtered_fasta
     File metadata_merged = tsv_join.out_tsv
+    File? traits_json = augur_traits.traits_assignments_json
 
     # list of samples that were kept and met the length filters    
     File keep_list = fasta_to_ids.ids_txt

From cdf84f6d2210743ed9d48c388d334271268b2a77 Mon Sep 17 00:00:00 2001
From: jrotieno <james.otieno@theiagen.com>
Date: Tue, 1 Aug 2023 11:36:56 +0000
Subject: [PATCH 13/36] pass --confidence to augur traits as a bare flag

---
 tasks/phylogenetic_inference/augur/task_augur_traits.wdl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tasks/phylogenetic_inference/augur/task_augur_traits.wdl b/tasks/phylogenetic_inference/augur/task_augur_traits.wdl
index ec84eceb6..dfaa712e0 100644
--- a/tasks/phylogenetic_inference/augur/task_augur_traits.wdl
+++ b/tasks/phylogenetic_inference/augur/task_augur_traits.wdl
@@ -5,7 +5,7 @@ task augur_traits {
     File refined_tree
     File metadata
     File? weights
-    Boolean confidence = true
+    #Boolean confidence = true
     String? metadata_id_columns
     String columns
     String build_name
@@ -18,7 +18,7 @@ task augur_traits {
       --tree "~{refined_tree}" \
       --metadata "~{metadata}" \
       --columns "~{columns}" \
-      --confidence "~{confidence}" \
+      --confidence \
       ~{'--metadata-id-columns ' + metadata_id_columns} \
       ~{'--weights ' + weights}
       --output-node-data "~{build_name}_traits.json"

From 6324a2492d5972aa741cee3071c490d3582b7345 Mon Sep 17 00:00:00 2001
From: jrotieno <james.otieno@theiagen.com>
Date: Tue, 1 Aug 2023 12:35:59 +0000
Subject: [PATCH 14/36] fix traits line continuation; export traits JSON

---
 tasks/phylogenetic_inference/augur/task_augur_traits.wdl | 2 +-
 workflows/phylogenetics/wf_augur.wdl                     | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/tasks/phylogenetic_inference/augur/task_augur_traits.wdl b/tasks/phylogenetic_inference/augur/task_augur_traits.wdl
index dfaa712e0..182ddfdba 100644
--- a/tasks/phylogenetic_inference/augur/task_augur_traits.wdl
+++ b/tasks/phylogenetic_inference/augur/task_augur_traits.wdl
@@ -20,7 +20,7 @@ task augur_traits {
       --columns "~{columns}" \
       --confidence \
       ~{'--metadata-id-columns ' + metadata_id_columns} \
-      ~{'--weights ' + weights}
+      ~{'--weights ' + weights} \
       --output-node-data "~{build_name}_traits.json"
   >>>
   output {
diff --git a/workflows/phylogenetics/wf_augur.wdl b/workflows/phylogenetics/wf_augur.wdl
index f0484e96f..c1122b8a8 100644
--- a/workflows/phylogenetics/wf_augur.wdl
+++ b/workflows/phylogenetics/wf_augur.wdl
@@ -115,7 +115,7 @@ workflow augur {
           input:
             refined_tree = augur_refine.refined_tree,
             metadata = tsv_join.out_tsv,
-            columns = select_first([augur_trait_columns, "lineage, clade, clade_membership"]),
+            columns = select_first([augur_trait_columns, "lineage clade clade_membership"]), # default to these columns if none are specified
             build_name = build_name
         }
       }
@@ -139,7 +139,8 @@ workflow augur {
                             augur_refine.branch_lengths,
                             augur_ancestral.ancestral_nt_muts_json,
                             augur_translate.translated_aa_muts_json,
-                            augur_clades.clade_assignments_json]),
+                            augur_clades.clade_assignments_json,
+                            augur_traits.traits_assignments_json]),
         build_name = build_name,
         colors_tsv = select_first([colors, mpxv_defaults.colors]),
         lat_longs_tsv = select_first([sc2_defaults.lat_longs_tsv, flu_defaults.lat_longs_tsv, mpxv_defaults.lat_longs_tsv, lat_longs_tsv]),

From ab0710b93110bac6e0a66f959a6a2d67dc0337bc Mon Sep 17 00:00:00 2001
From: jrotieno <james.otieno@theiagen.com>
Date: Tue, 1 Aug 2023 13:14:46 +0000
Subject: [PATCH 15/36] update augur prep fields

---
 tasks/utilities/task_augur_utilities.wdl | 2 +-
 workflows/phylogenetics/wf_augur.wdl     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tasks/utilities/task_augur_utilities.wdl b/tasks/utilities/task_augur_utilities.wdl
index d7b42d350..f2022bcdb 100644
--- a/tasks/utilities/task_augur_utilities.wdl
+++ b/tasks/utilities/task_augur_utilities.wdl
@@ -394,7 +394,7 @@ task prep_augur_metadata {
     fi
     # if pango_lineage defined, add to metadata
     if [[ "~{nextclade_clade}" ]]; then 
-      nextclade_header="pango_lineage"
+      nextclade_header="nextclade_clade"
     fi
 
     if [[ "~{organism}" == "sars-cov-2" ]]; then
diff --git a/workflows/phylogenetics/wf_augur.wdl b/workflows/phylogenetics/wf_augur.wdl
index c1122b8a8..062ec30ef 100644
--- a/workflows/phylogenetics/wf_augur.wdl
+++ b/workflows/phylogenetics/wf_augur.wdl
@@ -115,7 +115,7 @@ workflow augur {
           input:
             refined_tree = augur_refine.refined_tree,
             metadata = tsv_join.out_tsv,
-            columns = select_first([augur_trait_columns, "lineage clade clade_membership"]), # default to these columns if none are specified
+            columns = select_first([augur_trait_columns, "pango_lineage nextclade_clade"]), # default to these columns if none are specified
             build_name = build_name
         }
       }

From 18a61856931ca338e9b06845c7cf2310f91f41d9 Mon Sep 17 00:00:00 2001
From: jrotieno <james.otieno@theiagen.com>
Date: Tue, 1 Aug 2023 13:46:44 +0000
Subject: [PATCH 16/36] modifying column input

---
 tasks/phylogenetic_inference/augur/task_augur_traits.wdl | 2 +-
 workflows/phylogenetics/wf_augur.wdl                     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tasks/phylogenetic_inference/augur/task_augur_traits.wdl b/tasks/phylogenetic_inference/augur/task_augur_traits.wdl
index 182ddfdba..8955a6c73 100644
--- a/tasks/phylogenetic_inference/augur/task_augur_traits.wdl
+++ b/tasks/phylogenetic_inference/augur/task_augur_traits.wdl
@@ -17,7 +17,7 @@ task augur_traits {
     AUGUR_RECURSION_LIMIT=10000 augur traits \
       --tree "~{refined_tree}" \
       --metadata "~{metadata}" \
-      --columns "~{columns}" \
+      ~{'--columns ' + columns} \
       --confidence \
       ~{'--metadata-id-columns ' + metadata_id_columns} \
       ~{'--weights ' + weights} \
diff --git a/workflows/phylogenetics/wf_augur.wdl b/workflows/phylogenetics/wf_augur.wdl
index 062ec30ef..d14c51ddd 100644
--- a/workflows/phylogenetics/wf_augur.wdl
+++ b/workflows/phylogenetics/wf_augur.wdl
@@ -115,7 +115,7 @@ workflow augur {
           input:
             refined_tree = augur_refine.refined_tree,
             metadata = tsv_join.out_tsv,
-            columns = select_first([augur_trait_columns, "pango_lineage nextclade_clade"]), # default to these columns if none are specified
+            columns = select_first([augur_trait_columns, "pango_lineage,nextclade_clade"]), # default to these columns if none are specified
             build_name = build_name
         }
       }

From 5827a6094f7c2701774ae323e43238d72b9c29e1 Mon Sep 17 00:00:00 2001
From: jrotieno <james.otieno@theiagen.com>
Date: Tue, 1 Aug 2023 15:58:57 +0000
Subject: [PATCH 17/36] changes to traits task and augur prep

---
 tasks/phylogenetic_inference/augur/task_augur_traits.wdl | 2 +-
 tasks/utilities/task_augur_utilities.wdl                 | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tasks/phylogenetic_inference/augur/task_augur_traits.wdl b/tasks/phylogenetic_inference/augur/task_augur_traits.wdl
index 8955a6c73..3f3f1414b 100644
--- a/tasks/phylogenetic_inference/augur/task_augur_traits.wdl
+++ b/tasks/phylogenetic_inference/augur/task_augur_traits.wdl
@@ -17,7 +17,7 @@ task augur_traits {
     AUGUR_RECURSION_LIMIT=10000 augur traits \
       --tree "~{refined_tree}" \
       --metadata "~{metadata}" \
-      ~{'--columns ' + columns} \
+      ~{'--columns {' + columns + '}'} \
       --confidence \
       ~{'--metadata-id-columns ' + metadata_id_columns} \
       ~{'--weights ' + weights} \
diff --git a/tasks/utilities/task_augur_utilities.wdl b/tasks/utilities/task_augur_utilities.wdl
index f2022bcdb..575c0f6e6 100644
--- a/tasks/utilities/task_augur_utilities.wdl
+++ b/tasks/utilities/task_augur_utilities.wdl
@@ -394,7 +394,7 @@ task prep_augur_metadata {
     fi
     # if pango_lineage defined, add to metadata
     if [[ "~{nextclade_clade}" ]]; then 
-      nextclade_header="nextclade_clade"
+      nextclade_header="clade_membership"
     fi
 
     if [[ "~{organism}" == "sars-cov-2" ]]; then

From c011bf2d0ac9f0307941dad9ea5f12b1698c8cec Mon Sep 17 00:00:00 2001
From: jrotieno <james.otieno@theiagen.com>
Date: Tue, 1 Aug 2023 17:11:24 +0000
Subject: [PATCH 18/36] updated conditional for traits vs clades

---
 workflows/phylogenetics/wf_augur.wdl | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/workflows/phylogenetics/wf_augur.wdl b/workflows/phylogenetics/wf_augur.wdl
index d14c51ddd..1e1de8095 100644
--- a/workflows/phylogenetics/wf_augur.wdl
+++ b/workflows/phylogenetics/wf_augur.wdl
@@ -115,19 +115,21 @@ workflow augur {
           input:
             refined_tree = augur_refine.refined_tree,
             metadata = tsv_join.out_tsv,
-            columns = select_first([augur_trait_columns, "pango_lineage,nextclade_clade"]), # default to these columns if none are specified
+            columns = select_first([augur_trait_columns, "pango_lineage,clade_membership"]), # default to these columns if none are specified
             build_name = build_name
         }
       }
-      if (defined(clades_tsv) || defined(sc2_defaults.clades_tsv) || defined(flu_defaults.clades_tsv) || defined(mpxv_defaults.clades_tsv) ) { # one of these must be present
-        call clades_task.augur_clades { # assign clades to nodes based on amino-acid or nucleotide signatures
-          input:
-            refined_tree = augur_refine.refined_tree,
-            ancestral_nt_muts_json = augur_ancestral.ancestral_nt_muts_json,
-            translated_aa_muts_json = augur_translate.translated_aa_muts_json,
-            reference_fasta = select_first([reference_fasta, sc2_defaults.reference_fasta, flu_defaults.reference_fasta, mpxv_defaults.reference_fasta]),
-            build_name = build_name,
-            clades_tsv = select_first([clades_tsv, sc2_defaults.clades_tsv, flu_defaults.clades_tsv, mpxv_defaults.clades_tsv])
+      if (! run_traits) {
+        if (defined(clades_tsv) || defined(sc2_defaults.clades_tsv) || defined(flu_defaults.clades_tsv) || defined(mpxv_defaults.clades_tsv) ) { # one of these must be present
+          call clades_task.augur_clades { # assign clades to nodes based on amino-acid or nucleotide signatures
+            input:
+              refined_tree = augur_refine.refined_tree,
+              ancestral_nt_muts_json = augur_ancestral.ancestral_nt_muts_json,
+              translated_aa_muts_json = augur_translate.translated_aa_muts_json,
+              reference_fasta = select_first([reference_fasta, sc2_defaults.reference_fasta, flu_defaults.reference_fasta, mpxv_defaults.reference_fasta]),
+              build_name = build_name,
+              clades_tsv = select_first([clades_tsv, sc2_defaults.clades_tsv, flu_defaults.clades_tsv, mpxv_defaults.clades_tsv])
+          }
         }
       }
     }

From ee3b711bac0d31448eea9f8aaf262a9ac99b0ef6 Mon Sep 17 00:00:00 2001
From: jrotieno <james.otieno@theiagen.com>
Date: Wed, 16 Aug 2023 17:36:33 +0000
Subject: [PATCH 19/36] adding the option to skip augur align when user has an
 alignment already

---
 .../augur/task_augur_tree.wdl                 |  4 ++
 workflows/phylogenetics/wf_augur.wdl          | 52 +++++++++++--------
 2 files changed, 33 insertions(+), 23 deletions(-)

diff --git a/tasks/phylogenetic_inference/augur/task_augur_tree.wdl b/tasks/phylogenetic_inference/augur/task_augur_tree.wdl
index e18b5c703..9aad90396 100644
--- a/tasks/phylogenetic_inference/augur/task_augur_tree.wdl
+++ b/tasks/phylogenetic_inference/augur/task_augur_tree.wdl
@@ -14,6 +14,9 @@ task augur_tree {
     Int disk_size = 750
   }
   command <<<
+    # capture version information
+    augur version > VERSION
+
     AUGUR_RECURSION_LIMIT=10000 augur tree \
       --alignment "~{aligned_fasta}" \
       --output "~{build_name}_~{method}.nwk" \
@@ -26,6 +29,7 @@ task augur_tree {
   >>>
   output {
     File aligned_tree  = "~{build_name}_~{method}.nwk"
+    String augur_version = read_string("VERSION")
   }
   runtime {
     docker: "us-docker.pkg.dev/general-theiagen/biocontainers/augur:22.0.2--pyhdfd78af_0"
diff --git a/workflows/phylogenetics/wf_augur.wdl b/workflows/phylogenetics/wf_augur.wdl
index 1e1de8095..1e209c698 100644
--- a/workflows/phylogenetics/wf_augur.wdl
+++ b/workflows/phylogenetics/wf_augur.wdl
@@ -27,9 +27,11 @@ workflow augur {
     File? genes
     File? colors
     Int? min_num_unambig
-    String organism = "sars-cov-2" # options: sars-cov-2 or flu
+    String organism = "sars-cov-2" # options: sars-cov-2 or flu or mpxv
     String flu_segment = "HA" # options: HA or NA
     String? flu_subtype # options: "Victoria" "Yamagata" "H3N2" "H1N1"
+    Boolean skip_alignment = false # by default, do not skip alignment
+    File? alignment_fasta # if alignment is skipped, provide an alignment
 
     File? clades_tsv
     Boolean run_traits = false # by default, do not run traits
@@ -40,11 +42,6 @@ workflow augur {
 
     Boolean distance_tree_only = false # by default, do not skip making a time tree
   }
-  call file_handling.cat_files { # concatenate all of the input fasta files together
-    input:
-      files_to_cat = assembly_fastas,
-      concatenated_file_name = "~{build_name}_concatenated.fasta"
-  }
   if (organism == "sars-cov-2") {
     call augur_utils.set_sc2_defaults as sc2_defaults { # establish default parameters for sars-cov-2
       input:
@@ -62,35 +59,44 @@ workflow augur {
       input:
     }
   }
-  call augur_utils.filter_sequences_by_length { # remove any sequences that do not meet the quality threshold
-    input:
-      sequences_fasta = cat_files.concatenated_files,
-      min_non_N = select_first([min_num_unambig, sc2_defaults.min_num_unambig, flu_defaults.min_num_unambig, mpxv_defaults.min_num_unambig]),
-  }
-  call align_task.augur_align { # perform mafft alignment on the sequences
-    input: 
-      assembly_fasta = filter_sequences_by_length.filtered_fasta,
-      reference_fasta = select_first([reference_fasta, sc2_defaults.reference_fasta, flu_defaults.reference_fasta, mpxv_defaults.reference_fasta]),
-  }
   call augur_utils.tsv_join { # merge the metadata files
     input:
       input_tsvs = sample_metadata_tsvs,
       id_col = "strain",
       out_basename = "metadata-merged"
   }
+  if (! skip_alignment) { # by default, continue
+    call file_handling.cat_files { # concatenate all of the input fasta files together
+      input:
+        files_to_cat = assembly_fastas,
+        concatenated_file_name = "~{build_name}_concatenated.fasta"
+    }
+  }
+  call augur_utils.filter_sequences_by_length { # remove any sequences that do not meet the quality threshold
+    input:
+      sequences_fasta = select_first([cat_files.concatenated_files, alignment_fasta]),
+      min_non_N = select_first([min_num_unambig, sc2_defaults.min_num_unambig, flu_defaults.min_num_unambig, mpxv_defaults.min_num_unambig]),
+  }
+  if (! skip_alignment) { # by default, continue
+    call align_task.augur_align { # perform mafft alignment on the sequences
+      input:
+        assembly_fasta = filter_sequences_by_length.filtered_fasta,
+        reference_fasta = select_first([reference_fasta, sc2_defaults.reference_fasta, flu_defaults.reference_fasta, mpxv_defaults.reference_fasta]),
+    }
+  }
   call augur_utils.fasta_to_ids { # extract list of remaining sequences (so we know which ones were dropped)
     input:
-      sequences_fasta = augur_align.aligned_fasta
+      sequences_fasta = select_first([augur_align.aligned_fasta, filter_sequences_by_length.filtered_fasta])
   }
   call tree_task.augur_tree { # create a "draft" (or distance) augur tree
     input:
-      aligned_fasta = augur_align.aligned_fasta,
+      aligned_fasta = select_first([augur_align.aligned_fasta, filter_sequences_by_length.filtered_fasta]),
       build_name = build_name
   }
   if (! distance_tree_only) { # by default, continue
     call refine_task.augur_refine { # create a timetree (aka, refine augur tree)
       input:
-        aligned_fasta = augur_align.aligned_fasta,
+        aligned_fasta = select_first([augur_align.aligned_fasta, filter_sequences_by_length.filtered_fasta]),
         draft_augur_tree = augur_tree.aligned_tree,
         metadata = tsv_join.out_tsv,
         build_name = build_name
@@ -98,7 +104,7 @@ workflow augur {
     call ancestral_task.augur_ancestral { # infer ancestral sequences
       input:
         refined_tree = augur_refine.refined_tree,
-        aligned_fasta = augur_align.aligned_fasta,
+        aligned_fasta = select_first([augur_align.aligned_fasta, filter_sequences_by_length.filtered_fasta]),
         build_name = build_name
     }
     call translate_task.augur_translate { # translate gene regions from nucleotides to amino acids
@@ -152,7 +158,7 @@ workflow augur {
   call snp_dists_task.snp_dists { # create a snp matrix from the alignment
     input:
       cluster_name = build_name,
-      alignment = augur_align.aligned_fasta
+      alignment = select_first([augur_align.aligned_fasta,filter_sequences_by_length.filtered_fasta])
   }
   call reorder_matrix_task.reorder_matrix { # reorder snp matrix to match distance tree 
     input:
@@ -167,13 +173,13 @@ workflow augur {
     # version capture
     String augur_phb_version = version_capture.phb_version
     String augur_phb_analysis_date = version_capture.date
-    String augur_version = augur_align.augur_version
+    String augur_version = augur_tree.augur_version
 
     # augur outputs
     File? auspice_input_json = augur_export.auspice_json
     File? time_tree = augur_refine.refined_tree
     File distance_tree = augur_tree.aligned_tree
-    File aligned_fastas = augur_align.aligned_fasta
+    File aligned_fastas = select_first([augur_align.aligned_fasta, alignment_fasta])
     File combined_assemblies = filter_sequences_by_length.filtered_fasta
     File metadata_merged = tsv_join.out_tsv
     File? traits_json = augur_traits.traits_assignments_json

From a41c45414fb7091dd7e6d4515852badf8b8f0fb3 Mon Sep 17 00:00:00 2001
From: jrotieno <james.otieno@theiagen.com>
Date: Thu, 17 Aug 2023 11:57:37 +0000
Subject: [PATCH 20/36] Add configurable memory to augur translate task

---
 tasks/phylogenetic_inference/augur/task_augur_translate.wdl | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tasks/phylogenetic_inference/augur/task_augur_translate.wdl b/tasks/phylogenetic_inference/augur/task_augur_translate.wdl
index b2efe1a7d..1dd9a65f3 100644
--- a/tasks/phylogenetic_inference/augur/task_augur_translate.wdl
+++ b/tasks/phylogenetic_inference/augur/task_augur_translate.wdl
@@ -10,6 +10,7 @@ task augur_translate {
     File? genes # a file containing list of genes to translate (from nucleotides to amino acids)
 
     Int disk_size = 50
+    Int mem_size = 32
   }
   command <<<
     AUGUR_RECURSION_LIMIT=10000 augur translate \
@@ -24,7 +25,7 @@ task augur_translate {
   }
   runtime {
     docker: "us-docker.pkg.dev/general-theiagen/biocontainers/augur:22.0.2--pyhdfd78af_0"
-    memory: "2 GB"
+    memory: mem_size + " GB"
     cpu : 1
     disks: "local-disk " + disk_size + " HDD"
     disk: disk_size + " GB"

From f4e1c460fca4fedf46920eec94ff8cb150029564 Mon Sep 17 00:00:00 2001
From: jrotieno <james.otieno@theiagen.com>
Date: Thu, 17 Aug 2023 15:53:47 +0000
Subject: [PATCH 21/36] increase memory for augur align

---
 tasks/phylogenetic_inference/augur/task_augur_align.wdl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tasks/phylogenetic_inference/augur/task_augur_align.wdl b/tasks/phylogenetic_inference/augur/task_augur_align.wdl
index 30c8a7fff..5d9afa2a2 100644
--- a/tasks/phylogenetic_inference/augur/task_augur_align.wdl
+++ b/tasks/phylogenetic_inference/augur/task_augur_align.wdl
@@ -6,7 +6,7 @@ task augur_align {
     File reference_fasta
     Boolean fill_gaps = false
     Int cpus = 64
-    Int mem_size = 32
+    Int mem_size = 128
     Int disk_size = 750
   }
   command <<<

From 80a8eb92cd62f5d61f4db835ecd2b5d692d2febd Mon Sep 17 00:00:00 2001
From: jrotieno <james.otieno@theiagen.com>
Date: Fri, 18 Aug 2023 07:37:12 +0000
Subject: [PATCH 22/36] remove the clock_filter_iqd default of 4, making it
 an optional input

---
 tasks/phylogenetic_inference/augur/task_augur_refine.wdl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tasks/phylogenetic_inference/augur/task_augur_refine.wdl b/tasks/phylogenetic_inference/augur/task_augur_refine.wdl
index 543b5d58f..1b8982a83 100644
--- a/tasks/phylogenetic_inference/augur/task_augur_refine.wdl
+++ b/tasks/phylogenetic_inference/augur/task_augur_refine.wdl
@@ -19,7 +19,7 @@ task augur_refine {
     String date_inference = "marginal" # assign internal nodes to their marginally most likley dates (joint, marginal)
     String? branch_length_inference # branch length mode of treetime to use (auto, joint, marginal, input; default: auto)
     String? coalescent # coalescent time scale in units of inverse clock rate (float), optimize as scalar ("opt") or skyline (skyline)
-    Int clock_filter_iqd = 4 # remove tips that deviate more than n_iqd interquartile ranges from the root-to-tip vs time regression
+    Int? clock_filter_iqd # remove tips that deviate more than n_iqd interquartile ranges from the root-to-tip vs time regression
     String divergence_units = "mutations" # units in which sequence divergences is exported ("mutations" or "mutations-per-site")
 
     Int disk_size = 100

From ddfe22d4358c708ed7595e8ae5692bf98b9de238 Mon Sep 17 00:00:00 2001
From: jrotieno <james.otieno@theiagen.com>
Date: Fri, 18 Aug 2023 16:54:43 +0000
Subject: [PATCH 23/36] nextclade task to use dataset json as reference

---
 tasks/taxon_id/task_nextclade_addSamples.wdl  | 68 +++++++++++++++++++
 .../wf_nextclade_addToRefTree.wdl             | 21 +++---
 2 files changed, 77 insertions(+), 12 deletions(-)
 create mode 100644 tasks/taxon_id/task_nextclade_addSamples.wdl

diff --git a/tasks/taxon_id/task_nextclade_addSamples.wdl b/tasks/taxon_id/task_nextclade_addSamples.wdl
new file mode 100644
index 000000000..6c68ad8d9
--- /dev/null
+++ b/tasks/taxon_id/task_nextclade_addSamples.wdl
@@ -0,0 +1,68 @@
+version 1.0
+
+task nextclade {
+    meta {
+      description: "Nextclade classification of one sample. Leaving optional inputs unspecified will use SARS-CoV-2 defaults."
+    }
+    input {
+      File genome_fasta
+      File? root_sequence
+      File? reference_tree_json
+      File? qc_config_json
+      File? gene_annotations_gff
+      File? pcr_primers_csv
+      File? virus_properties
+      String docker = "us-docker.pkg.dev/general-theiagen/nextstrain/nextclade:2.14.0"
+      String dataset_name
+      String? dataset_reference
+      String? dataset_tag
+      Int disk_size = 50
+    }
+    String basename = basename(genome_fasta, ".fasta")
+    command <<<
+        NEXTCLADE_VERSION="$(nextclade --version)"
+        echo $NEXTCLADE_VERSION > NEXTCLADE_VERSION
+
+        nextclade dataset get \
+          --name="~{dataset_name}" \
+          ~{"--reference " + dataset_reference} \
+          ~{"--tag " + dataset_tag} \
+          -o nextclade_dataset_dir \
+          --verbose
+        
+        # If no referece sequence is provided, use the reference tree from the dataset
+        if [ -z "~{reference_tree_json}" ]; then
+          reference_tree_json=nextclade_dataset_dir/tree.json
+        fi
+        set -e
+        nextclade run \
+            --input-dataset=nextclade_dataset_dir/ \
+            ~{"--input-root-seq " + root_sequence} \
+            --input-tree ~{reference_tree_json} \
+            ~{"--input-qc-config " + qc_config_json} \
+            ~{"--input-gene-map " + gene_annotations_gff} \
+            ~{"--input-pcr-primers " + pcr_primers_csv} \
+            ~{"--input-virus-properties " + virus_properties}  \
+            --output-json "~{basename}".nextclade.json \
+            --output-tsv  "~{basename}".nextclade.tsv \
+            --output-tree "~{basename}".nextclade.auspice.json \
+            --output-all=. \
+            "~{genome_fasta}"
+    >>>
+    runtime {
+      docker: "~{docker}"
+      memory: "8 GB"
+      cpu: 2
+      disks:  "local-disk " + disk_size + " SSD"
+      disk: disk_size + " GB" # TES
+      dx_instance_type: "mem1_ssd1_v2_x2"
+      maxRetries: 3 
+    }
+    output {
+      String nextclade_version = read_string("NEXTCLADE_VERSION")
+      File nextclade_json = "~{basename}.nextclade.json"
+      File auspice_json = "~{basename}.nextclade.auspice.json"
+      File nextclade_tsv = "~{basename}.nextclade.tsv"
+      String nextclade_docker = docker
+    }
+}
\ No newline at end of file
diff --git a/workflows/phylogenetics/wf_nextclade_addToRefTree.wdl b/workflows/phylogenetics/wf_nextclade_addToRefTree.wdl
index abbc43072..616423d38 100644
--- a/workflows/phylogenetics/wf_nextclade_addToRefTree.wdl
+++ b/workflows/phylogenetics/wf_nextclade_addToRefTree.wdl
@@ -1,6 +1,6 @@
 version 1.0
 
-import "../../tasks/taxon_id/task_nextclade.wdl" as nextclade_analysis
+import "../../tasks/taxon_id/task_nextclade_addSamples.wdl" as nextclade_analysis
 
 workflow nextclade_addToRefTree {
     meta {
@@ -8,28 +8,25 @@ workflow nextclade_addToRefTree {
     }
     input {
       File assembly_fastas
-      String build_name
-      File root_sequence_fasta
-      #File? gene_annotations_gff
-      File reference_tree_json
+      File? root_sequence_fasta
+      File? gene_annotations_gff
+      File? reference_tree_json
       File? qc_config_json
       File? pcr_primers_csv
       File? virus_properties
-      String docker = "nextstrain/nextclade:2.13.0"
-      String dataset_name = "MPXV"
-      String dataset_reference = "ancestral"
-      String dataset_tag = "2023-01-26T12:00:00Z"
+      String dataset_name
+      String? dataset_reference
+      String? dataset_tag
     }
     call nextclade_analysis.nextclade { # nextclade analysis
       input:
         genome_fasta = assembly_fastas,
         root_sequence = root_sequence_fasta,
-        auspice_reference_tree_json = reference_tree_json,
+        reference_tree_json = reference_tree_json,
         qc_config_json = qc_config_json,
-        #gene_annotations_json = gene_annotations_gff,
+        gene_annotations_gff = gene_annotations_gff,
         pcr_primers_csv = pcr_primers_csv,
         virus_properties = virus_properties,
-        docker = docker,
         dataset_name = dataset_name,
         dataset_reference = dataset_reference,
         dataset_tag = dataset_tag

From f0c0ae1c504240c9a801f0d6e23c539b7c5fd53c Mon Sep 17 00:00:00 2001
From: jrotieno <james.otieno@theiagen.com>
Date: Fri, 18 Aug 2023 17:50:07 +0000
Subject: [PATCH 24/36] fix input tree

---
 tasks/taxon_id/task_nextclade_addSamples.wdl | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/tasks/taxon_id/task_nextclade_addSamples.wdl b/tasks/taxon_id/task_nextclade_addSamples.wdl
index 6c68ad8d9..784b90e5d 100644
--- a/tasks/taxon_id/task_nextclade_addSamples.wdl
+++ b/tasks/taxon_id/task_nextclade_addSamples.wdl
@@ -31,14 +31,17 @@ task nextclade {
           --verbose
         
         # If no referece sequence is provided, use the reference tree from the dataset
-        if [ -z "~{reference_tree_json}" ]; then
+        if [[ ! -z "~{reference_tree_json}" ]]; then
           reference_tree_json=nextclade_dataset_dir/tree.json
+        else
+          reference_tree_json="~{reference_tree_json}"
         fi
+
         set -e
         nextclade run \
             --input-dataset=nextclade_dataset_dir/ \
             ~{"--input-root-seq " + root_sequence} \
-            --input-tree ~{reference_tree_json} \
+            --input-tree ${reference_tree_json} \
             ~{"--input-qc-config " + qc_config_json} \
             ~{"--input-gene-map " + gene_annotations_gff} \
             ~{"--input-pcr-primers " + pcr_primers_csv} \

From c0483b3e441efe88bc57132b182a20e57008fa35 Mon Sep 17 00:00:00 2001
From: jrotieno <james.otieno@theiagen.com>
Date: Fri, 18 Aug 2023 18:44:17 +0000
Subject: [PATCH 25/36] comment out nextclade run; copy dataset tree.json

---
 tasks/taxon_id/task_nextclade_addSamples.wdl | 38 +++++++++++---------
 1 file changed, 21 insertions(+), 17 deletions(-)

diff --git a/tasks/taxon_id/task_nextclade_addSamples.wdl b/tasks/taxon_id/task_nextclade_addSamples.wdl
index 784b90e5d..e496499be 100644
--- a/tasks/taxon_id/task_nextclade_addSamples.wdl
+++ b/tasks/taxon_id/task_nextclade_addSamples.wdl
@@ -31,26 +31,29 @@ task nextclade {
           --verbose
         
         # If no referece sequence is provided, use the reference tree from the dataset
-        if [[ ! -z "~{reference_tree_json}" ]]; then
-          reference_tree_json=nextclade_dataset_dir/tree.json
+        if [ ! -z "~{reference_tree_json}" ]; then
+          echo "Default reference tree JSON will be used"
+          cp nextclade_dataset_dir/tree.json reference_tree.json
+          tree_json="reference_tree.json"
         else
-          reference_tree_json="~{reference_tree_json}"
+          echo "User reference tree JSON will be used"
+          tree_json="~{reference_tree_json}"
         fi
 
-        set -e
-        nextclade run \
-            --input-dataset=nextclade_dataset_dir/ \
-            ~{"--input-root-seq " + root_sequence} \
-            --input-tree ${reference_tree_json} \
-            ~{"--input-qc-config " + qc_config_json} \
-            ~{"--input-gene-map " + gene_annotations_gff} \
-            ~{"--input-pcr-primers " + pcr_primers_csv} \
-            ~{"--input-virus-properties " + virus_properties}  \
-            --output-json "~{basename}".nextclade.json \
-            --output-tsv  "~{basename}".nextclade.tsv \
-            --output-tree "~{basename}".nextclade.auspice.json \
-            --output-all=. \
-            "~{genome_fasta}"
+       # set -e
+       # nextclade run \
+       #     --input-dataset=nextclade_dataset_dir/ \
+       #     ~{"--input-root-seq " + root_sequence} \
+       #     --input-tree ${reference_tree_json} \
+       #     ~{"--input-qc-config " + qc_config_json} \
+       #     ~{"--input-gene-map " + gene_annotations_gff} \
+       #     ~{"--input-pcr-primers " + pcr_primers_csv} \
+       #     ~{"--input-virus-properties " + virus_properties}  \
+       #     --output-json "~{basename}".nextclade.json \
+       #     --output-tsv  "~{basename}".nextclade.tsv \
+       #     --output-tree "~{basename}".nextclade.auspice.json \
+       #     --output-all=. \
+       #     "~{genome_fasta}"
     >>>
     runtime {
       docker: "~{docker}"
@@ -67,5 +70,6 @@ task nextclade {
       File auspice_json = "~{basename}.nextclade.auspice.json"
       File nextclade_tsv = "~{basename}.nextclade.tsv"
       String nextclade_docker = docker
+      File nextclade_ref_tree_json = "reference_tree.json"
     }
 }
\ No newline at end of file

From c81b9262c08f7841e91c23447126a517af943caa Mon Sep 17 00:00:00 2001
From: jrotieno <james.otieno@theiagen.com>
Date: Fri, 18 Aug 2023 19:04:38 +0000
Subject: [PATCH 26/36] fix inverted tree check; comment out file outputs

---
 tasks/taxon_id/task_nextclade_addSamples.wdl | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/tasks/taxon_id/task_nextclade_addSamples.wdl b/tasks/taxon_id/task_nextclade_addSamples.wdl
index e496499be..c82271572 100644
--- a/tasks/taxon_id/task_nextclade_addSamples.wdl
+++ b/tasks/taxon_id/task_nextclade_addSamples.wdl
@@ -31,8 +31,8 @@ task nextclade {
           --verbose
         
         # If no referece sequence is provided, use the reference tree from the dataset
-        if [ ! -z "~{reference_tree_json}" ]; then
-          echo "Default reference tree JSON will be used"
+        if [ -z "~{reference_tree_json}" ]; then
+          echo "Default dataset reference tree JSON will be used"
           cp nextclade_dataset_dir/tree.json reference_tree.json
           tree_json="reference_tree.json"
         else
@@ -66,10 +66,10 @@ task nextclade {
     }
     output {
       String nextclade_version = read_string("NEXTCLADE_VERSION")
-      File nextclade_json = "~{basename}.nextclade.json"
-      File auspice_json = "~{basename}.nextclade.auspice.json"
-      File nextclade_tsv = "~{basename}.nextclade.tsv"
+      #File nextclade_json = "~{basename}.nextclade.json"
+      #File auspice_json = "~{basename}.nextclade.auspice.json"
+      #File nextclade_tsv = "~{basename}.nextclade.tsv"
       String nextclade_docker = docker
-      File nextclade_ref_tree_json = "reference_tree.json"
+      #File nextclade_ref_tree_json = select_first(["~{reference_tree_json}","reference_tree.json"])
     }
 }
\ No newline at end of file

From 369d52447d0d1e5a7af678911f38ddbfb7592ec8 Mon Sep 17 00:00:00 2001
From: jrotieno <james.otieno@theiagen.com>
Date: Fri, 18 Aug 2023 19:08:35 +0000
Subject: [PATCH 27/36] add placeholder read_string file outputs

---
 tasks/taxon_id/task_nextclade_addSamples.wdl | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tasks/taxon_id/task_nextclade_addSamples.wdl b/tasks/taxon_id/task_nextclade_addSamples.wdl
index c82271572..f017cf0dc 100644
--- a/tasks/taxon_id/task_nextclade_addSamples.wdl
+++ b/tasks/taxon_id/task_nextclade_addSamples.wdl
@@ -69,6 +69,9 @@ task nextclade {
       #File nextclade_json = "~{basename}.nextclade.json"
       #File auspice_json = "~{basename}.nextclade.auspice.json"
       #File nextclade_tsv = "~{basename}.nextclade.tsv"
+      File nextclade_json = read_string("test")
+      File auspice_json = read_string("test")
+      File nextclade_tsv = read_string("test")
       String nextclade_docker = docker
       #File nextclade_ref_tree_json = select_first(["~{reference_tree_json}","reference_tree.json"])
     }

From 635377672257bbcbcb913205a50017e2722f9b36 Mon Sep 17 00:00:00 2001
From: jrotieno <james.otieno@theiagen.com>
Date: Fri, 18 Aug 2023 19:10:48 +0000
Subject: [PATCH 28/36] point file outputs at reference_tree.json

---
 tasks/taxon_id/task_nextclade_addSamples.wdl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tasks/taxon_id/task_nextclade_addSamples.wdl b/tasks/taxon_id/task_nextclade_addSamples.wdl
index f017cf0dc..8a57cd485 100644
--- a/tasks/taxon_id/task_nextclade_addSamples.wdl
+++ b/tasks/taxon_id/task_nextclade_addSamples.wdl
@@ -69,9 +69,9 @@ task nextclade {
       #File nextclade_json = "~{basename}.nextclade.json"
       #File auspice_json = "~{basename}.nextclade.auspice.json"
       #File nextclade_tsv = "~{basename}.nextclade.tsv"
-      File nextclade_json = read_string("test")
-      File auspice_json = read_string("test")
-      File nextclade_tsv = read_string("test")
+      File nextclade_json = "reference_tree.json"
+      File auspice_json = "reference_tree.json"
+      File nextclade_tsv = "reference_tree.json"
       String nextclade_docker = docker
       #File nextclade_ref_tree_json = select_first(["~{reference_tree_json}","reference_tree.json"])
     }

From d4570774127903cb82198610f8af8f063cd55192 Mon Sep 17 00:00:00 2001
From: jrotieno <james.otieno@theiagen.com>
Date: Fri, 18 Aug 2023 19:26:29 +0000
Subject: [PATCH 29/36] re-enable nextclade run using tree_json

---
 tasks/taxon_id/task_nextclade_addSamples.wdl | 41 ++++++++++----------
 1 file changed, 20 insertions(+), 21 deletions(-)

diff --git a/tasks/taxon_id/task_nextclade_addSamples.wdl b/tasks/taxon_id/task_nextclade_addSamples.wdl
index 8a57cd485..a79deadb9 100644
--- a/tasks/taxon_id/task_nextclade_addSamples.wdl
+++ b/tasks/taxon_id/task_nextclade_addSamples.wdl
@@ -40,20 +40,20 @@ task nextclade {
           tree_json="~{reference_tree_json}"
         fi
 
-       # set -e
-       # nextclade run \
-       #     --input-dataset=nextclade_dataset_dir/ \
-       #     ~{"--input-root-seq " + root_sequence} \
-       #     --input-tree ${reference_tree_json} \
-       #     ~{"--input-qc-config " + qc_config_json} \
-       #     ~{"--input-gene-map " + gene_annotations_gff} \
-       #     ~{"--input-pcr-primers " + pcr_primers_csv} \
-       #     ~{"--input-virus-properties " + virus_properties}  \
-       #     --output-json "~{basename}".nextclade.json \
-       #     --output-tsv  "~{basename}".nextclade.tsv \
-       #     --output-tree "~{basename}".nextclade.auspice.json \
-       #     --output-all=. \
-       #     "~{genome_fasta}"
+        set -e
+        nextclade run \
+            --input-dataset=nextclade_dataset_dir/ \
+            ~{"--input-root-seq " + root_sequence} \
+            --input-tree ${tree_json} \
+            ~{"--input-qc-config " + qc_config_json} \
+            ~{"--input-gene-map " + gene_annotations_gff} \
+            ~{"--input-pcr-primers " + pcr_primers_csv} \
+            ~{"--input-virus-properties " + virus_properties}  \
+            --output-json "~{basename}".nextclade.json \
+            --output-tsv  "~{basename}".nextclade.tsv \
+            --output-tree "~{basename}".nextclade.auspice.json \
+            --output-all=. \
+            "~{genome_fasta}"
     >>>
     runtime {
       docker: "~{docker}"
@@ -66,13 +66,12 @@ task nextclade {
     }
     output {
       String nextclade_version = read_string("NEXTCLADE_VERSION")
-      #File nextclade_json = "~{basename}.nextclade.json"
-      #File auspice_json = "~{basename}.nextclade.auspice.json"
-      #File nextclade_tsv = "~{basename}.nextclade.tsv"
-      File nextclade_json = "reference_tree.json"
-      File auspice_json = "reference_tree.json"
-      File nextclade_tsv = "reference_tree.json"
+      File nextclade_json = "~{basename}.nextclade.json"
+      File auspice_json = "~{basename}.nextclade.auspice.json"
+      File nextclade_tsv = "~{basename}.nextclade.tsv"
+      #File nextclade_json = "reference_tree.json"
+      #File auspice_json = "reference_tree.json"
+      #File nextclade_tsv = "reference_tree.json"
       String nextclade_docker = docker
-      #File nextclade_ref_tree_json = select_first(["~{reference_tree_json}","reference_tree.json"])
     }
 }
\ No newline at end of file

From 17ca4557b33f5f95640b0b3a97d316d1342649c9 Mon Sep 17 00:00:00 2001
From: jrotieno <james.otieno@theiagen.com>
Date: Fri, 18 Aug 2023 19:50:06 +0000
Subject: [PATCH 30/36] nextclade: always stage the tree as reference_tree.json and output it

---
 tasks/taxon_id/task_nextclade_addSamples.wdl | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/tasks/taxon_id/task_nextclade_addSamples.wdl b/tasks/taxon_id/task_nextclade_addSamples.wdl
index a79deadb9..07f04b242 100644
--- a/tasks/taxon_id/task_nextclade_addSamples.wdl
+++ b/tasks/taxon_id/task_nextclade_addSamples.wdl
@@ -34,12 +34,13 @@ task nextclade {
         if [ -z "~{reference_tree_json}" ]; then
           echo "Default dataset reference tree JSON will be used"
           cp nextclade_dataset_dir/tree.json reference_tree.json
-          tree_json="reference_tree.json"
         else
           echo "User reference tree JSON will be used"
-          tree_json="~{reference_tree_json}"
+          cp ~{reference_tree_json} reference_tree.json
         fi
 
+        tree_json="reference_tree.json"
+
         set -e
         nextclade run \
             --input-dataset=nextclade_dataset_dir/ \
@@ -69,9 +70,7 @@ task nextclade {
       File nextclade_json = "~{basename}.nextclade.json"
       File auspice_json = "~{basename}.nextclade.auspice.json"
       File nextclade_tsv = "~{basename}.nextclade.tsv"
-      #File nextclade_json = "reference_tree.json"
-      #File auspice_json = "reference_tree.json"
-      #File nextclade_tsv = "reference_tree.json"
       String nextclade_docker = docker
+      File netclade_ref_tree = "reference_tree.json"
     }
 }
\ No newline at end of file

From 88846aa2e109a3c8c7e73c6dbe67eb0af44785d0 Mon Sep 17 00:00:00 2001
From: jrotieno <james.otieno@theiagen.com>
Date: Fri, 18 Aug 2023 19:53:05 +0000
Subject: [PATCH 31/36] rename task_nextclade_addSamples.wdl to task_nextclade_add_ref.wdl

---
 ...task_nextclade_addSamples.wdl => task_nextclade_add_ref.wdl} | 0
 workflows/phylogenetics/wf_nextclade_addToRefTree.wdl           | 2 +-
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename tasks/taxon_id/{task_nextclade_addSamples.wdl => task_nextclade_add_ref.wdl} (100%)

diff --git a/tasks/taxon_id/task_nextclade_addSamples.wdl b/tasks/taxon_id/task_nextclade_add_ref.wdl
similarity index 100%
rename from tasks/taxon_id/task_nextclade_addSamples.wdl
rename to tasks/taxon_id/task_nextclade_add_ref.wdl
diff --git a/workflows/phylogenetics/wf_nextclade_addToRefTree.wdl b/workflows/phylogenetics/wf_nextclade_addToRefTree.wdl
index 616423d38..ffbc66c53 100644
--- a/workflows/phylogenetics/wf_nextclade_addToRefTree.wdl
+++ b/workflows/phylogenetics/wf_nextclade_addToRefTree.wdl
@@ -1,6 +1,6 @@
 version 1.0
 
-import "../../tasks/taxon_id/task_nextclade_addSamples.wdl" as nextclade_analysis
+import "../../tasks/taxon_id/task_nextclade_add_ref.wdl" as nextclade_analysis
 
 workflow nextclade_addToRefTree {
     meta {

From 802e41436c7cc1d6067151039f397db3f1792e39 Mon Sep 17 00:00:00 2001
From: jrotieno <james.otieno@theiagen.com>
Date: Fri, 18 Aug 2023 20:48:36 +0000
Subject: [PATCH 32/36] update nextclade task description

---
 tasks/taxon_id/task_nextclade_add_ref.wdl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tasks/taxon_id/task_nextclade_add_ref.wdl b/tasks/taxon_id/task_nextclade_add_ref.wdl
index 07f04b242..9ffa2faeb 100644
--- a/tasks/taxon_id/task_nextclade_add_ref.wdl
+++ b/tasks/taxon_id/task_nextclade_add_ref.wdl
@@ -2,7 +2,7 @@ version 1.0
 
 task nextclade {
     meta {
-      description: "Nextclade classification of one sample. Leaving optional inputs unspecified will use SARS-CoV-2 defaults."
+      description: "Nextclade task to add samples to either a user specified or a nextclade reference tree."
     }
     input {
       File genome_fasta

From f2a740c6abb2f4fb44c4017168de3ba4c377229d Mon Sep 17 00:00:00 2001
From: jrotieno <james.otieno@theiagen.com>
Date: Sat, 19 Aug 2023 05:50:55 +0000
Subject: [PATCH 33/36] use existing nextclade task

---
 tasks/taxon_id/task_nextclade.wdl             | 75 +++++++++++++++++++
 .../wf_nextclade_addToRefTree.wdl             | 14 ++--
 2 files changed, 82 insertions(+), 7 deletions(-)

diff --git a/tasks/taxon_id/task_nextclade.wdl b/tasks/taxon_id/task_nextclade.wdl
index aace367a4..5a54865a6 100644
--- a/tasks/taxon_id/task_nextclade.wdl
+++ b/tasks/taxon_id/task_nextclade.wdl
@@ -165,4 +165,79 @@ task nextclade_output_parser {
       String nextclade_aa_dels = read_string("NEXTCLADE_AADELS")
       String nextclade_lineage = read_string("NEXTCLADE_LINEAGE")
     }
+}
+
+task nextclade_add_ref { # place query genomes onto a reference tree via "nextclade run"
+    meta {
+      description: "Nextclade task to add samples to either a user specified or a nextclade reference tree."
+    }
+    input {
+      File genome_fasta # query sequences; passed as the positional input of "nextclade run"
+      File? root_sequence # optional; forwarded as --input-root-seq
+      File? reference_tree_json # optional; when unset, the downloaded dataset's tree.json is used
+      File? qc_config_json # optional; forwarded as --input-qc-config
+      File? gene_annotations_gff # optional; forwarded as --input-gene-map
+      File? pcr_primers_csv # optional; forwarded as --input-pcr-primers
+      File? virus_properties # optional; forwarded as --input-virus-properties
+      String docker = "us-docker.pkg.dev/general-theiagen/nextstrain/nextclade:2.14.0"
+      String dataset_name # forwarded as --name to "nextclade dataset get"
+      String? dataset_reference # optional; forwarded as --reference
+      String? dataset_tag # optional; forwarded as --tag
+      Int disk_size = 50 # feeds both disk attributes in the runtime section
+    }
+    String basename = basename(genome_fasta, ".fasta") # output prefix: input file name with a trailing ".fasta" removed
+    command <<<
+        set -e # fail at the first broken step (version capture, dataset download, tree copy, run)
+        NEXTCLADE_VERSION="$(nextclade --version)"
+        echo $NEXTCLADE_VERSION > NEXTCLADE_VERSION
+
+        nextclade dataset get \
+          --name="~{dataset_name}" \
+          ~{"--reference " + dataset_reference} \
+          ~{"--tag " + dataset_tag} \
+          -o nextclade_dataset_dir \
+          --verbose
+
+        # If no reference tree is provided, use the reference tree from the dataset
+        if [ -z "~{reference_tree_json}" ]; then
+          echo "Default dataset reference tree JSON will be used"
+          cp nextclade_dataset_dir/tree.json reference_tree.json
+        else
+          echo "User reference tree JSON will be used"
+          cp "~{reference_tree_json}" reference_tree.json
+        fi
+
+        tree_json="reference_tree.json"
+
+        nextclade run \
+            --input-dataset=nextclade_dataset_dir/ \
+            ~{"--input-root-seq " + root_sequence} \
+            --input-tree "${tree_json}" \
+            ~{"--input-qc-config " + qc_config_json} \
+            ~{"--input-gene-map " + gene_annotations_gff} \
+            ~{"--input-pcr-primers " + pcr_primers_csv} \
+            ~{"--input-virus-properties " + virus_properties}  \
+            --output-json "~{basename}".nextclade.json \
+            --output-tsv  "~{basename}".nextclade.tsv \
+            --output-tree "~{basename}".nextclade.auspice.json \
+            --output-all=. \
+            "~{genome_fasta}"
+    >>>
+    runtime {
+      docker: "~{docker}"
+      memory: "8 GB"
+      cpu: 2
+      disks:  "local-disk " + disk_size + " SSD"
+      disk: disk_size + " GB" # TES
+      dx_instance_type: "mem1_ssd1_v2_x2"
+      maxRetries: 3
+    }
+    output {
+      String nextclade_version = read_string("NEXTCLADE_VERSION") # text written by "nextclade --version" above
+      File nextclade_json = "~{basename}.nextclade.json" # written via --output-json
+      File auspice_json = "~{basename}.nextclade.auspice.json" # written via --output-tree
+      File nextclade_tsv = "~{basename}.nextclade.tsv" # written via --output-tsv
+      String nextclade_docker = docker
+      File netclade_ref_tree = "reference_tree.json" # tree handed to --input-tree; "netclade" spelling kept so the output name does not change
+    }
 }
\ No newline at end of file
diff --git a/workflows/phylogenetics/wf_nextclade_addToRefTree.wdl b/workflows/phylogenetics/wf_nextclade_addToRefTree.wdl
index ffbc66c53..3deb1eee1 100644
--- a/workflows/phylogenetics/wf_nextclade_addToRefTree.wdl
+++ b/workflows/phylogenetics/wf_nextclade_addToRefTree.wdl
@@ -1,6 +1,6 @@
 version 1.0
 
-import "../../tasks/taxon_id/task_nextclade_add_ref.wdl" as nextclade_analysis
+import "../../tasks/taxon_id/task_nextclade.wdl" as nextclade_analysis
 
 workflow nextclade_addToRefTree {
     meta {
@@ -18,7 +18,7 @@ workflow nextclade_addToRefTree {
       String? dataset_reference
       String? dataset_tag
     }
-    call nextclade_analysis.nextclade { # nextclade analysis
+    call nextclade_analysis.nextclade_add_ref { # nextclade analysis
       input:
         genome_fasta = assembly_fastas,
         root_sequence = root_sequence_fasta,
@@ -32,10 +32,10 @@ workflow nextclade_addToRefTree {
         dataset_tag = dataset_tag
     }
     output {
-      String treeUpdate_nextclade_version = select_first([nextclade.nextclade_version, ""])
-      File treeUpdate_nextclade_json = select_first([nextclade.nextclade_json, ""])
-      File treeUpdate_auspice_json = select_first([nextclade.auspice_json, ""])
-      File treeUpdate_nextclade_tsv = select_first([nextclade.nextclade_tsv, ""])
-      String treeUpdate_nextclade_docker = select_first([nextclade.nextclade_docker, ""])
+      String treeUpdate_nextclade_version = select_first([nextclade_add_ref.nextclade_version, ""])
+      File treeUpdate_nextclade_json = select_first([nextclade_add_ref.nextclade_json, ""])
+      File treeUpdate_auspice_json = select_first([nextclade_add_ref.auspice_json, ""])
+      File treeUpdate_nextclade_tsv = select_first([nextclade_add_ref.nextclade_tsv, ""])
+      String treeUpdate_nextclade_docker = select_first([nextclade_add_ref.nextclade_docker, ""])
     }
 }
\ No newline at end of file

From f38024684b1638da4cf71376737f0a92a438b73e Mon Sep 17 00:00:00 2001
From: jrotieno <james.otieno@theiagen.com>
Date: Sat, 19 Aug 2023 06:09:02 +0000
Subject: [PATCH 34/36] delete temp task

---
 tasks/taxon_id/task_nextclade_add_ref.wdl | 76 -----------------------
 1 file changed, 76 deletions(-)
 delete mode 100644 tasks/taxon_id/task_nextclade_add_ref.wdl

diff --git a/tasks/taxon_id/task_nextclade_add_ref.wdl b/tasks/taxon_id/task_nextclade_add_ref.wdl
deleted file mode 100644
index 9ffa2faeb..000000000
--- a/tasks/taxon_id/task_nextclade_add_ref.wdl
+++ /dev/null
@@ -1,76 +0,0 @@
-version 1.0
-
-task nextclade {
-    meta {
-      description: "Nextclade task to add samples to either a user specified or a nextclade reference tree."
-    }
-    input {
-      File genome_fasta
-      File? root_sequence
-      File? reference_tree_json
-      File? qc_config_json
-      File? gene_annotations_gff
-      File? pcr_primers_csv
-      File? virus_properties
-      String docker = "us-docker.pkg.dev/general-theiagen/nextstrain/nextclade:2.14.0"
-      String dataset_name
-      String? dataset_reference
-      String? dataset_tag
-      Int disk_size = 50
-    }
-    String basename = basename(genome_fasta, ".fasta")
-    command <<<
-        NEXTCLADE_VERSION="$(nextclade --version)"
-        echo $NEXTCLADE_VERSION > NEXTCLADE_VERSION
-
-        nextclade dataset get \
-          --name="~{dataset_name}" \
-          ~{"--reference " + dataset_reference} \
-          ~{"--tag " + dataset_tag} \
-          -o nextclade_dataset_dir \
-          --verbose
-        
-        # If no referece sequence is provided, use the reference tree from the dataset
-        if [ -z "~{reference_tree_json}" ]; then
-          echo "Default dataset reference tree JSON will be used"
-          cp nextclade_dataset_dir/tree.json reference_tree.json
-        else
-          echo "User reference tree JSON will be used"
-          cp ~{reference_tree_json} reference_tree.json
-        fi
-
-        tree_json="reference_tree.json"
-
-        set -e
-        nextclade run \
-            --input-dataset=nextclade_dataset_dir/ \
-            ~{"--input-root-seq " + root_sequence} \
-            --input-tree ${tree_json} \
-            ~{"--input-qc-config " + qc_config_json} \
-            ~{"--input-gene-map " + gene_annotations_gff} \
-            ~{"--input-pcr-primers " + pcr_primers_csv} \
-            ~{"--input-virus-properties " + virus_properties}  \
-            --output-json "~{basename}".nextclade.json \
-            --output-tsv  "~{basename}".nextclade.tsv \
-            --output-tree "~{basename}".nextclade.auspice.json \
-            --output-all=. \
-            "~{genome_fasta}"
-    >>>
-    runtime {
-      docker: "~{docker}"
-      memory: "8 GB"
-      cpu: 2
-      disks:  "local-disk " + disk_size + " SSD"
-      disk: disk_size + " GB" # TES
-      dx_instance_type: "mem1_ssd1_v2_x2"
-      maxRetries: 3 
-    }
-    output {
-      String nextclade_version = read_string("NEXTCLADE_VERSION")
-      File nextclade_json = "~{basename}.nextclade.json"
-      File auspice_json = "~{basename}.nextclade.auspice.json"
-      File nextclade_tsv = "~{basename}.nextclade.tsv"
-      String nextclade_docker = docker
-      File netclade_ref_tree = "reference_tree.json"
-    }
-}
\ No newline at end of file

From 45fdc304f77e4ea68eb5458eb4cfd590b4a67154 Mon Sep 17 00:00:00 2001
From: jrotieno <james.otieno@theiagen.com>
Date: Thu, 21 Sep 2023 14:57:40 +0000
Subject: [PATCH 35/36] changing input "dataset_name" to "organism" to be more
 intuitive

---
 workflows/phylogenetics/wf_nextclade_addToRefTree.wdl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/workflows/phylogenetics/wf_nextclade_addToRefTree.wdl b/workflows/phylogenetics/wf_nextclade_addToRefTree.wdl
index 3deb1eee1..80678001f 100644
--- a/workflows/phylogenetics/wf_nextclade_addToRefTree.wdl
+++ b/workflows/phylogenetics/wf_nextclade_addToRefTree.wdl
@@ -14,7 +14,7 @@ workflow nextclade_addToRefTree {
       File? qc_config_json
       File? pcr_primers_csv
       File? virus_properties
-      String dataset_name
+      String organism
       String? dataset_reference
       String? dataset_tag
     }
@@ -27,7 +27,7 @@ workflow nextclade_addToRefTree {
         gene_annotations_gff = gene_annotations_gff,
         pcr_primers_csv = pcr_primers_csv,
         virus_properties = virus_properties,
-        dataset_name = dataset_name,
+        dataset_name = organism,
         dataset_reference = dataset_reference,
         dataset_tag = dataset_tag
     }

From ef3e182ae72eb8fe8a4c37003ed982fb65a8bda7 Mon Sep 17 00:00:00 2001
From: jrotieno <james.otieno@theiagen.com>
Date: Thu, 21 Sep 2023 15:11:50 +0000
Subject: [PATCH 36/36] adding wf date and version capture

---
 workflows/phylogenetics/wf_nextclade_addToRefTree.wdl | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/workflows/phylogenetics/wf_nextclade_addToRefTree.wdl b/workflows/phylogenetics/wf_nextclade_addToRefTree.wdl
index 80678001f..bd832ba98 100644
--- a/workflows/phylogenetics/wf_nextclade_addToRefTree.wdl
+++ b/workflows/phylogenetics/wf_nextclade_addToRefTree.wdl
@@ -1,6 +1,7 @@
 version 1.0
 
 import "../../tasks/taxon_id/task_nextclade.wdl" as nextclade_analysis
+import "../../tasks/task_versioning.wdl" as versioning
 
 workflow nextclade_addToRefTree {
     meta {
@@ -31,11 +32,17 @@ workflow nextclade_addToRefTree {
         dataset_reference = dataset_reference,
         dataset_tag = dataset_tag
     }
+    call versioning.version_capture{
+      input:
+    }
     output {
       String treeUpdate_nextclade_version = select_first([nextclade_add_ref.nextclade_version, ""])
       File treeUpdate_nextclade_json = select_first([nextclade_add_ref.nextclade_json, ""])
       File treeUpdate_auspice_json = select_first([nextclade_add_ref.auspice_json, ""])
       File treeUpdate_nextclade_tsv = select_first([nextclade_add_ref.nextclade_tsv, ""])
       String treeUpdate_nextclade_docker = select_first([nextclade_add_ref.nextclade_docker, ""])
+      # Version Capture
+      String samples_to_ref_tree_version = version_capture.phb_version
+      String samples_to_ref_tree_analysis_date = version_capture.date
     }
 }
\ No newline at end of file