From 7b8b018b164556d98a3640ef1edc6991fccd812e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?In=C3=AAs=20Mendes?= <iines.mendes@gmail.com>
Date: Fri, 18 Oct 2024 20:33:53 +0100
Subject: [PATCH] [TheiaCoV ONT and Clearlabs] Update consensus task container
 to artic:1.2.4-1.12.0 (#636)

* update container to artic:1.2.4-1.12.0

* semi-update CI

* clean up CI to remove intermediary files

* update docs

---------

Co-authored-by: Sage Wright <sage.wright@theiagen.com>
---
 .../genomic_characterization/theiacov.md      | 48 ++++++++++++++++++-
 tasks/assembly/task_artic_consensus.wdl       | 10 +++-
 .../theiacov/test_wf_theiacov_clearlabs.yml   |  2 +-
 .../theiacov/test_wf_theiacov_ont.yml         | 13 ++---
 4 files changed, 58 insertions(+), 15 deletions(-)

diff --git a/docs/workflows/genomic_characterization/theiacov.md b/docs/workflows/genomic_characterization/theiacov.md
index 58c92da2b..740d82524 100644
--- a/docs/workflows/genomic_characterization/theiacov.md
+++ b/docs/workflows/genomic_characterization/theiacov.md
@@ -126,8 +126,8 @@ All TheiaCoV Workflows (not TheiaCoV_FASTA_Batch)
 | clean_check_reads | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 2 | Optional | ONT, PE, SE | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 |
 | consensus | **cpu** | Int | Number of CPUs to allocate to the task | 8 | Optional | CL, ONT | sars-cov-2 |
 | consensus | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | CL, ONT | sars-cov-2 |
-| consensus | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/artic-ncov2019-epi2me | Optional | ONT | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 |
-| consensus | **medaka_model** | String | In order to obtain the best results, the appropriate model must be set to match the sequencer's basecaller model; this string takes the format of {pore}_{device}_{caller variant}_{caller_version}. See also https://github.com/nanoporetech/medaka?tab=readme-ov-file#models. | r941_min_high_g360 | Optional | CL, ONT | sars-cov-2 |
+| consensus | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/artic:1.2.4-1.12.0 | Optional | CL, ONT | HIV, MPXV, WNV, flu, rsv_a, rsv_b, sars-cov-2 |
+| consensus | **medaka_model** | String | In order to obtain the best results, the appropriate model must be set to match the sequencer's basecaller model; this string takes the format of {pore}_{device}_{caller_variant}_{caller_version}. See the list of available models in the `artic_consensus` documentation section. See also https://github.com/nanoporetech/medaka?tab=readme-ov-file#models. | r941_min_high_g360 | Optional | CL, ONT | sars-cov-2 |
 | consensus | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 16 | Optional | CL, ONT | sars-cov-2 |
 | consensus_qc | **cpu** | Int | Number of CPUs to allocate to the task | 1 | Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, rsv_a, rsv_b, sars-cov-2 |
 | consensus_qc | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | CL, FASTA, ONT, PE, SE | HIV, MPXV, WNV, rsv_a, rsv_b, sars-cov-2 |
@@ -800,6 +800,50 @@ All input reads are processed through "core tasks" in the TheiaCoV Illumina, ONT
 
     !!! info ""
         Read-trimming is performed on raw read data generated on the ClearLabs instrument and thus not a required step in the TheiaCoV_ClearLabs workflow.
+
+    ??? toggle "Available `medaka` models"
+        The medaka models available in the default docker container are as follows:
+
+        ``` text
+        r103_fast_g507, r103_fast_snp_g507, r103_fast_variant_g507, r103_hac_g507,
+        r103_hac_snp_g507, r103_hac_variant_g507, r103_min_high_g345, r103_min_high_g360,
+        r103_prom_high_g360, r103_prom_snp_g3210, r103_prom_variant_g3210, r103_sup_g507,
+        r103_sup_snp_g507, r103_sup_variant_g507, r1041_e82_260bps_fast_g632,
+        r1041_e82_260bps_fast_variant_g632, r1041_e82_260bps_hac_g632,
+        r1041_e82_260bps_hac_v4.0.0, r1041_e82_260bps_hac_v4.1.0,
+        r1041_e82_260bps_hac_variant_g632, r1041_e82_260bps_hac_variant_v4.1.0,
+        r1041_e82_260bps_joint_apk_ulk_v5.0.0, r1041_e82_260bps_sup_g632,
+        r1041_e82_260bps_sup_v4.0.0, r1041_e82_260bps_sup_v4.1.0,
+        r1041_e82_260bps_sup_variant_g632, r1041_e82_260bps_sup_variant_v4.1.0,
+        r1041_e82_400bps_fast_g615, r1041_e82_400bps_fast_g632,
+        r1041_e82_400bps_fast_variant_g615, r1041_e82_400bps_fast_variant_g632,
+        r1041_e82_400bps_hac_g615, r1041_e82_400bps_hac_g632, r1041_e82_400bps_hac_v4.0.0,
+        r1041_e82_400bps_hac_v4.1.0, r1041_e82_400bps_hac_v4.2.0, r1041_e82_400bps_hac_v4.3.0,
+        r1041_e82_400bps_hac_v5.0.0, r1041_e82_400bps_hac_variant_g615,
+        r1041_e82_400bps_hac_variant_g632, r1041_e82_400bps_hac_variant_v4.1.0,
+        r1041_e82_400bps_hac_variant_v4.2.0, r1041_e82_400bps_hac_variant_v4.3.0,
+        r1041_e82_400bps_hac_variant_v5.0.0, r1041_e82_400bps_sup_g615,
+        r1041_e82_400bps_sup_v4.0.0, r1041_e82_400bps_sup_v4.1.0, r1041_e82_400bps_sup_v4.2.0,
+        r1041_e82_400bps_sup_v4.3.0, r1041_e82_400bps_sup_v5.0.0,
+        r1041_e82_400bps_sup_variant_g615, r1041_e82_400bps_sup_variant_v4.1.0,
+        r1041_e82_400bps_sup_variant_v4.2.0, r1041_e82_400bps_sup_variant_v4.3.0,
+        r1041_e82_400bps_sup_variant_v5.0.0, r104_e81_fast_g5015, r104_e81_fast_variant_g5015,
+        r104_e81_hac_g5015, r104_e81_hac_variant_g5015, r104_e81_sup_g5015, r104_e81_sup_g610,
+        r104_e81_sup_variant_g610, r10_min_high_g303, r10_min_high_g340, r941_e81_fast_g514,
+        r941_e81_fast_variant_g514, r941_e81_hac_g514, r941_e81_hac_variant_g514,
+        r941_e81_sup_g514, r941_e81_sup_variant_g514, r941_min_fast_g303, r941_min_fast_g507,
+        r941_min_fast_snp_g507, r941_min_fast_variant_g507, r941_min_hac_g507,
+        r941_min_hac_snp_g507, r941_min_hac_variant_g507, r941_min_high_g303, r941_min_high_g330,
+        r941_min_high_g340_rle, r941_min_high_g344, r941_min_high_g351, r941_min_high_g360,
+        r941_min_sup_g507, r941_min_sup_snp_g507, r941_min_sup_variant_g507, r941_prom_fast_g303,
+        r941_prom_fast_g507, r941_prom_fast_snp_g507, r941_prom_fast_variant_g507,
+        r941_prom_hac_g507, r941_prom_hac_snp_g507, r941_prom_hac_variant_g507,
+        r941_prom_high_g303, r941_prom_high_g330, r941_prom_high_g344, r941_prom_high_g360,
+        r941_prom_high_g4011, r941_prom_snp_g303, r941_prom_snp_g322, r941_prom_snp_g360,
+        r941_prom_sup_g507, r941_prom_sup_snp_g507, r941_prom_sup_variant_g507,
+        r941_prom_variant_g303, r941_prom_variant_g322, r941_prom_variant_g360,
+        r941_sup_plant_g610, r941_sup_plant_variant_g610
+        ```
 
     General statistics about the assembly are generated with the `consensus_qc` task ([task_assembly_metrics.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/basic_statistics/task_assembly_metrics.wdl)).
 
diff --git a/tasks/assembly/task_artic_consensus.wdl b/tasks/assembly/task_artic_consensus.wdl
index 6e38334a1..8e2d174db 100644
--- a/tasks/assembly/task_artic_consensus.wdl
+++ b/tasks/assembly/task_artic_consensus.wdl
@@ -12,7 +12,7 @@ task consensus {
     Int memory = 16
     Int disk_size = 100
     String medaka_model = "r941_min_high_g360"
-    String docker = "us-docker.pkg.dev/general-theiagen/staphb/artic-ncov2019-epi2me"
+    String docker = "us-docker.pkg.dev/general-theiagen/staphb/artic:1.2.4-1.12.0"
   }
   String primer_name = basename(primer_bed)
   command <<<
@@ -61,7 +61,13 @@ task consensus {
     # version control
     echo "Medaka via $(artic -v)" | tee VERSION
     echo "~{primer_name}" | tee PRIMER_NAME
-    artic minion --medaka --medaka-model ~{medaka_model} --normalise ~{normalise} --threads ~{cpu} --scheme-directory ./primer-schemes --read-file ~{read1} ${scheme_name} ~{samplename}
+    artic minion \
+      --medaka \
+      --medaka-model ~{medaka_model} \
+      --normalise ~{normalise} \
+      --threads ~{cpu} \
+      --scheme-directory ./primer-schemes \
+      --read-file ~{read1} ${scheme_name} ~{samplename}
     gunzip -f ~{samplename}.pass.vcf.gz
 
     # clean up fasta header
diff --git a/tests/workflows/theiacov/test_wf_theiacov_clearlabs.yml b/tests/workflows/theiacov/test_wf_theiacov_clearlabs.yml
index 48ffe30c9..599fec45e 100644
--- a/tests/workflows/theiacov/test_wf_theiacov_clearlabs.yml
+++ b/tests/workflows/theiacov/test_wf_theiacov_clearlabs.yml
@@ -17,7 +17,7 @@
     - wf_theiacov_clearlabs_miniwdl
   files:
     - path: miniwdl_run/call-consensus/command
-      md5sum: a8e200703dedf732b45dd92b0af15f1c
+      md5sum: b19d5ce485c612036064c07f0a1d6a18
     - path: miniwdl_run/call-consensus/inputs.json
       contains: ["read1", "samplename", "fastq"]
     - path: miniwdl_run/call-consensus/outputs.json
diff --git a/tests/workflows/theiacov/test_wf_theiacov_ont.yml b/tests/workflows/theiacov/test_wf_theiacov_ont.yml
index 03027a794..6077323b9 100644
--- a/tests/workflows/theiacov/test_wf_theiacov_ont.yml
+++ b/tests/workflows/theiacov/test_wf_theiacov_ont.yml
@@ -31,7 +31,7 @@
     - path: miniwdl_run/call-clean_check_reads/work/_miniwdl_inputs/0/artic_ncov2019_ont.fastq
       md5sum: d41d8cd98f00b204e9800998ecf8427e
     - path: miniwdl_run/call-consensus/command
-      md5sum: 056563d18294928fef5238bac7213791
+      md5sum: 362dccda19ecadf377d5cd5872946ddd
     - path: miniwdl_run/call-consensus/inputs.json
       contains: ["read1_clean", "samplename", "fastq"]
     - path: miniwdl_run/call-consensus/outputs.json
@@ -45,7 +45,7 @@
     - path: miniwdl_run/call-consensus/work/REFERENCE_GENOME
       md5sum: 0e6efd549c8773f9a2f7a3e82619ee61
     - path: miniwdl_run/call-consensus/work/VERSION
-      md5sum: f3528ff85409c70100063c55ad75612b
+      md5sum: 394e07bc6788e025ac35254411db107c
     - path: miniwdl_run/call-consensus/work/_miniwdl_inputs/0/artic-v3.primers.bed
       md5sum: d41d8cd98f00b204e9800998ecf8427e
     - path: miniwdl_run/call-consensus/work/_miniwdl_inputs/0/artic_ncov2019_ont.fastq
@@ -64,8 +64,6 @@
     - path: miniwdl_run/call-consensus/work/ont.fastq.gz
     - path: miniwdl_run/call-consensus/work/ont.medaka.consensus.fasta
       md5sum: d36b7c665aa4127f0a6e8dbc562eea3e
-    - path: miniwdl_run/call-consensus/work/ont.merged.gvcf.vcf.gz
-    - path: miniwdl_run/call-consensus/work/ont.merged.gvcf.vcf.gz.tbi
     - path: miniwdl_run/call-consensus/work/ont.merged.vcf.gz
     - path: miniwdl_run/call-consensus/work/ont.merged.vcf.gz.tbi
     - path: miniwdl_run/call-consensus/work/ont.minion.log.txt
@@ -73,20 +71,15 @@
     - path: miniwdl_run/call-consensus/work/ont.pass.vcf.gz.tbi
     - path: miniwdl_run/call-consensus/work/ont.preconsensus.fasta
       md5sum: b68f4ee4abc9fc16215204d0ff754bb8
-    - path: miniwdl_run/call-consensus/work/ont.preconsensus.fasta.fai
-      md5sum: 4ca7d9fd06b9cdf379c2cf02b9fd6d0e
     - path: miniwdl_run/call-consensus/work/ont.primers.vcf
     - path: miniwdl_run/call-consensus/work/ont.primersitereport.txt
-      md5sum: cffee67632a262eeb947cea9cee0b4c1
+      md5sum: dab514423a8fb7b59ab7870ad8c3b4cf
     - path: miniwdl_run/call-consensus/work/ont.primertrimmed.rg.sorted.bam
     - path: miniwdl_run/call-consensus/work/ont.primertrimmed.rg.sorted.bam.bai
     - path: miniwdl_run/call-consensus/work/ont.sorted.bam
     - path: miniwdl_run/call-consensus/work/ont.sorted.bam.bai
     - path: miniwdl_run/call-consensus/work/ont.trimmed.rg.sorted.bam
     - path: miniwdl_run/call-consensus/work/ont.trimmed.rg.sorted.bam.bai
-    - path: miniwdl_run/call-consensus/work/ont.vcfcheck.log
-    - path: miniwdl_run/call-consensus/work/ont.vcfreport.txt
-      md5sum: 69131186223267b3ae6621cb8ef4eecd
     - path: miniwdl_run/call-consensus/work/primer-schemes/SARS-CoV-2/Vuser/SARS-CoV-2.reference.fasta
       md5sum: b9b67235a2d9d0b0d7f531166ffefd41
     - path: miniwdl_run/call-consensus/work/primer-schemes/SARS-CoV-2/Vuser/SARS-CoV-2.reference.fasta.fai