From 2c627f93817a65f2bb2e6cfc542e928e7a103270 Mon Sep 17 00:00:00 2001
From: Curtis Kapsak <kapsakcj@gmail.com>
Date: Thu, 17 Oct 2024 16:04:01 -0400
Subject: [PATCH] [TheiaCov & TheiaProk & TheiaEuk] read screen ONT bugfix and
 improvements (#650)

* update read screen task to count number of bases in a FASTQ using fastq-scan instead of bash tools. tested successfully w/ miniwdl

* check_reads and check_reads_se tasks: added set-euo pipefail to both; updated method for counting reads and basepairs to be more robust and added debug statements; added debug statements for estimated_genome_length. tested successfully w miniwdl

* update CI

* update docs for theiacov/euk/prok
---
 .../genomic_characterization/theiacov.md      |  2 +-
 .../genomic_characterization/theiaeuk.md      |  2 +-
 .../genomic_characterization/theiaprok.md     |  2 +-
 .../comparisons/task_screen.wdl               | 51 ++++++++++---------
 .../theiacov/test_wf_theiacov_illumina_pe.yml |  2 +-
 .../theiacov/test_wf_theiacov_illumina_se.yml |  2 +-
 .../theiacov/test_wf_theiacov_ont.yml         |  4 +-
 .../test_wf_theiaprok_illumina_pe.yml         |  6 +--
 .../test_wf_theiaprok_illumina_se.yml         | 12 ++---
 9 files changed, 42 insertions(+), 41 deletions(-)

diff --git a/docs/workflows/genomic_characterization/theiacov.md b/docs/workflows/genomic_characterization/theiacov.md
index a97f85246..58c92da2b 100644
--- a/docs/workflows/genomic_characterization/theiacov.md
+++ b/docs/workflows/genomic_characterization/theiacov.md
@@ -612,7 +612,7 @@ All input reads are processed through "core tasks" in the TheiaCoV Illumina, ONT
 
 ??? task "`screen`: Total Raw Read Quantification and Genome Size Estimation"
 
-    The [`screen`](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/comparisons/task_screen.wdl) task ensures the quantity of sequence data is sufficient to undertake genomic analysis. It uses bash commands for quantification of reads and base pairs, and [mash](https://mash.readthedocs.io/en/latest/index.html) sketching to estimate the genome size and its coverage. At each step, the results are assessed relative to pass/fail criteria and thresholds that may be defined by optional user inputs. Samples that do not meet these criteria will not be processed further by the workflow:
+    The [`screen`](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/comparisons/task_screen.wdl) task ensures the quantity of sequence data is sufficient to undertake genomic analysis. It uses [`fastq-scan`](https://github.com/rpetit3/fastq-scan) and bash commands for quantification of reads and base pairs, and [mash](https://mash.readthedocs.io/en/latest/index.html) sketching to estimate the genome size and its coverage. At each step, the results are assessed relative to pass/fail criteria and thresholds that may be defined by optional user inputs. Samples that do not meet these criteria will not be processed further by the workflow:
 
     1. Total number of reads: A sample will fail the read screening task if its total number of reads is less than or equal to `min_reads`.
     2. The proportion of basepairs reads in the forward and reverse read files: A sample will fail the read screening if fewer than `min_proportion` basepairs are in either the reads1 or read2 files.
diff --git a/docs/workflows/genomic_characterization/theiaeuk.md b/docs/workflows/genomic_characterization/theiaeuk.md
index a594dce3b..37957d82f 100644
--- a/docs/workflows/genomic_characterization/theiaeuk.md
+++ b/docs/workflows/genomic_characterization/theiaeuk.md
@@ -187,7 +187,7 @@ All input reads are processed through "core tasks" in each workflow. The core ta
 
 ??? task "`screen`: Total Raw Read Quantification and Genome Size Estimation"
 
-    The [`screen`](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/comparisons/task_screen.wdl) task ensures the quantity of sequence data is sufficient to undertake genomic analysis. It uses bash commands for quantification of reads and base pairs, and [mash](https://mash.readthedocs.io/en/latest/index.html) sketching to estimate the genome size and its coverage. At each step, the results are assessed relative to pass/fail criteria and thresholds that may be defined by optional user inputs. Samples that do not meet these criteria will not be processed further by the workflow:
+    The [`screen`](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/comparisons/task_screen.wdl) task ensures the quantity of sequence data is sufficient to undertake genomic analysis. It uses [`fastq-scan`](https://github.com/rpetit3/fastq-scan) and bash commands for quantification of reads and base pairs, and [mash](https://mash.readthedocs.io/en/latest/index.html) sketching to estimate the genome size and its coverage. At each step, the results are assessed relative to pass/fail criteria and thresholds that may be defined by optional user inputs. Samples that do not meet these criteria will not be processed further by the workflow:
 
     1. Total number of reads: A sample will fail the read screening task if its total number of reads is less than or equal to `min_reads`.
     2. The proportion of basepairs reads in the forward and reverse read files: A sample will fail the read screening if fewer than `min_proportion` basepairs are in either the reads1 or read2 files.
diff --git a/docs/workflows/genomic_characterization/theiaprok.md b/docs/workflows/genomic_characterization/theiaprok.md
index 3e9329a3b..1d55ae302 100644
--- a/docs/workflows/genomic_characterization/theiaprok.md
+++ b/docs/workflows/genomic_characterization/theiaprok.md
@@ -586,7 +586,7 @@ All input reads are processed through "[core tasks](#core-tasks-performed-for-al
 
 ??? task "`screen`: Total Raw Read Quantification and Genome Size Estimation"
 
-    The [`screen`](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/comparisons/task_screen.wdl) task ensures the quantity of sequence data is sufficient to undertake genomic analysis. It uses bash commands for quantification of reads and base pairs, and [mash](https://mash.readthedocs.io/en/latest/index.html) sketching to estimate the genome size and its coverage. At each step, the results are assessed relative to pass/fail criteria and thresholds that may be defined by optional user inputs. Samples that do not meet these criteria will not be processed further by the workflow:
+    The [`screen`](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/comparisons/task_screen.wdl) task ensures the quantity of sequence data is sufficient to undertake genomic analysis. It uses [`fastq-scan`](https://github.com/rpetit3/fastq-scan) and bash commands for quantification of reads and base pairs, and [mash](https://mash.readthedocs.io/en/latest/index.html) sketching to estimate the genome size and its coverage. At each step, the results are assessed relative to pass/fail criteria and thresholds that may be defined by optional user inputs. Samples that do not meet these criteria will not be processed further by the workflow:
 
     1. Total number of reads: A sample will fail the read screening task if its total number of reads is less than or equal to `min_reads`.
     2. The proportion of basepairs reads in the forward and reverse read files: A sample will fail the read screening if fewer than `min_proportion` basepairs are in either the reads1 or read2 files.
diff --git a/tasks/quality_control/comparisons/task_screen.wdl b/tasks/quality_control/comparisons/task_screen.wdl
index 8cc6f58e0..3934d0b7a 100644
--- a/tasks/quality_control/comparisons/task_screen.wdl
+++ b/tasks/quality_control/comparisons/task_screen.wdl
@@ -20,6 +20,9 @@ task check_reads {
     Int cpu = 1
   }
   command <<<
+    # just in case anything fails, throw an error
+    set -euo pipefail
+    
     flag="PASS"
 
     # initalize estimated genome length
@@ -34,13 +37,13 @@ task check_reads {
       fi
 
       # check one: number of reads
-      read1_num=`eval "$cat_reads ~{read1}" | awk '{s++}END{print s/4}'`
-      read2_num=`eval "$cat_reads ~{read2}" | awk '{s++}END{print s/4}'`
-      # awk '{s++}END{print s/4' counts the number of lines and divides them by 4
-      # key assumption: in fastq there will be four lines per read
-      # sometimes fastqs do not have 4 lines per read, so this might fail one day
+      read1_num=$($cat_reads ~{read1} | fastq-scan | grep 'read_total' | sed 's/[^0-9]*\([0-9]\+\).*/\1/')
+      read2_num=$($cat_reads ~{read2} | fastq-scan | grep 'read_total' | sed 's/[^0-9]*\([0-9]\+\).*/\1/')
+      echo "DEBUG: Number of reads in R1: ${read1_num}"
+      echo "DEBUG: Number of reads in R2: ${read2_num}"
 
       reads_total=$(expr $read1_num + $read2_num)
+      echo "DEBUG: Number of reads total in R1 and R2: ${reads_total}"
 
       if [ "${reads_total}" -le "~{min_reads}" ]; then
         flag="FAIL; the total number of reads is below the minimum of ~{min_reads}"
@@ -51,13 +54,11 @@ task check_reads {
       # checks two and three: number of basepairs and proportion of sequence
       if [ "${flag}" == "PASS" ]; then
         # count number of basepairs
-        # this only works if the fastq has 4 lines per read, so this might fail one day
-        read1_bp=`eval "${cat_reads} ~{read1}" | paste - - - - | cut -f2 | tr -d '\n' | wc -c`
-        read2_bp=`eval "${cat_reads} ~{read2}" | paste - - - - | cut -f2 | tr -d '\n' | wc -c`
-        # paste - - - - (print 4 consecutive lines in one row, tab delimited)
-        # cut -f2 print only the second column (the second line of the fastq 4-line)
-        # tr -d '\n' removes line endings
-        # wc -c counts characters
+        # using fastq-scan to count the number of basepairs in each fastq
+        read1_bp=$(eval "${cat_reads} ~{read1}" | fastq-scan | grep 'total_bp' | sed 's/[^0-9]*\([0-9]\+\).*/\1/')
+        read2_bp=$(eval "${cat_reads} ~{read2}" | fastq-scan | grep 'total_bp' | sed 's/[^0-9]*\([0-9]\+\).*/\1/')
+        echo "DEBUG: Number of basepairs in R1: $read1_bp"
+        echo "DEBUG: Number of basepairs in R2: $read2_bp"
 
         # set proportion variables for easy comparison
         # removing the , 2) to make these integers instead of floats
@@ -147,7 +148,8 @@ task check_reads {
             flag="FAIL; the estimated coverage (${estimated_coverage}) is less than the minimum of ~{min_coverage}x"
           else
             flag="PASS"
-            echo $estimated_genome_length | tee EST_GENOME_LENGTH
+            echo ${estimated_genome_length} | tee EST_GENOME_LENGTH
+            echo "DEBUG: estimated_genome_length: ${estimated_genome_length}"
           fi 
         fi
       fi 
@@ -190,6 +192,9 @@ task check_reads_se {
     Int cpu = 1
   }
   command <<<
+    # just in case anything fails, throw an error
+    set -euo pipefail
+
     flag="PASS"
 
     # initalize estimated genome length
@@ -203,11 +208,9 @@ task check_reads_se {
         cat_reads="cat"
       fi
 
-      # check one: number of reads
-      read1_num=`eval "$cat_reads ~{read1}" | awk '{s++}END{print s/4}'`
-      # awk '{s++}END{print s/4' counts the number of lines and divides them by 4
-      # key assumption: in fastq there will be four lines per read
-      # sometimes fastqs do not have 4 lines per read, so this might fail one day
+      # check one: number of reads via fastq-scan
+      read1_num=$($cat_reads ~{read1} | fastq-scan | grep 'read_total' | sed 's/[^0-9]*\([0-9]\+\).*/\1/')
+      echo "DEBUG: Number of reads in R1: ${read1_num}"
 
       if [ "${read1_num}" -le "~{min_reads}" ] ; then
         flag="FAIL; the number of reads (${read1_num}) is below the minimum of ~{min_reads}"
@@ -218,12 +221,9 @@ task check_reads_se {
       # checks two and three: number of basepairs and proportion of sequence
       if [ "${flag}" == "PASS" ]; then
         # count number of basepairs
-        # this only works if the fastq has 4 lines per read, so this might fail one day
-        read1_bp=`eval "${cat_reads} ~{read1}" | paste - - - - | cut -f2 | tr -d '\n' | wc -c`
-        # paste - - - - (print 4 consecutive lines in one row, tab delimited)
-        # cut -f2 print only the second column (the second line of the fastq 4-line)
-        # tr -d '\n' removes line endings
-        # wc -c counts characters
+        # using fastq-scan to count the number of basepairs in each fastq
+        read1_bp=$(eval "${cat_reads} ~{read1}" | fastq-scan | grep 'total_bp' | sed 's/[^0-9]*\([0-9]\+\).*/\1/')
+        echo "DEBUG: Number of basepairs in R1: $read1_bp"
 
         if [ "$flag" == "PASS" ] ; then
           if [ "${read1_bp}" -le "~{min_basepairs}" ] ; then
@@ -309,7 +309,8 @@ task check_reads_se {
     fi 
     
     echo $flag | tee FLAG
-    echo $estimated_genome_length | tee EST_GENOME_LENGTH
+    echo ${estimated_genome_length} | tee EST_GENOME_LENGTH
+    echo "DEBUG: estimated_genome_length: ${estimated_genome_length}"
   >>>
   output {
     String read_screen = read_string("FLAG")
diff --git a/tests/workflows/theiacov/test_wf_theiacov_illumina_pe.yml b/tests/workflows/theiacov/test_wf_theiacov_illumina_pe.yml
index b1bb6da13..4c7542334 100644
--- a/tests/workflows/theiacov/test_wf_theiacov_illumina_pe.yml
+++ b/tests/workflows/theiacov/test_wf_theiacov_illumina_pe.yml
@@ -138,7 +138,7 @@
       md5sum: 7c5aba41f53293b712fd86d08ed5b36e
     # clean read screen
     - path: miniwdl_run/call-clean_check_reads/command
-      md5sum: e18830c68993b2837d1da29ce55d2de8
+      md5sum: aeeb107f328ccd7d2d805dc5990b24ac
     - path: miniwdl_run/call-clean_check_reads/inputs.json
       contains: ["read1", "read2", "organism"]
     - path: miniwdl_run/call-clean_check_reads/outputs.json
diff --git a/tests/workflows/theiacov/test_wf_theiacov_illumina_se.yml b/tests/workflows/theiacov/test_wf_theiacov_illumina_se.yml
index 9af5b61c9..0742c19a9 100644
--- a/tests/workflows/theiacov/test_wf_theiacov_illumina_se.yml
+++ b/tests/workflows/theiacov/test_wf_theiacov_illumina_se.yml
@@ -94,7 +94,7 @@
       md5sum: 9a089b8920e55c9cc7bc8cd7d18f9a8e
     # clean read screen
     - path: miniwdl_run/call-clean_check_reads/command
-      md5sum: aec6c57452ddff84c325601a780605d2
+      md5sum: 80a361915a627e86743baacfc383b2b5
     - path: miniwdl_run/call-clean_check_reads/inputs.json
       contains: ["read1", "organism"]
     - path: miniwdl_run/call-clean_check_reads/outputs.json
diff --git a/tests/workflows/theiacov/test_wf_theiacov_ont.yml b/tests/workflows/theiacov/test_wf_theiacov_ont.yml
index 1772e16b4..03027a794 100644
--- a/tests/workflows/theiacov/test_wf_theiacov_ont.yml
+++ b/tests/workflows/theiacov/test_wf_theiacov_ont.yml
@@ -17,7 +17,7 @@
     - wf_theiacov_ont_miniwdl
   files:
     - path: miniwdl_run/call-clean_check_reads/command
-      md5sum: 2517c5cf87db0af66663ffb5e69f6b4f
+      md5sum: 8078e96573428b712ddcb06517333cf5
     - path: miniwdl_run/call-clean_check_reads/inputs.json
     - path: miniwdl_run/call-clean_check_reads/outputs.json
     - path: miniwdl_run/call-clean_check_reads/stderr.txt
@@ -219,7 +219,7 @@
       md5sum: d41d8cd98f00b204e9800998ecf8427e
     - path: miniwdl_run/call-pangolin4/work/ont.pangolin_report.csv
     - path: miniwdl_run/call-raw_check_reads/command
-      md5sum: 1858d98f3c15904be0c219b867727048
+      md5sum: 5df9a70c852960f82c84c7da611cd177
     - path: miniwdl_run/call-raw_check_reads/inputs.json
     - path: miniwdl_run/call-raw_check_reads/outputs.json
     - path: miniwdl_run/call-raw_check_reads/stderr.txt
diff --git a/tests/workflows/theiaprok/test_wf_theiaprok_illumina_pe.yml b/tests/workflows/theiaprok/test_wf_theiaprok_illumina_pe.yml
index d2cedb29b..cb21eb7ba 100644
--- a/tests/workflows/theiaprok/test_wf_theiaprok_illumina_pe.yml
+++ b/tests/workflows/theiaprok/test_wf_theiaprok_illumina_pe.yml
@@ -201,7 +201,7 @@
     - path: miniwdl_run/call-cg_pipeline_raw/work/test_readMetrics.tsv
       contains: ["File", "fastq", "coverage"]
     - path: miniwdl_run/call-clean_check_reads/command
-      md5sum: 8235bc815fe3b57471f97474bff3e3f7
+      md5sum: f28c2b4597398988bc9016505425f6f0
     - path: miniwdl_run/call-clean_check_reads/inputs.json
       contains: ["read1", "fastq", "skip_screen", "true"]
     - path: miniwdl_run/call-clean_check_reads/outputs.json
@@ -364,7 +364,7 @@
     - path: miniwdl_run/call-quast/work/transposed_report.txt
       contains: ["Assembly", "length", "contigs", "test"]
     - path: miniwdl_run/call-raw_check_reads/command
-      md5sum: fd8c392c24e4e49859bdd006163db646
+      md5sum: 0e417649675499e2be549ce82e02704c
     - path: miniwdl_run/call-raw_check_reads/inputs.json
       contains: ["read1", "fastq", "skip_screen", "true"]
     - path: miniwdl_run/call-raw_check_reads/outputs.json
@@ -581,7 +581,7 @@
     - path: miniwdl_run/wdl/tasks/quality_control/basic_statistics/task_quast.wdl
       contains: ["version", "quast", "output"]
     - path: miniwdl_run/wdl/tasks/quality_control/comparisons/task_screen.wdl
-      md5sum: 75631c4db89792a939e4d872adc05b86
+      md5sum: adb43c5bf0a83b9e2ef7669ed2d1760f
     - path: miniwdl_run/wdl/tasks/quality_control/read_filtering/task_trimmomatic.wdl
       contains: ["version", "trimmomatic", "output"]
     - path: miniwdl_run/wdl/tasks/species_typing/escherichia_shigella/task_ectyper.wdl
diff --git a/tests/workflows/theiaprok/test_wf_theiaprok_illumina_se.yml b/tests/workflows/theiaprok/test_wf_theiaprok_illumina_se.yml
index d19f1319e..e4076ec77 100644
--- a/tests/workflows/theiaprok/test_wf_theiaprok_illumina_se.yml
+++ b/tests/workflows/theiaprok/test_wf_theiaprok_illumina_se.yml
@@ -194,7 +194,7 @@
     - path: miniwdl_run/call-cg_pipeline_raw/work/test_readMetrics.tsv
       contains: ["File", "fastq", "coverage"]
     - path: miniwdl_run/call-clean_check_reads/command
-      md5sum: 1ed4ae1859ade5045c01c92dfe54899c
+      md5sum: 51f799af8609c440d51e0046ebc030e8
     - path: miniwdl_run/call-clean_check_reads/inputs.json
       contains: ["read1", "fastq", "skip_screen", "true"]
     - path: miniwdl_run/call-clean_check_reads/outputs.json
@@ -203,7 +203,7 @@
       md5sum: d41d8cd98f00b204e9800998ecf8427e
     - path: miniwdl_run/call-clean_check_reads/stderr.txt.offset
     - path: miniwdl_run/call-clean_check_reads/stdout.txt
-      md5sum: 9e807ce699271c3f647c7594df2b5b0a
+      md5sum: 73809db947c875bf81acccaad76a57d5
     - path: miniwdl_run/call-clean_check_reads/task.log
       contains: ["wdl", "theiaprok_illumina_se", "check_reads", "done"]
     - path: miniwdl_run/call-clean_check_reads/work/EST_GENOME_LENGTH
@@ -355,7 +355,7 @@
     - path: miniwdl_run/call-quast/work/transposed_report.txt
       contains: ["Assembly", "length", "contigs", "test"]
     - path: miniwdl_run/call-raw_check_reads/command
-      md5sum: 3f6bf93b769ef9b152f005195cf2502e
+      md5sum: 7594079920cccbcf01d027547f54ccc2
     - path: miniwdl_run/call-raw_check_reads/inputs.json
       contains: ["read1", "fastq", "skip_screen", "true"]
     - path: miniwdl_run/call-raw_check_reads/outputs.json
@@ -364,7 +364,7 @@
       md5sum: d41d8cd98f00b204e9800998ecf8427e
     - path: miniwdl_run/call-raw_check_reads/stderr.txt.offset
     - path: miniwdl_run/call-raw_check_reads/stdout.txt
-      md5sum: 9e807ce699271c3f647c7594df2b5b0a
+      md5sum: 73809db947c875bf81acccaad76a57d5
     - path: miniwdl_run/call-raw_check_reads/task.log
       contains: ["wdl", "theiaprok_illumina_se", "check_reads", "done"]
     - path: miniwdl_run/call-raw_check_reads/work/EST_GENOME_LENGTH
@@ -526,7 +526,7 @@
     - path: miniwdl_run/wdl/tasks/gene_typing/drug_resistance/task_resfinder.wdl
       md5sum: 27528633723303b462d095b642649453
     - path: miniwdl_run/wdl/tasks/gene_typing/variant_detection/task_snippy_variants.wdl
-      md5sum: 284ce680b52e7e1c1753537b344fa161
+      md5sum: 3b9e04569d7e856dcc649b7726b306b7
     - path: miniwdl_run/wdl/tasks/quality_control/read_filtering/task_bbduk.wdl
       md5sum: aec6ef024d6dff31723f44290f6b9cf5
     - path: miniwdl_run/wdl/tasks/quality_control/advanced_metrics/task_busco.wdl
@@ -544,7 +544,7 @@
     - path: miniwdl_run/wdl/tasks/quality_control/basic_statistics/task_quast.wdl
       contains: ["version", "quast", "output"]
     - path: miniwdl_run/wdl/tasks/quality_control/comparisons/task_screen.wdl
-      md5sum: 75631c4db89792a939e4d872adc05b86
+      md5sum: adb43c5bf0a83b9e2ef7669ed2d1760f
     - path: miniwdl_run/wdl/tasks/quality_control/read_filtering/task_trimmomatic.wdl
       contains: ["version", "trimmomatic", "output"]
     - path: miniwdl_run/wdl/tasks/species_typing/escherichia_shigella/task_ectyper.wdl