Adds Staph aureus subwf (#213)

* set to S. aureus if gambit calls the species * parse tsv output for spatyper_type and spatyper_repeats; fix version command; fix cpu optional input * updated staphopia-sccmec WDL task: fixed optional input cpu; added second command to produce hamming distance output TSV; added and updated task outputs * add spatyper to merlin_magic when merlin_tag is Staphylococcus aureus * add spatyper outputs to export_taxon_tables call block in both TheiaProk_Illumina_PE and TheiaProk_Illumina_SE; add spatyper outputs to both workflows TheiaProk_Illumina_PE and TheiaProk_Illumina_SE * added code for parsing output TSV; lowered resource requirements since it's already fast and doesn't need more than 1 cpu * make bash code less ugly; make linter less angry * update GH Actions workflows to use actions/checkout@v3 instead of v2. also upgraded actions/upload-artifact to v3 from v2 * added staphopia_sccmec to merlin_magic workflow. Tested fine with miniwdl * added staphopia_sccmec outputs to TheiaProk_Illumina_PE as well as the export_taxon_tables task. not yet tested * add staphopia_sccmec outputs to theiaprok_illumina_se * update CI for GAMBIT task - expected change * remove unnecessary paired_end conditional from spatyper call block. tested fine with merlin_magic run via miniwdl * update parsing to include multiple spa hits * update agrvate task: reduced disk_size to 50 GB, fixed optional input declaration for cpu; changed how to capture version; changed agrvate to always use mummer mode since usearch is not available in container; rename output files based on samplename instead of fasta file prefix; reduced to 4GB RAM; runtime block now uses cpu input param * added code for parsing agrvate summary TSV and added 5 new String outputs. tested with miniwdl and with a "good" example genome * fixed typing-only flag for agrvate option * add agrvate call block to merlin_magic subwf.
tested w miniwdl * added agrvate outputs to export_taxon_tables task and theiaprok_illumina_pe workflow, still need to test in Terra * added agrvate outputs to theiaprok_illumina_se workflow. have not tested on Terra yet * clarify output strings for agrvate outputs * delete commented cut commands from spatyper task --------- Co-authored-by: cimendes <[email protected]> Co-authored-by: Inês Mendes <[email protected]>
theiagen · Mar 16, 2023 · 600f720 · 600f720
1 parent 01a6eb4
commit 600f720
Show file tree

Hide file tree

Showing 11 changed files with 300 additions and 34 deletions.
diff --git a/.github/workflows/miniwdl-check.yml b/.github/workflows/miniwdl-check.yml
@@ -20,7 +20,7 @@ jobs:
       workflows_files: ${{ steps.filter.outputs.wf_files }}
     steps:
       # Checkout the repo
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
 
       # Select wdl files with changes
       - uses: dorny/paths-filter@v2
@@ -43,7 +43,7 @@ jobs:
         wf: ${{ fromJson(needs.changes.outputs.workflows_files) }}
     steps:
       # Checkout the repo
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
 
       # Install a version of Python3
       - name: Set up Python

diff --git a/.github/workflows/pytest-workflows.yml b/.github/workflows/pytest-workflows.yml
@@ -19,7 +19,7 @@ jobs:
       workflows: ${{ steps.filter.outputs.changes }}
     steps:
       # Checkout the repo
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
 
       # Select workflows with changes
       - uses: dorny/paths-filter@v2
@@ -45,11 +45,11 @@ jobs:
     steps:
       # Checkout the repo
       - name: Checkout theiagen/public_health_bacterial_genomics
-        uses: actions/checkout@v2
+        uses: actions/checkout@v3
 
       # Import test data
       - name: Pull Test Data from bactopia/bactopia-tests
-        uses: actions/checkout@v2
+        uses: actions/checkout@v3
         with:
           repository: bactopia/bactopia-tests
           path: bactopia-tests
@@ -72,7 +72,7 @@ jobs:
 
       - name: Upload logs on failure
         if: failure()
-        uses: actions/upload-artifact@v2
+        uses: actions/upload-artifact@v3
         with:
           name: logs-${{ matrix.engine }}
           path: |

diff --git a/tasks/species_typing/task_agrvate.wdl b/tasks/species_typing/task_agrvate.wdl
@@ -8,30 +8,85 @@ task agrvate {
     File assembly
     String samplename
     String docker = "quay.io/biocontainers/agrvate:1.0.2--hdfd78af_0"
-    Int disk_size = 100
-    Int? cpu = 1
+    Int disk_size = 50
+    Int cpu = 1
 
     # Parameters
     # --typing_only    agr typing only. Skips agr operon extraction and frameshift detection
     Boolean typing_only = false
   }
   command <<<
-    echo $(agrvate -v 2>&1) | sed 's/agrvate v//;' | tee VERSION
+    # get version info
+    agrvate -v 2>&1 | sed 's/agrvate v//;' | tee VERSION
+
+    # run agrvate on assembly; usearch not available in biocontainer, cannot use that option
+    # using -m flag for mummer frameshift detection since usearch is not available
     agrvate \
-        ~{true="--typing_only" false="" typing_only} \
-        -i $fasta_name
-    cp results/~{samplename}-summary.tab ~{samplename}.tsv
-    tar -czvf ~{samplename}.tar.gz results/
+        ~{true="--typing-only" false="" typing_only} \
+        -i ~{assembly} \
+        -m 
+
+    # agrvate names output directory and file based on name of .fasta file, so <prefix>.fasta as input results in <prefix>-results/ outdir
+    # and results in <prefix>-results/<prefix>-summary.tab files 
+    basename=$(basename ~{assembly})
+    # strip off anything after the period
+    fasta_prefix=${basename%.*}
+
+    # rename outputs summary TSV to include samplename
+    mv -v "${fasta_prefix}-results/${fasta_prefix}-summary.tab" ~{samplename}.agrvate.tsv
+
+    # parse output summary TSV
+    cut -f 2 ~{samplename}.agrvate.tsv | tail -n 1 | tee AGR_GROUP
+    cut -f 3 ~{samplename}.agrvate.tsv | tail -n 1 | tee AGR_MATCH_SCORE
+    cut -f 4 ~{samplename}.agrvate.tsv | tail -n 1 | tee AGR_CANONICAL
+    cut -f 5 ~{samplename}.agrvate.tsv | tail -n 1 | tee AGR_MULTIPLE
+    cut -f 6 ~{samplename}.agrvate.tsv | tail -n 1 | tee AGR_NUM_FRAMESHIFTS
+
+    # edit output string AGR_CANONICAL to be more informative: https://github.com/VishnuRaghuram94/AgrVATE#results
+    # AGR_CANONICAL initially holds the raw value cut from column 4 of the summary TSV;
+    # every branch must overwrite that same file, because the task output reads it back with read_string("AGR_CANONICAL")
+    if [[ $(cat AGR_CANONICAL) == 1 ]]; then
+      echo "1. canonical agrD" >AGR_CANONICAL
+    elif [[ $(cat AGR_CANONICAL) == 0 ]]; then
+      echo "0. non-canonical agrD" >AGR_CANONICAL
+    elif [[ $(cat AGR_CANONICAL) == "u" ]]; then
+      echo "u. unknown agrD" >AGR_CANONICAL
+    else
+      echo "result unrecognized, please see summary agrvate TSV file" >AGR_CANONICAL
+    fi
+
+    # edit output string AGR_MULTIPLE to be more informative: https://github.com/VishnuRaghuram94/AgrVATE#results
+    if [[ $(cat AGR_MULTIPLE) == "s" ]]; then
+      echo "s. single agr group found" >AGR_MULTIPLE
+    elif [[ $(cat AGR_MULTIPLE) == "m" ]]; then
+      echo "m. multiple agr groups found" >AGR_MULTIPLE
+    elif [[ $(cat AGR_MULTIPLE) == "u" ]]; then
+      echo "u. unknown agr groups found" >AGR_MULTIPLE
+    else 
+      echo "result unrecognized, please see summary agrvate TSV file" >AGR_MULTIPLE
+    fi
+
+    # if AGR_NUM_FRAMESHIFTS is unknown, edit output string AGR_NUM_FRAMESHIFTS to be more informative, otherwise keep set to a number: https://github.com/VishnuRaghuram94/AgrVATE#results
+    if [[ $(cat AGR_NUM_FRAMESHIFTS) == "u" ]]; then
+      echo "u or unknown; agr operon not extracted" >AGR_NUM_FRAMESHIFTS
+    fi
+
+    # create tarball of all output files
+    tar -czvf ~{samplename}.agrvate.tar.gz "${fasta_prefix}-results/"
   >>>
   output {
-    File agrvate_summary = "~{samplename}.tsv"
-    File agrvate_results = "~{samplename}.tar.gz"
+    File agrvate_summary = "~{samplename}.agrvate.tsv"
+    File agrvate_results = "~{samplename}.agrvate.tar.gz"
+    String agrvate_agr_group = read_string("AGR_GROUP")
+    String agrvate_agr_match_score = read_string("AGR_MATCH_SCORE")
+    String agrvate_agr_canonical = read_string("AGR_CANONICAL")
+    String agrvate_agr_multiple = read_string("AGR_MULTIPLE")
+    String agrvate_agr_num_frameshifts = read_string("AGR_NUM_FRAMESHIFTS")
     String agrvate_version = read_string("VERSION")
+    String agrvate_docker = docker
   }
   runtime {
     docker: "~{docker}"
-    memory: "8 GB"
-    cpu: 4
+    memory: "4 GB"
+    cpu: cpu
     disks: "local-disk " + disk_size + " SSD"
     disk: disk_size + " GB"
     maxRetries: 3

diff --git a/tasks/species_typing/task_spatyper.wdl b/tasks/species_typing/task_spatyper.wdl
@@ -9,27 +9,50 @@ task spatyper {
     String samplename
     String docker = "quay.io/biocontainers/spatyper:0.3.3--pyhdfd78af_3"
     Int disk_size = 100
-    Int? cpu = 4
+    Int cpu = 4
 
     # Parameters
     # --do_enrich Do PCR product enrichment
     Boolean do_enrich = false
   }
   command <<<
-    echo \$(spaTyper --version 2>&1) | sed 's/^.*spaTyper //' | tee VERSION
+    spaTyper --version 2>&1 | sed 's/^.*spaTyper //' | tee VERSION
     spaTyper \
       ~{true="--do_enrich" false="" do_enrich} \
       --fasta ~{assembly} \
       --output ~{samplename}.tsv
+
+    # parse the spaTyper TSV and write comma-separated spa types / repeats to the files TYPE and REPEATS
+    python3 <<CODE
+    import csv
+
+    TYPE = []
+    REPEATS = []
+
+    with open("./~{samplename}.tsv",'r') as tsv_file:
+      tsv_reader=csv.reader(tsv_file, delimiter="\t")
+      next(tsv_reader, None)  # skip the headers
+      for row in tsv_reader:
+        # blank or truncated lines lack the last two columns; skip them instead of raising IndexError
+        if len(row) < 2:
+          continue
+        TYPE.append(row[-1])
+        REPEATS.append(row[-2])
+
+    # the input TSV is closed at this point; one comma-separated entry is written per spa hit
+    with open ("TYPE", 'wt') as TYPE_fh:
+      TYPE_fh.write(','.join(TYPE))
+
+    with open ("REPEATS", 'wt') as REPEATS_fh:
+      REPEATS_fh.write(','.join(REPEATS))
+    CODE
   >>>
   output {
-      File spatyper_results = "~{samplename}.tsv"
+      File spatyper_tsv = "~{samplename}.tsv"
+      String spatyper_repeats = read_string("REPEATS")
+      String spatyper_type = read_string("TYPE")
       String spatyper_version = read_string("VERSION")
+      String spatyper_docker = "~{docker}"
   }
   runtime {
     docker: "~{docker}"
     memory: "8 GB"
-    cpu: 2
+    cpu: cpu
     disks: "local-disk " + disk_size + " SSD"
     disk: disk_size + " GB"
     maxRetries: 3

diff --git a/tasks/species_typing/task_staphopiasccmec.wdl b/tasks/species_typing/task_staphopiasccmec.wdl
@@ -9,26 +9,48 @@ task staphopiasccmec {
     String samplename
     String docker = "quay.io/biocontainers/staphopia-sccmec:1.0.0--hdfd78af_0"
     Int disk_size = 100
-    Int? cpu = 2
-
-    # Parameters
-    # --hamming Report the results as hamming distances
-    Boolean hamming = false
+    Int cpu = 1
   }
   command <<<
+    # get version
     staphopia-sccmec --version 2>&1 | sed 's/^.*staphopia-sccmec //' | tee VERSION
+
+    # run staphopia-sccmec on input assembly; hamming option OFF; outputs are true/false
     staphopia-sccmec \
-      ~{true="--hamming" false="" hamming} \
-      --assembly ~{assembly} > ~{samplename}.tsv
+      --assembly ~{assembly} > ~{samplename}.staphopia-sccmec.summary.tsv
+
+    # run staphopia-sccmec on input assembly; hamming option ON; outputs are the hamming distance; 0 is exact match
+    staphopia-sccmec \
+      --hamming \
+      --assembly ~{assembly} > ~{samplename}.staphopia-sccmec.hamming.tsv
+
+    # please excuse this ugly bash code below :)
+
+    # parse output summary TSV for true matches
+    # look for columns that contain the word "True" and print the column numbers in a list to a file col_headers.txt
+    # split on tabs only, so the column numbers line up with the tab-delimited fields that cut -f selects below
+    awk -F'\t' '{ for (i=1; i<=NF; ++i) { if ($i ~ "True") print i } }' ~{samplename}.staphopia-sccmec.summary.tsv | tee col_headers.txt
+
+    # start from an empty type.txt so the file exists even when no column is "True" (the loop below then never runs)
+    : >type.txt
+
+    # use column number list to print column headers (example: IV, mecA, etc.) to a file type.txt
+    while read -r COL_NUMBER; do
+      cut -f "$COL_NUMBER" ~{samplename}.staphopia-sccmec.summary.tsv | head -n 1 >>type.txt
+      echo "," >>type.txt
+    done <col_headers.txt
+
+    # remove newlines, remove trailing comma; generate output string of comma separated values
+    tr -d '\n' <type.txt | sed 's|.$||g' | tee TYPES_AND_MECA.txt
+
   >>>
   output {
-    File staphopiasccmec_results = "~{samplename}.tsv"
+    File staphopiasccmec_results_tsv = "~{samplename}.staphopia-sccmec.summary.tsv"
+    File staphopiasccmec_hamming_distance_tsv = "~{samplename}.staphopia-sccmec.hamming.tsv"
+    String staphopiasccmec_types_and_mecA_presence = read_string("TYPES_AND_MECA.txt")
     String staphopiasccmec_version = read_string("VERSION")
+    String staphopiasccmec_docker = docker
   }
   runtime {
     docker: "~{docker}"
-    memory: "8 GB"
-    cpu: 2
+    memory: "4 GB"
+    cpu: cpu
     disks: "local-disk " + disk_size + " SSD"
     disk: disk_size + " GB"
     maxRetries: 3

diff --git a/tasks/taxon_id/task_gambit.wdl b/tasks/taxon_id/task_gambit.wdl
@@ -165,6 +165,10 @@ task gambit {
       merlin_tag="Salmonella"
     elif [[ ${predicted_taxon} == *"Staphylococcus"* ]]; then 
       merlin_tag="Staphylococcus"
+      # set to aureus if gambit calls the species
+      if [[ ${predicted_taxon} == *"Staphylococcus aureus"* ]]; then 
+        merlin_tag="Staphylococcus aureus"
+      fi
     elif [[ ${predicted_taxon} == *"Streptococcus"* ]]; then 
       merlin_tag="Streptococcus"
       # set to pneumoniae if gambit calls the species

diff --git a/tasks/utilities/task_broad_terra_tools.wdl b/tasks/utilities/task_broad_terra_tools.wdl
@@ -273,6 +273,25 @@ task export_taxon_tables {
     String? pasty_comment
     String? qc_check
     File? qc_standard
+    File? spatyper_tsv
+    String? spatyper_docker
+    String? spatyper_repeats
+    String? spatyper_type
+    String? spatyper_version
+    File? staphopiasccmec_results_tsv
+    File? staphopiasccmec_hamming_distance_tsv
+    String? staphopiasccmec_types_and_mecA_presence
+    String? staphopiasccmec_version
+    String? staphopiasccmec_docker
+    File? agrvate_summary
+    File? agrvate_results
+    String? agrvate_agr_group
+    String? agrvate_agr_match_score
+    String? agrvate_agr_canonical
+    String? agrvate_agr_multiple
+    String? agrvate_agr_num_frameshifts
+    String? agrvate_version
+    String? agrvate_docker
   }
   command <<<
 
@@ -571,7 +590,26 @@ task export_taxon_tables {
       "pasty_docker": "~{pasty_docker}",
       "pasty_comment": "~{pasty_comment}",
       "qc_check": "~{qc_check}",
-      "qc_standard": "~{qc_standard}"
+      "qc_standard": "~{qc_standard}",
+      "spatyper_tsv": "~{spatyper_tsv}",
+      "spatyper_docker": "~{spatyper_docker}",
+      "spatyper_repeats": "~{spatyper_repeats}",
+      "spatyper_type": "~{spatyper_type}",
+      "spatyper_version": "~{spatyper_version}",
+      # staphopia-sccmec entries; each key matches its task input name exactly (no stray whitespace in the key)
+      "staphopiasccmec_results_tsv": "~{staphopiasccmec_results_tsv}",
+      "staphopiasccmec_hamming_distance_tsv": "~{staphopiasccmec_hamming_distance_tsv}",
+      "staphopiasccmec_types_and_mecA_presence": "~{staphopiasccmec_types_and_mecA_presence}",
+      "staphopiasccmec_version": "~{staphopiasccmec_version}",
+      "staphopiasccmec_docker": "~{staphopiasccmec_docker}",
+      "agrvate_summary": "~{agrvate_summary}",
+      "agrvate_results": "~{agrvate_results}",
+      "agrvate_agr_group": "~{agrvate_agr_group}",
+      "agrvate_agr_match_score": "~{agrvate_agr_match_score}",
+      "agrvate_agr_canonical": "~{agrvate_agr_canonical}",
+      "agrvate_agr_multiple": "~{agrvate_agr_multiple}",
+      "agrvate_agr_num_frameshifts": "~{agrvate_agr_num_frameshifts}",
+      "agrvate_version": "~{agrvate_version}",
+      "agrvate_docker": "~{agrvate_docker}"
     }
 
     with open("~{samplename}_terra_table.tsv", "w") as outfile:

diff --git a/tests/workflows/test_wf_theiaprok_illumina_pe.yml b/tests/workflows/test_wf_theiaprok_illumina_pe.yml
@@ -107,7 +107,7 @@
     - path: miniwdl_run/call-clean_check_reads/work/_miniwdl_inputs/0/test_1.clean.fastq.gz
     - path: miniwdl_run/call-clean_check_reads/work/_miniwdl_inputs/0/test_2.clean.fastq.gz
     - path: miniwdl_run/call-gambit/command
-      md5sum: 5e6b3458a9dfa1c6f3a04af66f40f746
+      md5sum: ffda45de2bad7a2206f507bf4485c930
     - path: miniwdl_run/call-gambit/inputs.json
       contains: ["assembly", "fasta", "samplename", "test"]
     - path: miniwdl_run/call-gambit/outputs.json