diff --git a/.dockstore.yml b/.dockstore.yml
index 7481388c3..9d0fc99e2 100644
--- a/.dockstore.yml
+++ b/.dockstore.yml
@@ -230,3 +230,8 @@ workflows:
     primaryDescriptorPath: /workflows/phylogenetics/wf_usher.wdl
     testParameterFiles:
       - empty.json
+  - name: CZGenEpi_Prep_PHB
+    subclass: WDL
+    primaryDescriptorPath: /workflows/utilities/wf_czgenepi_prep.wdl
+    testParameterFiles:
+      - empty.json
\ No newline at end of file
diff --git a/tasks/utilities/task_czgenepi_wrangling.wdl b/tasks/utilities/task_czgenepi_wrangling.wdl
new file mode 100644
index 000000000..97081458a
--- /dev/null
+++ b/tasks/utilities/task_czgenepi_wrangling.wdl
@@ -0,0 +1,134 @@
+version 1.0
+
+task czgenepi_wrangling { # writes czgenepi_prep_metadata.csv and czgenepi_prep_concatenated.fasta from a downloaded Terra table
+  input {
+    File full_terra_table
+    String terra_table_name
+
+    # required fields
+    Array[String] sample_names
+    String assembly_fasta_column_name
+    String collection_date_column_name
+    String private_id_column_name
+
+    # collection location
+    String continent_column_name
+    String country_column_name
+    String state_column_name
+    String county_column_name
+
+    # optional inputs
+    String gisaid_id_column_name
+    String genbank_accession_column_name
+    String sequencing_date_column_name
+    String sample_is_private_column_name
+
+    # runtime (disk_size is interpolated into the disks/disk runtime attributes below)
+    Int disk_size = 100
+  }
+  command <<<
+    # parse terra table for data -- NOTE(review): the heredoc opener and the first part of the Python script appear to be missing from this copy of the patch (no definition of metadata, table, os or OUTPUT_COLUMN_ORDER is visible); restore them from the original file before applying
+    python3 < 0
+    metadata["Collection Location"] = metadata.apply(lambda x: x["Collection Location"] + "/" + x["county"] if len(x["county"]) > 0 else x["Collection Location"], axis=1)
+
+    print("DEBUG: checking if private_id column was set")
+    if "~{private_id_column_name}" == "~{terra_table_name}_id":
+      print("DEBUG: removing duplicated column")
+      metadata = metadata.loc[:, ~metadata.columns.duplicated()].copy()
+      metadata["Private ID"] = metadata.loc[:, "~{private_id_column_name}"]
+    else:
+      metadata.rename(columns={"~{private_id_column_name}": "Private ID"}, inplace=True)
+
+    print("DEBUG: renaming the rest of the headers")
+    # rename headers to match CZGenEpi's expected format
+    metadata.rename(columns={"~{terra_table_name}_id": "Sample Name (from FASTA)",
+                             "~{gisaid_id_column_name}": "GISAID ID (Public ID) - Optional",
+                             "~{genbank_accession_column_name}": "GenBank Accession (Public ID) - Optional",
+                             "~{collection_date_column_name}": "Collection Date",
+                             "~{sequencing_date_column_name}": "Sequencing Date - Optional",
+                             "~{sample_is_private_column_name}": "Sample is Private"}, inplace=True)
+
+    # write the output to a csv file
+    metadata.to_csv("czgenepi_prep_metadata.csv", columns=OUTPUT_COLUMN_ORDER, index=False)
+
+    # create file transfer command and write it to a file
+    table["cp"] = "gcloud storage cp " + table["~{assembly_fasta_column_name}"] + " ."
+    table.to_csv("file-transfer.sh", columns=["cp"], index=False, header=False)
+
+    # create fasta header renaming command and write it to a file
+    table["rename_fasta_header"] = "sed -i '1s|.*|>" + table["~{terra_table_name}_id"] + "|' " + table.apply(lambda x: os.path.basename(x["~{assembly_fasta_column_name}"]), axis=1)
+    table.to_csv("rename-command.sh", columns=["rename_fasta_header"], index=False, header=False)
+
+    CODE
+
+    echo "DEBUG: transfering fasta files"
+    bash file-transfer.sh # one "gcloud storage cp <path> ." line per table row, copying into the working directory
+
+    echo "DEBUG: renaming fasta headers"
+    bash rename-command.sh # each line replaces line 1 of a copied file with ">" followed by the row's table ID
+
+    echo "DEBUG: concatenating fasta files"
+    cat *fasta > czgenepi_prep_concatenated.fasta # only files whose names end in "fasta" are picked up by this glob
+
+  >>>
+  output {
+    File concatenated_fasta = "czgenepi_prep_concatenated.fasta"
+    File concatenated_metadata = "czgenepi_prep_metadata.csv"
+  }
+  runtime {
+    docker: "us-docker.pkg.dev/general-theiagen/theiagen/terra-tools:2023-08-08-2"
+    memory: "8 GB"
+    cpu: 1
+    disks: "local-disk " + disk_size + " SSD"
+    disk: disk_size + " GB"
+    dx_instance_type: "mem1_ssd1_v2_x2"
+    #maxRetries: 3
+  }
+}
\ No newline at end of file
diff --git a/tasks/utilities/task_download_terra_table.wdl b/tasks/utilities/task_download_terra_table.wdl
new file mode 100644
index 000000000..79eb7e438
--- /dev/null
+++ b/tasks/utilities/task_download_terra_table.wdl
@@ -0,0 +1,27 @@
+version 1.0
+
+task download_terra_table {
+  meta {
+    description: "This task downloads a Terra table to a TSV file with export_large_tsv.py; no filtering of rows is performed here."
+  }
+  input {
+    String terra_table_name # passed as --entity_type and used to name the output TSV
+    String terra_workspace_name # quoted in the command so a value containing spaces stays a single argument
+    String terra_project_name
+    Int disk_size = 10 # interpolated into the disks/disk runtime attributes below
+  }
+  command <<<
+    python3 /scripts/export_large_tsv/export_large_tsv.py --project "~{terra_project_name}" --workspace "~{terra_workspace_name}" --entity_type "~{terra_table_name}" --tsv_filename "~{terra_table_name}.tsv"
+  >>>
+  output {
+    File terra_table = "~{terra_table_name}.tsv"
+  }
+  runtime {
+    docker: "quay.io/theiagen/terra-tools:2023-06-21"
+    memory: "1 GB"
+    cpu: 1
+    disks: "local-disk " + disk_size + " HDD"
+    disk: disk_size + " GB"
+    dx_instance_type: "mem1_ssd1_v2_x2"
+  }
+}
\ No newline at end of file
diff --git a/workflows/utilities/wf_czgenepi_prep.wdl b/workflows/utilities/wf_czgenepi_prep.wdl
new file mode 100644
index 000000000..1b298ce0a
--- /dev/null
+++ b/workflows/utilities/wf_czgenepi_prep.wdl
@@ -0,0 +1,65 @@
+version 1.0
+
+import "../../tasks/utilities/task_czgenepi_wrangling.wdl" as czgenepi_wrangling_task
+import "../../tasks/utilities/task_download_terra_table.wdl" as download_table
+import "../../tasks/task_versioning.wdl" as versioning
+
+workflow czgenepi_prep { # downloads a Terra table, then hands it with sample_names and the column-name settings to czgenepi_wrangling
+  input {
+    Array[String] sample_names
+
+    # downloading table information
+    String terra_project_name
+    String terra_workspace_name
+    String terra_table_name
+
+    # required columns
+    String assembly_fasta_column_name = "assembly_fasta"
+    String collection_date_column_name = "collection_date"
+    String private_id_column_name = terra_table_name + "_id" # with this default, czgenepi_wrangling copies the ID column into "Private ID" instead of renaming it
+
+    # collection location - required
+    String continent_column_name = "continent"
+    String country_column_name = "country"
+    String state_column_name = "state"
+    String county_column_name = "county"
+
+    # optional columns
+    String gisaid_id_column_name = "gisaid_accession"
+    String genbank_accession_column_name = "genbank_accession"
+    String sequencing_date_column_name = "sequencing_date"
+    String sample_is_private_column_name = "sample_is_private"
+  }
+  call versioning.version_capture{
+    input:
+  }
+  call download_table.download_terra_table {
+    input:
+      terra_project_name = terra_project_name,
+      terra_workspace_name = terra_workspace_name,
+      terra_table_name = terra_table_name
+  }
+  call czgenepi_wrangling_task.czgenepi_wrangling {
+    input:
+      full_terra_table = download_terra_table.terra_table,
+      sample_names = sample_names,
+      terra_table_name = terra_table_name,
+      assembly_fasta_column_name = assembly_fasta_column_name,
+      collection_date_column_name = collection_date_column_name,
+      private_id_column_name = private_id_column_name,
+      continent_column_name = continent_column_name,
+      country_column_name = country_column_name,
+      state_column_name = state_column_name,
+      county_column_name = county_column_name,
+      gisaid_id_column_name = gisaid_id_column_name,
+      genbank_accession_column_name = genbank_accession_column_name,
+      sequencing_date_column_name = sequencing_date_column_name,
+      sample_is_private_column_name = sample_is_private_column_name
+  }
+  output {
+    File concatenated_czgenepi_fasta = czgenepi_wrangling.concatenated_fasta
+    File concatenated_czgenepi_metadata = czgenepi_wrangling.concatenated_metadata
+    String czgenepi_prep_version = version_capture.phb_version
+    String czgenepi_prep_analysis_date = version_capture.date
+  }
+}
\ No newline at end of file