Skip to content

Commit

Permalink
demux_deplete: allow multiple biosample attribute files (#561)
Browse files Browse the repository at this point in the history
* allow demux_deplete to accept multiple files containing biosample metadata/map info

Modify the demux_deplete workflow so it can accept multiple files containing biosample metadata/map info. This is useful when a flowcell contains samples corresponding to multiple BioSample submissions (e.g., one for each attribute package type, or submissions made gradually over time). If more than one file is provided, the data will be merged via a full left outer join.

* add utils import to demux_deplete workflow

* update biosample_map_tsvs variable name in one spot

* additional /find/replace/

* omit unnecessary flattening of biosample attribute map file list

* update demux_deplete input name biosample_map -> biosample_map_tsvs in sarscov2_illumina_full

* pass array of biosample_map_tsvs to demux_deplete
  • Loading branch information
tomkinsc authored Nov 6, 2024
1 parent 4e2beb9 commit 1f535ba
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 10 deletions.
30 changes: 21 additions & 9 deletions pipes/WDL/workflows/demux_deplete.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import "../tasks/tasks_ncbi.wdl" as ncbi
import "../tasks/tasks_reports.wdl" as reports
import "../tasks/tasks_taxon_filter.wdl" as taxon_filter
import "../tasks/tasks_terra.wdl" as terra
import "../tasks/tasks_utils.wdl" as utils

workflow demux_deplete {
meta {
Expand All @@ -25,7 +26,7 @@ workflow demux_deplete {
Boolean insert_demux_outputs_into_terra_tables=false

File? sample_rename_map
File? biosample_map
Array[File]? biosample_map_tsvs
Int min_reads_per_bam = 100

String? instrument_model_user_specified
Expand Down Expand Up @@ -53,12 +54,12 @@ workflow demux_deplete {
category: "required"
}
sample_rename_map: {
description: "If 'samples' need to be renamed, provide a two-column tsv that contains at least the following columns: internal_id, external_id. All samples will be renamed prior to analysis. Any samples described in the samplesheets that are not present in sample_rename_map will be unaltered. If this is omitted, no samples will be renamed.",
description: "If 'samples' need to be renamed, provide a two-column tsv that contains at least the following columns: 'internal_id', 'external_id'. All samples will be renamed prior to analysis. Any samples described in the samplesheets that are not present in sample_rename_map will be unaltered. If this is omitted, no samples will be renamed.",
patterns: ["*.txt", "*.tsv"],
category: "advanced"
}
biosample_map: {
description: "A two-column tsv that contains at least the following columns: sample_name, accession. sample_name refers to the external sample id, accession is the NCBI BioSample accession number (SAMNxxx). If this file is omitted, SRA submission prep will be skipped.",
biosample_map_tsvs: {
description: "One or more tsv files, each containing at least the following columns: 'sample_name', 'accession'. 'sample_name' refers to the external sample id, 'accession' is the NCBI BioSample accession (SAMN########). Recommended input: the BioSample attributes tsv returned by NCBI following successful submission of a new list of BioSample attributes. If this file is omitted, SRA submission prep will be skipped.",
patterns: ["*.txt", "*.tsv"],
category: "advanced"
}
Expand Down Expand Up @@ -182,11 +183,22 @@ workflow demux_deplete {
Pair[String,Int] count_cleaned = (basename(raw_reads, '.bam'), read_count_post_depletion)
}

if(defined(biosample_map)) {
if (length(flatten(select_all([biosample_map_tsvs]))) > 0) {
#### merge biosample attribute tsvs (iff provided with more than one)
if (length(flatten(select_all([biosample_map_tsvs]))) > 1) {
call utils.tsv_join as biosample_map_tsv_join {
input:
input_tsvs = flatten([select_first([biosample_map_tsvs,[]])]),
id_col = 'accession',
out_suffix = ".tsv",
out_basename = "biosample-attributes-merged"
}
}

#### biosample metadata mapping
call ncbi.biosample_to_table {
input:
biosample_attributes_tsv = select_first([biosample_map]),
biosample_attributes_tsv = select_first([biosample_map_tsv_join.out_tsv, biosample_map_tsvs]),
cleaned_bam_filepaths = select_all(cleaned_bam_passing),
demux_meta_json = meta_filename.merged_json
}
Expand All @@ -195,7 +207,7 @@ workflow demux_deplete {
call ncbi.sra_meta_prep {
input:
cleaned_bam_filepaths = select_all(cleaned_bam_passing),
biosample_map = select_first([biosample_map]),
biosample_map = select_first([biosample_map_tsv_join.out_tsv, biosample_map_tsvs]),
library_metadata = samplesheet_rename_ids.new_sheet,
platform = "ILLUMINA",
paired = (illumina_demux.run_info[0]['indexes'] == '2'),
Expand All @@ -206,7 +218,7 @@ workflow demux_deplete {
}
}

if(insert_demux_outputs_into_terra_tables){
if(insert_demux_outputs_into_terra_tables) {
call terra.check_terra_env

if(check_terra_env.is_running_on_terra) {
Expand All @@ -223,7 +235,7 @@ workflow demux_deplete {
read_counts_cleaned_json = write_json(count_cleaned)
}
if(defined(biosample_map)) {
if (length(flatten(select_all([biosample_map_tsvs]))) > 0) {
call terra.upload_entities_tsv as terra_load_biosample_data {
input:
workspace_name = check_terra_env.workspace_name,
Expand Down
2 changes: 1 addition & 1 deletion pipes/WDL/workflows/sarscov2_illumina_full.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ workflow sarscov2_illumina_full {
call demux_deplete.demux_deplete {
input:
flowcell_tgz = flowcell_tgz,
biosample_map = biosample_merge.out_tsv,
biosample_map_tsvs = [biosample_merge.out_tsv],
instrument_model_user_specified = instrument_model,
sra_title = sra_title,
read_structure = read_structure,
Expand Down

0 comments on commit 1f535ba

Please sign in to comment.