From 2d6295493c292495f3dd4a0bde90d831ba64840f Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Wed, 10 Mar 2021 21:34:06 -0500 Subject: [PATCH 1/3] allow custom placeholder for missing data in tsv_join task allow custom placeholder for missing data in tsv_join task, set default to "?" for tsv join of nextmeta-format input in sarscov2_nextstrain workflow --- pipes/WDL/tasks/tasks_reports.wdl | 3 ++- pipes/WDL/workflows/sarscov2_nextstrain.wdl | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/pipes/WDL/tasks/tasks_reports.wdl b/pipes/WDL/tasks/tasks_reports.wdl index ccb36d7d2..bbd205baf 100644 --- a/pipes/WDL/tasks/tasks_reports.wdl +++ b/pipes/WDL/tasks/tasks_reports.wdl @@ -398,6 +398,7 @@ task tsv_join { Array[File]+ input_tsvs String id_col String out_basename = "merged" + String placeholder_for_missing = "" } command <<< @@ -431,7 +432,7 @@ task tsv_join { for h in header: # prefer non-empty values from earlier files in in_tsvs, populate from subsequent files only if missing if not row_out.get(h): - row_out[h] = row.get(h, '') + row_out[h] = row.get(h, '~{placeholder_for_missing}') out_row_by_id[row_id] = row_out out_ids.append(row_id) out_ids = list(collections.OrderedDict(((i,0) for i in out_ids)).keys()) diff --git a/pipes/WDL/workflows/sarscov2_nextstrain.wdl b/pipes/WDL/workflows/sarscov2_nextstrain.wdl index f588bd9e7..fcc9dffe2 100644 --- a/pipes/WDL/workflows/sarscov2_nextstrain.wdl +++ b/pipes/WDL/workflows/sarscov2_nextstrain.wdl @@ -85,7 +85,8 @@ workflow sarscov2_nextstrain { input: input_tsvs = sample_metadata_tsvs, id_col = 'strain', - out_basename = "metadata-merged" + out_basename = "metadata-merged", + placeholder_for_missing = "?" 
} } call nextstrain.derived_cols { From df439a6f7bc03fdc1a6d4060e54d18586a62b310 Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Wed, 10 Mar 2021 22:17:18 -0500 Subject: [PATCH 2/3] fall back properly when the existing value is the missing-data placeholder --- pipes/WDL/tasks/tasks_reports.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipes/WDL/tasks/tasks_reports.wdl b/pipes/WDL/tasks/tasks_reports.wdl index bbd205baf..4d5a4e13c 100644 --- a/pipes/WDL/tasks/tasks_reports.wdl +++ b/pipes/WDL/tasks/tasks_reports.wdl @@ -431,7 +431,7 @@ task tsv_join { row_out = out_row_by_id.get(row_id, {}) for h in header: # prefer non-empty values from earlier files in in_tsvs, populate from subsequent files only if missing - if not row_out.get(h): + if row_out.get(h, '~{placeholder_for_missing}') == '~{placeholder_for_missing}': row_out[h] = row.get(h, '~{placeholder_for_missing}') out_row_by_id[row_id] = row_out out_ids.append(row_id) From 3f222b88262dfd5dd371d74a9d084d786d39b392 Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Mon, 14 Jun 2021 15:37:16 -0400 Subject: [PATCH 3/3] move changed placeholder code to reflect upstream move of tsv_join from tasks_reports.wdl to tasks_utils.wdl --- pipes/WDL/tasks/tasks_utils.wdl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pipes/WDL/tasks/tasks_utils.wdl b/pipes/WDL/tasks/tasks_utils.wdl index dcb377a25..77adb28a6 100644 --- a/pipes/WDL/tasks/tasks_utils.wdl +++ b/pipes/WDL/tasks/tasks_utils.wdl @@ -178,6 +178,7 @@ task tsv_join { Array[File]+ input_tsvs String id_col String out_basename = "merged" + String placeholder_for_missing = "" } command <<< @@ -210,8 +211,8 @@ task tsv_join { row_out = out_row_by_id.get(row_id, {}) for h in header: # prefer non-empty values from earlier files in in_tsvs, populate from subsequent files only if missing - if not row_out.get(h): - row_out[h] = row.get(h, '') + if row_out.get(h, '~{placeholder_for_missing}') == '~{placeholder_for_missing}': + row_out[h] = row.get(h, 
'~{placeholder_for_missing}') out_row_by_id[row_id] = row_out out_ids.append(row_id) out_ids = list(collections.OrderedDict(((i,0) for i in out_ids)).keys())