From d9751bb1ed7478e4094289d57ea9ff163ee2673e Mon Sep 17 00:00:00 2001
From: Jover Lee <joverlee521@gmail.com>
Date: Mon, 27 Nov 2023 17:14:50 -0800
Subject: [PATCH] ingest: simplify NCBI Datasets fields config

Instead of hard-coding the list of NCBI Datasets fields in the workflow,
just provide the list via the default config. This makes it easy to
customize which fields to include and makes it very obvious that
field_map config for the curation pipeline is changing the names of
these NCBI fields.

This includes a change in the `format_ncbi_dataset_report` rule to use
the provided fields as the header so that we do not have to do a
separate renaming of the NCBI column names back to the computer friendly
mnemonics.
---
 ingest/config/defaults.yaml      | 60 +++++++++++++++++++++-----------
 ingest/rules/fetch_from_ncbi.smk | 43 +++--------------------
 2 files changed, 44 insertions(+), 59 deletions(-)

diff --git a/ingest/config/defaults.yaml b/ingest/config/defaults.yaml
index 62389f6..614c027 100644
--- a/ingest/config/defaults.yaml
+++ b/ingest/config/defaults.yaml
@@ -10,8 +10,27 @@ entrez_search_term: ""
 # Required to fetch from NCBI Datasets
 ncbi_taxon_id: ""
 
-# Optional fields to add to the NCBI Datasets output
-ncbi_dataset_fields: []
+# The list of NCBI Datasets fields to include from NCBI Datasets output
+# These need to be the mnemonics of the NCBI Datasets fields, see docs for full list of fields
+# https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/dataformat/tsv/dataformat_tsv_virus-genome/#fields
+# Note: the "accession" field MUST be provided to match with the sequences
+ncbi_datasets_fields:
+  - accession
+  - sourcedb
+  - sra-accs
+  - isolate-lineage
+  - geo-region
+  - geo-location
+  - isolate-collection-date
+  - release-date
+  - update-date
+  - length
+  - host-name
+  - isolate-lineage-source
+  - biosample-acc
+  - submitter-names
+  - submitter-affiliation
+  - submitter-country
 
 # Config parameters related to the curate pipeline
 curate:
@@ -23,26 +42,25 @@ curate:
   # The path should be relative to the ingest directory.
   local_geolocation_rules: "config/geolocation_rules.tsv"
   # List of field names to change where the key is the original field name and the value is the new field name
-  # This is the first step in the pipeline, so any references to field names
-  # in the configs below should use the new field names
-  # The examples below are based on the NCBI Datasets output TSV column names, your data might have different field names.
+  # The original field names should match the ncbi_datasets_fields provided above.
+  # This is the first step in the pipeline, so any references to field names in the configs below should use the new field names
   field_map:
-    Source database: database
-    Isolate Collection date: date
-    Release date: date_released
-    Update date: date_updated
-    Accession: accession
-    Isolate Lineage: strain
-    Geographic Region: region
-    Geographic Location: location
-    Submitter Names: authors
-    Submitter Affiliation: institution
-    SRA Accessions: sra_accessions
-    Length: length
-    Host Name: host
-    Isolate Lineage source: sample_type
-    BioSample accession: biosample_accession
-    Submitter Country: submitter_country
+    accession: accession
+    sourcedb: database
+    sra-accs: sra_accessions
+    isolate-lineage: strain
+    geo-region: region
+    geo-location: location
+    isolate-collection-date: date
+    release-date: date_released
+    update-date: date_updated
+    length: length
+    host-name: host
+    isolate-lineage-source: sample_type
+    biosample-acc: biosample_accessions
+    submitter-names: authors
+    submitter-affiliation: institution
+    submitter-country: submitter_country
   # Standardized strain name regex
   # Currently accepts any characters because we do not have a clear standard for strain names across pathogens
   strain_regex: '^.+$'
diff --git a/ingest/rules/fetch_from_ncbi.smk b/ingest/rules/fetch_from_ncbi.smk
index 8b8c064..2f4b5ad 100644
--- a/ingest/rules/fetch_from_ncbi.smk
+++ b/ingest/rules/fetch_from_ncbi.smk
@@ -67,55 +67,22 @@ rule extract_ncbi_dataset_sequences:
         """
 
 
-def _get_ncbi_dataset_field_mnemonics(provided_fields: list) -> str:
-    """
-    Return list of NCBI Dataset report field mnemonics for fields that we want
-    to parse out of the dataset report. The column names in the output TSV
-    are different from the mnemonics.
-
-    Additional *provided_fields* will be appended to the end of the list.
-
-    See NCBI Dataset docs for full list of available fields and their column
-    names in the output:
-    https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/dataformat/tsv/dataformat_tsv_virus-genome/#fields
-    """
-    fields = [
-        "accession",
-        "sourcedb",
-        "sra-accs",
-        "isolate-lineage",
-        "geo-region",
-        "geo-location",
-        "isolate-collection-date",
-        "release-date",
-        "update-date",
-        "length",
-        "host-name",
-        "isolate-lineage-source",
-        "biosample-acc",
-        "submitter-names",
-        "submitter-affiliation",
-        "submitter-country",
-    ]
-    return ",".join(fields + provided_fields)
-
-
 rule format_ncbi_dataset_report:
     input:
         dataset_package="data/ncbi_dataset.zip",
     output:
         ncbi_dataset_tsv=temp("data/ncbi_dataset_report.tsv"),
     params:
-        fields_to_include=_get_ncbi_dataset_field_mnemonics(
-            config.get("ncbi_dataset_fields", [])
-        ),
+        ncbi_datasets_fields=",".join(config["ncbi_datasets_fields"]),
     benchmark:
         "benchmarks/format_ncbi_dataset_report.txt"
     shell:
         """
         dataformat tsv virus-genome \
             --package {input.dataset_package} \
-            --fields {params.fields_to_include:q} \
+            --fields {params.ncbi_datasets_fields:q} \
+            --elide-header \
+            | csvtk add-header -t -n {params.ncbi_datasets_fields:q} \
             > {output.ncbi_dataset_tsv}
         """
 
@@ -139,7 +106,7 @@ rule format_ncbi_datasets_ndjson:
         augur curate passthru \
             --metadata {input.ncbi_dataset_tsv} \
             --fasta {input.ncbi_dataset_sequences} \
-            --seq-id-column Accession \
+            --seq-id-column accession \
             --seq-field sequence \
             --unmatched-reporting warn \
             --duplicate-reporting warn \