Merge pull request #19 from nextstrain/simplify-ncbi-fields

ingest: simplify NCBI Datasets fields config
nextstrain · Nov 29, 2023 · 87a1204 · 87a1204
2 parents 5ac694b + d9751bb
commit 87a1204
Show file tree

Hide file tree

Showing 2 changed files with 44 additions and 59 deletions.
diff --git a/ingest/config/defaults.yaml b/ingest/config/defaults.yaml
@@ -10,8 +10,27 @@ entrez_search_term: ""
 # Required to fetch from NCBI Datasets
 ncbi_taxon_id: ""
 
-# Optional fields to add to the NCBI Datasets output
-ncbi_dataset_fields: []
+# The list of fields to include from the NCBI Datasets output
+# These need to be the mnemonics of the NCBI Datasets fields, see docs for full list of fields
+# https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/dataformat/tsv/dataformat_tsv_virus-genome/#fields
+# Note: the "accession" field MUST be provided to match with the sequences
+ncbi_datasets_fields:
+  - accession
+  - sourcedb
+  - sra-accs
+  - isolate-lineage
+  - geo-region
+  - geo-location
+  - isolate-collection-date
+  - release-date
+  - update-date
+  - length
+  - host-name
+  - isolate-lineage-source
+  - biosample-acc
+  - submitter-names
+  - submitter-affiliation
+  - submitter-country
 
 # Config parameters related to the curate pipeline
 curate:
@@ -23,26 +42,25 @@ curate:
   # The path should be relative to the ingest directory.
   local_geolocation_rules: "config/geolocation_rules.tsv"
   # List of field names to change where the key is the original field name and the value is the new field name
-  # This is the first step in the pipeline, so any references to field names
-  # in the configs below should use the new field names
-  # The examples below are based on the NCBI Datasets output TSV column names, your data might have different field names.
+  # The original field names should match the ncbi_datasets_fields provided above.
+  # This is the first step in the pipeline, so any references to field names in the configs below should use the new field names
   field_map:
-    Source database: database
-    Isolate Collection date: date
-    Release date: date_released
-    Update date: date_updated
-    Accession: accession
-    Isolate Lineage: strain
-    Geographic Region: region
-    Geographic Location: location
-    Submitter Names: authors
-    Submitter Affiliation: institution
-    SRA Accessions: sra_accessions
-    Length: length
-    Host Name: host
-    Isolate Lineage source: sample_type
-    BioSample accession: biosample_accession
-    Submitter Country: submitter_country
+    accession: accession
+    sourcedb: database
+    sra-accs: sra_accessions
+    isolate-lineage: strain
+    geo-region: region
+    geo-location: location
+    isolate-collection-date: date
+    release-date: date_released
+    update-date: date_updated
+    length: length
+    host-name: host
+    isolate-lineage-source: sample_type
+    biosample-acc: biosample_accessions
+    submitter-names: authors
+    submitter-affiliation: institution
+    submitter-country: submitter_country
   # Standardized strain name regex
   # Currently accepts any characters because we do not have a clear standard for strain names across pathogens
   strain_regex: '^.+$'

diff --git a/ingest/rules/fetch_from_ncbi.smk b/ingest/rules/fetch_from_ncbi.smk
@@ -67,55 +67,22 @@ rule extract_ncbi_dataset_sequences:
         """
 
 
-def _get_ncbi_dataset_field_mnemonics(provided_fields: list) -> str:
-    """
-    Return list of NCBI Dataset report field mnemonics for fields that we want
-    to parse out of the dataset report. The column names in the output TSV
-    are different from the mnemonics.
-
-    Additional *provided_fields* will be appended to the end of the list.
-
-    See NCBI Dataset docs for full list of available fields and their column
-    names in the output:
-    https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/dataformat/tsv/dataformat_tsv_virus-genome/#fields
-    """
-    fields = [
-        "accession",
-        "sourcedb",
-        "sra-accs",
-        "isolate-lineage",
-        "geo-region",
-        "geo-location",
-        "isolate-collection-date",
-        "release-date",
-        "update-date",
-        "length",
-        "host-name",
-        "isolate-lineage-source",
-        "biosample-acc",
-        "submitter-names",
-        "submitter-affiliation",
-        "submitter-country",
-    ]
-    return ",".join(fields + provided_fields)
-
-
 rule format_ncbi_dataset_report:
     input:
         dataset_package="data/ncbi_dataset.zip",
     output:
         ncbi_dataset_tsv=temp("data/ncbi_dataset_report.tsv"),
     params:
-        fields_to_include=_get_ncbi_dataset_field_mnemonics(
-            config.get("ncbi_dataset_fields", [])
-        ),
+        ncbi_datasets_fields=",".join(config["ncbi_datasets_fields"]),
     benchmark:
         "benchmarks/format_ncbi_dataset_report.txt"
     shell:
         """
         dataformat tsv virus-genome \
             --package {input.dataset_package} \
-            --fields {params.fields_to_include:q} \
+            --fields {params.ncbi_datasets_fields:q} \
+            --elide-header \
+            | csvtk add-header -t -n {params.ncbi_datasets_fields:q} \
             > {output.ncbi_dataset_tsv}
         """
 
@@ -139,7 +106,7 @@ rule format_ncbi_datasets_ndjson:
         augur curate passthru \
             --metadata {input.ncbi_dataset_tsv} \
             --fasta {input.ncbi_dataset_sequences} \
-            --seq-id-column Accession \
+            --seq-id-column accession \
             --seq-field sequence \
             --unmatched-reporting warn \
             --duplicate-reporting warn \