Skip to content

Commit

Permalink
Merge pull request #19 from nextstrain/simplify-ncbi-fields
Browse files Browse the repository at this point in the history
ingest: simplify NCBI Datasets fields config
  • Loading branch information
joverlee521 authored Nov 29, 2023
2 parents 5ac694b + d9751bb commit 87a1204
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 59 deletions.
60 changes: 39 additions & 21 deletions ingest/config/defaults.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,27 @@ entrez_search_term: ""
# Required to fetch from NCBI Datasets
ncbi_taxon_id: ""

# Optional fields to add to the NCBI Datasets output
ncbi_dataset_fields: []
# The list of NCBI Datasets fields to include from NCBI Datasets output
# These need to be the mnemonics of the NCBI Datasets fields, see docs for full list of fields
# https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/dataformat/tsv/dataformat_tsv_virus-genome/#fields
# Note: the "accession" field MUST be provided to match with the sequences
ncbi_datasets_fields:
- accession
- sourcedb
- sra-accs
- isolate-lineage
- geo-region
- geo-location
- isolate-collection-date
- release-date
- update-date
- length
- host-name
- isolate-lineage-source
- biosample-acc
- submitter-names
- submitter-affiliation
- submitter-country

# Config parameters related to the curate pipeline
curate:
Expand All @@ -23,26 +42,25 @@ curate:
# The path should be relative to the ingest directory.
local_geolocation_rules: "config/geolocation_rules.tsv"
# List of field names to change where the key is the original field name and the value is the new field name
# This is the first step in the pipeline, so any references to field names
# in the configs below should use the new field names
# The examples below are based on the NCBI Datasets output TSV column names, your data might have different field names.
# The original field names should match the ncbi_datasets_fields provided above.
# This is the first step in the pipeline, so any references to field names in the configs below should use the new field names
field_map:
Source database: database
Isolate Collection date: date
Release date: date_released
Update date: date_updated
Accession: accession
Isolate Lineage: strain
Geographic Region: region
Geographic Location: location
Submitter Names: authors
Submitter Affiliation: institution
SRA Accessions: sra_accessions
Length: length
Host Name: host
Isolate Lineage source: sample_type
BioSample accession: biosample_accession
Submitter Country: submitter_country
accession: accession
sourcedb: database
sra-accs: sra_accessions
isolate-lineage: strain
geo-region: region
geo-location: location
isolate-collection-date: date
release-date: date_released
update-date: date_updated
length: length
host-name: host
isolate-lineage-source: sample_type
biosample-acc: biosample_accessions
submitter-names: authors
submitter-affiliation: institution
submitter-country: submitter_country
# Standardized strain name regex
# Currently accepts any characters because we do not have a clear standard for strain names across pathogens
strain_regex: '^.+$'
Expand Down
43 changes: 5 additions & 38 deletions ingest/rules/fetch_from_ncbi.smk
Original file line number Diff line number Diff line change
Expand Up @@ -67,55 +67,22 @@ rule extract_ncbi_dataset_sequences:
"""


def _get_ncbi_dataset_field_mnemonics(provided_fields: list) -> str:
    """
    Return a comma-separated string of NCBI Dataset report field mnemonics
    for the fields that we want to parse out of the dataset report.

    The string starts with a fixed set of default mnemonics; any additional
    *provided_fields* are appended after them, in the order given.
    *provided_fields* may be any iterable of strings; ``None`` (e.g. a config
    key left empty in YAML) is treated the same as an empty list.

    Note that the column names in the output TSV are different from the
    mnemonics. See NCBI Dataset docs for full list of available fields and
    their column names in the output:
    https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/dataformat/tsv/dataformat_tsv_virus-genome/#fields
    """
    default_fields = [
        "accession",
        "sourcedb",
        "sra-accs",
        "isolate-lineage",
        "geo-region",
        "geo-location",
        "isolate-collection-date",
        "release-date",
        "update-date",
        "length",
        "host-name",
        "isolate-lineage-source",
        "biosample-acc",
        "submitter-names",
        "submitter-affiliation",
        "submitter-country",
    ]
    # An empty YAML value is loaded as None rather than []; treat it as
    # "no extra fields" instead of failing on list + None.
    extra_fields = [] if provided_fields is None else provided_fields
    # Unpack rather than use `+` so that tuples and other iterables work too.
    return ",".join([*default_fields, *extra_fields])


rule format_ncbi_dataset_report:
input:
dataset_package="data/ncbi_dataset.zip",
output:
ncbi_dataset_tsv=temp("data/ncbi_dataset_report.tsv"),
params:
fields_to_include=_get_ncbi_dataset_field_mnemonics(
config.get("ncbi_dataset_fields", [])
),
ncbi_datasets_fields=",".join(config["ncbi_datasets_fields"]),
benchmark:
"benchmarks/format_ncbi_dataset_report.txt"
shell:
"""
dataformat tsv virus-genome \
--package {input.dataset_package} \
--fields {params.fields_to_include:q} \
--fields {params.ncbi_datasets_fields:q} \
--elide-header \
| csvtk add-header -t -n {params.ncbi_datasets_fields:q} \
> {output.ncbi_dataset_tsv}
"""

Expand All @@ -139,7 +106,7 @@ rule format_ncbi_datasets_ndjson:
augur curate passthru \
--metadata {input.ncbi_dataset_tsv} \
--fasta {input.ncbi_dataset_sequences} \
--seq-id-column Accession \
--seq-id-column accession \
--seq-field sequence \
--unmatched-reporting warn \
--duplicate-reporting warn \
Expand Down

0 comments on commit 87a1204

Please sign in to comment.