From d9751bb1ed7478e4094289d57ea9ff163ee2673e Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Mon, 27 Nov 2023 17:14:50 -0800 Subject: [PATCH] ingest: simplify NCBI Datasets fields config Instead of hard-coding the list of NCBI Datasets fields in the workflow, just provide the list via the default config. This makes it easy to customize which fields to include and makes it very obvious that field_map config for the curation pipeline is changing the names of these NCBI fields. This includes a change in the `format_ncbi_dataset_report` rule to use the provided fields as the header so that we do not have to do a separate renaming of the NCBI column names back to the computer friendly mnemonics. --- ingest/config/defaults.yaml | 60 +++++++++++++++++++++----------- ingest/rules/fetch_from_ncbi.smk | 43 +++-------------------- 2 files changed, 44 insertions(+), 59 deletions(-) diff --git a/ingest/config/defaults.yaml b/ingest/config/defaults.yaml index 62389f6..614c027 100644 --- a/ingest/config/defaults.yaml +++ b/ingest/config/defaults.yaml @@ -10,8 +10,27 @@ entrez_search_term: "" # Required to fetch from NCBI Datasets ncbi_taxon_id: "" -# Optional fields to add to the NCBI Datasets output -ncbi_dataset_fields: [] +# The list of NCBI Datasets fields to include from NCBI Datasets output +# These need to be the mnemonics of the NCBI Datasets fields, see docs for full list of fields +# https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/dataformat/tsv/dataformat_tsv_virus-genome/#fields +# Note: the "accession" field MUST be provided to match with the sequences +ncbi_datasets_fields: + - accession + - sourcedb + - sra-accs + - isolate-lineage + - geo-region + - geo-location + - isolate-collection-date + - release-date + - update-date + - length + - host-name + - isolate-lineage-source + - biosample-acc + - submitter-names + - submitter-affiliation + - submitter-country # Config parameters related to the curate pipeline curate: @@ -23,26 +42,25 
@@ curate: # The path should be relative to the ingest directory. local_geolocation_rules: "config/geolocation_rules.tsv" # List of field names to change where the key is the original field name and the value is the new field name - # This is the first step in the pipeline, so any references to field names - # in the configs below should use the new field names - # The examples below are based on the NCBI Datasets output TSV column names, your data might have different field names. + # The original field names should match the ncbi_datasets_fields provided above. + # This is the first step in the pipeline, so any references to field names in the configs below should use the new field names field_map: - Source database: database - Isolate Collection date: date - Release date: date_released - Update date: date_updated - Accession: accession - Isolate Lineage: strain - Geographic Region: region - Geographic Location: location - Submitter Names: authors - Submitter Affiliation: institution - SRA Accessions: sra_accessions - Length: length - Host Name: host - Isolate Lineage source: sample_type - BioSample accession: biosample_accession - Submitter Country: submitter_country + accession: accession + sourcedb: database + sra-accs: sra_accessions + isolate-lineage: strain + geo-region: region + geo-location: location + isolate-collection-date: date + release-date: date_released + update-date: date_updated + length: length + host-name: host + isolate-lineage-source: sample_type + biosample-acc: biosample_accessions + submitter-names: authors + submitter-affiliation: institution + submitter-country: submitter_country # Standardized strain name regex # Currently accepts any characters because we do not have a clear standard for strain names across pathogens strain_regex: '^.+$' diff --git a/ingest/rules/fetch_from_ncbi.smk b/ingest/rules/fetch_from_ncbi.smk index 8b8c064..2f4b5ad 100644 --- a/ingest/rules/fetch_from_ncbi.smk +++ b/ingest/rules/fetch_from_ncbi.smk @@ -67,55 
+67,22 @@ rule extract_ncbi_dataset_sequences: """ -def _get_ncbi_dataset_field_mnemonics(provided_fields: list) -> str: - """ - Return list of NCBI Dataset report field mnemonics for fields that we want - to parse out of the dataset report. The column names in the output TSV - are different from the mnemonics. - - Additional *provided_fields* will be appended to the end of the list. - - See NCBI Dataset docs for full list of available fields and their column - names in the output: - https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/dataformat/tsv/dataformat_tsv_virus-genome/#fields - """ - fields = [ - "accession", - "sourcedb", - "sra-accs", - "isolate-lineage", - "geo-region", - "geo-location", - "isolate-collection-date", - "release-date", - "update-date", - "length", - "host-name", - "isolate-lineage-source", - "biosample-acc", - "submitter-names", - "submitter-affiliation", - "submitter-country", - ] - return ",".join(fields + provided_fields) - - rule format_ncbi_dataset_report: input: dataset_package="data/ncbi_dataset.zip", output: ncbi_dataset_tsv=temp("data/ncbi_dataset_report.tsv"), params: - fields_to_include=_get_ncbi_dataset_field_mnemonics( - config.get("ncbi_dataset_fields", []) - ), + ncbi_datasets_fields=",".join(config["ncbi_datasets_fields"]), benchmark: "benchmarks/format_ncbi_dataset_report.txt" shell: """ dataformat tsv virus-genome \ --package {input.dataset_package} \ - --fields {params.fields_to_include:q} \ + --fields {params.ncbi_datasets_fields:q} \ + --elide-header \ + | csvtk add-header -t -n {params.ncbi_datasets_fields:q} \ > {output.ncbi_dataset_tsv} """ @@ -139,7 +106,7 @@ rule format_ncbi_datasets_ndjson: augur curate passthru \ --metadata {input.ncbi_dataset_tsv} \ --fasta {input.ncbi_dataset_sequences} \ - --seq-id-column Accession \ + --seq-id-column accession \ --seq-field sequence \ --unmatched-reporting warn \ --duplicate-reporting warn \