diff --git a/ingest/config/defaults.yaml b/ingest/config/defaults.yaml index 62389f6..614c027 100644 --- a/ingest/config/defaults.yaml +++ b/ingest/config/defaults.yaml @@ -10,8 +10,27 @@ entrez_search_term: "" # Required to fetch from NCBI Datasets ncbi_taxon_id: "" -# Optional fields to add to the NCBI Datasets output -ncbi_dataset_fields: [] +# The list of NCBI Datasets fields to include from NCBI Datasets output +# These need to be the mnemonics of the NCBI Datasets fields, see docs for full list of fields +# https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/dataformat/tsv/dataformat_tsv_virus-genome/#fields +# Note: the "accession" field MUST be provided to match with the sequences +ncbi_datasets_fields: + - accession + - sourcedb + - sra-accs + - isolate-lineage + - geo-region + - geo-location + - isolate-collection-date + - release-date + - update-date + - length + - host-name + - isolate-lineage-source + - biosample-acc + - submitter-names + - submitter-affiliation + - submitter-country # Config parameters related to the curate pipeline curate: @@ -23,26 +42,25 @@ curate: # The path should be relative to the ingest directory. local_geolocation_rules: "config/geolocation_rules.tsv" # List of field names to change where the key is the original field name and the value is the new field name - # This is the first step in the pipeline, so any references to field names - # in the configs below should use the new field names - # The examples below are based on the NCBI Datasets output TSV column names, your data might have different field names. + # The original field names should match the ncbi_datasets_fields provided above.
+ # This is the first step in the pipeline, so any references to field names in the configs below should use the new field names field_map: - Source database: database - Isolate Collection date: date - Release date: date_released - Update date: date_updated - Accession: accession - Isolate Lineage: strain - Geographic Region: region - Geographic Location: location - Submitter Names: authors - Submitter Affiliation: institution - SRA Accessions: sra_accessions - Length: length - Host Name: host - Isolate Lineage source: sample_type - BioSample accession: biosample_accession - Submitter Country: submitter_country + accession: accession + sourcedb: database + sra-accs: sra_accessions + isolate-lineage: strain + geo-region: region + geo-location: location + isolate-collection-date: date + release-date: date_released + update-date: date_updated + length: length + host-name: host + isolate-lineage-source: sample_type + biosample-acc: biosample_accessions + submitter-names: authors + submitter-affiliation: institution + submitter-country: submitter_country # Standardized strain name regex # Currently accepts any characters because we do not have a clear standard for strain names across pathogens strain_regex: '^.+$' diff --git a/ingest/rules/fetch_from_ncbi.smk b/ingest/rules/fetch_from_ncbi.smk index 8b8c064..2f4b5ad 100644 --- a/ingest/rules/fetch_from_ncbi.smk +++ b/ingest/rules/fetch_from_ncbi.smk @@ -67,55 +67,22 @@ rule extract_ncbi_dataset_sequences: """ -def _get_ncbi_dataset_field_mnemonics(provided_fields: list) -> str: - """ - Return list of NCBI Dataset report field mnemonics for fields that we want - to parse out of the dataset report. The column names in the output TSV - are different from the mnemonics. - - Additional *provided_fields* will be appended to the end of the list. 
- - See NCBI Dataset docs for full list of available fields and their column - names in the output: - https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/dataformat/tsv/dataformat_tsv_virus-genome/#fields - """ - fields = [ - "accession", - "sourcedb", - "sra-accs", - "isolate-lineage", - "geo-region", - "geo-location", - "isolate-collection-date", - "release-date", - "update-date", - "length", - "host-name", - "isolate-lineage-source", - "biosample-acc", - "submitter-names", - "submitter-affiliation", - "submitter-country", - ] - return ",".join(fields + provided_fields) - - rule format_ncbi_dataset_report: input: dataset_package="data/ncbi_dataset.zip", output: ncbi_dataset_tsv=temp("data/ncbi_dataset_report.tsv"), params: - fields_to_include=_get_ncbi_dataset_field_mnemonics( - config.get("ncbi_dataset_fields", []) - ), + ncbi_datasets_fields=",".join(config["ncbi_datasets_fields"]), benchmark: "benchmarks/format_ncbi_dataset_report.txt" shell: """ dataformat tsv virus-genome \ --package {input.dataset_package} \ - --fields {params.fields_to_include:q} \ + --fields {params.ncbi_datasets_fields:q} \ + --elide-header \ + | csvtk add-header -t -n {params.ncbi_datasets_fields:q} \ > {output.ncbi_dataset_tsv} """ @@ -139,7 +106,7 @@ rule format_ncbi_datasets_ndjson: augur curate passthru \ --metadata {input.ncbi_dataset_tsv} \ --fasta {input.ncbi_dataset_sequences} \ - --seq-id-column Accession \ + --seq-id-column accession \ --seq-field sequence \ --unmatched-reporting warn \ --duplicate-reporting warn \