Skip to content

Commit

Permalink
Add ingest config parameters
Browse files Browse the repository at this point in the history
Add taxon id and other config parameters related to the curate pipeline.
Remove config parameters related to Nextclade because we do not currently have a Nextclade measles dataset.
  • Loading branch information
kimandrews committed Jan 19, 2024
1 parent 5eb59ea commit c014e84
Showing 1 changed file with 32 additions and 28 deletions.
60 changes: 32 additions & 28 deletions ingest/config/defaults.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
entrez_search_term: ""

# Required to fetch from NCBI Datasets
ncbi_taxon_id: ""
ncbi_taxon_id: "11234"

# The list of NCBI Datasets fields to include from NCBI Datasets output
# These need to be the mnemonics of the NCBI Datasets fields, see docs for full list of fields
Expand Down Expand Up @@ -37,7 +37,7 @@ curate:
# URL pointing to the public generalized geolocation rules
# For the Nextstrain team, this is currently
# 'https://raw.githubusercontent.com/nextstrain/ncov-ingest/master/source-data/gisaid_geoLocationRules.tsv'
geolocation_rules_url: ""
geolocation_rules_url: "https://raw.githubusercontent.com/nextstrain/ncov-ingest/master/source-data/gisaid_geoLocationRules.tsv"
# The path to the local geolocation rules within the pathogen repo
# The path should be relative to the ingest directory.
local_geolocation_rules: "config/geolocation_rules.tsv"
Expand Down Expand Up @@ -67,47 +67,51 @@ curate:
# Backup strain name field to use if 'strain' doesn't match the regex above
strain_backup_fields: ['accession']
# List of date fields to standardize to ISO format YYYY-MM-DD
date_fields: []
date_fields: ['date', 'date_released', 'date_updated']
# List of expected date formats that are present in the date fields provided above
# These date formats should use directives expected by datetime
# See https://docs.python.org/3.9/library/datetime.html#strftime-and-strptime-format-codes
expected_date_formats: []
expected_date_formats: ['%Y', '%Y-%m', '%Y-%m-%d', '%Y-%m-%dT%H:%M:%SZ']
titlecase:
# List of string fields to titlecase
fields: []
fields: ['region', 'country', 'division', 'location']
# List of abbreviations that are not cast to titlecase and are kept uppercase
abbreviations: []
abbreviations: ['USA']
# Articles that should not be cast to titlecase
articles: []
articles: [
'and', 'd', 'de', 'del', 'des', 'di', 'do', 'en', 'l', 'la', 'las', 'le',
'los', 'nad', 'of', 'op', 'sur', 'the', 'y'
]
# Metadata field that contains the list of authors associated with the sequence
authors_field: ""
authors_field: "authors"
# Default value to use if the authors field is empty
authors_default_value: ""
authors_default_value: "?"
# Name to use for the generated abbreviated authors field
abbr_authors_field: ""
abbr_authors_field: "abbr_authors"
# Path to the manual annotations file
# The path should be relative to the ingest directory
annotations: "config/annotations.tsv"
# The ID field in the metadata to use to merge the manual annotations
annotations_id: ""
annotations_id: "accession"
# The ID field in the metadata to use as the sequence id in the output FASTA file
output_id_field: ""
output_id_field: "accession"
# The field in the NDJSON record that contains the actual genomic sequence
output_sequence_field: ""
output_sequence_field: "sequence"
# The list of metadata columns to keep in the final output of the curation pipeline.
metadata_columns: []
metadata_columns: [
'accession',
'strain',
'date',
'region',
'country',
'division',
'location',
'length',
'host',
'date_released',
'date_updated',
'sra_accessions',
'authors',
'institution',
]


# Nextclade parameters to include if you are running Nextclade as a part of your ingest workflow
# Note that this requires a Nextclade dataset to already exist for your pathogen.
# Remove the following parameters if you do not plan to run Nextclade.
nextclade:
# The name of the Nextclade dataset to use for running nextclade.
# Run `nextclade dataset list` to get a full list of available Nextclade datasets
dataset_name: ""
# Path to the mapping for renaming Nextclade output columns
# The path should be relative to the ingest directory
field_map: "config/nextclade_field_map.tsv"
# This is the ID field you would use to match the Nextclade output with the record metadata.
# This should be the new name that you have defined in your field map.
id_field: "seqName"

0 comments on commit c014e84

Please sign in to comment.