Add ingest config parameters

Add taxon id and other config parameters related to the curate pipeline. Remove config parameters related to Nextclade because we do not currently have a Nextclade measles dataset.
nextstrain · Jan 19, 2024 · c014e84 · c014e84
1 parent 5eb59ea
commit c014e84
Showing 1 changed file with 32 additions and 28 deletions.
diff --git a/ingest/config/defaults.yaml b/ingest/config/defaults.yaml
@@ -8,7 +8,7 @@
 entrez_search_term: ""
 
 # Required to fetch from NCBI Datasets
-ncbi_taxon_id: ""
+ncbi_taxon_id: "11234"
 
 # The list of NCBI Datasets fields to include from NCBI Datasets output
 # These need to be the mnemonics of the NCBI Datasets fields; see the docs for the full list of fields
@@ -37,7 +37,7 @@ curate:
   # URL pointing to the public generalized geolocation rules
   # For the Nextstrain team, this is currently
   # 'https://raw.githubusercontent.com/nextstrain/ncov-ingest/master/source-data/gisaid_geoLocationRules.tsv'
-  geolocation_rules_url: ""
+  geolocation_rules_url: "https://raw.githubusercontent.com/nextstrain/ncov-ingest/master/source-data/gisaid_geoLocationRules.tsv"
   # The path to the local geolocation rules within the pathogen repo
   # The path should be relative to the ingest directory.
   local_geolocation_rules: "config/geolocation_rules.tsv"
@@ -67,47 +67,51 @@ curate:
   # Backup strain name fields to use if 'strain' doesn't match the regex above
   strain_backup_fields: ['accession']
   # List of date fields to standardize to ISO format YYYY-MM-DD
-  date_fields: []
+  date_fields: ['date', 'date_released', 'date_updated']
   # List of expected date formats that are present in the date fields provided above
   # These date formats should use directives expected by datetime
   # See https://docs.python.org/3.9/library/datetime.html#strftime-and-strptime-format-codes
-  expected_date_formats: []
+  expected_date_formats: ['%Y', '%Y-%m', '%Y-%m-%d', '%Y-%m-%dT%H:%M:%SZ']
   titlecase:
     # List of string fields to titlecase
-    fields: []
+    fields: ['region', 'country', 'division', 'location']
     # List of abbreviations not cast to titlecase, keeps uppercase
-    abbreviations: []
+    abbreviations: ['USA']
     # Articles that should not be cast to titlecase
-    articles: []
+    articles: [
+      'and', 'd', 'de', 'del', 'des', 'di', 'do', 'en', 'l', 'la', 'las', 'le',
+      'los', 'nad', 'of', 'op', 'sur', 'the', 'y'
+    ]
   # Metadata field that contains the list of authors associated with the sequence
-  authors_field: ""
+  authors_field: "authors"
   # Default value to use if the authors field is empty
-  authors_default_value: ""
+  authors_default_value: "?"
   # Name to use for the generated abbreviated authors field
-  abbr_authors_field: ""
+  abbr_authors_field: "abbr_authors"
   # Path to the manual annotations file
   # The path should be relative to the ingest directory
   annotations: "config/annotations.tsv"
   # The ID field in the metadata to use to merge the manual annotations
-  annotations_id: ""
+  annotations_id: "accession"
   # The ID field in the metadata to use as the sequence id in the output FASTA file
-  output_id_field: ""
+  output_id_field: "accession"
   # The field in the NDJSON record that contains the actual genomic sequence
-  output_sequence_field: ""
+  output_sequence_field: "sequence"
   # The list of metadata columns to keep in the final output of the curation pipeline.
-  metadata_columns: []
+  metadata_columns: [
+    'accession',
+    'strain',
+    'date',
+    'region',
+    'country',
+    'division',
+    'location',
+    'length',
+    'host',
+    'date_released',
+    'date_updated',
+    'sra_accessions',
+    'authors',
+    'institution',
+  ]
 
-
-# Nextclade parameters to include if you are running Nextclade as a part of your ingest workflow
-# Note that this requires a Nextclade dataset to already exist for your pathogen.
-# Remove the following parameters if you do not plan to run Nextclade.
-nextclade:
-  # The name of the Nextclade dataset to use for running nextclade.
-  # Run `nextclade dataset list` to get a full list of available Nextclade datasets
-  dataset_name: ""
-  # Path to the mapping for renaming Nextclade output columns
-  # The path should be relative to the ingest directory
-  field_map: "config/nextclade_field_map.tsv"
-  # This is the ID field you would use to match the Nextclade output with the record metadata.
-  # This should be the new name that you have defined in your field map.
-  id_field: "seqName"