Skip to content

Commit

Permalink
Replace genbank_accession with accession
Browse files Browse the repository at this point in the history
This simplifies USVI data merge
  • Loading branch information
j23414 committed Dec 16, 2024
1 parent 803ed16 commit 53545a5
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 36 deletions.
16 changes: 8 additions & 8 deletions ingest/defaults/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,8 @@ curate:
# The original field names should match the ncbi_datasets_fields provided above.
# This renaming is the first step in the pipeline, so any references to field names in the configs below should use the new field names.
field_map:
accession: genbank_accession
accession-rev: genbank_accession_rev
accession: accession
accession_version: accession_version
isolate-lineage: strain
sourcedb: database
geo-region: region
Expand All @@ -62,7 +62,7 @@ curate:
# Currently accepts any characters because we do not have a clear standard for strain names across pathogens
strain_regex: '^.+$'
# Backup strain name field(s) to use if 'strain' doesn't match the regex above
strain_backup_fields: ['genbank_accession']
strain_backup_fields: ['accession']
# List of date fields to standardize to ISO format YYYY-MM-DD
date_fields: ['date', 'release_date', 'update_date']
# List of expected date formats that are present in the date fields provided above
Expand All @@ -89,17 +89,17 @@ curate:
# The path should be relative to the ingest directory
annotations: "defaults/annotations.tsv"
# The ID field in the metadata to use to merge the manual annotations
annotations_id: 'genbank_accession'
annotations_id: 'accession'
# The ID field in the metadata to use as the sequence id in the output FASTA file
output_id_field: 'genbank_accession'
output_id_field: 'accession'
# The field in the NDJSON record that contains the actual genomic sequence
output_sequence_field: 'sequence'
# The field in the NDJSON record that contains the actual GenBank accession
genbank_accession: 'genbank_accession'
genbank_accession: 'accession'
# The list of metadata columns to keep in the final output of the curation pipeline.
metadata_columns: [
'genbank_accession',
'genbank_accession_rev',
'accession',
'accession_version',
'strain',
'date',
'region',
Expand Down
6 changes: 3 additions & 3 deletions ingest/rules/fetch_from_ncbi.smk
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,8 @@ rule format_ncbi_dataset_report:
--elide-header \
| csvtk fix-quotes -Ht \
| csvtk add-header -t -n {params.ncbi_datasets_fields:q} \
| csvtk rename -t -f accession -n accession-rev \
| csvtk -t mutate -f accession-rev -n accession -p "^(.+?)\." \
| csvtk rename -t -f accession -n accession_version \
| csvtk -t mutate -f accession_version -n accession -p "^(.+?)\." \
| csvtk del-quotes -t \
| tsv-select -H -f accession --rest last \
> {output.ncbi_dataset_tsv}
Expand All @@ -89,7 +89,7 @@ rule format_ncbi_datasets_ndjson:
augur curate passthru \
--metadata {input.ncbi_dataset_tsv} \
--fasta {input.ncbi_dataset_sequences} \
--seq-id-column accession-rev \
--seq-id-column accession_version \
--seq-field sequence \
--unmatched-reporting warn \
--duplicate-reporting warn \
Expand Down
26 changes: 1 addition & 25 deletions phylogenetic/rules/merge_sequences_usvi.smk
Original file line number Diff line number Diff line change
Expand Up @@ -21,35 +21,11 @@ This part of the workflow usually includes the following steps:
"""

rule add_metadata_columns:
"""Add columns to metadata
Notable columns:
- genbank_accession: GenBank accession for Auspice to generate a URL to the NCBI GenBank record.
- [NEW] accession: The GenBank accession. Added to go alongside USVI accession.
- [NEW] url: URL linking to the NCBI GenBank record ('https://www.ncbi.nlm.nih.gov/nuccore/*'). Added to go alongside USVI url.
"""
input:
metadata = "data/metadata.tsv"
output:
metadata = "data/metadata_modified.tsv"
shell:
"""
csvtk mutate2 -tl \
-n url \
-e '"https://www.ncbi.nlm.nih.gov/nuccore/" + $genbank_accession' \
{input.metadata} \
| csvtk mutate2 -tl \
-n accession \
-e '$genbank_accession' \
> {output.metadata}
"""

rule append_usvi:
"""Appending USVI sequences"""
input:
sequences = "data/sequences.fasta",
metadata = "data/metadata_modified.tsv",
metadata = "data/metadata.tsv",
usvi_sequences = "data/sequences_usvi.fasta",
usvi_metadata = "data/metadata_usvi.tsv"
output:
Expand Down

0 comments on commit 53545a5

Please sign in to comment.