Skip to content

Commit

Permalink
Ingest: Derive url and use accession fields during ingest #78
Browse files Browse the repository at this point in the history
  • Loading branch information
j23414 authored Dec 17, 2024
2 parents e304e66 + 77fa1a4 commit de4b930
Show file tree
Hide file tree
Showing 6 changed files with 73 additions and 73 deletions.
17 changes: 10 additions & 7 deletions ingest/defaults/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,8 @@ curate:
# The original field names should match the ncbi_datasets_fields provided above.
# This is the first step in the pipeline, so any references to field names in the configs below should use the new field names.
field_map:
accession: genbank_accession
accession-rev: genbank_accession_rev
accession: accession
accession_version: accession_version
isolate-lineage: strain
sourcedb: database
geo-region: region
Expand All @@ -62,7 +62,7 @@ curate:
# Currently accepts any characters because we do not have a clear standard for strain names across pathogens
strain_regex: '^.+$'
# Backup strain name field(s) to use if 'strain' doesn't match the regex above
strain_backup_fields: ['genbank_accession']
strain_backup_fields: ['accession']
# List of date fields to standardize to ISO format YYYY-MM-DD
date_fields: ['date', 'release_date', 'update_date']
# List of expected date formats that are present in the date fields provided above
Expand All @@ -89,15 +89,17 @@ curate:
# The path should be relative to the ingest directory
annotations: "defaults/annotations.tsv"
# The ID field in the metadata to use to merge the manual annotations
annotations_id: 'genbank_accession'
annotations_id: 'accession'
# The ID field in the metadata to use as the sequence id in the output FASTA file
output_id_field: 'genbank_accession'
output_id_field: 'accession'
# The field in the NDJSON record that contains the actual genomic sequence
output_sequence_field: 'sequence'
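# The field in the NDJSON record that contains the actual GenBank accession.
# The add_metadata_columns rule also uses this name as the metadata column from which the 'url' column is built.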
genbank_accession: 'accession'
# The list of metadata columns to keep in the final output of the curation pipeline.
metadata_columns: [
'genbank_accession',
'genbank_accession_rev',
'accession',
'accession_version',
'strain',
'date',
'region',
Expand All @@ -111,4 +113,5 @@ curate:
'sra_accessions',
'authors',
'institution',
'url',
]
23 changes: 22 additions & 1 deletion ingest/rules/curate.smk
Original file line number Diff line number Diff line change
Expand Up @@ -102,9 +102,30 @@ rule curate:
"""


rule add_metadata_columns:
"""Add derived columns to the curated metadata.

Reads data/all_metadata.tsv and writes data/all_metadata_added.tsv (declared
as a temp() output) with one extra column appended.

Notable columns:
- [NEW] url: URL linking to the NCBI GenBank record ('https://www.ncbi.nlm.nih.gov/nuccore/*'),
  formed by concatenating the nuccore base URL with the value of the accession column.
"""
input:
metadata = "data/all_metadata.tsv"
output:
metadata = temp("data/all_metadata_added.tsv")
# Name of the metadata column that holds the GenBank accession, taken from
# config['curate']['genbank_accession'].
params:
accession=config['curate']['genbank_accession']
# Snakemake substitutes {params.accession} before the command runs, so
# `${params.accession}` reaches csvtk as `$<column name>`, i.e. a reference
# to that column inside the mutate2 expression.
shell:
"""
csvtk mutate2 -t \
-n url \
-e '"https://www.ncbi.nlm.nih.gov/nuccore/" + ${params.accession}' \
{input.metadata} \
> {output.metadata}
"""


rule subset_metadata:
input:
metadata="data/all_metadata.tsv",
metadata="data/all_metadata_added.tsv",
output:
subset_metadata="results/metadata.tsv",
params:
Expand Down
6 changes: 3 additions & 3 deletions ingest/rules/fetch_from_ncbi.smk
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,8 @@ rule format_ncbi_dataset_report:
--elide-header \
| csvtk fix-quotes -Ht \
| csvtk add-header -t -n {params.ncbi_datasets_fields:q} \
| csvtk rename -t -f accession -n accession-rev \
| csvtk -t mutate -f accession-rev -n accession -p "^(.+?)\." \
| csvtk rename -t -f accession -n accession_version \
| csvtk -t mutate -f accession_version -n accession -p "^(.+?)\." \
| csvtk del-quotes -t \
| tsv-select -H -f accession --rest last \
> {output.ncbi_dataset_tsv}
Expand All @@ -89,7 +89,7 @@ rule format_ncbi_datasets_ndjson:
augur curate passthru \
--metadata {input.ncbi_dataset_tsv} \
--fasta {input.ncbi_dataset_sequences} \
--seq-id-column accession-rev \
--seq-id-column accession_version \
--seq-field sequence \
--unmatched-reporting warn \
--duplicate-reporting warn \
Expand Down
70 changes: 35 additions & 35 deletions phylogenetic/example_data/metadata.tsv
Original file line number Diff line number Diff line change
@@ -1,35 +1,35 @@
strain virus genbank_accession date region country division city db segment authors
PAN/CDC_259359_V1_V3/2015 zika KX156774 2015-12-18 North America Panama Panama Panama genbank genome Shabman et al
COL/FLR_00024/2015 zika MF574569 2015-12-XX South America Colombia Colombia Colombia genbank genome Pickett et al
PRVABC59 zika KU501215 2015-12-XX North America Puerto Rico Puerto Rico Puerto Rico genbank genome Lanciotti et al
COL/FLR_00008/2015 zika MF574562 2015-12-XX South America Colombia Colombia Colombia genbank genome Pickett et al
Colombia/2016/ZC204Se zika KY317939 2016-01-06 South America Colombia Colombia Colombia genbank genome Quick et al
ZKC2/2016 zika KX253996 2016-02-16 Oceania American Samoa American Samoa American Samoa genbank genome Wu et al
VEN/UF_1/2016 zika KX702400 2016-03-25 South America Venezuela Venezuela Venezuela genbank genome Blohm et al
DOM/2016/BB_0059 zika KY785425 2016-04-04 North America Dominican Republic Dominican Republic Dominican Republic genbank genome Metsky et al
BRA/2016/FC_6706 zika KY785433 2016-04-08 South America Brazil Brazil Brazil genbank genome Metsky et al
DOM/2016/BB_0183 zika KY785420 2016-04-18 North America Dominican Republic Dominican Republic Dominican Republic genbank genome Metsky et al
EcEs062_16 zika KX879603 2016-04-XX South America Ecuador Ecuador Ecuador genbank genome Marquez et al
HND/2016/HU_ME59 zika KY785418 2016-05-13 North America Honduras Honduras Honduras genbank genome Metsky et al
DOM/2016/MA_WGS16_011 zika KY785484 2016-06-06 North America Dominican Republic Dominican Republic Dominican Republic genbank genome Metsky et al
DOM/2016/BB_0433 zika KY785441 2016-06-13 North America Dominican Republic Dominican Republic Dominican Republic genbank genome Metsky et al
USA/2016/FL022 zika KY075935 2016-07-22 North America Usa Usa Usa genbank genome Grubaugh et al
SG_027 zika KY241697 2016-08-27 Southeast Asia Singapore Singapore Singapore genbank genome Ho et al
SG_074 zika KY241744 2016-08-28 Southeast Asia Singapore Singapore Singapore genbank genome Ho et al
SG_056 zika KY241726 2016-08-28 Southeast Asia Singapore Singapore Singapore genbank genome Ho et al
USA/2016/FLUR022 zika KY325473 2016-08-31 North America Usa Usa Usa genbank genome Grubaugh et al
Aedes_aegypti/USA/2016/FL05 zika KY075937 2016-09-09 North America Usa Usa Usa genbank genome Grubaugh et al
SG_018 zika KY241688 2016-09-13 Southeast Asia Singapore Singapore Singapore genbank genome Ho et al
USA/2016/FLWB042 zika KY325478 2016-09-26 North America Usa Usa Usa genbank genome Grubaugh et al
COL/PRV_00028/2015 zika MF574578 2016-12-XX South America Colombia Colombia Colombia genbank genome Pickett et al
Thailand/1610acTw zika MF692778 2016-10-XX Southeast Asia Thailand Thailand Thailand genbank genome Lin et al
1_0087_PF zika KX447509 2013-12-XX Oceania French Polynesia French Polynesia French Polynesia genbank genome Pettersson et al
1_0199_PF zika KX447519 2013-11-XX Oceania French Polynesia French Polynesia French Polynesia genbank genome Pettersson et al
1_0181_PF zika KX447512 2013-12-XX Oceania French Polynesia French Polynesia French Polynesia genbank genome Pettersson et al
Brazil/2015/ZBRC301 zika KY558995 2015-05-13 South America Brazil Brazil Brazil genbank genome Faria et al
Brazil/2015/ZBRA105 zika KY558989 2015-02-23 South America Brazil Brazil Brazil genbank genome Faria et al
Brazil/2016/ZBRC16 zika KY558991 2016-01-19 South America Brazil Brazil Brazil genbank genome Faria et al
V8375 zika KU501217 2015-11-01 North America Guatemala Guatemala Guatemala genbank genome Lanciotti et al
Nica1_16 zika KX421195 2016-01-19 North America Nicaragua Nicaragua Nicaragua genbank genome Tabata et al
Brazil/2015/ZBRC303 zika KY558997 2015-05-14 South America Brazil Brazil Brazil genbank genome Faria et al
SMGC_1 zika KX266255 2016-02-14 Oceania American Samoa American Samoa American Samoa genbank genome Bi et al
strain virus accession date region country division city db segment authors url
PAN/CDC_259359_V1_V3/2015 zika KX156774 2015-12-18 North America Panama Panama Panama genbank genome Shabman et al https://www.ncbi.nlm.nih.gov/nuccore/KX156774
COL/FLR_00024/2015 zika MF574569 2015-12-XX South America Colombia Colombia Colombia genbank genome Pickett et al https://www.ncbi.nlm.nih.gov/nuccore/MF574569
PRVABC59 zika KU501215 2015-12-XX North America Puerto Rico Puerto Rico Puerto Rico genbank genome Lanciotti et al https://www.ncbi.nlm.nih.gov/nuccore/KU501215
COL/FLR_00008/2015 zika MF574562 2015-12-XX South America Colombia Colombia Colombia genbank genome Pickett et al https://www.ncbi.nlm.nih.gov/nuccore/MF574562
Colombia/2016/ZC204Se zika KY317939 2016-01-06 South America Colombia Colombia Colombia genbank genome Quick et al https://www.ncbi.nlm.nih.gov/nuccore/KY317939
ZKC2/2016 zika KX253996 2016-02-16 Oceania American Samoa American Samoa American Samoa genbank genome Wu et al https://www.ncbi.nlm.nih.gov/nuccore/KX253996
VEN/UF_1/2016 zika KX702400 2016-03-25 South America Venezuela Venezuela Venezuela genbank genome Blohm et al https://www.ncbi.nlm.nih.gov/nuccore/KX702400
DOM/2016/BB_0059 zika KY785425 2016-04-04 North America Dominican Republic Dominican Republic Dominican Republic genbank genome Metsky et al https://www.ncbi.nlm.nih.gov/nuccore/KY785425
BRA/2016/FC_6706 zika KY785433 2016-04-08 South America Brazil Brazil Brazil genbank genome Metsky et al https://www.ncbi.nlm.nih.gov/nuccore/KY785433
DOM/2016/BB_0183 zika KY785420 2016-04-18 North America Dominican Republic Dominican Republic Dominican Republic genbank genome Metsky et al https://www.ncbi.nlm.nih.gov/nuccore/KY785420
EcEs062_16 zika KX879603 2016-04-XX South America Ecuador Ecuador Ecuador genbank genome Marquez et al https://www.ncbi.nlm.nih.gov/nuccore/KX879603
HND/2016/HU_ME59 zika KY785418 2016-05-13 North America Honduras Honduras Honduras genbank genome Metsky et al https://www.ncbi.nlm.nih.gov/nuccore/KY785418
DOM/2016/MA_WGS16_011 zika KY785484 2016-06-06 North America Dominican Republic Dominican Republic Dominican Republic genbank genome Metsky et al https://www.ncbi.nlm.nih.gov/nuccore/KY785484
DOM/2016/BB_0433 zika KY785441 2016-06-13 North America Dominican Republic Dominican Republic Dominican Republic genbank genome Metsky et al https://www.ncbi.nlm.nih.gov/nuccore/KY785441
USA/2016/FL022 zika KY075935 2016-07-22 North America Usa Usa Usa genbank genome Grubaugh et al https://www.ncbi.nlm.nih.gov/nuccore/KY075935
SG_027 zika KY241697 2016-08-27 Southeast Asia Singapore Singapore Singapore genbank genome Ho et al https://www.ncbi.nlm.nih.gov/nuccore/KY241697
SG_074 zika KY241744 2016-08-28 Southeast Asia Singapore Singapore Singapore genbank genome Ho et al https://www.ncbi.nlm.nih.gov/nuccore/KY241744
SG_056 zika KY241726 2016-08-28 Southeast Asia Singapore Singapore Singapore genbank genome Ho et al https://www.ncbi.nlm.nih.gov/nuccore/KY241726
USA/2016/FLUR022 zika KY325473 2016-08-31 North America Usa Usa Usa genbank genome Grubaugh et al https://www.ncbi.nlm.nih.gov/nuccore/KY325473
Aedes_aegypti/USA/2016/FL05 zika KY075937 2016-09-09 North America Usa Usa Usa genbank genome Grubaugh et al https://www.ncbi.nlm.nih.gov/nuccore/KY075937
SG_018 zika KY241688 2016-09-13 Southeast Asia Singapore Singapore Singapore genbank genome Ho et al https://www.ncbi.nlm.nih.gov/nuccore/KY241688
USA/2016/FLWB042 zika KY325478 2016-09-26 North America Usa Usa Usa genbank genome Grubaugh et al https://www.ncbi.nlm.nih.gov/nuccore/KY325478
COL/PRV_00028/2015 zika MF574578 2016-12-XX South America Colombia Colombia Colombia genbank genome Pickett et al https://www.ncbi.nlm.nih.gov/nuccore/MF574578
Thailand/1610acTw zika MF692778 2016-10-XX Southeast Asia Thailand Thailand Thailand genbank genome Lin et al https://www.ncbi.nlm.nih.gov/nuccore/MF692778
1_0087_PF zika KX447509 2013-12-XX Oceania French Polynesia French Polynesia French Polynesia genbank genome Pettersson et al https://www.ncbi.nlm.nih.gov/nuccore/KX447509
1_0199_PF zika KX447519 2013-11-XX Oceania French Polynesia French Polynesia French Polynesia genbank genome Pettersson et al https://www.ncbi.nlm.nih.gov/nuccore/KX447519
1_0181_PF zika KX447512 2013-12-XX Oceania French Polynesia French Polynesia French Polynesia genbank genome Pettersson et al https://www.ncbi.nlm.nih.gov/nuccore/KX447512
Brazil/2015/ZBRC301 zika KY558995 2015-05-13 South America Brazil Brazil Brazil genbank genome Faria et al https://www.ncbi.nlm.nih.gov/nuccore/KY558995
Brazil/2015/ZBRA105 zika KY558989 2015-02-23 South America Brazil Brazil Brazil genbank genome Faria et al https://www.ncbi.nlm.nih.gov/nuccore/KY558989
Brazil/2016/ZBRC16 zika KY558991 2016-01-19 South America Brazil Brazil Brazil genbank genome Faria et al https://www.ncbi.nlm.nih.gov/nuccore/KY558991
V8375 zika KU501217 2015-11-01 North America Guatemala Guatemala Guatemala genbank genome Lanciotti et al https://www.ncbi.nlm.nih.gov/nuccore/KU501217
Nica1_16 zika KX421195 2016-01-19 North America Nicaragua Nicaragua Nicaragua genbank genome Tabata et al https://www.ncbi.nlm.nih.gov/nuccore/KX421195
Brazil/2015/ZBRC303 zika KY558997 2015-05-14 South America Brazil Brazil Brazil genbank genome Faria et al https://www.ncbi.nlm.nih.gov/nuccore/KY558997
SMGC_1 zika KX266255 2016-02-14 Oceania American Samoa American Samoa American Samoa genbank genome Bi et al https://www.ncbi.nlm.nih.gov/nuccore/KX266255
4 changes: 2 additions & 2 deletions phylogenetic/example_data/metadata_usvi.tsv
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
genbank_accession genbank_accession_rev accession strain date region country division location length host release_date update_date sra_accessions authors institution url
USVI/37/2016 VI37 USVI/37/2016 2016-10-06 North America Usvi Saint Croix Saint Croix 10807 Homo sapiens Black et al FH https://github.com/blab/zika-usvi/
accession accession_version strain date region country division location length host release_date update_date sra_accessions authors institution url
VI37 VI37 USVI/37/2016 2016-10-06 North America Usvi Saint Croix Saint Croix 10807 Homo sapiens Black et al FH https://github.com/blab/zika-usvi/
26 changes: 1 addition & 25 deletions phylogenetic/rules/merge_sequences_usvi.smk
Original file line number Diff line number Diff line change
Expand Up @@ -21,35 +21,11 @@ This part of the workflow usually includes the following steps:
"""

rule add_metadata_columns:
"""Add columns to metadata.

Reads data/metadata.tsv and writes data/metadata_modified.tsv with two
columns appended, both derived from the existing genbank_accession column.

Notable columns:
- genbank_accession: GenBank accession for Auspice to generate a URL to the NCBI GenBank record.
- [NEW] accession: The GenBank accession. Added to go alongside USVI accession.
- [NEW] url: URL linking to the NCBI GenBank record ('https://www.ncbi.nlm.nih.gov/nuccore/*'). Added to go alongside USVI url.
"""
input:
metadata = "data/metadata.tsv"
output:
metadata = "data/metadata_modified.tsv"
# Two chained csvtk mutate2 calls: the first appends `url` (nuccore base URL
# + genbank_accession), the second appends `accession` as a copy of
# genbank_accession. The second call reads the first call's output via the pipe.
shell:
"""
csvtk mutate2 -tl \
-n url \
-e '"https://www.ncbi.nlm.nih.gov/nuccore/" + $genbank_accession' \
{input.metadata} \
| csvtk mutate2 -tl \
-n accession \
-e '$genbank_accession' \
> {output.metadata}
"""

rule append_usvi:
"""Appending USVI sequences"""
input:
sequences = "data/sequences.fasta",
metadata = "data/metadata_modified.tsv",
metadata = "data/metadata.tsv",
usvi_sequences = "data/sequences_usvi.fasta",
usvi_metadata = "data/metadata_usvi.tsv"
output:
Expand Down

0 comments on commit de4b930

Please sign in to comment.