Skip to content

Commit

Permalink
Rescue fauna data processing steps that are specific to Zika
Browse files Browse the repository at this point in the history
Rescue some of the original functionality of the zika_upload script from fauna.
https://github.com/nextstrain/fauna/blob/master/vdb/zika_upload.py#L14-L30
  • Loading branch information
j23414 committed Nov 13, 2023
1 parent a386f7d commit bf96ccf
Show file tree
Hide file tree
Showing 4 changed files with 304 additions and 2 deletions.
63 changes: 63 additions & 0 deletions ingest/bin/post_process_metadata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
#! /usr/bin/env python3

import argparse
import json
from sys import stdin, stdout

import re

def parse_args(argv=None):
    """Parse the command-line options of the post-processing step.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default),
            argparse reads the arguments from ``sys.argv[1:]``.

    Returns:
        An ``argparse.Namespace`` with the attribute ``accession_field``.
    """
    parser = argparse.ArgumentParser(
        description=(
            "Post-process NDJSON metadata records read from stdin: clean up "
            "Zika strain names, add a 'url' field built from the accession, "
            "and copy 'abbr_authors' into 'authors'."
        )
    )
    parser.add_argument(
        "--accession-field",
        default="accession",
        help="Field from the records that holds the accession used to build "
             "the NCBI nuccore URL.",
    )

    return parser.parse_args(argv)


# Substrings removed from strain names, applied strictly in this order.
# Order matters: e.g. 'Zika_virus' has to be removed before 'Zika', and
# '_Asian' before '_Asia', otherwise fragments would be left behind.
_STRAIN_TOKENS_TO_DROP = (
    # Virus name variants.
    'Zika_virus', 'Zikavirus', 'Zika virus', 'Zika', 'ZIKV',
    # Human host annotations.
    'Human', 'human', 'H.sapiens_wt', 'H.sapiens-wt', 'H.sapiens_tc',
    'Hsapiens_tc', 'H.sapiens-tc', 'Homo_sapiens', 'Homo sapiens',
    'Hsapiens', 'H.sapiens',
    '/Hu/',
    '_Asian', '_Asia', '_asian', '_asia',
    # NOTE(review): presumably sample-type suffixes -- confirm.
    '_URI', '-URI', '_SER', '-SER', '_PLA', '-PLA', '_MOS', '_SAL',
)

# (old, new) pairs applied in order after the removals above.
_STRAIN_TOKEN_REWRITES = (
    # Expand abbreviated mosquito host names.
    ('Aaegypti_wt', 'Aedes_aegypti'),
    ('Aedessp', 'Aedes_sp'),
    # Drop punctuation and collapse doubled separators (single pass each).
    (' ', ''),
    ('\'', ''),
    ('(', ''),
    (')', ''),
    ('//', '/'),
    ('__', '_'),
    ('.', ''),
    (',', ''),
)

# A single leading '/', '_' or '-' left over after the removals above.
_LEADING_SEPARATOR = re.compile(r'^[/_-]')


def _set_strain_name(record):
    """Return a normalized strain name for a Zika record.

    The value of ``record["strain"]`` is cleaned up in these steps:

    1. Virus-name, host, and other redundant substrings are removed.
    2. Abbreviated mosquito host names are expanded.
    3. Spaces, quotes, parentheses, periods and commas are removed, and
       ``//`` / ``__`` are collapsed once.
    4. One leading ``/``, ``_`` or ``-`` is stripped.
    5. A name that parses as an integer becomes ``'V' + <integer>``.
    6. Remaining dashes are replaced with underscores.

    Args:
        record: Mapping with a string value under the key ``"strain"``.

    Returns:
        The cleaned strain name. ``record`` itself is not modified.

    Raises:
        KeyError: If ``record`` has no ``"strain"`` key.
    """
    strain_name = record["strain"]

    for token in _STRAIN_TOKENS_TO_DROP:
        strain_name = strain_name.replace(token, '')

    for old, new in _STRAIN_TOKEN_REWRITES:
        strain_name = strain_name.replace(old, new)

    strain_name = _LEADING_SEPARATOR.sub('', strain_name)

    # Purely numeric names get a 'V' prefix. Going through int() also
    # normalizes the number (e.g. leading zeros are dropped).
    try:
        strain_name = 'V' + str(int(strain_name))
    except ValueError:
        # Not a number: keep the cleaned name as is.
        pass

    # Spaces, periods and parentheses were already removed above, so only
    # dashes are left to convert.
    return strain_name.replace("-", "_")


def _set_url(record, accession_field='accession'):
    """Build the NCBI nuccore URL for a record.

    Args:
        record: Mapping that holds the accession under ``accession_field``.
        accession_field: Key of ``record`` to read the accession from.

    Returns:
        The nuccore base URL followed by the accession converted to ``str``.
    """
    base_url = "https://www.ncbi.nlm.nih.gov/nuccore/"
    accession = str(record[accession_field])
    return base_url + accession


def main():
    """Post-process NDJSON records from stdin and write them to stdout.

    Each input line is parsed as one JSON object. For every record the
    "strain" field is replaced by its cleaned-up form, a "url" field is
    added from the configured accession field, and "authors" is overwritten
    with the value of "abbr_authors". The record is then written back out
    as a single JSON line.
    """
    args = parse_args()

    for line in stdin:
        record = json.loads(line)
        record["strain"] = _set_strain_name(record)
        record["url"] = _set_url(record, args.accession_field)
        # Use the abbreviated author list as the authors value.
        record["authors"] = record["abbr_authors"]
        stdout.write(json.dumps(record) + "\n")


# Run the filter only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
3 changes: 2 additions & 1 deletion ingest/config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ transform:
'sra_accessions',
'abbr_authors',
'authors',
'institution'
'institution',
'url',
]

238 changes: 237 additions & 1 deletion ingest/source-data/annotations.tsv
Original file line number Diff line number Diff line change
@@ -1 +1,237 @@

KX922703 strain USA/2016/FL021
KY765326 strain NIC/6188_13A1/2016
KX922707 strain USA/2016/FL039
KU922923 strain MEX/InDRE/2016
KY075934 strain PuertoRico/2016/FL016U
KY765327 strain NIC/5005_13A1/2016
KX922705 strain USA/2016/FL032
KY075938 strain Aedes_aegypti/USA/2016/FL06
KX922704 strain USA/2016/FL030
KX673530 strain PHE_Guadeloupe
KY075935 strain USA/2016/FL022
KX838906 strain Aedes_aegypti/USA/2016/FL03
KY075933 strain PuertoRico/2016/FL008U
KX838904 strain Aedes_aegypti/USA/2016/FL01
KX838905 strain Aedes_aegypti/USA/2016/FL02
KY765320 strain NIC/6406_13A1/2016
KY075936 strain USA/2016/FL036
KY075932 strain Martinique/2016/FL001Sa
KY765321 strain NIC/4886_12A1/2016
KY075939 strain Aedes_aegypti/USA/2016/FL08
KX922706 strain USA/2016/FL038
KY075937 strain Aedes_aegypti/USA/2016/FL05
KX922708 strain Aedes_aegypti/USA/2016/FL04
KY014295 strain USA/2016/FL010
MT377503 strain V151144
MF988734 strain SG_EHI_/33164Y17
KU853013 strain Dominican_Republic/2016/PD2
KY785443 strain USA/2016/FL028
KX906952 strain 2016_HND_19563
KY120348 strain MEX_CIENI551
KX856011 strain Aedes_sp/MEX_I_44/2016
KY785421 strain USA/2016/FL019
KU527068 strain Natal_RGN
MF438286 strain Cuba_2017
KF993678 strain THA/PLCal_ZV/2013
KY631494 strain ENCB165P4
KY785440 strain USA/2016/FL035
KY785451 strain Martinique/2016/FL001
MF664436 strain Dominican_Republic/2016/ZB
KY648934 strain Aedes_aegypti/MEX/MEX_I_44/2016
KX879603 strain EC/Esmeraldas/062/2016
OL414716 strain Faranah/18
MN185326 strain French_Guiana_Aedes_aegypti_T1010
MN185328 strain French_Guiana_Aedes_aegypti_T1141
KX827268 strain USA/UT_1/2016
KU853012 strain Dominican_Republic/2016/PD1
MK028857 strain Puerto_Rico/2015/PRVABC59
KY785457 strain USA/2016/FL029
MH513600 strain BR/Sinop/H366_2P/2015
KY927808 strain ZZ_1
KX087102 strain COL/FLR/2015
KX879604 strain EC/Esmeraldas/089/2016
KF993678 country Thailand
KF993678 division Thailand
KF993678 location Thailand
KF993678 region Southeast Asia
KU647676 country Martinique
KU647676 division Martinique
KU647676 location Martinique
KU647676 region North America
KU740184 country Venezuela
KU740184 division Venezuela
KU740184 location Venezuela
KU740184 region South America
KU744693 country Venezuela
KU744693 division Venezuela
KU744693 location Venezuela
KU744693 region South America
KU758877 country French Guiana
KU758877 division French Guiana
KU758877 location French Guiana
KU758877 region South America
KU761560 country American Samoa
KU761560 division American Samoa
KU761560 location American Samoa
KU761560 region Oceania
KU761561 country American Samoa
KU761561 division American Samoa
KU761561 location American Samoa
KU761561 region Oceania
KU761564 country Venezuela
KU761564 division Venezuela
KU761564 location Venezuela
KU761564 region South America
KU820898 country Venezuela
KU820898 division Venezuela
KU820898 location Venezuela
KU820898 region South America
KU853012 country Dominican Republic
KU853012 division Dominican Republic
KU853012 location Dominican Republic
KU853012 region North America
KU866423 country American Samoa
KU866423 division American Samoa
KU866423 location American Samoa
KU866423 region Oceania
KU955589 country American Samoa
KU955589 division American Samoa
KU955589 location American Samoa
KU955589 region Oceania
KU955590 country Venezuela
KU955590 division Venezuela
KU955590 location Venezuela
KU955590 region South America
KU963796 country American Samoa
KU963796 division American Samoa
KU963796 location American Samoa
KU963796 region Oceania
KU991811 country Brazil
KU991811 division Brazil
KU991811 location Brazil
KU991811 region South America
KX056898 country Venezuela
KX056898 division Venezuela
KX056898 location Venezuela
KX056898 region South America
KX117076 country American Samoa
KX117076 division American Samoa
KX117076 location American Samoa
KX117076 region Oceania
KX185891 country American Samoa
KX185891 division American Samoa
KX185891 location American Samoa
KX185891 region Oceania
KX253996 country American Samoa
KX253996 division American Samoa
KX253996 location American Samoa
KX253996 region Oceania
KX266255 country American Samoa
KX266255 division American Samoa
KX266255 location American Samoa
KX266255 region Oceania
KX269878 country Haiti
KX269878 division Haiti
KX269878 location Haiti
KX269878 region North America
KX673530 country Guadeloupe
KX673530 division Guadeloupe
KX673530 location Guadeloupe
KX673530 region North America
KY120352 country Brazil
KY120352 division Brazil
KY120352 location Brazil
KY120352 region South America
KY120353 country Philippines
KY120353 division Philippines
KY120353 location Philippines
KY120353 region Southeast Asia
KY553111 country Philippines
KY553111 division Philippines
KY553111 location Philippines
KY553111 region Southeast Asia
KY785451 country Martinique
KY785451 division Martinique
KY785451 location Martinique
KY785451 region North America
KY785454 country El Salvador
KY785454 division El Salvador
KY785454 location El Salvador
KY785454 region North America
KY962729 country Philippines
KY962729 division Philippines
KY962729 location Philippines
KY962729 region Southeast Asia
LC191864 country Fiji
LC191864 division Fiji
LC191864 location Fiji
LC191864 region Oceania
LC219720 country Vietnam
LC219720 division Vietnam
LC219720 location Vietnam
LC219720 region Southeast Asia
LC369584 country Thailand
LC369584 division Thailand
LC369584 location Thailand
LC369584 region Southeast Asia
MF098764 country Dominican Republic
MF098764 division Dominican Republic
MF098764 location Dominican Republic
MF098764 region North America
MF098765 country Dominican Republic
MF098765 division Dominican Republic
MF098765 location Dominican Republic
MF098765 region North America
MF098766 country Dominican Republic
MF098766 division Dominican Republic
MF098766 location Dominican Republic
MF098766 region North America
MF098767 country Saint Barthelemy
MF098767 division Saint Barthelemy
MF098767 location Saint Barthelemy
MF098767 region North America
MF098768 country Dominican Republic
MF098768 division Dominican Republic
MF098768 location Dominican Republic
MF098768 region North America
MF098769 country Dominican Republic
MF098769 division Dominican Republic
MF098769 location Dominican Republic
MF098769 region North America
MF098770 country Mexico
MF098770 division Mexico
MF098770 location Mexico
MF098770 region North America
MF098771 country Mexico
MF098771 division Mexico
MF098771 location Mexico
MF098771 region North America
MF593625 country Guatemala
MF593625 division Guatemala
MF593625 location Guatemala
MF593625 region North America
MF664436 country Dominican Republic
MF664436 division Dominican Republic
MF664436 location Dominican Republic
MF664436 region North America
MF692778 country Thailand
MF692778 division Thailand
MF692778 location Thailand
MF692778 region Southeast Asia
MF988734 country Cuba
MF988734 division Cuba
MF988734 location Cuba
MF988734 region North America
MK829154 country Angola
MK829154 division Angola
MK829154 location Angola
MK829154 region Africa
MN185326 country French Guiana
MN185326 division French Guiana
MN185326 location French Guiana
MN185326 region South America
MN185328 country French Guiana
MN185328 division French Guiana
MN185328 location French Guiana
MN185328 region South America
KY328289 date 2016-05-15
2 changes: 2 additions & 0 deletions ingest/workflow/snakemake_rules/transform.smk
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,8 @@ rule transform:
--abbr-authors-field {params.abbr_authors_field} \
| ./vendored/apply-geolocation-rules \
--geolocation-rules {input.all_geolocation_rules} \
| ./bin/post_process_metadata.py \
--accession-field {params.id_field} \
| ./vendored/merge-user-metadata \
--annotations {input.annotations} \
--id-field {params.annotations_id} \
Expand Down

0 comments on commit bf96ccf

Please sign in to comment.