Skip to content

Commit

Permalink
Add measles-specific fixes to strain names
Browse files Browse the repository at this point in the history
  • Loading branch information
kimandrews committed Jan 22, 2024
1 parent c014e84 commit f0a5194
Show file tree
Hide file tree
Showing 2 changed files with 61 additions and 0 deletions.
60 changes: 60 additions & 0 deletions ingest/bin/fix-measles-strain-names.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
#! /usr/bin/env python3
"""
Parses GenBank's 'strain' field of the NDJSON record from stdin and applies measles-specific strain name corrections
based on historical modifications from the fauna repo.
Outputs the modified record to stdout.
"""

import argparse
import json
from sys import stdin, stdout

import re

def parse_args():
    """Parse this script's command-line options.

    Returns:
        The ``argparse`` namespace. Its ``strain_field`` attribute holds the
        value given to ``--strain-field`` (``"strain"`` when omitted).
    """
    cli = argparse.ArgumentParser(
        description="Modify measles strain names by referencing historical modifications from the fauna repo.",
    )
    cli.add_argument(
        "--strain-field",
        default="strain",
        help="Field from the records to use as the strain name to be fixed.",
    )

    options = cli.parse_args()
    return options


def _set_strain_name(record):
"""Replace spaces, dashes, and periods with underscores in strain name."""
strain_name = record["strain"]

strain_name = strain_name.replace('MVs/', '').replace('MVi/', '').replace('Mvi/', '')
strain_name = re.sub(r'[_ ]?\[([A-Z][0-9])\]$', r'/\1', strain_name)
strain_name = re.sub(r'\(([A-Z][0-9])\)$', r'/\1', strain_name)
strain_name = re.sub(r'_([A-Z][0-9])_$', r'/\1', strain_name)
strain_name = re.sub(r'[ ;]', r'_', strain_name)
strain_name = re.sub(r'//', r'/', strain_name)

try:
strain_name = 'V' + str(int(strain_name))
except ValueError:
pass

return (
strain_name.replace(" ", "_")
.replace("-", "_")
.replace(".", "_")
.replace("(", "_")
.replace(")", "_")
)


def main():
args = parse_args()

for index, record in enumerate(stdin):
record = json.loads(record)
record[args.strain_field] = _set_strain_name(record)
stdout.write(json.dumps(record) + "\n")


# Run the stdin -> stdout conversion only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()
1 change: 1 addition & 0 deletions ingest/rules/curate.smk
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ rule curate:
--abbr-authors-field {params.abbr_authors_field} \
| ./vendored/apply-geolocation-rules \
--geolocation-rules {input.all_geolocation_rules} \
| ./bin/fix-measles-strain-names.py \
| ./vendored/merge-user-metadata \
--annotations {input.annotations} \
--id-field {params.annotations_id} \
Expand Down

0 comments on commit f0a5194

Please sign in to comment.