-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add measles-specific fixes to strain names
- Loading branch information
1 parent
c014e84
commit f0a5194
Showing
2 changed files
with
61 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
#! /usr/bin/env python3 | ||
""" | ||
Reads NDJSON records from stdin, applies measles-specific corrections to the
GenBank strain name of each record (based on historical modifications from the
fauna repo), and writes each modified record to stdout as one JSON line.
""" | ||
|
||
import argparse | ||
import json | ||
from sys import stdin, stdout | ||
|
||
import re | ||
|
||
def parse_args():
    """Build the command-line parser and return the parsed arguments.

    The only option is ``--strain-field`` (default ``'strain'``), which names
    the record field holding the strain name to be fixed.
    """
    cli = argparse.ArgumentParser(
        description="Modify measles strain names by referencing historical modifications from the fauna repo.",
    )
    cli.add_argument(
        "--strain-field",
        default='strain',
        help="Field from the records to use as the strain name to be fixed.",
    )

    parsed = cli.parse_args()
    return parsed
|
||
|
||
# Characters flattened to underscores as the final clean-up step.
_UNDERSCORE_TABLE = str.maketrans({char: "_" for char in " -.()"})


def _set_strain_name(record, strain_field="strain"):
    """Return the corrected measles strain name for *record*.

    Steps, applied in order to ``record[strain_field]``:

    1. Drop the ``MVs/``, ``MVi/`` and ``Mvi/`` markers wherever they occur.
    2. Rewrite a trailing two-character tag (capital letter + digit, e.g.
       ``[D8]``, ``(D8)`` or ``_D8_``; presumably the genotype) as ``/D8``.
    3. Turn spaces and semicolons into underscores and collapse ``//`` to ``/``.
    4. Prefix names that are a plain integer with ``V`` (``"123"`` -> ``"V123"``).
    5. Replace any remaining space, dash, period or parenthesis with ``_``.

    Args:
        record: Mapping that holds the strain name under *strain_field*.
        strain_field: Key of the strain name in *record*. Defaults to
            ``"strain"`` so existing one-argument calls keep working.

    Returns:
        The corrected strain name; *record* itself is not modified.

    Raises:
        KeyError: If *record* has no *strain_field* entry.
    """
    strain_name = record[strain_field]

    strain_name = strain_name.replace('MVs/', '').replace('MVi/', '').replace('Mvi/', '')
    strain_name = re.sub(r'[_ ]?\[([A-Z][0-9])\]$', r'/\1', strain_name)
    strain_name = re.sub(r'\(([A-Z][0-9])\)$', r'/\1', strain_name)
    strain_name = re.sub(r'_([A-Z][0-9])_$', r'/\1', strain_name)
    strain_name = re.sub(r'[ ;]', r'_', strain_name)
    strain_name = re.sub(r'//', r'/', strain_name)

    # int() accepts digit-group underscores ("12_34" -> 1234). Spaces and
    # semicolons are already underscores here, so without this guard a name
    # like "12 34" would silently collapse into "V1234".
    if "_" not in strain_name:
        try:
            strain_name = 'V' + str(int(strain_name))
        except ValueError:
            # Not a purely numeric name: keep it as is.
            pass

    return strain_name.translate(_UNDERSCORE_TABLE)


def main():
    """Fix the strain name of every NDJSON record read from stdin.

    Each input line is parsed as one JSON object, the field named by
    ``--strain-field`` is replaced with its corrected value, and the record is
    written to stdout as a single JSON line.
    """
    args = parse_args()

    for line in stdin:
        record = json.loads(line)
        # Read and write the same configurable field.
        record[args.strain_field] = _set_strain_name(record, args.strain_field)
        stdout.write(json.dumps(record) + "\n")
|
||
|
||
# Run the command-line entry point only when executed as a script, not on import.
if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters