#! /usr/bin/env python3
# Patch metadata preserved from the original diff header:
#   diff --git a/ingest/bin/fix-measles-strain-names.py b/ingest/bin/fix-measles-strain-names.py
#   new file mode 100755 index 0000000..ab407b0
"""
Reads NDJSON records from stdin, applies measles-specific strain name
corrections (based on historical modifications from the fauna repo) to the
field selected with --strain-field (GenBank's 'strain' field by default),
and writes each modified record to stdout.
"""

import argparse
import json
import re
from sys import stdin, stdout

# Substrings removed wherever they occur in the strain name.
_REMOVED_MARKERS = ('MVs/', 'MVi/', 'Mvi/')

# A trailing letter+digit code (presumably a genotype such as "D8" -- confirm
# against fauna) written as "[D8]", "(D8)" or "_D8_" is rewritten to "/D8".
_TRAILING_CODE_PATTERNS = (
    re.compile(r'[_ ]?\[([A-Z][0-9])\]$'),
    re.compile(r'\(([A-Z][0-9])\)$'),
    re.compile(r'_([A-Z][0-9])_$'),
)
_SPACE_OR_SEMICOLON = re.compile(r'[ ;]')
_DOUBLE_SLASH = re.compile(r'//')

# Final clean-up: each of these characters becomes an underscore.
_TO_UNDERSCORE = str.maketrans({char: "_" for char in " -.()"})


def parse_args():
    """Parse the command line and return the resulting argparse namespace."""
    parser = argparse.ArgumentParser(
        description="Modify measles strain names by referencing historical modifications from the fauna repo."
    )
    parser.add_argument("--strain-field", default='strain',
        help="Field from the records to use as the strain name to be fixed.")

    return parser.parse_args()


def _normalize_strain_name(strain_name):
    """Return *strain_name* with the measles-specific corrections applied.

    Steps, in order: drop the 'MVs/', 'MVi/' and 'Mvi/' markers; rewrite a
    trailing bracketed/parenthesised/underscore-wrapped letter+digit code to
    '/<code>'; turn spaces and semicolons into underscores; collapse '//'
    into '/'; prefix purely numeric names with 'V'; finally replace spaces,
    dashes, periods and parentheses with underscores.
    """
    for marker in _REMOVED_MARKERS:
        strain_name = strain_name.replace(marker, '')

    for pattern in _TRAILING_CODE_PATTERNS:
        strain_name = pattern.sub(r'/\1', strain_name)

    strain_name = _SPACE_OR_SEMICOLON.sub(r'_', strain_name)
    strain_name = _DOUBLE_SLASH.sub(r'/', strain_name)

    # Names that int() can parse get a 'V' prefix. Going through int() also
    # drops leading zeros ("007" -> "V7"); anything int() rejects is kept.
    try:
        strain_name = 'V' + str(int(strain_name))
    except ValueError:
        pass

    return strain_name.translate(_TO_UNDERSCORE)


def _set_strain_name(record, strain_field="strain"):
    """Return the corrected strain name for *record*.

    The name is read from ``record[strain_field]``, so the field that is read
    is the same one the caller overwrites. The record itself is not modified.

    Raises:
        KeyError: if *strain_field* is not a key of *record*.
    """
    return _normalize_strain_name(record[strain_field])


def main():
    """Fix the strain name of every NDJSON record on stdin; write to stdout."""
    args = parse_args()

    for line in stdin:
        # Tolerate blank lines (e.g. a trailing newline) instead of failing
        # inside json.loads.
        if not line.strip():
            continue
        record = json.loads(line)
        record[args.strain_field] = _set_strain_name(record, args.strain_field)
        stdout.write(json.dumps(record) + "\n")


if __name__ == "__main__":
    main()

# Patch metadata preserved from the original diff; the next file's header
# continues on the following line:
#   diff --git a/ingest/rules/curate.smk b/ingest/rules/curate.smk index 6a7168a..9605a29 100644 --- a/ingest/rules/curate.smk +++
b/ingest/rules/curate.smk @@ -105,6 +105,7 @@ rule curate: --abbr-authors-field {params.abbr_authors_field} \ | ./vendored/apply-geolocation-rules \ --geolocation-rules {input.all_geolocation_rules} \ + | ./bin/fix-measles-strain-names.py \ | ./vendored/merge-user-metadata \ --annotations {input.annotations} \ --id-field {params.annotations_id} \