diff --git a/cojac/generate_sigs_nextstrains.py b/cojac/generate_sigs_nextstrains.py
new file mode 100755
index 0000000..20e48d1
--- /dev/null
+++ b/cojac/generate_sigs_nextstrains.py
@@ -0,0 +1,175 @@
+#!/usr/bin/env python3
+"""Generate cojac variant signature YAML files from a lineage summary JSON."""
+
+import os
+import json
+import requests
+from datetime import datetime
+import yaml
+import click
+
+
+def load_dataset(url):
+    """Download the dataset at *url* and decode it as JSON.
+
+    Args:
+        url: Address of the JSON document to fetch.
+
+    Returns:
+        The decoded JSON document.
+
+    Raises:
+        requests.RequestException: If the request fails, times out or the
+            server answers with an HTTP error status.
+    """
+    # Bound the wait, and fail on 4xx/5xx instead of parsing an error page.
+    response = requests.get(url, timeout=60)
+    response.raise_for_status()
+
+    return json.loads(response.content)
+
+
+def prepare_header(data, url):
+    """Build the ``variant`` header section for one lineage.
+
+    Args:
+        data: Lineage entry; ``nextstrainClade`` and ``lineage`` are read.
+        url: Address recorded as the reference of the signature.
+
+    Returns:
+        A dict holding a single ``variant`` key.
+    """
+    lineage = data["lineage"]
+    return {
+        "variant": {
+            "nextstrain": data["nextstrainClade"],
+            "pangolin": lineage,
+            "short": lineage.lower().replace(".", "_"),
+            "reference": {
+                "address": url,
+                "accessed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+            },
+        }
+    }
+
+
+def prepare_mut(data):
+    """Build the ``mut`` section from the ``nucSubstitutions`` of a lineage.
+
+    An entry such as ``"C241T"`` becomes ``{241: "C>T"}``.
+
+    Args:
+        data: Lineage entry; ``nucSubstitutions`` is read.
+
+    Returns:
+        ``{"mut": {position: "ref>alt"}}``, or ``""`` when there is no
+        substitution to report.
+    """
+    # Blank entries are skipped so that both [""] and [] mean "nothing".
+    mut_dict = {
+        int(item[1:-1]): f"{item[0]}>{item[-1]}"
+        for item in data["nucSubstitutions"]
+        if item
+    }
+    return {"mut": mut_dict} if mut_dict else ""
+
+
+def prepare_del(data):
+    """Build the ``del`` section from the ``nucDeletions`` of a lineage.
+
+    An entry is either a single position (``"28271"``) or a range
+    (``"21765-21770"``). Each becomes one key, the first deleted position,
+    whose value holds one ``-`` per deleted base.
+
+    Args:
+        data: Lineage entry; ``nucDeletions`` is read.
+
+    Returns:
+        ``{"del": {position: dashes}}``, or ``""`` when there is no
+        deletion to report.
+    """
+    del_dict = {}
+    for item in data["nucDeletions"]:
+        if not item:
+            # Blank entries: both [""] and [] mean "nothing".
+            continue
+        if "-" in item:
+            start, end = map(int, item.split("-"))
+            # Both ends of the range are deleted, hence the +1.
+            length = end - start + 1
+        else:
+            start, length = int(item), 1
+        del_dict[start] = "-" * length
+    return {"del": del_dict} if del_dict else ""
+
+
+def prepare_yaml(data, url):
+    """Assemble the complete signature of one lineage.
+
+    Args:
+        data: Lineage entry, see :func:`prepare_header`,
+            :func:`prepare_mut` and :func:`prepare_del`.
+        url: Address recorded as the reference of the signature.
+
+    Returns:
+        The header dict, extended with ``mut`` and/or ``del`` when present.
+    """
+    output_data = prepare_header(data, url)
+
+    mut = prepare_mut(data)
+    if mut != "":
+        output_data["mut"] = mut["mut"]
+
+    del_section = prepare_del(data)
+    if del_section != "":
+        output_data["del"] = del_section["del"]
+
+    return output_data
+
+
+def process_dataset(full_data, outdir, url):
+    """Write one ``<short name>.yaml`` signature per lineage into *outdir*.
+
+    Args:
+        full_data: Mapping whose values are lineage entries.
+        outdir: Output directory; created if missing.
+        url: Address recorded as the reference of each signature.
+    """
+    # exist_ok avoids the check-then-create race of os.path.exists().
+    os.makedirs(outdir, exist_ok=True)
+    for data in full_data.values():
+        output_data = prepare_yaml(data, url)
+        yaml_path = os.path.join(outdir, f"{output_data['variant']['short']}.yaml")
+        with open(yaml_path, "w") as yaml_file:
+            yaml.dump(output_data, yaml_file, sort_keys=False)
+
+
+@click.command(
+    help="Generating a list of variants from nextstrain",
+    epilog="This tool fetches a JSON from GitHub",
+)
+@click.option(
+    "-o",
+    "--outdir",
+    required=False,
+    default="voc_nextstrain",
+    type=str,
+    help="The output directory for the YAML files",
+)
+@click.option(
+    "-u",
+    "--url",
+    metavar="URL",
+    required=False,
+    default="https://raw.githubusercontent.com/corneliusroemer/pango-sequences/main/data/pango-consensus-sequences_summary.json",
+    type=str,
+    help="url to fetch the JSON from",
+)
+def generate_sigs_nextstrains(outdir, url):
+    """Fetch the lineage summary from *url* and write signatures to *outdir*."""
+    full_data = load_dataset(url)
+    process_dataset(full_data, outdir, url)
+
+
+if __name__ == "__main__":
+    generate_sigs_nextstrains()
diff --git a/cojac/main.py b/cojac/main.py
index b0c83ef..30b6238 100644
--- a/cojac/main.py
+++ b/cojac/main.py
@@ -10,6 +10,7 @@
 from .cooc_tabmut import cooc_tabmut
 from .phe2cojac import phe2cojac
 from .sig_generate import sig_generate
+from .generate_sigs_nextstrains import generate_sigs_nextstrains
 
 
 @click.group(context_settings=CONTEXT_SETTINGS)
@@ -26,3 +27,4 @@ def cli():
 cli.add_command(cooc_tabmut)
 cli.add_command(phe2cojac)
 cli.add_command(sig_generate)
+cli.add_command(generate_sigs_nextstrains)
diff --git a/generate_sigs_nextstrains.py b/generate_sigs_nextstrains.py
deleted file mode 100755
index 8aa178f..0000000
--- a/generate_sigs_nextstrains.py
+++ /dev/null
@@ -1,90 +0,0 @@
-import os
-import json
-import requests
-from datetime import datetime
-import yaml
-import argparse
-
-def load_dataset():
-    url = 'https://raw.githubusercontent.com/corneliusroemer/pango-sequences/main/data/pango-consensus-sequences_summary.json'
-    response = requests.get(url)
-    full_data = json.loads(response.content)
-
-    return full_data
-
-def prepare_header(data):
-    return {
-        'variant': {
-            'nextstrain': data['nextstrainClade'],
-            'pangolin': data['lineage'],
-            'short': data['lineage'].lower().replace('.', '_'),
-            'reference': {
-                'address': 'https://raw.githubusercontent.com/corneliusroemer/pango-sequences/main/data/pango-consensus-sequences_summary.json',
-                'accessed_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
-            }
-        }
-    }
-
-def prepare_mut(data):
-    mut_dict = {}
-    if data['nucSubstitutions'] != ['']:
-        for item in data['nucSubstitutions']:
-            start_letter = item[0]
-            end_letter = item[-1]
-            number = int(item[1:-1])
-            mut_dict[number] = f"{start_letter}>{end_letter}"
-        return {'mut': mut_dict}
-    else:
-        return ''
-
-def prepare_del(data):
-    del_dict = {}
-    if data['nucDeletions'] != ['']:
-        for item in data['nucDeletions']:
-            if '-' in item:
-                start, end = map(int, item.split('-'))
-                key = start
-                value = str((end - start) * '-')
-            else:
-                key = int(item)
-                value = '-'
-            del_dict[key] = value
-        return {'del': del_dict}
-    else:
-        return ''
-
-def prepare_yaml(data):
-    header = prepare_header(data)
-    mut = prepare_mut(data)
-    del_section = prepare_del(data)
-
-    output_data = header
-
-    if mut != '':
-        output_data['mut'] = mut['mut']
-
-    if del_section != '':
-        output_data['del'] = del_section['del']
-
-    return output_data
-
-def process_dataset(full_data, outdir):
-    if not os.path.exists(outdir):
-        os.makedirs(outdir)
-    for data in full_data.values():
-        output_data = prepare_yaml(data)
-        with open(f"{outdir}/{output_data['variant']['short']}.yaml", "w") as yaml_file:
-            yaml.dump(output_data, yaml_file, sort_keys=False)
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--outdir', default='voc_nextstrain', help='The output directory for the YAML files')
-    args = parser.parse_args()
-
-    outdir = args.outdir
-
-    full_data = load_dataset()
-    process_dataset(full_data, outdir)
-
-if __name__ == '__main__':
-    main()