Skip to content

Commit

Permalink
Polishing nextstrain importer
Browse files Browse the repository at this point in the history
  • Loading branch information
DrYak committed May 30, 2024
1 parent 3f63e2a commit 30916ca
Show file tree
Hide file tree
Showing 3 changed files with 112 additions and 90 deletions.
110 changes: 110 additions & 0 deletions cojac/generate_sigs_nextstrains.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
#!/usr/bin/env python3

import os
import json
import requests
from datetime import datetime
import yaml
import click


def load_dataset(url, timeout=30):
    """Download the JSON document at *url* and return it decoded.

    Args:
        url: Address of the JSON document to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON content of the response.

    Raises:
        requests.exceptions.HTTPError: If the server answers with a 4xx/5xx
            status code.
        requests.exceptions.Timeout: If the server does not answer within
            *timeout* seconds.
        json.JSONDecodeError: If the response body is not valid JSON.
    """
    # requests has no default timeout: without one, an unresponsive server
    # would block this call forever.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an HTTP error instead of feeding an error page to the
    # JSON parser.
    response.raise_for_status()
    return json.loads(response.content)


def prepare_header(data, url):
    """Build the ``variant`` header of a signature document.

    Args:
        data: Mapping for one variant; the ``nextstrainClade`` and
            ``lineage`` entries are read.
        url: Address the data was fetched from, recorded as the reference.

    Returns:
        A dict with a single ``variant`` key holding the clade, the lineage,
        a short name derived from the lineage (lower-cased, dots replaced by
        underscores) and the reference with the current local timestamp.
    """
    clade = data["nextstrainClade"]
    lineage = data["lineage"]
    short_name = lineage.lower().replace(".", "_")
    accessed_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    reference = {"address": url, "accessed_at": accessed_at}
    variant = {
        "nextstrain": clade,
        "pangolin": lineage,
        "short": short_name,
        "reference": reference,
    }
    return {"variant": variant}


def prepare_mut(data):
    """Convert the nucleotide substitutions of a variant into a ``mut`` section.

    Each entry of ``data["nucSubstitutions"]`` is a string made of the
    original base, the position and the new base (e.g. ``"C241T"``); it is
    turned into ``{241: "C>T"}``.

    Args:
        data: Mapping for one variant; the ``nucSubstitutions`` entry is read.

    Returns:
        ``{"mut": {position: "X>Y", ...}}``, or the empty string ``""`` when
        the variant carries no substitution.

    Raises:
        KeyError: If ``data`` has no ``nucSubstitutions`` entry.
        ValueError: If the position part of an entry is not an integer.
    """
    mut_dict = {}
    for item in data["nucSubstitutions"]:
        # "No substitutions" may show up as [""] or as []; empty entries
        # carry no information either way, so skip them.
        if not item:
            continue
        mut_dict[int(item[1:-1])] = f"{item[0]}>{item[-1]}"
    # Keep the "" sentinel callers compare against when nothing was found.
    return {"mut": mut_dict} if mut_dict else ""


def prepare_del(data):
    """Convert the nucleotide deletions of a variant into a ``del`` section.

    Each entry of ``data["nucDeletions"]`` is either a single position
    (``"28271"``) or a ``"start-end"`` range (``"21765-21770"``). The result
    maps the first deleted position to a run of ``-`` characters, one per
    deleted base.

    Args:
        data: Mapping for one variant; the ``nucDeletions`` entry is read.

    Returns:
        ``{"del": {position: "---", ...}}``, or the empty string ``""`` when
        the variant carries no deletion.

    Raises:
        KeyError: If ``data`` has no ``nucDeletions`` entry.
        ValueError: If a position is not an integer.
    """
    del_dict = {}
    for item in data["nucDeletions"]:
        # "No deletions" may show up as [""] or as []; skip empty entries.
        if not item:
            continue
        start, dash, end = item.partition("-")
        first = int(start)
        # Both ends of a range are deleted positions, so the length is
        # end - start + 1 (e.g. 21765-21770 removes 6 bases, not 5).
        length = int(end) - first + 1 if dash else 1
        del_dict[first] = "-" * length
    # Keep the "" sentinel callers compare against when nothing was found.
    return {"del": del_dict} if del_dict else ""


def prepare_yaml(data, url):
    """Assemble the full signature document for one variant.

    Starts from the header produced by ``prepare_header`` and adds the
    ``mut`` and ``del`` sections when ``prepare_mut`` / ``prepare_del``
    return something other than the empty string.

    Args:
        data: Mapping for one variant.
        url: Address the data was fetched from.

    Returns:
        The header dict, extended in place with the optional sections.
    """
    document = prepare_header(data, url)
    optional_sections = (
        ("mut", prepare_mut(data)),
        ("del", prepare_del(data)),
    )
    for section_name, section in optional_sections:
        # The helpers signal "nothing to report" with an empty string.
        if section == "":
            continue
        document[section_name] = section[section_name]
    return document


def process_dataset(full_data, outdir, url):
    """Write one YAML signature file per variant into *outdir*.

    Args:
        full_data: Mapping whose values are the per-variant records.
        outdir: Directory receiving the files; created if missing.
        url: Address the data was fetched from, stored in each file.

    Each file is named after the variant's short name, with a ``.yaml``
    extension.
    """
    # exist_ok avoids the check-then-create race of testing for the
    # directory first, and also creates missing parent directories.
    os.makedirs(outdir, exist_ok=True)
    for data in full_data.values():
        output_data = prepare_yaml(data, url)
        yaml_path = os.path.join(outdir, f"{output_data['variant']['short']}.yaml")
        with open(yaml_path, "w") as yaml_file:
            # sort_keys=False keeps the sections in the order they were built.
            yaml.dump(output_data, yaml_file, sort_keys=False)


@click.command(
    help="Generating a list of variants from nextstrain",
    epilog="This tool fetches a JSON from Github",
)
@click.option(
    "-o",
    "--outdir",
    required=False,
    default="voc_nextstrain",
    type=str,
    help="The output directory for the YAML files",
)
@click.option(
    "-u",
    "--url",
    metavar="URL",
    required=False,
    default="https://raw.githubusercontent.com/corneliusroemer/pango-sequences/main/data/pango-consensus-sequences_summary.json",
    type=str,
    help="url to fetch the JSON from",
)
def generate_sigs_nextstrains(outdir, url):
    """Fetch the variant summary JSON and write one YAML file per variant.

    Args:
        outdir: Directory receiving the generated YAML files.
        url: Address of the JSON document to download.
    """
    # The command-line help text comes from the explicit ``help=`` above,
    # not from this docstring.
    full_data = load_dataset(url)
    process_dataset(full_data, outdir, url)
2 changes: 2 additions & 0 deletions cojac/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from .cooc_tabmut import cooc_tabmut
from .phe2cojac import phe2cojac
from .sig_generate import sig_generate
from .generate_sigs_nextstrains import generate_sigs_nextstrains


@click.group(context_settings=CONTEXT_SETTINGS)
Expand All @@ -26,3 +27,4 @@ def cli():
# Register the imported commands on the top-level `cli` group.
cli.add_command(cooc_tabmut)
cli.add_command(phe2cojac)
cli.add_command(sig_generate)
cli.add_command(generate_sigs_nextstrains)
90 changes: 0 additions & 90 deletions generate_sigs_nextstrains.py

This file was deleted.

0 comments on commit 30916ca

Please sign in to comment.