Skip to content

Commit

Permalink
generate voc signature from nextstrain clades
Browse files Browse the repository at this point in the history
  • Loading branch information
dr-david authored and DrYak committed May 30, 2024
1 parent 84cfa22 commit 3f63e2a
Showing 1 changed file with 90 additions and 0 deletions.
90 changes: 90 additions & 0 deletions generate_sigs_nextstrains.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
import os
import json
import requests
from datetime import datetime
import yaml
import argparse

def load_dataset():
url = 'https://raw.githubusercontent.com/corneliusroemer/pango-sequences/main/data/pango-consensus-sequences_summary.json'
response = requests.get(url)
full_data = json.loads(response.content)

return full_data

def prepare_header(data):
return {
'variant': {
'nextstrain': data['nextstrainClade'],
'pangolin': data['lineage'],
'short': data['lineage'].lower().replace('.', '_'),
'reference': {
'address': 'https://raw.githubusercontent.com/corneliusroemer/pango-sequences/main/data/pango-consensus-sequences_summary.json',
'accessed_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
}
}
}

def prepare_mut(data):
mut_dict = {}
if data['nucSubstitutions'] != ['']:
for item in data['nucSubstitutions']:
start_letter = item[0]
end_letter = item[-1]
number = int(item[1:-1])
mut_dict[number] = f"{start_letter}>{end_letter}"
return {'mut': mut_dict}
else:
return ''

def prepare_del(data):
del_dict = {}
if data['nucDeletions'] != ['']:
for item in data['nucDeletions']:
if '-' in item:
start, end = map(int, item.split('-'))
key = start
value = str((end - start) * '-')
else:
key = int(item)
value = '-'
del_dict[key] = value
return {'del': del_dict}
else:
return ''

def prepare_yaml(data):
header = prepare_header(data)
mut = prepare_mut(data)
del_section = prepare_del(data)

output_data = header

if mut != '':
output_data['mut'] = mut['mut']

if del_section != '':
output_data['del'] = del_section['del']

return output_data

def process_dataset(full_data, outdir):
if not os.path.exists(outdir):
os.makedirs(outdir)
for data in full_data.values():
output_data = prepare_yaml(data)
with open(f"{outdir}/{output_data['variant']['short']}.yaml", "w") as yaml_file:
yaml.dump(output_data, yaml_file, sort_keys=False)

def main():
parser = argparse.ArgumentParser()
parser.add_argument('--outdir', default='voc_nextstrain', help='The output directory for the YAML files')
args = parser.parse_args()

outdir = args.outdir

full_data = load_dataset()
process_dataset(full_data, outdir)

if __name__ == '__main__':
main()

0 comments on commit 3f63e2a

Please sign in to comment.