-
Notifications
You must be signed in to change notification settings - Fork 2
/
data_loading.py
124 lines (82 loc) · 3.29 KB
/
data_loading.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import contextlib
import re
import pandas
import pronto
hpo_terms_file = 'ontologies/hp.obo'
snomed_terms_file = 'ontologies/snomed_terms.tab'
snomed_synonyms_file = 'ontologies/snomed_synonyms.tab'
# re to extract snomed IDs from obo serialisations of HPO terms
snomed_id_finder = r'SNOMEDCT_US:(\d*)(?:\n|$)'
def load_hpo_terms():
hpo = pronto.Ontology(hpo_terms_file)
max_hpo_id = 3000079
terms = {}
for hpo_id in range(0, max_hpo_id + 1):
with contextlib.suppress(KeyError):
terms[f"{hpo_id:07d}"] = hpo[f"HP:{hpo_id:07d}"]
return terms
def load_rich_hpo_terms():
hpo_terms = load_hpo_terms()
hpo_terms_list = []
for hpo_id, hpo_term in hpo_terms.items():
descriptions = [hpo_term.name]
# add def field if present in term
if hpo_term.desc:
descriptions.append(str(hpo_term.desc))
# add all listed synonyms
for synonym in hpo_term.synonyms:
descriptions.append(synonym.desc)
hpo_terms_list.append([hpo_id, descriptions])
return hpo_terms_list
def load_snomed_terms():
snomed_terms_df = pandas.read_csv(snomed_terms_file, sep='\t')
snomed_terms = {}
for row in snomed_terms_df.itertuples():
snomed_terms[int(getattr(row, 'db_id'))] = getattr(row, 'name')
return snomed_terms
def load_rich_snomed_terms(snomed_terms):
snomed_synonyms = pandas.read_csv(snomed_synonyms_file, sep='\t')
snomed_terms_list = []
for db_id, name in snomed_terms.items():
descriptions = [name]
# print(db_id)
for synonym_row in snomed_synonyms.loc[snomed_synonyms['db_to_id'] == int(db_id)].itertuples():
# print(synonym_row)
# print(db_id)
# print(getattr(synonym_row, 'name'))
descriptions.append(getattr(synonym_row, 'name'))
snomed_terms_list.append([db_id, descriptions])
return snomed_terms_list
def build_hpo_snomed_map(hpo_terms):
hpo_to_snomed_matches = {}
for hpo_id, term in hpo_terms.items():
serialised_term = term.obo
# search for SNOMED xrefs
match = re.search(snomed_id_finder, serialised_term)
while match:
if hpo_id in hpo_to_snomed_matches:
hpo_to_snomed_matches[hpo_id].append(match.group(1))
else:
hpo_to_snomed_matches[hpo_id] = [match.group(1)]
# slicing out match
serialised_term = serialised_term[:match.span()[0]] + serialised_term[match.span()[1]:]
# looking for new match
match = re.search(snomed_id_finder, serialised_term)
return hpo_to_snomed_matches
def build_snomed_hpo_map(hpo_snomed_map):
snomed_hpo_map = {}
for hpo_id, snomed_ids in hpo_snomed_map.items():
for snomed_id in snomed_ids:
snomed_hpo_map[snomed_id] = hpo_id
return snomed_hpo_map
def main():
snomed_synonyms = pandas.read_csv(snomed_synonyms_file, sep='\t')
# print(snomed_synonyms)
db_id = 82525005
# print(snomed_synonyms.loc[snomed_synonyms['db_to_id'] == db_id])
for synonym_row in snomed_synonyms.loc[snomed_synonyms['db_to_id'] == db_id].itertuples():
print(synonym_row)
# print(db_id)
# descriptions.append(synonym_row['name'])
if __name__ == '__main__':
main()