-
Notifications
You must be signed in to change notification settings - Fork 2
/
vec_builder.py
90 lines (59 loc) · 2.58 KB
/
vec_builder.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import contextlib
import numpy
from bert_serving.client import BertClient
import data_loading
# Shared bert-as-service client. It is constructed at module import time, so
# importing this file creates the client for the address below.
# Alternative (remote) address of the GPU machine, kept for reference:
# bc = BertClient(ip='86.17.97.132')
bc = BertClient(ip='192.168.0.34') # local IP address of the GPU machine
# Ontology source paths. Not referenced in the visible part of this file;
# presumably consumed by data_loading -- TODO confirm.
hpo_terms_file = 'ontologies/hp.obo'
snomed_terms_file = 'ontologies/snomed_terms.tab'
# Width of each row preallocated by build_vec_file. Assumed to match the output
# size of the model served behind `bc` -- TODO confirm against the server.
embedding_dims = 1024
def build_vec_file(terms_list, save_path):
    """Encode term descriptions with the BERT client and save them as text.

    Args:
        terms_list: Sequence of ``[term_id, description]`` pairs. A
            description is either a single string or a list of strings; a
            list is encoded as one batch and averaged (axis 0) into a single
            vector for the term.
        save_path: Destination file. It is overwritten with one line per
            term: the term id followed by the space-separated vector
            components.

    Progress messages are printed to stdout every 100 terms.
    """
    descriptions = [term[1] for term in terms_list]
    print(descriptions)
    vectors = numpy.zeros(shape=(len(terms_list), embedding_dims), dtype=float)
    for index, description in enumerate(descriptions):
        if index % 100 == 0:
            print(f'{index} terms encoded')
        if isinstance(description, list):
            vectors[index] = numpy.mean(bc.encode(description), axis=0)
        else:
            # BertClient.encode only accepts a list of strings, so a lone
            # description is sent as a one-element batch and its single row
            # is taken from the result.
            vectors[index] = bc.encode([description])[0]
    print('got vectors')
    # Write-only access is enough; the encoding is pinned so the output does
    # not depend on the platform's default locale.
    with open(save_path, 'w', encoding='utf-8') as file:
        for term, vector in zip(terms_list, vectors):
            components = vector.astype(dtype=str)
            file.write(str(term[0]) + ' ' + ' '.join(components) + '\n')
def build_hpo_vectors(rich=False):
    """Build the vector file 'hpo.vec' for the HPO terms.

    Args:
        rich: When true, the term list comes straight from
            ``data_loading.load_rich_hpo_terms()``. Otherwise each loaded HPO
            term contributes an ``[id, name]`` pair.
    """
    if not rich:
        terms_by_id = data_loading.load_hpo_terms()
        rows = [[term_id, term.name] for term_id, term in terms_by_id.items()]
    else:
        rows = data_loading.load_rich_hpo_terms()
    build_vec_file(terms_list=rows, save_path='hpo.vec')
def build_snomed_vectors(rich=False):
    """Build the vector file 'snomed.vec' for SNOMED terms referenced by HPO.

    The SNOMED ids are taken from the snomed -> hpo map derived from the HPO
    terms. Each id is looked up (as an ``int``) in the loaded SNOMED terms;
    ids that are missing there are skipped.

    Args:
        rich: When true, the filtered terms are passed through
            ``data_loading.load_rich_snomed_terms`` and the result is printed.
            Otherwise each filtered term contributes an ``[id, name]`` pair.
    """
    hpo_lookup = data_loading.load_hpo_terms()
    print('hpo terms loaded')
    snomed_lookup = data_loading.load_snomed_terms()
    print('snomed terms loaded')
    snomed_to_hpo = data_loading.build_snomed_hpo_map(
        data_loading.build_hpo_snomed_map(hpo_lookup)
    )
    print('built snomed -> hpo terms map')

    referenced = {}
    for snomed_id, _mapped in snomed_to_hpo.items():
        try:
            referenced[snomed_id] = snomed_lookup[int(snomed_id)]
        except KeyError:
            # Id is not present in the loaded SNOMED terms; leave it out.
            continue
    print('extracted subset of snomed names for terms referenced in hpo')

    if not rich:
        rows = [[term_id, term_name] for term_id, term_name in referenced.items()]
    else:
        rows = data_loading.load_rich_snomed_terms(referenced)
        print(rows)
    build_vec_file(terms_list=rows, save_path='snomed.vec')
def main():
    """Build the rich SNOMED vector file, then the rich HPO vector file."""
    for build in (build_snomed_vectors, build_hpo_vectors):
        build(rich=True)
# Run the build only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()