forked from leanderme/sytora
-
Notifications
You must be signed in to change notification settings - Fork 0
/
relatedSymptoms.py
192 lines (147 loc) · 7.65 KB
/
relatedSymptoms.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
import pandas as pd
import numpy as np
import re
# read in the 50-dimensional GloVe vectors
def read_glove_vecs(file):
with open(file, 'r') as f:
words = set()
word_to_vec_map = {}
for line in f:
line = line.strip().split()
word = line[0]
words.add(word)
word_to_vec_map[word] = np.array(line[1:], dtype=np.float64)
return words, word_to_vec_map
words, word_to_vec_map = read_glove_vecs('data/cui2vec_pretrained.txt')
df = pd.read_csv('disease-symptom-merged.csv')
# reorganises the dataframe by grouping the data by symptoms instead of by diseases
frame = pd.DataFrame(df.groupby(['Symptom', 'Disease']).size()).drop(0, axis = 1)
# the first entry contains only the disease and no symptom, so it is dropped
frame = frame.iloc[1:]
# set the index of the dataframe as 'Symptom'
frame = frame.reset_index().set_index('Symptom')
# get the counts of each symptom, ie, how many times it occurs in the data set
counts = {}
for i in frame.index:
counts[i] = counts.get(i, 0) + 1
# sort the symptoms by their counts in descending order and save it into a dataframe
import operator
sym, ct = zip(*sorted(counts.items(), key = operator.itemgetter(1), reverse = True))
sym_count = pd.DataFrame()
sym_count['Symptom'] = sym
sym_count['Count'] = ct
#sym_count.to_csv('Symptom Counts.csv')
# drop the symptoms that have fewer than 6 entries in the data set
[frame.drop(i, inplace = True) for i in frame.index if counts[i] < 2]
# extract all the diseases present in the data set and make them into a list, for use later on
diseaseLst = []
frame.Disease.map(lambda x: diseaseLst.append(x))
lst = diseaseLst
# For us to train our own word embeddings on top of the existing GloVe representation, we are going to use the skipgram model.
# Each symptom has a disease associated with it, and we use this as the (target word, context word) pair for skipgram generation.
# The 'skipgrams' function in Keras samples equal no. of context and non-context words for a given word from the distribution,
# and we are going to do the same here.
# First we'll make a list that stores the pair and its corresponding label of 1, if the disease is indeed associated with
# the symptom, and 0 otherwise.
couples_and_labels = []
import random
# run through the symptoms
for i in frame.index.unique():
# make a temporary list of the diseases associated with the symptom (actual context words)
a = list(frame.Disease.loc[i].values)
# loop through the context words
for j in a:
# randomly select a disease that isn't associated with the symptom, to set as a non-context word with label 0,
# by using the XOR operator, that finds the uncommon elements in the 2 sets
non_context = random.choice(list(set(lst) ^ set(a)))
# add labels of 1 and 0 to context and non-context words repectively
couples_and_labels.append((i, j, 1))
couples_and_labels.append((i, non_context, 0))
# the entries in the couples_and_labels list now follow the pattern of 1, 0, 1, 0 for the labels. We shuffle it up.
b = random.sample(couples_and_labels, len(couples_and_labels))
# Extract the symptoms, the diseases and the corresponding labels
symptom, disease, label = zip(*b)
np.save('./data/all-files-for-ml/symptom.npy', symptom)
# Transform them into series' to get unique entries in each ('set()' not used as it generates a different order each time
# and the index(number) associated with a word changes each time the program is run)
s1 = pd.Series(list(symptom))
s2 = pd.Series(list(disease))
dic = {}
# Map each word in the symptoms and diseases to a corresponding number that can be fed into Keras
for i,j in enumerate(s1.append(s2).unique()):
dic[j] = i
# Now all the symptoms are represented by a number in the arrays 'symptoms', and 'diseases'
symptoms = np.array(s1.map(dic), dtype = 'int32')
diseases = np.array(s2.map(dic), dtype = 'int32')
# Make the labels too into an array
labels = np.array(label, dtype = 'int32')
dictDf = pd.DataFrame.from_dict(dic, orient='index')
dictDf.reset_index(drop=False, inplace=True)
dictDf.columns = ['Key', 'Values']
dictDf.to_csv('./data/all-files-for-ml/Dictionary.csv', index=True, header=True)
d = pd.read_csv('./data/all-files-for-ml/Dictionary.csv')
lst = []
# size of the vocabulary ,ie, no. of unique words in corpus
vocab_size = len(dic)
# dimension of word embeddings
vector_dim = 500
# create an array of zeros of shape (vocab_size, vector_dim) to store the new embedding matrix (word vector representations)
embedding_matrix = np.zeros((len(dic), 500))
# loop through the dictionary of words and corresponding indexes
for word, index in dic.items():
# split each symptom/disease into a list of constituent words
for i in word.split():
if i in word_to_vec_map.keys(): #only if in glove!
lst.append(word_to_vec_map[i]) # add the embeddings of each word in symptoms and diseases to list 'lst'
# make an array out of the list
arr = np.array(lst)
# sum the embeddings of all words in the sentence, to get an embedding of the entire sentence
# if in the entire sentence, word embeddings weren't available in GloVe vectors, make that sentence into a
# zero array of shape (50,), just as a precaution, as we have already removed such words
arrsum = arr.sum(axis = 0)
# normalize the values
arrsum = arrsum/np.sqrt((arrsum**2).sum())
# add the embedding to the corresponding word index
embedding_matrix[index,:] = arrsum
#import necessary keras modules
from keras.preprocessing import sequence
from keras.layers import Dot, Reshape, Dense, Input, Embedding
from keras.models import Model
# START BUILDING THE KERAS MODEL FOR TRAINING
input_target = Input((1,))
input_context = Input((1,))
# make a Keras embedding layer of shape (vocab_size, vector_dim) and set 'trainable' argument to 'True'
embedding = Embedding(
input_dim = vocab_size,
output_dim = vector_dim,
input_length = 1,
name='embedding',
trainable = True)
# load pre-trained weights(embeddings) from 'embedding_matrix' into the Keras embedding layer
embedding.build((None,))
embedding.set_weights([embedding_matrix])
# run the context and target words through the embedding layer
context = embedding(input_context)
context = Reshape((vector_dim, 1))(context)
target = embedding(input_target)
target = Reshape((vector_dim, 1))(target)
# compute the dot product of the context and target words, to find the similarity
# (dot product is usually a measure of similarity)
dot = Dot(axes = 1)([context, target])
dot = Reshape((1,))(dot)
# pass it through a 'sigmoid' activation neuron;
#this is then comapared with the value in 'label' generated from the skipgram
out = Dense(1, activation = 'sigmoid')(dot)
# create model instance
model = Model(input = [input_context, input_target], output = out)
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics=['accuracy'])
# fit the model, default batch_size of 32
# running for 25 epochs seems to generate good enough results, although running for more iterations
#may improve performance further
model.fit(x = [symptoms, diseases], y = labels, epochs = 6)
# get the new weights (embeddings) after running through keras
new_vecs = model.layers[2].get_weights()[0]
# Each time the model is run, it generates a different loss at the end, and consequently, different word embeddings after each run.
# It is common to save the trained weights once they are seen to be performing well
# I have saved the weights after 25 epochs and an end loss of 0.232, the screenshot of which I've attached, and can be loaded like this:
np.save('./data/all-files-for-ml/10_epochs_0.6_similarity.npy', new_vecs)