# mrmr.py
from __future__ import division

import copy
import math

from numpy import array, shape, where, in1d

from classification import classifyData


# Compute the absolute value of the Pearson correlation coefficient
# between two equal-length sequences.
def pearson_coefficient(n1, x, y):
    sum_x = sum_y = sum_sq_dif_x = sum_sq_dif_y = upper = 0.0
    for i in range(n1):
        sum_x += x[i]
        sum_y += y[i]
    mean_x = sum_x / n1
    mean_y = sum_y / n1
    for i in range(n1):
        sum_sq_dif_x += (x[i] - mean_x) ** 2
        sum_sq_dif_y += (y[i] - mean_y) ** 2
        upper += (x[i] - mean_x) * (y[i] - mean_y)
    lower = math.sqrt(sum_sq_dif_x * sum_sq_dif_y)
    if lower == 0:
        # A constant input has zero variance; treat it as uncorrelated
        corr_coef = 0.0
    else:
        corr_coef = upper / lower
    return abs(corr_coef)
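
# Quick hypothetical check (in this file's commented-test style): perfectly
# correlated and anti-correlated inputs both give |r| = 1.
#   pearson_coefficient(3, [1, 2, 3], [2, 4, 6])  # -> 1.0
#   pearson_coefficient(3, [1, 2, 3], [6, 4, 2])  # -> 1.0
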
def mrmrFTestPearson(data, reqFeatures, app=None):
    # h = classification variable
    # gi = gene (feature) variable
    # klist = list of available classes (k); ksize = size of the kth class
    #         (the number of rows labelled k, not nk in the F-test sense)
    # n = sum of all nk; ksum = sum of all gi values belonging to class k
    # kvar = variance of the kth class; var = overall (pooled) variance
    # gbar = mean of each gi over all samples; gkbar = per-class mean of each gi
    # ftestval = value of the F-test function for each gi
    # mrmrCProgress = reports the progress of this task to the animation app
    mrmrCProgress = 0.0
    if app is not None:
        app.updateProgress('mRMR with FCD started')
    klist = []; ksize = {}; ksum = {}; kvar = {}
    gbar = []; gkbar = []; ftestval = {}
    # The first row of the data list holds meta-data, not actual data:
    # row = number of samples, col = number of features
    row = int(data[0][0])
    col = int(data[0][1])
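    # Hypothetical example of the expected layout (not from the original
    # source): data[0] = [n_samples, n_features(, n_classes)], and every later
    # row holds the feature values followed by a numeric class label, e.g.
    #   data = [[3, 2], [0.5, 1.2, 0], [0.4, 1.0, 0], [2.5, 3.1, 1]]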
    for si in data[1:]:
        # si[-1] is the class label: if it is new, add it to klist and start
        # its count in ksize; otherwise just bump the count
        if si[-1] not in klist:
            klist.append(si[-1])
            ksize[si[-1]] = 1
        else:
            if si[-1] in ksize:
                ksize[si[-1]] += 1
            else:
                raise SystemExit('Error: class in klist not found in dict(ksize)')
        # ksum stores the sum of all feature values of each class
        if si[-1] not in ksum:
            ksum[si[-1]] = 0
        for j in si[:-1]:
            ksum[si[-1]] += float(j)
        mrmrCProgress += 2.0
        if app is not None:
            app.updateProgress('%.1f %%' % mrmrCProgress)
    # find kvar, the variance of each class
    for ki in klist:
        # tmean is the class mean; tsum accumulates the squared differences
        tmean = ksum[ki] / (ksize[ki] * col)
        tsum = 0
        # Walk the rows and pick out the ones belonging to class ki
        for di in data[1:]:
            if di[-1] == ki:
                for num in di[:-1]:
                    tsum += (tmean - float(num)) ** 2
        if ki not in kvar:
            kvar[ki] = tsum / (ksize[ki] * col)
            # kvar stores sigma^2 directly, so no further squaring is needed
        else:
            print('some error occurred in calculating var(k)..')
        mrmrCProgress += 2.0
        if app is not None:
            app.updateProgress('%.1f %%' % mrmrCProgress)
    # find the pooled variance = var (over all classes)
    var = 0
    for i in klist:
        var += (ksize[i] - 1) * kvar[i]
    var /= row - len(klist)
    mrmrCProgress += 1.0
    if app is not None:
        app.updateProgress('%.1f %%' % mrmrCProgress)
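    # The pooled-variance step above computes
    #     var = sum_k (n_k - 1) * kvar[k] / (n - K)
    # with n = total number of samples and K = number of classes.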
    # With the pooled variance in hand, compute the F-test values of all the
    # gene variables (gi); first build gbar and gkbar
    for gi in range(col):
        gbar.append(0)
        for si in range(1, row + 1):
            gbar[gi] += float(data[si][gi])
            if len(gkbar) <= gi:
                # No entry has been made in gkbar for this value of gi yet;
                # append one so the list index stays valid, otherwise indexing
                # would raise an exception
                gkbar.append({data[si][-1]: float(data[si][gi])})
            else:
                # An entry exists: if this row's class is already a key, simply
                # add the value, else initialise the class key with this value
                if data[si][-1] in gkbar[gi]:
                    gkbar[gi][data[si][-1]] += float(data[si][gi])
                else:
                    gkbar[gi][data[si][-1]] = float(data[si][gi])
        # Divide the sums by the number of samples (rows) to get the means
        gbar[gi] = gbar[gi] / row
        for ki in klist:
            if ki in gkbar[gi]:
                # sum of gi within the class, divided by the class size
                gkbar[gi][ki] = gkbar[gi][ki] / ksize[ki]
        mrmrCProgress += 20.0 / col
        if app is not None:
            app.updateProgress('%.1f %%' % mrmrCProgress)
    # F-test function value of each feature
    for gi in range(col):
        tsum = 0.0
        for ki in klist:
            # NOTE: the classical between-class mean square divides by (K - 1);
            # this code divides by (row - 1), which rescales every score by the
            # same constant, so the original behaviour is preserved here
            tsum += ksize[ki] * (gkbar[gi][ki] - gbar[gi]) ** 2 / (row - 1)
        ftestval[gi] = tsum / var
        mrmrCProgress += 5.0 / col
        if app is not None:
            app.updateProgress('%.1f %%' % mrmrCProgress)
    # maxftestval: the feature with the maximum F-test value; it seeds the
    # algorithm as the first selected feature.
    # selectedfeatures: the growing set of reduced features.
    # nonselectedfeatures: the features not selected yet.
    # sigmacorval: running sum of |corr(gi, gj)| of each candidate gi against
    # every already-selected gj, updated incrementally.
    maxftestval = max(ftestval, key=lambda k: ftestval[k])
    selectedfeatures = [maxftestval]
    nonselectedfeatures = list(set(ftestval.keys()) - {maxftestval})
    # Transpose the data matrix so it is aligned feature-wise, not sample-wise
    transdata = [list(float(n) for n in x) for x in zip(*data[1:])]
    sigmacorval = {}
    reduceDataSet = [[0, 0, 0]]
    featureCount = 1
    # These variables drive the optimal (classifier-guided) selection
    oldClassAcc = {'knn': 0.0, 'nb': 0.0}
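    # Greedy FCD step (F-test Correlation Difference): each round picks the
    # candidate g maximising  F(g) - (1/|S|) * sum_{s in S} |corr(g, s)|,
    # where S is the set of already-selected features.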
    while True:
        funcval = -10.0; gi = -2
        prevFeature = selectedfeatures[-1]
        for giNs in nonselectedfeatures:
            if giNs in sigmacorval:
                sigmacorval[giNs] += pearson_coefficient(row, transdata[giNs], transdata[prevFeature])
            else:
                sigmacorval[giNs] = pearson_coefficient(row, transdata[giNs], transdata[prevFeature])
            if (ftestval[giNs] - (sigmacorval[giNs] / len(selectedfeatures))) > funcval:
                funcval = ftestval[giNs] - (sigmacorval[giNs] / len(selectedfeatures))
                gi = giNs
        if gi != -2:
            selectedfeatures.append(gi)
            featureCount += 1
            nonselectedfeatures.remove(gi)
            del sigmacorval[gi]
            # Save the current data set before building the next reduced one
            tempRedData = copy.deepcopy(reduceDataSet)
            reduceDataSet[0] = [row, featureCount, len(klist)]
            if len(reduceDataSet) == 1:
                for si in range(row):
                    # Insert the first feature (the one with the max F-test)
                    reduceDataSet.append([transdata[selectedfeatures[0]][si], int(transdata[-1][si])])
            for si in range(1, row + 1):
                reduceDataSet[si].insert(-1, float(transdata[selectedfeatures[-1]][si - 1]))
            if reqFeatures == 0:
                # Classifier-guided stopping: keep the new feature only if it
                # improves kNN or naive-Bayes accuracy, or both stay above 90%
                classAcc = classifyData(reduceDataSet, 0.7, None, True)
                if classAcc['knn'] > oldClassAcc['knn'] or classAcc['nb'] > oldClassAcc['nb']:
                    oldClassAcc['knn'] = classAcc['knn']
                    oldClassAcc['nb'] = classAcc['nb']
                elif classAcc['knn'] > 90.0 and classAcc['nb'] > 90.0:
                    oldClassAcc['knn'] = classAcc['knn']
                    oldClassAcc['nb'] = classAcc['nb']
                else:
                    # The latest feature helped neither classifier: reject it
                    featureCount -= 1
                    reduceDataSet = tempRedData
            if len(selectedfeatures) == col:
                break
            if app is not None:
                if reqFeatures > 0:
                    mrmrCProgress += 70.0 / (reqFeatures - 1)
                    app.updateProgress('%.1f %%' % mrmrCProgress)
                else:
                    app.updateProgress('%d Features added' % featureCount)
        if reqFeatures != 0 and featureCount == reqFeatures:
            break
    # if reqFeatures == 1:  # This helped in optimal data reduction
    #     return selectedfeatures
    # if reqFeatures > 0:
    #     for si in selectedfeatures:
    #         reduceDataSet.append(transdata[si])
    #     reduceDataSet = [list(float(n) for n in x) for x in zip(*reduceDataSet)]
    #     for eachrow in range(len(reduceDataSet)):
    #         reduceDataSet[eachrow].append(int(transdata[-1][eachrow]))
    #     reduceDataSet.insert(0, [row, reqFeatures, len(klist)])
    if mrmrCProgress < 100.0:
        mrmrCProgress = 100.0
    if app is not None:
        app.updateProgress('%.1f %%' % mrmrCProgress)
    return reduceDataSet
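
# Commented usage sketch (hypothetical shapes, in the file's own
# commented-test style); 'loadSomehow' is a placeholder, not a real helper:
#   rawData = loadSomehow()                      # rawData[0] = [n_samples, n_features]
#   reduced = mrmrFTestPearson(rawData, 10)      # select exactly 10 features
#   autoReduced = mrmrFTestPearson(rawData, 0)   # classifier-guided feature count
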
# Calculate and return the mutual information between two random variables.
# Both arguments are expected to be numpy arrays of discrete values.
def mutual_information(x_arr, y_arr, log_base=2):
    if len(x_arr) != len(y_arr):
        raise ValueError('Data for MI cannot have irregular dimensions')
    # Value of MI to return
    mi_value = 0.0
    # Unique values of the two random variables
    values_x = set(x_arr)
    values_y = set(y_arr)
    # Accumulate p(x,y) * log(p(x,y) / (p(x) * p(y))) over the joint support
    for value_x in values_x:
        for value_y in values_y:
            px = shape(where(x_arr == value_x))[1] / len(x_arr)
            py = shape(where(y_arr == value_y))[1] / len(y_arr)
            pxy = len(where(in1d(where(x_arr == value_x)[0],
                                 where(y_arr == value_y)[0]))[0]) / len(y_arr)
            if pxy > 0.0:
                mi_value += pxy * math.log(pxy / (px * py), log_base)
    return mi_value
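
# Hypothetical sanity check (not in the original source): two identical,
# balanced binary variables share exactly one bit of information.
#   from numpy import array
#   mutual_information(array([0, 0, 1, 1]), array([0, 0, 1, 1]))  # -> 1.0
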
def mrmrMutualInformation(data, reqFeatures=-1, app=None):
    mrmrDProgress = 0.0
    if app is not None:
        app.updateProgress('mRMR with MID started')
    dataSet = []  # local copy of the data
    for eachrow in data[1:]:
        dataSet.append([int(elem) for elem in eachrow])
    row = int(data[0][0])
    dataSet = [list(x) for x in zip(*dataSet)]  # transpose the data
    dataSet = array(dataSet)  # take it to vector (numpy) form
    mrmrDProgress += 5.0
    if app is not None:
        app.updateProgress('%.1f %%' % mrmrDProgress)
    miScoreOfeachG = {}
    # Mutual-information score of each attribute with the class
    for gi in range(len(data[1]) - 1):
        miScoreOfeachG[gi] = mutual_information(dataSet[gi], dataSet[-1], 2)
        mrmrDProgress += 35.0 / (len(data[1]) - 1)
        if app is not None:
            app.updateProgress('%.1f %%' % mrmrDProgress)
    maxMIScore = max(miScoreOfeachG, key=lambda k: miScoreOfeachG[k])
    selectedfeatures = [maxMIScore]
    nonselectedfeatures = list(set(miScoreOfeachG.keys()) - {maxMIScore})
    transdata = [list(float(n) for n in x) for x in zip(*data[1:])]
    # sigmaMIval: running sum of I(gi; gj) of each candidate gi against every
    # already-selected gj, updated incrementally
    sigmaMIval = {}
    reduceDataSet = [[0, 0, 0]]
    featureCount = 1
    # These variables drive the optimal (classifier-guided) selection
    oldClassAcc = {'knn': 0.0, 'nb': 0.0}
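    # Greedy MID step (Mutual Information Difference): each round picks the
    # candidate g maximising  I(g; class) - (1/|S|) * sum_{s in S} I(g; s),
    # where S is the set of already-selected features.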
    while True:
        funcval = -10.0; gi = -2
        prevFeature = selectedfeatures[-1]
        for giNs in nonselectedfeatures:
            if giNs in sigmaMIval:
                sigmaMIval[giNs] += mutual_information(dataSet[giNs], dataSet[prevFeature])
            else:
                sigmaMIval[giNs] = mutual_information(dataSet[giNs], dataSet[prevFeature])
            if (miScoreOfeachG[giNs] - (sigmaMIval[giNs] / len(selectedfeatures))) > funcval:
                funcval = miScoreOfeachG[giNs] - (sigmaMIval[giNs] / len(selectedfeatures))
                gi = giNs
        if gi != -2:
            selectedfeatures.append(gi)
            featureCount += 1
            nonselectedfeatures.remove(gi)
            del sigmaMIval[gi]
            # Save the current data set before building the next reduced one
            tempRedData = copy.deepcopy(reduceDataSet)
            reduceDataSet[0] = [row, featureCount, int(data[0][2])]
            if len(reduceDataSet) == 1:
                for si in range(row):
                    # Insert the first feature (the one with the max MI score)
                    reduceDataSet.append([transdata[selectedfeatures[0]][si], int(transdata[-1][si])])
            for si in range(1, row + 1):
                reduceDataSet[si].insert(-1, float(transdata[selectedfeatures[-1]][si - 1]))
            if reqFeatures == 0:
                # Classifier-guided stopping, exactly as in mrmrFTestPearson
                classAcc = classifyData(reduceDataSet, 0.7, None, True)
                if classAcc['knn'] > oldClassAcc['knn'] or classAcc['nb'] > oldClassAcc['nb']:
                    oldClassAcc['knn'] = classAcc['knn']
                    oldClassAcc['nb'] = classAcc['nb']
                elif classAcc['knn'] > 90.0 and classAcc['nb'] > 90.0:
                    oldClassAcc['knn'] = classAcc['knn']
                    oldClassAcc['nb'] = classAcc['nb']
                else:
                    # The latest feature helped neither classifier: reject it
                    featureCount -= 1
                    reduceDataSet = tempRedData
            if len(selectedfeatures) == int(data[0][1]):
                break
            if app is not None:
                if reqFeatures > 0:
                    mrmrDProgress += 60.0 / (reqFeatures - 1)
                    app.updateProgress('%.1f %%' % mrmrDProgress)
                else:
                    app.updateProgress('%d Features Added' % featureCount)
        if reqFeatures != 0 and featureCount == reqFeatures:
            break
    # reduceDataSet = []
    # for si in selectedfeatures:
    #     reduceDataSet.append(dataSet[si])
    # reduceDataSet.append(dataSet[-1])
    # reduceDataSet = [list(n for n in x) for x in zip(*reduceDataSet)]
    # reduceDataSet.insert(0, [int(data[0][0]), reqFeatures, int(data[0][2])])
    if mrmrDProgress < 100.0:
        mrmrDProgress = 100.0
    if app is not None:
        app.updateProgress('%.1f %%' % mrmrDProgress)
    return reduceDataSet
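
# Commented usage sketch (hypothetical data, mirroring mrmrFTestPearson);
# mrmrMutualInformation expects integer-coded (discretised) features:
#   reduced = mrmrMutualInformation(discreteData, 10)     # exactly 10 features
#   autoReduced = mrmrMutualInformation(discreteData, 0)  # classifier-guided
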
if __name__ == '__main__':
    # Small smoke test, adapted from the commented-out sample data that
    # shipped with this file; prints the MI of two discrete variables
    myList = [[0, 0, 2, 2, 4, 4, 0, 0, 4, 2, 0, 4],
              [7, 8, 8, 8, 7, 7, 0, 8, 0, 0, 8, 7]]
    data = array(myList)
    print(mutual_information(data[0], data[-1], 2))