-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_entropy.py
112 lines (91 loc) · 3.99 KB
/
data_entropy.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
# Author: Arjun S Kulathuvayal. Intellectual property. Copyright strictly restricted
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from math import log2
from numpy import linalg,mean,dot
# Module-wide matplotlib styling: serif font, normal weight, size 14; default
# figure size [8, 6]; text rendering delegated to LaTeX (text.usetex).
font = dict(family='serif', weight='normal', size=14)
plt.rc('font', **font)
plt.rcParams['text.usetex'] = True
plt.rcParams["figure.figsize"] = [8, 6]
def entropy(X, classes):
    """Return the two-class Shannon entropy (in bits) of the labels in ``X``.

    Args:
        X: 2-D array whose column 1 holds the class label of each row.
        classes: Sequence of labels. Only ``classes[0]`` and ``classes[1]``
            are counted; rows carrying any other label are ignored.

    Returns:
        ``0`` when ``X`` has at most one row or when one of the two classes
        does not occur; otherwise ``-(p1*log2(p1) + p2*log2(p2))`` where
        ``p1`` is the share of ``classes[0]`` among the counted rows.
    """
    if len(X) <= 1:
        return 0

    labels = X[:, 1]
    first_count = int(np.count_nonzero(labels == classes[0]))
    second_count = int(np.count_nonzero(labels == classes[1]))

    # A subset containing only one of the two classes is pure.
    if not (first_count and second_count):
        return 0

    p_first = first_count / (first_count + second_count)
    p_second = 1 - p_first
    return -(p_first * log2(p_first) + p_second * log2(p_second))
def supervised_discretization(D, classes, feature_entropy_threshold, feature_no_bin_threshold):
    """Recursively cut one feature into bins chosen to lower class entropy.

    Each subset is split at the candidate threshold with the lowest weighted
    entropy of its two halves, and both halves are then processed the same
    way. Candidates are the midpoints of every pair of adjacent rows, so the
    rows of ``D`` are expected to be sorted ascending by column 0 (the caller
    in this file sorts before calling).

    The final weighted entropy and the resulting ranges are printed and,
    additionally, returned so callers can use them programmatically.

    Args:
        D: 2-D array; column 0 is the feature value, column 1 the class label.
        classes: Label sequence forwarded unchanged to ``entropy``.
        feature_entropy_threshold: A subset whose entropy is at or below this
            value is not split further.
        feature_no_bin_threshold: A subset with at most this many rows is not
            split further.

    Returns:
        Tuple ``(final_entropy, boundaries)``. ``boundaries`` is the sorted
        list of floats made of the minimum feature value, every accepted cut
        point and ``max + 1``; consecutive pairs are the half-open bins
        ``[start, end)``. ``final_entropy`` is the sum over bins of
        ``(rows in bin / rows in D) * entropy(bin)``.

    Raises:
        ValueError: If ``D`` has no rows.
    """
    if len(D) == 0:
        # Same exception type min() would raise further down, but explicit.
        raise ValueError("supervised_discretization() requires at least one row")

    cut_points = []

    def _split(subset):
        """Pick the best cut for ``subset``, record it, recurse on both halves."""
        size = len(subset)
        if size <= feature_no_bin_threshold:
            return
        if entropy(subset, classes) <= feature_entropy_threshold:
            return

        values = subset[:, 0]
        # None / inf instead of the former -1 / 1 sentinels: -1 is a legal
        # feature value, and a weighted entropy can exceed 1 by a rounding
        # error, which previously could reject every candidate.
        best_split = None
        best_et = float("inf")
        for lower, upper in zip(values[:-1], values[1:]):
            # NOTE: equal neighbours give a candidate equal to that value;
            # they are still evaluated, as in the original implementation.
            split = (lower + upper) / 2
            left = subset[values <= split]
            right = subset[values > split]
            et = (len(left) / size) * entropy(left, classes) \
                + (len(right) / size) * entropy(right, classes)
            # "<=" keeps the last of several equally good candidates.
            if et <= best_et:
                best_et = et
                best_split = split

        if best_split is None:
            return
        left = subset[values <= best_split]
        right = subset[values > best_split]
        if len(left) == 0 or len(right) == 0:
            # Degenerate cut (everything on one side): nothing to record.
            return

        cut_points.append(best_split)
        _split(left)
        _split(right)

    _split(D)

    feature_values = D[:, 0]
    # "+ 1" pushes the last boundary past the maximum so that the maximum
    # itself falls inside the final half-open bin.
    boundaries = sorted(
        cut_points + [float(min(feature_values)), float(max(feature_values)) + 1]
    )

    final_E = 0
    range_labels = []
    for start, end in zip(boundaries[:-1], boundaries[1:]):
        range_labels.append(f"[ {start!s}, {end!s})")
        in_bin = D[(feature_values >= start) & (feature_values < end)]
        final_E += (len(in_bin) / len(D)) * entropy(in_bin, classes)
    ranges_in_nice_format = "".join(range_labels)

    print("Final entropy", final_E)
    print("discretized ranges", ranges_in_nice_format)
    return final_E, [float(b) for b in boundaries]
def get_feature_class(feat, data=None):
    """Return the ``feat`` and ``'target'`` columns as an array sorted by ``feat``.

    Args:
        feat: Name of the column to extract and sort by (ascending).
        data: DataFrame to read from. When omitted, the module-level ``df``
            is used, which keeps the original one-argument call working;
            that global is only created in the ``__main__`` section.

    Returns:
        A 2-column numpy array: column 0 is ``feat``, column 1 is ``'target'``,
        with rows ordered by ``feat``.
    """
    frame = df if data is None else data
    return np.array(frame[[feat, 'target']].sort_values(feat))
if __name__ == '__main__':
    # Small hand-made input kept for reference:
    # X = np.array([[10, -1], [15, -1], [18, 1], [19, 1], [24, -1], [29, 1], [30, 1], [31, 1], [40, -1], [44, -1], [55, -1], [64, -1]])

    # NOTE(review): entropy() reads only the first two entries of this list,
    # i.e. labels 0 and 1 - confirm the remaining entries are intentional.
    target_classes = [0, 1, 1, 0, 0, 1]

    df = pd.read_csv("selected_good_fit_features.csv")

    dislocs = ['sigma_110_e', 'sigma_110_s', 'sigma_112_e', 'sigma_112_s', 'sigma_123_e', 'sigma_123_s']
    features = ['Nb', 'Ta', 'Ti', 'Mo', 'Hf', 'Zr', 'eta', 'a0', 'C11', 'C12', 'C44', 'Ac',
                'Ecoh_(eV)', 'gamma_110_usf', 'gamma_112_usf', 'gamma_123_usf',
                'tao_110_iss', 'tao_112_iss', 'tao_123_iss']

    # Replace each known name by its position in the matching list; values
    # that are not in the list are left untouched.
    encodings = (
        ('target', dislocs),
        ('feature_1', features),
        ('feature_2', features),
        ('feature_3', features),
    )
    for column_name, vocabulary in encodings:
        df[column_name] = df[column_name].apply(
            lambda label, vocab=vocabulary: vocab.index(label) if label in vocab else label
        )

    # Every column except the label column is discretized in turn.
    columns = list(df.columns)
    columns.remove('target')
    for feature_name in columns:
        print(f"Discretization for feature: {feature_name}")
        sorted_pairs = np.array(df[[feature_name, 'target']].sort_values(feature_name))
        supervised_discretization(
            sorted_pairs,
            classes=target_classes,
            feature_entropy_threshold=0.3,
            feature_no_bin_threshold=10,
        )
        print('=' * 100)