-
Notifications
You must be signed in to change notification settings - Fork 0
/
svm.py
158 lines (131 loc) · 5.27 KB
/
svm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
from libsvm import read_libsvm
import numpy as np
from collections import Counter
import itertools
from helpers import log_transform
import csv
def predict(X, w, b):
    """Classify a single example with a linear decision rule.

    Args:
        X: Feature vector of one example.
        w: Weight vector, combined with ``X`` through ``np.dot``.
        b: Bias term added to the dot product.

    Returns:
        ``1`` when ``np.dot(w, X) + b`` is non-negative, otherwise ``0``.
    """
    score = np.dot(w, X) + b
    return 1 if score >= 0 else 0
def accuracy(X, y, w, b):
    """Return the fraction of rows of ``X`` that the linear model labels correctly.

    A row is predicted as ``1`` when ``np.dot(row, w) + b >= 0`` and as ``0``
    otherwise (the same rule ``predict`` applies to a single example), and
    the prediction is compared with the matching entry of ``y``.

    Args:
        X: 2-D array with one example per row.
        y: Labels (0 or 1), at least one per row of ``X``; only the first
            ``X.shape[0]`` entries are used.
        w: Weight vector.
        b: Bias term.

    Returns:
        Accuracy as a float in ``[0, 1]``. An empty ``X`` yields ``0.0``
        instead of raising ``ZeroDivisionError``.

    Raises:
        ValueError: If ``y`` holds fewer labels than ``X`` has rows.
    """
    n_samples = X.shape[0]
    if n_samples == 0:
        return 0.0

    labels = np.asarray(y).reshape(-1)
    if labels.shape[0] < n_samples:
        raise ValueError('y has fewer labels than X has rows')

    # One matrix-vector product scores every row at once, replacing a
    # Python-level loop over the examples.
    predicted = np.where(np.dot(X, w) + b >= 0, 1, 0)
    correct = np.count_nonzero(predicted == labels[:n_samples])
    return correct / n_samples
def shuffle_arrays(X, y):
    """Return ``X`` and ``y`` reordered by one shared random permutation.

    The permutation is drawn from NumPy's global random state, so rows of
    ``X`` stay paired with their entries in ``y``. Both inputs must support
    indexing with an integer array; the inputs themselves are not modified.

    Args:
        X: Array whose first axis indexes the examples.
        y: Array of labels aligned with the rows of ``X``.

    Returns:
        Tuple ``(X_shuffled, y_shuffled)``.
    """
    order = np.random.permutation(X.shape[0])
    return X[order], y[order]
def majority_baseline(y):
    """Return the accuracy of always predicting the most frequent label.

    Args:
        y: Non-empty sequence of hashable labels.

    Returns:
        Fraction of entries in ``y`` equal to the most common label.
    """
    top_label = Counter(y).most_common()[0][0]
    matches = sum(1 for label in y if label == top_label)
    return matches / len(y)
def train_svm(X_train, y_train, C=10, epochs=10, lr=0.01, best_epoch=False):
    """Train a linear SVM with stochastic sub-gradient descent.

    Labels in ``y_train`` are expected to be 0/1; a label of 0 is mapped to
    -1 internally and anything else to +1. Each epoch visits the examples in
    a fresh random order. For an example ``(x, y)``:

    * if ``y * (w.x + b) <= 1`` (margin violated):
      ``w <- (1 - rate) * w + rate * C * y * x`` and
      ``b <- (1 - rate) * b + rate * C * y``;
    * otherwise only the shrinkage ``(1 - rate)`` is applied to ``w`` and ``b``.

    Args:
        X_train: 2-D array with one example per row.
        y_train: Labels aligned with the rows of ``X_train``.
        C: Weight of the loss term relative to the regulariser.
        epochs: Number of passes over the training data.
        lr: Initial learning rate; epoch ``t`` (0-based) uses ``lr / (1 + t)``.
        best_epoch: When true, return the weights from the epoch with the
            highest training accuracy instead of the final weights.

    Returns:
        Tuple ``(w, b, updates)`` where ``updates`` is the number of
        margin-violation updates performed.
    """
    w = np.random.uniform(-0.01, 0.01, size=X_train.shape[1])  # initialize w
    b = np.random.uniform(-0.01, 0.01)
    updates = 0

    # Start below any achievable accuracy so the first evaluated epoch is
    # always recorded; the initial weights are the fallback for epochs == 0.
    best_accuracy = -1.0
    best_w, best_b = w, b

    for count in range(epochs):
        # Decay from the *initial* rate. Dividing the running rate instead
        # compounds the decay (lr, lr, lr/2, lr/6, ...), which shrinks the
        # step factorially and stalls learning after a few epochs.
        rate = lr / (1 + count)
        shuffled_x, shuffled_y = shuffle_arrays(X_train, y_train)
        for i in range(shuffled_x.shape[0]):
            y = -1 if shuffled_y[i] == 0 else 1
            if y * (np.dot(w, shuffled_x[i]) + b) <= 1:
                w = (1 - rate) * w + rate * C * y * shuffled_x[i]
                b = (1 - rate) * b + rate * C * y
                updates += 1
            else:
                w = (1 - rate) * w
                b = (1 - rate) * b

        if best_epoch:
            epoch_accuracy = accuracy(X_train, y_train, w, b)
            if epoch_accuracy > best_accuracy:
                best_accuracy = epoch_accuracy
                # Safe to keep references: every update rebinds ``w`` to a
                # new array rather than modifying it in place.
                best_w = w
                best_b = b

    if best_epoch:
        return best_w, best_b, updates
    return w, b, updates
def cross_validate(learning_rate, C, training_function, num_folds=5,
                   epochs=20, data_path='data/data.train'):
    """Estimate accuracy for one hyper-parameter setting by k-fold cross-validation.

    The examples loaded from ``data_path`` are split, in file order, into
    ``num_folds`` contiguous folds of near-equal size. Each fold is held out
    once: the model is trained on the remaining folds and scored on the
    held-out fold, so no evaluated example is part of the data the model
    was trained on. ``log_transform`` is applied separately to the training
    and the held-out features.

    Args:
        learning_rate: Passed to ``training_function`` as ``lr``.
        C: Passed to ``training_function`` as ``C``.
        training_function: Callable invoked as
            ``training_function(X, y, C=..., epochs=..., lr=...)`` that
            returns a ``(w, b, updates)`` tuple.
        num_folds: Number of folds (at least 2).
        epochs: Passed to ``training_function`` as ``epochs``.
        data_path: File handed to ``read_libsvm``.

    Returns:
        Mean held-out accuracy over all folds.

    Raises:
        ValueError: If ``num_folds`` is below 2 or exceeds the number of
            examples.
    """
    if num_folds < 2:
        raise ValueError('num_folds must be at least 2')

    # Load and densify once instead of re-reading the file for every fold.
    x, y, _ = read_libsvm(data_path)
    x = np.asarray(x.todense())
    y = np.asarray(y)
    if x.shape[0] < num_folds:
        raise ValueError('not enough examples for the requested number of folds')

    # Index arrays (rather than slices) give each split its own copy, so the
    # loaded data is never shared between folds.
    folds = np.array_split(np.arange(x.shape[0]), num_folds)

    scores = []
    for held_out, test_idx in enumerate(folds):
        train_idx = np.concatenate(folds[:held_out] + folds[held_out + 1:])
        w, b, _ = training_function(log_transform(x[train_idx]), y[train_idx],
                                    C=C, epochs=epochs, lr=learning_rate)
        scores.append(accuracy(log_transform(x[test_idx]), y[test_idx], w, b))
    return sum(scores) / float(len(scores))
def run_svm(write=False):
    """Grid-search the SVM hyper-parameters, then train and report accuracies.

    Every (learning rate, C) pair is scored with ``cross_validate`` and the
    results are printed as a table. The best pair is used to train on
    ``data/data.train`` with ``best_epoch=True``, and training and test
    accuracy are printed.

    Args:
        write: When true, also predict on ``data/data.eval.anon`` and hand
            the predictions to ``write_answers``.
    """
    learning_rates = [1, 10**(-1), 10**(-2), 10**(-3), 10**(-4)]
    C = [10, 1, 10 ** (-1), 10 ** (-2), 10 ** (-3), 10 ** (-4)]
    best_params = None
    best_accuracy = 0

    print('Cross Validation')
    print('+---------------+-------+---------------------+')
    print('| Learning rate | C | Average Accuracy |')
    print('+---------------+-------+---------------------+')
    for combo in itertools.product(learning_rates, C):
        result = cross_validate(combo[0], combo[1], train_svm)
        print('|{:>15}'.format(str(combo[0]))+'|{:>8}'.format(str(combo[1])) +'|{:>20}|'.format(str(result)))
        # Always accept the first combination so best_params is never None,
        # even when every cross-validation score is 0.
        if best_params is None or result > best_accuracy:
            best_accuracy = result
            best_params = combo
    print('+---------------+-------+---------------------+')

    print('Best hyper-parameter (learning rate):', best_params[0])
    print('Best hyper-parameter (C):', best_params[1])
    print('Average Accuracy for best hyper-parameter:', best_accuracy)

    x_train, y_train, num_features = read_libsvm(fname='data/data.train')
    # Densify and transform the training matrix once; it is needed both for
    # fitting and for the training-accuracy report.
    train_features = log_transform(np.asarray(x_train.todense()))
    w, b, updates = train_svm(train_features, y_train, epochs=20, lr=best_params[0], C=best_params[1], best_epoch=True)
    training_acc = accuracy(train_features, y_train, w, b)
    print('Training Accuracy:', training_acc)

    x_test, y_test, num_features = read_libsvm(fname='data/data.test')
    test_acc = accuracy(log_transform(np.asarray(x_test.todense())), y_test, w, b)
    print('Test Accuracy:', test_acc)
    print()

    if write:
        x_test, y_test, num_features = read_libsvm(fname='data/data.eval.anon')
        write_answers(log_transform(np.asarray(x_test.todense())), y_test, w, b)
def write_answers(X, y, w, b):
    """Write one prediction per row of ``X`` to ``answers_svm.csv``.

    Example ids are read from ``data/eval.id`` (one per line, surrounding
    whitespace stripped) and paired with the rows of ``X`` by position. The
    output has an ``example_id,label`` header followed by one row per
    example.

    Args:
        X: 2-D array with one example per row.
        y: Unused; kept so existing callers keep working.
        w: Weight vector passed to ``predict``.
        b: Bias term passed to ``predict``.

    Raises:
        ValueError: If the id file has fewer lines than ``X`` has rows.
    """
    with open('data/eval.id') as f:
        ids = [line.strip() for line in f]

    # Check before opening the output so a mismatch cannot leave a
    # half-written answers file behind.
    if len(ids) < X.shape[0]:
        raise ValueError('data/eval.id has fewer ids than there are examples')

    # newline='' lets the csv module control line endings; without it extra
    # blank rows appear on platforms that translate '\n' on write.
    with open('answers_svm.csv', 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=',',
                            quotechar='|', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(['example_id', 'label'])
        for i in range(X.shape[0]):
            prediction = predict(X[i], w, b)
            writer.writerow([ids[i], prediction])