# SVM_Rank.py
import itertools
import numpy as np
from sklearn.linear_model import SGDClassifier, SGDRanking
from sklearn import metrics
# NOTE: SGDRanking comes from an experimental scikit-learn branch and is not in
# released versions; n_iter, zero_one_score and auc_score below are likewise
# from the older scikit-learn API this script was written against.
from minirank.compat import RankSVM as MinirankSVM
from scipy import stats


def transform_pairwise(X, y):
    """Transforms data into pairs with balanced labels for ranking.

    Transforms an n-class ranking problem into a two-class classification
    problem. Subclasses implementing particular strategies for choosing
    pairs should override this method.

    In this method, all pairs are chosen, except for those that have the
    same target value. The output is an array of balanced classes, i.e.
    there are the same number of -1 as +1 labels.

    Parameters
    ----------
    X : array, shape (n_samples, n_features)
        The data.
    y : array, shape (n_samples,) or (n_samples, 2)
        Target labels. If it's a 2D array, the second column represents
        the grouping of samples, i.e., samples with different groups will
        not be considered.

    Returns
    -------
    X_trans : array, shape (k, n_features)
        Data as pairs.
    y_trans : array, shape (k,)
        Output class labels, where classes have values {-1, +1}.
    """
    X_new = []
    y_new = []
    y = np.asarray(y)
    if y.ndim == 1:
        # no group information: put every sample in the same group
        y = np.c_[y, np.ones(y.shape[0])]
    comb = itertools.combinations(range(X.shape[0]), 2)
    for k, (i, j) in enumerate(comb):
        if y[i, 0] == y[j, 0] or y[i, 1] != y[j, 1]:
            # skip if same target or different group
            continue
        X_new.append(X[i] - X[j])
        y_new.append(np.sign(y[i, 0] - y[j, 0]))
        # output balanced classes: flip every other pair
        if y_new[-1] != (-1) ** k:
            y_new[-1] = - y_new[-1]
            X_new[-1] = - X_new[-1]
    return np.asarray(X_new), np.asarray(y_new).ravel()
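
# A minimal worked example of the transform (X_toy / y_toy are illustrative
# names): with three samples and distinct targets, the pairs (0, 1), (0, 2)
# and (1, 2) become difference vectors X[i] - X[j] labelled sign(y[i] - y[j]),
# and every other pair is flipped so the -1/+1 labels stay roughly balanced.
#
#   X_toy = np.array([[1., 0.], [2., 0.], [3., 0.]])
#   y_toy = np.array([0, 1, 2])
#   X_pairs, y_pairs = transform_pairwise(X_toy, y_toy)
#   # X_pairs.shape == (3, 2); y_pairs alternates sign with the pair index.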


class RankSVM(SGDClassifier):
    """Performs pairwise ranking with an underlying SGDClassifier model.

    Input should be an n-class ranking problem; this object will convert it
    into a two-class classification problem, a setting known as
    `pairwise ranking`.

    Authors: Fabian Pedregosa <[email protected]>
             Alexandre Gramfort <[email protected]>

    https://gist.github.com/2071994
    """

    def fit(self, X, y):
        """
        Fit a pairwise ranking model.

        Parameters
        ----------
        X : array, shape (n_samples, n_features)
        y : array, shape (n_samples,) or (n_samples, 2)

        Returns
        -------
        self
        """
        X_trans, y_trans = transform_pairwise(X, y)
        super(RankSVM, self).fit(X_trans, y_trans)
        return self

    def predict(self, X):
        pred = super(RankSVM, self).predict(X)
        # preds are mapped to {-1, 1}
        # FIXME only works in this example!!!
        pred[pred == -1] = 0
        return pred

    def score(self, X, y):
        """
        Because we transformed into a pairwise problem, chance level is at 0.5
        """
        X_trans, y_trans = transform_pairwise(X, y)
        return np.mean(super(RankSVM, self).predict(X_trans) == y_trans)
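
# A rough usage sketch for grouped (query-wise) data; the names below are
# illustrative, not part of this script. When y has a second column, only
# pairs from the same group contribute to the pairwise training set.
#
#   rng = np.random.RandomState(1)
#   X_q = rng.randn(20, 5)
#   y_q = np.c_[rng.randint(0, 3, 20), np.repeat([0, 1], 10)]  # (relevance, group)
#   ranker = RankSVM(n_iter=100, alpha=0.01, loss='hinge').fit(X_q, y_q)
#   ranker.score(X_q, y_q)  # pairwise accuracy; chance level is 0.5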


def rank(clf, X):
    """Return the rank (position in the ascending sort of the decision
    values) of each sample in X under the fitted linear model."""
    if clf.coef_.shape[0] == 1:
        coef = clf.coef_[0]
    else:
        coef = clf.coef_
    order = np.argsort(np.dot(X, coef))
    # invert the permutation: order_inv[i] is the rank of sample i
    order_inv = np.zeros_like(order)
    order_inv[order] = np.arange(len(order))
    return order_inv
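
# For example (illustrative): decision values [0.3, -1.2, 2.0] give
# order_inv == [1, 0, 2], i.e. sample 1 is ranked lowest and sample 2 highest.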


def kendalltau(clf, X, y):
    """Absolute Kendall tau between the model's decision values and y."""
    if clf.coef_.shape[0] == 1:
        coef = clf.coef_[0]
    else:
        coef = clf.coef_
    tau, _ = stats.kendalltau(np.dot(X, coef), y)
    return np.abs(tau)


if __name__ == "__main__":
    rs = np.random.RandomState(0)
    n_samples_1 = 10000
    n_samples_2 = 100
    # two Gaussian blobs with heavily imbalanced class sizes
    X = np.r_[1.5 * rs.randn(n_samples_1, 2),
              0.5 * rs.randn(n_samples_2, 2) + [2, 2]]
    y = np.array([0] * n_samples_1 + [1] * n_samples_2)
    idx = np.arange(y.shape[0])
    rs.shuffle(idx)
    X = X[idx]
    y = y[idx]
    # standardize the features
    mean = X.mean(axis=0)
    std = X.std(axis=0)
    X = (X - mean) / std

    for clf, name in ((SGDClassifier(n_iter=100, alpha=0.01), "plain sgd"),
                      (SGDClassifier(n_iter=100, alpha=0.01,
                                     class_weight={1: 10}), "weighted sgd"),
                      (SGDRanking(n_iter=1000, alpha=0.01,
                                  loss='roc_pairwise_ranking'), "pairwise sgd"),
                      (RankSVM(n_iter=100, alpha=0.01, loss='hinge'), 'RankSVM'),
                      ):
        clf.fit(X, y)
        print(clf)
        pred = clf.predict(X)
        print("ACC: %.4f" % metrics.zero_one_score(y, pred))
        print("AUC: %.4f" % metrics.auc_score(y, pred))
        print("CONFUSION MATRIX: ")
        print(metrics.confusion_matrix(y, pred))
        print("Kendall Tau: %.4f" % kendalltau(clf, X, y))
        print(80 * '=')

    clf = MinirankSVM(max_iter=100, alpha=0.01).fit(X, y)
    print(clf)
    scores = np.dot(X, clf.coef_)
    pred = (scores > 0).astype(int)
    print("ACC: %.4f" % metrics.zero_one_score(y, pred))
    print("AUC: %.4f" % metrics.auc_score(y, pred))
    print("CONFUSION MATRIX: ")
    print(metrics.confusion_matrix(y, pred))
    print("Kendall Tau: %.4f" % kendalltau(clf, X, y))
    print(80 * '=')