forked from sergeio/text_clustering
-
Notifications
You must be signed in to change notification settings - Fork 0
/
utils.py
executable file
·167 lines (134 loc) · 5.59 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
import csv
import numpy as np
import matplotlib.pyplot as plt
import project1 as p1
import sys
# True when the interpreter is Python 3 or newer. The file helpers in this
# module consult it to pick open() arguments that differ between Python 2 and 3.
PYTHON3 = sys.version_info[0] >= 3
def load_toy_data(path_toy_data):
    """
    Reads the 2D toy dataset from a tab-separated text file.

    Each row of the file is read as three numeric columns: the label first,
    followed by the two coordinates of the point.

    Returns the tuple (features, labels) in which features is an Nx2 numpy
    matrix and labels is a length-N vector of +1/-1 labels.
    """
    # unpack=True hands the file back column-wise, one array per column.
    columns = np.loadtxt(path_toy_data, delimiter='\t', unpack=True)
    labels, first_coord, second_coord = columns
    # Stack the two coordinate columns as rows, then flip to one point per row.
    features = np.vstack([first_coord, second_coord]).transpose()
    return features, labels
def load_data(path_data, extras=False):
    """
    Loads the tab-separated review file at `path_data`.

    Returns a list of dict with keys:
    * sentiment: +1 or -1 if the review was positive or negative, respectively
    * text: the text of the review
    Additionally, if the `extras` argument is True, each dict will also include the
    following information:
    * productId: a string that uniquely identifies each product
    * userId: a string that uniquely identifies each user
    * summary: the title of the review
    * helpfulY: the number of users who thought this review was helpful
    * helpfulN: the number of users who thought this review was NOT helpful

    The sentiment, helpfulY and helpfulN values are converted to int whenever
    they are non-empty; every other value is left as the string read from the
    file. When `extras` is False, all columns other than sentiment and text are
    dropped from each row.
    """
    basic_fields = {'sentiment', 'text'}
    numeric_fields = {'sentiment', 'helpfulY', 'helpfulN'}

    # Python 2's built-in open() does not accept an `encoding` argument.
    open_kwargs = {'encoding': 'latin1'} if PYTHON3 else {}

    data = []
    # The context manager closes the file even when a row fails to convert.
    with open(path_data, **open_kwargs) as f_data:
        for datum in csv.DictReader(f_data, delimiter='\t'):
            # Iterate over a snapshot of the keys: entries are deleted in the loop.
            for field in list(datum.keys()):
                if not extras and field not in basic_fields:
                    del datum[field]
                elif field in numeric_fields and datum[field]:
                    datum[field] = int(datum[field])
            data.append(datum)
    return data
def write_predictions(path_submit_data, preds):
    """
    Rewrites the tab-separated file at `path_submit_data` in place, replacing
    the `sentiment` value of every data row with the matching prediction.

    Arguments:
    * path_submit_data - path of the tab-separated file to update; its header
      row and all other columns are written back as they were read
    * preds - one prediction per data row, in row order, given as a numpy
      array or any sequence numpy can convert; each value must equal +1 or -1
      after conversion to int

    Raises AssertionError if the number of predictions does not match the
    number of data rows, or if any prediction is not +1 or -1. All validation
    happens before the file is reopened for writing, so a rejected call leaves
    the file untouched.
    """
    if PYTHON3:
        # Read and write with the same codec so non-ASCII bytes survive the
        # round trip, and pass newline='' as the csv module requires so that
        # line endings are neither translated on read nor doubled on write.
        read_kwargs = {'encoding': 'latin1', 'newline': ''}
        write_mode = 'w'
        write_kwargs = {'encoding': 'latin1', 'newline': ''}
    else:
        # Python 2's open() has no encoding/newline arguments; the csv module
        # there expects the output file to be opened in binary mode.
        read_kwargs = {}
        write_mode = 'wb'
        write_kwargs = {}

    with open(path_submit_data, **read_kwargs) as f_data:
        reader = csv.DictReader(f_data, delimiter='\t')
        data = list(reader)
        fieldnames = reader.fieldnames

    # Raised explicitly rather than via `assert` so the checks still run under
    # `python -O`, while keeping the exception type unchanged.
    if len(preds) != len(data):
        raise AssertionError(
            'Expected {} predictions but {} were given.'.format(len(data), len(preds)))
    for pred, datum in zip(np.asarray(preds).astype(int), data):
        if pred != 1 and pred != -1:
            raise AssertionError('Invalid prediction: {}.'.format(pred))
        datum['sentiment'] = int(pred)

    with open(path_submit_data, write_mode, **write_kwargs) as f_out:
        writer = csv.DictWriter(f_out, delimiter='\t', fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)
def plot_toy_data(algo_name, features, labels, thetas):
    """
    Shows a 2D scatter plot of the toy data with the learned decision
    boundary drawn over it.

    Arguments:
    * algo_name - the string name of the learning algorithm used; each
      space-separated word is capitalized and shown in the figure title
    * features - an Nx2 ndarray of features (points)
    * labels - a length-N vector of +1/-1 labels
    * thetas - the tuple (theta, theta_0) that is the output of the learning algorithm
    """
    plt.subplots()

    # Points labelled 1 are drawn blue, everything else red.
    point_colors = []
    for label in labels:
        point_colors.append('b' if label == 1 else 'r')
    plt.scatter(features[:, 0], features[:, 1], s=40, c=point_colors)

    # Draw the boundary across the horizontal range the scatter plot settled on.
    x_low, x_high = plt.axis()[:2]
    theta, theta_0 = thetas
    boundary_x = np.linspace(x_low, x_high)
    # Solve theta[0]*x + theta[1]*y + theta_0 = 0 for y; the tiny constant
    # keeps the denominator from being exactly zero.
    boundary_y = -(theta[0]*boundary_x + theta_0) / (theta[1] + 1e-16)
    plt.plot(boundary_x, boundary_y, 'k-')

    title_name = ' '.join(word.capitalize() for word in algo_name.split(' '))
    plt.suptitle('Classified Toy Data ({})'.format(title_name))
    plt.show()
def plot_tune_results(algo_name, param_name, param_vals, acc_train, acc_val):
    """
    Shows how classification accuracy on the training and validation data
    changes across several values of a hyperparameter used during training.

    Arguments:
    * algo_name - name of the learning algorithm; each space-separated word is
      capitalized and shown in the figure title
    * param_name - name of the hyperparameter; capitalized and used in the
      title and as the x-axis label
    * param_vals - the hyperparameter values, plotted on the x-axis
    * acc_train - accuracies on the training data, one per value
    * acc_val - accuracies on the validation data, one per value
    """
    plt.subplots()
    # Training curve first, validation second, to match the legend order below.
    for accuracies in (acc_train, acc_val):
        plt.plot(param_vals, accuracies, '-o')

    # Titles, legend and axis labels.
    pretty_algo = ' '.join(word.capitalize() for word in algo_name.split(' '))
    pretty_param = param_name.capitalize()
    plt.suptitle('Classification Accuracy vs {} ({})'.format(pretty_param, pretty_algo))
    plt.legend(['train','val'], loc='upper right', title='Partition')
    plt.xlabel(pretty_param)
    plt.ylabel('Accuracy (%)')
    plt.show()
def tune(train_fn, param_vals, train_feats, train_labels, val_feats, val_labels):
    """
    Trains one classifier per hyperparameter value and records its accuracy
    on both the training and the validation data.

    Arguments:
    * train_fn - called as train_fn(train_feats, train_labels, value); must
      return the tuple (theta, theta_0)
    * param_vals - the hyperparameter values to try, in order
    * train_feats, train_labels - data used for training and for the training accuracy
    * val_feats, val_labels - data used for the validation accuracy

    Returns the tuple (train_accs, val_accs), two arrays holding one accuracy
    per entry of param_vals.
    """
    n_vals = len(param_vals)
    # Allocated uninitialized; every slot is filled in by the loop below.
    train_accs = np.ndarray(n_vals)
    val_accs = np.ndarray(n_vals)

    def score(feats, labels, theta, theta_0):
        # Classify with the given parameters and compare against the labels.
        return p1.accuracy(p1.classify(feats, theta, theta_0), labels)

    for idx, param in enumerate(param_vals):
        theta, theta_0 = train_fn(train_feats, train_labels, param)
        train_accs[idx] = score(train_feats, train_labels, theta, theta_0)
        val_accs[idx] = score(val_feats, val_labels, theta, theta_0)

    return train_accs, val_accs
def tune_perceptron(*args):
    """
    Runs `tune` with `p1.perceptron` as the training function; `args` are
    forwarded unchanged as the remaining arguments of `tune`.
    """
    trainer = p1.perceptron
    return tune(trainer, *args)
def tune_avg_perceptron(*args):
    """
    Runs `tune` with `p1.average_perceptron` as the training function; `args`
    are forwarded unchanged as the remaining arguments of `tune`.
    """
    trainer = p1.average_perceptron
    return tune(trainer, *args)
def tune_pegasos_T(best_L, *args):
    """
    Runs `tune` over the third positional argument of `p1.pegasos` (called T
    here) while its fourth argument stays fixed at `best_L`; `args` are
    forwarded unchanged as the remaining arguments of `tune`.
    """
    return tune(
        lambda features, labels, T: p1.pegasos(features, labels, T, best_L),
        *args
    )
def tune_pegasos_L(best_T, *args):
    """
    Runs `tune` over the fourth positional argument of `p1.pegasos` (called L
    here) while its third argument stays fixed at `best_T`; `args` are
    forwarded unchanged as the remaining arguments of `tune`.
    """
    return tune(
        lambda features, labels, L: p1.pegasos(features, labels, best_T, L),
        *args
    )
def most_explanatory_word(theta, wordlist):
    """
    Returns every word of `wordlist` ordered from the largest to the smallest
    corresponding weight in `theta`, so the first entry is the word whose
    bag-of-words feature has the largest weight.

    Words with equal weights appear in reverse alphabetical order, because the
    (weight, word) pairs are sorted ascending and then reversed.
    """
    ranked = sorted(zip(theta, wordlist))
    ranked.reverse()
    return [word for _weight, word in ranked]