-
Notifications
You must be signed in to change notification settings - Fork 19
/
tf_sentimentmain.py
238 lines (181 loc) · 6.87 KB
/
tf_sentimentmain.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
import tf_data_utils as utils
import sys
import numpy as np
import tensorflow as tf
import random
import tf_tree_lstm
import nary_tree_lstm
DIR = 'data/sst/'
GLOVE_DIR ='data/glove/'
import time
#from tf_data_utils import extract_tree_data,load_sentiment_treebank
class Config(object):
    """Hyperparameter container for the tree-LSTM sentiment models.

    Attributes set to None here are filled in at runtime (after the
    treebank and GloVe vectors are loaded) by train()/train2().
    """
    num_emb = None              # vocabulary size; set from len(vocab) at load time
    emb_dim = 300               # word-embedding dimensionality (matches 300-d GloVe)
    hidden_dim = 150            # tree-LSTM hidden state size
    output_dim = None           # number of sentiment classes; set to 3 or 5 at load time
    degree = 2                  # branching factor of the N-ary tree (binary trees)
    num_labels = 3              # NOTE(review): appears superseded by output_dim — confirm
    num_epochs = 50             # training epochs
    maxseqlen = None            # max sentence length; computed from data in train()
    maxnodesize = None          # max tree node count; computed from data in train()
    fine_grained = False        # False -> 3-way labels, True -> 5-way (SST fine-grained)
    trainable_embeddings = True # whether embeddings receive gradient updates
    nonroot_labels = True       # whether inner (non-root) nodes carry labels too
    embeddings = None           # pretrained embedding matrix; set from vocab.embed_matrix
def train2():
    """Train the batched ("optimized") N-ary tree-LSTM on the SST treebank.

    Loads the Stanford Sentiment Treebank plus GloVe embeddings, builds
    batched trees, then trains nary_tree_lstm.SoftMaxNarytreeLSTM for
    config.num_epochs epochs, reporting dev accuracy each epoch and test
    accuracy whenever the dev score improves.
    """
    config = Config()
    config.batch_size = 25
    config.lr = 0.05       # learning rate
    config.dropout = 0.5
    config.reg = 0.0001    # L2 regularization strength
    config.emb_lr = 0.02   # separate (smaller) learning rate for embeddings

    # Local imports: only this training path needs sklearn.
    import collections
    import numpy as np
    from sklearn import metrics

    def test(model, data, session):
        """Compute accuracy of `model` on batched `data` and print a confusion matrix.

        NOTE(review): relevant_labels = [0, 2] and root_labels/2 collapse the
        3-class labels {0,1,2} to binary {0,1} by dropping/ignoring neutral —
        this assumes fine_grained=False; it would be wrong for 5-way labels.
        """
        relevant_labels = [0, 2]
        ys_true = collections.deque([])
        ys_pred = collections.deque([])
        for batch in data:
            y_pred = model.get_output()
            # Integer division maps label 0 -> 0 and label 2 -> 1.
            y_true = batch[0].root_labels/2
            feed_dict = {model.labels: batch[0].root_labels}
            feed_dict.update(model.tree_lstm.get_feed_dict(batch[0]))
            y_pred_ = session.run([y_pred], feed_dict=feed_dict)
            # Argmax over only the negative/positive logits (skip neutral).
            y_pred_ = np.argmax(y_pred_[0][:,relevant_labels], axis=1)
            ys_true += y_true.tolist()
            ys_pred += y_pred_.tolist()
        ys_true = list(ys_true)
        ys_pred = list(ys_pred)
        score = metrics.accuracy_score(ys_true, ys_pred)
        print "Accuracy", score
        #print "Recall", metrics.recall_score(ys_true, ys_pred)
        #print "f1_score", metrics.f1_score(ys_true, ys_pred)
        print "confusion_matrix"
        print metrics.confusion_matrix(ys_true, ys_pred)
        return score

    data, vocab = utils.load_sentiment_treebank(DIR, GLOVE_DIR, config.fine_grained)
    # data, vocab = utils.load_sentiment_treebank(DIR, None, config.fine_grained)
    config.embeddings = vocab.embed_matrix

    train_set, dev_set, test_set = data['train'], data['dev'], data['test']
    print 'train', len(train_set)
    print 'dev', len(dev_set)
    print 'test', len(test_set)

    num_emb = len(vocab)
    num_labels = 5 if config.fine_grained else 3
    # Sanity-check that every label in every split is within range.
    for _, dataset in data.items():
        labels = [label for _, label in dataset]
        assert set(labels) <= set(xrange(num_labels)), set(labels)
    print 'num emb', num_emb
    print 'num labels', num_labels
    config.num_emb = num_emb
    config.output_dim = num_labels

    # return
    # Unseeded: runs are intentionally non-deterministic.
    random.seed()
    np.random.seed()
    from random import shuffle
    shuffle(train_set)

    # Pre-batch the trees (dev/test use large batches of 500 for evaluation).
    train_set = utils.build_labelized_batch_trees(train_set, config.batch_size)
    dev_set = utils.build_labelized_batch_trees(dev_set, 500)
    test_set = utils.build_labelized_batch_trees(test_set, 500)

    with tf.Graph().as_default():
        #model = tf_seq_lstm.tf_seqLSTM(config)
        # Model sees every batch up front (presumably to size its buffers — confirm).
        model = nary_tree_lstm.SoftMaxNarytreeLSTM(config, train_set + dev_set + test_set)
        init = tf.global_variables_initializer()
        best_valid_score = 0.0
        best_valid_epoch = 0
        dev_score = 0.0
        test_score = 0.0
        with tf.Session() as sess:
            sess.run(init)
            for epoch in range(config.num_epochs):
                start_time = time.time()
                print 'epoch', epoch
                avg_loss = 0.0  # NOTE(review): unused — train_epoch's loss is not captured here
                model.train_epoch(train_set[:], sess)
                print "Training time per epoch is {0}".format(
                    time.time() - start_time)
                print 'validation score'
                score = test(model, dev_set, sess)
                #print 'train score'
                #test(model, train_set[:40], sess)
                # Re-evaluate on test only when the dev score improves.
                if score >= best_valid_score:
                    best_valid_score = score
                    best_valid_epoch = epoch
                    test_score = test(model, test_set, sess)
                print 'test score :', test_score, 'updated', epoch - best_valid_epoch, 'epochs ago with validation score', best_valid_score
def train(restore=False):
    """Train the unbatched tf_tree_lstm.tf_NarytreeLSTM on the SST treebank.

    Args:
        restore: if True, restore weights from './ckpt/tree_rnn_weights'
            before training resumes.

    Reports dev accuracy each epoch and re-evaluates on test whenever the
    dev score improves. Note the checkpoint saver.save call is commented
    out, so weights are never written even on improvement.
    """
    config = Config()
    config.batch_size = 5
    config.lr = 0.05
    data, vocab = utils.load_sentiment_treebank(DIR, GLOVE_DIR, config.fine_grained)
    config.embeddings = vocab.embed_matrix
    config.early_stopping = 2   # NOTE(review): set but not used in this function — confirm consumer
    config.reg = 0.0001
    config.dropout = 1.0        # keep-probability of 1.0, i.e. no dropout — presumably; confirm
    config.emb_lr = 0.1

    train_set, dev_set, test_set = data['train'], data['dev'], data['test']
    print 'train', len(train_set)
    print 'dev', len(dev_set)
    print 'test', len(test_set)

    num_emb = len(vocab)
    num_labels = 5 if config.fine_grained else 3
    # Sanity-check that every label in every split is within range.
    for _, dataset in data.items():
        labels = [label for _, label in dataset]
        assert set(labels) <= set(xrange(num_labels)), set(labels)
    print 'num emb', num_emb
    print 'num labels', num_labels
    config.num_emb = num_emb
    config.output_dim = num_labels

    # Static padding sizes for the whole dataset (this model pads, unlike train2).
    config.maxseqlen = utils.get_max_len_data(data)
    config.maxnodesize = utils.get_max_node_size(data)
    print config.maxnodesize, config.maxseqlen, " maxsize"

    #return
    # Unseeded: runs are intentionally non-deterministic.
    random.seed()
    np.random.seed()

    with tf.Graph().as_default():
        #model = tf_seq_lstm.tf_seqLSTM(config)
        model = tf_tree_lstm.tf_NarytreeLSTM(config)
        init = tf.global_variables_initializer()
        saver = tf.train.Saver()
        best_valid_score = 0.0
        best_valid_epoch = 0
        dev_score = 0.0
        test_score = 0.0
        with tf.Session() as sess:
            sess.run(init)
            if restore: saver.restore(sess, './ckpt/tree_rnn_weights')
            for epoch in range(config.num_epochs):
                start_time = time.time()
                print 'epoch', epoch
                avg_loss = 0.0
                avg_loss = train_epoch(model, train_set, sess)
                print 'avg loss', avg_loss
                print "Training time per epoch is {0}".format(
                    time.time() - start_time)
                dev_score = evaluate(model, dev_set, sess)
                print 'dev-score', dev_score
                # Re-evaluate on test only when the dev score improves.
                if dev_score >= best_valid_score:
                    best_valid_score = dev_score
                    best_valid_epoch = epoch
                    #saver.save(sess,'./ckpt/tree_rnn_weights')
                    test_score = evaluate(model, test_set, sess)
                print 'test score :', test_score, 'updated', epoch - best_valid_epoch, 'epochs ago with validation score', best_valid_score
def train_epoch(model, data, sess):
    """Run one training pass over `data`; return the loss reported by the model."""
    return model.train(data, sess)
def evaluate(model, data, sess):
    """Evaluate the model on `data`; return the accuracy the model reports."""
    return model.evaluate(data, sess)
if __name__ == '__main__':
if len(sys.argv) > 1:
if(sys.argv[1] == "-optimized"):
print "running optimized version"
train2()
else:
print "running not optimized version"
train()
else:
print "running not optimized version, run with option -optimized for the optimized one"
train()