# -*- coding: utf-8 -*-
"""
Created on Sun Nov 5 12:58:52 2017
@author: NishitP
"""
import DataPrep
import FeatureSelection
import numpy as np
import pandas as pd
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, f1_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
#string to test
doc_new = ['obama is running for president in 2016']
#the feature selection has been done in the FeatureSelection.py module. Here we will build models using those features for prediction
#first we will use the bag-of-words technique
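#for context, a minimal sketch of what FeatureSelection.countV is assumed to provide
#(the real definition lives in FeatureSelection.py and its arguments may differ):
countV_sketch = CountVectorizer(stop_words='english') #bag-of-words term counts over the statement text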
#building classifier using naive bayes
nb_pipeline = Pipeline([
        ('NBCV',FeatureSelection.countV),
        ('nb_clf',MultinomialNB())])
nb_pipeline.fit(DataPrep.train_news['Statement'],DataPrep.train_news['Label'])
predicted_nb = nb_pipeline.predict(DataPrep.test_news['Statement'])
np.mean(predicted_nb == DataPrep.test_news['Label'])
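#as a quick sanity check, the fitted pipeline can also classify the test string defined at the top
print(nb_pipeline.predict(doc_new))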
#building classifier using logistic regression
logR_pipeline = Pipeline([
        ('LogRCV',FeatureSelection.countV),
        ('LogR_clf',LogisticRegression())
        ])
logR_pipeline.fit(DataPrep.train_news['Statement'],DataPrep.train_news['Label'])
predicted_LogR = logR_pipeline.predict(DataPrep.test_news['Statement'])
np.mean(predicted_LogR == DataPrep.test_news['Label'])
#building Linear SVM classifier
svm_pipeline = Pipeline([
        ('svmCV',FeatureSelection.countV),
        ('svm_clf',svm.LinearSVC())
        ])
svm_pipeline.fit(DataPrep.train_news['Statement'],DataPrep.train_news['Label'])
predicted_svm = svm_pipeline.predict(DataPrep.test_news['Statement'])
np.mean(predicted_svm == DataPrep.test_news['Label'])
#using SVM with Stochastic Gradient Descent on hinge loss
sgd_pipeline = Pipeline([
        ('svm2CV',FeatureSelection.countV),
        #n_iter was renamed to max_iter in newer scikit-learn versions
        ('svm2_clf',SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, max_iter=5))
        ])
sgd_pipeline.fit(DataPrep.train_news['Statement'],DataPrep.train_news['Label'])
predicted_sgd = sgd_pipeline.predict(DataPrep.test_news['Statement'])
np.mean(predicted_sgd == DataPrep.test_news['Label'])
#random forest
random_forest = Pipeline([
        ('rfCV',FeatureSelection.countV),
        ('rf_clf',RandomForestClassifier(n_estimators=200,n_jobs=3))
        ])
random_forest.fit(DataPrep.train_news['Statement'],DataPrep.train_news['Label'])
predicted_rf = random_forest.predict(DataPrep.test_news['Statement'])
np.mean(predicted_rf == DataPrep.test_news['Label'])
#user-defined function for K-fold cross validation
def build_confusion_matrix(classifier):

    k_fold = KFold(n_splits=5)
    scores = []
    confusion = np.array([[0,0],[0,0]])

    for train_ind, test_ind in k_fold.split(DataPrep.train_news):
        train_text = DataPrep.train_news.iloc[train_ind]['Statement']
        train_y = DataPrep.train_news.iloc[train_ind]['Label']

        test_text = DataPrep.train_news.iloc[test_ind]['Statement']
        test_y = DataPrep.train_news.iloc[test_ind]['Label']

        classifier.fit(train_text,train_y)
        predictions = classifier.predict(test_text)

        confusion += confusion_matrix(test_y,predictions)
        score = f1_score(test_y,predictions)
        scores.append(score)

    print('Total statements classified:', len(DataPrep.train_news))
    print('Score:', sum(scores)/len(scores))
    print('score length', len(scores))
    print('Confusion matrix:')
    print(confusion)
#K-fold cross validation for all classifiers
build_confusion_matrix(nb_pipeline)
build_confusion_matrix(logR_pipeline)
build_confusion_matrix(svm_pipeline)
build_confusion_matrix(sgd_pipeline)
build_confusion_matrix(random_forest)
#========================================================================================
#Bag of words confusion matrix and F1 scores
#Naive bayes
# [2118 2370]
# [1664 4088]
# f1-Score: 0.669611539651
#Logistic regression
# [2252 2236]
# [1933 3819]
# f1-Score: 0.646909097798
#svm
# [2260 2228]
# [2246 3506]
#f1-score: 0.610468748792
#sgdclassifier
# [2414 2074]
# [2042 3710]
# f1-Score: 0.640874558778
#random forest classifier
# [1821 2667]
# [1192 4560]
# f1-Score: 0.702651511011
#=========================================================================================
"""So far we have used bag of words technique to extract the features and passed those featuers into classifiers. We have also seen the
f1 scores of these classifiers. now lets enhance these features using term frequency weights with various n-grams
"""
##Now using n-grams
#naive-bayes classifier
nb_pipeline_ngram = Pipeline([
        ('nb_tfidf',FeatureSelection.tfidf_ngram),
        ('nb_clf',MultinomialNB())])
nb_pipeline_ngram.fit(DataPrep.train_news['Statement'],DataPrep.train_news['Label'])
predicted_nb_ngram = nb_pipeline_ngram.predict(DataPrep.test_news['Statement'])
np.mean(predicted_nb_ngram == DataPrep.test_news['Label'])
#logistic regression classifier
logR_pipeline_ngram = Pipeline([
        ('LogR_tfidf',FeatureSelection.tfidf_ngram),
        ('LogR_clf',LogisticRegression(penalty="l2",C=1))
        ])
logR_pipeline_ngram.fit(DataPrep.train_news['Statement'],DataPrep.train_news['Label'])
predicted_LogR_ngram = logR_pipeline_ngram.predict(DataPrep.test_news['Statement'])
np.mean(predicted_LogR_ngram == DataPrep.test_news['Label'])
#linear SVM classifier
svm_pipeline_ngram = Pipeline([
        ('svm_tfidf',FeatureSelection.tfidf_ngram),
        ('svm_clf',svm.LinearSVC())
        ])
svm_pipeline_ngram.fit(DataPrep.train_news['Statement'],DataPrep.train_news['Label'])
predicted_svm_ngram = svm_pipeline_ngram.predict(DataPrep.test_news['Statement'])
np.mean(predicted_svm_ngram == DataPrep.test_news['Label'])
#sgd classifier
sgd_pipeline_ngram = Pipeline([
        ('sgd_tfidf',FeatureSelection.tfidf_ngram),
        #n_iter was renamed to max_iter in newer scikit-learn versions
        ('sgd_clf',SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, max_iter=5))
        ])
sgd_pipeline_ngram.fit(DataPrep.train_news['Statement'],DataPrep.train_news['Label'])
predicted_sgd_ngram = sgd_pipeline_ngram.predict(DataPrep.test_news['Statement'])
np.mean(predicted_sgd_ngram == DataPrep.test_news['Label'])
#random forest classifier
random_forest_ngram = Pipeline([
        ('rf_tfidf',FeatureSelection.tfidf_ngram),
        ('rf_clf',RandomForestClassifier(n_estimators=300,n_jobs=3))
        ])
random_forest_ngram.fit(DataPrep.train_news['Statement'],DataPrep.train_news['Label'])
predicted_rf_ngram = random_forest_ngram.predict(DataPrep.test_news['Statement'])
np.mean(predicted_rf_ngram == DataPrep.test_news['Label'])
#K-fold cross validation for all classifiers
build_confusion_matrix(nb_pipeline_ngram)
build_confusion_matrix(logR_pipeline_ngram)
build_confusion_matrix(svm_pipeline_ngram)
build_confusion_matrix(sgd_pipeline_ngram)
build_confusion_matrix(random_forest_ngram)
#========================================================================================
#n-grams & tfidf confusion matrix and F1 scores
#Naive bayes
# [841 3647]
# [427 5325]
# f1-Score: 0.723262051071
#Logistic regression
# [1617 2871]
# [1097 4655]
# f1-Score: 0.70113000531
#svm
# [2016 2472]
# [1524 4228]
# f1-Score: 0.67909201429
#sgdclassifier
# [ 10 4478]
# [ 13 5739]
# f1-Score: 0.718731637053
#random forest
# [1979 2509]
# [1630 4122]
# f1-Score: 0.665720333284
#=========================================================================================
print(classification_report(DataPrep.test_news['Label'], predicted_nb_ngram))
print(classification_report(DataPrep.test_news['Label'], predicted_LogR_ngram))
print(classification_report(DataPrep.test_news['Label'], predicted_svm_ngram))
print(classification_report(DataPrep.test_news['Label'], predicted_sgd_ngram))
print(classification_report(DataPrep.test_news['Label'], predicted_rf_ngram))
DataPrep.test_news['Label'].shape
"""
Out of all the models fitted, we would take 2 best performing model. we would call them candidate models
from the confusion matrix, we can see that random forest and logistic regression are best performing
in terms of precision and recall (take a look into false positive and true negative counts which appeares
to be low compared to rest of the models)
"""
#grid-search parameter optimization
#random forest classifier parameters
parameters = {'rf_tfidf__ngram_range': [(1, 1), (1, 2),(1,3),(1,4),(1,5)],
               'rf_tfidf__use_idf': (True, False),
               'rf_clf__max_depth': (1,2,3,4,5,6,7,8,9,10,11,12,13,14,15)
}
gs_clf = GridSearchCV(random_forest_ngram, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(DataPrep.train_news['Statement'][:10000],DataPrep.train_news['Label'][:10000])
gs_clf.best_score_
gs_clf.best_params_
gs_clf.cv_results_
#logistic regression parameters
parameters = {'LogR_tfidf__ngram_range': [(1, 1), (1, 2),(1,3),(1,4),(1,5)],
               'LogR_tfidf__use_idf': (True, False),
               'LogR_tfidf__smooth_idf': (True, False)
}
gs_clf = GridSearchCV(logR_pipeline_ngram, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(DataPrep.train_news['Statement'][:10000],DataPrep.train_news['Label'][:10000])
gs_clf.best_score_
gs_clf.best_params_
gs_clf.cv_results_
#Linear SVM
parameters = {'svm_tfidf__ngram_range': [(1, 1), (1, 2),(1,3),(1,4),(1,5)],
               'svm_tfidf__use_idf': (True, False),
               'svm_tfidf__smooth_idf': (True, False),
               #note: LinearSVC only supports penalty='l1' together with dual=False (and loss='squared_hinge')
               'svm_clf__penalty': ('l1','l2'),
}
gs_clf = GridSearchCV(svm_pipeline_ngram, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(DataPrep.train_news['Statement'][:10000],DataPrep.train_news['Label'][:10000])
gs_clf.best_score_
gs_clf.best_params_
gs_clf.cv_results_
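#GridSearchCV (with the default refit=True) also keeps a copy of the pipeline refit with the best
#parameters, so the tuned model can be reused directly instead of retyping the values, for example:
best_tuned_pipeline = gs_clf.best_estimator_
print(best_tuned_pipeline.score(DataPrep.test_news['Statement'], DataPrep.test_news['Label']))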
#by running the above commands we can find the model with the best performing parameters
#running both random forest and logistic regression models again with the best parameters found by the GridSearch method
random_forest_final = Pipeline([
        ('rf_tfidf',TfidfVectorizer(stop_words='english',ngram_range=(1,3),use_idf=True,smooth_idf=True)),
        ('rf_clf',RandomForestClassifier(n_estimators=300,n_jobs=3,max_depth=10))
        ])
random_forest_final.fit(DataPrep.train_news['Statement'],DataPrep.train_news['Label'])
predicted_rf_final = random_forest_final.predict(DataPrep.test_news['Statement'])
np.mean(predicted_rf_final == DataPrep.test_news['Label'])
print(classification_report(DataPrep.test_news['Label'], predicted_rf_final))
logR_pipeline_final = Pipeline([
        #('LogRCV',countV_ngram),
        ('LogR_tfidf',TfidfVectorizer(stop_words='english',ngram_range=(1,5),use_idf=True,smooth_idf=False)),
        ('LogR_clf',LogisticRegression(penalty="l2",C=1))
        ])
logR_pipeline_final.fit(DataPrep.train_news['Statement'],DataPrep.train_news['Label'])
predicted_LogR_final = logR_pipeline_final.predict(DataPrep.test_news['Statement'])
np.mean(predicted_LogR_final == DataPrep.test_news['Label'])
#accuracy = 0.62
print(classification_report(DataPrep.test_news['Label'], predicted_LogR_final))
"""
by running both random forest and logistic regression with GridSearch's best parameter estimation, we found that for random
forest model with n-gram has better accuracty than with the parameter estimated. The logistic regression model with best parameter
has almost similar performance as n-gram model so logistic regression will be out choice of model for prediction.
"""
#saving best model to the disk
model_file = 'final_model.sav'
pickle.dump(logR_pipeline_ngram,open(model_file,'wb'))
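#the saved model can later be loaded back and used on new statements, e.g. on the test string defined at the top
loaded_model = pickle.load(open(model_file, 'rb'))
print(loaded_model.predict(doc_new))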
#Plotting learning curve
def plot_learning_curve(pipeline, title):
    size = 10000
    #note: the first argument of model_selection.KFold is the number of splits, not the sample size,
    #so we use a 5-fold shuffled split here and cap the data at `size` rows
    cv = KFold(n_splits=5, shuffle=True)

    X = DataPrep.train_news["Statement"][:size]
    y = DataPrep.train_news["Label"][:size]

    pl = pipeline
    pl.fit(X, y)

    train_sizes, train_scores, test_scores = learning_curve(pl, X, y, n_jobs=-1, cv=cv, train_sizes=np.linspace(.1, 1.0, 5), verbose=0)

    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    plt.figure()
    plt.title(title)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    plt.gca().invert_yaxis()

    # box-like grid
    plt.grid()

    # plot the std deviation as a transparent range at each training set size
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.1, color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, alpha=0.1, color="g")

    # plot the average training and test score lines at each training set size
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score")

    # the legend is drawn after the labelled lines so it actually has entries to show
    plt.legend(loc="best")

    # sizes the window for readability and displays the plot
    # shows error from 0 to 1.1
    plt.ylim(-.1, 1.1)
    plt.show()
#below commands will plot learning curves for each of the classifiers
plot_learning_curve(nb_pipeline_ngram,"Naive-bayes Classifier")
plot_learning_curve(logR_pipeline_ngram,"LogisticRegression Classifier")
plot_learning_curve(svm_pipeline_ngram,"SVM Classifier")
plot_learning_curve(sgd_pipeline_ngram,"SGD Classifier")
plot_learning_curve(random_forest_ngram,"RandomForest Classifier")
"""
by plotting the learning cureve for logistic regression, it can be seen that cross-validation score is stagnating throughout and it
is unable to learn from data. Also we see that there are high errors that indicates model is simple and we may want to increase the
model complexity.
"""
#plotting Precision-Recall curve
def plot_PR_curve(predictions):

    precision, recall, thresholds = precision_recall_curve(DataPrep.test_news['Label'], predictions)
    average_precision = average_precision_score(DataPrep.test_news['Label'], predictions)

    plt.step(recall, precision, color='b', alpha=0.2,
             where='post')
    plt.fill_between(recall, precision, step='post', alpha=0.2,
                     color='b')

    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.title('2-class Precision-Recall curve: AP={0:0.2f}'.format(
              average_precision))
    plt.show()
plot_PR_curve(predicted_LogR_ngram)
plot_PR_curve(predicted_rf_ngram)
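#note: the curves above are built from hard class predictions, so they contain only a few points.
#a smoother curve can be obtained from continuous scores; an illustrative sketch (not in the original code):
logR_scores = logR_pipeline_ngram.decision_function(DataPrep.test_news['Statement'])
plot_PR_curve(logR_scores)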
"""
Now let's extract the most informative feature from ifidf vectorizer for all fo the classifiers and see of there are any common
words that we can identify i.e. are these most informative feature acorss the classifiers are same? we will create a function that
will extract top 50 features.
"""
def show_most_informative_features(model, vect, clf, text=None, n=50):
    # Extract the vectorizer and the classifier from the pipeline
    vectorizer = model.named_steps[vect]
    classifier = model.named_steps[clf]

    # Check to make sure that we can perform this computation
    if not hasattr(classifier, 'coef_'):
        raise TypeError(
            "Cannot compute most informative features on {}.".format(
                classifier.__class__.__name__
            )
        )

    if text is not None:
        # Compute the coefficients for the text
        tvec = model.transform([text]).toarray()
    else:
        # Otherwise simply use the coefficients
        tvec = classifier.coef_

    # Zip the feature names with the coefs and sort
    # (on newer scikit-learn versions use vectorizer.get_feature_names_out() instead)
    coefs = sorted(
        zip(tvec[0], vectorizer.get_feature_names()),
        reverse=True
    )

    # Get the top n and bottom n coef, name pairs
    topn = zip(coefs[:n], coefs[:-(n+1):-1])

    # Create the output string to return
    output = []

    # If text, add the predicted value to the output.
    if text is not None:
        output.append("\"{}\"".format(text))
        output.append(
            "Classified as: {}".format(model.predict([text]))
        )
        output.append("")

    # Create two columns with most negative and most positive features.
    for (cp, fnp), (cn, fnn) in topn:
        output.append(
            "{:0.4f}{: >15}    {:0.4f}{: >15}".format(
                cp, fnp, cn, fnn
            )
        )
    #return "\n".join(output)
    print("\n".join(output))
show_most_informative_features(logR_pipeline_ngram,vect='LogR_tfidf',clf='LogR_clf')
show_most_informative_features(nb_pipeline_ngram,vect='nb_tfidf',clf='nb_clf')
show_most_informative_features(svm_pipeline_ngram,vect='svm_tfidf',clf='svm_clf')
show_most_informative_features(sgd_pipeline_ngram,vect='sgd_tfidf',clf='sgd_clf')