-
Notifications
You must be signed in to change notification settings - Fork 0
/
run.py
129 lines (103 loc) · 4.74 KB
/
run.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
#-- Ivan Felipe Rodriguez & Remi Megret
# USAGE
# python run.py --dataset data.txt --labels labels.txt --output experiment.csv
import copy
import os
import pandas as pd
import numpy as np
import argparse
import matplotlib.pyplot as plt
try:
from sklearn.model_selection import train_test_split
except ImportError:
from sklearn.cross_validation import train_test_split
# libact classes
from libact.base.dataset import Dataset, import_libsvm_sparse
from libact.models import SklearnAdapter
from libact.models import *
from libact.query_strategies import *
from libact.labelers import IdealLabeler
from sklearn.ensemble import GradientBoostingClassifier
def run(trn_ds, tst_ds, lbr, model, qs, quota, fully_labeled_trn_ds):
    """Run an active-learning loop and record the error after every query.

    Each of the ``quota`` iterations asks ``qs`` for the index of the next
    sample to label, obtains that sample's label from ``lbr``, stores it in
    ``trn_ds``, retrains ``model`` on ``trn_ds`` and scores the model on the
    three datasets.

    Args:
        trn_ds: Training pool. Must expose ``data`` (an iterable of
            ``(features, label)`` pairs) and ``update(index, label)``.
        tst_ds: Dataset passed to ``model.score`` for the ``E_out`` curve.
        lbr: Labeler exposing ``label(features)``.
        model: Learner exposing ``train(dataset)`` and ``score(dataset)``.
        qs: Query strategy exposing ``make_query()``.
        quota: Number of queries to perform.
        fully_labeled_trn_ds: Dataset passed to ``model.score`` for the
            ``E_full`` curve.

    Returns:
        Tuple ``(E_in, E_out, E_full)`` of 1-D float arrays with one entry
        per query; every entry is ``1 - model.score(dataset)``. The arrays
        are empty when ``quota`` is 0.
    """
    # Plain lists are appended to in O(1); np.append would copy the whole
    # array on every query.
    e_in, e_out, e_full = [], [], []
    for _ in range(quota):
        # Standard usage of libact objects
        ask_id = qs.make_query()
        features, _labels = zip(*trn_ds.data)
        trn_ds.update(ask_id, lbr.label(features[ask_id]))
        model.train(trn_ds)
        e_in.append(1 - model.score(trn_ds))
        e_out.append(1 - model.score(tst_ds))
        e_full.append(1 - model.score(fully_labeled_trn_ds))
    # Convert once at the end so the return type is an ndarray even when
    # no query was made.
    return (np.asarray(e_in, dtype=float),
            np.asarray(e_out, dtype=float),
            np.asarray(e_full, dtype=float))
def split_train_test(X, y, test_size, n_labeled):
    """Split the data into a partially labelled pool and a test set.

    The data is first split into train and test parts (stratified on ``y``).
    A stratified subset of ``n_labeled`` training samples is then chosen to
    be labelled from the start; those samples are placed at the front of the
    training arrays so that every feature row stays paired with its own
    label.

    Args:
        X: Feature array, one row per sample.
        y: Label array aligned with ``X``.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
        n_labeled: Number of training samples that start out labelled.

    Returns:
        Tuple ``(trn_ds, tst_ds, y_train, fully_labeled_trn_ds)``:
        ``trn_ds`` holds the training features with labels for the first
        ``n_labeled`` entries and ``None`` for the rest, ``tst_ds`` holds the
        test split, ``y_train`` is the complete training label array in the
        same order as ``trn_ds``, and ``fully_labeled_trn_ds`` holds the
        training features with all of their labels.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y)
    X_labeled, X_unlabeled, y_labeled, y_unlabeled = train_test_split(
        X_train, y_train, train_size=n_labeled, stratify=y_train)

    # The second split shuffles the samples, so the training arrays are
    # rebuilt in "labelled first" order. Attaching y_labeled to the first
    # rows of the unshuffled X_train would pair labels with the wrong rows.
    X_train = np.concatenate([X_labeled, X_unlabeled])
    y_train = np.concatenate([y_labeled, y_unlabeled])

    trn_ds = Dataset(X_train, np.concatenate(
        [y_labeled, [None] * len(y_unlabeled)]))
    tst_ds = Dataset(X_test, y_test)
    fully_labeled_trn_ds = Dataset(X_train, y_train)
    return trn_ds, tst_ds, y_train, fully_labeled_trn_ds
def active_learning(data, labels, test_size, n_labeled):
    """Run one UncertaintySampling vs. RandomSampling comparison.

    The data is split once; both strategies start from identical copies of
    the partially labelled pool and query until every training sample has
    been labelled.

    Args:
        data: Feature array handed to ``split_train_test``.
        labels: Label array handed to ``split_train_test``.
        test_size: Test split size handed to ``split_train_test``.
        n_labeled: Number of samples that are labelled before querying.

    Returns:
        DataFrame with one row per query and the columns ``E_in_1``,
        ``E_in_2``, ``E_out_1``, ``E_out_2``, ``E_full_1``, ``E_full_2``;
        suffix ``_1`` is the UncertaintySampling run and ``_2`` the
        RandomSampling run.
    """
    # Load dataset
    uncertainty_pool, tst_ds, y_train, fully_labeled = split_train_test(
        data, labels, test_size, n_labeled)
    random_pool = copy.deepcopy(uncertainty_pool)
    oracle = IdealLabeler(fully_labeled)
    n_queries = len(y_train) - n_labeled  # number of samples to query

    # The base learner is shared by both runs; run() retrains it after
    # every query.
    learner = SklearnProbaAdapter(GradientBoostingClassifier(
        n_estimators=5, learning_rate=1.0, max_depth=2, random_state=0))

    uncertainty_qs = UncertaintySampling(
        uncertainty_pool, method='lc', model=learner)
    uncertainty_errors = run(
        uncertainty_pool, tst_ds, oracle, learner, uncertainty_qs,
        n_queries, fully_labeled)

    random_qs = RandomSampling(random_pool)
    random_errors = run(
        random_pool, tst_ds, oracle, learner, random_qs,
        n_queries, fully_labeled)

    # Interleave the curves so each error type sits next to its counterpart:
    # (in_1, in_2, out_1, out_2, full_1, full_2).
    column_names = ["E_in_1", "E_in_2", "E_out_1",
                    "E_out_2", "E_full_1", "E_full_2"]
    curves = [curve
              for pair in zip(uncertainty_errors, random_errors)
              for curve in pair]
    return pd.DataFrame(data=curves, index=column_names).transpose()
def plotting(data, colors=('darkblue', 'orange', 'b', 'r', 'lightblue', 'pink')):
    """Plot the learning curves stored in ``data``.

    Columns are consumed in consecutive pairs: the first column of each pair
    is drawn with a ``'qs '`` legend prefix and the second with a
    ``'random '`` prefix. The last two characters of each column name (the
    ``_1`` / ``_2`` suffix) are dropped from the legend label.

    Args:
        data: DataFrame with one row per query and an even number of
            columns.
        colors: Line colors, indexed by column position.
    """
    # The x-axis is the number of queries, i.e. the number of rows in data.
    query_num = np.arange(1, len(data) + 1)
    keys = data.keys()
    for k in range(0, len(keys), 2):
        plt.plot(query_num, data[keys[k]], color=colors[k],
                 label='qs ' + keys[k][:-2])
        plt.plot(query_num, data[keys[k + 1]], color=colors[k + 1],
                 label='random ' + keys[k + 1][:-2])
    plt.xlabel('Number of Queries')
    plt.ylabel('Error')
    plt.title('Experiment Result')
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05),
               fancybox=True, shadow=True, ncol=5)
    plt.show()
def AL(data, labels, test_size, n_label, num_experiments):
    """Repeat the active-learning comparison ``num_experiments`` times.

    Args:
        data: Feature array handed to ``active_learning``.
        labels: Label array handed to ``active_learning``.
        test_size: Test split size handed to ``active_learning``.
        n_label: Number of samples that are labelled before querying.
        num_experiments: Number of independent repetitions.

    Returns:
        The per-experiment result frames keyed by ``"Experiment<i>"``: a
        ``pd.Panel`` when the installed pandas still provides one, otherwise
        a single DataFrame whose outer index level is the experiment key.
    """
    experiments = {}
    for i in range(num_experiments):
        result = active_learning(data, labels, test_size, n_label)
        experiments["Experiment" + str(i)] = result
    try:
        return pd.Panel(experiments)
    except (AttributeError, TypeError):
        # Panel was removed from pandas; stacking the frames under a
        # MultiIndex is the documented replacement.
        return pd.concat(experiments)
def main():
    """Command-line entry point: run one experiment and save it as CSV.

    Reads the feature and label files with ``np.loadtxt``, runs
    ``active_learning`` once and writes the resulting DataFrame to the
    output path.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("-i", "--dataset", required=True, help="Path to the data")
    ap.add_argument("-j", "--labels", required=True, help="Path to the labels")
    ap.add_argument("-k", "--output", required=True, help="output")
    # Previously hard-coded; the defaults keep the original behaviour.
    ap.add_argument("-t", "--test-size", type=float, default=0.25,
                    help="Fraction of the samples that is randomly selected "
                         "and assigned to the test set")
    ap.add_argument("-n", "--n-labeled", type=int, default=4,
                    help="Number of samples that are initially labeled")
    args = vars(ap.parse_args())

    data = np.loadtxt(args["dataset"])
    labels = np.loadtxt(args["labels"])

    experiment = active_learning(
        data, labels, args["test_size"], args["n_labeled"])
    experiment.to_csv(args["output"])


# Guarded so that importing this module does not parse sys.argv and start an
# experiment.
if __name__ == "__main__":
    main()