forked from AlexMoreo/funnelling
-
Notifications
You must be signed in to change notification settings - Fork 0
/
monolingual_classification.py
77 lines (62 loc) · 3.5 KB
/
monolingual_classification.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import util.disable_sklearn_warnings
from sklearn.svm import SVC
import os,sys
from dataset_builder import MultilingualDataset
from learning.learners import *
from util.evaluation import *
from optparse import OptionParser
from util.results import PolylingualClassificationResults
from util.file import exists
import pickle
# Command-line interface of the script. Options are registered in the order
# in which they should appear in the --help output.
parser = OptionParser()

# Input dataset; no default, so the attribute is None when -d is omitted.
parser.add_option(
    "-d", "--dataset",
    dest="dataset",
    help="Path to the multilingual dataset processed and stored in .pickle format",
)

# Where results are written, plus a free-text annotation for the result rows.
parser.add_option(
    "-o", "--output",
    dest="output",
    type="string",
    default='./monolingual_results.csv',
    help="Result file",
)
parser.add_option(
    "-n", "--note",
    dest="note",
    type="string",
    default='',
    help="A description note to be added to the result file",
)

# Boolean switches (False unless the flag is given).
parser.add_option(
    "-c", "--optimc",
    dest="optimc",
    action='store_true',
    default=False,
    help="Optimices hyperparameters",
)
parser.add_option(
    "-f", "--force",
    dest="force",
    action='store_true',
    default=False,
    help="Run even if the result was already computed",
)

# Numeric settings.
parser.add_option(
    "-j", "--n_jobs",
    dest="n_jobs",
    type="int",
    default=-1,
    help="Number of parallel jobs (default is -1, all)",
)
parser.add_option(
    "-s", "--set_c",
    dest="set_c",
    type="float",
    default=1,
    help="Set the C parameter",
)
def get_learner(calibrate=False, kernel='linear'):
    """Create a fresh, unfitted SVC configured from the command-line options.

    Args:
        calibrate: forwarded to SVC as its ``probability`` flag.
        kernel: kernel name forwarded to SVC.

    Returns:
        A new ``SVC`` whose ``C`` is taken from the global options object
        ``op`` (``op.set_c``); ``cache_size`` and ``random_state`` are fixed
        to 1000 and 1 respectively.
    """
    svm_settings = {
        'kernel': kernel,
        'probability': calibrate,
        'cache_size': 1000,
        'C': op.set_c,  # read at call time from the parsed options
        'random_state': 1,
    }
    return SVC(**svm_settings)
def get_params(dense=False):
    """Return the hyperparameter grid to explore, or None if tuning is off.

    Args:
        dense: when true the grid uses the 'rbf' kernel, otherwise 'linear'.

    Returns:
        ``None`` when the global option ``op.optimc`` is not set; otherwise a
        one-element list holding a dict with the chosen kernel and the
        candidate values for ``C``.
    """
    if op.optimc:
        kernel_name = 'rbf' if dense else 'linear'
        candidate_cs = [1e4, 1e3, 1e2, 1e1, 1, 1e-1]
        return [{'kernel': [kernel_name], 'C': candidate_cs}]
    return None
if __name__=='__main__':
    # `op` must stay a module-level name: get_learner() and get_params()
    # read their settings from it.
    (op, args) = parser.parse_args()

    # Validate the command line with parser.error (prints usage and exits
    # with status 2) rather than `assert`, which is removed under `python -O`.
    # A missing -d leaves op.dataset as None, so check that explicitly before
    # handing it to exists().
    if op.dataset is None or not exists(op.dataset):
        parser.error('Unable to find file '+str(op.dataset))
    if op.set_c != 1. and op.optimc:
        parser.error('Parameter C cannot be defined along with optim_c option')

    results = PolylingualClassificationResults(op.output)

    # Identifier under which this experiment is recorded in the result file.
    dataset_file = os.path.basename(op.dataset)
    result_id = dataset_file+'_fun-tat'+('_optimC' if op.optimc else '')

    # Skip work that is already recorded unless --force was given.
    if not op.force and results.already_calculated(result_id):
        print('Experiment <'+result_id+'> already computed. Exit.')
        sys.exit()

    data = MultilingualDataset.load(op.dataset)
    data.show_dimensions()

    # First-tier learners are built with probability=True and no parameter
    # grid; the meta-learner gets a grid only when --optimc is set
    # (get_params returns None otherwise).
    classifier = FunnellingPolylingualClassifier(first_tier_learner=get_learner(calibrate=True),
                                                 meta_learner=get_learner(calibrate=False),
                                                 first_tier_parameters=None,
                                                 meta_parameters=get_params(dense=True),
                                                 folded_projections=1,
                                                 calmode='cal',
                                                 n_jobs=op.n_jobs)

    # The "C=..." fragment of the note does not change across languages.
    c_note = 'C=' + str(op.set_c) if op.set_c != 1 else ''

    # Train and evaluate once per language, each time on a single-language
    # view of the dataset (presumably set_view restricts the data returned by
    # lXtr/lYtr/lXte/lYte -- verify against MultilingualDataset).
    languages = data.langs()
    for lang in languages:
        print('Monolingual: ' + lang)
        data.set_view(languages=[lang])

        classifier.fit(data.lXtr(), data.lYtr())
        l_eval = evaluate_method(classifier, data.lXte(), data.lYte())

        # l_eval is indexed by language; each entry unpacks into four scores.
        macrof1, microf1, macrok, microk = l_eval[lang]
        print('Lang %s: macro-F1=%.3f micro-F1=%.3f' % (lang, macrof1, microf1))

        # The best parameters are queried after each fit, so this part of the
        # note has to be rebuilt inside the loop.
        notes = op.note + c_note + str(classifier.best_params() if op.optimc else '')
        results.add_row(result_id, 'fun-tat', 'svm', op.optimc, data.dataset_name, -1, lang, classifier.time, lang, macrof1, microf1, macrok, microk, notes=notes)