evaluate.py
"""Evaluates PIE sense disambiguation systems' output"""
from collections import Counter
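

# NB: evaluate() expects each PIE object to provide .split ('dev' or 'test'),
# .pie_type, a gold .binary_label ('i' = idiomatic, 'l' = literal) and a
# system .predicted_label; it also sets a .classification attribute ('tp',
# 'tn', 'fp' or 'fn') on each object it scores.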
def evaluate(data, use_test_data, top_n):
    '''
    Evaluates PSD predictions. Reports accuracy and P/R/F1 on the idiomatic
    sense ('i' as positive class), both micro-averaged over all instances and
    macro-averaged over PIE types. Per-type scores are printed for the top_n
    most frequent types only.
    '''
    # Select the evaluation split and filter out PIEs without binary labels
    evaluation_data = [PIE for PIE in data if
                       (use_test_data and PIE.split == 'test') or (not use_test_data and PIE.split == 'dev')]
    evaluation_data = [PIE for PIE in evaluation_data if PIE.binary_label]
    # Get types for per-type scoring
    PIE_types = set([PIE.pie_type for PIE in evaluation_data])
    PIE_type_counts = Counter(PIE.pie_type for PIE in evaluation_data)
    # Assign true/false positives/negatives
    for PIE in evaluation_data:
        if PIE.binary_label == PIE.predicted_label:
            if PIE.binary_label == 'i':
                PIE.classification = 'tp'
            if PIE.binary_label == 'l':
                PIE.classification = 'tn'
        else:
            if PIE.binary_label == 'i':
                PIE.classification = 'fn'
            if PIE.binary_label == 'l':
                PIE.classification = 'fp'
    # Count true/false positives/negatives
    total_true = float(sum([PIE.classification in ['tp', 'tn'] for PIE in evaluation_data]))
    total_tp = float(sum([PIE.classification == 'tp' for PIE in evaluation_data]))
    total_fp = float(sum([PIE.classification == 'fp' for PIE in evaluation_data]))
    total_fn = float(sum([PIE.classification == 'fn' for PIE in evaluation_data]))
    # Get overall micro-accuracy and micro-F1
    micro_accuracy = total_true / float(len(evaluation_data)) * 100.
    micro_precision = total_tp / (total_tp + total_fp) * 100.
    micro_recall = total_tp / (total_tp + total_fn) * 100.
    micro_f1 = 2. * (micro_precision * micro_recall) / (micro_precision + micro_recall)
    # Get per-type scores
    # NB: precision and recall are defined as 100% if the denominator is 0
    scores_per_type = {PIE_type: {'accuracy': 0., 'precision': 0., 'recall': 0., 'f1': 0.} for PIE_type in PIE_types}
    for PIE_type in PIE_types:
        total_true = float(
            sum([PIE.classification in ['tp', 'tn'] for PIE in evaluation_data if PIE.pie_type == PIE_type]))
        scores_per_type[PIE_type]['accuracy'] = total_true / float(PIE_type_counts[PIE_type]) * 100.
        total_tp = float(sum([PIE.classification == 'tp' for PIE in evaluation_data if PIE.pie_type == PIE_type]))
        total_fp = float(sum([PIE.classification == 'fp' for PIE in evaluation_data if PIE.pie_type == PIE_type]))
        total_fn = float(sum([PIE.classification == 'fn' for PIE in evaluation_data if PIE.pie_type == PIE_type]))
        try:
            scores_per_type[PIE_type]['precision'] = total_tp / (total_tp + total_fp) * 100.
        except ZeroDivisionError:
            scores_per_type[PIE_type]['precision'] = 100.
        try:
            scores_per_type[PIE_type]['recall'] = total_tp / (total_tp + total_fn) * 100.
        except ZeroDivisionError:
            scores_per_type[PIE_type]['recall'] = 100.
        try:
            scores_per_type[PIE_type]['f1'] = 2. * (
                scores_per_type[PIE_type]['precision'] * scores_per_type[PIE_type]['recall']) / (
                scores_per_type[PIE_type]['precision'] + scores_per_type[PIE_type]['recall'])
        except ZeroDivisionError:
            scores_per_type[PIE_type]['f1'] = 0.
    # Get macro-averages
    # NB: the macro-average F1 is not the harmonic mean of macro-average P and macro-average R
    macro_accuracy = sum(scores_per_type[PIE_type]['accuracy'] for PIE_type in scores_per_type) / len(PIE_types)
    macro_precision = sum(scores_per_type[PIE_type]['precision'] for PIE_type in scores_per_type) / len(PIE_types)
    macro_recall = sum(scores_per_type[PIE_type]['recall'] for PIE_type in scores_per_type) / len(PIE_types)
    macro_f1 = sum(scores_per_type[PIE_type]['f1'] for PIE_type in scores_per_type) / len(PIE_types)
    # Print output to screen
    print('### EVALUATION SCORES ###')
    print('Macro-Accuracy: {0:.2f}\nMicro-Accuracy: {1:.2f}'.format(macro_accuracy, micro_accuracy))
    print('-' * 25)
    print('Macro-Precision: {0:.2f}\nMacro-Recall: {1:.2f}\nMacro-F1: {2:.2f}'.format(
        macro_precision, macro_recall, macro_f1))
    print('Micro-Precision: {0:.2f}\nMicro-Recall: {1:.2f}\nMicro-F1: {2:.2f}'.format(
        micro_precision, micro_recall, micro_f1))
    # Print for copying to results file
    print('\n{:.2f}\t{:.2f}\t{:.2f}\t{:.2f}\t{:.2f}\t{:.2f}\t{:.2f}\t{:.2f}'.format(
        macro_accuracy, micro_accuracy, macro_precision, macro_recall,
        macro_f1, micro_precision, micro_recall, micro_f1))
    print('\n### SCORES PER TYPE ###')
    print('{0}\tFreq.\tPrec.\tRecall\tF1\tAccuracy'.format('PIE Type' + 15 * ' '))
    print('-' * 65)
    top_n_types = PIE_type_counts.most_common(top_n)
    for top_type in top_n_types:
        scores = scores_per_type[top_type[0]]
        type_with_padding = top_type[0][:23] + (23 - len(top_type[0][:23])) * ' '
        print('{0}\t{1:d}\t{2:.2f}\t{3:.2f}\t{4:.2f}\t{5:.2f}'.format(
            type_with_padding, top_type[1], scores['precision'],
            scores['recall'], scores['f1'], scores['accuracy']))
    return data
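

# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original script). The real PIE
# objects are produced elsewhere in the pie-disambiguation code base; the
# _MockPIE class and the toy data below are hypothetical stand-ins that only
# carry the attributes evaluate() actually touches.
if __name__ == '__main__':

    class _MockPIE(object):
        """Minimal stand-in for a PIE instance (assumed attribute names)."""

        def __init__(self, split, pie_type, binary_label, predicted_label):
            self.split = split                      # 'dev' or 'test'
            self.pie_type = pie_type                # idiom type, e.g. 'spill the beans'
            self.binary_label = binary_label        # gold sense: 'i' or 'l'
            self.predicted_label = predicted_label  # system prediction: 'i' or 'l'
            self.classification = None              # filled in by evaluate()

    toy_data = [
        _MockPIE('dev', 'spill the beans', 'i', 'i'),  # true positive
        _MockPIE('dev', 'spill the beans', 'l', 'i'),  # false positive
        _MockPIE('dev', 'kick the bucket', 'i', 'l'),  # false negative
        _MockPIE('dev', 'kick the bucket', 'l', 'l'),  # true negative
    ]
    # Score the dev split and list per-type scores for the 2 most frequent types
    evaluate(toy_data, use_test_data=False, top_n=2)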