evaluate.py
"""Evaluates PIE sense disambiguation systems' output"""
from collections import Counter
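

# NB: evaluate() expects each PIE object to provide .split ('dev' or 'test'),
# .pie_type, a gold .binary_label ('i' = idiomatic, 'l' = literal) and a
# system .predicted_label; it also sets a .classification attribute ('tp',
# 'tn', 'fp' or 'fn') on each object it scores.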
def evaluate(data, use_test_data, top_n):
    '''
    Evaluates PSD predictions. Reports accuracy and P/R/F1 on the idiomatic
    sense ('i' as positive class), both micro-averaged over all instances and
    macro-averaged over PIE types. Per-type scores are printed for the top_n
    most frequent types only.
    '''
    # Select the evaluation split and filter out PIEs without binary labels
    evaluation_data = [PIE for PIE in data if
                       (use_test_data and PIE.split == 'test') or (not use_test_data and PIE.split == 'dev')]
    evaluation_data = [PIE for PIE in evaluation_data if PIE.binary_label]
    # Get types for per-type scoring
    PIE_types = set([PIE.pie_type for PIE in evaluation_data])
    PIE_type_counts = Counter(PIE.pie_type for PIE in evaluation_data)
    # Assign true/false positives/negatives
    for PIE in evaluation_data:
        if PIE.binary_label == PIE.predicted_label:
            if PIE.binary_label == 'i':
                PIE.classification = 'tp'
            if PIE.binary_label == 'l':
                PIE.classification = 'tn'
        else:
            if PIE.binary_label == 'i':
                PIE.classification = 'fn'
            if PIE.binary_label == 'l':
                PIE.classification = 'fp'
    # Count true/false positives/negatives
    total_true = float(sum([PIE.classification in ['tp', 'tn'] for PIE in evaluation_data]))
    total_tp = float(sum([PIE.classification == 'tp' for PIE in evaluation_data]))
    total_fp = float(sum([PIE.classification == 'fp' for PIE in evaluation_data]))
    total_fn = float(sum([PIE.classification == 'fn' for PIE in evaluation_data]))
    # Get overall micro-accuracy and micro-F1
    micro_accuracy = total_true / float(len(evaluation_data)) * 100.
    micro_precision = total_tp / (total_tp + total_fp) * 100.
    micro_recall = total_tp / (total_tp + total_fn) * 100.
    micro_f1 = 2. * (micro_precision * micro_recall) / (micro_precision + micro_recall)
    # Get per-type scores
    # NB: precision and recall are defined as 100% if the denominator is 0
    scores_per_type = {PIE_type: {'accuracy': 0., 'precision': 0., 'recall': 0., 'f1': 0.} for PIE_type in PIE_types}
    for PIE_type in PIE_types:
        total_true = float(
            sum([PIE.classification in ['tp', 'tn'] for PIE in evaluation_data if PIE.pie_type == PIE_type]))
        scores_per_type[PIE_type]['accuracy'] = total_true / float(PIE_type_counts[PIE_type]) * 100.
        total_tp = float(sum([PIE.classification == 'tp' for PIE in evaluation_data if PIE.pie_type == PIE_type]))
        total_fp = float(sum([PIE.classification == 'fp' for PIE in evaluation_data if PIE.pie_type == PIE_type]))
        total_fn = float(sum([PIE.classification == 'fn' for PIE in evaluation_data if PIE.pie_type == PIE_type]))
        try:
            scores_per_type[PIE_type]['precision'] = total_tp / (total_tp + total_fp) * 100.
        except ZeroDivisionError:
            scores_per_type[PIE_type]['precision'] = 100.
        try:
            scores_per_type[PIE_type]['recall'] = total_tp / (total_tp + total_fn) * 100.
        except ZeroDivisionError:
            scores_per_type[PIE_type]['recall'] = 100.
        try:
            scores_per_type[PIE_type]['f1'] = 2. * (
                scores_per_type[PIE_type]['precision'] * scores_per_type[PIE_type]['recall']) / (
                scores_per_type[PIE_type]['precision'] + scores_per_type[PIE_type]['recall'])
        except ZeroDivisionError:
            scores_per_type[PIE_type]['f1'] = 0.
    # Get macro-averages
    # NB: the macro-average F1 is not the harmonic mean of macro-average P and macro-average R
    macro_accuracy = sum(scores_per_type[PIE_type]['accuracy'] for PIE_type in scores_per_type) / len(PIE_types)
    macro_precision = sum(scores_per_type[PIE_type]['precision'] for PIE_type in scores_per_type) / len(PIE_types)
    macro_recall = sum(scores_per_type[PIE_type]['recall'] for PIE_type in scores_per_type) / len(PIE_types)
    macro_f1 = sum(scores_per_type[PIE_type]['f1'] for PIE_type in scores_per_type) / len(PIE_types)
    # Print output to screen
    print('### EVALUATION SCORES ###')
    print('Macro-Accuracy: {0:.2f}\nMicro-Accuracy: {1:.2f}'.format(macro_accuracy, micro_accuracy))
    print('-' * 25)
    print('Macro-Precision: {0:.2f}\nMacro-Recall: {1:.2f}\nMacro-F1: {2:.2f}'.format(
        macro_precision, macro_recall, macro_f1))
    print('Micro-Precision: {0:.2f}\nMicro-Recall: {1:.2f}\nMicro-F1: {2:.2f}'.format(
        micro_precision, micro_recall, micro_f1))
    # Print for copying to results file
    print('\n{:.2f}\t{:.2f}\t{:.2f}\t{:.2f}\t{:.2f}\t{:.2f}\t{:.2f}\t{:.2f}'.format(
        macro_accuracy, micro_accuracy, macro_precision, macro_recall,
        macro_f1, micro_precision, micro_recall, micro_f1))
    print('\n### SCORES PER TYPE ###')
    print('{0}\tFreq.\tPrec.\tRecall\tF1\tAccuracy'.format('PIE Type' + 15 * ' '))
    print('-' * 65)
    top_n_types = PIE_type_counts.most_common(top_n)
    for top_type in top_n_types:
        scores = scores_per_type[top_type[0]]
        type_with_padding = top_type[0][:23] + (23 - len(top_type[0][:23])) * ' '
        print('{0}\t{1:d}\t{2:.2f}\t{3:.2f}\t{4:.2f}\t{5:.2f}'.format(
            type_with_padding, top_type[1], scores['precision'],
            scores['recall'], scores['f1'], scores['accuracy']))
    return data
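

# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original script). The real PIE
# objects are produced elsewhere in the pie-disambiguation code base; the
# _MockPIE class and the toy data below are hypothetical stand-ins that only
# carry the attributes evaluate() actually touches.
if __name__ == '__main__':

    class _MockPIE(object):
        """Minimal stand-in for a PIE instance (assumed attribute names)."""

        def __init__(self, split, pie_type, binary_label, predicted_label):
            self.split = split                      # 'dev' or 'test'
            self.pie_type = pie_type                # idiom type, e.g. 'spill the beans'
            self.binary_label = binary_label        # gold sense: 'i' or 'l'
            self.predicted_label = predicted_label  # system prediction: 'i' or 'l'
            self.classification = None              # filled in by evaluate()

    toy_data = [
        _MockPIE('dev', 'spill the beans', 'i', 'i'),  # true positive
        _MockPIE('dev', 'spill the beans', 'l', 'i'),  # false positive
        _MockPIE('dev', 'kick the bucket', 'i', 'l'),  # false negative
        _MockPIE('dev', 'kick the bucket', 'l', 'l'),  # true negative
    ]
    # Score the dev split and list per-type scores for the 2 most frequent types
    evaluate(toy_data, use_test_data=False, top_n=2)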