-
Notifications
You must be signed in to change notification settings - Fork 3
/
bleu.py
80 lines (48 loc) · 1.82 KB
/
bleu.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu
import matplotlib.pylab as plt
def load_file(fn, encoding=None):
    """Read a text file and return its lines.

    Args:
        fn: Path of the file to read.
        encoding: Text encoding handed to ``open``. The default ``None``
            keeps the platform's default encoding (the previous behaviour);
            pass e.g. ``"utf-8"`` so decoding does not depend on the
            machine's locale.

    Returns:
        A list with one string per line, exactly as ``readlines`` yields
        them (line endings are kept).
    """
    with open(fn, encoding=encoding) as f:
        return f.readlines()
def pretty_print(references, hypotheses):
    """Print every reference/hypothesis pair with its sentence-level BLEU.

    For each position in ``hypotheses`` the first entry of
    ``references[position]`` is used as the reference token list. The
    reference, the hypothesis and their ``sentence_bleu`` score are printed
    on separate lines, followed by a row of 30 asterisks.
    """
    separator = '*' * 30
    for idx, hyp_tokens in enumerate(hypotheses):
        ref_tokens = references[idx][0]
        print(' '.join(ref_tokens))
        print(' '.join(hyp_tokens))
        print(sentence_bleu([ref_tokens], hyp_tokens))
        print(separator)
def find_length_n(references, hypotheses, n):
    """Select the pairs whose reference has exactly ``n`` tokens and score them.

    Args:
        references: Sequence of tokenised reference sentences (each one a
            list of tokens), aligned by index with ``hypotheses``.
        hypotheses: Sequence of tokenised hypothesis sentences.
        n: Reference length, in tokens, to select.

    Returns:
        A tuple ``(ref_list, hyp_list, bleu)``: the selected references,
        the hypotheses found at the same positions, and the corpus-level
        BLEU score of that subset.
    """
    ref_list = []
    hyp_list = []
    for i, hyp in enumerate(hypotheses):
        ref = references[i]
        if len(ref) == n:
            ref_list.append(ref)
            hyp_list.append(hyp)
    # corpus_bleu takes, for every hypothesis, a *list* of reference
    # sentences. Each hypothesis has a single reference here, so it is
    # wrapped in a one-element list; handing over the bare token list would
    # make each token string be read as a separate reference.
    return ref_list, hyp_list, corpus_bleu([[r] for r in ref_list], hyp_list)
# Tokenise both files on whitespace. Sentences are paired by line index:
# line k of 'nl.txt' is the reference for line k of 'preds.txt'
# (hypothesis == prediction).
references = [ref.split() for ref in load_file('nl.txt')]
hypotheses = [hyp.split() for hyp in load_file('preds.txt')]

# Optional whole-corpus checks. Every hypothesis has exactly one reference,
# hence the one-element list around each reference:
#pretty_print([[r] for r in references], hypotheses)
#print('BLEU SCORE: ',corpus_bleu([[r] for r in references], hypotheses))

# Corpus-level BLEU per reference length, keyed by the length in tokens.
bleu_scores = dict()
for i in range(2, 42):
    print(i)
    ref_len, hyp_len, bleu = find_length_n(references, hypotheses, i)
    # Show every selected pair together with its sentence-level BLEU.
    for ref, hyp in zip(ref_len, hyp_len):
        print('ref:\t ', ' '.join(ref))
        print('hyp:\t ', ' '.join(hyp))
        print(sentence_bleu([ref], hyp))
        print('-'*20)
    print('BLEU score for length '+str(i)+': '+str(bleu))
    print('*'*80)
    bleu_scores[i] = bleu

# Note on the references argument of corpus_bleu: it must hold one list of
# reference sentences per hypothesis, so with a single reference each the
# form to use is [[r] for r in ref_list], not [r for r in ref_list].

lists = sorted(bleu_scores.items())  # (length, BLEU) pairs ordered by length
x, y = zip(*lists)  # split the pairs into lengths (x) and scores (y)
plt.plot(x, y)
plt.show()
# Open question - prediction in space: is the context of a sentence a better
# measure than BLEU?