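'''
evaluate_large_corpus.py

Evaluate a ranked retrieval run ('output.txt') against the relevance judgements for the
large corpus ('files/qrels.txt') and print Precision, Recall, P@10, R-precision, MAP and
bpref averaged over all queries.

The column layout below is inferred from how the fields are indexed in this script
(space-separated); any further fields are ignored:
    files/qrels.txt : query id, <field ignored here>, doc id, relevance level
    output.txt      : query id, doc id, rank
'''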
def evaluate():
    f = open('files/qrels.txt', encoding='UTF-8')
    f1 = open('output.txt', encoding='UTF-8')
    queries = set()
    qrels = {}
    '''
    Dictionary storing the relevance judgements:
        key: query id
        value: a dictionary
            key: doc id whose relevance level is not 0
            value: relevance level of the document
    '''
    for line in f:
        parts = line.strip().split(' ')
        if parts[0] not in qrels:
            qrels[parts[0]] = {}
        if int(parts[3]) != 0:
            # All documents in the large collection with a relevance level of 0 are unjudged,
            # and the number of unjudged documents has no effect on the bpref calculation,
            # so they are dropped here, while reading, to simplify the computation.
            qrels[parts[0]][parts[2]] = parts[3]
    index = {}
    '''
    Dictionary storing the retrieval results:
        key: query id
        value: a dictionary
            key: doc id
            value: rank of the document in the ranked result list
    '''
    for line in f1:
        parts = line.strip().split(' ')
        if parts[0] not in index:
            index[parts[0]] = {}
        index[parts[0]][parts[1]] = parts[2]
        queries.add(parts[0])
    precision = 0
    # precision = RelRet / Ret
    recall = 0
    # recall = RelRet / Rel
    precision10 = 0
    # P@10 = RelRet / Ret over the top 10 retrieved documents for this query
    Rprecision = 0
    # R-precision = RelRet / Ret over the top R retrieved documents,
    # where R is the number of relevant documents for this query
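    # Notation used in the comments above and below:
    #   Ret    = number of documents retrieved for a query
    #   Rel    = number of documents judged relevant for a query
    #   RelRet = number of retrieved documents that are relevant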
    '''
    Precision and Recall
    '''
    for qid in index:
        relret = {}
        ret = len(index[qid].keys())
        # The number of retrieved documents for the query
        rel = len(qrels[qid].keys())
        # The number of relevant documents for the query
        for did in index[qid].keys():
            if did in qrels[qid].keys():
                relret[len(relret)] = did
                # Count the relevant documents that appear in the retrieved set
        precision = (len(relret) / ret) + precision
        # Total precision
        recall = (len(relret) / rel) + recall
        # Total recall
    '''
    R-precision
    '''
    for qid in index:
        relretR = {}
        for did in dict_slice(index[qid], 0, count_rel(qrels[qid])).keys():
            # Slice the query-result dictionary down to the top R documents:
            # count_rel(qrels[qid]) is the number R of relevant documents, and
            # dict_slice(index[qid], 0, count_rel(qrels[qid])) returns the sliced dictionary
            if did in qrels[qid].keys():
                relretR[len(relretR)] = did
        Rprecision = (len(relretR) / len(dict_slice(index[qid], 0, count_rel(qrels[qid])).keys())) + Rprecision
        # Total R-precision
    '''
    P@10
    '''
    for qid in index:
        relret10 = {}
        for did in dict_slice(index[qid], 0, 10).keys():
            # Slice the query-result dictionary down to the top 10 documents:
            # dict_slice(index[qid], 0, 10) returns the sliced dictionary
            if did in qrels[qid].keys():
                relret10[len(relret10)] = did
        precision10 = (len(relret10) / len(dict_slice(index[qid], 0, 10))) + precision10
        # Total P@10
    '''
    MAP
    '''
    total_map = 0
    for qid in index:
        ap = 0
        # ap accumulates (number of relevant documents retrieved so far) / (rank of this document)
        # for every relevant retrieved document
        relretM = {}
        for did in index[qid].keys():
            if did in qrels[qid].keys():
                relretM[len(relretM)] = did
                ap = len(relretM) / int(index[qid][did]) + ap
        total_map = total_map + ap / len(qrels[qid])
        # Total MAP
    '''
    bpref
    The unjudged documents were removed while reading the qrels,
    so there is no need to filter them out again here.
    '''
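    # Worked example of the bpref computed below: with R = 2 relevant documents retrieved
    # at ranks 2 and 5, preceded by 1 and 3 non-relevant documents respectively,
    # bpref = ((1 - 1/2) + (1 - min(3, 2)/2)) / 2 = 0.25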
    total_bpref = 0
    for qid in index:
        n = 0
        bpref = 0
        # bpref = (1 / R) * sum over relevant retrieved documents r of (1 - min(n, R) / R),
        # where n is the number of non-relevant documents ranked above r
        for did in index[qid].keys():
            if did in qrels[qid].keys():
                # Relevant document
                if (n / len(qrels[qid])) > 1:
                    bpref = 0 + bpref
                    # The term (1 - n / R) must not drop below 0
                else:
                    bpref = (1 - n / len(qrels[qid])) + bpref
                    # Add (1 - n / R) for this relevant document
            else:
                # Non-relevant document
                n = n + 1
        total_bpref = bpref / len(qrels[qid]) + total_bpref
        # Total bpref
    # Average every measure over all queries
    avg_precision = precision / len(qrels)
    avg_recall = recall / len(qrels)
    avg_precision10 = precision10 / len(qrels)
    avg_Rprecision = Rprecision / len(qrels)
    avg_map = total_map / len(qrels)
    avg_bpref = total_bpref / len(qrels)
    print("Evaluation results:")
    print(f'Precision {avg_precision}')
    print(f'Recall {avg_recall}')
    print(f'Precision@10 {avg_precision10}')
    print(f'R-Precision {avg_Rprecision}')
    print(f'MAP {avg_map}')
    print(f'B_pref {avg_bpref}')
def dict_slice(d, start, end):
    '''
    Slice a dictionary by key position.
    d: target dictionary
    start: start position
    end: end position
    Returns the sliced dictionary.
    '''
    sliced = {}
    for k in list(d.keys())[start:end]:
        sliced[k] = d[k]
    return sliced
def count_rel(dict1):
    # Count the documents whose relevance level is not 0
    count = 0
    for doc_id in dict1:
        if int(dict1[doc_id]) != 0:
            # Relevance levels are stored as strings, so convert before comparing
            count += 1
    return count
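# Example invocation, assuming 'files/qrels.txt' and 'output.txt' are present in the
# working directory:
#   python evaluate_large_corpus.py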
if __name__ == '__main__':
    evaluate()