-
Notifications
You must be signed in to change notification settings - Fork 0
/
statistics_on_dataset.py
117 lines (94 loc) · 2.48 KB
/
statistics_on_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# Train
# total instances: 2647
#
# Dev
# total instances: 100
#
#
# Test
# total instances: 100
from nltk import sent_tokenize
from tqdm import tqdm
import pickle, re
import numpy as np
def bioclean(t):
    """Return the lowercased, punctuation-stripped whitespace tokens of *t*.

    Quotes, slashes and backslashes are deleted outright; then the
    punctuation class is removed and the result is split on whitespace.
    NOTE: the class contains the range `:-[` (i.e. ':;<=>?@A-Z['), kept
    as-is to preserve the original cleaning behaviour.  The pattern is now
    a raw string so `\[`/`\]` are no longer invalid escape sequences.
    """
    t = t.replace('"', '').replace('/', '').replace('\\', '').replace("'", '')
    return re.sub(r'[.,?;*!%^&_+():-\[\]{}]', '', t.strip().lower()).split()

# Root directory holding the BioASQ 7 data files.
dataloc = '/home/dpappas/bioasq_all/bioasq7_data/'
# Token-length statistics over the top-100 BM25 training documents:
# title/abstract/total lengths measured both with the bioclean tokenizer
# and with a plain whitespace split.
with open(dataloc + 'bioasq7_bm25_docset_top100.train.pkl', 'rb') as f:
    train_docs = pickle.load(f)

lenss, lenss_tit, lenss_abs = [], [], []
lenss2, lenss_tit2, lenss_abs2 = [], [], []
for doc in tqdm(train_docs.values()):
    # bioclean token counts
    t_clean = len(bioclean(doc['title']))
    a_clean = len(bioclean(doc['abstractText']))
    lenss.append(t_clean + a_clean)
    lenss_tit.append(t_clean)
    lenss_abs.append(a_clean)
    # whitespace-split counterparts for comparison
    t_ws = len(doc['title'].split())
    a_ws = len(doc['abstractText'].split())
    lenss2.append(t_ws + a_ws)
    lenss_tit2.append(t_ws)
    lenss_abs2.append(a_ws)

# NOTE(review): bare expressions only display their value in a REPL;
# when run as a script these results are discarded.
np.max(lenss)
np.min(lenss)
np.average(lenss)
np.max(lenss2)
np.min(lenss2)
np.average(lenss2)
# length of titles
# max: 73
# min: 1
# average: 12.723450985111578
#
#
# length of abstracts
# max: 1494
# min: 1
# average: 184.46201389127668
import json
import numpy as np

# Question-body lengths over the full BioASQ 7 training file, measured
# with the bioclean tokenizer and with a plain whitespace split.
# NOTE(review): 'trainining7b.json' looks like a typo for
# 'training7b.json' — confirm it matches the file on disk before renaming.
with open(dataloc + 'trainining7b.json', 'r') as f:
    bioasq7_data = json.load(f)
bioasq7_data = {q['id']: q for q in bioasq7_data['questions']}

lenss_q, lenss_q2 = [], []
for question in bioasq7_data.values():
    body = question['body']
    lenss_q.append(len(bioclean(body)))
    lenss_q2.append(len(body.split()))

# REPL-only: these results are discarded when run as a script.
np.max(lenss_q)
np.min(lenss_q)
np.average(lenss_q)
np.max(lenss_q2)
np.min(lenss_q2)
np.average(lenss_q2)
import json

# Report how many questions the BioASQ 7 dev split contains.
with open(dataloc + 'training7b.dev.json', 'r') as f:
    dev_data = json.load(f)
print(len(dev_data['questions']))
import json

# Count snippet sentences aggregated per (question, document) pair in the
# BioASQ 7 training split, then report the average count per pair.
with open(dataloc + 'training7b.train.json', 'r') as f:
    ddd = json.load(f)
print(len(ddd['questions']))

from nltk import sent_tokenize

qd = {}
for q in ddd['questions']:
    # BUGFIX: was `qid = 'id'` (the literal string), which collapsed every
    # question into the same key family instead of using the question id.
    qid = q['id']
    for snip in q.get('snippets', []):  # tolerate questions without snippets
        docid = snip['document']
        sents = len(sent_tokenize(snip['text']))
        key = '{}{}'.format(qid, docid)
        # BUGFIX: the original tried plain assignment first, which never
        # raises, so the `+=` branch was unreachable and repeated pairs
        # overwrote earlier counts instead of accumulating them.
        try:
            qd[key] += sents
        except KeyError:
            qd[key] = sents
print(np.average(list(qd.values())))
# length of questions bioclean
# max: 30
# min: 2
# average: 9.005096468875136
#
#
#
# length of questions whitespace
# max: 30
# min: 2
# average: 9.016745540589735