-
Notifications
You must be signed in to change notification settings - Fork 0
/
find_weird_data.py
66 lines (47 loc) · 1.83 KB
/
find_weird_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import json
from pprint import pprint
import pickle
dataloc = '/home/dpappas/for_ryan/'


def index_questions_by_id(dataset):
    """Map each question's 'id' to its full question record.

    Args:
        dataset: Parsed dataset dict holding a list of question dicts under
            the 'questions' key; every question must carry an 'id'.

    Returns:
        A dict keyed by question id. If two questions share an id, the later
        one wins.

    Raises:
        KeyError: If 'questions' is missing, or a question has no 'id'.
    """
    return {question['id']: question for question in dataset['questions']}


def find_documents_without_snippets(questions):
    """Yield (question id, document) pairs for documents lacking snippets.

    A document counts as covered when at least one entry of the question's
    'snippets' list has a 'document' value equal to it. Questions without
    'documents' or without 'snippets' are tolerated: the former yield
    nothing, the latter report every one of their documents.

    Args:
        questions: Iterable of question dicts with an 'id', an optional
            'documents' list and an optional 'snippets' list of dicts that
            each carry a 'document' entry.

    Yields:
        Tuples (question id, document) in input order, one per uncovered
        document (duplicates in 'documents' are reported once each).
    """
    for question in questions:
        documents = question.get('documents') or []
        if not documents:
            # Nothing to check, so the snippets are not inspected at all.
            continue
        # Gather the snippet-backed documents once per question instead of
        # rescanning the whole snippet list for every single document.
        covered = {sn['document'] for sn in (question.get('snippets') or [])}
        for doc in documents:
            if doc not in covered:
                yield question['id'], doc


def main():
    """Load the BioASQ 6b training file and print each uncovered document."""
    # NOTE: the message says "pickle" although the file read below is JSON;
    # the text is kept verbatim so the script's output does not change.
    print('loading pickle data')
    with open(dataloc + 'BioASQ-trainingDataset6b.json', 'r') as f:
        bioasq6_data = index_questions_by_id(json.load(f))

    # Dump one hard-coded question as a visual sample of the record layout.
    # A dataset that lacks this id should not abort the scan below.
    sample = bioasq6_data.get('58b548d722d3005309000005')
    if sample is not None:
        pprint(sample)

    uncovered = find_documents_without_snippets(bioasq6_data.values())
    for question_id, doc in uncovered:
        # Only the last '/'-separated component of the document is printed.
        print('{} {}'.format(question_id, doc.split('/')[-1]))


if __name__ == '__main__':
    main()
# with open(dataloc + 'bioasq_bm25_top100.test.pkl', 'rb') as f:
# test_data = pickle.load(f)
# with open(dataloc + 'bioasq_bm25_docset_top100.test.pkl', 'rb') as f:
# test_docs = pickle.load(f)
# with open(dataloc + 'bioasq_bm25_top100.dev.pkl', 'rb') as f:
# dev_data = pickle.load(f)
# with open(dataloc + 'bioasq_bm25_docset_top100.dev.pkl', 'rb') as f:
# dev_docs = pickle.load(f)
# with open(dataloc + 'bioasq_bm25_top100.train.pkl', 'rb') as f:
# train_data = pickle.load(f)
# with open(dataloc + 'bioasq_bm25_docset_top100.train.pkl', 'rb') as f:
# train_docs = pickle.load(f)
# print('loading words')
#
# from gensim.models.keyedvectors import KeyedVectors
# w2v_bin_path = '/home/dpappas/for_ryan/fordp/pubmed2018_w2v_30D.bin'
# wv = KeyedVectors.load_word2vec_format(w2v_bin_path, binary=True)
# min_tok, min_count = None, None
# for word in wv.vocab.items():
# if(word[1].count < 3):
# print word[0], word[1].count
# if(min_tok is None):
# min_tok = word[0]
# min_count = word[1].count
# else:
# if(min_count) >= word[1].count:
# min_tok = word[0]
# min_count = word[1].count
#
# print(min_tok, min_count)