-
Notifications
You must be signed in to change notification settings - Fork 4
/
search.py
187 lines (147 loc) · 6.22 KB
/
search.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
__author__ = 'matias'
import solr
import elasticsearch
from textanalysis.texts import RawSentenceStream, PhraseSentenceStream, FZArticleLibrary, extract_docid
from textanalysis.texts import CaseReportLibrary
from textanalysis.phrasedetection import PmiPhraseDetector
from irmodels.D2Vmodel import D2Vmodel, DocIndex
from scipy.spatial.distance import cosine
from heapq import heappush
import re
import random
class SearchEngine(object):
    """Common interface shared by the search engines in this module.

    The base class implements nothing itself: both methods raise
    ``NotImplementedError`` until a subclass overrides them.
    """

    def __str__(self):
        """Describe the engine; the base version always raises."""
        raise NotImplementedError

    def query(self, query_str, top_n=20):
        """Search for ``query_str``; the base version always raises.

        query_str -- the query text.
        top_n     -- result limit requested by the caller (default 20).
        """
        raise NotImplementedError
class ElasticSearchEngine(SearchEngine):
    """Search engine that queries an Elasticsearch index.

    Queries are sent through the client's query-string (``q=``) search,
    optionally after being rewritten by a query-expansion object.
    """

    def __init__(self, query_expansion=None, index="casereports"):
        """Create the engine with a default-configured Elasticsearch client.

        query_expansion -- optional object exposing ``expand(query_str)``;
                           ``None`` disables query expansion.
        index           -- name of the Elasticsearch index to search.
        """
        self.es = elasticsearch.Elasticsearch()
        self.name = "Standard ElasticSearch Engine"
        self.query_expansion = query_expansion
        self.index = index

    def __str__(self):
        """Return the engine name, suffixed with the expansion if one is set."""
        if self.query_expansion is None:
            return self.name
        return "%s with %s" % (self.name, self.query_expansion,)

    def query(self, query_str, top_n=20):
        """Search the index and return a list of result dicts.

        query_str -- the query text; expanded first when a query expansion
                     was configured.
        top_n     -- maximum number of hits requested from Elasticsearch.

        Each result is a dict with the keys ``title``, ``description``,
        ``related`` and ``id``, built from the hit's ``_source`` document.
        """
        if self.query_expansion is not None:
            query_str = self.query_expansion.expand(query_str)

        # ':' and '/' are reserved in the query-string syntax, so they are
        # stripped before the text is used as the q= parameter.
        for reserved_char in (":", "/"):
            query_str = query_str.replace(reserved_char, "")

        # The index is passed by keyword so the call does not depend on the
        # positional layout of the client's search() signature.
        response = self.es.search(
            index=self.index,
            q=query_str,
            default_operator='OR',
            size=top_n,
        )
        sources = [hit['_source'] for hit in response['hits']['hits']]

        # NOTE(review): 'description' is filled with the title, exactly as
        # before -- confirm whether a separate description field exists.
        return [
            {
                'title': source['title'],
                'description': source['title'],
                'related': [],
                'id': source['pmcid'],
            }
            for source in sources
        ]
class StandardSolrEngine(SearchEngine):
    """Search engine that forwards queries to a local Solr instance."""

    def __init__(self, query_expansion=None):
        """Open a connection to the Solr server at localhost:8983.

        query_expansion -- optional object exposing ``expand(query_str)``;
                           ``None`` disables query expansion.
        """
        self.solr_con = solr.SolrConnection('http://localhost:8983/solr')
        self.name = "Standard Solr Engine"
        self.query_expansion = query_expansion

    def __str__(self):
        """Return the engine name, suffixed with the expansion if one is set."""
        if self.query_expansion is None:
            return self.name
        return "%s with %s" % (self.name, self.query_expansion,)

    def query(self, query_str, top_n=20):
        """Run the query against Solr and return the response's results.

        query_str -- the query text; expanded first when a query expansion
                     was configured.
        top_n     -- number of rows requested from Solr.
        """
        expander = self.query_expansion
        expanded = query_str if expander is None else expander.expand(query_str)
        # Colons are removed from the query text before it is sent to Solr.
        sanitized = expanded.replace(":", "")
        response = self.solr_con.query(sanitized, rows=top_n)
        return response.results
class RandomSearchEngine(SearchEngine):
    """Engine that ignores the query and returns randomly chosen documents."""

    def __init__(self):
        """Collect the id of every document in ``CaseReportLibrary``."""
        print("start")
        library = CaseReportLibrary()
        self.docids = [document.get_id() for document in library]
        print("end")

    def __str__(self):
        """Return the fixed engine name."""
        return "RandomSearchEngine"

    def query(self, query_str, top_n=20):
        """Return up to ``top_n`` randomly selected documents.

        query_str -- unused; present to satisfy the SearchEngine interface.
        top_n     -- maximum number of documents to return.

        The stored id list is shuffled in place on every call. Results use
        the same dict layout as the other engines, with a score of 0.0.
        """
        random.shuffle(self.docids)
        # The 'id' is the document id without its first six characters
        # (presumably a fixed prefix -- verify against get_id()).
        return [
            {'id': docid[6:], 'score': 0.0, 'title': docid}
            for docid in self.docids[:top_n]
        ]
class TwoPhaseSearchEngine(SearchEngine):
    """Wrapper holding an inner engine and a top ranker.

    The query step is not implemented yet: ``query`` returns ``None``.
    """

    def __init__(self, inner_engine, top_ranker):
        """Store the two collaborators without using them yet.

        inner_engine -- engine object kept as ``self.inner_engine``.
        top_ranker   -- ranker object kept as ``self.top_ranker``.
        """
        self.top_ranker = top_ranker
        self.inner_engine = inner_engine

    def __str__(self):
        """Return the fixed engine name."""
        return "2-phase Search Engine"

    def query(self, query_str, top_n=20):
        """Placeholder: performs no search and returns ``None``."""
        return None
class Doc2VecSearchEngine(SearchEngine):
    """Search engine that ranks documents by Doc2Vec vector distance."""

    def __init__(self, size = 50, modelfile=None):
        """Build the phrase detector, the Doc2Vec model and the doc index.

        size      -- passed to ``D2Vmodel`` as ``dimension``.
        modelfile -- passed through to ``D2Vmodel`` unchanged.
        """
        self.phrase_detector = PmiPhraseDetector(RawSentenceStream(fz_docs=False))
        # build model
        epochs = 2
        self.model = D2Vmodel(
            PhraseSentenceStream(self.phrase_detector, extract_func=extract_docid, fz_docs=True, reshuffles=epochs-1),
            name="DOCID",
            dataset_name="CASEREPORT",
            epochs=epochs,
            dimension=size,
            modelfile=modelfile,
        )
        self.doc_index = DocIndex(CaseReportLibrary(), "CASEREPORT")

    def __str__(self):
        """Return the fixed engine name."""
        return "Doc2Vec Search Engine"

    def query(self, query_str, top_n=20):
        """Return the ``top_n`` documents closest to the query vector.

        query_str -- the query text; a vector is inferred for it by the model.
        top_n     -- maximum number of documents to return.

        Results are dicts with ``id``, ``score`` and ``title``, ordered by
        ascending cosine distance. ``score`` is that distance, so smaller
        means more similar.
        """
        doc_tag_prefix = "DOCID-CS"
        query_vec = self.model.infer_doc_vector(query_str, steps=50, phrase_detector=self.phrase_detector)
        inner_model = self.model.inner_model

        # Only vocabulary entries carrying the document-tag prefix are
        # documents; every other entry is skipped.
        ranking = [
            (cosine(query_vec, inner_model[word]), word)
            for word in inner_model.vocab
            if word.startswith(doc_tag_prefix)
        ]

        # Sort before slicing. The previous code sliced a heap list, which
        # is only partially ordered, so it returned neither the closest
        # documents nor a ranked order.
        top_ranked = sorted(ranking)[:top_n]

        # make top results similar to Solr results
        return [
            {
                'id': tag[len(doc_tag_prefix):],
                'score': distance,
                'title': self.doc_index[tag],
            }
            for distance, tag in top_ranked
        ]
if __name__ == "__main__":
    # Interactive relevance-rating loop: run a query against Solr, ask the
    # user to rate every hit, then list the ids rated relevant or partly
    # relevant. An empty query ends the session.
    engine = StandardSolrEngine()
    query_input = raw_input("search for:")
    # Compare by value. `is not ""` tested object identity, which only
    # worked through an interpreter implementation detail.
    while query_input != "":
        hits = engine.query(query_input, 50)
        relevant = []
        partly_relevant = []
        print("--------- RESULTS ---------")
        for hit in hits:
            relevance_input = ""
            # Re-prompt until the answer consists of digits only.
            while not relevance_input.isdigit():
                relevance_input = raw_input(
                    str(hit[u'id']) + " " + str(hit[u'title']) +
                    " (0=not relevant, 1=partly relevant, 2=relevant) Rating:")
            if relevance_input == "1":
                partly_relevant.append(hit[u'id'])
            elif relevance_input == "2":
                relevant.append(hit[u'id'])
        print("---------------------------")
        # The explicit " " reproduces the separator the two-item print
        # statement used to insert, keeping the output unchanged.
        print("Relevant: " + " " + ",".join(relevant))
        print("Partly Relevant: " + " " + ",".join(partly_relevant))
        print("")
        query_input = raw_input("search for:")