-
Notifications
You must be signed in to change notification settings - Fork 0
/
search.py
150 lines (102 loc) · 4.25 KB
/
search.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
################################################################################
# Search engine for the project #
# #
# Authors #
# Yicheng Wang #
# #
# Description #
# search function of the marginalia project #
# #
################################################################################
from string import ascii_letters
def get_snippets_from_site(site, to_search):
    """
    get_snippets_from_site: returns the sublist of to_search that appears in
    site

    A search term matches only when it equals one of the whitespace-separated
    tokens of site exactly (case-sensitive, punctuation included); substrings
    of a token do not count.

    Args:
        site (string): the haystack
        to_search (list of strings): the words to look for

    Returns:
        a list containing every element of to_search that occurs as a token
        of site, in the same order as in to_search (repeated search terms
        are kept)

    Example:
        get_snippets_from_site("I am awesome", ["this", "is", "awesome"]) -->
        ['awesome']
    """
    # Build the token set once so each lookup is O(1) instead of a list scan.
    site_words = set(site.split())
    return [word for word in to_search if word in site_words]
def get_index_of_proximity(site, to_search):
    """
    get_index_of_proximity: returns the index of proximity of a specific
    search. The higher the index, the better the match.

    Each search term is located at its FIRST occurrence among the
    whitespace-separated tokens of site. With n matched terms, the index is

        n ** 5 / (average gap between consecutive matched positions)

    so more matches and tighter clustering both raise the score.

    Args:
        site (string): the site to search in
        to_search (list of strings): the list of strings to match

    Returns:
        -1 if no element of to_search is matched,
        0 if exactly one element is matched, or if every matched element
        resolves to the same position (e.g. a repeated search term), since
        no distance can be measured in either case,
        otherwise a float, the index of proximity
    """
    # Map each token to its first position in a single pass over the site.
    first_position = {}
    for position, word in enumerate(site.split()):
        first_position.setdefault(word, position)

    positions = sorted(
        first_position[word] for word in to_search if word in first_position
    )

    if not positions:
        return -1
    if len(positions) == 1:
        return 0

    # The consecutive gaps of a sorted list sum to (last - first).
    total_distance = positions[-1] - positions[0]
    if total_distance == 0:
        # All matches sit on one token; avoid dividing by a zero average.
        return 0

    average_distance = total_distance / float(len(positions) - 1)
    return (len(positions) ** 5) / float(average_distance)
def abstract_site_from_words(text, list_of_words):
    """
    abstract_site_from_words: get an abstraction of the text based on the words

    The text is split on whitespace. The first token, the last token and the
    first occurrence of every word of list_of_words act as anchors. Wherever
    two neighbouring anchors are more than 10 tokens apart, the tokens between
    them are cut out and replaced by "... ", keeping 5 tokens of context after
    the earlier anchor and 5 tokens before the later one. Words that do not
    occur in the text are ignored.

    Args:
        text (string): the text to abstract
        list_of_words (list of strings): the words to keep visible

    Returns:
        a string that is the abstracted text; it always begins with the start
        of the text, and ends with "..." when the last matched word is more
        than 6 tokens away from the end of the text

    Example:
        abstract_site_from_words("w0 w1 ... w29", ['w15']) -->
        "w0 w1 w2 w3 w4 w5... w10 w11 ... w19 w20..."
    """
    words = text.split()

    # First occurrence of each token, so lookups never raise on a miss.
    first_position = {}
    for position, word in enumerate(words):
        first_position.setdefault(word, position)

    # Sentinel anchors for the start and end of the text, plus the matches.
    anchors = [0, len(words) - 1]
    anchors.extend(
        first_position[word] for word in list_of_words
        if word in first_position
    )
    anchors.sort()

    pieces = []
    block_start = 0
    # The gap leading to the final anchor is handled separately below, so it
    # is excluded here; otherwise a long tail would produce two ellipses.
    for previous, current in zip(anchors[:-2], anchors[1:-1]):
        if current - previous > 10:
            pieces.append(" ".join(words[block_start:previous + 6]) + "... ")
            block_start = current - 5

    last_anchor = anchors[-1]
    previous_anchor = anchors[-2]
    if last_anchor - previous_anchor > 6:
        # Tail is far from the last match: truncate and mark the omission.
        pieces.append(" ".join(words[block_start:previous_anchor + 6]) + "...")
    else:
        pieces.append(" ".join(words[block_start:last_anchor + 6]))
    return "".join(pieces)
if __name__ == "__main__":
    # Ad-hoc smoke run of the three search helpers against a local text file.
    # The with-block closes the file handle as soon as the text is read.
    with open('testext.txt', 'r') as sample_file:
        f = sample_file.read()

    first_query = ['this', 'is', 'a', 'pineapple']
    second_query = ['pineapple', 'fruit', 'furnishings.', 'country.',
                    'material']

    snippet = get_snippets_from_site(f, first_query)
    print("Snippet: " + str(snippet))
    index = get_index_of_proximity(f, first_query)
    abstracted = abstract_site_from_words(f, snippet)
    print("Abstracted: " + abstracted)
    print("Index: " + str(index))

    print(str(get_index_of_proximity(f, second_query)))
    # Only pass words that actually occur in the text to the abstraction.
    print(abstract_site_from_words(f, get_snippets_from_site(f, second_query)))