-
Notifications
You must be signed in to change notification settings - Fork 0
/
oracle.py
executable file
·213 lines (190 loc) · 8.43 KB
/
oracle.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
#!/usr/bin/python
#SPAM oracle class
###############################
from __future__ import division

import os
import pprint
import random
import re
import string
import time
import urllib
from sys import stderr
from urllib2 import *

import nltk
from xgoogle.BeautifulSoup import *
from xgoogle.search import GoogleSearch, SearchError
#############################
# What we are pretending to be when we ask google.
# Browser identification strings; one of them is presented as our User-Agent.
# Every entry needs its own trailing comma: adjacent string literals without a
# comma are silently concatenated into a single (bogus) entry.
user_agents = [
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
    'Opera/9.25 (Windows NT 5.1; U; en)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
    'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
    'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 8.50 Opera 8.5, Windows XP',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 8.0 Opera 8.0, Windows XP',
    'Mozilla/4.0 (compatible; MSIE 6.0; MSIE 5.5; Windows NT 5.1) Opera 7.02 [en] Opera 7.02, Windows XP',
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.5) Gecko/20060127 Netscape/8.1',
]
class MyOpener(urllib.URLopener):
    """URL opener that identifies itself with one of the ``user_agents`` strings."""

    # ``version`` is the class attribute URLopener uses as its user agent.
    # The choice is made once, when this class body runs, so every MyOpener
    # created in the same process presents the same string.
    version = random.choice(user_agents)
class SmartRedirectHandler(HTTPRedirectHandler):
    """Redirect handler that records the redirect status code on the response."""

    def http_error_301(self, req, fp, code, msg, headers):
        """Follow a 301 redirect and tag the resulting response with *code*.

        The arguments are passed unchanged to the base class handler. The
        object it returns gets a ``status`` attribute set to *code* and is
        returned to the caller.
        """
        # Go through the base class name that the star import binds; the
        # module name ``urllib2`` itself is never imported in this file, so
        # ``urllib2.HTTPRedirectHandler`` would raise NameError here.
        result = HTTPRedirectHandler.http_error_301(self, req, fp, code, msg, headers)
        result.status = code
        return result
##############################
class Oracle():
    """Collect words by repeatedly searching Google and scraping result pages.

    Starting from a seed word, ``oracle`` searches for the current word, opens
    one of the result pages, picks a word from it and uses that word as the
    next search term, until enough words have been gathered or the failure
    budget is used up.

    Attributes:
        cached_results: dict used by ``cachedresults`` (word -> results).
        results: URLs returned by the most recent search.
        retry: remaining failure budget of ``oracle``.
        words: words collected so far.
        url: initialised to "" and not used by any method of this class.
        zzz: current sleep interval in seconds.
        wordlist: the lower-case words of the NLTK Brown corpus.

    NOTE: every ``print`` below is given one pre-formatted value in
    parentheses, so the output is the same whether ``print`` is the Python 2
    statement or a function.
    """

    def __init__(self):
        self.cached_results = {}
        self.results = []
        self.retry = 5
        self.words = []
        self.url = ""
        self.zzz = 12
        self.wordlist = [w for w in nltk.corpus.brown.words() if w.islower()]

    def seed(self):
        """Return a small number derived from the current UTC time.

        The UTC hour and minute are added and the decimal digits of that sum
        are added up, which gives a value between 0 and 16. It is meant to be
        used as the result index, the word index and the number of words to
        collect.
        """
        now = time.gmtime()
        total = now.tm_hour + now.tm_min
        return sum(int(digit) for digit in str(total))

    def cachedresults(self, word, new_results=None):
        """Return the cached results for *word*, caching *new_results* if absent.

        Args:
            word: the search term used as the cache key.
            new_results: results to store when *word* is not cached yet;
                a fresh empty list is stored when omitted.

        Returns:
            The list stored in the cache for *word*.
        """
        if word in self.cached_results:
            print("-----Cached results for : %s" % (word,))
            return self.cached_results[word]
        # A fresh list per word: a shared default list would make every word
        # cached without explicit results alias the same object.
        if new_results is None:
            new_results = []
        self.cached_results[word] = new_results
        return new_results

    def opener(self, url):
        """Fetch *url* and return the response body.

        Sleeps for a short random interval first so the requests look less
        machine-generated; the interval is stored in ``self.zzz``. Any error
        raised while opening or reading propagates to the caller.
        """
        print("--opening :  %s" % (url,))
        self.zzz = random.random() + random.choice([0.3, 1, 1.2, 2, 0.3])
        time.sleep(self.zzz)  # random sleep before we hit the web
        response = MyOpener().open(url)
        try:
            body = response.read()
        finally:
            # Release the connection even when reading fails.
            response.close()
        print("--closed ")
        return body

    def goggle(self, word):
        """Search Google for *word* and return the result URLs.

        Returns:
            A list of UTF-8 encoded URLs, or None when the search raised
            ``SearchError`` (the error is printed).
        """
        try:
            gs = GoogleSearch(word, random_agent=True)
            gs.results_per_page = 50
            return [hit.url.encode('utf8') for hit in gs.get_results()]
        except SearchError as e:
            print("Search failed: %s" % e)
            return None

    @staticmethod
    def _pick_word(tokens, number):
        """Pick the search phrase at position *number* from *tokens*.

        Returns ``tokens[number]`` and its successor joined by a space when
        both exist, the last token when the list is too short for that, and
        None when there are no tokens at all.
        """
        if not tokens:
            return None
        if number + 1 < len(tokens):
            return tokens[number] + " " + tokens[number + 1]
        return tokens[-1]

    @staticmethod
    def _candidate_indexes(count, start):
        """Return each index of a *count*-item list once, beginning at *start*.

        *start* is clamped to the last valid index; the following indexes
        walk downwards and wrap around to the end of the list.
        """
        if count <= 0:
            return []
        first = min(start, count - 1)
        return [(first - step) % count for step in range(count)]

    def get_word(self, number, url):
        """Extract a word (or two-word phrase) from the page at *url*.

        The page text is tokenized; one-character and title-cased tokens are
        dropped. Pages with 24 or more remaining tokens are reduced to the 20
        tokens starting at the middle. Only tokens found in ``self.wordlist``
        are kept, and the result is chosen with ``_pick_word``.

        Args:
            number: position of the wanted token.
            url: page to fetch.

        Returns:
            The chosen word or phrase, or None when the page was empty or
            contained no usable token.

        Raises:
            Whatever ``opener`` or the NLTK helpers raise.
        """
        print("-----get word starts")
        html = self.opener(url)
        if not html:
            print("-----Nothing came back. What Do I do? ")
            return None

        clean = nltk.clean_html(html)
        tokens = [t for t in nltk.word_tokenize(clean)
                  if len(t) > 1 and not t.istitle()]
        print("New list")
        print("%s %s" % (len(tokens), number))
        if len(tokens) >= 24:
            print('more than 24')
            half = len(tokens) // 2
            print(half)
            tokens = tokens[half:half + 20]
        # Work out the unknown tokens once, not once per token: each
        # difference() call walks the whole corpus word list.
        unknown = set(tokens).difference(self.wordlist)
        tokens = [t for t in tokens if t not in unknown]
        print(tokens)
        print("---------t--------------------")
        word = self._pick_word(tokens, number)
        print(word)
        print("----get words end")
        return word

    def makeoracle(self, words):
        """Join *words* into one string and return its NLTK tokens.

        Args:
            words: the words to tokenize; ``self.words`` is used when None.
        """
        if words is None:
            words = self.words
        return nltk.word_tokenize(' '.join(words))

    def oracle(self, word, x):
        """Collect words until there are *x* of them or retries run out.

        Each round searches for the current word and tries the result pages,
        starting at index *x* (or the last result when there are fewer) and
        moving on to the other results until one yields a word. A round that
        produces no results or no word costs one retry; a successful round
        resets the retry budget to 5.

        Args:
            word: the initial search term; it is also the first collected word.
            x: number of words wanted, and the preferred result index.

        Returns:
            The tokens produced by ``makeoracle`` for the collected words, or
            None when no words were collected.
        """
        self.word = word
        self.words.append(word)
        while len(self.words) < x and self.retry > 0:
            start = x
            print(self.words)
            print("|Number: %s |Current Word: %s |Word Count : %s"
                  % (x, self.word, len(self.words)))
            # goggle() returns None on a failed search; treat that as empty.
            self.results = self.goggle(self.word) or []
            print("PASS:  %s %s" % (x, start))
            print(len(self.results))

            if not self.results:
                print("-No results from google. Try Again")
                self.zzz = self.zzz + 5
                print("Sleeping for %s seconds... " % self.zzz)
                time.sleep(self.zzz)
                # An empty search counts as a failure, otherwise this loop
                # could spin forever.
                self.retry = self.retry - 1
                print("Retry:  %s" % self.retry)
                continue

            if len(self.results) <= x:
                print("We got Less Results than expected")

            found = None
            # Try every result once until one of them returns a word.
            for y in self._candidate_indexes(len(self.results), start):
                url = self.results[y]
                try:
                    found = self.get_word(23 + x, url)
                except Exception as e:
                    print("Ouch !!! %s" % (e,))
                    found = None
                if found:
                    break
                print("We crapped out while trying for a word. Decrement y %s %s"
                      % (y, url))
                time.sleep(self.zzz)

            if found:
                self.word = found
                print("We Got The Word? %s" % (self.word,))
                self.words.append(self.word)
                self.retry = 5
                print("Retry =>  %s" % self.retry)
            else:
                # Keep the previous search word and spend one retry.
                self.retry = self.retry - 1
                print("Retry:  %s" % self.retry)

        if self.words:
            return self.makeoracle(self.words)
        return None