-
Notifications
You must be signed in to change notification settings - Fork 3
/
tablescraper.py
149 lines (119 loc) · 4.59 KB
/
tablescraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from termcolor import colored, cprint
from datetime import datetime
class Scrapie():
    """Scrape the paginated game list of one system into a database model.

    The scraper opens ``self.url`` in Chrome, selects ``entry.name`` in the
    system drop-down, and then walks the result grid page by page, storing
    each matching row through ``m_game.get_or_create``.

    Typical use is a single call to :meth:`run`.
    """

    def __init__(self, entry, database, m_game):
        """Store the scrape target and the page-specific element ids.

        Args:
            entry: object exposing ``name`` (visible text of the system in
                the drop-down) and ``sys_id`` (stored as ``system_id`` on
                every created row).
            database: database handle; only stored on the instance here.
            m_game: model class used for ``get_or_create`` and for counting
                existing rows (assumed to offer a peewee-style query API;
                confirm against the caller).
        """
        self.page_number = 1
        self.target_system = entry.name
        self.target_id = entry.sys_id
        self.db = database
        self.m_game = m_game
        self.finished = False
        # Set by init_browser() / the table-wait helpers; initialised here so
        # cleanup() and other methods never hit a missing attribute.
        self.browser = None
        self.table = None
        self.elem_id = "GridView1"
        self.system_dropid = "DropSys"
        self.url = "http://www.gamesdatabase.org/list.aspx?manual=1"
        # Postback template; ``{0}`` is filled with the page number later.
        self.js_script = (
            "javascript:__doPostBack('" + self.elem_id + "','Page${0}')"
        )

    def init_browser(self):
        """Start a Chrome session and load the list page."""
        self.browser = webdriver.Chrome()
        self.browser.get(self.url)

    def _wait_for_table(self, delay=2):
        """Wait up to ``delay`` seconds for the result grid to be present.

        On success the element is stored in ``self.table`` and True is
        returned. On timeout the scraper is marked finished and False is
        returned: without a grid there is nothing left to read, and carrying
        on would reuse a table reference from an earlier page.
        """
        try:
            self.table = WebDriverWait(self.browser, delay).until(
                EC.presence_of_element_located((By.ID, self.elem_id)))
        except TimeoutException:
            self.finished = True
            if '404' not in self.browser.title:
                # Not the expected end-of-list page: make the early stop
                # visible instead of silently ending the scrape.
                cprint("Timed out waiting for table '{}', stopping.".format(
                    self.elem_id), 'red')
            return False
        return True

    def setup_browser_table(self):
        """Select the target system in the drop-down and wait for the grid."""
        selector = Select(
            self.browser.find_element(By.ID, self.system_dropid))
        selector.select_by_visible_text(self.target_system)
        self._wait_for_table()

    def aquire_table(self):
        """Re-read the result grid element from the current page."""
        self.table = self.browser.find_element(By.ID, self.elem_id)

    def iterate_pages(self):
        """Request the next grid page and wait for the grid to be present."""
        self.page_number += 1
        self.browser.execute_script(self.js_script.format(self.page_number))
        # NOTE(review): presence_of_element_located may match the grid of the
        # page that is still displayed if the postback has not completed yet;
        # confirm whether a staleness wait is needed here.
        if self._wait_for_table():
            print("{} - Page is ready!".format(
                datetime.now().isoformat(' ', 'seconds'))
            )

    def aquire_entries(self):
        """Collect the grid rows for the target system and store them."""
        filtered_entries = []
        for entry in self.table.find_elements(By.TAG_NAME, 'tr'):
            anchors = entry.find_elements(By.TAG_NAME, 'a')
            # The system name is the text of the third anchor (index 2).
            # Rows with fewer anchors (pager and header rows) are skipped.
            if len(anchors) > 2 and anchors[2].text == self.target_system:
                filtered_entries.append(entry)
        self.aquire_cells(filtered_entries)

    def save_page_to_table(self, rows):
        """Insert ``rows`` through ``m_game.get_or_create`` and report counts.

        Args:
            rows: iterable of 5-item sequences in the order
                game, publisher, developer, category, year.
        """
        create_count = 0
        for x in rows:
            _, created = self.m_game.get_or_create(
                game=x[0],
                system_id=self.target_id,
                publisher=x[1],
                developer=x[2],
                category=x[3],
                year=x[4] or 0  # empty year cell is stored as 0
            )
            if created:
                create_count += 1
        cprint("Done! Added {} rows to the table!".format(
            create_count), 'green')
        # Report how many rows were skipped because they already existed.
        skipped = len(rows) - create_count
        if skipped:
            cprint("\t {} entries already existing in database".format(
                skipped), 'cyan')

    def aquire_cells(self, entries):
        """Convert row elements to value lists and save them."""
        self.save_page_to_table(
            [self.transform_to_row(row) for row in entries])

    def transform_to_row(self, row):
        """Return the text of the relevant ``td`` cells of ``row`` as a list."""
        rawcell = row.find_elements(By.TAG_NAME, 'td')
        # Only these column indexes are extracted; save_page_to_table maps
        # them to game, publisher, developer, category and year in order.
        cell_pos = [1, 5, 7, 8, 9]
        return [rawcell[cell].text for cell in cell_pos]

    def scrape_all(self):
        """Store every page of the grid until the scraper is marked finished."""
        self.aquire_table()
        while not self.finished:
            self.aquire_entries()
            self.iterate_pages()

    def have_all_entries(self):
        """Return True when the database holds at least as many rows as the site.

        The site total is parsed from the first space-separated token of the
        element with id ``Out``, with thousands separators removed.
        """
        n = self.m_game.select().where(
            self.m_game.system_id == self.target_id).count()
        cprint("Already exists {} entries in base".format(str(n)), 'yellow')
        onsite = self.browser.find_element(By.ID, 'Out')
        number = int(onsite.text.split(' ')[0].replace(',', ''))
        return (n >= number)

    def cleanup(self):
        """Report completion and shut the browser down."""
        cprint("Finished with system {}.".format(
            self.target_system,
        ), 'green')
        if self.browser is None:
            return
        try:
            self.browser.stop_client()
        finally:
            # Always end the session, even if stop_client() fails.
            self.browser.quit()

    def run(self):
        """Run the full scrape for the target system.

        The browser is always cleaned up once it has been started, even when
        setup or scraping raises; the exception still propagates.
        """
        self.init_browser()
        try:
            self.setup_browser_table()
            # Skip the count check and the scrape when setup already found
            # that there is no grid to read.
            if not self.finished and not self.have_all_entries():
                self.scrape_all()
        finally:
            self.cleanup()