forked from kinvoapp/kinvo-engenheiro-de-dados
-
Notifications
You must be signed in to change notification settings - Fork 0
/
webcrawlear.py
68 lines (52 loc) · 2.11 KB
/
webcrawlear.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
class WebCrawlear:
    """Scrape news headlines from a web page using headless Firefox.

    Typical use is ``WebCrawlear().main()``: it opens the page, reads the
    ``title`` attribute of every element carrying the ``titulos`` CSS class
    and returns the headlines as a list of one-item rows.
    """

    def __init__(
        self, url: str = 'https://www.spacemoney.com.br/ultimas-noticias'
    ) -> None:
        """Remember the target URL; no browser is started here.

        Args:
            url: Address of the page to scrape.
        """
        self.url = url
        # Selenium WebDriver session; created by make_driver().
        self.driver = None
        # DataFrame with a single 'News' column; built by
        # make_data_frame_news().
        self.data_frame_news = None

    def make_driver(self) -> None:
        """Start a headless Firefox session and load ``self.url``.

        If the page load fails, the browser is closed before the error is
        re-raised, and ``self.driver`` is left unchanged.
        """
        option = Options()
        # The ``Options.headless`` setter was deprecated and later removed in
        # Selenium 4; on those releases assigning it is a silent no-op and a
        # visible browser opens. The command-line flag works on every version.
        option.add_argument("-headless")
        driver = webdriver.Firefox(options=option)
        try:
            driver.get(self.url)
        except Exception:
            # Do not leave an orphaned Firefox process behind.
            driver.quit()
            raise
        self.driver = driver

    def get_elements_by_class_name(self, class_name: str = 'titulos') -> list:
        """Return the page elements that carry the given CSS class.

        make_driver() must have been called first.

        Args:
            class_name: CSS class to look up.

        Returns:
            The list of matching Selenium elements (empty if none match).
        """
        return self.driver.find_elements(By.CLASS_NAME, value=class_name)

    def extract_title(self, xpath):
        """Parse an element's ``title`` attribute with BeautifulSoup.

        Args:
            xpath: A Selenium element. The parameter name is historical; it
                is not an XPath string.

        Returns:
            A BeautifulSoup object built from the attribute text. It is empty
            when the element has no usable ``title`` value.
        """
        html_content_title = xpath.get_attribute('title')
        # get_attribute() may return None, which BeautifulSoup cannot parse.
        return BeautifulSoup(html_content_title or '', 'html.parser')

    def make_data_frame_news(self):
        """Collect the headlines into ``self.data_frame_news`` and close the browser.

        The browser is closed even when scraping fails, so a failed run does
        not leak a Firefox process.

        Returns:
            A DataFrame with one 'News' column holding one headline per row.
        """
        try:
            # Store plain text rather than BeautifulSoup objects so every
            # row is a single scalar cell.
            headlines = [
                self.extract_title(element).get_text()
                for element in self.get_elements_by_class_name('titulos')
            ]
        finally:
            if self.driver is not None:
                self.driver.quit()
        self.data_frame_news = pd.DataFrame(headlines, columns=['News'])
        return self.data_frame_news

    def make_html(self) -> None:
        """Append the headlines table to the ``<table>`` in templates/news.html.

        The file is rewritten in place. Rows are added to whatever the
        template table already contains, so repeated calls accumulate.
        NOTE(review): confirm whether the table should be emptied first.

        Raises:
            RuntimeError: If make_data_frame_news() has not been called yet.
            ValueError: If the template contains no ``<table>`` element.
        """
        if self.data_frame_news is None:
            raise RuntimeError(
                "make_data_frame_news() must be called before make_html()"
            )
        table_html = self.data_frame_news.to_html()
        # Explicit UTF-8 so accented headlines do not depend on the platform
        # default encoding.
        with open("templates/news.html", encoding="utf-8") as inf:
            soup = BeautifulSoup(inf.read(), 'html.parser')
        if soup.table is None:
            raise ValueError("templates/news.html has no <table> element")
        # Parse the generated markup first: extending a tag with a raw string
        # inserts its characters as escaped text instead of table nodes.
        rendered_table = BeautifulSoup(table_html, 'html.parser').table
        # Iterate over a copy because append() moves each node out of
        # rendered_table.contents.
        for node in list(rendered_table.contents):
            soup.table.append(node)
        with open("templates/news.html", "w", encoding="utf-8") as outf:
            outf.write(str(soup))

    def main(self) -> list:
        """Run a full scrape and return the headlines.

        Returns:
            A list with one single-item list per headline.
        """
        self.make_driver()
        return self.make_data_frame_news().values.tolist()
if __name__ == "__main__":
    # Script entry point: scrape the news listing and print the headlines.
    print(WebCrawlear(url='https://www.spacemoney.com.br/ultimas-noticias').main())