From 290ab7a1e3b6a7cbc4a82b14f09f1f0bf1c0d6ac Mon Sep 17 00:00:00 2001 From: Victor Kaillo Date: Wed, 16 Mar 2022 08:20:48 -0300 Subject: [PATCH 1/8] created webcrawlear.py --- webcrawlear.py | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 webcrawlear.py diff --git a/webcrawlear.py b/webcrawlear.py new file mode 100644 index 00000000..94fc73b1 --- /dev/null +++ b/webcrawlear.py @@ -0,0 +1,68 @@ +import pandas as pd +from bs4 import BeautifulSoup +from selenium import webdriver +from selenium.webdriver.firefox.options import Options +from selenium.webdriver.common.by import By + + + +class WebCrawlear(): + + + def __init__(self, url='https://www.spacemoney.com.br/ultimas-noticias') -> None: + self.url = url + self.driver = None + self.data_frame_news = None + + def make_driver(self): + option = Options() + option.headless = True + self.driver = webdriver.Firefox(options=option) + self.driver.get(self.url) + + def get_elements_by_class_name(self, class_name = 'titulos'): + list_with_xpaths = self.driver.find_elements(By.CLASS_NAME, value=class_name) + return list_with_xpaths + + def extract_title(self,xpath): + html_content_title = xpath.get_attribute('title') + soup_title = BeautifulSoup(html_content_title, 'html.parser') + return soup_title + + def make_data_frame_news(self): + list_with_xpaths = self.get_elements_by_class_name('titulos') + lista_news = [] + for xpath in list_with_xpaths: + news = self.extract_title(xpath) + lista_news.append(news) + # return lista_news + self.data_frame_news = pd.DataFrame(lista_news, columns = ['News']) + self.driver.quit() + return self.data_frame_news + + def make_html(self): + table_html = self.data_frame_news.to_html() + + with open("templates/news.html") as inf: + txt = inf.read() + soup = BeautifulSoup(txt, 'html.parser') + + soup.table.extend(table_html) + + # soup.table.append(table_html) + + with open("templates/news.html", "w") as outf: + outf.write(str(soup)) + + def main(self): + self.make_driver() + dataframe_news = self.make_data_frame_news() + # print(dataframe.values.tolist()) + return dataframe_news.values.tolist() + # print(self.data_frame_news.to_html()) + # self.make_html() + +if __name__ == "__main__": + webscralear = WebCrawlear(url='https://www.spacemoney.com.br/ultimas-noticias') + teste = webscralear.main() + print(teste) \ No newline at end of file From 46531f44dca5699a73da8d77ce222ba888473743 Mon Sep 17 00:00:00 2001 From: Victor Kaillo Date: Wed, 16 Mar 2022 08:21:19 -0300 Subject: [PATCH 2/8] created topnews_appflask.py --- topnews_appflask.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 topnews_appflask.py diff --git a/topnews_appflask.py b/topnews_appflask.py new file mode 100644 index 00000000..e0dcbc7a --- /dev/null +++ b/topnews_appflask.py @@ -0,0 +1,25 @@ +from flask import Flask, render_template +from webcrawlear import WebCrawlear + +app = Flask(__name__) +dicionario = {'lista_news': ['Lista ainda não foi carregada. [Para carregar utilize endpoint "/savenews"]']} + +@app.route('/') +def index(): + return render_template('index.html'); + +@app.route('/topnews') +def topnews(): + return render_template('news.html', items=dicionario['lista_news'][:5]); + +@app.route('/news') +def news(): + return render_template('news.html', items=dicionario['lista_news']); + +@app.route('/savenews') +def savenews(): + webscrawlear = WebCrawlear() + dicionario['lista_news'] = webscrawlear.main() + return render_template('savenews.html'); + +app.run() \ No newline at end of file From 98239fbdb40acbea09166f7a0e02d40d010a00b8 Mon Sep 17 00:00:00 2001 From: Victor Kaillo Date: Wed, 16 Mar 2022 08:21:45 -0300 Subject: [PATCH 3/8] created templates/*.html --- templates/index.html | 30 ++++++++++++++++++++++++++++++ templates/news.html | 32 ++++++++++++++++++++++++++++++++ templates/savenews.html | 31 +++++++++++++++++++++++++++++++ 3 files changed, 93 insertions(+) create mode 100644 templates/index.html create mode 100644 templates/news.html create mode 100644 templates/savenews.html diff --git a/templates/index.html b/templates/index.html new file mode 100644 index 00000000..9c4b8108 --- /dev/null +++ b/templates/index.html @@ -0,0 +1,30 @@ + + + + + + News Table + + +
+ + + + + + + + + + + +
Waiting click to save top news from 'spacemoney.com.br'
+
+ + \ No newline at end of file diff --git a/templates/news.html b/templates/news.html new file mode 100644 index 00000000..186b3486 --- /dev/null +++ b/templates/news.html @@ -0,0 +1,32 @@ + + + + + + News Table + + +
+ + + + + + + + + + {% for item in items %} + + + + + {% endfor %} + +
News
{{item}}
+
+ + \ No newline at end of file diff --git a/templates/savenews.html b/templates/savenews.html new file mode 100644 index 00000000..72ebf88b --- /dev/null +++ b/templates/savenews.html @@ -0,0 +1,31 @@ + + + + + + News Table + + +
+ + + + + + + + + + + +
The news has been saved
+
+ + \ No newline at end of file From 368db4300f2208ad0724803f384525bec5afabd9 Mon Sep 17 00:00:00 2001 From: Victor Kaillo Date: Wed, 16 Mar 2022 08:23:17 -0300 Subject: [PATCH 4/8] documentation --- README.md | 5 ++++ pip_requirements.txt | 36 +++++++++++++++++++++++++++++ requeriments.txt | 54 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 95 insertions(+) create mode 100644 pip_requirements.txt create mode 100644 requeriments.txt diff --git a/README.md b/README.md index 275de66d..93da650f 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,10 @@ > ![Logo Kinvo](https://github.com/kinvoapp/kinvo-mobile-test/blob/master/logo.svg) +# Para carregar o ambiente recomendo utilizar o conda: +1. $conda create --name --file requeriments.txt + +# Se preferir pode utilizar o pip: +1. pip install -r pip_requirements.txt # Teste para candidatos à vaga de Engenheiro de Dados (Python) diff --git a/pip_requirements.txt b/pip_requirements.txt new file mode 100644 index 00000000..6007ca13 --- /dev/null +++ b/pip_requirements.txt @@ -0,0 +1,36 @@ +async-generator==1.10 +attrs==21.4.0 +beautifulsoup4==4.10.0 +certifi==2021.10.8 +cffi==1.15.0 +charset-normalizer==2.0.12 +click==8.0.4 +cryptography==36.0.1 +Flask==2.0.3 +h11==0.13.0 +html5lib==1.1 +idna==3.3 +itsdangerous==2.1.1 +Jinja2==3.0.3 +lxml==4.8.0 +MarkupSafe==2.1.0 +numpy==1.22.3 +outcome==1.1.0 +pandas==1.4.1 +pycparser==2.21 +pyOpenSSL==22.0.0 +PySocks==1.7.1 +python-dateutil==2.8.2 +pytz==2021.3 +requests==2.27.1 +selenium==4.1.3 +six==1.16.0 +sniffio==1.2.0 +sortedcontainers==2.4.0 +soupsieve==2.3.1 +trio==0.20.0 +trio-websocket==0.9.2 +urllib3==1.26.8 +webencodings==0.5.1 +Werkzeug==2.0.3 +wsproto==1.1.0 diff --git a/requeriments.txt b/requeriments.txt new file mode 100644 index 00000000..3dcce5c7 --- /dev/null +++ b/requeriments.txt @@ -0,0 +1,54 @@ +# This file may be used to create an environment using: +# $ conda create --name --file +# platform: osx-64 +async-generator=1.10=pypi_0 +attrs=21.4.0=pypi_0 +beautifulsoup4=4.10.0=pypi_0 +ca-certificates=2022.2.1=hecd8cb5_0 +certifi=2021.10.8=py39hecd8cb5_2 +cffi=1.15.0=pypi_0 +charset-normalizer=2.0.12=pypi_0 +click=8.0.4=pypi_0 +cryptography=36.0.1=pypi_0 +flask=2.0.3=pypi_0 +h11=0.13.0=pypi_0 +html5lib=1.1=pypi_0 +idna=3.3=pypi_0 +itsdangerous=2.1.1=pypi_0 +jinja2=3.0.3=pypi_0 +libcxx=12.0.0=h2f01273_0 +libffi=3.3=hb1e8313_2 +lxml=4.8.0=pypi_0 +markupsafe=2.1.0=pypi_0 +ncurses=6.3=hca72f7f_2 +numpy=1.22.3=pypi_0 +openssl=1.1.1m=hca72f7f_0 +outcome=1.1.0=pypi_0 +pandas=1.4.1=pypi_0 +pip=21.2.4=py39hecd8cb5_0 +pycparser=2.21=pypi_0 +pyopenssl=22.0.0=pypi_0 +pysocks=1.7.1=pypi_0 +python=3.9.7=h88f2d9e_1 +python-dateutil=2.8.2=pypi_0 +pytz=2021.3=pypi_0 +readline=8.1.2=hca72f7f_1 +requests=2.27.1=pypi_0 +selenium=4.1.3=pypi_0 +setuptools=58.0.4=py39hecd8cb5_0 +six=1.16.0=pypi_0 +sniffio=1.2.0=pypi_0 +sortedcontainers=2.4.0=pypi_0 +soupsieve=2.3.1=pypi_0 +sqlite=3.37.2=h707629a_0 +tk=8.6.11=h7bc2e8c_0 +trio=0.20.0=pypi_0 +trio-websocket=0.9.2=pypi_0 +tzdata=2021e=hda174b7_0 +urllib3=1.26.8=pypi_0 +webencodings=0.5.1=pypi_0 +werkzeug=2.0.3=pypi_0 +wheel=0.37.1=pyhd3eb1b0_0 +wsproto=1.1.0=pypi_0 +xz=5.2.5=h1de35cc_0 +zlib=1.2.11=h4dc903c_4 From 04db43788197fe3bd969b308c80cced1bf9a9406 Mon Sep 17 00:00:00 2001 From: Victor Kaillo Date: Wed, 16 Mar 2022 08:28:30 -0300 Subject: [PATCH 5/8] update README.md --- README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 93da650f..afae6268 100644 --- a/README.md +++ b/README.md @@ -2,9 +2,11 @@ # Para carregar o ambiente recomendo utilizar o conda: 1. $conda create --name --file requeriments.txt +## Se preferir pode utilizar o pip: +1. $pip install -r pip_requirements.txt -# Se preferir pode utilizar o pip: -1. pip install -r pip_requirements.txt +# Para rodar a aplicação +2. $python3 topnews_appflask.py # Teste para candidatos à vaga de Engenheiro de Dados (Python) From 5f26a955412f2bc5954933d91e035f05e1685397 Mon Sep 17 00:00:00 2001 From: Victor Kaillo Date: Wed, 16 Mar 2022 09:18:07 -0300 Subject: [PATCH 6/8] update webcrawlear.py --- webcrawlear.py | 38 +++++++++++--------------------------- 1 file changed, 11 insertions(+), 27 deletions(-) diff --git a/webcrawlear.py b/webcrawlear.py index 94fc73b1..edf55800 100644 --- a/webcrawlear.py +++ b/webcrawlear.py @@ -5,23 +5,25 @@ from selenium.webdriver.common.by import By - class WebCrawlear(): - def __init__(self, url='https://www.spacemoney.com.br/ultimas-noticias') -> None: - self.url = url + def __init__(self, url=None) -> None: + spacemoney_url = 'https://www.spacemoney.com.br/ultimas-noticias' + self.url = url or spacemoney_url self.driver = None self.data_frame_news = None - def make_driver(self): + def make_driver(self, url=None): option = Options() option.headless = True self.driver = webdriver.Firefox(options=option) - self.driver.get(self.url) + self.driver.get(url or self.url) - def get_elements_by_class_name(self, class_name = 'titulos'): - list_with_xpaths = self.driver.find_elements(By.CLASS_NAME, value=class_name) + def get_elements_by_class_name(self, class_name): + list_with_xpaths = self.driver.find_elements( + By.CLASS_NAME, value=class_name + ) return list_with_xpaths def extract_title(self,xpath): @@ -29,38 +31,20 @@ def extract_title(self,xpath): soup_title = BeautifulSoup(html_content_title, 'html.parser') return soup_title - def make_data_frame_news(self): - list_with_xpaths = self.get_elements_by_class_name('titulos') + def make_data_frame_news(self, class_name='titulos'): + list_with_xpaths = self.get_elements_by_class_name(class_name) lista_news = [] for xpath in list_with_xpaths: news = self.extract_title(xpath) lista_news.append(news) - # return lista_news self.data_frame_news = pd.DataFrame(lista_news, columns = ['News']) self.driver.quit() return self.data_frame_news - def make_html(self): - table_html = self.data_frame_news.to_html() - - with open("templates/news.html") as inf: - txt = inf.read() - soup = BeautifulSoup(txt, 'html.parser') - - soup.table.extend(table_html) - - # soup.table.append(table_html) - - with open("templates/news.html", "w") as outf: - outf.write(str(soup)) - def main(self): self.make_driver() dataframe_news = self.make_data_frame_news() - # print(dataframe.values.tolist()) return dataframe_news.values.tolist() - # print(self.data_frame_news.to_html()) - # self.make_html() if __name__ == "__main__": webscralear = WebCrawlear(url='https://www.spacemoney.com.br/ultimas-noticias') From 05f0e719522490fb31a43ab3fb238d940507b49e Mon Sep 17 00:00:00 2001 From: Victor Kaillo Date: Wed, 16 Mar 2022 09:19:43 -0300 Subject: [PATCH 7/8] update webcrawlear.py --- webcrawlear.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/webcrawlear.py b/webcrawlear.py index edf55800..b7c543c2 100644 --- a/webcrawlear.py +++ b/webcrawlear.py @@ -41,12 +41,12 @@ def make_data_frame_news(self, class_name='titulos'): self.driver.quit() return self.data_frame_news - def main(self): + def get_list_news(self): self.make_driver() dataframe_news = self.make_data_frame_news() return dataframe_news.values.tolist() if __name__ == "__main__": webscralear = WebCrawlear(url='https://www.spacemoney.com.br/ultimas-noticias') - teste = webscralear.main() + teste = webscralear.get_list_news() print(teste) \ No newline at end of file From 59e303f07ca19a3f527c13db58d5580b980d0b37 Mon Sep 17 00:00:00 2001 From: Victor Kaillo Date: Wed, 16 Mar 2022 12:45:12 -0300 Subject: [PATCH 8/8] change metod name --- topnews_appflask.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/topnews_appflask.py b/topnews_appflask.py index e0dcbc7a..7fedb0dc 100644 --- a/topnews_appflask.py +++ b/topnews_appflask.py @@ -19,7 +19,7 @@ def news(): @app.route('/savenews') def savenews(): webscrawlear = WebCrawlear() - dicionario['lista_news'] = webscrawlear.main() + dicionario['lista_news'] = webscrawlear.get_list_news() return render_template('savenews.html'); app.run() \ No newline at end of file