diff --git a/README.md b/README.md index 275de66d..afae6268 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,12 @@ > ![Logo Kinvo](https://github.com/kinvoapp/kinvo-mobile-test/blob/master/logo.svg) +# Para carregar o ambiente recomendo utilizar o conda: +1. `$ conda create --name <env> --file requeriments.txt` +## Se preferir pode utilizar o pip: +1. $pip install -r pip_requirements.txt + +# Para rodar a aplicação +2. $python3 topnews_appflask.py # Teste para candidatos à vaga de Engenheiro de Dados (Python) diff --git a/pip_requirements.txt b/pip_requirements.txt new file mode 100644 index 00000000..6007ca13 --- /dev/null +++ b/pip_requirements.txt @@ -0,0 +1,36 @@ +async-generator==1.10 +attrs==21.4.0 +beautifulsoup4==4.10.0 +certifi==2021.10.8 +cffi==1.15.0 +charset-normalizer==2.0.12 +click==8.0.4 +cryptography==36.0.1 +Flask==2.0.3 +h11==0.13.0 +html5lib==1.1 +idna==3.3 +itsdangerous==2.1.1 +Jinja2==3.0.3 +lxml==4.8.0 +MarkupSafe==2.1.0 +numpy==1.22.3 +outcome==1.1.0 +pandas==1.4.1 +pycparser==2.21 +pyOpenSSL==22.0.0 +PySocks==1.7.1 +python-dateutil==2.8.2 +pytz==2021.3 +requests==2.27.1 +selenium==4.1.3 +six==1.16.0 +sniffio==1.2.0 +sortedcontainers==2.4.0 +soupsieve==2.3.1 +trio==0.20.0 +trio-websocket==0.9.2 +urllib3==1.26.8 +webencodings==0.5.1 +Werkzeug==2.0.3 +wsproto==1.1.0 diff --git a/requeriments.txt b/requeriments.txt new file mode 100644 index 00000000..3dcce5c7 --- /dev/null +++ b/requeriments.txt @@ -0,0 +1,54 @@ +# This file may be used to create an environment using: +# $ conda create --name <env> --file <this file> +# platform: osx-64 +async-generator=1.10=pypi_0 +attrs=21.4.0=pypi_0 +beautifulsoup4=4.10.0=pypi_0 +ca-certificates=2022.2.1=hecd8cb5_0 +certifi=2021.10.8=py39hecd8cb5_2 +cffi=1.15.0=pypi_0 +charset-normalizer=2.0.12=pypi_0 +click=8.0.4=pypi_0 +cryptography=36.0.1=pypi_0 +flask=2.0.3=pypi_0 +h11=0.13.0=pypi_0 +html5lib=1.1=pypi_0 +idna=3.3=pypi_0 +itsdangerous=2.1.1=pypi_0 +jinja2=3.0.3=pypi_0 +libcxx=12.0.0=h2f01273_0 +libffi=3.3=hb1e8313_2 
+lxml=4.8.0=pypi_0 +markupsafe=2.1.0=pypi_0 +ncurses=6.3=hca72f7f_2 +numpy=1.22.3=pypi_0 +openssl=1.1.1m=hca72f7f_0 +outcome=1.1.0=pypi_0 +pandas=1.4.1=pypi_0 +pip=21.2.4=py39hecd8cb5_0 +pycparser=2.21=pypi_0 +pyopenssl=22.0.0=pypi_0 +pysocks=1.7.1=pypi_0 +python=3.9.7=h88f2d9e_1 +python-dateutil=2.8.2=pypi_0 +pytz=2021.3=pypi_0 +readline=8.1.2=hca72f7f_1 +requests=2.27.1=pypi_0 +selenium=4.1.3=pypi_0 +setuptools=58.0.4=py39hecd8cb5_0 +six=1.16.0=pypi_0 +sniffio=1.2.0=pypi_0 +sortedcontainers=2.4.0=pypi_0 +soupsieve=2.3.1=pypi_0 +sqlite=3.37.2=h707629a_0 +tk=8.6.11=h7bc2e8c_0 +trio=0.20.0=pypi_0 +trio-websocket=0.9.2=pypi_0 +tzdata=2021e=hda174b7_0 +urllib3=1.26.8=pypi_0 +webencodings=0.5.1=pypi_0 +werkzeug=2.0.3=pypi_0 +wheel=0.37.1=pyhd3eb1b0_0 +wsproto=1.1.0=pypi_0 +xz=5.2.5=h1de35cc_0 +zlib=1.2.11=h4dc903c_4 diff --git a/templates/index.html b/templates/index.html new file mode 100644 index 00000000..9c4b8108 --- /dev/null +++ b/templates/index.html @@ -0,0 +1,30 @@ + + + + + + News Table + + +
+ + + + + + + + + + + +
Waiting click to save top news from 'spacemoney.com.br'
+
+ + \ No newline at end of file diff --git a/templates/news.html b/templates/news.html new file mode 100644 index 00000000..186b3486 --- /dev/null +++ b/templates/news.html @@ -0,0 +1,32 @@ + + + + + + News Table + + +
+ + + + + + + + + + {% for item in items %} + + + + + {% endfor %} + +
News
{{item}}
+
+ + \ No newline at end of file diff --git a/templates/savenews.html b/templates/savenews.html new file mode 100644 index 00000000..72ebf88b --- /dev/null +++ b/templates/savenews.html @@ -0,0 +1,31 @@ + + + + + + News Table + + +
+ + + + + + + + + + + +
The news has been saved
+
# ---------------------------------------------------------------------------
# topnews_appflask.py  (new file in this diff)
#
# Small Flask front end: "/savenews" scrapes the headlines and caches them in
# memory, "/news" and "/topnews" render the cached list.
# ---------------------------------------------------------------------------
from flask import Flask, render_template
from webcrawlear import WebCrawlear

app = Flask(__name__)

# In-memory cache shared by every route. It holds a single placeholder message
# until "/savenews" has replaced it with scraped headlines.
dicionario = {'lista_news': ['Lista ainda não foi carregada. [Para carregar utilize endpoint "/savenews"]']}


@app.route('/')
def index():
    """Render the landing page (``index.html``)."""
    return render_template('index.html')


@app.route('/topnews')
def topnews():
    """Render ``news.html`` with at most the first five cached entries."""
    return render_template('news.html', items=dicionario['lista_news'][:5])


@app.route('/news')
def news():
    """Render ``news.html`` with every cached entry."""
    return render_template('news.html', items=dicionario['lista_news'])


@app.route('/savenews')
def savenews():
    """Scrape the headlines, store them in the cache and confirm.

    The cache is only overwritten when the scrape succeeds: if
    ``get_list_news`` raises, the assignment never happens and the previous
    content stays in place.
    """
    webscrawlear = WebCrawlear()
    dicionario['lista_news'] = webscrawlear.get_list_news()
    return render_template('savenews.html')


# Start the development server only when this file is executed as a script
# (``python3 topnews_appflask.py``). Without the guard, merely importing the
# module (tests, ``flask run``, a WSGI server) would block inside ``app.run()``.
if __name__ == '__main__':
    app.run()


# ---------------------------------------------------------------------------
# webcrawlear.py  (new file in this diff)
#
# Selenium/BeautifulSoup scraper that collects the ``title`` attribute of the
# elements carrying a given CSS class on a news page.
# ---------------------------------------------------------------------------
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By


class WebCrawlear:
    """Scrape headline titles from a web page using headless Firefox.

    Typical use is a single call to :meth:`get_list_news`, which opens the
    browser, collects the titles and closes the browser again.
    """

    def __init__(self, url=None) -> None:
        """Store the page to scrape.

        Args:
            url: Page address. When omitted (or falsy) the SpaceMoney
                "ultimas-noticias" page is used.
        """
        spacemoney_url = 'https://www.spacemoney.com.br/ultimas-noticias'
        self.url = url or spacemoney_url
        self.driver = None  # set by make_driver()
        self.data_frame_news = None  # set by make_data_frame_news()

    def make_driver(self, url=None):
        """Start a headless Firefox session and load the page.

        Args:
            url: Optional address overriding ``self.url`` for this call.

        Raises:
            Whatever Selenium raises when the browser cannot be started or
            the page cannot be loaded. In the latter case the browser that
            was just started is closed before the error propagates, so no
            Firefox process is left behind.
        """
        option = Options()
        option.headless = True
        driver = webdriver.Firefox(options=option)
        try:
            driver.get(url or self.url)
        except Exception:
            driver.quit()
            raise
        # Publish the driver only once the page has actually loaded.
        self.driver = driver

    def get_elements_by_class_name(self, class_name):
        """Return the elements of the loaded page that carry ``class_name``.

        Requires :meth:`make_driver` to have been called first.
        """
        return self.driver.find_elements(By.CLASS_NAME, value=class_name)

    def extract_title(self, xpath):
        """Parse the ``title`` attribute of an element with BeautifulSoup.

        Args:
            xpath: A Selenium web element (the name is historical).

        Returns:
            A ``BeautifulSoup`` object built from the attribute value. An
            element without a ``title`` attribute yields an empty document
            instead of making ``BeautifulSoup`` fail on ``None``.
        """
        html_content_title = xpath.get_attribute('title') or ''
        return BeautifulSoup(html_content_title, 'html.parser')

    def make_data_frame_news(self, class_name='titulos'):
        """Collect the titles into a one-column DataFrame and close the browser.

        Args:
            class_name: CSS class of the elements holding the headlines.

        Returns:
            A ``pandas.DataFrame`` with a single ``'News'`` column, one row
            per element that has a non-blank title. The same frame is stored
            in ``self.data_frame_news``.
        """
        driver = self.driver
        titles = []
        try:
            for element in self.get_elements_by_class_name(class_name):
                # Store plain text rather than the parser object, so each
                # element contributes exactly one cell to the 'News' column.
                title = self.extract_title(element).get_text()
                if title.strip():
                    titles.append(title)
        finally:
            # Always release the browser, even when extraction fails.
            if driver is not None:
                driver.quit()
        self.data_frame_news = pd.DataFrame(titles, columns=['News'])
        return self.data_frame_news

    def get_list_news(self):
        """Open the page, scrape the titles and return them as nested lists.

        Returns:
            ``DataFrame.values.tolist()`` of the scraped frame, i.e. one
            single-element list per headline.
        """
        self.make_driver()
        dataframe_news = self.make_data_frame_news()
        return dataframe_news.values.tolist()


if __name__ == "__main__":
    webscralear = WebCrawlear(url='https://www.spacemoney.com.br/ultimas-noticias')
    teste = webscralear.get_list_news()
    print(teste)