
Commit

Merge branch 'master' into dependabot/pip/seaborn-0.13.2
Erik172 authored Sep 12, 2024
2 parents 06a0c06 + 77e7432 commit 81d1f5d
Showing 55 changed files with 177,201 additions and 528,161 deletions.
4 changes: 4 additions & 0 deletions .gitignore
@@ -7,10 +7,14 @@ apiv2.habi.co.json
habi.json
habi.jsonl
data/raw/*.json
data/processed/*.csv
data/processed/*.json
logs/*
bogota_apartments/spiders/gojom.py
.vscode/
data/interim/*.csv
images/
html/

### JupyterNotebooks ###
# gitignore template for Jupyter Notebooks
4 changes: 2 additions & 2 deletions ETL/03_data_enrichment.py
@@ -82,7 +82,7 @@ def haversine_m(lat1, lon1, lat2, lon2):

# Get TransMilenio stations data
logging.info('Getting TransMilenio stations data...')
response = requests.get('https://gis.transmilenio.gov.co/arcgis/rest/services/Troncal/consulta_estaciones_troncales/FeatureServer/1/query?where=1%3D1&outFields=*&f=json').json()
response = requests.get('https://gis.transmilenio.gov.co/arcgis/rest/services/Troncal/consulta_estaciones_troncales/FeatureServer/0/query?where=1%3D1&outFields=*&outSR=4326&f=json').json()
troncal_transmilenio = pd.DataFrame(response['features'])
troncal_transmilenio = pd.json_normalize(troncal_transmilenio['attributes'])

@@ -153,7 +153,7 @@ def is_cerca_estacion(row):
# Get parks data
parques = pd.read_csv('data/external/espacios_para_deporte_bogota/directorio-parques-y-escenarios-2023-datos-abiertos-v1.0.csv')

def get_distance_to_park(lat, lon, localidad = None) -> (str, float):
def get_distance_to_park(lat, lon, localidad = None):
"""
Calculates the distance between a given location and the nearest park.
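The rest of this function is collapsed in the diff. For orientation only, here is a minimal, hypothetical sketch of how a haversine helper such as `haversine_m` and a nearest-place lookup could fit together; the argument names, data shape, and return value are assumptions, not the repository's actual code:

```python
# Hypothetical sketch, not the actual ETL code: a haversine helper plus a
# nearest-place lookup over (name, lat, lon) tuples.
from math import radians, sin, cos, asin, sqrt

def haversine_m(lat1, lon1, lat2, lon2):
    """Great-circle distance in meters between two WGS84 coordinates."""
    lat1, lon1, lat2, lon2 = map(radians, (lat1, lon1, lat2, lon2))
    a = sin((lat2 - lat1) / 2) ** 2 + cos(lat1) * cos(lat2) * sin((lon2 - lon1) / 2) ** 2
    return 2 * 6_371_000 * asin(sqrt(a))  # mean Earth radius in meters

def nearest_place(lat, lon, places):
    """Return (name, distance_m) of the closest entry in `places`."""
    name, plat, plon = min(places, key=lambda p: haversine_m(lat, lon, p[1], p[2]))
    return name, haversine_m(lat, lon, plat, plon)
```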
69 changes: 39 additions & 30 deletions ETL/04_data_save.py
@@ -5,53 +5,62 @@
import logging
import os

# Cargar las variables de entorno desde el archivo .env
load_dotenv()

filename = f'logs/04_data_save.log'
# Configurar el registro de eventos
filename = 'logs/04_data_save.log'
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s', filename=filename)

if os.getcwd().split('/')[-1] == 'ETL':
logging.info('Cambiando directorio de trabajo')
# Cambiar el directorio de trabajo si es necesario
if os.path.basename(os.getcwd()) == 'ETL':
logging.info('Changing working directory')
os.chdir('..')

# Iniciar el proceso y registrar el inicio
logging.info(f'Process started at {datetime.now()}')
# Connect to MongoDB
logging.info('Connecting to MongoDB')
client = pymongo.MongoClient(os.getenv('MONGO_URI'))
db = client[os.getenv('MONGO_DATABASE')]
collection = db[os.getenv('MONGO_COLLECTION_PROCESSED')]

PREOCESSED_DATA = 'data/processed/apartments.csv'

# Read the processed data
logging.info('Reading the processed data')
try:
df = pd.read_csv(PREOCESSED_DATA, low_memory=False)
# Conectar a MongoDB
logging.info('Connecting to MongoDB')
client = pymongo.MongoClient(os.getenv('MONGO_URI'))
db = client[os.getenv('MONGO_DATABASE')]
collection = db['scrapy_bogota_apartments_processed']

# Ruta al archivo de datos procesados
PROCESSED_DATA = 'data/processed/apartments.csv'

# Leer los datos procesados desde el archivo CSV
logging.info('Reading the processed data')
df = pd.read_csv(PROCESSED_DATA, low_memory=False)
logging.info('Processed data read successfully')
except Exception as error:
logging.error(error)
exit(1)

# Save the processed data to MongoDB
logging.info('Saving the processed data to MongoDB')
# leer, buscar si existe, sie existe mirar si es igual, si es igual no hacer nada, si es diferente actualizar, si no existe insertar
try:
# Guardar los datos procesados en MongoDB
logging.info('Saving the processed data to MongoDB')
for index, row in df.iterrows():
apartment = collection.find_one({'codigo': row['codigo']})
if apartment is None:
collection.insert_one(row.to_dict())
else:
if apartment:
if apartment != row.to_dict():
collection.update_one({'codigo': row['codigo']}, {'$set': row.to_dict()})
else:
collection.insert_one(row.to_dict())

logging.info('Processed data saved successfully')

except Exception as error:
logging.error(error)
exit(1)
except FileNotFoundError as e:
logging.error(f'File not found: {e}')

except pd.errors.EmptyDataError as e:
logging.error(f'Empty data error: {e}')

except Exception as e:
logging.error(f'An error occurred: {e}')

finally:
# Cerrar la conexión a MongoDB
if 'client' in locals():
logging.info('Closing the connection to MongoDB')
client.close()

# Close the connection to MongoDB
logging.info('Closing the connection to MongoDB')
client.close()
logging.info(f'Process finished at {datetime.now()}')

logging.info(f'Process finished at {datetime.now()}')
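The loop above implements a manual "check if it exists, compare, then insert or update" flow, as the inline comment describes. As a usage note, a similar effect can be achieved with PyMongo's `upsert` option; a minimal sketch, assuming the same `codigo` key and the `df` DataFrame read above (unlike the loop, this writes even when the document is unchanged):

```python
# Hypothetical alternative to the find_one/insert_one/update_one loop above:
# let MongoDB upsert each record, keyed on the unique listing code.
for record in df.to_dict(orient='records'):
    collection.update_one(
        {'codigo': record['codigo']},  # match on the listing code
        {'$set': record},              # overwrite fields with the new values
        upsert=True,                   # insert when no matching document exists
    )
```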
File renamed without changes.
24 changes: 10 additions & 14 deletions README.md
@@ -10,9 +10,13 @@

![Bogota Apartments](https://i.ibb.co/6nfN4Z0/bogota-apartments02.png)

The last scrape date was: **January 3, 2024**
The last scrape date was: **September 1, 2024**

Version: **V2.0.0 JUNARY.1 2024**
Scraping started: **July 2024**

Version: **V2.0.0 AUGUST.2 2024**

Download the latest data: [https://github.com/builker-col/bogota-apartments/releases/tag/v2.0.0-august.2-2024](https://github.com/builker-col/bogota-apartments/releases/tag/v2.0.0-august.2-2024)

## Table of Contents
- [Bogota Apartments](#bogota-apartments)
@@ -56,12 +60,6 @@ _Este proyecto hace parte [Builker](https://github.com/Builker-col)._

## Configuration

A running Scrapy-Splash server on port **8050** is essential to run the scraper successfully. For more information on how to install scrapy-splash, see the [official documentation](https://splash.readthedocs.io/en/stable/install.html).

```bash
sudo docker run -d -p 8050:8050 scrapinghub/splash
```

If you want to run the project with the MongoDB services, you must create a `.env` file in the project root with the following environment variables:

```bash
@@ -107,7 +105,7 @@ Los datos del proyecto fueron extraídos mediante web scraping de los siguientes
- [Metrocuadrado](https://www.metrocuadrado.com/)
- [Habi](https://www.habi.co/)
A scraper was implemented with the [Scrapy](https://scrapy.org/) library, and when a website is built with JavaScript, [Scrapy](https://scrapy.org/) is combined with [scrapy-splash](https://github.com/scrapy-plugins/scrapy-splash) to extract the data.
A scraper was implemented with the [Scrapy](https://scrapy.org/) library, and when a website is built with JavaScript, [Scrapy](https://scrapy.org/) is combined with Selenium to extract the data.
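The spider code itself is not shown in this diff, so the following is only a minimal, hypothetical sketch of the Scrapy-plus-Selenium pattern described above; the spider name, start URL, and CSS selector are placeholders:

```python
# Hypothetical sketch: render a JavaScript-heavy page with Selenium, then
# parse the rendered HTML with Scrapy selectors.
import scrapy
from scrapy.selector import Selector
from selenium import webdriver


class JsListingSpider(scrapy.Spider):
    name = 'js_listing_example'            # placeholder name
    start_urls = ['https://example.com/']  # placeholder URL

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        self.driver = webdriver.Chrome(options=options)

    def parse(self, response):
        self.driver.get(response.url)  # let the browser execute the page's JavaScript
        rendered = Selector(text=self.driver.page_source)
        for title in rendered.css('h1::text').getall():
            yield {'title': title}

    def closed(self, reason):
        self.driver.quit()  # release the browser when the spider finishes
```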
## Data
@@ -119,7 +117,7 @@
### Apartments
file: [apartments.csv](data/processed/apartments.csv)
file: [processed_v2.0.0_august_2_2024.json](https://github.com/builker-col/bogota-apartments/releases/download/v2.0.0-august.2-2024/processed_v2.0.0_august_2_2024.json)
> ⚠️ **Warning**: The `coords_modified` column indicates whether the geographic coordinates were modified during data processing. If the value is `True`, the original coordinates were adjusted or corrected. Use these records with caution, as they may not reflect the apartment's exact geographic coordinates. Verify the precision and source of the coordinates before using them in applications or analyses that require a precise geographic location.
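For example, a minimal pandas sketch (assuming the processed CSV path used elsewhere in this repository and a boolean `coords_modified` column) that keeps only listings whose coordinates were not adjusted:

```python
import pandas as pd

df = pd.read_csv('data/processed/apartments.csv', low_memory=False)
# Keep only rows whose original coordinates were not modified during processing.
reliable = df[df['coords_modified'] == False]  # assumes a boolean column
print(f'{len(reliable)} of {len(df)} listings have unmodified coordinates')
```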
@@ -173,14 +171,14 @@ file: [apartments.csv](data/processed/apartments.csv)
| timeline | Price history of the apartment |
| url | URL of the apartment listing |
### Images
<!-- ### Images
file: [images.csv](data/processed/images.csv)
| Column | Description |
|--------------|--------------------------------------------------|
| codigo | Unique code identifying each apartment. |
| url_imagen | URL of the image associated with the apartment. |
| url_imagen | URL of the image associated with the apartment. | -->
### 2023 Data
With **version 2.0.0**, a major update to the data structure was carried out, which involved removing pre-2024 data from our database. If you need the 2023 data, you can download it from the following URL: [https://www.dropbox.com/scl/fi/nv1efc8me23dsa1ie0g5s/2023_bogota_apartments_processed.json?rlkey=l6cl2gsf8j2icyh5cqwkr4un5&dl=1](https://www.dropbox.com/scl/fi/nv1efc8me23dsa1ie0g5s/2023_bogota_apartments_processed.json?rlkey=l6cl2gsf8j2icyh5cqwkr4un5&dl=1)
@@ -189,8 +187,6 @@ Esta actualización asegura una estructura más optimizada y acorde con las nece
**Note:** The 2023 data is already processed and does not require any additional processing.
![Apartamentos extraidos por mes](visualizations/2023/type_apartments_by_month2023.png)
## Data Updates
The data extracted via web scraping will be updated regularly to keep it current. The key aspects of the update process are detailed below:
2 changes: 2 additions & 0 deletions bogota_apartments/items.py
@@ -154,6 +154,8 @@ class ApartmentsItem(scrapy.Item):

imagenes = scrapy.Field()

# imagenes_paths = scrapy.Field()

website = scrapy.Field(output_processor = TakeFirst())

datetime = scrapy.Field(output_processor = TakeFirst())
2 changes: 1 addition & 1 deletion bogota_apartments/pipelines.py
@@ -176,4 +176,4 @@ def process_item(self, item, spider):
return item

self.db[self.collection].insert_one(data)
return item
return item
36 changes: 10 additions & 26 deletions bogota_apartments/settings.py
@@ -1,12 +1,3 @@
# Scrapy settings for bogota_apartments project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from dotenv import load_dotenv
import os

@@ -17,30 +8,23 @@
SPIDER_MODULES = ['bogota_apartments.spiders']
NEWSPIDER_MODULE = 'bogota_apartments.spiders'

VERSION = '2.0.0'
VERSION = '2.1.0'

# Splash settings
SPLASH_URL = 'http://localhost:8050/' # send requests to render web pages and execute JavaScript code.
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter' # dupe filter is a mechanism that prevents Scrapy from making duplicate requests to a website.
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage' # stores the cache on the local file system

# Database settings - uncomment if you want to use MongoDB
MONGO_URI = os.getenv('MONGO_URI')
MONGO_DATABASE = os.getenv('MONGO_DATABASE')

if not os.getenv('MONGO_COLLECTION_RAW') or not os.getenv('MONGO_COLLECTION_PROCESSED'):
MONGO_COLLECTION_RAW = 'scrapy_bogota_apartments'
MONGO_COLLECTION_PROCESSED = 'scrapy_bogota_apartments_processed'

else:
MONGO_COLLECTION_RAW = os.getenv('MONGO_COLLECTION_RAW')
MONGO_COLLECTION_PROCESSED = os.getenv('MONGO_COLLECTION_PROCESSED')

# Asignación condicional con valores por defecto
MONGO_COLLECTION_RAW = os.getenv('MONGO_COLLECTION_RAW', 'scrapy_bogota_apartments')
MONGO_COLLECTION_PROCESSED = os.getenv('MONGO_COLLECTION_PROCESSED', 'scrapy_bogota_apartments_processed')

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = "bogota_apartments (+http://www.yourdomain.com)"
USER_AGENT = "bogota_apartments (+http://erik172.cloud)"

# Obey robots.txt rules
ROBOTSTXT_OBEY = True
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
@@ -75,8 +59,8 @@
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
'scrapy_splash.SplashCookiesMiddleware': 723, # This middleware handles cookies in requests made to Splash, and it is assigned the priority of 723
'scrapy_splash.SplashMiddleware': 725, # This middleware provides the integration between Scrapy and Splash and is assigned the priority of 725.
# 'scrapy_splash.SplashCookiesMiddleware': 723, # This middleware handles cookies in requests made to Splash, and it is assigned the priority of 723
# 'scrapy_splash.SplashMiddleware': 725, # This middleware provides the integration between Scrapy and Splash and is assigned the priority of 725.
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810, # This middleware is responsible for handling HTTP compression, and it is assigned the priority of 810.
'scrapy.downloadermiddlewares.retry.RetryMiddleware': 500,
}
@@ -120,6 +104,6 @@
FEED_EXPORT_ENCODING = 'utf-8'

# Logging settings
# LOG_STDOUT = True
LOG_STDOUT = False
# LOG_FILE = f'logs/scrapy_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'
# LOG_LEVEL = 'DEBUG'
2 changes: 2 additions & 0 deletions bogota_apartments/spiders/habi.py
@@ -1,3 +1,5 @@
# Author: Erik Garcia (@erik172)
# Version: Stable
from fake_useragent import UserAgent
from datetime import datetime
import json
