
Commit

Merge branch 'master' into dependabot/pip/seaborn-0.13.2
Erik172 authored Sep 12, 2024
2 parents 06a0c06 + 77e7432 commit 81d1f5d
Showing 55 changed files with 177,201 additions and 528,161 deletions.
4 changes: 4 additions & 0 deletions .gitignore
@@ -7,10 +7,14 @@ apiv2.habi.co.json
habi.json
habi.jsonl
data/raw/*.json
data/processed/*.csv
data/processed/*.json
logs/*
bogota_apartments/spiders/gojom.py
.vscode/
data/interim/*.csv
images/
html/

### JupyterNotebooks ###
# gitignore template for Jupyter Notebooks
4 changes: 2 additions & 2 deletions ETL/03_data_enrichment.py
@@ -82,7 +82,7 @@ def haversine_m(lat1, lon1, lat2, lon2):

# Get TransMilenio stations data
logging.info('Getting TransMilenio stations data...')
response = requests.get('https://gis.transmilenio.gov.co/arcgis/rest/services/Troncal/consulta_estaciones_troncales/FeatureServer/1/query?where=1%3D1&outFields=*&f=json').json()
response = requests.get('https://gis.transmilenio.gov.co/arcgis/rest/services/Troncal/consulta_estaciones_troncales/FeatureServer/0/query?where=1%3D1&outFields=*&outSR=4326&f=json').json()
troncal_transmilenio = pd.DataFrame(response['features'])
troncal_transmilenio = pd.json_normalize(troncal_transmilenio['attributes'])

@@ -153,7 +153,7 @@ def is_cerca_estacion(row):
# Get parks data
parques = pd.read_csv('data/external/espacios_para_deporte_bogota/directorio-parques-y-escenarios-2023-datos-abiertos-v1.0.csv')

def get_distance_to_park(lat, lon, localidad = None) -> (str, float):
def get_distance_to_park(lat, lon, localidad = None):
"""
Calculates the distance between a given location and the nearest park.
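The rest of this function is collapsed in the diff. For orientation only, here is a minimal, hypothetical sketch of how a haversine helper such as `haversine_m` and a nearest-place lookup could fit together; the argument names, data shape, and return value are assumptions, not the repository's actual code:

```python
# Hypothetical sketch, not the actual ETL code: a haversine helper plus a
# nearest-place lookup over (name, lat, lon) tuples.
from math import radians, sin, cos, asin, sqrt

def haversine_m(lat1, lon1, lat2, lon2):
    """Great-circle distance in meters between two WGS84 coordinates."""
    lat1, lon1, lat2, lon2 = map(radians, (lat1, lon1, lat2, lon2))
    a = sin((lat2 - lat1) / 2) ** 2 + cos(lat1) * cos(lat2) * sin((lon2 - lon1) / 2) ** 2
    return 2 * 6_371_000 * asin(sqrt(a))  # mean Earth radius in meters

def nearest_place(lat, lon, places):
    """Return (name, distance_m) of the closest entry in `places`."""
    name, plat, plon = min(places, key=lambda p: haversine_m(lat, lon, p[1], p[2]))
    return name, haversine_m(lat, lon, plat, plon)
```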
69 changes: 39 additions & 30 deletions ETL/04_data_save.py
@@ -5,53 +5,62 @@
import logging
import os

# Cargar las variables de entorno desde el archivo .env
load_dotenv()

filename = f'logs/04_data_save.log'
# Configurar el registro de eventos
filename = 'logs/04_data_save.log'
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s', filename=filename)

if os.getcwd().split('/')[-1] == 'ETL':
logging.info('Cambiando directorio de trabajo')
# Cambiar el directorio de trabajo si es necesario
if os.path.basename(os.getcwd()) == 'ETL':
logging.info('Changing working directory')
os.chdir('..')

# Iniciar el proceso y registrar el inicio
logging.info(f'Process started at {datetime.now()}')
# Connect to MongoDB
logging.info('Connecting to MongoDB')
client = pymongo.MongoClient(os.getenv('MONGO_URI'))
db = client[os.getenv('MONGO_DATABASE')]
collection = db[os.getenv('MONGO_COLLECTION_PROCESSED')]

PREOCESSED_DATA = 'data/processed/apartments.csv'

# Read the processed data
logging.info('Reading the processed data')
try:
df = pd.read_csv(PREOCESSED_DATA, low_memory=False)
# Conectar a MongoDB
logging.info('Connecting to MongoDB')
client = pymongo.MongoClient(os.getenv('MONGO_URI'))
db = client[os.getenv('MONGO_DATABASE')]
collection = db['scrapy_bogota_apartments_processed']

# Ruta al archivo de datos procesados
PROCESSED_DATA = 'data/processed/apartments.csv'

# Leer los datos procesados desde el archivo CSV
logging.info('Reading the processed data')
df = pd.read_csv(PROCESSED_DATA, low_memory=False)
logging.info('Processed data read successfully')
except Exception as error:
logging.error(error)
exit(1)

# Save the processed data to MongoDB
logging.info('Saving the processed data to MongoDB')
# leer, buscar si existe, sie existe mirar si es igual, si es igual no hacer nada, si es diferente actualizar, si no existe insertar
try:
# Guardar los datos procesados en MongoDB
logging.info('Saving the processed data to MongoDB')
for index, row in df.iterrows():
apartment = collection.find_one({'codigo': row['codigo']})
if apartment is None:
collection.insert_one(row.to_dict())
else:
if apartment:
if apartment != row.to_dict():
collection.update_one({'codigo': row['codigo']}, {'$set': row.to_dict()})
else:
collection.insert_one(row.to_dict())

logging.info('Processed data saved successfully')

except Exception as error:
logging.error(error)
exit(1)
except FileNotFoundError as e:
logging.error(f'File not found: {e}')

except pd.errors.EmptyDataError as e:
logging.error(f'Empty data error: {e}')

except Exception as e:
logging.error(f'An error occurred: {e}')

finally:
# Cerrar la conexión a MongoDB
if 'client' in locals():
logging.info('Closing the connection to MongoDB')
client.close()

# Close the connection to MongoDB
logging.info('Closing the connection to MongoDB')
client.close()
logging.info(f'Process finished at {datetime.now()}')

logging.info(f'Process finished at {datetime.now()}')
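The loop above implements a manual "check if it exists, compare, then insert or update" flow, as the inline comment describes. As a usage note, a similar effect can be achieved with PyMongo's `upsert` option; a minimal sketch, assuming the same `codigo` key and the `df` DataFrame read above (unlike the loop, this writes even when the document is unchanged):

```python
# Hypothetical alternative to the find_one/insert_one/update_one loop above:
# let MongoDB upsert each record, keyed on the unique listing code.
for record in df.to_dict(orient='records'):
    collection.update_one(
        {'codigo': record['codigo']},  # match on the listing code
        {'$set': record},              # overwrite fields with the new values
        upsert=True,                   # insert when no matching document exists
    )
```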
File renamed without changes.
24 changes: 10 additions & 14 deletions README.md
@@ -10,9 +10,13 @@

![Bogota Apartments](https://i.ibb.co/6nfN4Z0/bogota-apartments02.png)

The last scrape date was: **January 3, 2024**
The last scrape date was: **September 1, 2024**

Version: **V2.0.0 JUNARY.1 2024**
Scraping started: **July 2024**

Version: **V2.0.0 AUGUST.2 2024**

Download the latest data: [https://github.com/builker-col/bogota-apartments/releases/tag/v2.0.0-august.2-2024](https://github.com/builker-col/bogota-apartments/releases/tag/v2.0.0-august.2-2024)

## Table of Contents
- [Bogota Apartments](#bogota-apartments)
@@ -56,12 +60,6 @@ _Este proyecto hace parte [Builker](https://github.com/Builker-col)._

## Configuration

A running Scrapy-Splash server on port **8050** is essential to run the scraper successfully. For more information on how to install scrapy-splash, see the [official documentation](https://splash.readthedocs.io/en/stable/install.html).

```bash
sudo docker run -d -p 8050:8050 scrapinghub/splash
```

If you want to run the project with the MongoDB services, you must create a `.env` file in the project root with the following environment variables:

```bash
@@ -107,7 +105,7 @@ Los datos del proyecto fueron extraídos mediante web scraping de los siguientes
- [Metrocuadrado](https://www.metrocuadrado.com/)
- [Habi](https://www.habi.co/)
A scraper was implemented with the [Scrapy](https://scrapy.org/) library, and when a website is built with JavaScript, [Scrapy](https://scrapy.org/) is combined with [scrapy-splash](https://github.com/scrapy-plugins/scrapy-splash) to extract the data.
A scraper was implemented with the [Scrapy](https://scrapy.org/) library, and when a website is built with JavaScript, [Scrapy](https://scrapy.org/) is combined with Selenium to extract the data.
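The spider code itself is not shown in this diff, so the following is only a minimal, hypothetical sketch of the Scrapy-plus-Selenium pattern described above; the spider name, start URL, and CSS selector are placeholders:

```python
# Hypothetical sketch: render a JavaScript-heavy page with Selenium, then
# parse the rendered HTML with Scrapy selectors.
import scrapy
from scrapy.selector import Selector
from selenium import webdriver


class JsListingSpider(scrapy.Spider):
    name = 'js_listing_example'            # placeholder name
    start_urls = ['https://example.com/']  # placeholder URL

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        self.driver = webdriver.Chrome(options=options)

    def parse(self, response):
        self.driver.get(response.url)  # let the browser execute the page's JavaScript
        rendered = Selector(text=self.driver.page_source)
        for title in rendered.css('h1::text').getall():
            yield {'title': title}

    def closed(self, reason):
        self.driver.quit()  # release the browser when the spider finishes
```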
## Data
@@ -119,7 +117,7 @@
### Apartments
file: [apartments.csv](data/processed/apartments.csv)
file: [processed_v2.0.0_august_2_2024.json](https://github.com/builker-col/bogota-apartments/releases/download/v2.0.0-august.2-2024/processed_v2.0.0_august_2_2024.json)
> ⚠️ **Warning**: The `coords_modified` column indicates whether the geographic coordinates were modified during data processing. If the value is `True`, the original coordinates were adjusted or corrected. Use these records with caution, as they may not reflect the apartment's exact geographic coordinates. Verify the precision and source of the coordinates before using them in applications or analyses that require a precise geographic location.
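For example, a minimal pandas sketch (assuming the processed CSV path used elsewhere in this repository and a boolean `coords_modified` column) that keeps only listings whose coordinates were not adjusted:

```python
import pandas as pd

df = pd.read_csv('data/processed/apartments.csv', low_memory=False)
# Keep only rows whose original coordinates were not modified during processing.
reliable = df[df['coords_modified'] == False]  # assumes a boolean column
print(f'{len(reliable)} of {len(df)} listings have unmodified coordinates')
```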
@@ -173,14 +171,14 @@ file: [apartments.csv](data/processed/apartments.csv)
| timeline | Price history of the apartment |
| url | URL of the apartment listing |
### Images
<!-- ### Images
file: [images.csv](data/processed/images.csv)
| Column | Description |
|--------------|--------------------------------------------------|
| codigo | Unique code identifying each apartment. |
| url_imagen | URL of the image associated with the apartment. |
| url_imagen | URL of the image associated with the apartment. | -->
### 2023 Data
With **version 2.0.0**, a major update to the data structure was carried out, which involved removing pre-2024 data from our database. If you need the 2023 data, you can download it from the following URL: [https://www.dropbox.com/scl/fi/nv1efc8me23dsa1ie0g5s/2023_bogota_apartments_processed.json?rlkey=l6cl2gsf8j2icyh5cqwkr4un5&dl=1](https://www.dropbox.com/scl/fi/nv1efc8me23dsa1ie0g5s/2023_bogota_apartments_processed.json?rlkey=l6cl2gsf8j2icyh5cqwkr4un5&dl=1)
@@ -189,8 +187,6 @@ Esta actualización asegura una estructura más optimizada y acorde con las nece
**Note:** The 2023 data is already processed and does not require any additional processing.
![Apartamentos extraidos por mes](visualizations/2023/type_apartments_by_month2023.png)
## Data Updates
The data extracted via web scraping will be updated regularly to keep it current. The key aspects of the update process are detailed below:
2 changes: 2 additions & 0 deletions bogota_apartments/items.py
@@ -154,6 +154,8 @@ class ApartmentsItem(scrapy.Item):

imagenes = scrapy.Field()

# imagenes_paths = scrapy.Field()

website = scrapy.Field(output_processor = TakeFirst())

datetime = scrapy.Field(output_processor = TakeFirst())
2 changes: 1 addition & 1 deletion bogota_apartments/pipelines.py
@@ -176,4 +176,4 @@ def process_item(self, item, spider):
return item

self.db[self.collection].insert_one(data)
return item
return item
36 changes: 10 additions & 26 deletions bogota_apartments/settings.py
@@ -1,12 +1,3 @@
# Scrapy settings for bogota_apartments project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from dotenv import load_dotenv
import os

@@ -17,30 +8,23 @@
SPIDER_MODULES = ['bogota_apartments.spiders']
NEWSPIDER_MODULE = 'bogota_apartments.spiders'

VERSION = '2.0.0'
VERSION = '2.1.0'

# Splash settings
SPLASH_URL = 'http://localhost:8050/' # send requests to render web pages and execute JavaScript code.
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter' # dupe filter is a mechanism that prevents Scrapy from making duplicate requests to a website.
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage' # stores the cache on the local file system

# Database settings - uncomment if you want to use MongoDB
MONGO_URI = os.getenv('MONGO_URI')
MONGO_DATABASE = os.getenv('MONGO_DATABASE')

if not os.getenv('MONGO_COLLECTION_RAW') or not os.getenv('MONGO_COLLECTION_PROCESSED'):
MONGO_COLLECTION_RAW = 'scrapy_bogota_apartments'
MONGO_COLLECTION_PROCESSED = 'scrapy_bogota_apartments_processed'

else:
MONGO_COLLECTION_RAW = os.getenv('MONGO_COLLECTION_RAW')
MONGO_COLLECTION_PROCESSED = os.getenv('MONGO_COLLECTION_PROCESSED')

# Asignación condicional con valores por defecto
MONGO_COLLECTION_RAW = os.getenv('MONGO_COLLECTION_RAW', 'scrapy_bogota_apartments')
MONGO_COLLECTION_PROCESSED = os.getenv('MONGO_COLLECTION_PROCESSED', 'scrapy_bogota_apartments_processed')

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = "bogota_apartments (+http://www.yourdomain.com)"
USER_AGENT = "bogota_apartments (+http://erik172.cloud)"

# Obey robots.txt rules
ROBOTSTXT_OBEY = True
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
@@ -75,8 +59,8 @@
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
'scrapy_splash.SplashCookiesMiddleware': 723, # This middleware handles cookies in requests made to Splash, and it is assigned the priority of 723
'scrapy_splash.SplashMiddleware': 725, # This middleware provides the integration between Scrapy and Splash and is assigned the priority of 725.
# 'scrapy_splash.SplashCookiesMiddleware': 723, # This middleware handles cookies in requests made to Splash, and it is assigned the priority of 723
# 'scrapy_splash.SplashMiddleware': 725, # This middleware provides the integration between Scrapy and Splash and is assigned the priority of 725.
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810, # This middleware is responsible for handling HTTP compression, and it is assigned the priority of 810.
'scrapy.downloadermiddlewares.retry.RetryMiddleware': 500,
}
@@ -120,6 +104,6 @@
FEED_EXPORT_ENCODING = 'utf-8'

# Logging settings
# LOG_STDOUT = True
LOG_STDOUT = False
# LOG_FILE = f'logs/scrapy_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'
# LOG_LEVEL = 'DEBUG'
2 changes: 2 additions & 0 deletions bogota_apartments/spiders/habi.py
@@ -1,3 +1,5 @@
# Author: Erik Garcia (@erik172)
# Version: Stable
from fake_useragent import UserAgent
from datetime import datetime
import json
