From bb6139541d872d3960967b318b38aed93f53f0a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20G=C3=A9r=C3=B4me?= Date: Wed, 4 Dec 2024 12:08:27 +0100 Subject: [PATCH] fix(datasets): Set the correct engine for xls files & do not crash when DatasetVersionFile do not exist anymore (#864) --- hexa/datasets/queue.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/hexa/datasets/queue.py b/hexa/datasets/queue.py index e9e2b0d01..8a24942bc 100644 --- a/hexa/datasets/queue.py +++ b/hexa/datasets/queue.py @@ -51,7 +51,7 @@ def load_df(dataset_version_file: DatasetVersionFile) -> pd.DataFrame: mime_type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" or mime_type == "application/vnd.ms-excel" ): - return pd.read_excel(download_url) + return pd.read_excel(download_url, engine="openpyxl") elif ( mime_type == "application/vnd.apache.parquet" or dataset_version_file.filename.split(".")[-1] == "parquet" @@ -189,8 +189,14 @@ def add_system_attributes(version_file: DatasetVersionFile, df: pd.DataFrame | N # Set properties map -def generate_file_metadata_task(version_file: DatasetVersionFile) -> None: +def generate_file_metadata_task(file_id: str) -> None: """Task to extract a sample of tabular files, generate profiling metadata when possible and copy user defined attributes.""" + try: + version_file = DatasetVersionFile.objects.get(id=file_id) + except DatasetVersionFile.DoesNotExist: + logger.info(f"Dataset file {file_id} not found.") + return + logger.info("Generating metadata for file %s", version_file.id) df = None try: @@ -214,7 +220,7 @@ class DatasetsFileMetadataQueue(AtLeastOnceQueue): dataset_file_metadata_queue = DatasetsFileMetadataQueue( tasks={ "generate_file_metadata": lambda _, job: generate_file_metadata_task( - DatasetVersionFile.objects.get(id=job.args["file_id"]) + job.args["file_id"] ), }, notify_channel="dataset_file_metadata_queue",