From a86f0acc13ae1c48c96c5182744bc2cfb41077f3 Mon Sep 17 00:00:00 2001
From: nazarfil
Date: Tue, 18 Jun 2024 10:57:09 +0200
Subject: [PATCH 01/37] feat(Dataset): add initial background queue, task, work and trigger to enqueue task

---
 hexa/datasets/management/commands/worker.py | 7 +++++++
 1 file changed, 7 insertions(+)
 create mode 100644 hexa/datasets/management/commands/worker.py

diff --git a/hexa/datasets/management/commands/worker.py b/hexa/datasets/management/commands/worker.py
new file mode 100644
index 000000000..0c6578270
--- /dev/null
+++ b/hexa/datasets/management/commands/worker.py
@@ -0,0 +1,7 @@
+from dpq.commands import Worker
+
+from hexa.datasets.queue import dataset_snapshot_queue
+
+
+class Command(Worker):
+    queue = dataset_snapshot_queue

From be8abab371a87601c4b1ed616302720d386d32b0 Mon Sep 17 00:00:00 2001
From: nazarfil
Date: Sun, 23 Jun 2024 15:42:05 +0200
Subject: [PATCH 02/37] feat(Dataset): job task to generate snapshot

---
 hexa/datasets/migrations/__init__.py |  0
 hexa/files/basefs.py                 |  4 ++++
 hexa/files/gcp.py                    | 16 ++++++++++++++++
 hexa/files/s3.py                     | 11 +++++++++++
 hexa/files/tests/mocks/client.py     |  8 ++++++++
 hexa/files/tests/test_api.py         | 17 +++++++++++++++++
 6 files changed, 56 insertions(+)
 delete mode 100644 hexa/datasets/migrations/__init__.py

diff --git a/hexa/datasets/migrations/__init__.py b/hexa/datasets/migrations/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/hexa/files/basefs.py b/hexa/files/basefs.py
index b35e8134d..8d6403e09 100644
--- a/hexa/files/basefs.py
+++ b/hexa/files/basefs.py
@@ -97,3 +97,7 @@ def generate_upload_url(
     @abstractmethod
     def get_token_as_env_variables(self, token):
         pass
+
+    @abstractmethod
+    def read_object_lines(self, bucket_name: str, filename: str, lines_number: int):
+        pass
diff --git a/hexa/files/gcp.py b/hexa/files/gcp.py
index a3199db6c..6f8d6b3fa 100644
--- a/hexa/files/gcp.py
+++ b/hexa/files/gcp.py
@@ -1,4 +1,5 @@
 import base64
+import io
 import json
 
 import requests
@@ -347,3 +348,18 @@ def get_token_as_env_variables(self, token):
             "GCS_TOKEN": token, # FIXME: Once we have deployed the new openhexa-bslq-environment image and upgraded the openhexa-app, we can remove this line
             "WORKSPACE_STORAGE_ENGINE_GCP_ACCESS_TOKEN": token,
         }
+
+    def read_object_lines(self, bucket_name: str, filename: str, lines_number: int):
+        client = get_storage_client()
+        bucket = client.get_bucket(bucket_name)
+        blob = bucket.get_blob(filename)
+
+        with io.BytesIO() as file_obj:
+            blob.download_to_file(file_obj)
+            file_obj.seek(0)
+            lines = file_obj.readlines()
+
+        max_lines = min(lines_number, len(lines))
+        print(max_lines, lines_number, len(lines))
+        specific_lines = [lines[i].decode("utf-8").strip() for i in range(max_lines)]
+        return specific_lines
diff --git a/hexa/files/s3.py b/hexa/files/s3.py
index c58d07711..f85b5021e 100644
--- a/hexa/files/s3.py
+++ b/hexa/files/s3.py
@@ -1,4 +1,5 @@
 import base64
+import io
 import json
 
 import boto3
@@ -436,3 +437,13 @@ def get_token_as_env_variables(self, token):
                 json.dumps(json_config).encode()
             ).decode(),
         }
+
+    def read_object_lines(self, bucket_name: str, filename: str, lines_number: int):
+        s3 = get_storage_client()
+        object = s3.get_object(Bucket=bucket_name, Key=filename)
+        file_stream = io.BytesIO(object["Body"].read())
+        file_stream.seek(0)
+        lines = file_stream.readlines()
+
+        specific_lines = [lines[i].decode("utf-8").strip() for i in range(lines_number)]
+        return specific_lines
diff --git a/hexa/files/tests/mocks/client.py b/hexa/files/tests/mocks/client.py
index 10df5d7d9..7238a89bd 100644 --- a/hexa/files/tests/mocks/client.py +++ b/hexa/files/tests/mocks/client.py @@ -1,3 +1,6 @@ +from random import choice +from string import ascii_lowercase, digits + from google.api_core import page_iterator from google.cloud.exceptions import Conflict, NotFound @@ -269,3 +272,8 @@ def list_buckets( return MockHTTPIterator( items=buckets, max_results=max_results, page_size=page_size ) + + def read_object_lines(self, bucket: str, file_path: str, num_lines=None): + chars = ascii_lowercase + digits + lst = ["".join(choice(chars) for _ in range(2)) for _ in range(num_lines)] + return lst diff --git a/hexa/files/tests/test_api.py b/hexa/files/tests/test_api.py index 28123be0d..bfd3f8ebf 100644 --- a/hexa/files/tests/test_api.py +++ b/hexa/files/tests/test_api.py @@ -532,3 +532,20 @@ def get_type(self): class APIGcpTestCase(APITestCase, OnlyGCP, TestCase): def get_type(self): return "gcp" + + +class TestDownloadFromCloudStorage(TestCase): + def test_get_from_gcp(self): + pass + # lines = get_storage("gcp").read_object_lines( + # "hexa-test-datasets", + # "3237e8c2-896d-4628-9054-59d69c785a11/add8469f-14d6-4081-8e02-adb53016f7bd/people.csv", + # 1, + # ) + # print(f"Lines are : {lines}") + # self.assertEqual( + # lines, + # [ + # "3237e8c2-896d-4628-9054-59d69c785a11/add8469f-14d6-4081-8e02-adb53016f7bd/people.csv" + # ], + # ) From e9745088ca164472a083e086e3bd8601ba7ca169 Mon Sep 17 00:00:00 2001 From: nazarfil Date: Thu, 4 Jul 2024 14:17:33 +0200 Subject: [PATCH 03/37] fix(Datasets): fixes migrations --- .../migrations/0005_datasetsnapshotjob.py | 43 +++++++++++++++++++ hexa/datasets/migrations/__init__.py | 0 hexa/files/gcp.py | 1 - hexa/files/tests/test_api.py | 17 -------- 4 files changed, 43 insertions(+), 18 deletions(-) create mode 100644 hexa/datasets/migrations/0005_datasetsnapshotjob.py create mode 100644 hexa/datasets/migrations/__init__.py diff --git a/hexa/datasets/migrations/0005_datasetsnapshotjob.py b/hexa/datasets/migrations/0005_datasetsnapshotjob.py new file mode 100644 index 000000000..eb2969718 --- /dev/null +++ b/hexa/datasets/migrations/0005_datasetsnapshotjob.py @@ -0,0 +1,43 @@ +# Generated by Django 5.0.3 on 2024-07-04 12:08 + +import django.contrib.postgres.functions +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("datasets", "0004_datasetversion_pipeline_run"), + ] + + operations = [ + migrations.CreateModel( + name="DatasetSnapshotJob", + fields=[ + ("id", models.BigAutoField(primary_key=True, serialize=False)), + ( + "created_at", + models.DateTimeField( + default=django.contrib.postgres.functions.TransactionNow + ), + ), + ( + "execute_at", + models.DateTimeField( + default=django.contrib.postgres.functions.TransactionNow + ), + ), + ( + "priority", + models.IntegerField( + default=0, + help_text="Jobs with higher priority will be processed first.", + ), + ), + ("task", models.CharField(max_length=255)), + ("args", models.JSONField()), + ], + options={ + "db_table": "catalog_datasetsnapshotjob", + }, + ), + ] diff --git a/hexa/datasets/migrations/__init__.py b/hexa/datasets/migrations/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/hexa/files/gcp.py b/hexa/files/gcp.py index 6f8d6b3fa..972dfcda6 100644 --- a/hexa/files/gcp.py +++ b/hexa/files/gcp.py @@ -360,6 +360,5 @@ def read_object_lines(self, bucket_name: str, filename: str, lines_number: int): lines = file_obj.readlines() max_lines = min(lines_number, len(lines)) - print(max_lines, 
lines_number, len(lines)) specific_lines = [lines[i].decode("utf-8").strip() for i in range(max_lines)] return specific_lines diff --git a/hexa/files/tests/test_api.py b/hexa/files/tests/test_api.py index bfd3f8ebf..28123be0d 100644 --- a/hexa/files/tests/test_api.py +++ b/hexa/files/tests/test_api.py @@ -532,20 +532,3 @@ def get_type(self): class APIGcpTestCase(APITestCase, OnlyGCP, TestCase): def get_type(self): return "gcp" - - -class TestDownloadFromCloudStorage(TestCase): - def test_get_from_gcp(self): - pass - # lines = get_storage("gcp").read_object_lines( - # "hexa-test-datasets", - # "3237e8c2-896d-4628-9054-59d69c785a11/add8469f-14d6-4081-8e02-adb53016f7bd/people.csv", - # 1, - # ) - # print(f"Lines are : {lines}") - # self.assertEqual( - # lines, - # [ - # "3237e8c2-896d-4628-9054-59d69c785a11/add8469f-14d6-4081-8e02-adb53016f7bd/people.csv" - # ], - # ) From 1c0b5f090dea27da485c355ad25ef17c55c6ff9b Mon Sep 17 00:00:00 2001 From: nazarfil Date: Fri, 5 Jul 2024 11:04:36 +0200 Subject: [PATCH 04/37] feat(Dataset): adds worker to docker compose --- .../management/commands/{worker.py => dataset_snapshot_worker.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename hexa/datasets/management/commands/{worker.py => dataset_snapshot_worker.py} (100%) diff --git a/hexa/datasets/management/commands/worker.py b/hexa/datasets/management/commands/dataset_snapshot_worker.py similarity index 100% rename from hexa/datasets/management/commands/worker.py rename to hexa/datasets/management/commands/dataset_snapshot_worker.py From a607afc0b39a09dc6a86ba4dc893f3385e29f677 Mon Sep 17 00:00:00 2001 From: nazarfil Date: Fri, 12 Jul 2024 09:44:51 +0200 Subject: [PATCH 05/37] chore: renamed snapshot to file_metadata --- .../commands/dataset_snapshot_worker.py | 4 +- .../migrations/0005_datasetsnapshotjob.py | 43 ------------------- 2 files changed, 2 insertions(+), 45 deletions(-) delete mode 100644 hexa/datasets/migrations/0005_datasetsnapshotjob.py diff --git a/hexa/datasets/management/commands/dataset_snapshot_worker.py b/hexa/datasets/management/commands/dataset_snapshot_worker.py index 0c6578270..d7389ccaf 100644 --- a/hexa/datasets/management/commands/dataset_snapshot_worker.py +++ b/hexa/datasets/management/commands/dataset_snapshot_worker.py @@ -1,7 +1,7 @@ from dpq.commands import Worker -from hexa.datasets.queue import dataset_snapshot_queue +from hexa.datasets.queue import dataset_file_metadata_queue class Command(Worker): - queue = dataset_snapshot_queue + queue = dataset_file_metadata_queue diff --git a/hexa/datasets/migrations/0005_datasetsnapshotjob.py b/hexa/datasets/migrations/0005_datasetsnapshotjob.py deleted file mode 100644 index eb2969718..000000000 --- a/hexa/datasets/migrations/0005_datasetsnapshotjob.py +++ /dev/null @@ -1,43 +0,0 @@ -# Generated by Django 5.0.3 on 2024-07-04 12:08 - -import django.contrib.postgres.functions -from django.db import migrations, models - - -class Migration(migrations.Migration): - dependencies = [ - ("datasets", "0004_datasetversion_pipeline_run"), - ] - - operations = [ - migrations.CreateModel( - name="DatasetSnapshotJob", - fields=[ - ("id", models.BigAutoField(primary_key=True, serialize=False)), - ( - "created_at", - models.DateTimeField( - default=django.contrib.postgres.functions.TransactionNow - ), - ), - ( - "execute_at", - models.DateTimeField( - default=django.contrib.postgres.functions.TransactionNow - ), - ), - ( - "priority", - models.IntegerField( - default=0, - help_text="Jobs with higher priority will be processed first.", - 
), - ), - ("task", models.CharField(max_length=255)), - ("args", models.JSONField()), - ], - options={ - "db_table": "catalog_datasetsnapshotjob", - }, - ), - ] From 7eff6e775ed79f74c4ca87d2554e135f48d010f4 Mon Sep 17 00:00:00 2001 From: nazarfil Date: Fri, 12 Jul 2024 10:55:53 +0200 Subject: [PATCH 06/37] remove not needed methods --- hexa/files/basefs.py | 4 ---- hexa/files/gcp.py | 15 --------------- hexa/files/s3.py | 11 ----------- hexa/files/tests/mocks/client.py | 8 -------- 4 files changed, 38 deletions(-) diff --git a/hexa/files/basefs.py b/hexa/files/basefs.py index 8d6403e09..b35e8134d 100644 --- a/hexa/files/basefs.py +++ b/hexa/files/basefs.py @@ -97,7 +97,3 @@ def generate_upload_url( @abstractmethod def get_token_as_env_variables(self, token): pass - - @abstractmethod - def read_object_lines(self, bucket_name: str, filename: str, lines_number: int): - pass diff --git a/hexa/files/gcp.py b/hexa/files/gcp.py index 972dfcda6..a3199db6c 100644 --- a/hexa/files/gcp.py +++ b/hexa/files/gcp.py @@ -1,5 +1,4 @@ import base64 -import io import json import requests @@ -348,17 +347,3 @@ def get_token_as_env_variables(self, token): "GCS_TOKEN": token, # FIXME: Once we have deployed the new openhexa-bslq-environment image and upgraded the openhexa-app, we can remove this line "WORKSPACE_STORAGE_ENGINE_GCP_ACCESS_TOKEN": token, } - - def read_object_lines(self, bucket_name: str, filename: str, lines_number: int): - client = get_storage_client() - bucket = client.get_bucket(bucket_name) - blob = bucket.get_blob(filename) - - with io.BytesIO() as file_obj: - blob.download_to_file(file_obj) - file_obj.seek(0) - lines = file_obj.readlines() - - max_lines = min(lines_number, len(lines)) - specific_lines = [lines[i].decode("utf-8").strip() for i in range(max_lines)] - return specific_lines diff --git a/hexa/files/s3.py b/hexa/files/s3.py index f85b5021e..c58d07711 100644 --- a/hexa/files/s3.py +++ b/hexa/files/s3.py @@ -1,5 +1,4 @@ import base64 -import io import json import boto3 @@ -437,13 +436,3 @@ def get_token_as_env_variables(self, token): json.dumps(json_config).encode() ).decode(), } - - def read_object_lines(self, bucket_name: str, filename: str, lines_number: int): - s3 = get_storage_client() - object = s3.get_object(Bucket=bucket_name, Key=filename) - file_stream = io.BytesIO(object["Body"].read()) - file_stream.seek(0) - lines = file_stream.readlines() - - specific_lines = [lines[i].decode("utf-8").strip() for i in range(lines_number)] - return specific_lines diff --git a/hexa/files/tests/mocks/client.py b/hexa/files/tests/mocks/client.py index 7238a89bd..10df5d7d9 100644 --- a/hexa/files/tests/mocks/client.py +++ b/hexa/files/tests/mocks/client.py @@ -1,6 +1,3 @@ -from random import choice -from string import ascii_lowercase, digits - from google.api_core import page_iterator from google.cloud.exceptions import Conflict, NotFound @@ -272,8 +269,3 @@ def list_buckets( return MockHTTPIterator( items=buckets, max_results=max_results, page_size=page_size ) - - def read_object_lines(self, bucket: str, file_path: str, num_lines=None): - chars = ascii_lowercase + digits - lst = ["".join(choice(chars) for _ in range(2)) for _ in range(num_lines)] - return lst From ac411c569c0ddf0f77167fa092f209918371c149 Mon Sep 17 00:00:00 2001 From: nazarfil Date: Wed, 17 Jul 2024 09:14:19 +0200 Subject: [PATCH 07/37] chore: rename worker to dataset-worker --- .../management/commands/dataset_snapshot_worker.py | 7 ------- 1 file changed, 7 deletions(-) delete mode 100644 
hexa/datasets/management/commands/dataset_snapshot_worker.py diff --git a/hexa/datasets/management/commands/dataset_snapshot_worker.py b/hexa/datasets/management/commands/dataset_snapshot_worker.py deleted file mode 100644 index d7389ccaf..000000000 --- a/hexa/datasets/management/commands/dataset_snapshot_worker.py +++ /dev/null @@ -1,7 +0,0 @@ -from dpq.commands import Worker - -from hexa.datasets.queue import dataset_file_metadata_queue - - -class Command(Worker): - queue = dataset_file_metadata_queue From c81a551b0125e8e991ed739f38ffb6f01a8bd5cc Mon Sep 17 00:00:00 2001 From: nazarfil Date: Tue, 18 Jun 2024 10:57:09 +0200 Subject: [PATCH 08/37] feat(Dataset): add initial background queue, task, work and trigger to enqueue task --- hexa/datasets/management/commands/worker.py | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 hexa/datasets/management/commands/worker.py diff --git a/hexa/datasets/management/commands/worker.py b/hexa/datasets/management/commands/worker.py new file mode 100644 index 000000000..0c6578270 --- /dev/null +++ b/hexa/datasets/management/commands/worker.py @@ -0,0 +1,7 @@ +from dpq.commands import Worker + +from hexa.datasets.queue import dataset_snapshot_queue + + +class Command(Worker): + queue = dataset_snapshot_queue From 1a86b91d68a45ccb0c74d1196ba23765d826d5b9 Mon Sep 17 00:00:00 2001 From: nazarfil Date: Sun, 23 Jun 2024 15:42:05 +0200 Subject: [PATCH 09/37] feat(Dataset): job task to generate snapshot --- hexa/datasets/migrations/__init__.py | 0 hexa/files/basefs.py | 4 ++++ hexa/files/gcp.py | 16 ++++++++++++++++ hexa/files/s3.py | 11 +++++++++++ hexa/files/tests/mocks/client.py | 8 ++++++++ hexa/files/tests/test_api.py | 17 +++++++++++++++++ 6 files changed, 56 insertions(+) delete mode 100644 hexa/datasets/migrations/__init__.py diff --git a/hexa/datasets/migrations/__init__.py b/hexa/datasets/migrations/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/hexa/files/basefs.py b/hexa/files/basefs.py index b35e8134d..8d6403e09 100644 --- a/hexa/files/basefs.py +++ b/hexa/files/basefs.py @@ -97,3 +97,7 @@ def generate_upload_url( @abstractmethod def get_token_as_env_variables(self, token): pass + + @abstractmethod + def read_object_lines(self, bucket_name: str, filename: str, lines_number: int): + pass diff --git a/hexa/files/gcp.py b/hexa/files/gcp.py index a3199db6c..6f8d6b3fa 100644 --- a/hexa/files/gcp.py +++ b/hexa/files/gcp.py @@ -1,4 +1,5 @@ import base64 +import io import json import requests @@ -347,3 +348,18 @@ def get_token_as_env_variables(self, token): "GCS_TOKEN": token, # FIXME: Once we have deployed the new openhexa-bslq-environment image and upgraded the openhexa-app, we can remove this line "WORKSPACE_STORAGE_ENGINE_GCP_ACCESS_TOKEN": token, } + + def read_object_lines(self, bucket_name: str, filename: str, lines_number: int): + client = get_storage_client() + bucket = client.get_bucket(bucket_name) + blob = bucket.get_blob(filename) + + with io.BytesIO() as file_obj: + blob.download_to_file(file_obj) + file_obj.seek(0) + lines = file_obj.readlines() + + max_lines = min(lines_number, len(lines)) + print(max_lines, lines_number, len(lines)) + specific_lines = [lines[i].decode("utf-8").strip() for i in range(max_lines)] + return specific_lines diff --git a/hexa/files/s3.py b/hexa/files/s3.py index c58d07711..f85b5021e 100644 --- a/hexa/files/s3.py +++ b/hexa/files/s3.py @@ -1,4 +1,5 @@ import base64 +import io import json import boto3 @@ -436,3 +437,13 @@ def get_token_as_env_variables(self, token): 
json.dumps(json_config).encode() ).decode(), } + + def read_object_lines(self, bucket_name: str, filename: str, lines_number: int): + s3 = get_storage_client() + object = s3.get_object(Bucket=bucket_name, Key=filename) + file_stream = io.BytesIO(object["Body"].read()) + file_stream.seek(0) + lines = file_stream.readlines() + + specific_lines = [lines[i].decode("utf-8").strip() for i in range(lines_number)] + return specific_lines diff --git a/hexa/files/tests/mocks/client.py b/hexa/files/tests/mocks/client.py index 10df5d7d9..7238a89bd 100644 --- a/hexa/files/tests/mocks/client.py +++ b/hexa/files/tests/mocks/client.py @@ -1,3 +1,6 @@ +from random import choice +from string import ascii_lowercase, digits + from google.api_core import page_iterator from google.cloud.exceptions import Conflict, NotFound @@ -269,3 +272,8 @@ def list_buckets( return MockHTTPIterator( items=buckets, max_results=max_results, page_size=page_size ) + + def read_object_lines(self, bucket: str, file_path: str, num_lines=None): + chars = ascii_lowercase + digits + lst = ["".join(choice(chars) for _ in range(2)) for _ in range(num_lines)] + return lst diff --git a/hexa/files/tests/test_api.py b/hexa/files/tests/test_api.py index 28123be0d..bfd3f8ebf 100644 --- a/hexa/files/tests/test_api.py +++ b/hexa/files/tests/test_api.py @@ -532,3 +532,20 @@ def get_type(self): class APIGcpTestCase(APITestCase, OnlyGCP, TestCase): def get_type(self): return "gcp" + + +class TestDownloadFromCloudStorage(TestCase): + def test_get_from_gcp(self): + pass + # lines = get_storage("gcp").read_object_lines( + # "hexa-test-datasets", + # "3237e8c2-896d-4628-9054-59d69c785a11/add8469f-14d6-4081-8e02-adb53016f7bd/people.csv", + # 1, + # ) + # print(f"Lines are : {lines}") + # self.assertEqual( + # lines, + # [ + # "3237e8c2-896d-4628-9054-59d69c785a11/add8469f-14d6-4081-8e02-adb53016f7bd/people.csv" + # ], + # ) From af08699e5830ea9a03a69cedbae59fc8a2aa3a4d Mon Sep 17 00:00:00 2001 From: nazarfil Date: Fri, 12 Jul 2024 09:44:51 +0200 Subject: [PATCH 10/37] chore: renamed snapshot to file_metadata --- ...shotjob_datasetfilemetadatajob_and_more.py | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 hexa/datasets/migrations/0006_rename_datasetsnapshotjob_datasetfilemetadatajob_and_more.py diff --git a/hexa/datasets/migrations/0006_rename_datasetsnapshotjob_datasetfilemetadatajob_and_more.py b/hexa/datasets/migrations/0006_rename_datasetsnapshotjob_datasetfilemetadatajob_and_more.py new file mode 100644 index 000000000..0def0c132 --- /dev/null +++ b/hexa/datasets/migrations/0006_rename_datasetsnapshotjob_datasetfilemetadatajob_and_more.py @@ -0,0 +1,20 @@ +# Generated by Django 5.0.3 on 2024-07-12 07:44 + +from django.db import migrations + + +class Migration(migrations.Migration): + dependencies = [ + ("datasets", "0005_datasetsnapshotjob"), + ] + + operations = [ + migrations.RenameModel( + old_name="DatasetSnapshotJob", + new_name="DatasetFileMetadataJob", + ), + migrations.AlterModelTable( + name="datasetfilemetadatajob", + table="datasets_filemetadatajob", + ), + ] From 3a18c0fa4de698f6db510e1dc5885deed62c4a3a Mon Sep 17 00:00:00 2001 From: nazarfil Date: Tue, 9 Jul 2024 08:50:23 +0200 Subject: [PATCH 11/37] feat(Dataset): add dataset file snapshot model --- hexa/datasets/admin.py | 14 ++++++- hexa/datasets/graphql/schema.graphql | 25 +++++++++++- hexa/datasets/models.py | 60 ++++++++++++++++++++++++++++ hexa/datasets/permissions.py | 15 ++++++- hexa/datasets/queue.py | 45 ++++++++++++++++++--- 
hexa/datasets/schema/queries.py | 23 ++++++++++- hexa/files/basefs.py | 4 ++ hexa/files/gcp.py | 6 +++ hexa/files/s3.py | 7 +++- 9 files changed, 188 insertions(+), 11 deletions(-) diff --git a/hexa/datasets/admin.py b/hexa/datasets/admin.py index 7eb704a1e..43343064b 100644 --- a/hexa/datasets/admin.py +++ b/hexa/datasets/admin.py @@ -1,6 +1,12 @@ from django.contrib import admin -from .models import Dataset, DatasetLink, DatasetVersion, DatasetVersionFile +from .models import ( + Dataset, + DatasetFileSnapshot, + DatasetLink, + DatasetVersion, + DatasetVersionFile, +) @admin.register(Dataset) @@ -26,6 +32,12 @@ class DatasetVersionObjectAdmin(admin.ModelAdmin): list_filter = ("dataset_version__dataset", "created_by") +@admin.register(DatasetFileSnapshot) +class DatasetFileSnapshotAdmin(admin.ModelAdmin): + list_display = ("filename", "dataset_version_file") + list_filter = ("dataset_version_file__dataset_version__dataset", "created_by") + + @admin.register(DatasetLink) class DatasetLinkAdmin(admin.ModelAdmin): list_display = ("dataset", "workspace", "created_at", "created_by") diff --git a/hexa/datasets/graphql/schema.graphql b/hexa/datasets/graphql/schema.graphql index 6fc53a8d0..4d09cd44c 100644 --- a/hexa/datasets/graphql/schema.graphql +++ b/hexa/datasets/graphql/schema.graphql @@ -424,11 +424,30 @@ type PinDatasetResult { errors: [PinDatasetError!]! } +input CreateDatasetFileSnapshotInput { + fileId: String! +} + +type DatasetFileSnapshot { + uri: String! + created_by: String! + dataset_version_file: DatasetVersionFile +} + +type CreateDatasetFileSnapshotResult { + dataset_file_snapshot : DatasetFileSnapshot + success: Boolean! + errors: [PrepareVersionFileDownloadError!]! +} + + extend type Query { "Get a dataset by its ID." dataset(id: ID!): Dataset "Get a dataset by its slug." datasetVersion(id: ID!): DatasetVersion + "Get a dataset file snapshot by fileId" + datasetFileSnapshot(id: ID, fileId: ID): DatasetFileSnapshot "Get a dataset link by its id." datasetLink(id: ID!): DatasetLink "Get a dataset link by its slug." @@ -437,6 +456,7 @@ extend type Query { datasets(query: String, page: Int = 1, perPage: Int = 15): DatasetPage! } + extend type Mutation { "Create a new dataset." createDataset(input: CreateDatasetInput!): CreateDatasetResult! @loginRequired @@ -452,6 +472,8 @@ extend type Mutation { generateDatasetUploadUrl(input: GenerateDatasetUploadUrlInput!): GenerateDatasetUploadUrlResult! @loginRequired "Create a new file in a dataset version." createDatasetVersionFile(input: CreateDatasetVersionFileInput!): CreateDatasetVersionFileResult! @loginRequired + "Create dataset version snapshot." + createDatasetVersionFileSnapshot(input: CreateDatasetFileSnapshotInput!): CreateDatasetFileSnapshotResult! @loginRequired "Prepare to download a file in a dataset version." prepareVersionFileDownload(input: PrepareVersionFileDownloadInput!): PrepareVersionFileDownloadResult! @loginRequired "Link a dataset with a workspace." @@ -460,4 +482,5 @@ extend type Mutation { deleteDatasetLink(input: DeleteDatasetLinkInput!): DeleteDatasetLinkResult! @loginRequired "Pin or unpin a dataset for a workspace." pinDataset(input: PinDatasetInput!): PinDatasetResult! 
@loginRequired -} \ No newline at end of file +} + diff --git a/hexa/datasets/models.py b/hexa/datasets/models.py index eca44a682..44ba28487 100644 --- a/hexa/datasets/models.py +++ b/hexa/datasets/models.py @@ -255,6 +255,66 @@ class Meta: ordering = ["uri"] +class DatasetFileSnapshotQuerySet(BaseQuerySet): + def filter_for_user(self, user: AnonymousUser | User): + return self._filter_for_user_and_query_object( + user, + models.Q( + dataset_version_file__dataset_version__dataset__in=Dataset.objects.filter_for_user( + user + ), + return_all_if_superuser=False, + ), + ) + + +class DatasetFileSnapshotManager(models.Manager): + def create_if_has_perm( + self, + principal: User, + dataset_version_file: DatasetVersionFile, + *, + uri: str, + ): + from hexa.pipelines.authentication import PipelineRunUser + + if isinstance(principal, PipelineRunUser): + if ( + principal.pipeline_run.pipeline.workspace + != dataset_version_file.dataset_version.dataset.workspace + ): + raise PermissionDenied + elif not principal.has_perm( + "datasets.create_dataset_version_file_snapshot", dataset_version_file + ): + raise PermissionDenied + + created_by = principal if not isinstance(principal, PipelineRunUser) else None + return self.create( + dataset_version_file=dataset_version_file, + uri=uri, + created_by=created_by, + ) + + +class DatasetFileSnapshot(Base): + uri = models.TextField(null=False, blank=False, unique=True) + created_by = models.ForeignKey(User, null=True, on_delete=models.SET_NULL) + dataset_version_file = models.ForeignKey( + DatasetVersionFile, + null=False, + blank=False, + on_delete=models.CASCADE, + related_name="snapshots", + ) + + objects = DatasetFileSnapshotManager.from_queryset(DatasetFileSnapshotQuerySet)() + + @property + def filename(self): + return self.uri.split("/")[-1] + + class DatasetLinkQuerySet(BaseQuerySet): def filter_for_user(self, user: AnonymousUser | User): # FIXME: Use a generic permission system instead of differencing between User and PipelineRunUser diff --git a/hexa/datasets/permissions.py b/hexa/datasets/permissions.py index 0a6628f42..dbdc60880 100644 --- a/hexa/datasets/permissions.py +++ b/hexa/datasets/permissions.py @@ -1,4 +1,9 @@ -from hexa.datasets.models import Dataset, DatasetLink, DatasetVersion +from hexa.datasets.models import ( + Dataset, + DatasetLink, + DatasetVersion, + DatasetVersionFile, +) from hexa.user_management.models import User from hexa.workspaces.models import ( Workspace, @@ -105,3 +110,11 @@ def create_dataset_version_file(principal: User, dataset_version: DatasetVersion return False return create_dataset_version(principal, dataset_version.dataset) + + +def create_dataset_version_file_snapshot( + principal: User, dataset_version_file: DatasetVersionFile +): + if dataset_version_file != dataset_version_file.latest_version: + return False + return create_dataset_version_file(principal, dataset_version_file.dataset_version) diff --git a/hexa/datasets/queue.py b/hexa/datasets/queue.py index a93dd46d4..8d0e908e9 100644 --- a/hexa/datasets/queue.py +++ b/hexa/datasets/queue.py @@ -1,18 +1,51 @@ +import os.path from logging import getLogger from dpq.queue import AtLeastOnceQueue from hexa.datasets.models import DatasetFileMetadataJob +from hexa.datasets.models import ( + DatasetFileSnapshot, + DatasetVersionFile, +) +from hexa.files.api import get_storage +from hexa.user_management.models import User logger = getLogger(__name__) +DEFAULT_SNAPSHOT_LINES = 500 + + +def generate_dataset_file_sample_task(queue: AtLeastOnceQueue, job: 
DatasetSnapshotJob): + try: + dataset_version_file_id = job.args["file_id"] + user_id = job.args["user_id"] + logger.info( + f"Creating dataset snapshot for version file {dataset_version_file_id}" + ) + dataset_version_file = DatasetVersionFile.objects.get( + id=dataset_version_file_id + ) + user = User.objects.get(id=user_id) + + storage = get_storage() + dataset_snapshot = storage.read_object_lines( + dataset_version_file, DEFAULT_SNAPSHOT_LINES + ) + bucket_name = dataset_version_file.uri.split("/")[0] + filename, extension = os.path.splitext(dataset_version_file.uri) + upload_uri = f"{filename}-snapshot{extension}" + storage.upload_object_from_string(bucket_name, upload_uri, dataset_snapshot) -def generate_dataset_file_sample_task( - queue: AtLeastOnceQueue, job: DatasetFileMetadataJob -): - # TODO: imlpement ticket PATHWAYS-98 - extract data in background task - dataset_version_file_id = job.args["fileId"] - logger.info(f"Creating dataset version file {dataset_version_file_id}") + logger.info( + f"Uploaded dataset snapshot to {upload_uri} for file {dataset_version_file_id}" + ) + DatasetFileSnapshot.objects.create_if_has_perm( + principal=user, dataset_version_file=dataset_version_file, uri=upload_uri + ) + logger.info("Dataset snapshot created for file {dataset_version_file_id}") + except Exception as e: + logger.exception(f"Failed to create dataset snapshot: \n {e}") class DatasetsFileMetadataQueue(AtLeastOnceQueue): diff --git a/hexa/datasets/schema/queries.py b/hexa/datasets/schema/queries.py index 427c3e1fc..06b9657ce 100644 --- a/hexa/datasets/schema/queries.py +++ b/hexa/datasets/schema/queries.py @@ -2,7 +2,12 @@ from hexa.core.graphql import result_page -from ..models import Dataset, DatasetLink, DatasetVersion +from ..models import ( + Dataset, + DatasetFileSnapshot, + DatasetLink, + DatasetVersion, +) datasets_queries = QueryType() @@ -37,6 +42,22 @@ def resolve_dataset_version(_, info, **kwargs): return None +@datasets_queries.field("datasetFileSnapshot") +def resolve_dataset_file_snapshot(_, info, **kwargs): + request = info.context["request"] + try: + if kwargs.get("file_id"): + return DatasetFileSnapshot.objects.filter_for_user(request.user).get( + dataset_version_file=kwargs["file_id"] + ) + else: + return DatasetFileSnapshot.objects.filter_for_user(request.user).get( + id=kwargs["id"] + ) + except DatasetFileSnapshot.DoesNotExist: + return None + + @datasets_queries.field("datasetLink") def resolve_dataset_link(_, info, **kwargs): request = info.context["request"] diff --git a/hexa/files/basefs.py b/hexa/files/basefs.py index 8d6403e09..3b71d439f 100644 --- a/hexa/files/basefs.py +++ b/hexa/files/basefs.py @@ -54,6 +54,10 @@ def delete_bucket(self, bucket_name: str, fully: bool = False): def upload_object(self, bucket_name: str, file_name: str, source: str): pass + @abstractmethod + def upload_object_from_string(self, bucket_name: str, file_name: str, content: str): + pass + @abstractmethod def create_bucket_folder(self, bucket_name: str, folder_key: str): pass diff --git a/hexa/files/gcp.py b/hexa/files/gcp.py index 6f8d6b3fa..73c4329e9 100644 --- a/hexa/files/gcp.py +++ b/hexa/files/gcp.py @@ -158,6 +158,12 @@ def upload_object(self, bucket_name: str, file_name: str, source: str): blob = bucket.blob(file_name) blob.upload_from_filename(source) + def upload_object_from_string(self, bucket_name: str, file_name: str, content: str): + client = get_storage_client() + bucket = client.bucket(bucket_name) + blob = bucket.blob(file_name) + blob.upload_from_string(content) + 
def create_bucket_folder(self, bucket_name: str, folder_key: str): client = get_storage_client() bucket = client.get_bucket(bucket_name) diff --git a/hexa/files/s3.py b/hexa/files/s3.py index f85b5021e..25334fcff 100644 --- a/hexa/files/s3.py +++ b/hexa/files/s3.py @@ -438,6 +438,10 @@ def get_token_as_env_variables(self, token): ).decode(), } + def upload_object_from_string(self, bucket_name: str, file_name: str, content: str): + s3 = get_storage_client() + s3.put_object(bucket_name, file_name, content) + def read_object_lines(self, bucket_name: str, filename: str, lines_number: int): s3 = get_storage_client() object = s3.get_object(Bucket=bucket_name, Key=filename) @@ -445,5 +449,6 @@ def read_object_lines(self, bucket_name: str, filename: str, lines_number: int): file_stream.seek(0) lines = file_stream.readlines() - specific_lines = [lines[i].decode("utf-8").strip() for i in range(lines_number)] + max_lines = min(lines_number, len(lines)) + specific_lines = [lines[i].decode("utf-8").strip() for i in range(max_lines)] return specific_lines From 835df3bac33c5497d5ed9b66fd7a90094b58ba9f Mon Sep 17 00:00:00 2001 From: nazarfil Date: Wed, 10 Jul 2024 15:54:38 +0200 Subject: [PATCH 12/37] chore: adds todos --- hexa/datasets/queue.py | 3 ++- hexa/files/gcp.py | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/hexa/datasets/queue.py b/hexa/datasets/queue.py index 8d0e908e9..b464337d2 100644 --- a/hexa/datasets/queue.py +++ b/hexa/datasets/queue.py @@ -13,7 +13,8 @@ logger = getLogger(__name__) -DEFAULT_SNAPSHOT_LINES = 500 +# qdd to settings +DEFAULT_SNAPSHOT_LINES = 50 def generate_dataset_file_sample_task(queue: AtLeastOnceQueue, job: DatasetSnapshotJob): diff --git a/hexa/files/gcp.py b/hexa/files/gcp.py index 73c4329e9..b1f31e8dc 100644 --- a/hexa/files/gcp.py +++ b/hexa/files/gcp.py @@ -356,6 +356,7 @@ def get_token_as_env_variables(self, token): } def read_object_lines(self, bucket_name: str, filename: str, lines_number: int): + # TODO: redo with pandas with DownloadURL client = get_storage_client() bucket = client.get_bucket(bucket_name) blob = bucket.get_blob(filename) From fd103dfe40875a438896ea3ec9e7b53721c08d85 Mon Sep 17 00:00:00 2001 From: nazarfil Date: Wed, 10 Jul 2024 18:19:13 +0200 Subject: [PATCH 13/37] fix: removes upload to bucket and adds status --- hexa/datasets/admin.py | 7 ----- hexa/datasets/models.py | 68 +++++++++++------------------------------ hexa/datasets/queue.py | 37 +++++++++++----------- hexa/files/basefs.py | 8 ----- hexa/files/gcp.py | 33 ++------------------ hexa/files/s3.py | 16 ---------- 6 files changed, 39 insertions(+), 130 deletions(-) diff --git a/hexa/datasets/admin.py b/hexa/datasets/admin.py index 43343064b..82ad3b020 100644 --- a/hexa/datasets/admin.py +++ b/hexa/datasets/admin.py @@ -2,7 +2,6 @@ from .models import ( Dataset, - DatasetFileSnapshot, DatasetLink, DatasetVersion, DatasetVersionFile, @@ -32,12 +31,6 @@ class DatasetVersionObjectAdmin(admin.ModelAdmin): list_filter = ("dataset_version__dataset", "created_by") -@admin.register(DatasetFileSnapshot) -class DatasetFileSnapshotAdmin(admin.ModelAdmin): - list_display = ("filename", "dataset_version_file") - list_filter = ("dataset_version_file__dataset_version__dataset", "created_by") - - @admin.register(DatasetLink) class DatasetLinkAdmin(admin.ModelAdmin): list_display = ("dataset", "workspace", "created_at", "created_by") diff --git a/hexa/datasets/models.py b/hexa/datasets/models.py index 44ba28487..dd8080196 100644 --- a/hexa/datasets/models.py +++ 
b/hexa/datasets/models.py @@ -3,6 +3,8 @@ from django.contrib.auth.models import AnonymousUser from django.core.exceptions import PermissionDenied from django.db import models +from django.db.models import JSONField +from django.forms import CharField from dpq.models import BaseJob from slugify import slugify @@ -255,51 +257,23 @@ class Meta: ordering = ["uri"] -class DatasetFileSnapshotQuerySet(BaseQuerySet): - def filter_for_user(self, user: AnonymousUser | User): - return self._filter_for_user_and_query_object( - user, - models.Q( - dataset_version_file__dataset_version__dataset__in=Dataset.objects.filter_for_user( - user - ), - return_all_if_superuser=False, - ), - ) - - -class DatasetFileSnapshotManager(models.Manager): - def create_if_has_perm( - self, - principal: User, - dataset_version_file: DatasetVersionFile, - *, - uri: str, - ): - from hexa.pipelines.authentication import PipelineRunUser - - if isinstance(principal, PipelineRunUser): - if ( - principal.pipeline_run.pipeline.workspace - != dataset_version_file.dataset_version.dataset.workspace - ): - raise PermissionDenied - elif not principal.has_perm( - "datasets.create_dataset_version_file_snapshot", dataset_version_file - ): - raise PermissionDenied - - created_by = principal if not isinstance(principal, PipelineRunUser) else None - return self.create( - dataset_version_file=dataset_version_file, - uri=uri, - created_by=created_by, - ) - - class DatasetFileSnapshot(Base): - uri = models.TextField(null=False, blank=False, unique=True) - created_by = models.ForeignKey(User, null=True, on_delete=models.SET_NULL) + STATUS_PROCESSING = "processing" + STATUS_FAILED = "failed" + STATUS_FINISHED = "finished" + + STATUS_CHOICES = [ + (STATUS_PROCESSING, "Processing"), + (STATUS_FAILED, "Failed"), + (STATUS_FINISHED, "Finished"), + ] + + content = JSONField(blank=True, default=list, null=True) + status = CharField( + max_length=10, + choices=STATUS_CHOICES, + default=STATUS_PROCESSING, + ) dataset_version_file = models.ForeignKey( DatasetVersionFile, null=False, @@ -308,12 +282,6 @@ class DatasetFileSnapshot(Base): related_name="snapshots", ) - objects = DatasetFileSnapshotManager.from_queryset(DatasetFileSnapshotQuerySet)() - - @property - def filename(self): - return self.uri.split("/")[-1] - class DatasetLinkQuerySet(BaseQuerySet): def filter_for_user(self, user: AnonymousUser | User): diff --git a/hexa/datasets/queue.py b/hexa/datasets/queue.py index b464337d2..6534f205b 100644 --- a/hexa/datasets/queue.py +++ b/hexa/datasets/queue.py @@ -1,4 +1,3 @@ -import os.path from logging import getLogger from dpq.queue import AtLeastOnceQueue @@ -9,7 +8,6 @@ DatasetVersionFile, ) from hexa.files.api import get_storage -from hexa.user_management.models import User logger = getLogger(__name__) @@ -20,32 +18,35 @@ def generate_dataset_file_sample_task(queue: AtLeastOnceQueue, job: DatasetSnapshotJob): try: dataset_version_file_id = job.args["file_id"] - user_id = job.args["user_id"] + dataset_version_file = DatasetVersionFile.objects.get( + id=dataset_version_file_id + ) logger.info( f"Creating dataset snapshot for version file {dataset_version_file_id}" ) - dataset_version_file = DatasetVersionFile.objects.get( - id=dataset_version_file_id + dataset_file_snapshot = DatasetFileSnapshot.objects.create( + dataset_version_file=dataset_version_file, + status=DatasetFileSnapshot.STATUS_PROCESSING, ) - user = User.objects.get(id=user_id) storage = get_storage() - dataset_snapshot = storage.read_object_lines( + dataset_snapshot_content = 
storage.read_object_lines( dataset_version_file, DEFAULT_SNAPSHOT_LINES ) - bucket_name = dataset_version_file.uri.split("/")[0] - filename, extension = os.path.splitext(dataset_version_file.uri) - upload_uri = f"{filename}-snapshot{extension}" - storage.upload_object_from_string(bucket_name, upload_uri, dataset_snapshot) - - logger.info( - f"Uploaded dataset snapshot to {upload_uri} for file {dataset_version_file_id}" - ) - DatasetFileSnapshot.objects.create_if_has_perm( - principal=user, dataset_version_file=dataset_version_file, uri=upload_uri - ) + dataset_file_snapshot.content = dataset_snapshot_content + dataset_file_snapshot.status = DatasetFileSnapshot.STATUS_FINISHED + dataset_file_snapshot.save() logger.info("Dataset snapshot created for file {dataset_version_file_id}") except Exception as e: + dataset_version_file_id = job.args["file_id"] + dataset_version_file = DatasetVersionFile.objects.get( + id=dataset_version_file_id + ) + dataset_file_snapshot = DatasetFileSnapshot.objects.get( + dataset_version_file=dataset_version_file + ) + dataset_file_snapshot.status = DatasetFileSnapshot.STATUS_FAILED + dataset_file_snapshot.save() logger.exception(f"Failed to create dataset snapshot: \n {e}") diff --git a/hexa/files/basefs.py b/hexa/files/basefs.py index 3b71d439f..b35e8134d 100644 --- a/hexa/files/basefs.py +++ b/hexa/files/basefs.py @@ -54,10 +54,6 @@ def delete_bucket(self, bucket_name: str, fully: bool = False): def upload_object(self, bucket_name: str, file_name: str, source: str): pass - @abstractmethod - def upload_object_from_string(self, bucket_name: str, file_name: str, content: str): - pass - @abstractmethod def create_bucket_folder(self, bucket_name: str, folder_key: str): pass @@ -101,7 +97,3 @@ def generate_upload_url( @abstractmethod def get_token_as_env_variables(self, token): pass - - @abstractmethod - def read_object_lines(self, bucket_name: str, filename: str, lines_number: int): - pass diff --git a/hexa/files/gcp.py b/hexa/files/gcp.py index b1f31e8dc..97d22518b 100644 --- a/hexa/files/gcp.py +++ b/hexa/files/gcp.py @@ -1,5 +1,4 @@ import base64 -import io import json import requests @@ -12,13 +11,7 @@ from google.oauth2 import service_account from google.protobuf import duration_pb2 -from .basefs import ( - BaseClient, - BucketObjectAlreadyExists, - NotFound, - ObjectsPage, - load_bucket_sample_data_with, -) +from .basefs import BaseClient, NotFound, ObjectsPage, load_bucket_sample_data_with def get_credentials(): @@ -158,12 +151,6 @@ def upload_object(self, bucket_name: str, file_name: str, source: str): blob = bucket.blob(file_name) blob.upload_from_filename(source) - def upload_object_from_string(self, bucket_name: str, file_name: str, content: str): - client = get_storage_client() - bucket = client.bucket(bucket_name) - blob = bucket.blob(file_name) - blob.upload_from_string(content) - def create_bucket_folder(self, bucket_name: str, folder_key: str): client = get_storage_client() bucket = client.get_bucket(bucket_name) @@ -204,7 +191,7 @@ def generate_upload_url( client = get_storage_client() gcs_bucket = client.get_bucket(bucket_name) if raise_if_exists and gcs_bucket.get_blob(target_key) is not None: - raise BucketObjectAlreadyExists(target_key) + raise ValidationError(f"GCS: Object {target_key} already exists!") blob = gcs_bucket.blob(target_key) return blob.generate_signed_url( expiration=3600, version="v4", method="PUT", content_type=content_type @@ -354,19 +341,3 @@ def get_token_as_env_variables(self, token): "GCS_TOKEN": token, # FIXME: Once we 
have deployed the new openhexa-bslq-environment image and upgraded the openhexa-app, we can remove this line "WORKSPACE_STORAGE_ENGINE_GCP_ACCESS_TOKEN": token, } - - def read_object_lines(self, bucket_name: str, filename: str, lines_number: int): - # TODO: redo with pandas with DownloadURL - client = get_storage_client() - bucket = client.get_bucket(bucket_name) - blob = bucket.get_blob(filename) - - with io.BytesIO() as file_obj: - blob.download_to_file(file_obj) - file_obj.seek(0) - lines = file_obj.readlines() - - max_lines = min(lines_number, len(lines)) - print(max_lines, lines_number, len(lines)) - specific_lines = [lines[i].decode("utf-8").strip() for i in range(max_lines)] - return specific_lines diff --git a/hexa/files/s3.py b/hexa/files/s3.py index 25334fcff..c58d07711 100644 --- a/hexa/files/s3.py +++ b/hexa/files/s3.py @@ -1,5 +1,4 @@ import base64 -import io import json import boto3 @@ -437,18 +436,3 @@ def get_token_as_env_variables(self, token): json.dumps(json_config).encode() ).decode(), } - - def upload_object_from_string(self, bucket_name: str, file_name: str, content: str): - s3 = get_storage_client() - s3.put_object(bucket_name, file_name, content) - - def read_object_lines(self, bucket_name: str, filename: str, lines_number: int): - s3 = get_storage_client() - object = s3.get_object(Bucket=bucket_name, Key=filename) - file_stream = io.BytesIO(object["Body"].read()) - file_stream.seek(0) - lines = file_stream.readlines() - - max_lines = min(lines_number, len(lines)) - specific_lines = [lines[i].decode("utf-8").strip() for i in range(max_lines)] - return specific_lines From 37132833d1fe14d65c08fe8a77b1e47c99b96ad5 Mon Sep 17 00:00:00 2001 From: nazarfil Date: Thu, 11 Jul 2024 10:50:08 +0200 Subject: [PATCH 14/37] fix: read parquet file in dataframe --- config/settings/base.py | 3 +++ hexa/datasets/queue.py | 41 ++++++++++++++++++++++++++------ hexa/files/tests/mocks/client.py | 8 ------- 3 files changed, 37 insertions(+), 15 deletions(-) diff --git a/config/settings/base.py b/config/settings/base.py index 070fb98fc..5f35ce9ec 100644 --- a/config/settings/base.py +++ b/config/settings/base.py @@ -378,6 +378,9 @@ # Datasets config WORKSPACE_DATASETS_BUCKET = os.environ.get("WORKSPACE_DATASETS_BUCKET") +WORKSPACE_DATASETS_FILE_SNAPSHOT_SIZE = os.environ.get( + "WORKSPACE_DATASETS_FILE_SNAPSHOT_SIZE", 50 +) # Base64 encoded service account key # To generate a service account key, follow the instructions here: diff --git a/hexa/datasets/queue.py b/hexa/datasets/queue.py index 6534f205b..583b7fca1 100644 --- a/hexa/datasets/queue.py +++ b/hexa/datasets/queue.py @@ -1,7 +1,10 @@ from logging import getLogger +import pandas as pd +from django.conf import settings from dpq.queue import AtLeastOnceQueue +from hexa.datasets.api import generate_download_url from hexa.datasets.models import DatasetFileMetadataJob from hexa.datasets.models import ( DatasetFileSnapshot, @@ -11,8 +14,21 @@ logger = getLogger(__name__) -# qdd to settings -DEFAULT_SNAPSHOT_LINES = 50 + +def read_file_content(download_url: str, content_type: str) -> pd.DataFrame: + try: + if content_type == "text/csv": + return pd.read_csv(download_url) + elif content_type == "application/octet-stream": + return pd.read_parquet(download_url) + else: + raise ValueError(f"Unsupported content type: {content_type}") + except pd.errors.ParserError as e: + print(f"Error parsing the file content: {e}") + return pd.DataFrame() + except ValueError as e: + print(f"Unsupported file content: {e}") + return pd.DataFrame() def 
generate_dataset_file_sample_task(queue: AtLeastOnceQueue, job: DatasetSnapshotJob): @@ -29,11 +45,20 @@ def generate_dataset_file_sample_task(queue: AtLeastOnceQueue, job: DatasetSnaps status=DatasetFileSnapshot.STATUS_PROCESSING, ) - storage = get_storage() - dataset_snapshot_content = storage.read_object_lines( - dataset_version_file, DEFAULT_SNAPSHOT_LINES + download_url = generate_download_url(dataset_version_file) + file_snapshot_df = read_file_content( + download_url, dataset_version_file.content_type ) - dataset_file_snapshot.content = dataset_snapshot_content + if not file_snapshot_df.empty: + file_snapshot_content = file_snapshot_df.head( + settings.WORKSPACE_DATASETS_FILE_SNAPSHOT_SIZE + ) + dataset_file_snapshot.content = file_snapshot_content.to_json( + orient="records" + ) + logger.info(f"Dataset snapshot saved for file {dataset_version_file_id}") + else: + logger.info(f"Dataset snapshot is empty for file {dataset_version_file_id}") dataset_file_snapshot.status = DatasetFileSnapshot.STATUS_FINISHED dataset_file_snapshot.save() logger.info("Dataset snapshot created for file {dataset_version_file_id}") @@ -47,7 +72,9 @@ def generate_dataset_file_sample_task(queue: AtLeastOnceQueue, job: DatasetSnaps ) dataset_file_snapshot.status = DatasetFileSnapshot.STATUS_FAILED dataset_file_snapshot.save() - logger.exception(f"Failed to create dataset snapshot: \n {e}") + logger.exception( + f"Dataset file snapshot creation failed for file {dataset_version_file_id}: {e}" + ) class DatasetsFileMetadataQueue(AtLeastOnceQueue): diff --git a/hexa/files/tests/mocks/client.py b/hexa/files/tests/mocks/client.py index 7238a89bd..10df5d7d9 100644 --- a/hexa/files/tests/mocks/client.py +++ b/hexa/files/tests/mocks/client.py @@ -1,6 +1,3 @@ -from random import choice -from string import ascii_lowercase, digits - from google.api_core import page_iterator from google.cloud.exceptions import Conflict, NotFound @@ -272,8 +269,3 @@ def list_buckets( return MockHTTPIterator( items=buckets, max_results=max_results, page_size=page_size ) - - def read_object_lines(self, bucket: str, file_path: str, num_lines=None): - chars = ascii_lowercase + digits - lst = ["".join(choice(chars) for _ in range(2)) for _ in range(num_lines)] - return lst From 4ba35b08ca28ee93cd1877806b4f9039c076ec33 Mon Sep 17 00:00:00 2001 From: nazarfil Date: Thu, 11 Jul 2024 11:28:55 +0200 Subject: [PATCH 15/37] fix: fixes api call parameters --- hexa/datasets/graphql/schema.graphql | 5 +- .../migrations/0006_datasetfilesnapshot.py | 55 +++++++++++++++++++ hexa/datasets/models.py | 3 +- hexa/datasets/schema/mutations.py | 6 ++ hexa/datasets/schema/queries.py | 9 ++- 5 files changed, 68 insertions(+), 10 deletions(-) create mode 100644 hexa/datasets/migrations/0006_datasetfilesnapshot.py diff --git a/hexa/datasets/graphql/schema.graphql b/hexa/datasets/graphql/schema.graphql index 4d09cd44c..fe6fbba29 100644 --- a/hexa/datasets/graphql/schema.graphql +++ b/hexa/datasets/graphql/schema.graphql @@ -429,8 +429,7 @@ input CreateDatasetFileSnapshotInput { } type DatasetFileSnapshot { - uri: String! - created_by: String! + content: JSON! dataset_version_file: DatasetVersionFile } @@ -446,7 +445,7 @@ extend type Query { dataset(id: ID!): Dataset "Get a dataset by its slug." datasetVersion(id: ID!): DatasetVersion - "Get a dataset file snapshot by fileId" + "Get a dataset file snapshot by fileSnapshot id or by fileId" datasetFileSnapshot(id: ID, fileId: ID): DatasetFileSnapshot "Get a dataset link by its id." 
datasetLink(id: ID!): DatasetLink diff --git a/hexa/datasets/migrations/0006_datasetfilesnapshot.py b/hexa/datasets/migrations/0006_datasetfilesnapshot.py new file mode 100644 index 000000000..ffca2489f --- /dev/null +++ b/hexa/datasets/migrations/0006_datasetfilesnapshot.py @@ -0,0 +1,55 @@ +# Generated by Django 5.0.3 on 2024-07-11 09:06 + +import uuid + +import django.db.models.deletion +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("datasets", "0005_datasetsnapshotjob"), + ] + + operations = [ + migrations.CreateModel( + name="DatasetFileSnapshot", + fields=[ + ( + "id", + models.UUIDField( + default=uuid.uuid4, + editable=False, + primary_key=True, + serialize=False, + ), + ), + ("created_at", models.DateTimeField(auto_now_add=True)), + ("updated_at", models.DateTimeField(auto_now=True)), + ("content", models.JSONField(blank=True, default=list, null=True)), + ( + "status", + models.CharField( + choices=[ + ("processing", "Processing"), + ("failed", "Failed"), + ("finished", "Finished"), + ], + default="processing", + max_length=10, + ), + ), + ( + "dataset_version_file", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + related_name="snapshots", + to="datasets.datasetversionfile", + ), + ), + ], + options={ + "abstract": False, + }, + ), + ] diff --git a/hexa/datasets/models.py b/hexa/datasets/models.py index dd8080196..da80792bf 100644 --- a/hexa/datasets/models.py +++ b/hexa/datasets/models.py @@ -4,7 +4,6 @@ from django.core.exceptions import PermissionDenied from django.db import models from django.db.models import JSONField -from django.forms import CharField from dpq.models import BaseJob from slugify import slugify @@ -269,7 +268,7 @@ class DatasetFileSnapshot(Base): ] content = JSONField(blank=True, default=list, null=True) - status = CharField( + status = models.CharField( max_length=10, choices=STATUS_CHOICES, default=STATUS_PROCESSING, diff --git a/hexa/datasets/schema/mutations.py b/hexa/datasets/schema/mutations.py index e4d81c9c4..7f897ec2d 100644 --- a/hexa/datasets/schema/mutations.py +++ b/hexa/datasets/schema/mutations.py @@ -267,6 +267,12 @@ def resolve_create_version_file(_, info, **kwargs): return {"success": False, "errors": ["PERMISSION_DENIED"]} +@mutations.field("createDatasetVersionFileSnapshot") +def resolve_create_version_file_snapshot(_, info, **kwargs): + # TODO: implement flow to create snapshot via an API call + raise NotImplementedError + + @mutations.field("prepareVersionFileDownload") def resolve_version_file_download(_, info, **kwargs): request = info.context["request"] diff --git a/hexa/datasets/schema/queries.py b/hexa/datasets/schema/queries.py index 06b9657ce..48f6fa51b 100644 --- a/hexa/datasets/schema/queries.py +++ b/hexa/datasets/schema/queries.py @@ -44,16 +44,15 @@ def resolve_dataset_version(_, info, **kwargs): @datasets_queries.field("datasetFileSnapshot") def resolve_dataset_file_snapshot(_, info, **kwargs): - request = info.context["request"] try: if kwargs.get("file_id"): - return DatasetFileSnapshot.objects.filter_for_user(request.user).get( + return DatasetFileSnapshot.objects.get( dataset_version_file=kwargs["file_id"] ) + elif kwargs.get("id"): + return DatasetFileSnapshot.objects.get(id=kwargs["id"]) else: - return DatasetFileSnapshot.objects.filter_for_user(request.user).get( - id=kwargs["id"] - ) + return None except DatasetFileSnapshot.DoesNotExist: return None From 31ed9af708111e49f85d5a38fc94847f0396d7aa Mon Sep 17 00:00:00 2001 From: nazarfil 
Date: Fri, 12 Jul 2024 09:16:20 +0200 Subject: [PATCH 16/37] test(Dataset): attempt to test the version file --- hexa/datasets/tests/test_schema.py | 34 ++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/hexa/datasets/tests/test_schema.py b/hexa/datasets/tests/test_schema.py index faf7b6f38..3c17c3c78 100644 --- a/hexa/datasets/tests/test_schema.py +++ b/hexa/datasets/tests/test_schema.py @@ -599,3 +599,37 @@ def test_prepare_version_file_download_linked_dataset(self): }, r["data"]["prepareVersionFileDownload"], ) + + @mock_gcp_storage + def test_prepare_version_file(self): + serena = self.create_user("sereba@blsq.org", is_superuser=True) + src_workspace = self.create_workspace( + serena, + name="Source Workspace", + description="Test workspace", + ) + dataset = self.create_dataset( + serena, src_workspace, "Dataset", "Dataset description" + ) + dataset_version = self.create_dataset_version(serena, dataset=dataset) + self.client.force_login(serena) + + r = self.run_query( + """ + mutation CreateDatasetVersionFile ($input: CreateDatasetVersionFileInput!) { + createDatasetVersionFile(input: $input) { + success + errors + } + } + """, + { + "input": { + "versionId": str(dataset_version.id), + "contentType": "text/csv", + "uri": f"{dataset_version.id}/demo_file.csv", + } + }, + ) + print(r) + # while queue.run_once(): pass From a5951dc90aed389dafddd6d9f06cf67ba664034a Mon Sep 17 00:00:00 2001 From: nazarfil Date: Fri, 12 Jul 2024 10:13:43 +0200 Subject: [PATCH 17/37] chore: rename snapshot to file_metadata --- hexa/datasets/graphql/schema.graphql | 12 +++++----- ...napshot.py => 0006_datasetfilemetadata.py} | 8 +++---- ...shotjob_datasetfilemetadatajob_and_more.py | 20 ---------------- hexa/datasets/models.py | 4 ++-- hexa/datasets/permissions.py | 9 -------- hexa/datasets/queue.py | 23 +++++++++---------- hexa/datasets/schema/mutations.py | 16 +++++++++---- hexa/datasets/schema/queries.py | 10 ++++---- 8 files changed, 40 insertions(+), 62 deletions(-) rename hexa/datasets/migrations/{0006_datasetfilesnapshot.py => 0006_datasetfilemetadata.py} (88%) delete mode 100644 hexa/datasets/migrations/0006_rename_datasetsnapshotjob_datasetfilemetadatajob_and_more.py diff --git a/hexa/datasets/graphql/schema.graphql b/hexa/datasets/graphql/schema.graphql index fe6fbba29..db70cdb94 100644 --- a/hexa/datasets/graphql/schema.graphql +++ b/hexa/datasets/graphql/schema.graphql @@ -424,17 +424,17 @@ type PinDatasetResult { errors: [PinDatasetError!]! } -input CreateDatasetFileSnapshotInput { +input CreateDatasetFileMetadataInput { fileId: String! } -type DatasetFileSnapshot { +type DatasetFileMetadata { content: JSON! dataset_version_file: DatasetVersionFile } -type CreateDatasetFileSnapshotResult { - dataset_file_snapshot : DatasetFileSnapshot +type CreateDatasetFileMetadataResult { + dataset_file_metadata : DatasetFileMetadata success: Boolean! errors: [PrepareVersionFileDownloadError!]! } @@ -446,7 +446,7 @@ extend type Query { "Get a dataset by its slug." datasetVersion(id: ID!): DatasetVersion "Get a dataset file snapshot by fileSnapshot id or by fileId" - datasetFileSnapshot(id: ID, fileId: ID): DatasetFileSnapshot + datasetFileMetadata(id: ID, fileId: ID): DatasetFileMetadata "Get a dataset link by its id." datasetLink(id: ID!): DatasetLink "Get a dataset link by its slug." @@ -472,7 +472,7 @@ extend type Mutation { "Create a new file in a dataset version." createDatasetVersionFile(input: CreateDatasetVersionFileInput!): CreateDatasetVersionFileResult! 
@loginRequired "Create dataset version snapshot." - createDatasetVersionFileSnapshot(input: CreateDatasetFileSnapshotInput!): CreateDatasetFileSnapshotResult! @loginRequired + createDatasetVersionFileMetadata(input: CreateDatasetFileMetadataInput!): CreateDatasetFileMetadataResult! @loginRequired "Prepare to download a file in a dataset version." prepareVersionFileDownload(input: PrepareVersionFileDownloadInput!): PrepareVersionFileDownloadResult! @loginRequired "Link a dataset with a workspace." diff --git a/hexa/datasets/migrations/0006_datasetfilesnapshot.py b/hexa/datasets/migrations/0006_datasetfilemetadata.py similarity index 88% rename from hexa/datasets/migrations/0006_datasetfilesnapshot.py rename to hexa/datasets/migrations/0006_datasetfilemetadata.py index ffca2489f..e855070f8 100644 --- a/hexa/datasets/migrations/0006_datasetfilesnapshot.py +++ b/hexa/datasets/migrations/0006_datasetfilemetadata.py @@ -1,4 +1,4 @@ -# Generated by Django 5.0.3 on 2024-07-11 09:06 +# Generated by Django 5.0.3 on 2024-07-12 08:18 import uuid @@ -8,12 +8,12 @@ class Migration(migrations.Migration): dependencies = [ - ("datasets", "0005_datasetsnapshotjob"), + ("datasets", "0005_datasetfilemetadatajob"), ] operations = [ migrations.CreateModel( - name="DatasetFileSnapshot", + name="DatasetFileMetadata", fields=[ ( "id", @@ -43,7 +43,7 @@ class Migration(migrations.Migration): "dataset_version_file", models.ForeignKey( on_delete=django.db.models.deletion.CASCADE, - related_name="snapshots", + related_name="file_metadata", to="datasets.datasetversionfile", ), ), diff --git a/hexa/datasets/migrations/0006_rename_datasetsnapshotjob_datasetfilemetadatajob_and_more.py b/hexa/datasets/migrations/0006_rename_datasetsnapshotjob_datasetfilemetadatajob_and_more.py deleted file mode 100644 index 0def0c132..000000000 --- a/hexa/datasets/migrations/0006_rename_datasetsnapshotjob_datasetfilemetadatajob_and_more.py +++ /dev/null @@ -1,20 +0,0 @@ -# Generated by Django 5.0.3 on 2024-07-12 07:44 - -from django.db import migrations - - -class Migration(migrations.Migration): - dependencies = [ - ("datasets", "0005_datasetsnapshotjob"), - ] - - operations = [ - migrations.RenameModel( - old_name="DatasetSnapshotJob", - new_name="DatasetFileMetadataJob", - ), - migrations.AlterModelTable( - name="datasetfilemetadatajob", - table="datasets_filemetadatajob", - ), - ] diff --git a/hexa/datasets/models.py b/hexa/datasets/models.py index da80792bf..3d99a65b2 100644 --- a/hexa/datasets/models.py +++ b/hexa/datasets/models.py @@ -256,7 +256,7 @@ class Meta: ordering = ["uri"] -class DatasetFileSnapshot(Base): +class DatasetFileMetadata(Base): STATUS_PROCESSING = "processing" STATUS_FAILED = "failed" STATUS_FINISHED = "finished" @@ -278,7 +278,7 @@ class DatasetFileSnapshot(Base): null=False, blank=False, on_delete=models.CASCADE, - related_name="snapshots", + related_name="file_metadata", ) diff --git a/hexa/datasets/permissions.py b/hexa/datasets/permissions.py index dbdc60880..e864447de 100644 --- a/hexa/datasets/permissions.py +++ b/hexa/datasets/permissions.py @@ -2,7 +2,6 @@ Dataset, DatasetLink, DatasetVersion, - DatasetVersionFile, ) from hexa.user_management.models import User from hexa.workspaces.models import ( @@ -110,11 +109,3 @@ def create_dataset_version_file(principal: User, dataset_version: DatasetVersion return False return create_dataset_version(principal, dataset_version.dataset) - - -def create_dataset_version_file_snapshot( - principal: User, dataset_version_file: DatasetVersionFile -): - if 
dataset_version_file != dataset_version_file.latest_version: - return False - return create_dataset_version_file(principal, dataset_version_file.dataset_version) diff --git a/hexa/datasets/queue.py b/hexa/datasets/queue.py index 583b7fca1..e1c5a8047 100644 --- a/hexa/datasets/queue.py +++ b/hexa/datasets/queue.py @@ -5,12 +5,11 @@ from dpq.queue import AtLeastOnceQueue from hexa.datasets.api import generate_download_url -from hexa.datasets.models import DatasetFileMetadataJob from hexa.datasets.models import ( - DatasetFileSnapshot, + DatasetFileMetadata, + DatasetFileMetadataJob, DatasetVersionFile, ) -from hexa.files.api import get_storage logger = getLogger(__name__) @@ -31,7 +30,7 @@ def read_file_content(download_url: str, content_type: str) -> pd.DataFrame: return pd.DataFrame() -def generate_dataset_file_sample_task(queue: AtLeastOnceQueue, job: DatasetSnapshotJob): +def generate_dataset_file_sample_task(queue: AtLeastOnceQueue, job: DatasetFileMetadataJob): try: dataset_version_file_id = job.args["file_id"] dataset_version_file = DatasetVersionFile.objects.get( @@ -40,9 +39,9 @@ def generate_dataset_file_sample_task(queue: AtLeastOnceQueue, job: DatasetSnaps logger.info( f"Creating dataset snapshot for version file {dataset_version_file_id}" ) - dataset_file_snapshot = DatasetFileSnapshot.objects.create( + dataset_file_metadata = DatasetFileMetadata.objects.create( dataset_version_file=dataset_version_file, - status=DatasetFileSnapshot.STATUS_PROCESSING, + status=DatasetFileMetadata.STATUS_PROCESSING, ) download_url = generate_download_url(dataset_version_file) @@ -53,25 +52,25 @@ def generate_dataset_file_sample_task(queue: AtLeastOnceQueue, job: DatasetSnaps file_snapshot_content = file_snapshot_df.head( settings.WORKSPACE_DATASETS_FILE_SNAPSHOT_SIZE ) - dataset_file_snapshot.content = file_snapshot_content.to_json( + dataset_file_metadata.content = file_snapshot_content.to_json( orient="records" ) logger.info(f"Dataset snapshot saved for file {dataset_version_file_id}") else: logger.info(f"Dataset snapshot is empty for file {dataset_version_file_id}") - dataset_file_snapshot.status = DatasetFileSnapshot.STATUS_FINISHED - dataset_file_snapshot.save() + dataset_file_metadata.status = DatasetFileMetadata.STATUS_FINISHED + dataset_file_metadata.save() logger.info("Dataset snapshot created for file {dataset_version_file_id}") except Exception as e: dataset_version_file_id = job.args["file_id"] dataset_version_file = DatasetVersionFile.objects.get( id=dataset_version_file_id ) - dataset_file_snapshot = DatasetFileSnapshot.objects.get( + dataset_file_metadata = DatasetFileMetadata.objects.get( dataset_version_file=dataset_version_file ) - dataset_file_snapshot.status = DatasetFileSnapshot.STATUS_FAILED - dataset_file_snapshot.save() + dataset_file_metadata.status = DatasetFileMetadata.STATUS_FAILED + dataset_file_metadata.save() logger.exception( f"Dataset file snapshot creation failed for file {dataset_version_file_id}: {e}" ) diff --git a/hexa/datasets/schema/mutations.py b/hexa/datasets/schema/mutations.py index 7f897ec2d..10afbe2d8 100644 --- a/hexa/datasets/schema/mutations.py +++ b/hexa/datasets/schema/mutations.py @@ -267,10 +267,18 @@ def resolve_create_version_file(_, info, **kwargs): return {"success": False, "errors": ["PERMISSION_DENIED"]} -@mutations.field("createDatasetVersionFileSnapshot") -def resolve_create_version_file_snapshot(_, info, **kwargs): - # TODO: implement flow to create snapshot via an API call - raise NotImplementedError 
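As a reading aid for the resolver added just below: a minimal sketch, assuming standard django-postgres-queue (dpq) usage, of how an AtLeastOnceQueue is wired to generate_dataset_file_sample_task and how a resolver would enqueue the job. The task name, notify channel and variable names here are illustrative assumptions, not taken from this patch.

from dpq.queue import AtLeastOnceQueue

# Sketch of the queue registration in hexa/datasets/queue.py:
# the dict key is the task name the worker dispatches on (assumed name).
dataset_file_metadata_queue = AtLeastOnceQueue(
    tasks={"generate_file_metadata": generate_dataset_file_sample_task},
    notify_channel="dataset_file_metadata_queue",  # assumed channel name
)

# Sketch of the enqueue call from a resolver: enqueue(task_name, args);
# the args dict becomes job.args inside the task, which reads job.args["file_id"].
dataset_file_metadata_queue.enqueue(
    "generate_file_metadata",
    {"file_id": str(version_file.id)},  # version_file: a hypothetical DatasetVersionFile
)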
+@mutations.field("createDatasetVersionFileMetadata") +def resolve_create_version_file_metadata(_, info, **kwargs): + mutation_input = kwargs["input"] + + dataset_file_metadata_queue.enqueue( + { + "generate_file_metadata", + { + "file_id": mutation_input["file_id"], + }, + } + ) @mutations.field("prepareVersionFileDownload") diff --git a/hexa/datasets/schema/queries.py b/hexa/datasets/schema/queries.py index 48f6fa51b..749bb8b60 100644 --- a/hexa/datasets/schema/queries.py +++ b/hexa/datasets/schema/queries.py @@ -4,7 +4,7 @@ from ..models import ( Dataset, - DatasetFileSnapshot, + DatasetFileMetadata, DatasetLink, DatasetVersion, ) @@ -42,18 +42,18 @@ def resolve_dataset_version(_, info, **kwargs): return None -@datasets_queries.field("datasetFileSnapshot") +@datasets_queries.field("datasetFileMetadata") def resolve_dataset_file_snapshot(_, info, **kwargs): try: if kwargs.get("file_id"): - return DatasetFileSnapshot.objects.get( + return DatasetFileMetadata.objects.get( dataset_version_file=kwargs["file_id"] ) elif kwargs.get("id"): - return DatasetFileSnapshot.objects.get(id=kwargs["id"]) + return DatasetFileMetadata.objects.get(id=kwargs["id"]) else: return None - except DatasetFileSnapshot.DoesNotExist: + except DatasetFileMetadata.DoesNotExist: return None From af85aadf9d2e0bc4a2a6d36bc44d0c0b7545b220 Mon Sep 17 00:00:00 2001 From: nazarfil Date: Fri, 12 Jul 2024 10:45:46 +0200 Subject: [PATCH 18/37] refactor: removed not needed file metadata query --- hexa/datasets/graphql/schema.graphql | 19 ++++--------------- hexa/datasets/migrations/__init__.py | 0 hexa/datasets/schema/mutations.py | 14 -------------- hexa/datasets/schema/queries.py | 2 +- hexa/files/gcp.py | 10 ++++++++-- hexa/files/tests/test_api.py | 17 ----------------- 6 files changed, 13 insertions(+), 49 deletions(-) create mode 100644 hexa/datasets/migrations/__init__.py diff --git a/hexa/datasets/graphql/schema.graphql b/hexa/datasets/graphql/schema.graphql index db70cdb94..cb931106c 100644 --- a/hexa/datasets/graphql/schema.graphql +++ b/hexa/datasets/graphql/schema.graphql @@ -424,19 +424,10 @@ type PinDatasetResult { errors: [PinDatasetError!]! } -input CreateDatasetFileMetadataInput { - fileId: String! -} - type DatasetFileMetadata { - content: JSON! - dataset_version_file: DatasetVersionFile -} - -type CreateDatasetFileMetadataResult { - dataset_file_metadata : DatasetFileMetadata - success: Boolean! - errors: [PrepareVersionFileDownloadError!]! + content: JSON + status: String! + datasetVersionFile: DatasetVersionFile } @@ -446,7 +437,7 @@ extend type Query { "Get a dataset by its slug." datasetVersion(id: ID!): DatasetVersion "Get a dataset file snapshot by fileSnapshot id or by fileId" - datasetFileMetadata(id: ID, fileId: ID): DatasetFileMetadata + datasetFileSnapshot(id: ID, fileId: ID): DatasetFileMetadata "Get a dataset link by its id." datasetLink(id: ID!): DatasetLink "Get a dataset link by its slug." @@ -471,8 +462,6 @@ extend type Mutation { generateDatasetUploadUrl(input: GenerateDatasetUploadUrlInput!): GenerateDatasetUploadUrlResult! @loginRequired "Create a new file in a dataset version." createDatasetVersionFile(input: CreateDatasetVersionFileInput!): CreateDatasetVersionFileResult! @loginRequired - "Create dataset version snapshot." - createDatasetVersionFileMetadata(input: CreateDatasetFileMetadataInput!): CreateDatasetFileMetadataResult! @loginRequired "Prepare to download a file in a dataset version." 
prepareVersionFileDownload(input: PrepareVersionFileDownloadInput!): PrepareVersionFileDownloadResult! @loginRequired "Link a dataset with a workspace." diff --git a/hexa/datasets/migrations/__init__.py b/hexa/datasets/migrations/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/hexa/datasets/schema/mutations.py b/hexa/datasets/schema/mutations.py index 10afbe2d8..e4d81c9c4 100644 --- a/hexa/datasets/schema/mutations.py +++ b/hexa/datasets/schema/mutations.py @@ -267,20 +267,6 @@ def resolve_create_version_file(_, info, **kwargs): return {"success": False, "errors": ["PERMISSION_DENIED"]} -@mutations.field("createDatasetVersionFileMetadata") -def resolve_create_version_file_metadata(_, info, **kwargs): - mutation_input = kwargs["input"] - - dataset_file_metadata_queue.enqueue( - { - "generate_file_metadata", - { - "file_id": mutation_input["file_id"], - }, - } - ) - - @mutations.field("prepareVersionFileDownload") def resolve_version_file_download(_, info, **kwargs): request = info.context["request"] diff --git a/hexa/datasets/schema/queries.py b/hexa/datasets/schema/queries.py index 749bb8b60..4012dbac4 100644 --- a/hexa/datasets/schema/queries.py +++ b/hexa/datasets/schema/queries.py @@ -42,7 +42,7 @@ def resolve_dataset_version(_, info, **kwargs): return None -@datasets_queries.field("datasetFileMetadata") +@datasets_queries.field("datasetFileSnapshot") def resolve_dataset_file_snapshot(_, info, **kwargs): try: if kwargs.get("file_id"): diff --git a/hexa/files/gcp.py b/hexa/files/gcp.py index 97d22518b..a3199db6c 100644 --- a/hexa/files/gcp.py +++ b/hexa/files/gcp.py @@ -11,7 +11,13 @@ from google.oauth2 import service_account from google.protobuf import duration_pb2 -from .basefs import BaseClient, NotFound, ObjectsPage, load_bucket_sample_data_with +from .basefs import ( + BaseClient, + BucketObjectAlreadyExists, + NotFound, + ObjectsPage, + load_bucket_sample_data_with, +) def get_credentials(): @@ -191,7 +197,7 @@ def generate_upload_url( client = get_storage_client() gcs_bucket = client.get_bucket(bucket_name) if raise_if_exists and gcs_bucket.get_blob(target_key) is not None: - raise ValidationError(f"GCS: Object {target_key} already exists!") + raise BucketObjectAlreadyExists(target_key) blob = gcs_bucket.blob(target_key) return blob.generate_signed_url( expiration=3600, version="v4", method="PUT", content_type=content_type diff --git a/hexa/files/tests/test_api.py b/hexa/files/tests/test_api.py index bfd3f8ebf..28123be0d 100644 --- a/hexa/files/tests/test_api.py +++ b/hexa/files/tests/test_api.py @@ -532,20 +532,3 @@ def get_type(self): class APIGcpTestCase(APITestCase, OnlyGCP, TestCase): def get_type(self): return "gcp" - - -class TestDownloadFromCloudStorage(TestCase): - def test_get_from_gcp(self): - pass - # lines = get_storage("gcp").read_object_lines( - # "hexa-test-datasets", - # "3237e8c2-896d-4628-9054-59d69c785a11/add8469f-14d6-4081-8e02-adb53016f7bd/people.csv", - # 1, - # ) - # print(f"Lines are : {lines}") - # self.assertEqual( - # lines, - # [ - # "3237e8c2-896d-4628-9054-59d69c785a11/add8469f-14d6-4081-8e02-adb53016f7bd/people.csv" - # ], - # ) From 9526eaa518038248e2d7bf4112169687d91e60af Mon Sep 17 00:00:00 2001 From: nazarfil Date: Tue, 16 Jul 2024 07:12:47 +0200 Subject: [PATCH 19/37] test(Dataset): add unittest for the task flow --- hexa/datasets/graphql/schema.graphql | 19 +++- .../migrations/0006_datasetfilemetadata.py | 4 +- hexa/datasets/models.py | 2 +- hexa/datasets/queue.py | 48 ++++----- hexa/datasets/schema/queries.py | 4 +- 
.../datasets/tests/fixtures/example_names.csv | 62 ++++++++++++ .../tests/fixtures/example_names.parquet | Bin 0 -> 3237 bytes hexa/datasets/tests/test_metadata.py | 95 ++++++++++++++++++ hexa/datasets/tests/test_schema.py | 34 ------- requirements.in | 5 +- requirements.txt | 3 + 11 files changed, 204 insertions(+), 72 deletions(-) create mode 100644 hexa/datasets/tests/fixtures/example_names.csv create mode 100644 hexa/datasets/tests/fixtures/example_names.parquet create mode 100644 hexa/datasets/tests/test_metadata.py diff --git a/hexa/datasets/graphql/schema.graphql b/hexa/datasets/graphql/schema.graphql index cb931106c..70ee87904 100644 --- a/hexa/datasets/graphql/schema.graphql +++ b/hexa/datasets/graphql/schema.graphql @@ -424,10 +424,21 @@ type PinDatasetResult { errors: [PinDatasetError!]! } +""" +Statuses that can occur when generating file metadata +""" +enum FileMetadataStatus{ + STATUS_PROCESSING, + STATUS_FAILED, + STATUS_FINISHED +} +""" +Metadata for dataset file +""" type DatasetFileMetadata { content: JSON - status: String! - datasetVersionFile: DatasetVersionFile + status: FileMetadataStatus! + datasetVersionFile: DatasetVersionFile! } @@ -436,8 +447,8 @@ extend type Query { dataset(id: ID!): Dataset "Get a dataset by its slug." datasetVersion(id: ID!): DatasetVersion - "Get a dataset file snapshot by fileSnapshot id or by fileId" - datasetFileSnapshot(id: ID, fileId: ID): DatasetFileMetadata + "Get a dataset file sample by fileSnapshot id or by fileId" + datasetFileSample(id: ID, fileId: ID): DatasetFileMetadata "Get a dataset link by its id." datasetLink(id: ID!): DatasetLink "Get a dataset link by its slug." diff --git a/hexa/datasets/migrations/0006_datasetfilemetadata.py b/hexa/datasets/migrations/0006_datasetfilemetadata.py index e855070f8..350456167 100644 --- a/hexa/datasets/migrations/0006_datasetfilemetadata.py +++ b/hexa/datasets/migrations/0006_datasetfilemetadata.py @@ -1,4 +1,4 @@ -# Generated by Django 5.0.3 on 2024-07-12 08:18 +# Generated by Django 5.0.7 on 2024-07-16 05:13 import uuid @@ -26,7 +26,7 @@ class Migration(migrations.Migration): ), ("created_at", models.DateTimeField(auto_now_add=True)), ("updated_at", models.DateTimeField(auto_now=True)), - ("content", models.JSONField(blank=True, default=list, null=True)), + ("sample", models.JSONField(blank=True, default=list, null=True)), ( "status", models.CharField( diff --git a/hexa/datasets/models.py b/hexa/datasets/models.py index 3d99a65b2..b3fff9f63 100644 --- a/hexa/datasets/models.py +++ b/hexa/datasets/models.py @@ -267,7 +267,7 @@ class DatasetFileMetadata(Base): (STATUS_FINISHED, "Finished"), ] - content = JSONField(blank=True, default=list, null=True) + sample = JSONField(blank=True, default=list, null=True) status = models.CharField( max_length=10, choices=STATUS_CHOICES, diff --git a/hexa/datasets/queue.py b/hexa/datasets/queue.py index e1c5a8047..ba6745279 100644 --- a/hexa/datasets/queue.py +++ b/hexa/datasets/queue.py @@ -14,45 +14,43 @@ logger = getLogger(__name__) -def read_file_content(download_url: str, content_type: str) -> pd.DataFrame: +def read_file_content(download_url: str, filename: str) -> pd.DataFrame: try: - if content_type == "text/csv": + if filename.endswith("csv"): return pd.read_csv(download_url) - elif content_type == "application/octet-stream": - return pd.read_parquet(download_url) + elif filename.endswith("parquet"): + return pd.read_parquet(download_url, engine="pyarrow") else: - raise ValueError(f"Unsupported content type: {content_type}") + raise 
ValueError(f"Unsupported file format: {filename.split('.')[-1]}") except pd.errors.ParserError as e: print(f"Error parsing the file content: {e}") return pd.DataFrame() except ValueError as e: - print(f"Unsupported file content: {e}") + print(f"Cannot read file: {e}") return pd.DataFrame() -def generate_dataset_file_sample_task(queue: AtLeastOnceQueue, job: DatasetFileMetadataJob): - try: - dataset_version_file_id = job.args["file_id"] - dataset_version_file = DatasetVersionFile.objects.get( - id=dataset_version_file_id - ) - logger.info( - f"Creating dataset snapshot for version file {dataset_version_file_id}" - ) - dataset_file_metadata = DatasetFileMetadata.objects.create( - dataset_version_file=dataset_version_file, - status=DatasetFileMetadata.STATUS_PROCESSING, - ) +def generate_dataset_file_sample_task( + queue: AtLeastOnceQueue, job: DatasetFileMetadataJob +): + dataset_version_file_id = job.args["file_id"] + dataset_version_file = DatasetVersionFile.objects.get(id=dataset_version_file_id) + logger.info(f"Creating dataset snapshot for version file {dataset_version_file_id}") + dataset_file_metadata = DatasetFileMetadata.objects.create( + dataset_version_file=dataset_version_file, + status=DatasetFileMetadata.STATUS_PROCESSING, + ) + try: download_url = generate_download_url(dataset_version_file) file_snapshot_df = read_file_content( - download_url, dataset_version_file.content_type + download_url, dataset_version_file.filename ) if not file_snapshot_df.empty: file_snapshot_content = file_snapshot_df.head( settings.WORKSPACE_DATASETS_FILE_SNAPSHOT_SIZE ) - dataset_file_metadata.content = file_snapshot_content.to_json( + dataset_file_metadata.sample = file_snapshot_content.to_json( orient="records" ) logger.info(f"Dataset snapshot saved for file {dataset_version_file_id}") @@ -62,13 +60,7 @@ def generate_dataset_file_sample_task(queue: AtLeastOnceQueue, job: DatasetFileM dataset_file_metadata.save() logger.info("Dataset snapshot created for file {dataset_version_file_id}") except Exception as e: - dataset_version_file_id = job.args["file_id"] - dataset_version_file = DatasetVersionFile.objects.get( - id=dataset_version_file_id - ) - dataset_file_metadata = DatasetFileMetadata.objects.get( - dataset_version_file=dataset_version_file - ) + print(f"Fail : {e}") dataset_file_metadata.status = DatasetFileMetadata.STATUS_FAILED dataset_file_metadata.save() logger.exception( diff --git a/hexa/datasets/schema/queries.py b/hexa/datasets/schema/queries.py index 4012dbac4..efe99ba4e 100644 --- a/hexa/datasets/schema/queries.py +++ b/hexa/datasets/schema/queries.py @@ -42,8 +42,8 @@ def resolve_dataset_version(_, info, **kwargs): return None -@datasets_queries.field("datasetFileSnapshot") -def resolve_dataset_file_snapshot(_, info, **kwargs): +@datasets_queries.field("datasetFileSample") +def resolve_dataset_file_sample(_, info, **kwargs): try: if kwargs.get("file_id"): return DatasetFileMetadata.objects.get( diff --git a/hexa/datasets/tests/fixtures/example_names.csv b/hexa/datasets/tests/fixtures/example_names.csv new file mode 100644 index 000000000..93b20ed12 --- /dev/null +++ b/hexa/datasets/tests/fixtures/example_names.csv @@ -0,0 +1,62 @@ +name,surname +Joe,Doe +Liam,Smith +Emma,Johnson +Noah,Williams +Olivia,Brown +William,Jones +Ava,Garcia +James,Miller +Sophia,Davis +Oliver,Martinez +Isabella,Hernandez +Benjamin,Lopez +Mia,Gonzalez +Elijah,Wilson +Charlotte,Anderson +Lucas,Thomas +Amelia,Taylor +Mason,Moore +Harper,Jackson +Logan,Martin +Evelyn,Lee +Alexander,Perez +Abigail,Thompson 
+Ethan,White +Emily,Harris +Jacob,Sanchez +Ella,Clark +Michael,Ramirez +Avery,Lewis +Daniel,Robinson +Sofia,Walker +Henry,Young +Scarlett,Allen +Jackson,King +Grace,Scott +Sebastian,Green +Victoria,Baker +Aiden,Adams +Chloe,Nelson +Matthew,Hill +Riley,Campbell +Samuel,Mitchell +Aria,Carter +David,Rogers +Lily,Evans +Joseph,Murphy +Layla,Parker +Carter,Roberts +Aubrey,Gonzalez +Owen,Reed +Zoey,Cook +Wyatt,Morgan +Hannah,Murphy +Jack,Howard +Lillian,Richardson +Luke,Cox +Addison,James +Gabriel,Wright +Eleanor,Hughes +Anthony,Butler +Natalie,Foster \ No newline at end of file diff --git a/hexa/datasets/tests/fixtures/example_names.parquet b/hexa/datasets/tests/fixtures/example_names.parquet new file mode 100644 index 0000000000000000000000000000000000000000..733f3e9218c0c1d27f76ccf68cf840e647a62455 GIT binary patch literal 3237 zcmcgvO>7&-72a{VGFMpx>v$t6qCguohyEQ36h#k3-z+KF0^CCp zqYH3n=FOY;^S$rQN;^h9lZYk0tt9?^v6>hqq~|5+7qRuvNRm|XNDPK5#V(4gdDq1# z{>HY4ohgvsWwcN6MVPfHV?t#`HTqaaud3K30h-qvUe}=*O{1GD#E*kwIlv}i3?ns_ zCHx4xl%q|lrd%uK89l;|0y;Sd`^*bNf*NYIXJNUa87^T|KGbSB@Ho1!m$2U@zPu3k zI#|A<=lg^WIV#KJ9An2t)5k#-(*_3-_sw zF2}2s$uH>@FCbk9t?5-fWO(Yuf`j$Q&( zIL8k9oGKzjUz4iyzm;zQgvSJ3NgB3I1$fUTi`eujxPDI0GlIEy?eGo{9gh!1CAV>i z86^oR(M^PlKn5)_m6%GcNmpawY?!<_ffZEOKe_<#5t_PuXAVG1mEK*f{$X?H<=Q>+ z`u=kl>}kAW5)(Q%U?1DBF)H`=i*m1ZB4X6BrsU4IPnD z#dA0$F4#ArX@KUmtnUptQr0UTCxIYy5&M>)D)50Z0Q-fgC;@tPrv~U#PDX;8CE{~I zun|gYRj(_?RMF!j%wTH7!NH{gWq<3$tV8&sy8-@eca%ssVX$!SmOsa#?Fw&{F8LKw1LSQDRwqX;n3B?8+a; zwh8(>T3abW<^rZ1cDp7~)KrazmILjId_8wc!MfLhMfskoDglQrC!HT^v!H9gqnyl6}$ji;VJZ#qqLKL-7x{^}p!{Ny&U z^KJbxJ1+)3sQ84SA5YJGUs?G0`Gvnu3x=L|`80ygZmvqnsQ^H9aUv!q)cFfzzRye5 z6a*OwPF+l7Go#tm?BcU&k!P05%twit(L#LY>sNG1nq5pzb)gGzkbF1xFinB(Cl4%- z^;{mLZ_j<{Vfq+kFiZQGcSw41E-fGg0q-#es~`k6!Ez~@dtI2WWuiF|^UR~yVKFxN zAhJ#-cF&tRgj>vha9%c!>p?i|3V-Pyr!M7Nk3I20;ore5YQyM^n`{Y;PdXD*!c7wBiwz0 z;S(0Y!KmqDe<;xC4&gTP1|k6XP6pKDLYi5-k-4#yhEUTJ?@w$l!`@2zk(5-8di`$e z_P7s{Uy0w0o=(m%j9vJN=QYDvF|r+_U>F@E*NJ4=?797c5a!SJ3(*I_Q+$RmW`Y37wwZIti*dgIc%hj>UIDW(+3lzLhJlmABd_lQkQ! 
z3mf|lVPoQpZFEhy&PcI25`J2@0&2R2@Sp)_i>n#4I0(wQii=y#;GnS4Yqv5?=$X73 zm^rY+vk({f&TK4^{cN;X=FJ{%ZTAm~dsO%a-OwtrL%YNVk=`^<@GPHPWvnMK zeD-liwKxQ9p>B`k;oBK&Qm_9K)5zZcKg88s5_LSX+$P|>F%-B$oB?iChVA{zYJ@Y- zuEKn1-m;>)*lu%ZEwjE!feX3v$u?a%sU8_r-;tn!7ZX`tD}LcGS01w-}>52F_8?;6=%0~)1h7?!am6v5=#q20Zx z^GcC{(^j5%Ek=D3?Tn8+-j|YdUF?6kN5bn?w|hOhSmq`d$K3cznO%hMqxeMT;V+6E H_y^@bddaoI literal 0 HcmV?d00001 diff --git a/hexa/datasets/tests/test_metadata.py b/hexa/datasets/tests/test_metadata.py new file mode 100644 index 000000000..ff960d089 --- /dev/null +++ b/hexa/datasets/tests/test_metadata.py @@ -0,0 +1,95 @@ +import os +from unittest import mock + +import pandas + +from hexa.core.test import TestCase +from hexa.datasets.models import DatasetFileMetadata +from hexa.datasets.queue import create_dataset_file_metadata_task + + +class TestCreateDatasetFileMetadataTask(TestCase): + @mock.patch("hexa.datasets.queue.DatasetVersionFile.objects.get") + @mock.patch("hexa.datasets.queue.DatasetFileMetadata.objects.get") + @mock.patch("hexa.datasets.queue.DatasetFileMetadata.objects.create") + @mock.patch("hexa.datasets.queue.generate_download_url") + def test_create_dataset_file_metadata_task_success( + self, + mock_generate_download_url, + mock_DatasetFileMetadata_create, + mock_DatasetFileMetadata_get, + mock_DatasetVersionFile_get, + ): + test_cases = [ + ("example_names.csv", DatasetFileMetadata.STATUS_FINISHED), + ("example_names.parquet", DatasetFileMetadata.STATUS_FINISHED), + ] + for filename, expected_status in test_cases: + with self.subTest(filename=filename): + dataset_version_file = mock.Mock() + dataset_version_file.id = 1 + dataset_version_file.name = "example_names.csv" + mock_DatasetVersionFile_get.return_value = dataset_version_file + + dataset_file_metadata = mock.Mock() + mock_DatasetFileMetadata_create.return_value = dataset_file_metadata + + fixture_file_path = os.path.join( + os.path.dirname(__file__), f"./fixtures/{filename}" + ) + mock_generate_download_url.return_value = fixture_file_path + + job = mock.Mock() + job.args = {"file_id": dataset_version_file.id} + + create_dataset_file_metadata_task(mock.Mock(), job) + + mock_generate_download_url.assert_called_once_with(dataset_version_file) + mock_DatasetVersionFile_get.assert_called_once_with( + id=dataset_version_file.id + ) + mock_DatasetFileMetadata_create.assert_called_once_with( + dataset_version_file=dataset_version_file, + status=DatasetFileMetadata.STATUS_PROCESSING, + ) + dataset_file_metadata.save.assert_called() + self.assertEqual(dataset_file_metadata.status, expected_status) + self.assertEqual( + dataset_file_metadata.sample, + pandas.read_csv(fixture_file_path) + .head(50) + .to_json(orient="records"), + ) + + @mock.patch("hexa.datasets.models.DatasetVersionFile") + @mock.patch("hexa.datasets.models.DatasetFileMetadata") + @mock.patch("hexa.datasets.api.generate_download_url") + def test_create_dataset_file_metadata_task_failure( + self, + mock_generate_download_url, + mock_DatasetFileMetadata, + mock_DatasetVersionFile, + ): + # Mock dataset version file + dataset_version_file = mock.Mock() + dataset_version_file.id = 1 + mock_DatasetVersionFile.objects.get.return_value = dataset_version_file + + dataset_file_metadata = mock.Mock() + mock_DatasetFileMetadata.objects.get.return_value = dataset_file_metadata + mock_generate_download_url.side_effect = Exception("Failed to generate URL") + + job = mock.Mock() + job.args = {"file_id": dataset_version_file.id} + create_dataset_file_metadata_task(mock.Mock(), job) + + 
mock_DatasetVersionFile.objects.get.assert_called_with( + id=dataset_version_file.id + ) + mock_DatasetFileMetadata.objects.get.assert_called_with( + dataset_version_file=dataset_version_file + ) + dataset_file_metadata.save.assert_called() + self.assertEqual( + dataset_file_metadata.status, DatasetFileMetadata.STATUS_FAILED + ) diff --git a/hexa/datasets/tests/test_schema.py b/hexa/datasets/tests/test_schema.py index 3c17c3c78..faf7b6f38 100644 --- a/hexa/datasets/tests/test_schema.py +++ b/hexa/datasets/tests/test_schema.py @@ -599,37 +599,3 @@ def test_prepare_version_file_download_linked_dataset(self): }, r["data"]["prepareVersionFileDownload"], ) - - @mock_gcp_storage - def test_prepare_version_file(self): - serena = self.create_user("sereba@blsq.org", is_superuser=True) - src_workspace = self.create_workspace( - serena, - name="Source Workspace", - description="Test workspace", - ) - dataset = self.create_dataset( - serena, src_workspace, "Dataset", "Dataset description" - ) - dataset_version = self.create_dataset_version(serena, dataset=dataset) - self.client.force_login(serena) - - r = self.run_query( - """ - mutation CreateDatasetVersionFile ($input: CreateDatasetVersionFileInput!) { - createDatasetVersionFile(input: $input) { - success - errors - } - } - """, - { - "input": { - "versionId": str(dataset_version.id), - "contentType": "text/csv", - "uri": f"{dataset_version.id}/demo_file.csv", - } - }, - ) - print(r) - # while queue.run_once(): pass diff --git a/requirements.in b/requirements.in index 471996b95..d7445ca55 100644 --- a/requirements.in +++ b/requirements.in @@ -59,4 +59,7 @@ requests==2.32.3 # analytics mixpanel -ua-parser \ No newline at end of file +ua-parser + +# +pyarrow \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index e30bcdd80..73f891c88 100644 --- a/requirements.txt +++ b/requirements.txt @@ -198,6 +198,7 @@ numpy==2.0.1 # via # geopandas # pandas + # pyarrow # pyogrio # rasterio # shapely @@ -241,6 +242,8 @@ psycopg2-binary==2.9.9 # via -r requirements.in py-partiql-parser==0.5.5 # via moto +pyarrow==17.0.0 + # via -r requirements.in pyasn1==0.6.0 # via # pyasn1-modules From 06b427c6c7d855c0c06e38df7da1e42b302e4875 Mon Sep 17 00:00:00 2001 From: nazarfil Date: Tue, 16 Jul 2024 09:48:10 +0200 Subject: [PATCH 20/37] fix(Dataset): fixed parquet file reading --- hexa/datasets/queue.py | 16 ++++++++-------- hexa/datasets/tests/test_metadata.py | 26 ++++++++++++++++---------- 2 files changed, 24 insertions(+), 18 deletions(-) diff --git a/hexa/datasets/queue.py b/hexa/datasets/queue.py index ba6745279..71521016f 100644 --- a/hexa/datasets/queue.py +++ b/hexa/datasets/queue.py @@ -14,14 +14,17 @@ logger = getLogger(__name__) -def read_file_content(download_url: str, filename: str) -> pd.DataFrame: +def download_file_sample(dataset_version_file: DatasetVersionFile) -> pd.DataFrame: + filename = dataset_version_file.filename + file_format = filename.split(".")[-1] try: - if filename.endswith("csv"): + download_url = generate_download_url(dataset_version_file) + if file_format == "csv": return pd.read_csv(download_url) - elif filename.endswith("parquet"): + elif file_format == "parquet": return pd.read_parquet(download_url, engine="pyarrow") else: - raise ValueError(f"Unsupported file format: {filename.split('.')[-1]}") + raise ValueError(f"Unsupported file format: {file_format}") except pd.errors.ParserError as e: print(f"Error parsing the file content: {e}") return pd.DataFrame() @@ -42,10 +45,7 @@ def 
generate_dataset_file_sample_task( ) try: - download_url = generate_download_url(dataset_version_file) - file_snapshot_df = read_file_content( - download_url, dataset_version_file.filename - ) + file_snapshot_df = download_file_sample(dataset_version_file) if not file_snapshot_df.empty: file_snapshot_content = file_snapshot_df.head( settings.WORKSPACE_DATASETS_FILE_SNAPSHOT_SIZE diff --git a/hexa/datasets/tests/test_metadata.py b/hexa/datasets/tests/test_metadata.py index ff960d089..8dd8cb5e1 100644 --- a/hexa/datasets/tests/test_metadata.py +++ b/hexa/datasets/tests/test_metadata.py @@ -1,11 +1,9 @@ import os from unittest import mock -import pandas - from hexa.core.test import TestCase from hexa.datasets.models import DatasetFileMetadata -from hexa.datasets.queue import create_dataset_file_metadata_task +from hexa.datasets.queue import generate_dataset_file_metadata_task class TestCreateDatasetFileMetadataTask(TestCase): @@ -28,7 +26,7 @@ def test_create_dataset_file_metadata_task_success( with self.subTest(filename=filename): dataset_version_file = mock.Mock() dataset_version_file.id = 1 - dataset_version_file.name = "example_names.csv" + dataset_version_file.filename = f"{filename}" mock_DatasetVersionFile_get.return_value = dataset_version_file dataset_file_metadata = mock.Mock() @@ -42,7 +40,7 @@ def test_create_dataset_file_metadata_task_success( job = mock.Mock() job.args = {"file_id": dataset_version_file.id} - create_dataset_file_metadata_task(mock.Mock(), job) + generate_dataset_file_metadata_task(mock.Mock(), job) mock_generate_download_url.assert_called_once_with(dataset_version_file) mock_DatasetVersionFile_get.assert_called_once_with( @@ -54,13 +52,21 @@ def test_create_dataset_file_metadata_task_success( ) dataset_file_metadata.save.assert_called() self.assertEqual(dataset_file_metadata.status, expected_status) + expected_content = ( + '[{"name":"Joe","surname":"Doe"},' + '{"name":"Liam","surname":"Smith"},' + '{"name":"Emma","surname":"Johnson"},' + ) self.assertEqual( - dataset_file_metadata.sample, - pandas.read_csv(fixture_file_path) - .head(50) - .to_json(orient="records"), + dataset_file_metadata.sample[0 : len(expected_content)], + expected_content, ) + mock_generate_download_url.reset_mock() + mock_DatasetVersionFile_get.reset_mock() + mock_DatasetFileMetadata_create.reset_mock() + dataset_file_metadata.save.reset_mock() + @mock.patch("hexa.datasets.models.DatasetVersionFile") @mock.patch("hexa.datasets.models.DatasetFileMetadata") @mock.patch("hexa.datasets.api.generate_download_url") @@ -81,7 +87,7 @@ def test_create_dataset_file_metadata_task_failure( job = mock.Mock() job.args = {"file_id": dataset_version_file.id} - create_dataset_file_metadata_task(mock.Mock(), job) + generate_dataset_file_metadata_task(mock.Mock(), job) mock_DatasetVersionFile.objects.get.assert_called_with( id=dataset_version_file.id From 090180908a78a362027fae7caa6c8cec2e4ac8ab Mon Sep 17 00:00:00 2001 From: nazarfil Date: Tue, 16 Jul 2024 11:11:44 +0200 Subject: [PATCH 21/37] fix(Dataset): added corner cases for failures --- .../migrations/0006_datasetfilemetadata.py | 3 +- hexa/datasets/models.py | 1 + hexa/datasets/queue.py | 84 +++++++++++++------ hexa/datasets/tests/test_metadata.py | 29 +++---- 4 files changed, 75 insertions(+), 42 deletions(-) diff --git a/hexa/datasets/migrations/0006_datasetfilemetadata.py b/hexa/datasets/migrations/0006_datasetfilemetadata.py index 350456167..97784a1cc 100644 --- a/hexa/datasets/migrations/0006_datasetfilemetadata.py +++ 
b/hexa/datasets/migrations/0006_datasetfilemetadata.py @@ -1,4 +1,4 @@ -# Generated by Django 5.0.7 on 2024-07-16 05:13 +# Generated by Django 5.0.7 on 2024-07-16 08:34 import uuid @@ -39,6 +39,7 @@ class Migration(migrations.Migration): max_length=10, ), ), + ("status_reason", models.TextField(blank=True, null=True)), ( "dataset_version_file", models.ForeignKey( diff --git a/hexa/datasets/models.py b/hexa/datasets/models.py index b3fff9f63..4d9ab886d 100644 --- a/hexa/datasets/models.py +++ b/hexa/datasets/models.py @@ -273,6 +273,7 @@ class DatasetFileMetadata(Base): choices=STATUS_CHOICES, default=STATUS_PROCESSING, ) + status_reason = models.TextField(blank=True, null=True) dataset_version_file = models.ForeignKey( DatasetVersionFile, null=False, diff --git a/hexa/datasets/queue.py b/hexa/datasets/queue.py index 71521016f..678ae3f51 100644 --- a/hexa/datasets/queue.py +++ b/hexa/datasets/queue.py @@ -2,6 +2,8 @@ import pandas as pd from django.conf import settings +from django.core.exceptions import ObjectDoesNotExist, ValidationError +from django.db import DatabaseError, IntegrityError from dpq.queue import AtLeastOnceQueue from hexa.datasets.api import generate_download_url @@ -10,61 +12,95 @@ DatasetFileMetadataJob, DatasetVersionFile, ) +from hexa.files.api import get_storage logger = getLogger(__name__) -def download_file_sample(dataset_version_file: DatasetVersionFile) -> pd.DataFrame: +def download_file_sample(dataset_version_file: DatasetVersionFile) -> dict: filename = dataset_version_file.filename file_format = filename.split(".")[-1] try: download_url = generate_download_url(dataset_version_file) if file_format == "csv": - return pd.read_csv(download_url) + print(f"File {filename} format: {file_format}") + csv_sample = pd.read_csv(download_url) + return {"success": True, "sample": csv_sample} elif file_format == "parquet": - return pd.read_parquet(download_url, engine="pyarrow") + print(f"File {filename} format: {file_format}") + parquet_sample = pd.read_parquet(download_url) + return {"success": True, "sample": parquet_sample} else: raise ValueError(f"Unsupported file format: {file_format}") except pd.errors.ParserError as e: - print(f"Error parsing the file content: {e}") - return pd.DataFrame() + logger.error(f"Error parsing the file {filename} content: {e}") + return {"success": False, "errors": ["FILE_PARSING_ERROR"]} except ValueError as e: - print(f"Cannot read file: {e}") - return pd.DataFrame() + logger.error(f"Cannot read file {filename}: {e}") + return {"success": False, "errors": ["FILE_FORMAT_NOT_SUPPORTED"]} + except get_storage().exceptions.NotFound: + logger.error(f"Cannot find file {filename}") + return {"success": False, "errors": ["FILE_NOT_FOUND"]} def generate_dataset_file_sample_task( queue: AtLeastOnceQueue, job: DatasetFileMetadataJob ): dataset_version_file_id = job.args["file_id"] - dataset_version_file = DatasetVersionFile.objects.get(id=dataset_version_file_id) - logger.info(f"Creating dataset snapshot for version file {dataset_version_file_id}") - dataset_file_metadata = DatasetFileMetadata.objects.create( - dataset_version_file=dataset_version_file, - status=DatasetFileMetadata.STATUS_PROCESSING, - ) + try: + print( + f"Calling {DatasetVersionFile.objects.get} with id {dataset_version_file_id}" + ) + dataset_version_file = DatasetVersionFile.objects.get( + id=dataset_version_file_id + ) + except ObjectDoesNotExist as e: + logger.error( + f"DatasetVersionFile with id {dataset_version_file_id} does not exist: {e}" + ) + return + 
logger.info(f"Creating dataset sample for version file {dataset_version_file_id}") try: - file_snapshot_df = download_file_sample(dataset_version_file) - if not file_snapshot_df.empty: - file_snapshot_content = file_snapshot_df.head( + dataset_file_metadata = DatasetFileMetadata.objects.create( + dataset_version_file=dataset_version_file, + status=DatasetFileMetadata.STATUS_PROCESSING, + ) + except (IntegrityError, DatabaseError, ValidationError) as e: + logger.error(f"Error creating DatasetFileMetadata: {e}") + return + + try: + file_sample = download_file_sample(dataset_version_file) + if file_sample["success"]: + file_snapshot_content = file_sample["sample"].head( settings.WORKSPACE_DATASETS_FILE_SNAPSHOT_SIZE ) dataset_file_metadata.sample = file_snapshot_content.to_json( orient="records" ) - logger.info(f"Dataset snapshot saved for file {dataset_version_file_id}") + logger.info(f"Dataset sample saved for file {dataset_version_file_id}") + dataset_file_metadata.status = DatasetFileMetadata.STATUS_FINISHED + dataset_file_metadata.save() + logger.info(f"Dataset sample created for file {dataset_version_file_id}") else: - logger.info(f"Dataset snapshot is empty for file {dataset_version_file_id}") - dataset_file_metadata.status = DatasetFileMetadata.STATUS_FINISHED - dataset_file_metadata.save() - logger.info("Dataset snapshot created for file {dataset_version_file_id}") + dataset_file_metadata.status = DatasetFileMetadata.STATUS_FAILED + dataset_file_metadata.status_reason = file_sample["errors"] + dataset_file_metadata.save() + logger.info( + f'Dataset file sample creation failed for file {dataset_version_file_id} with error {file_sample["errors"]}' + ) except Exception as e: - print(f"Fail : {e}") dataset_file_metadata.status = DatasetFileMetadata.STATUS_FAILED - dataset_file_metadata.save() + dataset_file_metadata.status_reason = str(e) + try: + dataset_file_metadata.save() + except (IntegrityError, DatabaseError, ValidationError) as save_error: + logger.error( + f"Error saving DatasetFileMetadata after failure: {save_error}" + ) logger.exception( - f"Dataset file snapshot creation failed for file {dataset_version_file_id}: {e}" + f"Dataset file sample creation failed for file {dataset_version_file_id}: {e}" ) diff --git a/hexa/datasets/tests/test_metadata.py b/hexa/datasets/tests/test_metadata.py index 8dd8cb5e1..e269095a5 100644 --- a/hexa/datasets/tests/test_metadata.py +++ b/hexa/datasets/tests/test_metadata.py @@ -4,18 +4,17 @@ from hexa.core.test import TestCase from hexa.datasets.models import DatasetFileMetadata from hexa.datasets.queue import generate_dataset_file_metadata_task +from hexa.files.api import get_storage class TestCreateDatasetFileMetadataTask(TestCase): @mock.patch("hexa.datasets.queue.DatasetVersionFile.objects.get") - @mock.patch("hexa.datasets.queue.DatasetFileMetadata.objects.get") @mock.patch("hexa.datasets.queue.DatasetFileMetadata.objects.create") @mock.patch("hexa.datasets.queue.generate_download_url") def test_create_dataset_file_metadata_task_success( self, mock_generate_download_url, mock_DatasetFileMetadata_create, - mock_DatasetFileMetadata_get, mock_DatasetVersionFile_get, ): test_cases = [ @@ -67,34 +66,30 @@ def test_create_dataset_file_metadata_task_success( mock_DatasetFileMetadata_create.reset_mock() dataset_file_metadata.save.reset_mock() - @mock.patch("hexa.datasets.models.DatasetVersionFile") - @mock.patch("hexa.datasets.models.DatasetFileMetadata") - @mock.patch("hexa.datasets.api.generate_download_url") + 
@mock.patch("hexa.datasets.queue.DatasetVersionFile.objects.get") + @mock.patch("hexa.datasets.queue.DatasetFileMetadata.objects.create") + @mock.patch("hexa.datasets.queue.generate_download_url") def test_create_dataset_file_metadata_task_failure( self, mock_generate_download_url, - mock_DatasetFileMetadata, - mock_DatasetVersionFile, + mock_DatasetFileMetadata_create, + mock_DatasetVersionFile_get, ): - # Mock dataset version file dataset_version_file = mock.Mock() dataset_version_file.id = 1 - mock_DatasetVersionFile.objects.get.return_value = dataset_version_file + dataset_version_file.filename = "example_names.csv" + mock_DatasetVersionFile_get.return_value = dataset_version_file dataset_file_metadata = mock.Mock() - mock_DatasetFileMetadata.objects.get.return_value = dataset_file_metadata - mock_generate_download_url.side_effect = Exception("Failed to generate URL") + mock_DatasetFileMetadata_create.return_value = dataset_file_metadata + + mock_generate_download_url.side_effect = get_storage().exceptions.NotFound job = mock.Mock() job.args = {"file_id": dataset_version_file.id} generate_dataset_file_metadata_task(mock.Mock(), job) - mock_DatasetVersionFile.objects.get.assert_called_with( - id=dataset_version_file.id - ) - mock_DatasetFileMetadata.objects.get.assert_called_with( - dataset_version_file=dataset_version_file - ) + mock_DatasetVersionFile_get.assert_called_with(id=dataset_version_file.id) dataset_file_metadata.save.assert_called() self.assertEqual( dataset_file_metadata.status, DatasetFileMetadata.STATUS_FAILED From 5817d6f5c3865893f1021301bc875f8337f90266 Mon Sep 17 00:00:00 2001 From: nazarfil Date: Tue, 16 Jul 2024 13:30:33 +0200 Subject: [PATCH 22/37] fix(Dataset): move from head to sample --- hexa/datasets/queue.py | 25 ++++++++++--------------- hexa/datasets/tests/test_metadata.py | 6 +----- 2 files changed, 11 insertions(+), 20 deletions(-) diff --git a/hexa/datasets/queue.py b/hexa/datasets/queue.py index 678ae3f51..15fb41d5d 100644 --- a/hexa/datasets/queue.py +++ b/hexa/datasets/queue.py @@ -17,7 +17,7 @@ logger = getLogger(__name__) -def download_file_sample(dataset_version_file: DatasetVersionFile) -> dict: +def download_file_as_dataframe(dataset_version_file: DatasetVersionFile) -> dict: filename = dataset_version_file.filename file_format = filename.split(".")[-1] try: @@ -25,11 +25,11 @@ def download_file_sample(dataset_version_file: DatasetVersionFile) -> dict: if file_format == "csv": print(f"File {filename} format: {file_format}") csv_sample = pd.read_csv(download_url) - return {"success": True, "sample": csv_sample} + return {"success": True, "data": csv_sample} elif file_format == "parquet": print(f"File {filename} format: {file_format}") parquet_sample = pd.read_parquet(download_url) - return {"success": True, "sample": parquet_sample} + return {"success": True, "data": parquet_sample} else: raise ValueError(f"Unsupported file format: {file_format}") except pd.errors.ParserError as e: @@ -48,9 +48,6 @@ def generate_dataset_file_sample_task( ): dataset_version_file_id = job.args["file_id"] try: - print( - f"Calling {DatasetVersionFile.objects.get} with id {dataset_version_file_id}" - ) dataset_version_file = DatasetVersionFile.objects.get( id=dataset_version_file_id ) @@ -71,24 +68,22 @@ def generate_dataset_file_sample_task( return try: - file_sample = download_file_sample(dataset_version_file) - if file_sample["success"]: - file_snapshot_content = file_sample["sample"].head( - settings.WORKSPACE_DATASETS_FILE_SNAPSHOT_SIZE - ) - 
dataset_file_metadata.sample = file_snapshot_content.to_json( - orient="records" + source_file = download_file_as_dataframe(dataset_version_file) + if source_file["success"]: + file_sample = source_file["data"].sample( + settings.WORKSPACE_DATASETS_FILE_SNAPSHOT_SIZE, random_state=22 ) + dataset_file_metadata.sample = file_sample.to_json(orient="records") logger.info(f"Dataset sample saved for file {dataset_version_file_id}") dataset_file_metadata.status = DatasetFileMetadata.STATUS_FINISHED dataset_file_metadata.save() logger.info(f"Dataset sample created for file {dataset_version_file_id}") else: dataset_file_metadata.status = DatasetFileMetadata.STATUS_FAILED - dataset_file_metadata.status_reason = file_sample["errors"] + dataset_file_metadata.status_reason = source_file["errors"] dataset_file_metadata.save() logger.info( - f'Dataset file sample creation failed for file {dataset_version_file_id} with error {file_sample["errors"]}' + f'Dataset file sample creation failed for file {dataset_version_file_id} with error {source_file["errors"]}' ) except Exception as e: dataset_file_metadata.status = DatasetFileMetadata.STATUS_FAILED diff --git a/hexa/datasets/tests/test_metadata.py b/hexa/datasets/tests/test_metadata.py index e269095a5..f7c997f67 100644 --- a/hexa/datasets/tests/test_metadata.py +++ b/hexa/datasets/tests/test_metadata.py @@ -51,11 +51,7 @@ def test_create_dataset_file_metadata_task_success( ) dataset_file_metadata.save.assert_called() self.assertEqual(dataset_file_metadata.status, expected_status) - expected_content = ( - '[{"name":"Joe","surname":"Doe"},' - '{"name":"Liam","surname":"Smith"},' - '{"name":"Emma","surname":"Johnson"},' - ) + expected_content = '[{"name":"Sofia","surname":"Walker"},{"name":"Aiden","surname":"Adams"},{"name":"Eleanor","surname":"H' self.assertEqual( dataset_file_metadata.sample[0 : len(expected_content)], expected_content, From d03d879594ba5e2ec1d2f33b39f1bbe6f574dde1 Mon Sep 17 00:00:00 2001 From: nazarfil Date: Tue, 16 Jul 2024 13:48:01 +0200 Subject: [PATCH 23/37] fix(Dataset): tests handled exceptions --- hexa/datasets/queue.py | 4 +- .../tests/fixtures/example_names_2_lines.csv | 3 + hexa/datasets/tests/test_metadata.py | 64 +++++++++++++------ 3 files changed, 51 insertions(+), 20 deletions(-) create mode 100644 hexa/datasets/tests/fixtures/example_names_2_lines.csv diff --git a/hexa/datasets/queue.py b/hexa/datasets/queue.py index 15fb41d5d..c5fefc09a 100644 --- a/hexa/datasets/queue.py +++ b/hexa/datasets/queue.py @@ -71,7 +71,9 @@ def generate_dataset_file_sample_task( source_file = download_file_as_dataframe(dataset_version_file) if source_file["success"]: file_sample = source_file["data"].sample( - settings.WORKSPACE_DATASETS_FILE_SNAPSHOT_SIZE, random_state=22 + settings.WORKSPACE_DATASETS_FILE_SNAPSHOT_SIZE, + random_state=22, + replace=True, ) dataset_file_metadata.sample = file_sample.to_json(orient="records") logger.info(f"Dataset sample saved for file {dataset_version_file_id}") diff --git a/hexa/datasets/tests/fixtures/example_names_2_lines.csv b/hexa/datasets/tests/fixtures/example_names_2_lines.csv new file mode 100644 index 000000000..df110065d --- /dev/null +++ b/hexa/datasets/tests/fixtures/example_names_2_lines.csv @@ -0,0 +1,3 @@ +name,surname +Joe,Doe +Liam,Smith \ No newline at end of file diff --git a/hexa/datasets/tests/test_metadata.py b/hexa/datasets/tests/test_metadata.py index f7c997f67..2446f44ed 100644 --- a/hexa/datasets/tests/test_metadata.py +++ b/hexa/datasets/tests/test_metadata.py @@ -1,6 +1,8 @@ import 
os from unittest import mock +from pandas.errors import ParserError + from hexa.core.test import TestCase from hexa.datasets.models import DatasetFileMetadata from hexa.datasets.queue import generate_dataset_file_metadata_task @@ -18,10 +20,23 @@ def test_create_dataset_file_metadata_task_success( mock_DatasetVersionFile_get, ): test_cases = [ - ("example_names.csv", DatasetFileMetadata.STATUS_FINISHED), - ("example_names.parquet", DatasetFileMetadata.STATUS_FINISHED), + ( + "example_names.csv", + DatasetFileMetadata.STATUS_FINISHED, + '[{"name":"Jack","surname":"Howard"},{"name":"Olivia","surname":"Brown"},{"name":"Lily","surname":"Evan', + ), + ( + "example_names_2_lines.csv", + DatasetFileMetadata.STATUS_FINISHED, + '[{"name":"Liam","surname":"Smith"},{"name":"Joe","surname":"Doe"},{"name":"Joe","surname":"Doe"},{"nam', + ), + ( + "example_names.parquet", + DatasetFileMetadata.STATUS_FINISHED, + '[{"name":"Jack","surname":"Howard"},{"name":"Olivia","surname":"Brown"},{"name":"Lily","surname":"Evan', + ), ] - for filename, expected_status in test_cases: + for filename, expected_status, expected_content in test_cases: with self.subTest(filename=filename): dataset_version_file = mock.Mock() dataset_version_file.id = 1 @@ -51,7 +66,6 @@ def test_create_dataset_file_metadata_task_success( ) dataset_file_metadata.save.assert_called() self.assertEqual(dataset_file_metadata.status, expected_status) - expected_content = '[{"name":"Sofia","surname":"Walker"},{"name":"Aiden","surname":"Adams"},{"name":"Eleanor","surname":"H' self.assertEqual( dataset_file_metadata.sample[0 : len(expected_content)], expected_content, @@ -71,22 +85,34 @@ def test_create_dataset_file_metadata_task_failure( mock_DatasetFileMetadata_create, mock_DatasetVersionFile_get, ): - dataset_version_file = mock.Mock() - dataset_version_file.id = 1 - dataset_version_file.filename = "example_names.csv" - mock_DatasetVersionFile_get.return_value = dataset_version_file + test_cases = [ + (get_storage().exceptions.NotFound, DatasetFileMetadata.STATUS_FAILED), + (ValueError, DatasetFileMetadata.STATUS_FAILED), + (ParserError, DatasetFileMetadata.STATUS_FAILED), + ] + for exception, expected_status in test_cases: + with self.subTest(exception=exception): + dataset_version_file = mock.Mock() + dataset_version_file.id = 1 + dataset_version_file.filename = "example_names.csv" + mock_DatasetVersionFile_get.return_value = dataset_version_file - dataset_file_metadata = mock.Mock() - mock_DatasetFileMetadata_create.return_value = dataset_file_metadata + dataset_file_metadata = mock.Mock() + mock_DatasetFileMetadata_create.return_value = dataset_file_metadata - mock_generate_download_url.side_effect = get_storage().exceptions.NotFound + mock_generate_download_url.side_effect = exception + + job = mock.Mock() + job.args = {"file_id": dataset_version_file.id} + generate_dataset_file_metadata_task(mock.Mock(), job) - job = mock.Mock() - job.args = {"file_id": dataset_version_file.id} - generate_dataset_file_metadata_task(mock.Mock(), job) + mock_DatasetVersionFile_get.assert_called_with( + id=dataset_version_file.id + ) + dataset_file_metadata.save.assert_called() + self.assertEqual(dataset_file_metadata.status, expected_status) - mock_DatasetVersionFile_get.assert_called_with(id=dataset_version_file.id) - dataset_file_metadata.save.assert_called() - self.assertEqual( - dataset_file_metadata.status, DatasetFileMetadata.STATUS_FAILED - ) + mock_generate_download_url.reset_mock() + mock_DatasetVersionFile_get.reset_mock() + 
mock_DatasetFileMetadata_create.reset_mock() + dataset_file_metadata.save.reset_mock() From e2f1ad7513d786a41b38fa40cb01fa03aca5d305 Mon Sep 17 00:00:00 2001 From: nazarfil Date: Tue, 16 Jul 2024 14:46:40 +0200 Subject: [PATCH 24/37] fix(Dataset): tests file with no columns and no rows --- hexa/datasets/queue.py | 35 ++++++++------ .../tests/fixtures/example_empty_file.csv | 0 .../tests/fixtures/example_names_0_lines.csv | 1 + hexa/datasets/tests/test_metadata.py | 47 +++++++++++++++++++ 4 files changed, 68 insertions(+), 15 deletions(-) create mode 100644 hexa/datasets/tests/fixtures/example_empty_file.csv create mode 100644 hexa/datasets/tests/fixtures/example_names_0_lines.csv diff --git a/hexa/datasets/queue.py b/hexa/datasets/queue.py index c5fefc09a..63b24ce82 100644 --- a/hexa/datasets/queue.py +++ b/hexa/datasets/queue.py @@ -1,3 +1,4 @@ +import json from logging import getLogger import pandas as pd @@ -22,22 +23,21 @@ def download_file_as_dataframe(dataset_version_file: DatasetVersionFile) -> dict file_format = filename.split(".")[-1] try: download_url = generate_download_url(dataset_version_file) + sample = None if file_format == "csv": - print(f"File {filename} format: {file_format}") - csv_sample = pd.read_csv(download_url) - return {"success": True, "data": csv_sample} + sample = pd.read_csv(download_url) elif file_format == "parquet": - print(f"File {filename} format: {file_format}") - parquet_sample = pd.read_parquet(download_url) - return {"success": True, "data": parquet_sample} + sample = pd.read_parquet(download_url) else: raise ValueError(f"Unsupported file format: {file_format}") + return {"success": True, "data": sample} + except pd.errors.ParserError as e: logger.error(f"Error parsing the file {filename} content: {e}") - return {"success": False, "errors": ["FILE_PARSING_ERROR"]} + return {"success": False, "errors": [f"FILE_PARSING_ERROR: {str(e)}"]} except ValueError as e: logger.error(f"Cannot read file {filename}: {e}") - return {"success": False, "errors": ["FILE_FORMAT_NOT_SUPPORTED"]} + return {"success": False, "errors": [f"FILE_NOT_SUPPORTED : {str(e)}"]} except get_storage().exceptions.NotFound: logger.error(f"Cannot find file {filename}") return {"success": False, "errors": ["FILE_NOT_FOUND"]} @@ -70,20 +70,25 @@ def generate_dataset_file_sample_task( try: source_file = download_file_as_dataframe(dataset_version_file) if source_file["success"]: - file_sample = source_file["data"].sample( - settings.WORKSPACE_DATASETS_FILE_SNAPSHOT_SIZE, - random_state=22, - replace=True, - ) - dataset_file_metadata.sample = file_sample.to_json(orient="records") + file_content = source_file["data"] + if not file_content.empty: + file_sample = file_content.sample( + settings.WORKSPACE_DATASETS_FILE_SNAPSHOT_SIZE, + random_state=22, + replace=True, + ) + dataset_file_metadata.sample = file_sample.to_json(orient="records") + else: + dataset_file_metadata.sample = json.dumps([]) logger.info(f"Dataset sample saved for file {dataset_version_file_id}") dataset_file_metadata.status = DatasetFileMetadata.STATUS_FINISHED dataset_file_metadata.save() logger.info(f"Dataset sample created for file {dataset_version_file_id}") else: dataset_file_metadata.status = DatasetFileMetadata.STATUS_FAILED - dataset_file_metadata.status_reason = source_file["errors"] + dataset_file_metadata.status_reason = str(source_file["errors"]) dataset_file_metadata.save() + print(str(source_file["errors"])) logger.info( f'Dataset file sample creation failed for file {dataset_version_file_id} with error 
{source_file["errors"]}' ) diff --git a/hexa/datasets/tests/fixtures/example_empty_file.csv b/hexa/datasets/tests/fixtures/example_empty_file.csv new file mode 100644 index 000000000..e69de29bb diff --git a/hexa/datasets/tests/fixtures/example_names_0_lines.csv b/hexa/datasets/tests/fixtures/example_names_0_lines.csv new file mode 100644 index 000000000..5c7077c0c --- /dev/null +++ b/hexa/datasets/tests/fixtures/example_names_0_lines.csv @@ -0,0 +1 @@ +name,surname \ No newline at end of file diff --git a/hexa/datasets/tests/test_metadata.py b/hexa/datasets/tests/test_metadata.py index 2446f44ed..c1da6dba2 100644 --- a/hexa/datasets/tests/test_metadata.py +++ b/hexa/datasets/tests/test_metadata.py @@ -30,6 +30,11 @@ def test_create_dataset_file_metadata_task_success( DatasetFileMetadata.STATUS_FINISHED, '[{"name":"Liam","surname":"Smith"},{"name":"Joe","surname":"Doe"},{"name":"Joe","surname":"Doe"},{"nam', ), + ( + "example_names_0_lines.csv", + DatasetFileMetadata.STATUS_FINISHED, + "[]", + ), ( "example_names.parquet", DatasetFileMetadata.STATUS_FINISHED, @@ -116,3 +121,45 @@ def test_create_dataset_file_metadata_task_failure( mock_DatasetVersionFile_get.reset_mock() mock_DatasetFileMetadata_create.reset_mock() dataset_file_metadata.save.reset_mock() + + @mock.patch("hexa.datasets.queue.DatasetVersionFile.objects.get") + @mock.patch("hexa.datasets.queue.DatasetFileMetadata.objects.create") + @mock.patch("hexa.datasets.queue.generate_download_url") + def test_create_dataset_file_metadata_task_failure_empty_file( + self, + mock_generate_download_url, + mock_DatasetFileMetadata_create, + mock_DatasetVersionFile_get, + ): + dataset_version_file = mock.Mock() + dataset_version_file.id = 1 + dataset_version_file.filename = "example_empty_file.csv" + mock_DatasetVersionFile_get.return_value = dataset_version_file + + dataset_file_metadata = mock.Mock() + mock_DatasetFileMetadata_create.return_value = dataset_file_metadata + + fixture_file_path = os.path.join( + os.path.dirname(__file__), "./fixtures/example_empty_file.csv" + ) + mock_generate_download_url.return_value = fixture_file_path + + job = mock.Mock() + job.args = {"file_id": dataset_version_file.id} + + generate_dataset_file_metadata_task(mock.Mock(), job) + + mock_generate_download_url.assert_called_once_with(dataset_version_file) + mock_DatasetVersionFile_get.assert_called_once_with(id=dataset_version_file.id) + mock_DatasetFileMetadata_create.assert_called_once_with( + dataset_version_file=dataset_version_file, + status=DatasetFileMetadata.STATUS_PROCESSING, + ) + dataset_file_metadata.save.assert_called() + self.assertEqual( + dataset_file_metadata.status, DatasetFileMetadata.STATUS_FAILED + ) + self.assertEqual( + dataset_file_metadata.status_reason, + "['FILE_NOT_SUPPORTED : No columns to parse from file']", + ) From ab38d4d7daa100fec18e82f235344c143e4aaaa0 Mon Sep 17 00:00:00 2001 From: nazarfil Date: Tue, 16 Jul 2024 15:54:05 +0200 Subject: [PATCH 25/37] fix(Dataset): fixes grpahql exposed api --- hexa/datasets/graphql/schema.graphql | 40 ++++++++++++++-------------- hexa/datasets/queue.py | 3 ++- hexa/datasets/schema/queries.py | 16 ----------- 3 files changed, 22 insertions(+), 37 deletions(-) diff --git a/hexa/datasets/graphql/schema.graphql b/hexa/datasets/graphql/schema.graphql index 70ee87904..2697ba6ab 100644 --- a/hexa/datasets/graphql/schema.graphql +++ b/hexa/datasets/graphql/schema.graphql @@ -91,6 +91,25 @@ type DatasetVersionPermissions { download: Boolean! 
} +""" +Statuses that can occur when generating file metadata +""" +enum FileMetadataStatus{ + STATUS_PROCESSING, + STATUS_FAILED, + STATUS_FINISHED +} + +""" +Metadata for dataset file +""" +type DatasetFileMetadata { + content: JSON! + status: FileMetadataStatus! + statusReason: String + datasetVersionFile: DatasetVersionFile! +} + """ A file in a dataset version. """ @@ -101,6 +120,7 @@ type DatasetVersionFile { createdAt: DateTime! createdBy: User contentType: String! + fileMetadata: DatasetFileMetadata } """ @@ -424,31 +444,11 @@ type PinDatasetResult { errors: [PinDatasetError!]! } -""" -Statuses that can occur when generating file metadata -""" -enum FileMetadataStatus{ - STATUS_PROCESSING, - STATUS_FAILED, - STATUS_FINISHED -} -""" -Metadata for dataset file -""" -type DatasetFileMetadata { - content: JSON - status: FileMetadataStatus! - datasetVersionFile: DatasetVersionFile! -} - - extend type Query { "Get a dataset by its ID." dataset(id: ID!): Dataset "Get a dataset by its slug." datasetVersion(id: ID!): DatasetVersion - "Get a dataset file sample by fileSnapshot id or by fileId" - datasetFileSample(id: ID, fileId: ID): DatasetFileMetadata "Get a dataset link by its id." datasetLink(id: ID!): DatasetLink "Get a dataset link by its slug." diff --git a/hexa/datasets/queue.py b/hexa/datasets/queue.py index 63b24ce82..a106dbecd 100644 --- a/hexa/datasets/queue.py +++ b/hexa/datasets/queue.py @@ -28,6 +28,8 @@ def download_file_as_dataframe(dataset_version_file: DatasetVersionFile) -> dict sample = pd.read_csv(download_url) elif file_format == "parquet": sample = pd.read_parquet(download_url) + elif file_format == "xlsx": + sample = pd.read_excel(download_url) else: raise ValueError(f"Unsupported file format: {file_format}") return {"success": True, "data": sample} @@ -88,7 +90,6 @@ def generate_dataset_file_sample_task( dataset_file_metadata.status = DatasetFileMetadata.STATUS_FAILED dataset_file_metadata.status_reason = str(source_file["errors"]) dataset_file_metadata.save() - print(str(source_file["errors"])) logger.info( f'Dataset file sample creation failed for file {dataset_version_file_id} with error {source_file["errors"]}' ) diff --git a/hexa/datasets/schema/queries.py b/hexa/datasets/schema/queries.py index efe99ba4e..9ca23fcb0 100644 --- a/hexa/datasets/schema/queries.py +++ b/hexa/datasets/schema/queries.py @@ -4,7 +4,6 @@ from ..models import ( Dataset, - DatasetFileMetadata, DatasetLink, DatasetVersion, ) @@ -42,21 +41,6 @@ def resolve_dataset_version(_, info, **kwargs): return None -@datasets_queries.field("datasetFileSample") -def resolve_dataset_file_sample(_, info, **kwargs): - try: - if kwargs.get("file_id"): - return DatasetFileMetadata.objects.get( - dataset_version_file=kwargs["file_id"] - ) - elif kwargs.get("id"): - return DatasetFileMetadata.objects.get(id=kwargs["id"]) - else: - return None - except DatasetFileMetadata.DoesNotExist: - return None - - @datasets_queries.field("datasetLink") def resolve_dataset_link(_, info, **kwargs): request = info.context["request"] From a9be661a0ee394ee39991cc710da294abbc864aa Mon Sep 17 00:00:00 2001 From: nazarfil Date: Tue, 16 Jul 2024 16:39:46 +0200 Subject: [PATCH 26/37] test(Dataset): add unittest for xlsx file --- hexa/datasets/tests/fixtures/example_names.xlsx | Bin 0 -> 6005 bytes hexa/datasets/tests/test_metadata.py | 5 +++++ requirements.in | 5 +++-- requirements.txt | 4 ++++ 4 files changed, 12 insertions(+), 2 deletions(-) create mode 100644 hexa/datasets/tests/fixtures/example_names.xlsx diff --git 
a/hexa/datasets/tests/fixtures/example_names.xlsx b/hexa/datasets/tests/fixtures/example_names.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..474111ec027fbf224dde212eedcd1832b852b565 GIT binary patch literal 6005 zcmai22RIze*2d~})r6?gf+(vKB{~tE=vG_3lc*70tQx)d8ZE3|gXlFxCwd7%L=gR7 z^4%L>{`=i~_IY+^o@eIWIcMg)^PW+b2Otw6prfNB*z-w40rYJ9fTxl1TPWH6%0fxAd=ckfMPA8yW^_s1u^mY#@| zw*8_fdxLf^H9sYd3B72yDhW_Lh^zMXX>s@LVQ~-m{U#0lyQTG_I_=J?NO?+!n+opl zcp4N`;=c^*(0TGqSa6XOdvmM`XO%iXTCfOXNbbe6q2ev6?~l=wJwMcCFf4Di0g?eL z2bT(y0X00B$6{&Zr~EVlB*v#G$7YKqMQCLz!pi4}U9bGOph&^I@yvDGOw+RB7eW~P zmt=?YJ*q8zPHS16Na?UrSU zq*@Ix5l@7to*$qfmVU9U&R@_etruzx$9VQG+y>CQ5J^*oSM)L>U>zq_KP6g|70leP zRDS6Fg1NE9Dn%o6U4?@sC3{|d@PPT}S$VmVCC7Gf)AZ_D4U=Y@?lfA)L@VpC?eXYd zpHpHuib5;qiw*~^03-0%KEb_>oh+NKR-4(fq2PV%WAdZmvH`wHu|tthDn!B04S3o- zJF-1BSt1k$`>DNCytE@&UbJ3?8KDoWdoH9fRu`rn*4oNHeg;-@R}tZ2qg7UO_0AF@!#3eW;+KkM>f)rLNWhUvs09j?nQ)Mk!X%?Us1?Q``aPAtFT2ac-H#x~K#Naz zLDUEFp^kZj9_@>H!TC6<7af^LpztI63gIdzl9&Rn)sOyDo=TD`n z3wy-|=XSmP^lqBr-r5<|32*>^vWrSMfGl#aSh%+Ub&n~X-)g3GD>=}VlR(T#IR}i3 z&Uw8aI64}9J&#AC{82BQM8w!=f!wQ^Vy(t-7<_axr;$u>g;G?%EMBUsQtGBA_HZiv zKP`I1I)q%Wmm(QO&b>J~PyBr%EQy~uxmV2JyK^KgiH_i=N#AF0#;Ie$fQEA#cegdE zS1T(CK3sG8+>C+nU@)G%YEv`XZ4kb4i(au%Z+a0ih8Rp_RsoBT<)0XPAjp0j4t_O%SgUFp)tqa&R|8y&-=&e7lwR?KC3kX`RR;r;#jh(!zjg z?lF-JuHX-rgGtl`{D~l1G>X%>6Knt8LRl^H2s)PsiO}=$wxEEuC0U7JXi*QC>Zz=b z`>q`oDuvpcR{(Q~QA`>_idrgwd;pf5b%37BK!iN9uB@J{8?Liyj!BKaA#Dg2MuM=6 zprF~dR2%b>I8t9?Kv>e?owafTUOGK#V^vP+`PlD!$TqV4j37Buu6CvcSKd`Ds?R&i zb`9rxJi{_Sf}ri_^eP5S!b~ z#&+EgAC|{#=K!KE61oL*c*aZwzUJPM4zSCkG?dN9v!SHh|2Ufxp1s(M(tJh)xVaL)rT+TFiM4OXH>W7p=4+*bhP+GXH0E2tT4CnTL=5@kfF>tYf*h1KE zzHcafP;=I9jv9F3cL`W-ZkudoYE-FA(V)`N*s9s$s@TtKn?mBXyllDaY3n7QZAJ?L zBdu@+b8)?o>Mw33MZpUQ!;WN8WtiS5Ls!GFL-a>rdP@swmus6$o3Jxn};ir7~Ajfpl_>{4voOh#*$ zXk<1wb)yi!SwJr|fDoD&%@$n+1ahP@g$D-!nHJaSzQ;$8O2h{Fe3!_K!+W>Xtbd6w z)H17zOUa=xA%8_ZqnC~Pcnw%P*-EAfX!qQp7b#f2x9s6X=HvfZ3ZysdOmg|TxB|71 z47c9CTzK;Ek(>Vlg;jn1w2r=aZKo%oXR1T}*A=WoxI#)Uoe3ZkcRW=S#ZzoeCpQs8&J*oJb{82L$z@@3Rd^sl)U zrKyk}bb5Xw!`VaaYvB4;IF!20l8kW2`fkzoQyiH1J_d<|k7WrZqi+e}g(#nVTi~40 z8+DG0BB-c5{gO|9(#*CQ=>zNP>7lvPy5;zZ?tb#EJ4urG7^?w!X>aX`dp9Tf{QBj* z)wA*BexmPgzPWnOta3TWMtC=0oOc>q&CX_*H?(>Ctq7EsUFgCmOUiFM!q{M)STFLP zfHb*|r6@CZe0!tYtBA6*O0~VwR14~ZUfr)3$xT^(6q;+1pP%;!9(pCXc}j5PvUcUw zuZ-qNq7nT{ zG-nGVCy1$`mOlvq^k1l*WgZUbrW`rpEL9W!!tW&K(WQhi+yszFv01c$!(yGCOggu$Zyz?y_i|Ml)rvS54kmMVo=LWel_t zXr1X2jv%i|Jes3X)jOzx(25Jy_$*Z$rc~K}wu=M0&#F&FU#bO7aE*(BWUTuR#>OJ^ z=y#w7^l71%vj=!m<<6%{-7B?y(;LQ9cpg)c5;E~;5b~q`j=fjaBPV(%(wQ!SdG!pQ zdb&CkvX3gzbaFMO7NIqQp3FAysHv(~5>`$pA6xUH>J0N^Y+LC$^zQO$<6sZQEG({| z7^g!$Y5X%3!!Fa}*l53Nz2PIE<+q9jo(=f9YvKbB3qKe?H=U;5l9{>J8&rD z7*|i5_v~-3NJf+x;r8UY^(UfH2=T7+S@b-??`Wfp&T;0>6+^0W{!BB);v_isY(nsk zs=T{+Xlaj)aysdwfNxxZ&H#mCZX$A8X8XH9udyk^tst)#~T-L)V z=_i)1chZPmNW9f&hZJG+^W)N1L`5P~11U?*q-G8$96AyPd3a=cQne|51YeL{W+<3X zO%~?7vgjmt1?}GVLKQhjf=|^*mrYll*&%d2i{esecY{5<{5JE}i~#;Fn<|T}krt-_ zEdPWiEXx=k8%?5&q_vrk0}i5g_Vsnk6*_L{nak2Xkjz+;@5x^x=0R-F7(5jYqy^*8 z5>_1is{oM=%hL(qvlq&on72bYTMMwG(-Zz)Yq{u}CxVigTUhVKbqS+LA-cb?Nt zIkGq}=1k7fSqzyVmcCdd?BTTbHZMzTS6&2gA23p<@vd{aB5x6dhK)9&;#8I#kNN1} z-b=AuuX`nV@8#x7%*NPs{E2@b>rY%6%kj^n#80w939;h3gbMagirBYP0AbN91ey=iM`boovllJ*j*JIGUN;8*l!)kF%?wd~QLt(kW z6p6iR&uNLA6`K{FFrD0q8X&FK6K{8suWS5`YOFaPnhmp8Z+71f$I}W~uS(s!%4VGA z?jgR2-%ZcT6}BR*?r#ajr7`DAPmXCW5LLLiVaH0r%BFF+1Sx~dv455!c)!lWwG=b9 zx3|72(4#Bt{5gO@8JT-NO^iVgN(qR1R5kikpIdD7fn?q-!`|#De?JmNr_R$>X7sAr zYgAfa*wOu@0vKYk=8S)6LgM?TMtME9qI|fvGe}AVT@Bq9$Mh>^`p!YYeB|?uoa&9& zr{Tj|#NugAyAYHI=c5ybo4HlEQkkP^b>$SV+cgmodBC0fSTb4Ijrgj3!AAY6a&Fb% 
z{S4c2mLG>cK59{|rm~z8SyYxYZ)WXK)ghNPr530n@W>iW1NI3>KbSjFWk=VJ=~S^K zE^D3)u;hH<$Ut#7R4!**&eD4(lajr**CYM7!^H07XsAUY{YvJEaXQ^AW$XQ)BnQ<2 zsj~cdTo-R>?w=6V;&mVB45PeM`^9+=MNqX9>!I4kIB6lPA3(s&mIf#P=zLasW`uSP6j z7i1p$zD%xdEQL@4WDi1{4PBJI;Dj?qU{Y0W10E%DSfY{cf~`Tt`2wYC@kw8h`|PjGP6d4=5c{#ar@4WYB*Lx%*laweo2D6G2Ra&YL?6% zCqoP3FKuLqlSfU$2v{mQtc;Zp3_B5?S2MBm_a@qji`Km-F!uKmk#=1bDx@?g<#e(b z%cm9XPn5Q=i?NDDJz*O@o@>3Q{4%lRBa&ARmf_CUH>#8dJ7u@~&oe%DQ4xbf=;ZA= zFtr#i=|n%j?s}N@zBu}okXAF?H8k{G`z-28xFZk#XRV+sVQ=RGv2y{dyE{Of^{*4P zTCHCZek#t9E@{3K^VkVc2P7Z8UqP3R#nbw#U^33`K0mntlhdJlOYQr~psA+&5!ZHg z!9g0V6O}M&kp;xmo&C7nkJno}B(n2hQ;{gP&H7!N3H_0*u}EVw+Yx@zqN!4xg;T__ zYP8h|VjNAjc`k0R*g8a~58VnyK9+A~zf`@={D*t2E-olV#2CjqB{bN(HLnc$gGIRC zKnq<=$g*X(p*>8$Uv$;BNd&|tc$bNAW*3`>Vf3&FA?Fiw)AvEToP97sR#~_>#itp^ zCvqSf3o9TDw#J*Q!MDe=K)rUy;VHB0GjD&oMh+&txby`yo+#zkjSu1ZH0+|dsbj&4 z?;t}og8<3yUi$4}&(MxQ*B*g5u2)#((OW*=O%}xh@wj1EJA*HS-U)8?@0t&{+(g~> zb(Jy?JmRD`)QE^ggz)yLH`MOTQOf3pude4?FR7Aa`aDt+alvySpT*#xa@%Y_5Wn{p9r_F zrE4+r+mhkkYlMGGlRu~4X430>>$kmycmE3NzizQVC*LNOYqI@qocK4B|ABh{#JQcZ t*A(>IObPx*NPkYeedzukdnI`HKcu&^JPLf55fCuppJX@{cHRX1{{W1%6YBr~ literal 0 HcmV?d00001 diff --git a/hexa/datasets/tests/test_metadata.py b/hexa/datasets/tests/test_metadata.py index c1da6dba2..34d1d48e3 100644 --- a/hexa/datasets/tests/test_metadata.py +++ b/hexa/datasets/tests/test_metadata.py @@ -40,6 +40,11 @@ def test_create_dataset_file_metadata_task_success( DatasetFileMetadata.STATUS_FINISHED, '[{"name":"Jack","surname":"Howard"},{"name":"Olivia","surname":"Brown"},{"name":"Lily","surname":"Evan', ), + ( + "example_names.xlsx", + DatasetFileMetadata.STATUS_FINISHED, + '[{"name":"Jack","surname":"Howard"},{"name":"Olivia","surname":"Brown"},{"name":"Lily","surname":"Evan', + ), ] for filename, expected_status, expected_content in test_cases: with self.subTest(filename=filename): diff --git a/requirements.in b/requirements.in index d7445ca55..fb446fd16 100644 --- a/requirements.in +++ b/requirements.in @@ -61,5 +61,6 @@ requests==2.32.3 mixpanel ua-parser -# -pyarrow \ No newline at end of file +# dataset explorer +pyarrow +openpyxl \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 73f891c88..6b83a3fd2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -104,6 +104,8 @@ django-storages[google]==1.14.4 # via -r requirements.in docker==7.1.0 # via -r requirements.in +et-xmlfile==1.1.0 + # via openpyxl geopandas==1.0.1 # via -r requirements.in google-api-core[grpc]==2.19.1 @@ -207,6 +209,8 @@ oauthlib==3.2.2 # via # kubernetes # requests-oauthlib +openpyxl==3.1.5 + # via -r requirements.in opentelemetry-api==1.26.0 # via google-cloud-logging packaging==24.1 From aa1b24006edc75f685a503943e871a02899f0476 Mon Sep 17 00:00:00 2001 From: nazarfil Date: Tue, 16 Jul 2024 16:56:32 +0200 Subject: [PATCH 27/37] chore: rename worker to dataset-worker --- hexa/datasets/management/commands/worker.py | 7 ------- 1 file changed, 7 deletions(-) delete mode 100644 hexa/datasets/management/commands/worker.py diff --git a/hexa/datasets/management/commands/worker.py b/hexa/datasets/management/commands/worker.py deleted file mode 100644 index 0c6578270..000000000 --- a/hexa/datasets/management/commands/worker.py +++ /dev/null @@ -1,7 +0,0 @@ -from dpq.commands import Worker - -from hexa.datasets.queue import dataset_snapshot_queue - - -class Command(Worker): - queue = dataset_snapshot_queue From 9aaef85d93499ea41f4625f62fa877fc9eb0e7d3 Mon Sep 17 00:00:00 2001 From: nazarfil Date: Wed, 17 Jul 2024 09:30:15 +0200 Subject: [PATCH 28/37] chore: 
ignore local files --- .gitignore | 3 +++ hexa/datasets/tests/test_metadata.py | 8 ++++---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index 938b66bbb..381ec6f6e 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,9 @@ venv k8s/* !k8s/sample_app.yaml +#for mac and idea +*.DS_Store +.idea/ # TODO: remove credentials .terraform* diff --git a/hexa/datasets/tests/test_metadata.py b/hexa/datasets/tests/test_metadata.py index 34d1d48e3..51a2618a9 100644 --- a/hexa/datasets/tests/test_metadata.py +++ b/hexa/datasets/tests/test_metadata.py @@ -5,7 +5,7 @@ from hexa.core.test import TestCase from hexa.datasets.models import DatasetFileMetadata -from hexa.datasets.queue import generate_dataset_file_metadata_task +from hexa.datasets.queue import generate_dataset_file_sample_task from hexa.files.api import get_storage @@ -64,7 +64,7 @@ def test_create_dataset_file_metadata_task_success( job = mock.Mock() job.args = {"file_id": dataset_version_file.id} - generate_dataset_file_metadata_task(mock.Mock(), job) + generate_dataset_file_sample_task(mock.Mock(), job) mock_generate_download_url.assert_called_once_with(dataset_version_file) mock_DatasetVersionFile_get.assert_called_once_with( @@ -114,7 +114,7 @@ def test_create_dataset_file_metadata_task_failure( job = mock.Mock() job.args = {"file_id": dataset_version_file.id} - generate_dataset_file_metadata_task(mock.Mock(), job) + generate_dataset_file_sample_task(mock.Mock(), job) mock_DatasetVersionFile_get.assert_called_with( id=dataset_version_file.id @@ -152,7 +152,7 @@ def test_create_dataset_file_metadata_task_failure_empty_file( job = mock.Mock() job.args = {"file_id": dataset_version_file.id} - generate_dataset_file_metadata_task(mock.Mock(), job) + generate_dataset_file_sample_task(mock.Mock(), job) mock_generate_download_url.assert_called_once_with(dataset_version_file) mock_DatasetVersionFile_get.assert_called_once_with(id=dataset_version_file.id) From fe817e82edd87cbc2091a191e366047f50109ca1 Mon Sep 17 00:00:00 2001 From: nazarfil Date: Wed, 17 Jul 2024 09:52:46 +0200 Subject: [PATCH 29/37] chore: document how to run dataset worker locally --- hexa/datasets/graphql/schema.graphql | 1 - hexa/datasets/queue.py | 19 +++++++------------ 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/hexa/datasets/graphql/schema.graphql b/hexa/datasets/graphql/schema.graphql index 2697ba6ab..92b73040f 100644 --- a/hexa/datasets/graphql/schema.graphql +++ b/hexa/datasets/graphql/schema.graphql @@ -107,7 +107,6 @@ type DatasetFileMetadata { content: JSON! status: FileMetadataStatus! statusReason: String - datasetVersionFile: DatasetVersionFile! 
} """
diff --git a/hexa/datasets/queue.py b/hexa/datasets/queue.py index a106dbecd..d00af3483 100644 --- a/hexa/datasets/queue.py +++ b/hexa/datasets/queue.py @@ -23,16 +23,16 @@ def download_file_as_dataframe(dataset_version_file: DatasetVersionFile) -> dict file_format = filename.split(".")[-1] try: download_url = generate_download_url(dataset_version_file) - sample = None + file_content = None if file_format == "csv": - sample = pd.read_csv(download_url) + file_content = pd.read_csv(download_url) elif file_format == "parquet": - sample = pd.read_parquet(download_url) + file_content = pd.read_parquet(download_url) elif file_format == "xlsx": - sample = pd.read_excel(download_url) + file_content = pd.read_excel(download_url) else: raise ValueError(f"Unsupported file format: {file_format}") - return {"success": True, "data": sample} + return {"success": True, "data": file_content} except pd.errors.ParserError as e: logger.error(f"Error parsing the file {filename} content: {e}") @@ -95,13 +95,8 @@ def generate_dataset_file_sample_task( ) except Exception as e: dataset_file_metadata.status = DatasetFileMetadata.STATUS_FAILED - dataset_file_metadata.status_reason = str(e) - try: - dataset_file_metadata.save() - except (IntegrityError, DatabaseError, ValidationError) as save_error: - logger.error( - f"Error saving DatasetFileMetadata after failure: {save_error}" - ) + dataset_file_metadata.status_reason = str([f"UNKNOWN_ERROR : {e}"]) + dataset_file_metadata.save() logger.exception( f"Dataset file sample creation failed for file {dataset_version_file_id}: {e}" )
From 95f6b49d5877f9ed5b6234b9ae5a707e4d2be704 Mon Sep 17 00:00:00 2001 From: nazarfil Date: Thu, 18 Jul 2024 14:33:49 +0200 Subject: [PATCH 30/37] fix(Dataset): fixes graphql exposed metadata and exceptions --- hexa/datasets/graphql/schema.graphql | 11 +-- .../migrations/0006_datasetfilemetadata.py | 10 +-- hexa/datasets/models.py | 6 +- hexa/datasets/queue.py | 74 +++++++------------ hexa/datasets/schema/queries.py | 12 +++ hexa/datasets/schema/types.py | 11 +++ hexa/datasets/tests/test_metadata.py | 3 +- hexa/datasets/tests/test_schema.py | 46 +++++++++++- 8 files changed, 110 insertions(+), 63 deletions(-)
diff --git a/hexa/datasets/graphql/schema.graphql b/hexa/datasets/graphql/schema.graphql index 92b73040f..a93b4a109 100644 --- a/hexa/datasets/graphql/schema.graphql +++ b/hexa/datasets/graphql/schema.graphql @@ -95,18 +95,17 @@ type DatasetVersionPermissions { """ Statuses that can occur when generating file metadata """ enum FileMetadataStatus{ - STATUS_PROCESSING, - STATUS_FAILED, - STATUS_FINISHED + PROCESSING, + FAILED, + FINISHED } """ Metadata for dataset file """ type DatasetFileMetadata { - content: JSON! + sample: JSON! status: FileMetadataStatus! - statusReason: String } """ @@ -448,6 +447,8 @@ extend type Query { dataset(id: ID!): Dataset "Get a dataset by its slug." datasetVersion(id: ID!): DatasetVersion + "Get a dataset file bi its id " + datasetVersionFile(id: ID!): DatasetVersionFile "Get a dataset link by its id." datasetLink(id: ID!): DatasetLink "Get a dataset link by its slug."
diff --git a/hexa/datasets/migrations/0006_datasetfilemetadata.py b/hexa/datasets/migrations/0006_datasetfilemetadata.py index 97784a1cc..7ecc59078 100644 --- a/hexa/datasets/migrations/0006_datasetfilemetadata.py +++ b/hexa/datasets/migrations/0006_datasetfilemetadata.py @@ -1,4 +1,4 @@ -# Generated by Django 5.0.7 on 2024-07-16 08:34 +# Generated by Django 5.0.7 on 2024-07-18 15:11 import uuid @@ -31,11 +31,11 @@ class Migration(migrations.Migration): "status", models.CharField( choices=[ - ("processing", "Processing"), - ("failed", "Failed"), - ("finished", "Finished"), + ("PROCESSING", "Processing"), + ("FAILED", "Failed"), + ("FINISHED", "Finished"), ], - default="processing", + default="PROCESSING", max_length=10, ), ), diff --git a/hexa/datasets/models.py b/hexa/datasets/models.py index 4d9ab886d..43c2d53fa 100644 --- a/hexa/datasets/models.py +++ b/hexa/datasets/models.py @@ -257,9 +257,9 @@ class Meta: class DatasetFileMetadata(Base): - STATUS_PROCESSING = "processing" - STATUS_FAILED = "failed" - STATUS_FINISHED = "finished" + STATUS_PROCESSING = "PROCESSING" + STATUS_FAILED = "FAILED" + STATUS_FINISHED = "FINISHED" STATUS_CHOICES = [ (STATUS_PROCESSING, "Processing"), diff --git a/hexa/datasets/queue.py b/hexa/datasets/queue.py index d00af3483..01ef0a526 100644 --- a/hexa/datasets/queue.py +++ b/hexa/datasets/queue.py @@ -13,36 +13,24 @@ DatasetFileMetadataJob, DatasetVersionFile, ) -from hexa.files.api import get_storage logger = getLogger(__name__) -def download_file_as_dataframe(dataset_version_file: DatasetVersionFile) -> dict: +def download_file_as_dataframe( + dataset_version_file: DatasetVersionFile, +) -> pd.DataFrame: filename = dataset_version_file.filename file_format = filename.split(".")[-1] - try: - download_url = generate_download_url(dataset_version_file) - file_content = None - if file_format == "csv": - file_content = pd.read_csv(download_url) - elif file_format == "parquet": - file_content = pd.read_parquet(download_url) - elif file_format == "xlsx": - file_content = pd.read_excel(download_url) - else: - raise ValueError(f"Unsupported file format: {file_format}") - return {"success": True, "data": file_content} - - except pd.errors.ParserError as e: - logger.error(f"Error parsing the file {filename} content: {e}") - return {"success": False, "errors": [f"FILE_PARSING_ERROR: {str(e)}"]} - except ValueError as e: - logger.error(f"Cannot read file {filename}: {e}") - return {"success": False, "errors": [f"FILE_NOT_SUPPORTED : {str(e)}"]} - except get_storage().exceptions.NotFound: - logger.error(f"Cannot find file {filename}") - return {"success": False, "errors": ["FILE_NOT_FOUND"]} + download_url = generate_download_url(dataset_version_file) + if file_format == "csv": + return pd.read_csv(download_url) + elif file_format == "parquet": + return pd.read_parquet(download_url) + elif file_format == "xlsx": + return pd.read_excel(download_url) + else: + raise ValueError(f"Unsupported file format: {file_format}") def generate_dataset_file_sample_task( @@ -70,32 +58,24 @@ def generate_dataset_file_sample_task( return try: - source_file = download_file_as_dataframe(dataset_version_file) - if source_file["success"]: - file_content = source_file["data"] - if not file_content.empty: - file_sample = file_content.sample( - settings.WORKSPACE_DATASETS_FILE_SNAPSHOT_SIZE, - random_state=22, - replace=True, - ) - dataset_file_metadata.sample = file_sample.to_json(orient="records") - else: - dataset_file_metadata.sample = json.dumps([]) - logger.info(f"Dataset sample saved for 
file {dataset_version_file_id}") - dataset_file_metadata.status = DatasetFileMetadata.STATUS_FINISHED - dataset_file_metadata.save() - logger.info(f"Dataset sample created for file {dataset_version_file_id}") - else: - dataset_file_metadata.status = DatasetFileMetadata.STATUS_FAILED - dataset_file_metadata.status_reason = str(source_file["errors"]) - dataset_file_metadata.save() - logger.info( - f'Dataset file sample creation failed for file {dataset_version_file_id} with error {source_file["errors"]}' + file_content = download_file_as_dataframe(dataset_version_file) + if not file_content.empty: + random_seed = 22 + file_sample = file_content.sample( + settings.WORKSPACE_DATASETS_FILE_SNAPSHOT_SIZE, + random_state=random_seed, + replace=True, ) + dataset_file_metadata.sample = file_sample.to_json(orient="records") + else: + dataset_file_metadata.sample = json.dumps([]) + logger.info(f"Dataset sample saved for file {dataset_version_file_id}") + dataset_file_metadata.status = DatasetFileMetadata.STATUS_FINISHED + dataset_file_metadata.save() + logger.info(f"Dataset sample created for file {dataset_version_file_id}") except Exception as e: dataset_file_metadata.status = DatasetFileMetadata.STATUS_FAILED - dataset_file_metadata.status_reason = str([f"UNKNOWN_ERROR : {e}"]) + dataset_file_metadata.status_reason = str(e) dataset_file_metadata.save() logger.exception( f"Dataset file sample creation failed for file {dataset_version_file_id}: {e}" diff --git a/hexa/datasets/schema/queries.py b/hexa/datasets/schema/queries.py index 9ca23fcb0..f6d427b59 100644 --- a/hexa/datasets/schema/queries.py +++ b/hexa/datasets/schema/queries.py @@ -6,6 +6,7 @@ Dataset, DatasetLink, DatasetVersion, + DatasetVersionFile, ) datasets_queries = QueryType() @@ -41,6 +42,17 @@ def resolve_dataset_version(_, info, **kwargs): return None +@datasets_queries.field("datasetVersionFile") +def resolve_dataset_version_file(_, info, **kwargs): + request = info.context["request"] + try: + return DatasetVersionFile.objects.filter_for_user(request.user).get( + id=kwargs["id"] + ) + except DatasetVersionFile.DoesNotExist: + return None + + @datasets_queries.field("datasetLink") def resolve_dataset_link(_, info, **kwargs): request = info.context["request"] diff --git a/hexa/datasets/schema/types.py b/hexa/datasets/schema/types.py index d367497e8..e8a773fd5 100644 --- a/hexa/datasets/schema/types.py +++ b/hexa/datasets/schema/types.py @@ -8,6 +8,7 @@ from hexa.datasets.api import generate_upload_url from hexa.datasets.models import ( Dataset, + DatasetFileMetadata, DatasetLink, DatasetVersion, DatasetVersionFile, @@ -217,6 +218,16 @@ def resolve_upload_url(obj, info, **kwargs): return None +@dataset_version_file_object.field("fileMetadata") +def resolve_version_file_metadata(obj: DatasetVersionFile, info, **kwargs): + try: + file_metadata = DatasetFileMetadata.objects.get(dataset_version_file=obj) + return file_metadata + except DatasetFileMetadata.DoesNotExist: + logging.error(f"No metadata found for file {obj.filename} with id {obj.id}") + return None + + bindables = [ dataset_object, dataset_permissions, diff --git a/hexa/datasets/tests/test_metadata.py b/hexa/datasets/tests/test_metadata.py index 51a2618a9..9a27ea2f4 100644 --- a/hexa/datasets/tests/test_metadata.py +++ b/hexa/datasets/tests/test_metadata.py @@ -165,6 +165,5 @@ def test_create_dataset_file_metadata_task_failure_empty_file( dataset_file_metadata.status, DatasetFileMetadata.STATUS_FAILED ) self.assertEqual( - dataset_file_metadata.status_reason, - 
"['FILE_NOT_SUPPORTED : No columns to parse from file']", + dataset_file_metadata.status_reason, "No columns to parse from file" ) diff --git a/hexa/datasets/tests/test_schema.py b/hexa/datasets/tests/test_schema.py index faf7b6f38..c03ef401d 100644 --- a/hexa/datasets/tests/test_schema.py +++ b/hexa/datasets/tests/test_schema.py @@ -1,3 +1,5 @@ +import json + from django.conf import settings from django.db import IntegrityError @@ -7,7 +9,7 @@ from hexa.user_management.models import User from hexa.workspaces.models import WorkspaceMembershipRole -from ..models import Dataset, DatasetVersionFile +from ..models import Dataset, DatasetFileMetadata, DatasetVersionFile from .testutils import DatasetTestMixin @@ -461,6 +463,48 @@ def test_get_file_by_name(self): {"datasetVersion": {"fileByName": {"filename": file.filename}}}, r["data"] ) + def test_get_file_metadata(self): + self.test_create_dataset_version() + superuser = User.objects.get(email="superuser@blsq.com") + dataset = Dataset.objects.get(name="Dataset") + self.client.force_login(superuser) + file = DatasetVersionFile.objects.create( + dataset_version=dataset.latest_version, + uri=dataset.latest_version.get_full_uri("file.csv"), + created_by=superuser, + ) + metadata = DatasetFileMetadata.objects.create( + dataset_version_file=file, + sample=json.dumps({"key": "value"}), + status=DatasetFileMetadata.STATUS_PROCESSING, + ) + r = self.run_query( + """ + query GetDatasetVersionFile($id: ID!) { + datasetVersionFile(id: $id) { + filename + fileMetadata { + status + sample + } + } + } + """, + {"id": str(file.id)}, + ) + self.assertEqual( + { + "datasetVersionFile": { + "filename": file.filename, + "fileMetadata": { + "status": metadata.status, + "sample": metadata.sample, + }, + } + }, + r["data"], + ) + @mock_gcp_storage def test_prepare_version_file_download(self): serena = self.create_user("sereba@blsq.org", is_superuser=True) From 57e05c3465df954bc8366713f7c9c7fa03ee2eb1 Mon Sep 17 00:00:00 2001 From: nazarfil Date: Tue, 30 Jul 2024 10:07:28 +0200 Subject: [PATCH 31/37] fix(Dataset): fix typo, add internationalisation --- hexa/datasets/graphql/schema.graphql | 2 +- hexa/datasets/models.py | 7 ++++--- hexa/datasets/queue.py | 19 ++++++++++++------- 3 files changed, 17 insertions(+), 11 deletions(-) diff --git a/hexa/datasets/graphql/schema.graphql b/hexa/datasets/graphql/schema.graphql index a93b4a109..452e0d966 100644 --- a/hexa/datasets/graphql/schema.graphql +++ b/hexa/datasets/graphql/schema.graphql @@ -447,7 +447,7 @@ extend type Query { dataset(id: ID!): Dataset "Get a dataset by its slug." datasetVersion(id: ID!): DatasetVersion - "Get a dataset file bi its id " + "Get a dataset file by its id " datasetVersionFile(id: ID!): DatasetVersionFile "Get a dataset link by its id." 
datasetLink(id: ID!): DatasetLink
diff --git a/hexa/datasets/models.py index 43c2d53fa..cc49a38f5 100644 --- a/hexa/datasets/models.py +++ b/hexa/datasets/models.py @@ -4,6 +4,7 @@ from django.core.exceptions import PermissionDenied from django.db import models from django.db.models import JSONField +from django.utils.translation import gettext_lazy as _ from dpq.models import BaseJob from slugify import slugify @@ -262,9 +263,9 @@ class DatasetFileMetadata(Base): STATUS_FINISHED = "FINISHED" STATUS_CHOICES = [ - (STATUS_PROCESSING, "Processing"), - (STATUS_FAILED, "Failed"), - (STATUS_FINISHED, "Finished"), + (STATUS_PROCESSING, _("Processing")), + (STATUS_FAILED, _("Failed")), + (STATUS_FINISHED, _("Finished")), ] sample = JSONField(blank=True, default=list, null=True)
diff --git a/hexa/datasets/queue.py b/hexa/datasets/queue.py index 01ef0a526..3e40d0cec 100644 --- a/hexa/datasets/queue.py +++ b/hexa/datasets/queue.py @@ -7,6 +7,7 @@ from django.db import DatabaseError, IntegrityError from dpq.queue import AtLeastOnceQueue +from hexa.core import mimetypes from hexa.datasets.api import generate_download_url from hexa.datasets.models import ( DatasetFileMetadata, @@ -20,17 +21,21 @@ def download_file_as_dataframe( dataset_version_file: DatasetVersionFile, ) -> pd.DataFrame: - filename = dataset_version_file.filename - file_format = filename.split(".")[-1] + mime_type, encoding = mimetypes.guess_type( + dataset_version_file.filename, strict=False + ) download_url = generate_download_url(dataset_version_file) - if file_format == "csv": + if mime_type == "text/csv": return pd.read_csv(download_url) - elif file_format == "parquet": - return pd.read_parquet(download_url) - elif file_format == "xlsx": + elif mime_type == "application/vnd.ms-excel": return pd.read_excel(download_url) + elif ( + mime_type == "application/vnd.apache.parquet" + or dataset_version_file.filename.split(".")[-1] == "parquet" + ): + return pd.read_parquet(download_url) else: - raise ValueError(f"Unsupported file format: {file_format}") + raise ValueError(f"Unsupported file format: {dataset_version_file.filename}")
From a7abfdceb6457a33006415383534fa587cca71c0 Mon Sep 17 00:00:00 2001 From: nazarfil Date: Tue, 30 Jul 2024 10:22:46 +0200 Subject: [PATCH 32/37] fix(Dataset): adds property to fetch latest metadata for a file --- hexa/datasets/models.py | 4 ++++ hexa/datasets/queue.py | 5 ++++- hexa/datasets/schema/types.py | 3 +-- 3 files changed, 9 insertions(+), 3 deletions(-)
diff --git a/hexa/datasets/models.py b/hexa/datasets/models.py index cc49a38f5..a6c973d65 100644 --- a/hexa/datasets/models.py +++ b/hexa/datasets/models.py @@ -253,6 +253,10 @@ class DatasetVersionFile(Base): def filename(self): return self.uri.split("/")[-1] + @property + def latest_metadata(self): + return self.file_metadata.order_by("-created_at").first() + class Meta: ordering = ["uri"]
diff --git a/hexa/datasets/queue.py b/hexa/datasets/queue.py index 3e40d0cec..ded50a3f6 100644 --- a/hexa/datasets/queue.py +++ b/hexa/datasets/queue.py @@ -27,7 +27,10 @@ def download_file_as_dataframe( download_url = generate_download_url(dataset_version_file) if mime_type == "text/csv": return pd.read_csv(download_url) - elif mime_type == "application/vnd.ms-excel": + elif ( + mime_type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" + or mime_type == "application/vnd.ms-excel" + ): return pd.read_excel(download_url) elif ( mime_type == "application/vnd.apache.parquet"
diff --git a/hexa/datasets/schema/types.py b/hexa/datasets/schema/types.py index e8a773fd5..e65ca0d74 100644 --- a/hexa/datasets/schema/types.py +++ b/hexa/datasets/schema/types.py @@ -221,8 +221,7 @@ def resolve_upload_url(obj, info, **kwargs): @dataset_version_file_object.field("fileMetadata") def resolve_version_file_metadata(obj: DatasetVersionFile, info, **kwargs): try: - file_metadata = DatasetFileMetadata.objects.get(dataset_version_file=obj) - return file_metadata + return obj.latest_metadata except DatasetFileMetadata.DoesNotExist: logging.error(f"No metadata found for file {obj.filename} with id {obj.id}") return None From 6f7b71ccd0c42626f967e894c3566dfcad3ecc1b Mon Sep 17 00:00:00 2001 From: nazarfil Date: Tue, 6 Aug 2024 17:45:07 +0200 Subject: [PATCH 33/37] fix(Dataset): fixes typo and dict argument --- hexa/datasets/migrations/0006_datasetfilemetadata.py | 4 ++-- hexa/datasets/models.py | 4 ++-- hexa/datasets/queue.py | 8 +++----- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/hexa/datasets/migrations/0006_datasetfilemetadata.py b/hexa/datasets/migrations/0006_datasetfilemetadata.py index 7ecc59078..c6c319d2f 100644 --- a/hexa/datasets/migrations/0006_datasetfilemetadata.py +++ b/hexa/datasets/migrations/0006_datasetfilemetadata.py @@ -1,4 +1,4 @@ -# Generated by Django 5.0.7 on 2024-07-18 15:11 +# Generated by Django 5.0.7 on 2024-08-07 09:02 import uuid @@ -44,7 +44,7 @@ class Migration(migrations.Migration): "dataset_version_file", models.ForeignKey( on_delete=django.db.models.deletion.CASCADE, - related_name="file_metadata", + related_name="metadata_entries", to="datasets.datasetversionfile", ), ), diff --git a/hexa/datasets/models.py b/hexa/datasets/models.py index a6c973d65..a7cef3ddb 100644 --- a/hexa/datasets/models.py +++ b/hexa/datasets/models.py @@ -255,7 +255,7 @@ def filename(self): @property def latest_metadata(self): - return self.file_metadata.order_by("-created_at").first() + return self.metadata_entries.order_by("-created_at").first() class Meta: ordering = ["uri"] @@ -284,7 +284,7 @@ class DatasetFileMetadata(Base): null=False, blank=False, on_delete=models.CASCADE, - related_name="file_metadata", + related_name="metadata_entries", ) diff --git a/hexa/datasets/queue.py b/hexa/datasets/queue.py index ded50a3f6..c4af36c72 100644 --- a/hexa/datasets/queue.py +++ b/hexa/datasets/queue.py @@ -104,10 +104,8 @@ class DatasetsFileMetadataQueue(AtLeastOnceQueue): def load_file_metadata(file_id: str): dataset_file_metadata_queue.enqueue( + "generate_file_metadata", { - "generate_file_metadata", - { - "file_id": str(file_id), - }, - } + "file_id": str(file_id), + }, ) From be80552080d6cdd889288823bdb43b9a0771d482 Mon Sep 17 00:00:00 2001 From: nazarfil Date: Wed, 7 Aug 2024 13:03:47 +0200 Subject: [PATCH 34/37] fix: renames sample in metadata --- hexa/datasets/queue.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hexa/datasets/queue.py b/hexa/datasets/queue.py index c4af36c72..12b1fda99 100644 --- a/hexa/datasets/queue.py +++ b/hexa/datasets/queue.py @@ -96,7 +96,7 @@ class DatasetsFileMetadataQueue(AtLeastOnceQueue): dataset_file_metadata_queue = DatasetsFileMetadataQueue( tasks={ - "generate_file_sample": generate_dataset_file_sample_task, + "generate_file_metadata": generate_dataset_file_sample_task, }, notify_channel="dataset_file_metadata_queue", ) From a347e391858f6bd51a6311622438db8c1a310646 Mon Sep 17 00:00:00 2001 From: nazarfil Date: Wed, 7 Aug 2024 13:08:56 +0200 Subject: [PATCH 35/37] chore: move error to warning --- 
hexa/datasets/queue.py | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/hexa/datasets/queue.py b/hexa/datasets/queue.py index 12b1fda99..cefd56e73 100644 --- a/hexa/datasets/queue.py +++ b/hexa/datasets/queue.py @@ -20,11 +20,14 @@ def download_file_as_dataframe( dataset_version_file: DatasetVersionFile, -) -> pd.DataFrame: +) -> pd.DataFrame | None: mime_type, encoding = mimetypes.guess_type( dataset_version_file.filename, strict=False ) download_url = generate_download_url(dataset_version_file) + print( + f"Downloading file {download_url} for filename : {dataset_version_file.filename}" + ) if mime_type == "text/csv": return pd.read_csv(download_url) elif ( @@ -38,7 +41,8 @@ def download_file_as_dataframe( ): return pd.read_parquet(download_url) else: - raise ValueError(f"Unsupported file format: {dataset_version_file.filename}") + logger.info(f"Unsupported file format: {dataset_version_file.filename}") + return None def generate_dataset_file_sample_task( @@ -55,7 +59,7 @@ def generate_dataset_file_sample_task( ) return - logger.info(f"Creating dataset sample for version file {dataset_version_file_id}") + logger.info(f"Creating dataset sample for version file {dataset_version_file.id}") try: dataset_file_metadata = DatasetFileMetadata.objects.create( dataset_version_file=dataset_version_file, @@ -67,16 +71,19 @@ def generate_dataset_file_sample_task( try: file_content = download_file_as_dataframe(dataset_version_file) - if not file_content.empty: - random_seed = 22 - file_sample = file_content.sample( - settings.WORKSPACE_DATASETS_FILE_SNAPSHOT_SIZE, - random_state=random_seed, - replace=True, - ) - dataset_file_metadata.sample = file_sample.to_json(orient="records") - else: + if file_content is None: dataset_file_metadata.sample = json.dumps([]) + else: + if not file_content.empty: + random_seed = 22 + file_sample = file_content.sample( + settings.WORKSPACE_DATASETS_FILE_SNAPSHOT_SIZE, + random_state=random_seed, + replace=True, + ) + dataset_file_metadata.sample = file_sample.to_json(orient="records") + else: + dataset_file_metadata.sample = json.dumps([]) logger.info(f"Dataset sample saved for file {dataset_version_file_id}") dataset_file_metadata.status = DatasetFileMetadata.STATUS_FINISHED dataset_file_metadata.save() From 0f7fd7cfc04a782519ef86cf71586012847fe43a Mon Sep 17 00:00:00 2001 From: nazarfil Date: Wed, 7 Aug 2024 15:12:32 +0200 Subject: [PATCH 36/37] fix(Dataset): not creating metadata if format not supported --- hexa/datasets/queue.py | 44 +++++++++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/hexa/datasets/queue.py b/hexa/datasets/queue.py index cefd56e73..6885f8799 100644 --- a/hexa/datasets/queue.py +++ b/hexa/datasets/queue.py @@ -18,6 +18,19 @@ logger = getLogger(__name__) +def is_supported_mimetype(filename: str) -> bool: + supported_mimetypes = [ + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + "application/vnd.ms-excel", + "application/vnd.apache.parquet", + "text/csv", + ] + supported_extensions = ["parquet"] + suffix = filename.split(".")[-1] + mime_type, encoding = mimetypes.guess_type(filename, strict=False) + return mime_type in supported_mimetypes or suffix in supported_extensions + + def download_file_as_dataframe( dataset_version_file: DatasetVersionFile, ) -> pd.DataFrame | None: @@ -25,9 +38,6 @@ def download_file_as_dataframe( dataset_version_file.filename, strict=False ) download_url = generate_download_url(dataset_version_file) 
- print( - f"Downloading file {download_url} for filename : {dataset_version_file.filename}" - ) if mime_type == "text/csv": return pd.read_csv(download_url) elif ( @@ -40,9 +50,6 @@ def download_file_as_dataframe( or dataset_version_file.filename.split(".")[-1] == "parquet" ): return pd.read_parquet(download_url) - else: - logger.info(f"Unsupported file format: {dataset_version_file.filename}") - return None def generate_dataset_file_sample_task( @@ -59,7 +66,10 @@ def generate_dataset_file_sample_task( ) return + if not is_supported_mimetype(dataset_version_file.filename): + logger.info(f"Unsupported file format: {dataset_version_file.filename}") + return + logger.info(f"Creating dataset sample for version file {dataset_version_file.id}") try: dataset_file_metadata = DatasetFileMetadata.objects.create( @@ -71,19 +82,16 @@ def generate_dataset_file_sample_task( try: file_content = download_file_as_dataframe(dataset_version_file) - if file_content is None: - dataset_file_metadata.sample = json.dumps([]) + if not file_content.empty: + random_seed = 22 + file_sample = file_content.sample( + settings.WORKSPACE_DATASETS_FILE_SNAPSHOT_SIZE, + random_state=random_seed, + replace=True, + ) + dataset_file_metadata.sample = file_sample.to_json(orient="records") else: - if not file_content.empty: - random_seed = 22 - file_sample = file_content.sample( - settings.WORKSPACE_DATASETS_FILE_SNAPSHOT_SIZE, - random_state=random_seed, - replace=True, - ) - dataset_file_metadata.sample = file_sample.to_json(orient="records") - else: - dataset_file_metadata.sample = json.dumps([]) + dataset_file_metadata.sample = json.dumps([]) logger.info(f"Dataset sample saved for file {dataset_version_file_id}") dataset_file_metadata.status = DatasetFileMetadata.STATUS_FINISHED dataset_file_metadata.save()
From 2cf1a6d014919a6c013e7ec6072334a321b14c8b Mon Sep 17 00:00:00 2001 From: nazarfil Date: Thu, 8 Aug 2024 15:05:43 +0200 Subject: [PATCH 37/37] fix: fixes issue with upload url --- hexa/datasets/graphql/schema.graphql | 2 +- hexa/datasets/schema/types.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-)
diff --git a/hexa/datasets/graphql/schema.graphql b/hexa/datasets/graphql/schema.graphql index 452e0d966..85190e781 100644 --- a/hexa/datasets/graphql/schema.graphql +++ b/hexa/datasets/graphql/schema.graphql @@ -333,7 +333,7 @@ type CreateDatasetVersionFileResult { "The created file object" file: DatasetVersionFile "The URL to upload the file to" - uploadUrl: String @deprecated(reason: "moved to dedicated generateDatasetUploadUrl mutation") + uploadUrl: String! @deprecated(reason: "moved to dedicated generateDatasetUploadUrl mutation") success: Boolean! errors: [CreateDatasetVersionFileError!]!
} diff --git a/hexa/datasets/schema/types.py b/hexa/datasets/schema/types.py index e65ca0d74..491f42f33 100644 --- a/hexa/datasets/schema/types.py +++ b/hexa/datasets/schema/types.py @@ -211,7 +211,8 @@ def resolve_version_permissions_delete(obj: DatasetVersion, info, **kwargs): @dataset_version_file_result_object.field("uploadUrl") def resolve_upload_url(obj, info, **kwargs): try: - upload_url = generate_upload_url(obj.uri, obj.content_type) + file = obj["file"] + upload_url = generate_upload_url(file.uri, file.content_type) return upload_url except BucketObjectAlreadyExists as exc: logging.error(f"Upload URL generation failed: {exc.message}") @@ -234,5 +235,6 @@ def resolve_version_file_metadata(obj: DatasetVersionFile, info, **kwargs): dataset_version_permissions, dataset_link_permissions, dataset_version_file_object, + dataset_version_file_result_object, dataset_link_object, ]
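Illustrative usage sketch (not part of the patch series above; the operation name and the id value are placeholders): once PATCH 30 exposes the datasetVersionFile query and the fileMetadata field, a client could fetch the worker-generated sample roughly like this:

query GetFileSample($id: ID!) {
  datasetVersionFile(id: $id) {
    filename
    fileMetadata {
      status   # PROCESSING, FAILED or FINISHED
      sample   # JSON-encoded rows sampled by the dataset worker
    }
  }
}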