From 09ee81da122f5090b96282176b66b2ccb5a8e078 Mon Sep 17 00:00:00 2001 From: nazarfil Date: Thu, 8 Aug 2024 09:25:01 +0200 Subject: [PATCH] feat(Dataset): metadata as array for each column --- hexa/datasets/queue.py | 50 +++---- .../datasets/tests/fixtures/example_names.csv | 124 +++++++++--------- .../fixtures/example_names_2_lines_result.csv | 3 + .../fixtures/example_names_profiling.csv | 5 + ...lt.csv => senegal_rural_raw_profiling.csv} | 0 hexa/datasets/tests/test_metadata.py | 76 +++++------ 6 files changed, 127 insertions(+), 131 deletions(-) create mode 100644 hexa/datasets/tests/fixtures/example_names_2_lines_result.csv create mode 100644 hexa/datasets/tests/fixtures/example_names_profiling.csv rename hexa/datasets/tests/fixtures/{senegal_rural_raw_result.csv => senegal_rural_raw_profiling.csv} (100%) diff --git a/hexa/datasets/queue.py b/hexa/datasets/queue.py index 46d118c2b..05448fbb6 100644 --- a/hexa/datasets/queue.py +++ b/hexa/datasets/queue.py @@ -53,31 +53,31 @@ def download_file_as_dataframe( return pd.read_parquet(download_url) -def normalize_data(data): - for key, value in data.items(): - for i, item in enumerate(value): - if isinstance(item, list): - data[key][i] = {f"item_{j}": item[j] for j in range(len(item))} - return data - - -def metadata_profiling(file_content: pd.DataFrame) -> dict[str, str]: - file_content[ - file_content.select_dtypes(["object"]).columns - ] = file_content.select_dtypes(["object"]).astype("string") - - profiling = { - "column_names": file_content.columns.to_series(), - "data_types": file_content.dtypes.apply(str), - "missing_values": file_content.isnull().sum(), - "unique_values": file_content.nunique(), - "distinct_values": file_content.apply(lambda x: x.nunique(dropna=False)), - "constant_values": file_content.apply(lambda x: x.nunique() == 1).astype( - "bool" - ), - } - profiling_as_json = {key: val.to_json() for key, val in profiling.items()} - return profiling_as_json +def metadata_profiling(file_content: pd.DataFrame) -> list: + for col in file_content.select_dtypes(include=["object"]).columns: + file_content[col] = file_content[col].astype("string") + + data_types = file_content.dtypes.apply(str).to_dict() + missing_values = file_content.isnull().sum().to_dict() + unique_values = file_content.nunique().to_dict() + distinct_values = file_content.apply(lambda x: x.nunique(dropna=False)).to_dict() + constant_values = ( + file_content.apply(lambda x: x.nunique() == 1).astype("bool").to_dict() + ) + + metadata_per_column = [ + { + "column_names": column, + "data_types": data_types.get(column, "-"), + "missing_values": missing_values.get(column, "-"), + "unique_values": unique_values.get(column, "-"), + "distinct_values": distinct_values.get(column, "-"), + "constant_values": constant_values.get(column, "-"), + } + for column in file_content.columns + ] + + return metadata_per_column def generate_dataset_file_sample_task( diff --git a/hexa/datasets/tests/fixtures/example_names.csv b/hexa/datasets/tests/fixtures/example_names.csv index 93b20ed12..d140a01aa 100644 --- a/hexa/datasets/tests/fixtures/example_names.csv +++ b/hexa/datasets/tests/fixtures/example_names.csv @@ -1,62 +1,62 @@ -name,surname -Joe,Doe -Liam,Smith -Emma,Johnson -Noah,Williams -Olivia,Brown -William,Jones -Ava,Garcia -James,Miller -Sophia,Davis -Oliver,Martinez -Isabella,Hernandez -Benjamin,Lopez -Mia,Gonzalez -Elijah,Wilson -Charlotte,Anderson -Lucas,Thomas -Amelia,Taylor -Mason,Moore -Harper,Jackson -Logan,Martin -Evelyn,Lee -Alexander,Perez -Abigail,Thompson 
-Ethan,White -Emily,Harris -Jacob,Sanchez -Ella,Clark -Michael,Ramirez -Avery,Lewis -Daniel,Robinson -Sofia,Walker -Henry,Young -Scarlett,Allen -Jackson,King -Grace,Scott -Sebastian,Green -Victoria,Baker -Aiden,Adams -Chloe,Nelson -Matthew,Hill -Riley,Campbell -Samuel,Mitchell -Aria,Carter -David,Rogers -Lily,Evans -Joseph,Murphy -Layla,Parker -Carter,Roberts -Aubrey,Gonzalez -Owen,Reed -Zoey,Cook -Wyatt,Morgan -Hannah,Murphy -Jack,Howard -Lillian,Richardson -Luke,Cox -Addison,James -Gabriel,Wright -Eleanor,Hughes -Anthony,Butler -Natalie,Foster \ No newline at end of file +name,surname,age,married +Joe,Doe,31,True +Liam,Smith,47,False +Emma,Johnson,87,False +Noah,Williams,11,False +Olivia,Brown,45,False +William,Jones,32,True +Ava,Garcia,45,True +James,Miller,70,True +Sophia,Davis,73,True +Oliver,Martinez,66,False +Isabella,Hernandez,25,True +Benjamin,Lopez,73,True +Mia,Gonzalez,46,False +Elijah,Wilson,98,True +Charlotte,Anderson,70,False +Lucas,Thomas,68,True +Amelia,Taylor,95,True +Mason,Moore,58,False +Harper,Jackson,5,False +Logan,Martin,76,False +Evelyn,Lee,25,False +Alexander,Perez,100,True +Abigail,Thompson,60,True +Ethan,White,45,True +Emily,Harris,94,False +Jacob,Sanchez,90,True +Ella,Clark,28,True +Michael,Ramirez,70,True +Avery,Lewis,46,True +Daniel,Robinson,14,True +Sofia,Walker,56,True +Henry,Young,25,True +Scarlett,Allen,11,True +Jackson,King,28,False +Grace,Scott,61,True +Sebastian,Green,16,False +Victoria,Baker,9,False +Aiden,Adams,83,False +Chloe,Nelson,80,False +Matthew,Hill,46,False +Riley,Campbell,49,False +Samuel,Mitchell,84,True +Aria,Carter,99,False +David,Rogers,9,False +Lily,Evans,65,False +Joseph,Murphy,3,False +Layla,Parker,16,False +Carter,Roberts,41,True +Aubrey,Gonzalez,52,False +Owen,Reed,55,True +Zoey,Cook,33,True +Wyatt,Morgan,4,True +Hannah,Murphy,1,False +Jack,Howard,82,True +Lillian,Richardson,26,True +Luke,Cox,10,True +Addison,James,59,False +Gabriel,Wright,10,True +Eleanor,Hughes,66,True +Anthony,Butler,36,False +Natalie,Foster,8,True diff --git a/hexa/datasets/tests/fixtures/example_names_2_lines_result.csv b/hexa/datasets/tests/fixtures/example_names_2_lines_result.csv new file mode 100644 index 000000000..c1a4fddcb --- /dev/null +++ b/hexa/datasets/tests/fixtures/example_names_2_lines_result.csv @@ -0,0 +1,3 @@ +column_names,data_types,missing_values,unique_values,distinct_values,constant_values +name,string,0,2,2,False +surname,string,0,2,2,False diff --git a/hexa/datasets/tests/fixtures/example_names_profiling.csv b/hexa/datasets/tests/fixtures/example_names_profiling.csv new file mode 100644 index 000000000..c6fc3b872 --- /dev/null +++ b/hexa/datasets/tests/fixtures/example_names_profiling.csv @@ -0,0 +1,5 @@ +column_names,data_types,missing_values,unique_values,distinct_values,constant_values +name,string,0,61,61,False +surname,string,0,59,59,False +age,int64,0,46,46,False +married,bool,0,2,2,False diff --git a/hexa/datasets/tests/fixtures/senegal_rural_raw_result.csv b/hexa/datasets/tests/fixtures/senegal_rural_raw_profiling.csv similarity index 100% rename from hexa/datasets/tests/fixtures/senegal_rural_raw_result.csv rename to hexa/datasets/tests/fixtures/senegal_rural_raw_profiling.csv diff --git a/hexa/datasets/tests/test_metadata.py b/hexa/datasets/tests/test_metadata.py index f553198a9..d06bfaca7 100644 --- a/hexa/datasets/tests/test_metadata.py +++ b/hexa/datasets/tests/test_metadata.py @@ -181,47 +181,35 @@ def test_fill_in_metadata( mock_DatasetFileMetadata_create, mock_DatasetVersionFile_get, ): - filename = "example_names.csv" - 
-        dataset_version_file = mock.Mock()
-        dataset_version_file.id = 1
-        dataset_version_file.filename = f"{filename}"
-        mock_DatasetVersionFile_get.return_value = dataset_version_file
-
-        dataset_file_metadata = mock.Mock()
-        mock_DatasetFileMetadata_create.return_value = dataset_file_metadata
-
-        fixture_file_path = os.path.join(
-            os.path.dirname(__file__), f"./fixtures/{filename}"
-        )
-        mock_generate_download_url.return_value = fixture_file_path
-
-        job = mock.Mock()
-        job.args = {"file_id": dataset_version_file.id}
-
-        generate_dataset_file_sample_task(mock.Mock(), job)
-        print(
-            f"status: {dataset_file_metadata.status} : reason: {dataset_file_metadata.status_reason}"
-        )
-        data = json.loads(dataset_file_metadata.profiling)
-        decoded_profiling = {key: json.loads(value) for key, value in data.items()}
-        data_pd = pd.DataFrame(decoded_profiling)
-        dtype_spec = {
-            "column_names": "string",
-            "data_types": "string",
-            "missing_values": "int",
-            "unique_values": "int",
-            "distinct_values": "int",
-            "constant_values": "string",
-        }
-        data_pd.to_csv(fixture_file_path.replace(".csv", "_result.csv"), index=False)
-        expected_profiling = pd.read_csv(
-            fixture_file_path.replace(".csv", "_result.csv"), dtype=dtype_spec
-        )
-
-        data_pd, expected_data_pd = data_pd.align(
-            expected_profiling, join="outer", axis=1
-        )
-        data_pd = data_pd.sort_index(axis=1)
-        expected_data_pd = expected_data_pd.sort_index(axis=1)
-
-        self.assertEqual(data_pd.equals(expected_data_pd), True)
+        filenames = ["example_names.csv", "senegal_rural_raw.csv"]
+        for filename in filenames:
+            dataset_version_file = mock.Mock()
+            dataset_version_file.id = 1
+            dataset_version_file.filename = filename
+            mock_DatasetVersionFile_get.return_value = dataset_version_file
+
+            dataset_file_metadata = mock.Mock()
+            mock_DatasetFileMetadata_create.return_value = dataset_file_metadata
+
+            fixture_file_path = os.path.join(
+                os.path.dirname(__file__), f"./fixtures/{filename}"
+            )
+            mock_generate_download_url.return_value = fixture_file_path
+
+            job = mock.Mock()
+            job.args = {"file_id": dataset_version_file.id}
+
+            generate_dataset_file_sample_task(mock.Mock(), job)
+            profiling_per_column = json.loads(dataset_file_metadata.profiling)
+            profiling = pd.DataFrame(profiling_per_column)
+            expected_profiling = pd.read_csv(
+                fixture_file_path.replace(".csv", "_profiling.csv")
+            )
+            expected_profiling = expected_profiling.sort_index(axis=1)
+            profiling = profiling.sort_index(axis=1)
+            self.assertTrue(expected_profiling.equals(profiling))
+
+            mock_generate_download_url.reset_mock()
+            mock_DatasetVersionFile_get.reset_mock()
+            mock_DatasetFileMetadata_create.reset_mock()
+            dataset_file_metadata.save.reset_mock()
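
Note (not part of the patch): a rough sketch of the per-column structure the reworked metadata_profiling returns, run against a tiny illustrative frame. The import path comes from the diff above; the sample columns and values are made up, not taken from the fixtures.

import pandas as pd

from hexa.datasets.queue import metadata_profiling

# Illustrative only: two columns, two rows.
df = pd.DataFrame({"name": ["Joe", "Liam"], "age": [31, 47]})

profiling = metadata_profiling(df)
# One dict per column, with the same keys as the *_profiling.csv fixtures:
# [
#   {"column_names": "name", "data_types": "string", "missing_values": 0,
#    "unique_values": 2, "distinct_values": 2, "constant_values": False},
#   {"column_names": "age", "data_types": "int64", "missing_values": 0,
#    "unique_values": 2, "distinct_values": 2, "constant_values": False},
# ]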