From 09ee81da122f5090b96282176b66b2ccb5a8e078 Mon Sep 17 00:00:00 2001 From: nazarfil Date: Thu, 8 Aug 2024 09:25:01 +0200 Subject: [PATCH] feat(Dataset): metadata as array for each column --- hexa/datasets/queue.py | 50 +++---- .../datasets/tests/fixtures/example_names.csv | 124 +++++++++--------- .../fixtures/example_names_2_lines_result.csv | 3 + .../fixtures/example_names_profiling.csv | 5 + ...lt.csv => senegal_rural_raw_profiling.csv} | 0 hexa/datasets/tests/test_metadata.py | 76 +++++------ 6 files changed, 127 insertions(+), 131 deletions(-) create mode 100644 hexa/datasets/tests/fixtures/example_names_2_lines_result.csv create mode 100644 hexa/datasets/tests/fixtures/example_names_profiling.csv rename hexa/datasets/tests/fixtures/{senegal_rural_raw_result.csv => senegal_rural_raw_profiling.csv} (100%) diff --git a/hexa/datasets/queue.py b/hexa/datasets/queue.py index 46d118c2b..05448fbb6 100644 --- a/hexa/datasets/queue.py +++ b/hexa/datasets/queue.py @@ -53,31 +53,31 @@ def download_file_as_dataframe( return pd.read_parquet(download_url) -def normalize_data(data): - for key, value in data.items(): - for i, item in enumerate(value): - if isinstance(item, list): - data[key][i] = {f"item_{j}": item[j] for j in range(len(item))} - return data - - -def metadata_profiling(file_content: pd.DataFrame) -> dict[str, str]: - file_content[ - file_content.select_dtypes(["object"]).columns - ] = file_content.select_dtypes(["object"]).astype("string") - - profiling = { - "column_names": file_content.columns.to_series(), - "data_types": file_content.dtypes.apply(str), - "missing_values": file_content.isnull().sum(), - "unique_values": file_content.nunique(), - "distinct_values": file_content.apply(lambda x: x.nunique(dropna=False)), - "constant_values": file_content.apply(lambda x: x.nunique() == 1).astype( - "bool" - ), - } - profiling_as_json = {key: val.to_json() for key, val in profiling.items()} - return profiling_as_json +def metadata_profiling(file_content: pd.DataFrame) -> list: + for col in file_content.select_dtypes(include=["object"]).columns: + file_content[col] = file_content[col].astype("string") + + data_types = file_content.dtypes.apply(str).to_dict() + missing_values = file_content.isnull().sum().to_dict() + unique_values = file_content.nunique().to_dict() + distinct_values = file_content.apply(lambda x: x.nunique(dropna=False)).to_dict() + constant_values = ( + file_content.apply(lambda x: x.nunique() == 1).astype("bool").to_dict() + ) + + metadata_per_column = [ + { + "column_names": column, + "data_types": data_types.get(column, "-"), + "missing_values": missing_values.get(column, "-"), + "unique_values": unique_values.get(column, "-"), + "distinct_values": distinct_values.get(column, "-"), + "constant_values": constant_values.get(column, "-"), + } + for column in file_content.columns + ] + + return metadata_per_column def generate_dataset_file_sample_task( diff --git a/hexa/datasets/tests/fixtures/example_names.csv b/hexa/datasets/tests/fixtures/example_names.csv index 93b20ed12..d140a01aa 100644 --- a/hexa/datasets/tests/fixtures/example_names.csv +++ b/hexa/datasets/tests/fixtures/example_names.csv @@ -1,62 +1,62 @@ -name,surname -Joe,Doe -Liam,Smith -Emma,Johnson -Noah,Williams -Olivia,Brown -William,Jones -Ava,Garcia -James,Miller -Sophia,Davis -Oliver,Martinez -Isabella,Hernandez -Benjamin,Lopez -Mia,Gonzalez -Elijah,Wilson -Charlotte,Anderson -Lucas,Thomas -Amelia,Taylor -Mason,Moore -Harper,Jackson -Logan,Martin -Evelyn,Lee -Alexander,Perez -Abigail,Thompson 
-Ethan,White -Emily,Harris -Jacob,Sanchez -Ella,Clark -Michael,Ramirez -Avery,Lewis -Daniel,Robinson -Sofia,Walker -Henry,Young -Scarlett,Allen -Jackson,King -Grace,Scott -Sebastian,Green -Victoria,Baker -Aiden,Adams -Chloe,Nelson -Matthew,Hill -Riley,Campbell -Samuel,Mitchell -Aria,Carter -David,Rogers -Lily,Evans -Joseph,Murphy -Layla,Parker -Carter,Roberts -Aubrey,Gonzalez -Owen,Reed -Zoey,Cook -Wyatt,Morgan -Hannah,Murphy -Jack,Howard -Lillian,Richardson -Luke,Cox -Addison,James -Gabriel,Wright -Eleanor,Hughes -Anthony,Butler -Natalie,Foster \ No newline at end of file +name,surname,age,married +Joe,Doe,31,True +Liam,Smith,47,False +Emma,Johnson,87,False +Noah,Williams,11,False +Olivia,Brown,45,False +William,Jones,32,True +Ava,Garcia,45,True +James,Miller,70,True +Sophia,Davis,73,True +Oliver,Martinez,66,False +Isabella,Hernandez,25,True +Benjamin,Lopez,73,True +Mia,Gonzalez,46,False +Elijah,Wilson,98,True +Charlotte,Anderson,70,False +Lucas,Thomas,68,True +Amelia,Taylor,95,True +Mason,Moore,58,False +Harper,Jackson,5,False +Logan,Martin,76,False +Evelyn,Lee,25,False +Alexander,Perez,100,True +Abigail,Thompson,60,True +Ethan,White,45,True +Emily,Harris,94,False +Jacob,Sanchez,90,True +Ella,Clark,28,True +Michael,Ramirez,70,True +Avery,Lewis,46,True +Daniel,Robinson,14,True +Sofia,Walker,56,True +Henry,Young,25,True +Scarlett,Allen,11,True +Jackson,King,28,False +Grace,Scott,61,True +Sebastian,Green,16,False +Victoria,Baker,9,False +Aiden,Adams,83,False +Chloe,Nelson,80,False +Matthew,Hill,46,False +Riley,Campbell,49,False +Samuel,Mitchell,84,True +Aria,Carter,99,False +David,Rogers,9,False +Lily,Evans,65,False +Joseph,Murphy,3,False +Layla,Parker,16,False +Carter,Roberts,41,True +Aubrey,Gonzalez,52,False +Owen,Reed,55,True +Zoey,Cook,33,True +Wyatt,Morgan,4,True +Hannah,Murphy,1,False +Jack,Howard,82,True +Lillian,Richardson,26,True +Luke,Cox,10,True +Addison,James,59,False +Gabriel,Wright,10,True +Eleanor,Hughes,66,True +Anthony,Butler,36,False +Natalie,Foster,8,True diff --git a/hexa/datasets/tests/fixtures/example_names_2_lines_result.csv b/hexa/datasets/tests/fixtures/example_names_2_lines_result.csv new file mode 100644 index 000000000..c1a4fddcb --- /dev/null +++ b/hexa/datasets/tests/fixtures/example_names_2_lines_result.csv @@ -0,0 +1,3 @@ +column_names,data_types,missing_values,unique_values,distinct_values,constant_values +name,string,0,2,2,False +surname,string,0,2,2,False diff --git a/hexa/datasets/tests/fixtures/example_names_profiling.csv b/hexa/datasets/tests/fixtures/example_names_profiling.csv new file mode 100644 index 000000000..c6fc3b872 --- /dev/null +++ b/hexa/datasets/tests/fixtures/example_names_profiling.csv @@ -0,0 +1,5 @@ +column_names,data_types,missing_values,unique_values,distinct_values,constant_values +name,string,0,61,61,False +surname,string,0,59,59,False +age,int64,0,46,46,False +married,bool,0,2,2,False diff --git a/hexa/datasets/tests/fixtures/senegal_rural_raw_result.csv b/hexa/datasets/tests/fixtures/senegal_rural_raw_profiling.csv similarity index 100% rename from hexa/datasets/tests/fixtures/senegal_rural_raw_result.csv rename to hexa/datasets/tests/fixtures/senegal_rural_raw_profiling.csv diff --git a/hexa/datasets/tests/test_metadata.py b/hexa/datasets/tests/test_metadata.py index f553198a9..d06bfaca7 100644 --- a/hexa/datasets/tests/test_metadata.py +++ b/hexa/datasets/tests/test_metadata.py @@ -181,47 +181,35 @@ def test_fill_in_metadata( mock_DatasetFileMetadata_create, mock_DatasetVersionFile_get, ): - filename = "example_names.csv" - 
-        dataset_version_file = mock.Mock()
-        dataset_version_file.id = 1
-        dataset_version_file.filename = f"{filename}"
-        mock_DatasetVersionFile_get.return_value = dataset_version_file
-
-        dataset_file_metadata = mock.Mock()
-        mock_DatasetFileMetadata_create.return_value = dataset_file_metadata
-
-        fixture_file_path = os.path.join(
-            os.path.dirname(__file__), f"./fixtures/{filename}"
-        )
-        mock_generate_download_url.return_value = fixture_file_path
-
-        job = mock.Mock()
-        job.args = {"file_id": dataset_version_file.id}
-
-        generate_dataset_file_sample_task(mock.Mock(), job)
-        print(
-            f"status: {dataset_file_metadata.status} : reason: {dataset_file_metadata.status_reason}"
-        )
-        data = json.loads(dataset_file_metadata.profiling)
-        decoded_profiling = {key: json.loads(value) for key, value in data.items()}
-        data_pd = pd.DataFrame(decoded_profiling)
-        dtype_spec = {
-            "column_names": "string",
-            "data_types": "string",
-            "missing_values": "int",
-            "unique_values": "int",
-            "distinct_values": "int",
-            "constant_values": "string",
-        }
-        data_pd.to_csv(fixture_file_path.replace(".csv", "_result.csv"), index=False)
-        expected_profiling = pd.read_csv(
-            fixture_file_path.replace(".csv", "_result.csv"), dtype=dtype_spec
-        )
-
-        data_pd, expected_data_pd = data_pd.align(
-            expected_profiling, join="outer", axis=1
-        )
-        data_pd = data_pd.sort_index(axis=1)
-        expected_data_pd = expected_data_pd.sort_index(axis=1)
-
-        self.assertEqual(data_pd.equals(expected_data_pd), True)
+        filenames = ["example_names.csv", "senegal_rural_raw.csv"]
+        for filename in filenames:
+            dataset_version_file = mock.Mock()
+            dataset_version_file.id = 1
+            dataset_version_file.filename = filename
+            mock_DatasetVersionFile_get.return_value = dataset_version_file
+
+            dataset_file_metadata = mock.Mock()
+            mock_DatasetFileMetadata_create.return_value = dataset_file_metadata
+
+            fixture_file_path = os.path.join(
+                os.path.dirname(__file__), f"./fixtures/{filename}"
+            )
+            mock_generate_download_url.return_value = fixture_file_path
+
+            job = mock.Mock()
+            job.args = {"file_id": dataset_version_file.id}
+
+            generate_dataset_file_sample_task(mock.Mock(), job)
+            profiling_per_column = json.loads(dataset_file_metadata.profiling)
+            profiling = pd.DataFrame(profiling_per_column)
+            expected_profiling = pd.read_csv(
+                fixture_file_path.replace(".csv", "_profiling.csv")
+            )
+            expected_profiling = expected_profiling.sort_index(axis=1)
+            profiling = profiling.sort_index(axis=1)
+            self.assertTrue(expected_profiling.equals(profiling))
+
+            mock_generate_download_url.reset_mock()
+            mock_DatasetVersionFile_get.reset_mock()
+            mock_DatasetFileMetadata_create.reset_mock()
+            dataset_file_metadata.save.reset_mock()
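
Note (not part of the patch): a rough sketch of the per-column structure the reworked metadata_profiling returns, run against a tiny illustrative frame. The import path comes from the diff above; the sample columns and values are made up, not taken from the fixtures.

import pandas as pd

from hexa.datasets.queue import metadata_profiling

# Illustrative only: two columns, two rows.
df = pd.DataFrame({"name": ["Joe", "Liam"], "age": [31, 47]})

profiling = metadata_profiling(df)
# One dict per column, with the same keys as the *_profiling.csv fixtures:
# [
#   {"column_names": "name", "data_types": "string", "missing_values": 0,
#    "unique_values": 2, "distinct_values": 2, "constant_values": False},
#   {"column_names": "age", "data_types": "int64", "missing_values": 0,
#    "unique_values": 2, "distinct_values": 2, "constant_values": False},
# ]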