Skip to content

Commit

Permalink
feat(Dataset): metadata as array for each column
Browse files Browse the repository at this point in the history
  • Loading branch information
nazarfil committed Aug 8, 2024
1 parent ce79c75 commit 09ee81d
Show file tree
Hide file tree
Showing 6 changed files with 127 additions and 131 deletions.
50 changes: 25 additions & 25 deletions hexa/datasets/queue.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,31 +53,31 @@ def download_file_as_dataframe(
return pd.read_parquet(download_url)


def normalize_data(data):
    """Normalize nested list entries in a profiling mapping, in place.

    For every value in *data* (expected to be a sequence of items), any item
    that is itself a list is replaced with a dict keyed ``item_0``,
    ``item_1``, ... so the structure becomes JSON-friendly (no nested
    positional lists).

    :param data: mapping of key -> sequence of items (items may be lists)
    :return: the same mapping, mutated in place
    """
    for key, items in data.items():
        for index, item in enumerate(items):
            if isinstance(item, list):
                # enumerate() instead of range(len(...)): same pairs, idiomatic.
                data[key][index] = {
                    f"item_{position}": element
                    for position, element in enumerate(item)
                }
    return data


def metadata_profiling(file_content: pd.DataFrame) -> dict[str, str]:
    """Profile a DataFrame and return one JSON string per metric.

    NOTE: mutates *file_content* in place — every pandas ``object`` column is
    cast to the dedicated ``string`` dtype so dtype reporting below is
    explicit rather than the generic ``object``.

    :param file_content: DataFrame to profile (modified in place, see above)
    :return: dict mapping metric name -> JSON-encoded Series, where each
        Series is indexed by column name. Metrics: column_names, data_types,
        missing_values, unique_values, distinct_values, constant_values.
    """
    # Cast all object-dtype columns to "string" in one vectorized assignment.
    file_content[
        file_content.select_dtypes(["object"]).columns
    ] = file_content.select_dtypes(["object"]).astype("string")

    # Each entry is a Series aligned on the DataFrame's columns.
    profiling = {
        "column_names": file_content.columns.to_series(),
        "data_types": file_content.dtypes.apply(str),
        "missing_values": file_content.isnull().sum(),
        # nunique() ignores NaN by default ...
        "unique_values": file_content.nunique(),
        # ... while dropna=False counts NaN as its own distinct value.
        "distinct_values": file_content.apply(lambda x: x.nunique(dropna=False)),
        "constant_values": file_content.apply(lambda x: x.nunique() == 1).astype(
            "bool"
        ),
    }
    # Serialize each metric Series independently; consumers decode per key.
    profiling_as_json = {key: val.to_json() for key, val in profiling.items()}
    return profiling_as_json
def metadata_profiling(file_content: pd.DataFrame) -> list:
    """Compute per-column profiling metadata for a DataFrame.

    :param file_content: DataFrame to profile. The input is NOT modified:
        object-dtype columns are cast to the pandas ``string`` dtype on a
        copy, so dtypes are reported explicitly instead of as ``object``.
    :return: list with one dict per column, each containing the keys
        column_names, data_types, missing_values, unique_values,
        distinct_values and constant_values. Numeric metrics are plain
        Python ints/bools so the result is directly JSON-serializable.
    """
    # Work on a copy so profiling has no side effect on the caller's frame
    # (the previous implementation mutated file_content in place).
    frame = file_content.copy()
    object_columns = frame.select_dtypes(include=["object"]).columns
    if len(object_columns):
        frame[object_columns] = frame[object_columns].astype("string")

    metadata_per_column = []
    for column in frame.columns:
        series = frame[column]
        # nunique() ignores NaN; dropna=False counts NaN as a distinct value.
        unique = int(series.nunique())
        metadata_per_column.append(
            {
                "column_names": column,
                "data_types": str(series.dtype),
                "missing_values": int(series.isnull().sum()),
                "unique_values": unique,
                "distinct_values": int(series.nunique(dropna=False)),
                "constant_values": unique == 1,
            }
        )

    return metadata_per_column


def generate_dataset_file_sample_task(
Expand Down
124 changes: 62 additions & 62 deletions hexa/datasets/tests/fixtures/example_names.csv
Original file line number Diff line number Diff line change
@@ -1,62 +1,62 @@
name,surname
Joe,Doe
Liam,Smith
Emma,Johnson
Noah,Williams
Olivia,Brown
William,Jones
Ava,Garcia
James,Miller
Sophia,Davis
Oliver,Martinez
Isabella,Hernandez
Benjamin,Lopez
Mia,Gonzalez
Elijah,Wilson
Charlotte,Anderson
Lucas,Thomas
Amelia,Taylor
Mason,Moore
Harper,Jackson
Logan,Martin
Evelyn,Lee
Alexander,Perez
Abigail,Thompson
Ethan,White
Emily,Harris
Jacob,Sanchez
Ella,Clark
Michael,Ramirez
Avery,Lewis
Daniel,Robinson
Sofia,Walker
Henry,Young
Scarlett,Allen
Jackson,King
Grace,Scott
Sebastian,Green
Victoria,Baker
Aiden,Adams
Chloe,Nelson
Matthew,Hill
Riley,Campbell
Samuel,Mitchell
Aria,Carter
David,Rogers
Lily,Evans
Joseph,Murphy
Layla,Parker
Carter,Roberts
Aubrey,Gonzalez
Owen,Reed
Zoey,Cook
Wyatt,Morgan
Hannah,Murphy
Jack,Howard
Lillian,Richardson
Luke,Cox
Addison,James
Gabriel,Wright
Eleanor,Hughes
Anthony,Butler
Natalie,Foster
name,surname,age,married
Joe,Doe,31,True
Liam,Smith,47,False
Emma,Johnson,87,False
Noah,Williams,11,False
Olivia,Brown,45,False
William,Jones,32,True
Ava,Garcia,45,True
James,Miller,70,True
Sophia,Davis,73,True
Oliver,Martinez,66,False
Isabella,Hernandez,25,True
Benjamin,Lopez,73,True
Mia,Gonzalez,46,False
Elijah,Wilson,98,True
Charlotte,Anderson,70,False
Lucas,Thomas,68,True
Amelia,Taylor,95,True
Mason,Moore,58,False
Harper,Jackson,5,False
Logan,Martin,76,False
Evelyn,Lee,25,False
Alexander,Perez,100,True
Abigail,Thompson,60,True
Ethan,White,45,True
Emily,Harris,94,False
Jacob,Sanchez,90,True
Ella,Clark,28,True
Michael,Ramirez,70,True
Avery,Lewis,46,True
Daniel,Robinson,14,True
Sofia,Walker,56,True
Henry,Young,25,True
Scarlett,Allen,11,True
Jackson,King,28,False
Grace,Scott,61,True
Sebastian,Green,16,False
Victoria,Baker,9,False
Aiden,Adams,83,False
Chloe,Nelson,80,False
Matthew,Hill,46,False
Riley,Campbell,49,False
Samuel,Mitchell,84,True
Aria,Carter,99,False
David,Rogers,9,False
Lily,Evans,65,False
Joseph,Murphy,3,False
Layla,Parker,16,False
Carter,Roberts,41,True
Aubrey,Gonzalez,52,False
Owen,Reed,55,True
Zoey,Cook,33,True
Wyatt,Morgan,4,True
Hannah,Murphy,1,False
Jack,Howard,82,True
Lillian,Richardson,26,True
Luke,Cox,10,True
Addison,James,59,False
Gabriel,Wright,10,True
Eleanor,Hughes,66,True
Anthony,Butler,36,False
Natalie,Foster,8,True
3 changes: 3 additions & 0 deletions hexa/datasets/tests/fixtures/example_names_2_lines_result.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
column_names,data_types,missing_values,unique_values,distinct_values,constant_values
name,string,0,2,2,False
surname,string,0,2,2,False
5 changes: 5 additions & 0 deletions hexa/datasets/tests/fixtures/example_names_profiling.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
column_names,data_types,missing_values,unique_values,distinct_values,constant_values
name,string,0,61,61,False
surname,string,0,59,59,False
age,int64,0,46,46,False
married,bool,0,2,2,False
76 changes: 32 additions & 44 deletions hexa/datasets/tests/test_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,47 +181,35 @@ def test_fill_in_metadata(
mock_DatasetFileMetadata_create,
mock_DatasetVersionFile_get,
):
filename = "example_names.csv"
dataset_version_file = mock.Mock()
dataset_version_file.id = 1
dataset_version_file.filename = f"{filename}"
mock_DatasetVersionFile_get.return_value = dataset_version_file

dataset_file_metadata = mock.Mock()
mock_DatasetFileMetadata_create.return_value = dataset_file_metadata

fixture_file_path = os.path.join(
os.path.dirname(__file__), f"./fixtures/{filename}"
)
mock_generate_download_url.return_value = fixture_file_path

job = mock.Mock()
job.args = {"file_id": dataset_version_file.id}

generate_dataset_file_sample_task(mock.Mock(), job)
print(
f"status: {dataset_file_metadata.status} : reason: {dataset_file_metadata.status_reason}"
)
data = json.loads(dataset_file_metadata.profiling)
decoded_profiling = {key: json.loads(value) for key, value in data.items()}
data_pd = pd.DataFrame(decoded_profiling)
dtype_spec = {
"column_names": "string",
"data_types": "string",
"missing_values": "int",
"unique_values": "int",
"distinct_values": "int",
"constant_values": "string",
}
data_pd.to_csv(fixture_file_path.replace(".csv", "_result.csv"), index=False)
expected_profiling = pd.read_csv(
fixture_file_path.replace(".csv", "_result.csv"), dtype=dtype_spec
)

data_pd, expected_data_pd = data_pd.align(
expected_profiling, join="outer", axis=1
)
data_pd = data_pd.sort_index(axis=1)
expected_data_pd = expected_data_pd.sort_index(axis=1)

self.assertEqual(data_pd.equals(expected_data_pd), True)
filenames = ["example_names.csv", "senegal_rural_raw.csv"]
for filename in filenames:
dataset_version_file = mock.Mock()
dataset_version_file.id = 1
dataset_version_file.filename = f"{filename}"
mock_DatasetVersionFile_get.return_value = dataset_version_file

dataset_file_metadata = mock.Mock()
mock_DatasetFileMetadata_create.return_value = dataset_file_metadata

fixture_file_path = os.path.join(
os.path.dirname(__file__), f"./fixtures/{filename}"
)
mock_generate_download_url.return_value = fixture_file_path

job = mock.Mock()
job.args = {"file_id": dataset_version_file.id}

generate_dataset_file_sample_task(mock.Mock(), job)
metadata_profiling = json.loads(dataset_file_metadata.profiling)
profiling = pd.DataFrame(metadata_profiling)
expected_profiling = pd.read_csv(
fixture_file_path.replace(".csv", "_profiling.csv")
)
expected_profiling = expected_profiling.sort_index(axis=1)
profiling = profiling.sort_index(axis=1)
self.assertTrue(expected_profiling.equals(profiling))

mock_generate_download_url.reset_mock()
mock_DatasetVersionFile_get.reset_mock()
mock_DatasetFileMetadata_create.reset_mock()
dataset_file_metadata.save.reset_mock()

0 comments on commit 09ee81d

Please sign in to comment.