From f3a56dbbba2bf15e8d54f36aa0e32006c8787ae2 Mon Sep 17 00:00:00 2001 From: Nazar F Date: Tue, 3 Dec 2024 16:31:17 +0100 Subject: [PATCH] Fix remove byte profiling (#863) * feat: skip any byte column for profiling, adds number of rows * feat: skip any byte column for profiling, adds number of column --- hexa/datasets/queue.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/hexa/datasets/queue.py b/hexa/datasets/queue.py index 0beaa02c8..e9e2b0d01 100644 --- a/hexa/datasets/queue.py +++ b/hexa/datasets/queue.py @@ -96,26 +96,26 @@ def generate_sample( def generate_profile(df: pd.DataFrame) -> list: - logger.info("Calculating profiling per column") + logger.info("Starting profiling calculation per column") try: - if "geometry" in df.columns: - logger.warning("Skipping the 'geometry' column from profiling.") - df = df.drop(columns=["geometry"]) for col in df.select_dtypes(include=["object"]).columns: try: df[col] = df[col].astype("string") except Exception as e: logger.warning(f"Failed to convert column '{col}' to string: {e}") + df.drop(columns=[col], inplace=True) data_types = df.dtypes.apply(str).to_dict() missing_values = df.isnull().sum().to_dict() unique_values = df.nunique().to_dict() distinct_values = df.apply(lambda x: x.nunique(dropna=False)).to_dict() constant_values = df.apply(lambda x: x.nunique() == 1).astype("bool").to_dict() + count = df.count().to_dict() metadata_per_column = [ { "column_name": column, + "count": count.get(column), "data_type": data_types.get(column), "missing_values": missing_values.get(column), "unique_values": unique_values.get(column), @@ -124,6 +124,7 @@ def generate_profile(df: pd.DataFrame) -> list: } for column in df.columns ] + logger.info("Finished profiling calculation per column") return metadata_per_column except Exception as e: