Skip to content

Commit

Permalink
Fix: remove byte profiling (#863)
Browse files Browse the repository at this point in the history
* feat: skip any byte column for profiling, adds number of rows

* feat: skip any byte column for profiling, adds number of columns
  • Loading branch information
nazarfil authored Dec 3, 2024
1 parent 168dd66 commit f3a56db
Showing 1 changed file with 5 additions and 4 deletions.
9 changes: 5 additions & 4 deletions hexa/datasets/queue.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,26 +96,26 @@ def generate_sample(


def generate_profile(df: pd.DataFrame) -> list:
logger.info("Calculating profiling per column")
logger.info("Starting profiling calculation per column")
try:
if "geometry" in df.columns:
logger.warning("Skipping the 'geometry' column from profiling.")
df = df.drop(columns=["geometry"])
for col in df.select_dtypes(include=["object"]).columns:
try:
df[col] = df[col].astype("string")
except Exception as e:
logger.warning(f"Failed to convert column '{col}' to string: {e}")
df.drop(columns=[col], inplace=True)

data_types = df.dtypes.apply(str).to_dict()
missing_values = df.isnull().sum().to_dict()
unique_values = df.nunique().to_dict()
distinct_values = df.apply(lambda x: x.nunique(dropna=False)).to_dict()
constant_values = df.apply(lambda x: x.nunique() == 1).astype("bool").to_dict()
count = df.count().to_dict()

metadata_per_column = [
{
"column_name": column,
"count": count.get(column),
"data_type": data_types.get(column),
"missing_values": missing_values.get(column),
"unique_values": unique_values.get(column),
Expand All @@ -124,6 +124,7 @@ def generate_profile(df: pd.DataFrame) -> list:
}
for column in df.columns
]
logger.info("Finished profiling calculation per column")
return metadata_per_column

except Exception as e:
Expand Down

0 comments on commit f3a56db

Please sign in to comment.