You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
DataDescriber does not handle bool dtypes in the source dataset well. When the CSV file has columns with only TRUE and FALSE as values, pandas reads such columns as bool dtype (not object) and, when inferring types, the code ends up checking them as dates and fails.
What I Did
The source dataset is the telco-customer-churn dataset from Kaggle, after being imported into Google BigQuery and exported back to CSV, which produced those TRUE and FALSE values instead of Yes and No. Below is my code:
from DataSynthesizer.DataDescriber import DataDescriber
from DataSynthesizer.DataGenerator import DataGenerator
from DataSynthesizer.ModelInspector import ModelInspector
from DataSynthesizer.lib.utils import read_json_file, display_bayesian_network
import pandas as pd
# Reproduction script: build a DataSynthesizer dataset description in
# correlated attribute mode and save it to a JSON file.

# Input dataset. This CSV file has columns with only TRUE and FALSE values,
# which pandas reads as bool dtype.
input_data = "./out/from_bq.csv"
mode = 'correlated_attribute_mode'

# Locations of the two output files.
description_file = f'./out/{mode}/description.json'
synthetic_data = f'./out/{mode}/synthetic_data.csv'

# An attribute is categorical if its domain size is less than this threshold.
threshold_value = 20

# List of discrete columns and the primary key.
categorical_columns = [
    "gender",
    "Partner",
    "Dependents",
    "PhoneService",
    "MultipleLines",
    "InternetService",
    "OnlineSecurity",
    "OnlineBackup",
    "DeviceProtection",
    "TechSupport",
    "StreamingTV",
    "StreamingMovies",
    "Contract",
    "PaperlessBilling",
    "PaymentMethod",
    "TotalCharges",
    "Churn",
]
primary_key_column = "customerID"

# Specify categorical attributes: every listed column is flagged True.
categorical_attributes = {column: True for column in categorical_columns}

# Specify which attributes are candidate keys of the input dataset.
candidate_keys = {primary_key_column: True}

# A parameter in Differential Privacy. It roughly means that removing a row in
# the input dataset will not change the probability of getting the same output
# more than a multiplicative difference of exp(epsilon).
# Increase epsilon value to reduce the injected noises. Set epsilon=0 to turn
# off differential privacy.
epsilon = 0

# The maximum number of parents in Bayesian network, i.e., the maximum number
# of incoming edges.
degree_of_bayesian_network = 2

# Number of tuples generated in synthetic dataset.
num_tuples_to_generate = 1000

# Build the Bayesian network. The dataset path is taken from `input_data`
# rather than repeated as a second literal, so the two cannot drift apart.
describer = DataDescriber(category_threshold=threshold_value)
describer.describe_dataset_in_correlated_attribute_mode(
    dataset_file=input_data,
    epsilon=epsilon,
    k=degree_of_bayesian_network,
    attribute_to_is_categorical=categorical_attributes,
    attribute_to_is_candidate_key=candidate_keys,
)

# Save the output.
describer.save_dataset_description_to_file(description_file)
Here is the output:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
/tmp/ipykernel_4364/1321006366.py in <module>
46 k=degree_of_bayesian_network,
47 attribute_to_is_categorical=categorical_attributes,
---> 48 attribute_to_is_candidate_key=candidate_keys)
49
50 # save the output
/opt/conda/lib/python3.7/site-packages/DataSynthesizer/DataDescriber.py in describe_dataset_in_correlated_attribute_mode(self, dataset_file, k, epsilon, attribute_to_datatype, attribute_to_is_categorical, attribute_to_is_candidate_key, categorical_attribute_domain_file, numerical_attribute_ranges, seed)
170 categorical_attribute_domain_file,
171 numerical_attribute_ranges,
--> 172 seed)
173 self.df_encoded = self.encode_dataset_into_binning_indices()
174 if self.df_encoded.shape[1] < 2:
/opt/conda/lib/python3.7/site-packages/DataSynthesizer/DataDescriber.py in describe_dataset_in_independent_attribute_mode(self, dataset_file, epsilon, attribute_to_datatype, attribute_to_is_categorical, attribute_to_is_candidate_key, categorical_attribute_domain_file, numerical_attribute_ranges, seed)
118 categorical_attribute_domain_file,
119 numerical_attribute_ranges,
--> 120 seed=seed)
121
122 for column in self.attr_to_column.values():
/opt/conda/lib/python3.7/site-packages/DataSynthesizer/DataDescriber.py in describe_dataset_in_random_mode(self, dataset_file, attribute_to_datatype, attribute_to_is_categorical, attribute_to_is_candidate_key, categorical_attribute_domain_file, numerical_attribute_ranges, seed)
85 self.attr_to_is_candidate_key = attribute_to_is_candidate_key
86 self.read_dataset_from_csv(dataset_file)
---> 87 self.infer_attribute_data_types()
88 self.analyze_dataset_meta()
89 self.represent_input_dataset_by_columns()
/opt/conda/lib/python3.7/site-packages/DataSynthesizer/DataDescriber.py in infer_attribute_data_types(self)
213 # Sample 20 values to test its data_type.
214 samples = column_dropna.sample(20, replace=True)
--> 215 if all(samples.map(is_datetime)):
216 self.attr_to_datatype[attr] = DataType.DATETIME
217 else:
/opt/conda/lib/python3.7/site-packages/pandas/core/series.py in map(self, arg, na_action)
3980 dtype: object
3981 """
-> 3982 new_values = super()._map_values(arg, na_action=na_action)
3983 return self._constructor(new_values, index=self.index).__finalize__(
3984 self, method="map"
/opt/conda/lib/python3.7/site-packages/pandas/core/base.py in _map_values(self, mapper, na_action)
1158
1159 # mapper is a function
-> 1160 new_values = map_f(values, mapper)
1161
1162 return new_values
pandas/_libs/lib.pyx in pandas._libs.lib.map_infer()
/opt/conda/lib/python3.7/site-packages/DataSynthesizer/datatypes/DateTimeAttribute.py in is_datetime(value)
19 'dec', 'december'}
20
---> 21 value_lower = value.lower()
22 if (value_lower in weekdays) or (value_lower in months):
23 return False
AttributeError: 'bool' object has no attribute 'lower'
The text was updated successfully, but these errors were encountered:
Hi, @simone-mangiante, I could not replicate this error with either DataSynthesizer 0.1.10 or 0.1.11. I tested your script with the telco-customer-churn dataset from Kaggle, and Python 3.7, on Pop!_OS 22.04.
Description
`DataDescriber` does not handle `bool` dtypes in the source dataset well. When the CSV file has columns with only `TRUE` and `FALSE` as values, `pandas` reads such columns as `bool` dtype (not `object`) and, when inferring types, the code ends up checking them as dates and fails.
What I Did
The source dataset is the telco-customer-churn dataset from Kaggle, after being imported into Google BigQuery and exported back to CSV, which produced those `TRUE` and `FALSE` values instead of `Yes` and `No`. Below is my code:
Here is the output:
The text was updated successfully, but these errors were encountered: