Skip to content

Commit

Permalink
fix tests, calculate features and X_prediction
Browse files Browse the repository at this point in the history
  • Loading branch information
diegomarvid committed Jul 17, 2024
1 parent 38a68bf commit e8b4acf
Show file tree
Hide file tree
Showing 5 changed files with 80 additions and 67 deletions.
49 changes: 30 additions & 19 deletions ml_garden/core/steps/calculate_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,14 +65,16 @@ def __init__(
f" features: {list(self.feature_extractors.keys())}"
)

def _convert_column_to_datetime(self, df: pd.DataFrame, column: str) -> pd.DataFrame:
def _convert_column_to_datetime(self, df: pd.DataFrame, column: str, log: bool) -> pd.DataFrame:
"""Convert a column to datetime.
Parameters
----------
df : pd.DataFrame
The DataFrame containing the column to convert
column : str
The name of the column to convert
log: bool
If True, logs information.
Returns
-------
pd.DataFrame
Expand All @@ -85,14 +87,15 @@ def _convert_column_to_datetime(self, df: pd.DataFrame, column: str) -> pd.DataF
df[column],
errors="raise",
)
self.logger.info(f"Column '{column}' automatically converted to datetime.")
if log:
self.logger.info(f"Column '{column}' automatically converted to datetime.")
except ValueError as e:
self.logger.error(f"Error converting column '{column}' to datetime: {e}")
except Exception as e:
self.logger.error(f"Unexpected error converting column '{column}' to datetime: {e}")
else:
self.logger.debug(f"Column '{column}' is already a datetime type.")

if log:
self.logger.debug(f"Column '{column}' is already a datetime type.")
return df

def _extract_feature(self, df: pd.DataFrame, column: str, feature: str) -> None:
Expand Down Expand Up @@ -122,6 +125,14 @@ def _extract_feature(self, df: pd.DataFrame, column: str, feature: str) -> None:
)
raise ValueError(error_message)

def _drop_datetime_columns(self, df: pd.DataFrame, log: bool) -> pd.DataFrame:
    """Drop the original datetime columns from `df`.

    Parameters
    ----------
    df : pd.DataFrame
        The DataFrame to remove the configured datetime columns from
    log : bool
        If True, logs which columns are being dropped.

    Returns
    -------
    pd.DataFrame
        A new DataFrame without the datetime columns, or `df` itself when there is
        nothing to drop.
    """
    if not self.datetime_columns:
        return df

    # Feature creation skips configured columns that are missing from a given dataset
    # (`if column in df.columns`); mirror that here so dropping never raises KeyError
    # for a column that was never present in this particular frame.
    present_columns = [column for column in self.datetime_columns if column in df.columns]
    if not present_columns:
        return df

    if log:
        self.logger.info(f"Dropping original datetime columns: {present_columns}")
    return df.drop(columns=present_columns)

def execute(self, data: DataContainer) -> DataContainer:
"""Execute the step.
Parameters
Expand All @@ -135,21 +146,18 @@ def execute(self, data: DataContainer) -> DataContainer:
"""
self.logger.info("Calculating features")

if not data.is_train:
data.flow = self._create_datetime_features(data.flow, log=True)
datasets = [
("X_prediction", data.X_prediction, True),
("X_train", data.X_train, True),
("X_validation", data.X_validation, False),
("X_test", data.X_test, False),
]

if data.train is not None:
data.train = self._create_datetime_features(data.train, log=True)

if data.validation is not None:
data.validation = self._create_datetime_features(data.validation)

if data.test is not None:
data.test = self._create_datetime_features(data.test)

## add datetime columns to ignore columns for training
if self.datetime_columns:
data.columns_to_ignore_for_training.extend(self.datetime_columns)
for attr_name, dataset, should_log in datasets:
if dataset is not None:
dataset = self._create_datetime_features(dataset, log=should_log)
dataset = self._drop_datetime_columns(dataset, log=should_log)
setattr(data, attr_name, dataset)

return data

Expand All @@ -173,7 +181,7 @@ def _create_datetime_features(
if self.datetime_columns:
for column in self.datetime_columns:
if column in df.columns:
df = self._convert_column_to_datetime(df, column)
df = self._convert_column_to_datetime(df, column, log)

if self.features:
for feature in self.features:
Expand All @@ -191,4 +199,7 @@ def _create_datetime_features(
if log:
self.logger.warning("No datetime columns specified. Skipping feature extraction.")

if log:
self.logger.info(f"Created new features: {self.features}")

return df
8 changes: 2 additions & 6 deletions ml_garden/core/steps/encode.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,11 +52,9 @@ def execute(self, data: DataContainer) -> DataContainer:
self.logger.info("Encoding data")

if not data.is_train:
categorical_features, numeric_features = self._get_feature_types(
data.flow.drop(columns=data.columns_to_ignore_for_training)
)
categorical_features, numeric_features = self._get_feature_types(data.X_prediction)
data.X_prediction, _, _ = self._apply_encoding(
X=data.flow,
X=data.X_prediction,
y=None,
categorical_features=categorical_features,
numeric_features=numeric_features,
Expand All @@ -66,8 +64,6 @@ def execute(self, data: DataContainer) -> DataContainer:
return data

categorical_features, numeric_features = self._get_feature_types(data.X_train)
self.logger.info(f"New categorical features: {categorical_features}")
self.logger.info(f"New numeric features: {numeric_features}")

data.X_train, data.y_train, data._encoder = self._apply_encoding(
X=data.X_train,
Expand Down
1 change: 0 additions & 1 deletion ml_garden/core/steps/fit_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -298,7 +298,6 @@ def predict(self, data: DataContainer) -> DataContainer:
The updated data container
"""
self.logger.info(f"Predicting with {self.model_class.__name__} model")
data.X_prediction = data.flow.drop(columns=data.columns_to_ignore_for_training)
data.flow[data.prediction_column] = data.model.predict(data.X_prediction)
data.predictions = data.flow[data.prediction_column]
return data
9 changes: 8 additions & 1 deletion ml_garden/core/steps/tabular_split.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
class TabularSplitStep(PipelineStep):
"""Split the data."""

used_for_prediction = False
used_for_prediction = True
used_for_training = True

def __init__(
Expand Down Expand Up @@ -129,6 +129,13 @@ def execute(self, data: DataContainer) -> DataContainer:
Where df is the DataFrame used as input to the SplitStep
"""
if not data.is_train:
data.X_prediction = data.flow
if data.columns_to_ignore_for_training:
data.X_prediction = data.X_prediction.drop(
columns=data.columns_to_ignore_for_training
)
return data

self.logger.info("Splitting tabular data...")
df = data.flow
Expand Down
80 changes: 40 additions & 40 deletions tests/core/steps/test_calculate_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def input_data() -> pd.DataFrame:
def data(input_data: pd.DataFrame) -> DataContainer:
data = DataContainer({"is_train": True})
data.columns_to_ignore_for_training = []
data.train = input_data
data.X_train = input_data
return data


Expand All @@ -72,7 +72,7 @@ def test_skipping_with_no_parameters(data: DataContainer):
result = calculate_features_step.execute(data)

assert isinstance(result, DataContainer)
assert result.train.equals(data.train)
assert result.X_train.equals(data.X_train)


def test_feature_names(data: DataContainer):
Expand All @@ -87,22 +87,22 @@ def test_feature_names(data: DataContainer):
result = calculate_features_step.execute(data)

assert isinstance(result, DataContainer)
assert "creation_date_year" in result.train.columns
assert "creation_date_month" in result.train.columns
assert "creation_date_day" in result.train.columns
assert "creation_date_hour" in result.train.columns
assert "creation_date_minute" in result.train.columns
assert "creation_date_second" in result.train.columns
assert "creation_date_weekday" in result.train.columns
assert "creation_date_dayofyear" in result.train.columns
assert "deletion_date_year" in result.train.columns
assert "deletion_date_month" in result.train.columns
assert "deletion_date_day" in result.train.columns
assert "deletion_date_hour" in result.train.columns
assert "deletion_date_minute" in result.train.columns
assert "deletion_date_second" in result.train.columns
assert "deletion_date_weekday" in result.train.columns
assert "deletion_date_dayofyear" in result.train.columns
assert "creation_date_year" in result.X_train.columns
assert "creation_date_month" in result.X_train.columns
assert "creation_date_day" in result.X_train.columns
assert "creation_date_hour" in result.X_train.columns
assert "creation_date_minute" in result.X_train.columns
assert "creation_date_second" in result.X_train.columns
assert "creation_date_weekday" in result.X_train.columns
assert "creation_date_dayofyear" in result.X_train.columns
assert "deletion_date_year" in result.X_train.columns
assert "deletion_date_month" in result.X_train.columns
assert "deletion_date_day" in result.X_train.columns
assert "deletion_date_hour" in result.X_train.columns
assert "deletion_date_minute" in result.X_train.columns
assert "deletion_date_second" in result.X_train.columns
assert "deletion_date_weekday" in result.X_train.columns
assert "deletion_date_dayofyear" in result.X_train.columns


def test_date_columns_are_ignored_for_training(data: DataContainer):
Expand All @@ -117,8 +117,8 @@ def test_date_columns_are_ignored_for_training(data: DataContainer):
result = calculate_features_step.execute(data)

assert isinstance(result, DataContainer)
assert "creation_date" in result.columns_to_ignore_for_training
assert "deletion_date" in result.columns_to_ignore_for_training
assert "creation_date" not in result.X_train.columns
assert "deletion_date" not in result.X_train.columns


def test_output_dtypes(data: DataContainer):
Expand All @@ -133,14 +133,14 @@ def test_output_dtypes(data: DataContainer):
result = calculate_features_step.execute(data)

assert isinstance(result, DataContainer)
assert result.train["creation_date_year"].dtype == np.dtype("uint16")
assert result.train["creation_date_month"].dtype == np.dtype("uint8")
assert result.train["creation_date_day"].dtype == np.dtype("uint8")
assert result.train["creation_date_hour"].dtype == np.dtype("uint8")
assert result.train["creation_date_minute"].dtype == np.dtype("uint8")
assert result.train["creation_date_second"].dtype == np.dtype("uint8")
assert result.train["creation_date_weekday"].dtype == np.dtype("uint8")
assert result.train["creation_date_dayofyear"].dtype == np.dtype("uint16")
assert result.X_train["creation_date_year"].dtype == np.dtype("uint16")
assert result.X_train["creation_date_month"].dtype == np.dtype("uint8")
assert result.X_train["creation_date_day"].dtype == np.dtype("uint8")
assert result.X_train["creation_date_hour"].dtype == np.dtype("uint8")
assert result.X_train["creation_date_minute"].dtype == np.dtype("uint8")
assert result.X_train["creation_date_second"].dtype == np.dtype("uint8")
assert result.X_train["creation_date_weekday"].dtype == np.dtype("uint8")
assert result.X_train["creation_date_dayofyear"].dtype == np.dtype("uint16")


def test_output_values(data: DataContainer):
Expand All @@ -155,28 +155,28 @@ def test_output_values(data: DataContainer):
result = calculate_features_step.execute(data)

assert isinstance(result, DataContainer)
assert result.train["creation_date_year"].equals(
assert result.X_train["creation_date_year"].equals(
pd.Series([2023, 2023, 2023, 2023, 2023, 2023, 2024, 2024], dtype="uint16")
)
assert result.train["creation_date_month"].equals(
assert result.X_train["creation_date_month"].equals(
pd.Series([1, 1, 1, 1, 1, 11, 2, 3], dtype="uint8")
)
assert result.train["creation_date_day"].equals(
assert result.X_train["creation_date_day"].equals(
pd.Series([1, 2, 3, 4, 5, 1, 28, 28], dtype="uint8")
)
assert result.train["creation_date_hour"].equals(
assert result.X_train["creation_date_hour"].equals(
pd.Series([0, 0, 0, 0, 0, 0, 0, 0], dtype="uint8")
)
assert result.train["creation_date_minute"].equals(
assert result.X_train["creation_date_minute"].equals(
pd.Series([0, 0, 0, 0, 0, 0, 0, 0], dtype="uint8")
)
assert result.train["creation_date_second"].equals(
assert result.X_train["creation_date_second"].equals(
pd.Series([0, 0, 0, 0, 0, 0, 0, 0], dtype="uint8")
)
assert result.train["creation_date_weekday"].equals(
assert result.X_train["creation_date_weekday"].equals(
pd.Series([6, 0, 1, 2, 3, 2, 2, 3], dtype="uint8")
)
assert result.train["creation_date_dayofyear"].equals(
assert result.X_train["creation_date_dayofyear"].equals(
pd.Series([1, 2, 3, 4, 5, 305, 59, 88], dtype="uint16")
)

Expand Down Expand Up @@ -214,7 +214,7 @@ def test_init_with_unsupported_features():

def test_execute_with_prediction(data: DataContainer):
data.is_train = False
data.flow = data.train.copy()
data.X_prediction = data.X_train.copy()

datetime_columns = ["creation_date"]
features = ["year", "month", "day"]
Expand All @@ -226,6 +226,6 @@ def test_execute_with_prediction(data: DataContainer):
result = calculate_features_step.execute(data)

assert isinstance(result, DataContainer)
assert "creation_date_year" in result.flow.columns
assert "creation_date_month" in result.flow.columns
assert "creation_date_day" in result.flow.columns
assert "creation_date_year" in result.X_prediction.columns
assert "creation_date_month" in result.X_prediction.columns
assert "creation_date_day" in result.X_prediction.columns

0 comments on commit e8b4acf

Please sign in to comment.