diff --git a/ml_garden/core/steps/calculate_features.py b/ml_garden/core/steps/calculate_features.py
index 8575d61..c434cc1 100644
--- a/ml_garden/core/steps/calculate_features.py
+++ b/ml_garden/core/steps/calculate_features.py
@@ -65,7 +65,7 @@ def __init__(
             f" features: {list(self.feature_extractors.keys())}"
         )
 
-    def _convert_column_to_datetime(self, df: pd.DataFrame, column: str) -> pd.DataFrame:
+    def _convert_column_to_datetime(self, df: pd.DataFrame, column: str, log: bool) -> pd.DataFrame:
         """Convert a column to datetime.
         Parameters
         ----------
@@ -73,6 +73,8 @@ def _convert_column_to_datetime(self, df: pd.DataFrame, column: str) -> pd.DataF
             The DataFrame containing the column to convert
         column : str
             The name of the column to convert
+        log: bool
+            If True, logs information.
         Returns
         -------
         pd.DataFrame
@@ -85,14 +87,15 @@ def _convert_column_to_datetime(self, df: pd.DataFrame, column: str) -> pd.DataF
                 df[column],
                 errors="raise",
             )
-            self.logger.info(f"Column '{column}' automatically converted to datetime.")
+            if log:
+                self.logger.info(f"Column '{column}' automatically converted to datetime.")
         except ValueError as e:
             self.logger.error(f"Error converting column '{column}' to datetime: {e}")
         except Exception as e:
             self.logger.error(f"Unexpected error converting column '{column}' to datetime: {e}")
         else:
-            self.logger.debug(f"Column '{column}' is already a datetime type.")
-
+            if log:
+                self.logger.debug(f"Column '{column}' is already a datetime type.")
         return df
 
     def _extract_feature(self, df: pd.DataFrame, column: str, feature: str) -> None:
@@ -122,6 +125,14 @@ def _extract_feature(self, df: pd.DataFrame, column: str, feature: str) -> None:
             )
             raise ValueError(error_message)
 
+    def _drop_datetime_columns(self, df: pd.DataFrame, log: bool) -> pd.DataFrame:
+        """Drop the datetime columns from the `df`."""
+        if self.datetime_columns:
+            if log:
+                self.logger.info(f"Dropping original datetime columns: {self.datetime_columns}")
+            return df.drop(columns=self.datetime_columns)
+        return df
+
     def execute(self, data: DataContainer) -> DataContainer:
         """Execute the step.
         Parameters
@@ -135,21 +146,18 @@ def execute(self, data: DataContainer) -> DataContainer:
         """
         self.logger.info("Calculating features")
 
-        if not data.is_train:
-            data.flow = self._create_datetime_features(data.flow, log=True)
+        datasets = [
+            ("X_prediction", data.X_prediction, True),
+            ("X_train", data.X_train, True),
+            ("X_validation", data.X_validation, False),
+            ("X_test", data.X_test, False),
+        ]
 
-        if data.train is not None:
-            data.train = self._create_datetime_features(data.train, log=True)
-
-        if data.validation is not None:
-            data.validation = self._create_datetime_features(data.validation)
-
-        if data.test is not None:
-            data.test = self._create_datetime_features(data.test)
-
-        ## add datetime columns to ignore columns for training
-        if self.datetime_columns:
-            data.columns_to_ignore_for_training.extend(self.datetime_columns)
+        for attr_name, dataset, should_log in datasets:
+            if dataset is not None:
+                dataset = self._create_datetime_features(dataset, log=should_log)
+                dataset = self._drop_datetime_columns(dataset, log=should_log)
+                setattr(data, attr_name, dataset)
 
         return data
 
@@ -173,7 +181,7 @@ def _create_datetime_features(
         if self.datetime_columns:
             for column in self.datetime_columns:
                 if column in df.columns:
-                    df = self._convert_column_to_datetime(df, column)
+                    df = self._convert_column_to_datetime(df, column, log)
 
         if self.features:
             for feature in self.features:
@@ -191,4 +199,7 @@ def _create_datetime_features(
         if log:
             self.logger.warning("No datetime columns specified. Skipping feature extraction.")
 
+        if log:
+            self.logger.info(f"Created new features: {self.features}")
+
         return df
diff --git a/ml_garden/core/steps/encode.py b/ml_garden/core/steps/encode.py
index 1832c97..c17c6ee 100644
--- a/ml_garden/core/steps/encode.py
+++ b/ml_garden/core/steps/encode.py
@@ -52,11 +52,9 @@ def execute(self, data: DataContainer) -> DataContainer:
         self.logger.info("Encoding data")
 
         if not data.is_train:
-            categorical_features, numeric_features = self._get_feature_types(
-                data.flow.drop(columns=data.columns_to_ignore_for_training)
-            )
+            categorical_features, numeric_features = self._get_feature_types(data.X_prediction)
             data.X_prediction, _, _ = self._apply_encoding(
-                X=data.flow,
+                X=data.X_prediction,
                 y=None,
                 categorical_features=categorical_features,
                 numeric_features=numeric_features,
@@ -66,8 +64,6 @@ def execute(self, data: DataContainer) -> DataContainer:
             return data
 
         categorical_features, numeric_features = self._get_feature_types(data.X_train)
-        self.logger.info(f"New categorical features: {categorical_features}")
-        self.logger.info(f"New numeric features: {numeric_features}")
 
         data.X_train, data.y_train, data._encoder = self._apply_encoding(
             X=data.X_train,
diff --git a/ml_garden/core/steps/fit_model.py b/ml_garden/core/steps/fit_model.py
index 883c131..f57de08 100644
--- a/ml_garden/core/steps/fit_model.py
+++ b/ml_garden/core/steps/fit_model.py
@@ -298,7 +298,6 @@ def predict(self, data: DataContainer) -> DataContainer:
             The updated data container
         """
         self.logger.info(f"Predicting with {self.model_class.__name__} model")
-        data.X_prediction = data.flow.drop(columns=data.columns_to_ignore_for_training)
         data.flow[data.prediction_column] = data.model.predict(data.X_prediction)
         data.predictions = data.flow[data.prediction_column]
         return data
diff --git a/ml_garden/core/steps/tabular_split.py b/ml_garden/core/steps/tabular_split.py
index 2e7786d..316fea5 100644
--- a/ml_garden/core/steps/tabular_split.py
+++ b/ml_garden/core/steps/tabular_split.py
@@ -13,7 +13,7 @@
 class TabularSplitStep(PipelineStep):
     """Split the data."""
 
-    used_for_prediction = False
+    used_for_prediction = True
     used_for_training = True
 
     def __init__(
@@ -129,6 +129,13 @@ def execute(self, data: DataContainer) -> DataContainer:
         Where df is the DataFrame used as input to the SplitStep
         """
 
+        if not data.is_train:
+            data.X_prediction = data.flow
+            if data.columns_to_ignore_for_training:
+                data.X_prediction = data.X_prediction.drop(
+                    columns=data.columns_to_ignore_for_training
+                )
+            return data
         self.logger.info("Splitting tabular data...")
 
         df = data.flow
diff --git a/tests/core/steps/test_calculate_features.py b/tests/core/steps/test_calculate_features.py
index 3abf661..958fb6b 100644
--- a/tests/core/steps/test_calculate_features.py
+++ b/tests/core/steps/test_calculate_features.py
@@ -62,7 +62,7 @@ def input_data() -> pd.DataFrame:
 def data(input_data: pd.DataFrame) -> DataContainer:
     data = DataContainer({"is_train": True})
     data.columns_to_ignore_for_training = []
-    data.train = input_data
+    data.X_train = input_data
     return data
 
 
@@ -72,7 +72,7 @@ def test_skipping_with_no_parameters(data: DataContainer):
     result = calculate_features_step.execute(data)
 
     assert isinstance(result, DataContainer)
-    assert result.train.equals(data.train)
+    assert result.X_train.equals(data.X_train)
 
 
 def test_feature_names(data: DataContainer):
@@ -87,22 +87,22 @@ def test_feature_names(data: DataContainer):
     result = calculate_features_step.execute(data)
 
     assert isinstance(result, DataContainer)
-    assert "creation_date_year" in result.train.columns
-    assert "creation_date_month" in result.train.columns
-    assert "creation_date_day" in result.train.columns
-    assert "creation_date_hour" in result.train.columns
-    assert "creation_date_minute" in result.train.columns
-    assert "creation_date_second" in result.train.columns
-    assert "creation_date_weekday" in result.train.columns
-    assert "creation_date_dayofyear" in result.train.columns
-    assert "deletion_date_year" in result.train.columns
-    assert "deletion_date_month" in result.train.columns
-    assert "deletion_date_day" in result.train.columns
-    assert "deletion_date_hour" in result.train.columns
-    assert "deletion_date_minute" in result.train.columns
-    assert "deletion_date_second" in result.train.columns
-    assert "deletion_date_weekday" in result.train.columns
-    assert "deletion_date_dayofyear" in result.train.columns
+    assert "creation_date_year" in result.X_train.columns
+    assert "creation_date_month" in result.X_train.columns
+    assert "creation_date_day" in result.X_train.columns
+    assert "creation_date_hour" in result.X_train.columns
+    assert "creation_date_minute" in result.X_train.columns
+    assert "creation_date_second" in result.X_train.columns
+    assert "creation_date_weekday" in result.X_train.columns
+    assert "creation_date_dayofyear" in result.X_train.columns
+    assert "deletion_date_year" in result.X_train.columns
+    assert "deletion_date_month" in result.X_train.columns
+    assert "deletion_date_day" in result.X_train.columns
+    assert "deletion_date_hour" in result.X_train.columns
+    assert "deletion_date_minute" in result.X_train.columns
+    assert "deletion_date_second" in result.X_train.columns
+    assert "deletion_date_weekday" in result.X_train.columns
+    assert "deletion_date_dayofyear" in result.X_train.columns
 
 
 def test_date_columns_are_ignored_for_training(data: DataContainer):
@@ -117,8 +117,8 @@ def test_date_columns_are_ignored_for_training(data: DataContainer):
     result = calculate_features_step.execute(data)
 
     assert isinstance(result, DataContainer)
-    assert "creation_date" in result.columns_to_ignore_for_training
-    assert "deletion_date" in result.columns_to_ignore_for_training
+    assert "creation_date" not in result.X_train.columns
+    assert "deletion_date" not in result.X_train.columns
 
 
 def test_output_dtypes(data: DataContainer):
@@ -133,14 +133,14 @@ def test_output_dtypes(data: DataContainer):
     result = calculate_features_step.execute(data)
 
     assert isinstance(result, DataContainer)
-    assert result.train["creation_date_year"].dtype == np.dtype("uint16")
-    assert result.train["creation_date_month"].dtype == np.dtype("uint8")
-    assert result.train["creation_date_day"].dtype == np.dtype("uint8")
-    assert result.train["creation_date_hour"].dtype == np.dtype("uint8")
-    assert result.train["creation_date_minute"].dtype == np.dtype("uint8")
-    assert result.train["creation_date_second"].dtype == np.dtype("uint8")
-    assert result.train["creation_date_weekday"].dtype == np.dtype("uint8")
-    assert result.train["creation_date_dayofyear"].dtype == np.dtype("uint16")
+    assert result.X_train["creation_date_year"].dtype == np.dtype("uint16")
+    assert result.X_train["creation_date_month"].dtype == np.dtype("uint8")
+    assert result.X_train["creation_date_day"].dtype == np.dtype("uint8")
+    assert result.X_train["creation_date_hour"].dtype == np.dtype("uint8")
+    assert result.X_train["creation_date_minute"].dtype == np.dtype("uint8")
+    assert result.X_train["creation_date_second"].dtype == np.dtype("uint8")
+    assert result.X_train["creation_date_weekday"].dtype == np.dtype("uint8")
+    assert result.X_train["creation_date_dayofyear"].dtype == np.dtype("uint16")
 
 
 def test_output_values(data: DataContainer):
@@ -155,28 +155,28 @@ def test_output_values(data: DataContainer):
     result = calculate_features_step.execute(data)
 
     assert isinstance(result, DataContainer)
-    assert result.train["creation_date_year"].equals(
+    assert result.X_train["creation_date_year"].equals(
         pd.Series([2023, 2023, 2023, 2023, 2023, 2023, 2024, 2024], dtype="uint16")
     )
-    assert result.train["creation_date_month"].equals(
+    assert result.X_train["creation_date_month"].equals(
         pd.Series([1, 1, 1, 1, 1, 11, 2, 3], dtype="uint8")
     )
-    assert result.train["creation_date_day"].equals(
+    assert result.X_train["creation_date_day"].equals(
         pd.Series([1, 2, 3, 4, 5, 1, 28, 28], dtype="uint8")
    )
-    assert result.train["creation_date_hour"].equals(
+    assert result.X_train["creation_date_hour"].equals(
         pd.Series([0, 0, 0, 0, 0, 0, 0, 0], dtype="uint8")
     )
-    assert result.train["creation_date_minute"].equals(
+    assert result.X_train["creation_date_minute"].equals(
         pd.Series([0, 0, 0, 0, 0, 0, 0, 0], dtype="uint8")
     )
-    assert result.train["creation_date_second"].equals(
+    assert result.X_train["creation_date_second"].equals(
         pd.Series([0, 0, 0, 0, 0, 0, 0, 0], dtype="uint8")
     )
-    assert result.train["creation_date_weekday"].equals(
+    assert result.X_train["creation_date_weekday"].equals(
         pd.Series([6, 0, 1, 2, 3, 2, 2, 3], dtype="uint8")
     )
-    assert result.train["creation_date_dayofyear"].equals(
+    assert result.X_train["creation_date_dayofyear"].equals(
         pd.Series([1, 2, 3, 4, 5, 305, 59, 88], dtype="uint16")
     )
 
@@ -214,7 +214,7 @@ def test_init_with_unsupported_features():
 
 def test_execute_with_prediction(data: DataContainer):
     data.is_train = False
-    data.flow = data.train.copy()
+    data.X_prediction = data.X_train.copy()
 
     datetime_columns = ["creation_date"]
     features = ["year", "month", "day"]
@@ -226,6 +226,6 @@ def test_execute_with_prediction(data: DataContainer):
     result = calculate_features_step.execute(data)
 
     assert isinstance(result, DataContainer)
-    assert "creation_date_year" in result.flow.columns
-    assert "creation_date_month" in result.flow.columns
-    assert "creation_date_day" in result.flow.columns
+    assert "creation_date_year" in result.X_prediction.columns
+    assert "creation_date_month" in result.X_prediction.columns
+    assert "creation_date_day" in result.X_prediction.columns