diff --git a/ml_garden/core/data_container.py b/ml_garden/core/data_container.py index e7c9809..7b4e409 100644 --- a/ml_garden/core/data_container.py +++ b/ml_garden/core/data_container.py @@ -6,7 +6,7 @@ import logging import os import sys -from typing import Any, Optional, Union +from typing import Any, Dict, Optional, Union import dill as pickle import pandas as pd @@ -675,6 +675,32 @@ def predictions(self, value: pd.Series): """ self["predictions"] = value + @property + def predict_proba(self) -> pd.DataFrame: + """ + Get the prediction probabilities from the DataContainer. + + Returns + ------- + pd.DataFrame + The prediction probabilities stored in the DataContainer. + For binary and multiclass classification, returns a DataFrame with a column for each class. + """ + return self["predict_proba"] + + @predict_proba.setter + def predict_proba(self, value: pd.DataFrame): + """ + Set the prediction probabilities in the DataContainer. + + Parameters + ---------- + value : pd.DataFrame + The prediction probabilities to be stored in the DataContainer. + Should be a DataFrame with a column for each class. + """ + self["predict_proba"] = value + + @property def explainer(self) -> BaseExplainer: """ diff --git a/ml_garden/core/model.py b/ml_garden/core/model.py index 7513b97..c36c353 100644 --- a/ml_garden/core/model.py +++ b/ml_garden/core/model.py @@ -24,3 +24,24 @@ def fit( @abstractmethod def predict(self, X: pd.DataFrame) -> pd.Series: """Abstract method for making predictions.""" + + def predict_proba(self, X: pd.DataFrame) -> pd.DataFrame: + """ + Predict class probabilities with the trained model. + + Parameters + ---------- + X : pd.DataFrame + Features to make probability predictions on. + + Returns + ------- + pd.DataFrame + Predicted class probabilities for the input features. + + Raises + ------ + NotImplementedError + If the concrete model class does not override this method.
+ """ + raise NotImplementedError(f"{type(self).__name__} does not implement predict_proba.") diff --git a/ml_garden/core/steps/fit_model.py b/ml_garden/core/steps/fit_model.py index f57de08..9437c4b 100644 --- a/ml_garden/core/steps/fit_model.py +++ b/ml_garden/core/steps/fit_model.py @@ -4,6 +4,7 @@ import numpy as np import optuna +import pandas as pd from sklearn.metrics import ( accuracy_score, f1_score, @@ -300,4 +301,14 @@ def predict(self, data: DataContainer) -> DataContainer: self.logger.info(f"Predicting with {self.model_class.__name__} model") data.flow[data.prediction_column] = data.model.predict(data.X_prediction) data.predictions = data.flow[data.prediction_column] + + # If the task is classification, also get the prediction probabilities + if data.task == Task.CLASSIFICATION: + proba_df = data.model.predict_proba(data.X_prediction) + proba_df.columns = [f"proba_{col}" for col in proba_df.columns] + + # Concatenate the probabilities DataFrame with the existing DataFrame + data.flow = pd.concat([data.flow, proba_df], axis=1) + data.predict_proba = proba_df + return data diff --git a/ml_garden/implementation/tabular/autogluon/model.py b/ml_garden/implementation/tabular/autogluon/model.py index 3164acd..88da3a7 100644 --- a/ml_garden/implementation/tabular/autogluon/model.py +++ b/ml_garden/implementation/tabular/autogluon/model.py @@ -33,4 +33,10 @@ def fit(self, X: pd.DataFrame, y: pd.Series, eval_set=None, verbose=True) -> Non def predict(self, X: pd.DataFrame) -> pd.Series: predictions = self.model.predict(X) - return predictions + return pd.Series(predictions, index=X.index) + + def predict_proba(self, X: pd.DataFrame) -> pd.DataFrame: + if self.model.problem_type == "regression": + raise ValueError("predict_proba is not available for regression tasks.") + probabilities = self.model.predict_proba(X) + return pd.DataFrame(probabilities, index=X.index) diff --git a/ml_garden/implementation/tabular/xgboost/model.py b/ml_garden/implementation/tabular/xgboost/model.py index f757183..65f1010 100644 ---
a/ml_garden/implementation/tabular/xgboost/model.py +++ b/ml_garden/implementation/tabular/xgboost/model.py @@ -1,4 +1,5 @@ import logging +from typing import List import pandas as pd import xgboost as xgb @@ -75,4 +76,6 @@ def predict_proba(self, X: pd.DataFrame) -> pd.DataFrame: pd.DataFrame Predicted class probabilities for the input features. """ - return self.model.predict_proba(X) + proba = self.model.predict_proba(X) + # Reuse X's index so each probability row stays aligned with its input row. + return pd.DataFrame(proba, columns=self.model.classes_, index=X.index)