Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Autogluon #33

Open
wants to merge 11 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ __pycache__/
*.json
*.pkl

# Autogluon
AutogluonModels/

# ignore mlflow folder
mlruns/
Expand Down
6 changes: 4 additions & 2 deletions examples/delay/configs/base.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,16 @@
"target": "Delay",
"task": "classification",
"columns_to_ignore_for_training": [
"FlightID"
"FlightID",
"FlightNumber"
]
},
"steps": [
{
"step_type": "GenerateStep",
"parameters": {
"train_path": "examples/delay/data/train.csv"
"train_path": "examples/delay/data/train.csv",
"predict_path": "examples/delay/data/test.csv"
}
},
{
Expand Down
11 changes: 11 additions & 0 deletions examples/delay/data/test.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
FlightID,Airline,FlightNumber,Origin,Destination,ScheduledDeparture,ScheduledArrival,Distance,Weather
101,NK,101,FLL,LGA,2024-07-10 07:00:00,2024-07-10 10:00:00,1076,Clear
102,B6,202,BOS,SFO,2024-07-10 08:30:00,2024-07-10 12:30:00,2704,Fog
103,AS,303,SEA,ANC,2024-07-10 10:00:00,2024-07-10 12:30:00,1448,Rain
104,F9,404,LAS,MCO,2024-07-10 11:30:00,2024-07-10 19:00:00,2035,Clear
105,NK,505,ORD,MSY,2024-07-10 13:00:00,2024-07-10 15:30:00,837,Thunderstorm
106,B6,606,JFK,AUS,2024-07-10 14:30:00,2024-07-10 17:30:00,1521,Clear
107,AS,707,PDX,HNL,2024-07-10 16:00:00,2024-07-10 19:30:00,2603,Windy
108,F9,808,DEN,SAN,2024-07-10 17:30:00,2024-07-10 19:00:00,853,Clear
109,NK,909,DTW,PHX,2024-07-10 19:00:00,2024-07-10 20:30:00,1671,Hail
110,B6,1010,BOS,MCO,2024-07-10 20:30:00,2024-07-10 23:30:00,1121,Clear
4 changes: 1 addition & 3 deletions ml_garden/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
from .core.pipeline import Pipeline

# Register all built-in pipeline steps and tabular model implementations at
# import time so user configs can refer to them by name.
Pipeline.step_registry.auto_register_steps_from_package("ml_garden.core.steps")
Pipeline.model_registry.auto_register_models_from_package("ml_garden.implementation.tabular")
25 changes: 25 additions & 0 deletions ml_garden/core/data_container.py
Original file line number Diff line number Diff line change
Expand Up @@ -675,6 +675,31 @@ def predictions(self, value: pd.Series):
"""
self["predictions"] = value

@property
def predict_proba(self) -> pd.DataFrame:
    """
    Prediction probabilities held by this DataContainer.

    Returns
    -------
    pd.DataFrame
        The probabilities stored under the "predict_proba" key, one
        column per class.
    """
    return self["predict_proba"]

@predict_proba.setter
def predict_proba(self, value: pd.DataFrame):
    """
    Store prediction probabilities in this DataContainer.

    Parameters
    ----------
    value : pd.DataFrame
        Class-probability frame to keep under the "predict_proba" key.
        Should contain one column per class.
    """
    self["predict_proba"] = value

@property
def explainer(self) -> BaseExplainer:
"""
Expand Down
16 changes: 16 additions & 0 deletions ml_garden/core/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,19 @@ def fit(
@abstractmethod
def predict(self, X: pd.DataFrame) -> pd.Series:
"""Abstract method for making predictions."""

def predict_proba(self, X: pd.DataFrame) -> pd.DataFrame:
    """
    Predict class probabilities with the trained model.

    Parameters
    ----------
    X : pd.DataFrame
        Features to make probability predictions on.

    Returns
    -------
    pd.DataFrame
        Predicted class probabilities for the input features.

    Raises
    ------
    NotImplementedError
        If the concrete model class does not override this method.
    """
    # Fail loudly instead of silently returning None (the implicit result
    # of the previous `pass` body), which violated the documented contract
    # and would surface only as an obscure error downstream.
    raise NotImplementedError(
        f"{type(self).__name__} does not implement predict_proba"
    )
78 changes: 72 additions & 6 deletions ml_garden/core/model_registry.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,71 @@
import importlib
import logging
import pkgutil
from typing import Dict, Type

from ml_garden.core.model import Model


class ModelClassNotFoundError(Exception):
    """Raised when a requested model class is missing from the registry."""


class ModelRegistry:
def __init__(self):
    """
    Initialize a new ModelRegistry instance.

    Attributes
    ----------
    _model_registry : Dict[str, Type[Model]]
        Maps lower-cased model class names to model classes.
    logger : logging.Logger
        Logger for the class.
    """
    # NOTE(review): the diff rendering left a stale, untyped duplicate of
    # this assignment above the docstring; only the typed one is kept.
    self._model_registry: Dict[str, Type[Model]] = {}
    self.logger = logging.getLogger(__name__)

def register_model(self, model_class: Type[Model]) -> None:
    """
    Register a model class in the registry.

    Parameters
    ----------
    model_class : Type[Model]
        The model class to be registered; stored under its lower-cased
        class name so lookups are case-insensitive.

    Raises
    ------
    ValueError
        If the model_class is not a subclass of Model.
    """
    # Validate before touching the registry so an invalid class is never
    # stored (the prior version computed the key before validating).
    if not issubclass(model_class, Model):
        raise ValueError(f"{model_class} must be a subclass of Model")
    self._model_registry[model_class.__name__.lower()] = model_class

def get_model_class(self, model_name: str) -> type:
def get_model_class(self, model_name: str) -> Type[Model]:
"""
Retrieve a model class from the registry.

Parameters
----------
model_name : str
The name of the model class to retrieve.

Returns
-------
Type[Model]
The model class.

Raises
------
ModelClassNotFoundError
If the model class is not found in the registry.
"""
model_name = model_name.lower()
if model_name in self._model_registry:
return self._model_registry[model_name]
else:
Expand All @@ -29,10 +74,31 @@ def get_model_class(self, model_name: str) -> type:
f" {list(self._model_registry.keys())}"
)

def get_all_model_classes(self) -> Dict[str, Type[Model]]:
    """
    Get all registered model classes.

    Returns
    -------
    Dict[str, Type[Model]]
        Mapping of lower-cased model names to model classes.
        NOTE(review): this is the live internal dict, not a copy —
        caller mutations affect the registry.
    """
    return self._model_registry

def auto_register_models_from_package(self, package_name: str):
def auto_register_models_from_package(self, package_name: str) -> None:
"""
Automatically register all model classes from a given package.

Parameters
----------
package_name : str
The name of the package to search for model classes.

Raises
------
ImportError
If the package cannot be imported.
"""
try:
package = importlib.import_module(package_name)
prefix = package.__name__ + "."
Expand Down
49 changes: 30 additions & 19 deletions ml_garden/core/steps/calculate_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,14 +65,16 @@ def __init__(
f" features: {list(self.feature_extractors.keys())}"
)

def _convert_column_to_datetime(self, df: pd.DataFrame, column: str) -> pd.DataFrame:
def _convert_column_to_datetime(self, df: pd.DataFrame, column: str, log: bool) -> pd.DataFrame:
"""Convert a column to datetime.
Parameters
----------
df : pd.DataFrame
The DataFrame containing the column to convert
column : str
The name of the column to convert
log: bool
If True, logs information.
Returns
-------
pd.DataFrame
Expand All @@ -85,14 +87,15 @@ def _convert_column_to_datetime(self, df: pd.DataFrame, column: str) -> pd.DataF
df[column],
errors="raise",
)
self.logger.info(f"Column '{column}' automatically converted to datetime.")
if log:
self.logger.info(f"Column '{column}' automatically converted to datetime.")
except ValueError as e:
self.logger.error(f"Error converting column '{column}' to datetime: {e}")
except Exception as e:
self.logger.error(f"Unexpected error converting column '{column}' to datetime: {e}")
else:
self.logger.debug(f"Column '{column}' is already a datetime type.")

if log:
self.logger.debug(f"Column '{column}' is already a datetime type.")
return df

def _extract_feature(self, df: pd.DataFrame, column: str, feature: str) -> None:
Expand Down Expand Up @@ -122,6 +125,14 @@ def _extract_feature(self, df: pd.DataFrame, column: str, feature: str) -> None:
)
raise ValueError(error_message)

def _drop_datetime_columns(self, df: pd.DataFrame, log: bool) -> pd.DataFrame:
"""Drop the datetime columns from the `df`."""
if self.datetime_columns:
if log:
self.logger.info(f"Dropping original datetime columns: {self.datetime_columns}")
return df.drop(columns=self.datetime_columns)
return df

def execute(self, data: DataContainer) -> DataContainer:
    """Execute the step.

    Creates the configured datetime-derived features on every available
    dataset, then drops the original datetime source columns.

    Parameters
    ----------
    data : DataContainer
        The data container holding the datasets.

    Returns
    -------
    DataContainer
        The updated data container.
    """
    self.logger.info("Calculating features")

    # The boolean flag enables logging only for the first datasets so the
    # same messages are not repeated for the validation/test splits.
    # NOTE(review): the diff rendering interleaved the removed
    # data.flow/data.train branches here; only the datasets loop is kept.
    datasets = [
        ("X_prediction", data.X_prediction, True),
        ("X_train", data.X_train, True),
        ("X_validation", data.X_validation, False),
        ("X_test", data.X_test, False),
    ]

    for attr_name, dataset, should_log in datasets:
        if dataset is not None:
            dataset = self._create_datetime_features(dataset, log=should_log)
            dataset = self._drop_datetime_columns(dataset, log=should_log)
            setattr(data, attr_name, dataset)

    return data

Expand All @@ -173,7 +181,7 @@ def _create_datetime_features(
if self.datetime_columns:
for column in self.datetime_columns:
if column in df.columns:
df = self._convert_column_to_datetime(df, column)
df = self._convert_column_to_datetime(df, column, log)

if self.features:
for feature in self.features:
Expand All @@ -191,4 +199,7 @@ def _create_datetime_features(
if log:
self.logger.warning("No datetime columns specified. Skipping feature extraction.")

if log:
self.logger.info(f"Created new features: {self.features}")

return df
Loading
Loading