Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Autogluon #33

Open
wants to merge 11 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ __pycache__/
*.json
*.pkl

# Autogluon
AutogluonModels/

# ignore mlflow folder
mlruns/
Expand Down
6 changes: 4 additions & 2 deletions examples/delay/configs/base.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,16 @@
"target": "Delay",
"task": "classification",
"columns_to_ignore_for_training": [
"FlightID"
"FlightID",
"FlightNumber"
]
},
"steps": [
{
"step_type": "GenerateStep",
"parameters": {
"train_path": "examples/delay/data/train.csv"
"train_path": "examples/delay/data/train.csv",
"predict_path": "examples/delay/data/test.csv"
}
},
{
Expand Down
11 changes: 11 additions & 0 deletions examples/delay/data/test.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
FlightID,Airline,FlightNumber,Origin,Destination,ScheduledDeparture,ScheduledArrival,Distance,Weather
101,NK,101,FLL,LGA,2024-07-10 07:00:00,2024-07-10 10:00:00,1076,Clear
102,B6,202,BOS,SFO,2024-07-10 08:30:00,2024-07-10 12:30:00,2704,Fog
103,AS,303,SEA,ANC,2024-07-10 10:00:00,2024-07-10 12:30:00,1448,Rain
104,F9,404,LAS,MCO,2024-07-10 11:30:00,2024-07-10 19:00:00,2035,Clear
105,NK,505,ORD,MSY,2024-07-10 13:00:00,2024-07-10 15:30:00,837,Thunderstorm
106,B6,606,JFK,AUS,2024-07-10 14:30:00,2024-07-10 17:30:00,1521,Clear
107,AS,707,PDX,HNL,2024-07-10 16:00:00,2024-07-10 19:30:00,2603,Windy
108,F9,808,DEN,SAN,2024-07-10 17:30:00,2024-07-10 19:00:00,853,Clear
109,NK,909,DTW,PHX,2024-07-10 19:00:00,2024-07-10 20:30:00,1671,Hail
110,B6,1010,BOS,MCO,2024-07-10 20:30:00,2024-07-10 23:30:00,1121,Clear
4 changes: 1 addition & 3 deletions ml_garden/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
from .core.pipeline import Pipeline

# Register all built-in pipeline steps and tabular model implementations at
# import time so user configs can refer to them by name.
Pipeline.step_registry.auto_register_steps_from_package("ml_garden.core.steps")
Pipeline.model_registry.auto_register_models_from_package("ml_garden.implementation.tabular")
25 changes: 25 additions & 0 deletions ml_garden/core/data_container.py
Original file line number Diff line number Diff line change
Expand Up @@ -675,6 +675,31 @@ def predictions(self, value: pd.Series):
"""
self["predictions"] = value

@property
def predict_proba(self) -> pd.DataFrame:
    """
    Prediction probabilities held by this DataContainer.

    Returns
    -------
    pd.DataFrame
        The probabilities stored under the "predict_proba" key, one
        column per class.
    """
    return self["predict_proba"]

@predict_proba.setter
def predict_proba(self, value: pd.DataFrame):
    """
    Store prediction probabilities in this DataContainer.

    Parameters
    ----------
    value : pd.DataFrame
        Class-probability frame to keep under the "predict_proba" key.
        Should contain one column per class.
    """
    self["predict_proba"] = value

@property
def explainer(self) -> BaseExplainer:
"""
Expand Down
16 changes: 16 additions & 0 deletions ml_garden/core/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,19 @@ def fit(
@abstractmethod
def predict(self, X: pd.DataFrame) -> pd.Series:
"""Abstract method for making predictions."""

def predict_proba(self, X: pd.DataFrame) -> pd.DataFrame:
    """
    Predict class probabilities with the trained model.

    Parameters
    ----------
    X : pd.DataFrame
        Features to make probability predictions on.

    Returns
    -------
    pd.DataFrame
        Predicted class probabilities for the input features.

    Raises
    ------
    NotImplementedError
        If the concrete model class does not override this method.
    """
    # Fail loudly instead of silently returning None (the implicit result
    # of the previous `pass` body), which violated the documented contract
    # and would surface only as an obscure error downstream.
    raise NotImplementedError(
        f"{type(self).__name__} does not implement predict_proba"
    )
78 changes: 72 additions & 6 deletions ml_garden/core/model_registry.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,71 @@
import importlib
import logging
import pkgutil
from typing import Dict, Type

from ml_garden.core.model import Model


class ModelClassNotFoundError(Exception):
    """Raised when a requested model class is missing from the registry."""


class ModelRegistry:
def __init__(self):
    """
    Initialize a new ModelRegistry instance.

    Attributes
    ----------
    _model_registry : Dict[str, Type[Model]]
        Maps lower-cased model class names to model classes.
    logger : logging.Logger
        Logger for the class.
    """
    # NOTE(review): the diff rendering left a stale, untyped duplicate of
    # this assignment above the docstring; only the typed one is kept.
    self._model_registry: Dict[str, Type[Model]] = {}
    self.logger = logging.getLogger(__name__)

def register_model(self, model_class: Type[Model]) -> None:
    """
    Register a model class in the registry.

    Parameters
    ----------
    model_class : Type[Model]
        The model class to be registered; stored under its lower-cased
        class name so lookups are case-insensitive.

    Raises
    ------
    ValueError
        If the model_class is not a subclass of Model.
    """
    # Validate before touching the registry so an invalid class is never
    # stored (the prior version computed the key before validating).
    if not issubclass(model_class, Model):
        raise ValueError(f"{model_class} must be a subclass of Model")
    self._model_registry[model_class.__name__.lower()] = model_class

def get_model_class(self, model_name: str) -> type:
def get_model_class(self, model_name: str) -> Type[Model]:
"""
Retrieve a model class from the registry.

Parameters
----------
model_name : str
The name of the model class to retrieve.

Returns
-------
Type[Model]
The model class.

Raises
------
ModelClassNotFoundError
If the model class is not found in the registry.
"""
model_name = model_name.lower()
if model_name in self._model_registry:
return self._model_registry[model_name]
else:
Expand All @@ -29,10 +74,31 @@ def get_model_class(self, model_name: str) -> type:
f" {list(self._model_registry.keys())}"
)

def get_all_model_classes(self) -> Dict[str, Type[Model]]:
    """
    Get all registered model classes.

    Returns
    -------
    Dict[str, Type[Model]]
        Mapping of lower-cased model names to model classes.
        NOTE(review): this is the live internal dict, not a copy —
        caller mutations affect the registry.
    """
    return self._model_registry

def auto_register_models_from_package(self, package_name: str):
def auto_register_models_from_package(self, package_name: str) -> None:
"""
Automatically register all model classes from a given package.

Parameters
----------
package_name : str
The name of the package to search for model classes.

Raises
------
ImportError
If the package cannot be imported.
"""
try:
package = importlib.import_module(package_name)
prefix = package.__name__ + "."
Expand Down
49 changes: 30 additions & 19 deletions ml_garden/core/steps/calculate_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,14 +65,16 @@ def __init__(
f" features: {list(self.feature_extractors.keys())}"
)

def _convert_column_to_datetime(self, df: pd.DataFrame, column: str) -> pd.DataFrame:
def _convert_column_to_datetime(self, df: pd.DataFrame, column: str, log: bool) -> pd.DataFrame:
"""Convert a column to datetime.
Parameters
----------
df : pd.DataFrame
The DataFrame containing the column to convert
column : str
The name of the column to convert
log: bool
If True, logs information.
Returns
-------
pd.DataFrame
Expand All @@ -85,14 +87,15 @@ def _convert_column_to_datetime(self, df: pd.DataFrame, column: str) -> pd.DataF
df[column],
errors="raise",
)
self.logger.info(f"Column '{column}' automatically converted to datetime.")
if log:
self.logger.info(f"Column '{column}' automatically converted to datetime.")
except ValueError as e:
self.logger.error(f"Error converting column '{column}' to datetime: {e}")
except Exception as e:
self.logger.error(f"Unexpected error converting column '{column}' to datetime: {e}")
else:
self.logger.debug(f"Column '{column}' is already a datetime type.")

if log:
self.logger.debug(f"Column '{column}' is already a datetime type.")
return df

def _extract_feature(self, df: pd.DataFrame, column: str, feature: str) -> None:
Expand Down Expand Up @@ -122,6 +125,14 @@ def _extract_feature(self, df: pd.DataFrame, column: str, feature: str) -> None:
)
raise ValueError(error_message)

def _drop_datetime_columns(self, df: pd.DataFrame, log: bool) -> pd.DataFrame:
"""Drop the datetime columns from the `df`."""
if self.datetime_columns:
if log:
self.logger.info(f"Dropping original datetime columns: {self.datetime_columns}")
return df.drop(columns=self.datetime_columns)
return df

def execute(self, data: DataContainer) -> DataContainer:
    """Execute the step.

    Creates the configured datetime-derived features on every available
    dataset, then drops the original datetime source columns.

    Parameters
    ----------
    data : DataContainer
        The data container holding the datasets.

    Returns
    -------
    DataContainer
        The updated data container.
    """
    self.logger.info("Calculating features")

    # The boolean flag enables logging only for the first datasets so the
    # same messages are not repeated for the validation/test splits.
    # NOTE(review): the diff rendering interleaved the removed
    # data.flow/data.train branches here; only the datasets loop is kept.
    datasets = [
        ("X_prediction", data.X_prediction, True),
        ("X_train", data.X_train, True),
        ("X_validation", data.X_validation, False),
        ("X_test", data.X_test, False),
    ]

    for attr_name, dataset, should_log in datasets:
        if dataset is not None:
            dataset = self._create_datetime_features(dataset, log=should_log)
            dataset = self._drop_datetime_columns(dataset, log=should_log)
            setattr(data, attr_name, dataset)

    return data

Expand All @@ -173,7 +181,7 @@ def _create_datetime_features(
if self.datetime_columns:
for column in self.datetime_columns:
if column in df.columns:
df = self._convert_column_to_datetime(df, column)
df = self._convert_column_to_datetime(df, column, log)

if self.features:
for feature in self.features:
Expand All @@ -191,4 +199,7 @@ def _create_datetime_features(
if log:
self.logger.warning("No datetime columns specified. Skipping feature extraction.")

if log:
self.logger.info(f"Created new features: {self.features}")

return df
Loading
Loading