Skip to content

Commit

Permalink
automatically encode categorical features in models not supporting th…
Browse files Browse the repository at this point in the history
…em (so far added only to LinearModel)
  • Loading branch information
vascomedici committed Aug 27, 2024
1 parent d006c14 commit c037095
Show file tree
Hide file tree
Showing 2 changed files with 56 additions and 3 deletions.
35 changes: 34 additions & 1 deletion pyforecaster/forecaster.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,42 @@
from abc import abstractmethod
from lightgbm import LGBMRegressor, Dataset, train
from sklearn.linear_model import RidgeCV, LinearRegression
from sklearn.preprocessing import LabelEncoder
from pyforecaster.scenarios_generator import ScenGen
from pyforecaster.utilities import get_logger
from inspect import signature


def encode_categorical(func):
    """Decorator that label-encodes non-numeric columns of ``x`` before calling ``func``.

    Meant for methods shaped like ``method(self, x, *args, **kwargs)`` where
    ``x`` is a :class:`pandas.DataFrame`. Every column of dtype ``object`` or
    ``category`` is cast to ``str`` and replaced by the integer codes of a
    per-column ``LabelEncoder``. Encoders live on the instance in the
    ``self._le`` dictionary (created on first use): the first time a column is
    met its encoder is fitted, afterwards the stored encoder is reused.

    The caller's DataFrame is left untouched: encoding is applied to a copy,
    which is what gets forwarded to ``func``.

    :param func: the method to wrap.
    :return: the wrapped method, carrying the name and docstring of ``func``.
    :raises ValueError: if a column that already has a stored encoder contains
        values that were not present when that encoder was fitted.
    """
    # Local import so the decorator does not depend on a file-level import.
    from functools import wraps

    @wraps(func)
    def wrapper(self, x: pd.DataFrame, *args, **kwargs):
        # Lazily create the per-instance store of fitted encoders.
        if not hasattr(self, '_le'):
            self._le = {}

        cat_columns = x.select_dtypes(include=['object', 'category']).columns
        if len(cat_columns) > 0:
            # Work on a copy so the caller's data is not silently overwritten
            # with integer codes.
            x = x.copy()

        for column in cat_columns:
            values = x[column].astype(str)
            le = self._le.get(column)
            if le is None:
                # First encounter: fit a new encoder and remember it.
                le = LabelEncoder()
                x[column] = le.fit_transform(values)
                self._le[column] = le
            else:
                # Reuse the stored encoder; fail loudly on unknown categories
                # with a message that names the offending column.
                unseen_values = set(values) - set(le.classes_)
                if unseen_values:
                    raise ValueError(f"Unseen categories {unseen_values} encountered in column '{column}'.")
                x[column] = le.transform(values)

        # Call the original function with the preprocessed data.
        return func(self, x, *args, **kwargs)

    return wrapper



class ScenarioGenerator(object):
def __init__(self, q_vect=None, nodes_at_step=None, val_ratio=None, logger=None, n_scen_fit=100,
additional_node=False, formatter=None, **scengen_kwgs):
Expand All @@ -34,7 +65,6 @@ def online_tree_reduction(self, value):
def set_params(self, **kwargs):
    """Update existing instance attributes from keyword arguments.

    Only keys that already exist in the instance ``__dict__`` are assigned;
    any other keyword is silently ignored, so no new attribute is created.

    :param kwargs: attribute name / new value pairs.
    """
    for name, value in kwargs.items():
        if name in self.__dict__:
            setattr(self, name, value)

def get_params(self, **kwargs):
    """Return the constructor parameters currently stored on the instance.

    The names are taken from the signature of the instance's class; a name is
    included only if an attribute with that name exists in the instance
    ``__dict__``. Values are read with ``getattr``.

    This method has a concrete implementation, so it is not marked
    ``@abstractmethod``.

    :param kwargs: accepted and ignored.
    :return: dict mapping parameter name to its current value, in signature order.
    """
    init_params = signature(self.__class__).parameters
    stored = self.__dict__
    return {k: getattr(self, k) for k in init_params if k in stored}

Expand Down Expand Up @@ -127,6 +157,7 @@ def __init__(self, q_vect=None, val_ratio=None, nodes_at_step=None, kind='linear
self.m = None
self.kind = kind

@encode_categorical
def fit(self, x:pd.DataFrame, y:pd.DataFrame):
x, y, x_val, y_val = self.train_val_split(x, y)
if self.kind == 'linear':
Expand All @@ -138,10 +169,12 @@ def fit(self, x:pd.DataFrame, y:pd.DataFrame):
super().fit(x_val, y_val)
return self

@encode_categorical
def predict(self, x:pd.DataFrame, **kwargs):
    """Predict the targets for the rows of ``x``.

    The estimator stored in ``self.m`` produces the raw predictions, which are
    wrapped in a DataFrame indexed like ``x`` with ``self.target_cols`` as
    columns and then passed through ``self.anti_transform``.

    :param x: feature DataFrame.
    :param kwargs: accepted but not used here.
    :return: whatever ``self.anti_transform`` returns for the prediction frame.
    """
    raw_predictions = self.m.predict(x)
    predictions = pd.DataFrame(raw_predictions, index=x.index, columns=self.target_cols)
    return self.anti_transform(x, predictions)

def predict_quantiles(self, x:pd.DataFrame, **kwargs):
preds = np.expand_dims(self.predict(x), -1) * np.ones((1, 1, len(self.q_vect)))
for h in np.unique(x.index.hour):
Expand Down
24 changes: 22 additions & 2 deletions tests/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,26 @@ def test_fast_linreg(self):
y_hat.iloc[:, s_a].plot()
(y_hat_fast.iloc[:, s_a]).plot()

def test_linreg_with_categorical_features(self):
    """Smoke test: fit and predict a LinearForecaster on data with a string column.

    A random 'cat'/'dog' column named 'cat' is appended to the features; the
    first 80% of the rows are used for fitting and the rest for prediction.
    One target column of the held-out truth and of the prediction is plotted.
    """
    # NOTE(review): this test makes no assertions; it only checks that
    # fit/predict run without raising.
    formatter = Formatter(logger=self.logger).add_transform(['all'], lags=np.arange(24), relative_lags=True)
    formatter.add_transform(['all'], ['min', 'max'], agg_bins=[1, 2, 15, 20])
    formatter.add_target_transform(['all'], lags=-np.arange(6))

    x, y = formatter.transform(self.data.iloc[:1000])
    x['cat'] = np.random.choice(['cat', 'dog'], len(x))
    x.columns = x.columns.astype(str)
    y.columns = y.columns.astype(str)

    # Chronological 80/20 split; copies keep the slices independent of x and y.
    split_at = int(len(x) * 0.8)
    x_tr = x.iloc[:split_at, :].copy()
    x_te = x.iloc[split_at:, :].copy()
    y_tr = y.iloc[:split_at].copy()
    y_te = y.iloc[split_at:].copy()

    m_lin = LinearForecaster(val_ratio=0.2, fit_intercept=False, normalize=False)
    m_lin = m_lin.fit(x_tr, y_tr)
    y_hat = m_lin.predict(x_te)

    target_position = 5
    y_te.iloc[:, target_position].plot()
    y_hat.iloc[:, target_position].plot()

def test_hw_difficult(self):

n_tr = int(len(self.x) * 0.5)
Expand Down Expand Up @@ -213,7 +233,7 @@ def test_antinormalize(self):
formatter.add_target_normalizer(['all'], 'mean', agg_freq='3d', name='a_movingavg')
formatter.add_target_normalizer(['all'], 'std', agg_freq='3d', name='a_movingstd')

x, y = formatter.transform(self.data.iloc[:10000])
x, y = formatter.transform(self.data)

n_tr = int(len(x) * 0.9)
x_tr, x_te, y_tr, y_te = [x.iloc[:n_tr, :].copy(), x.iloc[n_tr:, :].copy(), y.iloc[:n_tr].copy(),
Expand All @@ -231,7 +251,7 @@ def test_antinormalize(self):

formatter.add_normalizing_fun(expr="(df[t] - df['a_movingavg']) / (df['a_movingstd'] + 1)",
inv_expr="df[t]*(df['a_movingstd']+1) + df['a_movingavg']")
x, y_norm = formatter.transform(self.data.iloc[:10000])
x, y_norm = formatter.transform(self.data)
y = formatter.denormalize(x, y_norm)

x_tr, x_te, y_tr = [x.iloc[:n_tr, :].copy(), x.iloc[n_tr:, :].copy(), y_norm.iloc[:n_tr].copy()]
Expand Down

0 comments on commit c037095

Please sign in to comment.