diff --git a/pyforecaster/forecaster.py b/pyforecaster/forecaster.py
index fa1c0e4..4b4deca 100644
--- a/pyforecaster/forecaster.py
+++ b/pyforecaster/forecaster.py
@@ -6,11 +6,42 @@
 from abc import abstractmethod
 from lightgbm import LGBMRegressor, Dataset, train
 from sklearn.linear_model import RidgeCV, LinearRegression
+from sklearn.preprocessing import LabelEncoder
 from pyforecaster.scenarios_generator import ScenGen
 from pyforecaster.utilities import get_logger
 from inspect import signature
 
 
+def encode_categorical(func):
+    def wrapper(self, x: pd.DataFrame, *args, **kwargs):
+        # Initialize a protected dictionary to store encoders if it doesn't exist yet
+        if not hasattr(self, '_le'):
+            self._le = {}
+
+        # Check if x contains columns that are not numbers and encode them
+        for column in x.select_dtypes(include=['object', 'category']).columns:
+            if column not in self._le:
+                # Create and fit a new encoder for the column if it's the first encounter
+                le = LabelEncoder()
+                x[column] = le.fit_transform(x[column].astype(str))
+                self._le[column] = le  # Store the encoder for future use
+            else:
+                # Use the existing encoder to transform the data
+                le = self._le[column]
+                # Check for unseen categories
+                unique_values = set(x[column].astype(str))
+                unseen_values = unique_values - set(le.classes_)
+                if unseen_values:
+                    raise ValueError(f"Unseen categories {unseen_values} encountered in column '{column}'.")
+                x[column] = le.transform(x[column].astype(str))
+
+        # Call the original function with preprocessed data
+        return func(self, x, *args, **kwargs)
+
+    return wrapper
+
+
+
 class ScenarioGenerator(object):
     def __init__(self, q_vect=None, nodes_at_step=None, val_ratio=None, logger=None, n_scen_fit=100,
                  additional_node=False, formatter=None, **scengen_kwgs):
@@ -34,7 +65,6 @@ def online_tree_reduction(self, value):
     def set_params(self, **kwargs):
         [self.__setattr__(k, v) for k, v in kwargs.items() if k in self.__dict__.keys()]
 
-    @abstractmethod
     def get_params(self, **kwargs):
         return {k: getattr(self, k) for k in signature(self.__class__).parameters.keys() if k in self.__dict__.keys()}
 
@@ -127,6 +157,7 @@ def __init__(self, q_vect=None, val_ratio=None, nodes_at_step=None, kind='linear
         self.m = None
         self.kind = kind
 
+    @encode_categorical
     def fit(self, x:pd.DataFrame, y:pd.DataFrame):
         x, y, x_val, y_val = self.train_val_split(x, y)
         if self.kind == 'linear':
@@ -138,10 +169,12 @@ def fit(self, x:pd.DataFrame, y:pd.DataFrame):
         super().fit(x_val, y_val)
         return self
 
+    @encode_categorical
     def predict(self, x:pd.DataFrame, **kwargs):
         y_hat = pd.DataFrame(self.m.predict(x), index=x.index, columns=self.target_cols)
         y_hat = self.anti_transform(x, y_hat)
         return y_hat
+
     def predict_quantiles(self, x:pd.DataFrame, **kwargs):
         preds = np.expand_dims(self.predict(x), -1) * np.ones((1, 1, len(self.q_vect)))
         for h in np.unique(x.index.hour):
diff --git a/tests/test_models.py b/tests/test_models.py
index 9ebdc77..09b2851 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -80,6 +80,26 @@ def test_fast_linreg(self):
         y_hat.iloc[:, s_a].plot()
         (y_hat_fast.iloc[:, s_a]).plot()
 
+    def test_linreg_with_categorical_features(self):
+
+        formatter = Formatter(logger=self.logger).add_transform(['all'], lags=np.arange(24), relative_lags=True)
+        formatter.add_transform(['all'], ['min', 'max'], agg_bins=[1, 2, 15, 20])
+        formatter.add_target_transform(['all'], lags=-np.arange(6))
+        x, y = formatter.transform(self.data.iloc[:1000])
+        x['cat'] = np.random.choice(['cat', 'dog'], len(x))
+        x.columns = x.columns.astype(str)
+        y.columns = y.columns.astype(str)
+        n_tr = int(len(x) * 0.8)
+        x_tr, x_te, y_tr, y_te = [x.iloc[:n_tr, :].copy(), x.iloc[n_tr:, :].copy(), y.iloc[:n_tr].copy(),
+                                  y.iloc[n_tr:].copy()]
+
+        m_lin = LinearForecaster(val_ratio=0.2, fit_intercept=False, normalize=False).fit(x_tr, y_tr)
+        y_hat = m_lin.predict(x_te)
+
+        s_a = 5
+        y_te.iloc[:, s_a].plot()
+        y_hat.iloc[:, s_a].plot()
+
     def test_hw_difficult(self):
 
         n_tr = int(len(self.x) * 0.5)
@@ -213,7 +233,7 @@ def test_antinormalize(self):
         formatter.add_target_normalizer(['all'], 'mean', agg_freq='3d', name='a_movingavg')
         formatter.add_target_normalizer(['all'], 'std', agg_freq='3d', name='a_movingstd')
 
-        x, y = formatter.transform(self.data.iloc[:10000])
+        x, y = formatter.transform(self.data)
 
         n_tr = int(len(x) * 0.9)
         x_tr, x_te, y_tr, y_te = [x.iloc[:n_tr, :].copy(), x.iloc[n_tr:, :].copy(), y.iloc[:n_tr].copy(),
                                   y.iloc[n_tr:].copy()]
@@ -231,7 +251,7 @@
         formatter.add_normalizing_fun(expr="(df[t] - df['a_movingavg']) / (df['a_movingstd'] + 1)",
                                       inv_expr="df[t]*(df['a_movingstd']+1) + df['a_movingavg']")
 
-        x, y_norm = formatter.transform(self.data.iloc[:10000])
+        x, y_norm = formatter.transform(self.data)
         y = formatter.denormalize(x, y_norm)
 
         x_tr, x_te, y_tr = [x.iloc[:n_tr, :].copy(), x.iloc[n_tr:, :].copy(), y_norm.iloc[:n_tr].copy()]
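
A minimal sketch of what the new decorator does, assuming only the code in this diff; in the patch it wraps LinearForecaster.fit and predict, but here a hypothetical Dummy class stands in for any host object, since the wrapper only needs `self` to cache encoders:

    import pandas as pd
    from pyforecaster.forecaster import encode_categorical

    class Dummy:  # hypothetical host; the decorator only uses `self` to store self._le
        @encode_categorical
        def fit(self, x: pd.DataFrame):
            return x  # receives x with object/category columns already label-encoded

        @encode_categorical
        def predict(self, x: pd.DataFrame):
            return x  # reuses the encoders cached in self._le at fit time

    d = Dummy()
    x = pd.DataFrame({'cat': ['dog', 'cat', 'dog']})
    print(d.fit(x.copy())['cat'].tolist())      # [1, 0, 1]: LabelEncoder sorts classes alphabetically
    print(d.predict(x.copy())['cat'].tolist())  # [1, 0, 1]: same mapping as at fit time
    d.predict(pd.DataFrame({'cat': ['bird']}))  # raises ValueError: unseen category 'bird'

Note that the wrapper encodes in place (it assigns back into x), which is why the new test passes `.copy()` slices to fit and predict; callers that need the raw categorical frame afterwards should copy it first.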