automatically encode categorical features in models not supporting them (so far added only to LinearModel)
supsi-dacd-isaac · Aug 27, 2024 · c037095 · c037095
1 parent d006c14
commit c037095
Show file tree

Hide file tree

Showing 2 changed files with 56 additions and 3 deletions.
diff --git a/pyforecaster/forecaster.py b/pyforecaster/forecaster.py
@@ -6,11 +6,42 @@
 from abc import abstractmethod
 from lightgbm import LGBMRegressor, Dataset, train
 from sklearn.linear_model import RidgeCV, LinearRegression
+from sklearn.preprocessing import LabelEncoder
 from pyforecaster.scenarios_generator import ScenGen
 from pyforecaster.utilities import get_logger
 from inspect import signature
 
 
+def encode_categorical(func):
+    """Decorate a model method so that non-numeric columns of ``x`` are label-encoded first.
+
+    The wrapped method must take the feature DataFrame as its first argument after
+    ``self``. One ``LabelEncoder`` per column is kept in ``self._le``: it is fitted the
+    first time a column is encountered and reused on every later call. The caller's
+    DataFrame is never modified; encoding is done on a copy.
+
+    :param func: method with signature ``func(self, x, *args, **kwargs)``
+    :return: wrapper with the same signature that forwards the encoded ``x`` to ``func``
+    :raises ValueError: if a column with a stored encoder contains categories that were
+        not present when that encoder was fitted
+    """
+    from functools import wraps  # local import, keeps the decorator self-contained
+
+    @wraps(func)
+    def wrapper(self, x: pd.DataFrame, *args, **kwargs):
+        # Initialize a protected dictionary to store encoders if it doesn't exist yet
+        if not hasattr(self, '_le'):
+            self._le = {}
+
+        # Columns that are not numbers and therefore need encoding
+        categorical_columns = x.select_dtypes(include=['object', 'category']).columns
+        if len(categorical_columns) > 0:
+            # Work on a copy: encoding in place would overwrite the caller's DataFrame
+            x = x.copy()
+
+        for column in categorical_columns:
+            values = x[column].astype(str)
+            le = self._le.get(column)
+            if le is None:
+                # First encounter: fit a new encoder and store it for future calls
+                le = LabelEncoder()
+                x[column] = le.fit_transform(values)
+                self._le[column] = le
+            else:
+                # Reuse the stored encoder, failing loudly on categories it has never seen
+                unseen_values = set(values) - set(le.classes_)
+                if unseen_values:
+                    raise ValueError(f"Unseen categories {unseen_values} encountered in column '{column}'.")
+                x[column] = le.transform(values)
+
+        # Call the original function with preprocessed data
+        return func(self, x, *args, **kwargs)
+
+    return wrapper
+
+
+
 class ScenarioGenerator(object):
     def __init__(self, q_vect=None, nodes_at_step=None, val_ratio=None, logger=None, n_scen_fit=100,
                  additional_node=False, formatter=None, **scengen_kwgs):
@@ -34,7 +65,6 @@ def online_tree_reduction(self, value):
     def set_params(self, **kwargs):
         """Overwrite already-existing instance attributes with the given keyword values.
         Keys that are not present in ``self.__dict__`` are silently ignored, so no new
         attribute is ever created.
         :param kwargs: attribute name / new value pairs
         """
         for name, value in kwargs.items():
             # only touch attributes the instance already owns
             if name in self.__dict__:
                 setattr(self, name, value)
 
-    @abstractmethod
     def get_params(self, **kwargs):
         """Return the constructor parameters that are stored as instance attributes.
         The names come from the signature of ``self.__class__``; a name is reported only
         if it is also a key of ``self.__dict__``. ``kwargs`` is accepted but not used.
         :return: dict mapping parameter name to the current attribute value
         """
         init_arg_names = signature(self.__class__).parameters.keys()
         own_attributes = self.__dict__.keys()
         params = {}
         for name in init_arg_names:
             if name in own_attributes:
                 params[name] = getattr(self, name)
         return params
 
@@ -127,6 +157,7 @@ def __init__(self, q_vect=None, val_ratio=None, nodes_at_step=None, kind='linear
         self.m = None
         self.kind = kind
 
+    @encode_categorical
     def fit(self, x:pd.DataFrame, y:pd.DataFrame):
         x, y, x_val, y_val = self.train_val_split(x, y)
         if self.kind == 'linear':
@@ -138,10 +169,12 @@ def fit(self, x:pd.DataFrame, y:pd.DataFrame):
         super().fit(x_val, y_val)
         return self
 
+    @encode_categorical
     def predict(self, x:pd.DataFrame, **kwargs):
         """Predict with the fitted model ``self.m`` and return the result as a DataFrame.
         The raw output of ``self.m.predict(x)`` is labelled with the index of ``x`` and
         with ``self.target_cols`` as columns, then passed through ``self.anti_transform``.
         ``kwargs`` is accepted but not used.
         :param x: feature DataFrame
         :return: whatever ``self.anti_transform(x, forecast)`` returns
         """
         raw_forecast = self.m.predict(x)
         forecast = pd.DataFrame(raw_forecast, index=x.index, columns=self.target_cols)
         return self.anti_transform(x, forecast)
+
     def predict_quantiles(self, x:pd.DataFrame, **kwargs):
         preds = np.expand_dims(self.predict(x), -1) * np.ones((1, 1, len(self.q_vect)))
         for h in np.unique(x.index.hour):

diff --git a/tests/test_models.py b/tests/test_models.py
@@ -80,6 +80,26 @@ def test_fast_linreg(self):
         y_hat.iloc[:, s_a].plot()
         (y_hat_fast.iloc[:, s_a]).plot()
 
+    def test_linreg_with_categorical_features(self):
+        """Smoke test: fit and predict a LinearForecaster on data holding a string-valued column."""
+        # features: 24 relative lags plus min/max aggregations; targets: 6 steps ahead
+        lag_formatter = Formatter(logger=self.logger)
+        formatter = lag_formatter.add_transform(['all'], lags=np.arange(24), relative_lags=True)
+        formatter.add_transform(['all'], ['min', 'max'], agg_bins=[1, 2, 15, 20])
+        formatter.add_target_transform(['all'], lags=-np.arange(6))
+        x, y = formatter.transform(self.data.iloc[:1000])
+
+        # add a random two-level categorical feature made of strings
+        x['cat'] = np.random.choice(['cat', 'dog'], len(x))
+        x.columns = x.columns.astype(str)
+        y.columns = y.columns.astype(str)
+
+        # chronological 80/20 train/test split
+        split_at = int(len(x) * 0.8)
+        x_tr = x.iloc[:split_at, :].copy()
+        x_te = x.iloc[split_at:, :].copy()
+        y_tr = y.iloc[:split_at].copy()
+        y_te = y.iloc[split_at:].copy()
+
+        m_lin = LinearForecaster(val_ratio=0.2, fit_intercept=False, normalize=False)
+        m_lin = m_lin.fit(x_tr, y_tr)
+        y_hat = m_lin.predict(x_te)
+
+        # plot observed vs. predicted values for one target column
+        step = 5
+        y_te.iloc[:, step].plot()
+        y_hat.iloc[:, step].plot()
+
     def test_hw_difficult(self):
 
         n_tr = int(len(self.x) * 0.5)
@@ -213,7 +233,7 @@ def test_antinormalize(self):
         formatter.add_target_normalizer(['all'], 'mean', agg_freq='3d', name='a_movingavg')
         formatter.add_target_normalizer(['all'], 'std', agg_freq='3d', name='a_movingstd')
 
-        x, y = formatter.transform(self.data.iloc[:10000])
+        x, y = formatter.transform(self.data)
 
         n_tr = int(len(x) * 0.9)
         x_tr, x_te, y_tr, y_te = [x.iloc[:n_tr, :].copy(), x.iloc[n_tr:, :].copy(), y.iloc[:n_tr].copy(),
@@ -231,7 +251,7 @@ def test_antinormalize(self):
 
         formatter.add_normalizing_fun(expr="(df[t] - df['a_movingavg']) / (df['a_movingstd'] + 1)",
                                       inv_expr="df[t]*(df['a_movingstd']+1) + df['a_movingavg']")
-        x, y_norm = formatter.transform(self.data.iloc[:10000])
+        x, y_norm = formatter.transform(self.data)
         y = formatter.denormalize(x, y_norm)
 
         x_tr, x_te, y_tr = [x.iloc[:n_tr, :].copy(), x.iloc[n_tr:, :].copy(), y_norm.iloc[:n_tr].copy()]