Merge remote-tracking branch 'origin/main' into main

supsi-dacd-isaac · Sep 9, 2024 · d614986 · d614986
2 parents e616fdd + e1dfe95
commit d614986
Show file tree

Hide file tree

Showing 5 changed files with 86 additions and 13 deletions.
diff --git a/pyforecaster/forecaster.py b/pyforecaster/forecaster.py
@@ -6,11 +6,42 @@
 from abc import abstractmethod
 from lightgbm import LGBMRegressor, Dataset, train
 from sklearn.linear_model import RidgeCV, LinearRegression
+from sklearn.preprocessing import LabelEncoder
 from pyforecaster.scenarios_generator import ScenGen
 from pyforecaster.utilities import get_logger
 from inspect import signature
 
 
+def encode_categorical(func):
+    def wrapper(self, x: pd.DataFrame, *args, **kwargs):
+        # Initialize a protected dictionary to store encoders if it doesn't exist yet
+        if not hasattr(self, '_le'):
+            self._le = {}
+
+        # Check if x contains columns that are not numbers and encode them
+        for column in x.select_dtypes(include=['object', 'category']).columns:
+            if column not in self._le:
+                # Create and fit a new encoder for the column if it's the first encounter
+                le = LabelEncoder()
+                x[column] = le.fit_transform(x[column].astype(str))
+                self._le[column] = le  # Store the encoder for future use
+            else:
+                # Use the existing encoder to transform the data
+                le = self._le[column]
+                # Check for unseen categories
+                unique_values = set(x[column].astype(str))
+                unseen_values = unique_values - set(le.classes_)
+                if unseen_values:
+                    raise ValueError(f"Unseen categories {unseen_values} encountered in column '{column}'.")
+                x[column] = le.transform(x[column].astype(str))
+
+        # Call the original function with preprocessed data
+        return func(self, x, *args, **kwargs)
+
+    return wrapper
+
+
+
 class ScenarioGenerator(object):
     def __init__(self, q_vect=None, nodes_at_step=None, val_ratio=None, logger=None, n_scen_fit=100,
                  additional_node=False, formatter=None, conditional_to_hour=True, **scengen_kwgs):
@@ -35,7 +66,6 @@ def online_tree_reduction(self, value):
     def set_params(self, **kwargs):
         [self.__setattr__(k, v) for k, v in kwargs.items() if k in self.__dict__.keys()]
 
-    @abstractmethod
     def get_params(self, **kwargs):
         return {k: getattr(self, k) for k in signature(self.__class__).parameters.keys() if k in self.__dict__.keys()}
 
@@ -65,9 +95,8 @@ def predict(self, x, **kwargs):
         pass
 
     def anti_transform(self, x, y_hat):
-        if self.formatter is not None:
-            if self.formatter.target_transformers is not None:
-                y_hat = self.formatter.denormalize(x, y_hat)
+        if self.formatter is not None and self.formatter.denormalizing_fun is not None:
+            y_hat = self.formatter.denormalize(x, y_hat)
         return y_hat
 
     @abstractmethod
@@ -147,19 +176,24 @@ def __init__(self, q_vect=None, val_ratio=None, nodes_at_step=None, kind='linear
         self.m = None
         self.kind = kind
 
+    @encode_categorical
     def fit(self, x:pd.DataFrame, y:pd.DataFrame):
         x, y, x_val, y_val = self.train_val_split(x, y)
         if self.kind == 'linear':
             self.m = LinearRegression().fit(x, y)
         elif self.kind == 'ridge':
             self.m = RidgeCV(alphas=10 ** np.linspace(-2, 8, 9)).fit(x, y)
+        else:
+            raise ValueError('kind must be either linear or ridge')
         super().fit(x_val, y_val)
         return self
 
+    @encode_categorical
     def predict(self, x:pd.DataFrame, **kwargs):
         y_hat = pd.DataFrame(self.m.predict(x), index=x.index, columns=self.target_cols)
         y_hat = self.anti_transform(x, y_hat)
         return y_hat
+
     def predict_quantiles(self, x:pd.DataFrame, **kwargs):
         preds = np.expand_dims(self.predict(x), -1) * np.ones((1, 1, len(self.q_vect)))
         for h in np.unique(x.index.hour):
@@ -169,7 +203,7 @@ def predict_quantiles(self, x:pd.DataFrame, **kwargs):
 
 class LGBForecaster(ScenarioGenerator):
     def __init__(self, device_type='cpu', max_depth=20, n_estimators=100, num_leaves=100, learning_rate=0.1, min_child_samples=20,
-                 n_jobs=4, objective='regression', verbose=-1, metric='l2', colsample_bytree=1, colsample_bynode=1, q_vect=None, val_ratio=None, nodes_at_step=None, **scengen_kwgs):
+                 n_jobs=4, n_jobs_predict=0, objective='regression', verbose=-1, metric='l2', colsample_bytree=1, colsample_bynode=1, q_vect=None, val_ratio=None, nodes_at_step=None, **scengen_kwgs):
         super().__init__(q_vect, val_ratio=val_ratio, nodes_at_step=nodes_at_step, **scengen_kwgs)
         self.m = []
         self.device_type = device_type
@@ -184,6 +218,7 @@ def __init__(self, device_type='cpu', max_depth=20, n_estimators=100, num_leaves
         self.colsample_bytree=colsample_bytree
         self.colsample_bynode = colsample_bynode
         self.n_jobs = n_jobs
+        self.n_jobs_predict = n_jobs_predict
         self.lgb_pars = {"device_type": self.device_type,
                     "objective": self.objective,
                     "max_depth": self.max_depth,
@@ -210,7 +245,7 @@ def fit(self, x, y):
     def predict(self, x, **kwargs):
         preds = []
         for m in self.m:
-            preds.append(m.predict(x).reshape(-1, 1))
+            preds.append(m.predict(x, num_threads=self.n_jobs_predict).reshape(-1, 1))
         y_hat = pd.DataFrame(np.hstack(preds), index=x.index, columns=self.target_cols)
         y_hat = self.anti_transform(x, y_hat)
         return y_hat

diff --git a/pyforecaster/forecasting_models/gradientboosters.py b/pyforecaster/forecasting_models/gradientboosters.py
@@ -145,8 +145,13 @@ def predict_single(self, i, x):
         return self.multi_step_model.predict(x,num_threads=1).reshape(-1, 1)
 
     def predict_parallel(self, x):
-        with concurrent.futures.ProcessPoolExecutor(max_workers=2) as executor:
-            y_hat = list(executor.map(partial(self.predict_single, x=x), np.arange(self.n_multistep)))
+        if self.parallel:
+            with concurrent.futures.ProcessPoolExecutor(max_workers=2) as executor:
+                y_hat = list(executor.map(partial(self.predict_single, x=x), np.arange(self.n_multistep)))
+        else:
+            y_hat = []
+            for i in range(self.n_multistep):
+                y_hat.append(self.predict_single(i, x))
         return np.hstack(y_hat)
 
     @staticmethod

diff --git a/pyforecaster/forecasting_models/randomforests.py b/pyforecaster/forecasting_models/randomforests.py
@@ -2,7 +2,7 @@
 import pandas as pd
 from pyforecaster.utilities import get_logger
 from tqdm import tqdm
-from pyforecaster.forecaster import ScenarioGenerator
+from pyforecaster.forecaster import ScenarioGenerator, encode_categorical
 import numpy as np
 import concurrent.futures
 from time import time
@@ -94,6 +94,7 @@ def _fit(self, i, x, y):
         model = RandomForestQuantileRegressor(**self.qrf_pars).fit(x_i, y.iloc[:, i], sparse_pickle=True)
         return model
 
+    @encode_categorical
     def fit(self, x, y):
         x, y, x_val, y_val = self.train_val_split(x, y)
         if self.parallel:
@@ -135,6 +136,7 @@ def fit(self, x, y):
         super().fit(x_val, y_val)
         return self
 
+    @encode_categorical
     def predict(self, x, **kwargs):
         preds = []
         period = kwargs['period'] if 'period' in kwargs else '24h'

diff --git a/tests/test_boosters.py b/tests/test_boosters.py
@@ -32,8 +32,19 @@ def test_linear_val_split(self):
         y_hat_lin = m_lin.predict(x_te)
         q = m_lin.predict_quantiles(x_te)
 
-        m_lgbhybrid = LGBMHybrid(red_frac_multistep=0.1,  val_ratio=0.3, lgb_pars={'num_leaves': 300, 'n_estimators': 10, 'learning_rate':0.05},
-                                 n_single=10, parallel=True, formatter=formatter, metadata_features=['minuteofday', 'utc_offset', 'dayofweek', 'hour'],tol_period='1h', keep_last_seconds=3600).fit(x_tr, y_tr)
+        m_lgbhybrid = LGBMHybrid(red_frac_multistep=0.1, val_ratio=0.3,
+                                 lgb_pars={'num_leaves': 300, 'n_estimators': 10, 'learning_rate': 0.05},
+                                 n_single=10, parallel=False, formatter=formatter,
+                                 metadata_features=['minuteofday', 'utc_offset', 'dayofweek', 'hour'], tol_period='1h',
+                                 keep_last_seconds=3600).fit(x_tr, y_tr)
+        y_hat_lgbh = m_lgbhybrid.predict(x_te)
+        q = m_lgbhybrid.predict_quantiles(x_te)
+
+        m_lgbhybrid = LGBMHybrid(red_frac_multistep=0.1, val_ratio=0.3,
+                                 lgb_pars={'num_leaves': 300, 'n_estimators': 10, 'learning_rate': 0.05},
+                                 n_single=10, parallel=True, formatter=formatter,
+                                 metadata_features=['minuteofday', 'utc_offset', 'dayofweek', 'hour'], tol_period='1h',
+                                 keep_last_seconds=3600).fit(x_tr, y_tr)
         y_hat_lgbh = m_lgbhybrid.predict(x_te)
         q = m_lgbhybrid.predict_quantiles(x_te)
 

diff --git a/tests/test_models.py b/tests/test_models.py
@@ -83,6 +83,26 @@ def test_fast_linreg(self):
         y_hat.iloc[:, s_a].plot()
         (y_hat_fast.iloc[:, s_a]).plot()
 
+    def test_linreg_with_categorical_features(self):
+
+        formatter = Formatter(logger=self.logger).add_transform(['all'], lags=np.arange(24), relative_lags=True)
+        formatter.add_transform(['all'], ['min', 'max'], agg_bins=[1, 2, 15, 20])
+        formatter.add_target_transform(['all'], lags=-np.arange(6))
+        x, y = formatter.transform(self.data.iloc[:1000])
+        x['cat'] = np.random.choice(['cat', 'dog'], len(x))
+        x.columns = x.columns.astype(str)
+        y.columns = y.columns.astype(str)
+        n_tr = int(len(x) * 0.8)
+        x_tr, x_te, y_tr, y_te = [x.iloc[:n_tr, :].copy(), x.iloc[n_tr:, :].copy(), y.iloc[:n_tr].copy(),
+                                  y.iloc[n_tr:].copy()]
+
+        m_lin = LinearForecaster(val_ratio=0.2, fit_intercept=False, normalize=False).fit(x_tr, y_tr)
+        y_hat = m_lin.predict(x_te)
+
+        s_a = 5
+        y_te.iloc[:, s_a].plot()
+        y_hat.iloc[:, s_a].plot()
+
     def test_hw_difficult(self):
 
         n_tr = int(len(self.x) * 0.5)
@@ -199,7 +219,7 @@ def test_antinormalize(self):
         formatter.add_target_normalizer(['all'], 'mean', agg_freq='3d', name='a_movingavg')
         formatter.add_target_normalizer(['all'], 'std', agg_freq='3d', name='a_movingstd')
 
-        x, y = formatter.transform(self.data.iloc[:10000])
+        x, y = formatter.transform(self.data)
 
         n_tr = int(len(x) * 0.9)
         x_tr, x_te, y_tr, y_te = [x.iloc[:n_tr, :].copy(), x.iloc[n_tr:, :].copy(), y.iloc[:n_tr].copy(),
@@ -217,7 +237,7 @@ def test_antinormalize(self):
 
         formatter.add_normalizing_fun(expr="(df[t] - df['a_movingavg']) / (df['a_movingstd'] + 1)",
                                       inv_expr="df[t]*(df['a_movingstd']+1) + df['a_movingavg']")
-        x, y_norm = formatter.transform(self.data.iloc[:10000])
+        x, y_norm = formatter.transform(self.data)
         y = formatter.denormalize(x, y_norm)
 
         x_tr, x_te, y_tr = [x.iloc[:n_tr, :].copy(), x.iloc[n_tr:, :].copy(), y_norm.iloc[:n_tr].copy()]