Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/main' into main
Browse files Browse the repository at this point in the history
  • Loading branch information
nepslor committed Sep 9, 2024
2 parents e616fdd + e1dfe95 commit d614986
Show file tree
Hide file tree
Showing 5 changed files with 86 additions and 13 deletions.
47 changes: 41 additions & 6 deletions pyforecaster/forecaster.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,42 @@
from abc import abstractmethod
from lightgbm import LGBMRegressor, Dataset, train
from sklearn.linear_model import RidgeCV, LinearRegression
from sklearn.preprocessing import LabelEncoder
from pyforecaster.scenarios_generator import ScenGen
from pyforecaster.utilities import get_logger
from inspect import signature


def encode_categorical(func):
def wrapper(self, x: pd.DataFrame, *args, **kwargs):
# Initialize a protected dictionary to store encoders if it doesn't exist yet
if not hasattr(self, '_le'):
self._le = {}

# Check if x contains columns that are not numbers and encode them
for column in x.select_dtypes(include=['object', 'category']).columns:
if column not in self._le:
# Create and fit a new encoder for the column if it's the first encounter
le = LabelEncoder()
x[column] = le.fit_transform(x[column].astype(str))
self._le[column] = le # Store the encoder for future use
else:
# Use the existing encoder to transform the data
le = self._le[column]
# Check for unseen categories
unique_values = set(x[column].astype(str))
unseen_values = unique_values - set(le.classes_)
if unseen_values:
raise ValueError(f"Unseen categories {unseen_values} encountered in column '{column}'.")
x[column] = le.transform(x[column].astype(str))

# Call the original function with preprocessed data
return func(self, x, *args, **kwargs)

return wrapper



class ScenarioGenerator(object):
def __init__(self, q_vect=None, nodes_at_step=None, val_ratio=None, logger=None, n_scen_fit=100,
additional_node=False, formatter=None, conditional_to_hour=True, **scengen_kwgs):
Expand All @@ -35,7 +66,6 @@ def online_tree_reduction(self, value):
def set_params(self, **kwargs):
[self.__setattr__(k, v) for k, v in kwargs.items() if k in self.__dict__.keys()]

@abstractmethod
def get_params(self, **kwargs):
return {k: getattr(self, k) for k in signature(self.__class__).parameters.keys() if k in self.__dict__.keys()}

Expand Down Expand Up @@ -65,9 +95,8 @@ def predict(self, x, **kwargs):
pass

def anti_transform(self, x, y_hat):
if self.formatter is not None:
if self.formatter.target_transformers is not None:
y_hat = self.formatter.denormalize(x, y_hat)
if self.formatter is not None and self.formatter.denormalizing_fun is not None:
y_hat = self.formatter.denormalize(x, y_hat)
return y_hat

@abstractmethod
Expand Down Expand Up @@ -147,19 +176,24 @@ def __init__(self, q_vect=None, val_ratio=None, nodes_at_step=None, kind='linear
self.m = None
self.kind = kind

@encode_categorical
def fit(self, x:pd.DataFrame, y:pd.DataFrame):
x, y, x_val, y_val = self.train_val_split(x, y)
if self.kind == 'linear':
self.m = LinearRegression().fit(x, y)
elif self.kind == 'ridge':
self.m = RidgeCV(alphas=10 ** np.linspace(-2, 8, 9)).fit(x, y)
else:
raise ValueError('kind must be either linear or ridge')
super().fit(x_val, y_val)
return self

@encode_categorical
def predict(self, x:pd.DataFrame, **kwargs):
y_hat = pd.DataFrame(self.m.predict(x), index=x.index, columns=self.target_cols)
y_hat = self.anti_transform(x, y_hat)
return y_hat

def predict_quantiles(self, x:pd.DataFrame, **kwargs):
preds = np.expand_dims(self.predict(x), -1) * np.ones((1, 1, len(self.q_vect)))
for h in np.unique(x.index.hour):
Expand All @@ -169,7 +203,7 @@ def predict_quantiles(self, x:pd.DataFrame, **kwargs):

class LGBForecaster(ScenarioGenerator):
def __init__(self, device_type='cpu', max_depth=20, n_estimators=100, num_leaves=100, learning_rate=0.1, min_child_samples=20,
n_jobs=4, objective='regression', verbose=-1, metric='l2', colsample_bytree=1, colsample_bynode=1, q_vect=None, val_ratio=None, nodes_at_step=None, **scengen_kwgs):
n_jobs=4, n_jobs_predict=0, objective='regression', verbose=-1, metric='l2', colsample_bytree=1, colsample_bynode=1, q_vect=None, val_ratio=None, nodes_at_step=None, **scengen_kwgs):
super().__init__(q_vect, val_ratio=val_ratio, nodes_at_step=nodes_at_step, **scengen_kwgs)
self.m = []
self.device_type = device_type
Expand All @@ -184,6 +218,7 @@ def __init__(self, device_type='cpu', max_depth=20, n_estimators=100, num_leaves
self.colsample_bytree=colsample_bytree
self.colsample_bynode = colsample_bynode
self.n_jobs = n_jobs
self.n_jobs_predict = n_jobs_predict
self.lgb_pars = {"device_type": self.device_type,
"objective": self.objective,
"max_depth": self.max_depth,
Expand All @@ -210,7 +245,7 @@ def fit(self, x, y):
def predict(self, x, **kwargs):
preds = []
for m in self.m:
preds.append(m.predict(x).reshape(-1, 1))
preds.append(m.predict(x, num_threads=self.n_jobs_predict).reshape(-1, 1))
y_hat = pd.DataFrame(np.hstack(preds), index=x.index, columns=self.target_cols)
y_hat = self.anti_transform(x, y_hat)
return y_hat
Expand Down
9 changes: 7 additions & 2 deletions pyforecaster/forecasting_models/gradientboosters.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,8 +145,13 @@ def predict_single(self, i, x):
return self.multi_step_model.predict(x,num_threads=1).reshape(-1, 1)

def predict_parallel(self, x):
with concurrent.futures.ProcessPoolExecutor(max_workers=2) as executor:
y_hat = list(executor.map(partial(self.predict_single, x=x), np.arange(self.n_multistep)))
if self.parallel:
with concurrent.futures.ProcessPoolExecutor(max_workers=2) as executor:
y_hat = list(executor.map(partial(self.predict_single, x=x), np.arange(self.n_multistep)))
else:
y_hat = []
for i in range(self.n_multistep):
y_hat.append(self.predict_single(i, x))
return np.hstack(y_hat)

@staticmethod
Expand Down
4 changes: 3 additions & 1 deletion pyforecaster/forecasting_models/randomforests.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import pandas as pd
from pyforecaster.utilities import get_logger
from tqdm import tqdm
from pyforecaster.forecaster import ScenarioGenerator
from pyforecaster.forecaster import ScenarioGenerator, encode_categorical
import numpy as np
import concurrent.futures
from time import time
Expand Down Expand Up @@ -94,6 +94,7 @@ def _fit(self, i, x, y):
model = RandomForestQuantileRegressor(**self.qrf_pars).fit(x_i, y.iloc[:, i], sparse_pickle=True)
return model

@encode_categorical
def fit(self, x, y):
x, y, x_val, y_val = self.train_val_split(x, y)
if self.parallel:
Expand Down Expand Up @@ -135,6 +136,7 @@ def fit(self, x, y):
super().fit(x_val, y_val)
return self

@encode_categorical
def predict(self, x, **kwargs):
preds = []
period = kwargs['period'] if 'period' in kwargs else '24h'
Expand Down
15 changes: 13 additions & 2 deletions tests/test_boosters.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,19 @@ def test_linear_val_split(self):
y_hat_lin = m_lin.predict(x_te)
q = m_lin.predict_quantiles(x_te)

m_lgbhybrid = LGBMHybrid(red_frac_multistep=0.1, val_ratio=0.3, lgb_pars={'num_leaves': 300, 'n_estimators': 10, 'learning_rate':0.05},
n_single=10, parallel=True, formatter=formatter, metadata_features=['minuteofday', 'utc_offset', 'dayofweek', 'hour'],tol_period='1h', keep_last_seconds=3600).fit(x_tr, y_tr)
m_lgbhybrid = LGBMHybrid(red_frac_multistep=0.1, val_ratio=0.3,
lgb_pars={'num_leaves': 300, 'n_estimators': 10, 'learning_rate': 0.05},
n_single=10, parallel=False, formatter=formatter,
metadata_features=['minuteofday', 'utc_offset', 'dayofweek', 'hour'], tol_period='1h',
keep_last_seconds=3600).fit(x_tr, y_tr)
y_hat_lgbh = m_lgbhybrid.predict(x_te)
q = m_lgbhybrid.predict_quantiles(x_te)

m_lgbhybrid = LGBMHybrid(red_frac_multistep=0.1, val_ratio=0.3,
lgb_pars={'num_leaves': 300, 'n_estimators': 10, 'learning_rate': 0.05},
n_single=10, parallel=True, formatter=formatter,
metadata_features=['minuteofday', 'utc_offset', 'dayofweek', 'hour'], tol_period='1h',
keep_last_seconds=3600).fit(x_tr, y_tr)
y_hat_lgbh = m_lgbhybrid.predict(x_te)
q = m_lgbhybrid.predict_quantiles(x_te)

Expand Down
24 changes: 22 additions & 2 deletions tests/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,26 @@ def test_fast_linreg(self):
y_hat.iloc[:, s_a].plot()
(y_hat_fast.iloc[:, s_a]).plot()

def test_linreg_with_categorical_features(self):

formatter = Formatter(logger=self.logger).add_transform(['all'], lags=np.arange(24), relative_lags=True)
formatter.add_transform(['all'], ['min', 'max'], agg_bins=[1, 2, 15, 20])
formatter.add_target_transform(['all'], lags=-np.arange(6))
x, y = formatter.transform(self.data.iloc[:1000])
x['cat'] = np.random.choice(['cat', 'dog'], len(x))
x.columns = x.columns.astype(str)
y.columns = y.columns.astype(str)
n_tr = int(len(x) * 0.8)
x_tr, x_te, y_tr, y_te = [x.iloc[:n_tr, :].copy(), x.iloc[n_tr:, :].copy(), y.iloc[:n_tr].copy(),
y.iloc[n_tr:].copy()]

m_lin = LinearForecaster(val_ratio=0.2, fit_intercept=False, normalize=False).fit(x_tr, y_tr)
y_hat = m_lin.predict(x_te)

s_a = 5
y_te.iloc[:, s_a].plot()
y_hat.iloc[:, s_a].plot()

def test_hw_difficult(self):

n_tr = int(len(self.x) * 0.5)
Expand Down Expand Up @@ -199,7 +219,7 @@ def test_antinormalize(self):
formatter.add_target_normalizer(['all'], 'mean', agg_freq='3d', name='a_movingavg')
formatter.add_target_normalizer(['all'], 'std', agg_freq='3d', name='a_movingstd')

x, y = formatter.transform(self.data.iloc[:10000])
x, y = formatter.transform(self.data)

n_tr = int(len(x) * 0.9)
x_tr, x_te, y_tr, y_te = [x.iloc[:n_tr, :].copy(), x.iloc[n_tr:, :].copy(), y.iloc[:n_tr].copy(),
Expand All @@ -217,7 +237,7 @@ def test_antinormalize(self):

formatter.add_normalizing_fun(expr="(df[t] - df['a_movingavg']) / (df['a_movingstd'] + 1)",
inv_expr="df[t]*(df['a_movingstd']+1) + df['a_movingavg']")
x, y_norm = formatter.transform(self.data.iloc[:10000])
x, y_norm = formatter.transform(self.data)
y = formatter.denormalize(x, y_norm)

x_tr, x_te, y_tr = [x.iloc[:n_tr, :].copy(), x.iloc[n_tr:, :].copy(), y_norm.iloc[:n_tr].copy()]
Expand Down

0 comments on commit d614986

Please sign in to comment.