diff --git a/docs/conf.py b/docs/conf.py
index 85d30f6..73e999f 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -77,8 +77,8 @@
# a list of builtin themes.
#
# html_theme = "sphinx_rtd_theme"
-html_permalinks_icon = '#'
-html_theme = 'sphinxawesome_theme'
+html_permalinks_icon = "#"
+html_theme = "sphinxawesome_theme"
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
diff --git a/docs/notebooks/arfs_classification.ipynb b/docs/notebooks/arfs_classification.ipynb
index 9fac97a..a6dd460 100644
--- a/docs/notebooks/arfs_classification.ipynb
+++ b/docs/notebooks/arfs_classification.ipynb
@@ -424,7 +424,12 @@
"model = clone(model)\n",
"\n",
"feat_selector = arfsgroot.Leshy(\n",
- " model, n_estimators=50, verbose=1, max_iter=10, random_state=42, importance=\"fastshap\"\n",
+ " model,\n",
+ " n_estimators=50,\n",
+ " verbose=1,\n",
+ " max_iter=10,\n",
+ " random_state=42,\n",
+ " importance=\"fastshap\",\n",
")\n",
"feat_selector.fit(X, y, sample_weight=None)\n",
"print(f\"The selected features: {feat_selector.get_feature_names_out()}\")\n",
@@ -1946,7 +1951,7 @@
" CatBoostClassifier(random_state=42, verbose=0),\n",
" LGBMClassifier(random_state=42, verbose=-1),\n",
" LightForestClassifier(n_feat=X.shape[1]),\n",
- " XGBClassifier(random_state=42, verbosity=0, eval_metric='logloss'),\n",
+ " XGBClassifier(random_state=42, verbosity=0, eval_metric=\"logloss\"),\n",
"]\n",
"\n",
"feat_selector = arfsgroot.Leshy(\n",
@@ -1987,10 +1992,14 @@
"from xgboost import XGBClassifier\n",
"from fasttreeshap import TreeExplainer as FastTreeExplainer\n",
"\n",
- "X, y = make_classification(n_samples=1000, n_features=10, n_informative=8, random_state=8)\n",
- "model = XGBClassifier() \n",
+ "X, y = make_classification(\n",
+ " n_samples=1000, n_features=10, n_informative=8, random_state=8\n",
+ ")\n",
+ "model = XGBClassifier()\n",
"model.fit(X, y)\n",
- "explainer = FastTreeExplainer(model, algorithm=\"auto\", shortcut=False, feature_perturbation=\"tree_path_dependent\")\n",
+ "explainer = FastTreeExplainer(\n",
+ " model, algorithm=\"auto\", shortcut=False, feature_perturbation=\"tree_path_dependent\"\n",
+ ")\n",
"shap_matrix = explainer.shap_values(X)"
]
},
@@ -3036,6 +3045,7 @@
"source": [
"# Leshy\n",
"from arfs.preprocessing import OrdinalEncoderPandas\n",
+ "\n",
"model = LGBMClassifier(random_state=42, verbose=-1, n_estimators=10)\n",
"X_encoded = OrdinalEncoderPandas().fit_transform(X=X)\n",
"feat_selector = arfsgroot.Leshy(\n",
diff --git a/docs/notebooks/arfs_grootcv_custom_params.ipynb b/docs/notebooks/arfs_grootcv_custom_params.ipynb
index fe1c520..756e868 100644
--- a/docs/notebooks/arfs_grootcv_custom_params.ipynb
+++ b/docs/notebooks/arfs_grootcv_custom_params.ipynb
@@ -52,19 +52,21 @@
"import os\n",
"import multiprocessing\n",
"\n",
+ "\n",
"def get_physical_cores():\n",
- " if os.name == 'posix': # For Unix-based systems (e.g., Linux, macOS)\n",
+ " if os.name == \"posix\": # For Unix-based systems (e.g., Linux, macOS)\n",
" try:\n",
" return os.sysconf(\"SC_NPROCESSORS_ONLN\")\n",
" except ValueError:\n",
" pass\n",
- " elif os.name == 'nt': # For Windows\n",
+ " elif os.name == \"nt\": # For Windows\n",
" try:\n",
" return int(os.environ[\"NUMBER_OF_PROCESSORS\"])\n",
" except (ValueError, KeyError):\n",
" pass\n",
" return multiprocessing.cpu_count()\n",
"\n",
+ "\n",
"num_physical_cores = get_physical_cores()\n",
"print(f\"Number of physical cores: {num_physical_cores}\")"
]
@@ -508,12 +510,19 @@
"source": [
"for n_jobs in range(num_physical_cores):\n",
" start_time = time.time()\n",
- " feat_selector = GrootCV(objective=\"rmse\", cutoff=1, n_folds=5, n_iter=5, silent=True, fastshap=False, n_jobs=n_jobs)\n",
+ " feat_selector = GrootCV(\n",
+ " objective=\"rmse\",\n",
+ " cutoff=1,\n",
+ " n_folds=5,\n",
+ " n_iter=5,\n",
+ " silent=True,\n",
+ " fastshap=False,\n",
+ " n_jobs=n_jobs,\n",
+ " )\n",
" feat_selector.fit(X, y, sample_weight=None)\n",
" end_time = time.time()\n",
" execution_time = end_time - start_time\n",
- " print(f\"n_jobs = {n_jobs}, Execution time: {execution_time:.3f} seconds\")\n",
- "\n"
+ " print(f\"n_jobs = {n_jobs}, Execution time: {execution_time:.3f} seconds\")"
]
},
{
@@ -572,7 +581,14 @@
"source": [
"# GrootCV with less regularization\n",
"feat_selector = GrootCV(\n",
- " objective=\"rmse\", cutoff=1, n_folds=5, n_iter=5, silent=True, fastshap=True, n_jobs=0, lgbm_params={\"min_data_in_leaf\": 10}\n",
+ " objective=\"rmse\",\n",
+ " cutoff=1,\n",
+ " n_folds=5,\n",
+ " n_iter=5,\n",
+ " silent=True,\n",
+ " fastshap=True,\n",
+ " n_jobs=0,\n",
+ " lgbm_params={\"min_data_in_leaf\": 10},\n",
")\n",
"feat_selector.fit(X, y, sample_weight=None)\n",
"print(f\"The selected features: {feat_selector.get_feature_names_out()}\")\n",
@@ -628,7 +644,14 @@
"source": [
"# GrootCV with default regularization\n",
"feat_selector = GrootCV(\n",
- " objective=\"rmse\", cutoff=1, n_folds=5, n_iter=5, silent=True, fastshap=True, n_jobs=0, lgbm_params=None\n",
+ " objective=\"rmse\",\n",
+ " cutoff=1,\n",
+ " n_folds=5,\n",
+ " n_iter=5,\n",
+ " silent=True,\n",
+ " fastshap=True,\n",
+ " n_jobs=0,\n",
+ " lgbm_params=None,\n",
")\n",
"feat_selector.fit(X, y, sample_weight=None)\n",
"print(f\"The selected features: {feat_selector.get_feature_names_out()}\")\n",
@@ -684,7 +707,14 @@
"source": [
"# GrootCV with larger regularization\n",
"feat_selector = GrootCV(\n",
- " objective=\"rmse\", cutoff=1, n_folds=5, n_iter=5, silent=True, fastshap=True, n_jobs=0, lgbm_params={\"min_data_in_leaf\": 100}\n",
+ " objective=\"rmse\",\n",
+ " cutoff=1,\n",
+ " n_folds=5,\n",
+ " n_iter=5,\n",
+ " silent=True,\n",
+ " fastshap=True,\n",
+ " n_jobs=0,\n",
+ " lgbm_params={\"min_data_in_leaf\": 100},\n",
")\n",
"feat_selector.fit(X, y, sample_weight=None)\n",
"print(f\"The selected features: {feat_selector.get_feature_names_out()}\")\n",
diff --git a/docs/notebooks/arfs_non_normal_loss_and_sample_weight.ipynb b/docs/notebooks/arfs_non_normal_loss_and_sample_weight.ipynb
index 36cbb52..01dd20c 100644
--- a/docs/notebooks/arfs_non_normal_loss_and_sample_weight.ipynb
+++ b/docs/notebooks/arfs_non_normal_loss_and_sample_weight.ipynb
@@ -40,7 +40,10 @@
"\n",
"rng = np.random.RandomState(seed=42)\n",
"\n",
- "def plot_y_vs_X(X: pd.DataFrame, y: pd.Series, ncols: int = 2, figsize: tuple = (10, 10)) -> plt.Figure:\n",
+ "\n",
+ "def plot_y_vs_X(\n",
+ " X: pd.DataFrame, y: pd.Series, ncols: int = 2, figsize: tuple = (10, 10)\n",
+ ") -> plt.Figure:\n",
" \"\"\"\n",
" Create subplots of scatter plots showing the relationship between each column in X and the target variable y.\n",
"\n",
@@ -73,7 +76,7 @@
" ax.set_title(col)\n",
"\n",
" # Remove any unused subplots\n",
- " for ax in axs.flat[len(X.columns):]:\n",
+ " for ax in axs.flat[len(X.columns) :]:\n",
" ax.set_axis_off()\n",
"\n",
" # Display the figure\n",
@@ -119,20 +122,30 @@
"source": [
"# Generate synthetic data with Poisson-distributed target variable\n",
"bias = 1\n",
- "X, y, true_coef = make_regression(n_samples=2_000, n_features=10, n_informative=5, noise=1, random_state=8, bias=bias, coef=True)\n",
- "y = (y-y.mean())/y.std()\n",
+ "X, y, true_coef = make_regression(\n",
+ " n_samples=2_000,\n",
+ " n_features=10,\n",
+ " n_informative=5,\n",
+ " noise=1,\n",
+ " random_state=8,\n",
+ " bias=bias,\n",
+ " coef=True,\n",
+ ")\n",
+ "y = (y - y.mean()) / y.std()\n",
"y = np.exp(y) # Transform to positive values for Poisson distribution\n",
"y = np.random.poisson(y) # Add Poisson noise to the target variable\n",
"# dummy sample weight (e.g. exposure), smallest being 30 days\n",
- "w = np.random.uniform(30/365, 1, size=len(y))\n",
+ "w = np.random.uniform(30 / 365, 1, size=len(y))\n",
"# make the count a Poisson rate (frequency)\n",
- "y = y/w\n",
+ "y = y / w\n",
"\n",
"X = pd.DataFrame(X)\n",
"X.columns = [f\"pred_{i}\" for i in range(X.shape[1])]\n",
"\n",
"# Split the data into training and testing sets\n",
- "X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(X, y, w, test_size=0.5, random_state=42)\n",
+ "X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(\n",
+ " X, y, w, test_size=0.5, random_state=42\n",
+ ")\n",
"\n",
"true_coef = pd.Series(true_coef)\n",
"true_coef.index = X.columns\n",
@@ -211,14 +224,29 @@
],
"source": [
"# Create a pipeline with LassoFeatureSelection and LinearRegression\n",
- "pipeline = Pipeline([\n",
- " ('scaler', StandardScaler().set_output(transform=\"pandas\")),\n",
- " ('selector', GrootCV(objective=\"poisson\", cutoff=1, n_folds=5, n_iter=5, silent=True, fastshap=True, n_jobs=0)),\n",
- " ('glm', PoissonRegressor())\n",
- "])\n",
+ "pipeline = Pipeline(\n",
+ " [\n",
+ " (\"scaler\", StandardScaler().set_output(transform=\"pandas\")),\n",
+ " (\n",
+ " \"selector\",\n",
+ " GrootCV(\n",
+ " objective=\"poisson\",\n",
+ " cutoff=1,\n",
+ " n_folds=5,\n",
+ " n_iter=5,\n",
+ " silent=True,\n",
+ " fastshap=True,\n",
+ " n_jobs=0,\n",
+ " ),\n",
+ " ),\n",
+ " (\"glm\", PoissonRegressor()),\n",
+ " ]\n",
+ ")\n",
"\n",
"# Fit the pipeline to the training data\n",
- "pipeline.fit(X_train, y_train, selector__sample_weight=w_train, glm__sample_weight=w_train)\n",
+ "pipeline.fit(\n",
+ " X_train, y_train, selector__sample_weight=w_train, glm__sample_weight=w_train\n",
+ ")\n",
"\n",
"# Make predictions on the test set\n",
"y_pred = pipeline.predict(X_test)\n",
@@ -232,16 +260,21 @@
"# Plot predictions\n",
"plt.subplot(1, 2, 1)\n",
"plt.scatter(y_test, y_pred, alpha=0.05)\n",
- "plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='#d1ae11', linestyle='--')\n",
- "plt.xlabel('True Values')\n",
- "plt.ylabel('Predictions')\n",
- "plt.title('True Values vs. Predictions')\n",
+ "plt.plot(\n",
+ " [min(y_test), max(y_test)],\n",
+ " [min(y_test), max(y_test)],\n",
+ " color=\"#d1ae11\",\n",
+ " linestyle=\"--\",\n",
+ ")\n",
+ "plt.xlabel(\"True Values\")\n",
+ "plt.ylabel(\"Predictions\")\n",
+ "plt.title(\"True Values vs. Predictions\")\n",
"\n",
"# Inset zoom of the first panel\n",
"ax = plt.gca()\n",
- "axins = inset_axes(ax, width=\"40%\", height=\"40%\", loc='upper right')\n",
+ "axins = inset_axes(ax, width=\"40%\", height=\"40%\", loc=\"upper right\")\n",
"axins.scatter(y_test, y_pred, alpha=0.1)\n",
- "axins.plot([0, 10], [0, 10], color='#d1ae11', linestyle='--')\n",
+ "axins.plot([0, 10], [0, 10], color=\"#d1ae11\", linestyle=\"--\")\n",
"axins.set_xlim(0, 10)\n",
"axins.set_ylim(0, 10)\n",
"ax.indicate_inset_zoom(axins, edgecolor=\"#c40f06\")\n",
@@ -249,9 +282,9 @@
"# Plot residuals\n",
"plt.subplot(1, 2, 2)\n",
"sns.histplot(residuals, kde=True)\n",
- "plt.xlabel('Residuals')\n",
- "plt.ylabel('Frequency')\n",
- "plt.title('Distribution of Residuals')\n",
+ "plt.xlabel(\"Residuals\")\n",
+ "plt.ylabel(\"Frequency\")\n",
+ "plt.title(\"Distribution of Residuals\")\n",
"\n",
"plt.show()"
]
@@ -282,10 +315,12 @@
}
],
"source": [
- "print(f\"The selected features: {pipeline.named_steps['selector'].get_feature_names_out()}\")\n",
+ "print(\n",
+ " f\"The selected features: {pipeline.named_steps['selector'].get_feature_names_out()}\"\n",
+ ")\n",
"print(f\"The agnostic ranking: {pipeline.named_steps['selector'].ranking_}\")\n",
"print(f\"The naive ranking: {pipeline.named_steps['selector'].ranking_absolutes_}\")\n",
- "fig = pipeline.named_steps['selector'].plot_importance(n_feat_per_inch=5)\n",
+ "fig = pipeline.named_steps[\"selector\"].plot_importance(n_feat_per_inch=5)\n",
"\n",
"# highlight synthetic random variable\n",
"for name in true_coef.index:\n",
@@ -469,14 +504,28 @@
"source": [
"model = LGBMRegressor(random_state=42, verbose=-1, objective=\"poisson\")\n",
"\n",
- "pipeline = Pipeline([\n",
- " ('scaler', StandardScaler().set_output(transform=\"pandas\")),\n",
- " ('selector', Leshy(model, n_estimators=20, verbose=1, max_iter=10, random_state=42, importance=\"fastshap\")),\n",
- " ('glm', PoissonRegressor())\n",
- "])\n",
+ "pipeline = Pipeline(\n",
+ " [\n",
+ " (\"scaler\", StandardScaler().set_output(transform=\"pandas\")),\n",
+ " (\n",
+ " \"selector\",\n",
+ " Leshy(\n",
+ " model,\n",
+ " n_estimators=20,\n",
+ " verbose=1,\n",
+ " max_iter=10,\n",
+ " random_state=42,\n",
+ " importance=\"fastshap\",\n",
+ " ),\n",
+ " ),\n",
+ " (\"glm\", PoissonRegressor()),\n",
+ " ]\n",
+ ")\n",
"\n",
"# Fit the pipeline to the training data\n",
- "pipeline.fit(X_train, y_train, selector__sample_weight=w_train, glm__sample_weight=w_train)\n",
+ "pipeline.fit(\n",
+ " X_train, y_train, selector__sample_weight=w_train, glm__sample_weight=w_train\n",
+ ")\n",
"\n",
"# Make predictions on the test set\n",
"y_pred = pipeline.predict(X_test)\n",
@@ -490,16 +539,21 @@
"# Plot predictions\n",
"plt.subplot(1, 2, 1)\n",
"plt.scatter(y_test, y_pred, alpha=0.05)\n",
- "plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='#d1ae11', linestyle='--')\n",
- "plt.xlabel('True Values')\n",
- "plt.ylabel('Predictions')\n",
- "plt.title('True Values vs. Predictions')\n",
+ "plt.plot(\n",
+ " [min(y_test), max(y_test)],\n",
+ " [min(y_test), max(y_test)],\n",
+ " color=\"#d1ae11\",\n",
+ " linestyle=\"--\",\n",
+ ")\n",
+ "plt.xlabel(\"True Values\")\n",
+ "plt.ylabel(\"Predictions\")\n",
+ "plt.title(\"True Values vs. Predictions\")\n",
"\n",
"# Inset zoom of the first panel\n",
"ax = plt.gca()\n",
- "axins = inset_axes(ax, width=\"40%\", height=\"40%\", loc='upper right')\n",
+ "axins = inset_axes(ax, width=\"40%\", height=\"40%\", loc=\"upper right\")\n",
"axins.scatter(y_test, y_pred, alpha=0.1)\n",
- "axins.plot([0, 10], [0, 10], color='#d1ae11', linestyle='--')\n",
+ "axins.plot([0, 10], [0, 10], color=\"#d1ae11\", linestyle=\"--\")\n",
"axins.set_xlim(0, 10)\n",
"axins.set_ylim(0, 10)\n",
"ax.indicate_inset_zoom(axins, edgecolor=\"#c40f06\")\n",
@@ -507,9 +561,9 @@
"# Plot residuals\n",
"plt.subplot(1, 2, 2)\n",
"sns.histplot(residuals, kde=True)\n",
- "plt.xlabel('Residuals')\n",
- "plt.ylabel('Frequency')\n",
- "plt.title('Distribution of Residuals')\n",
+ "plt.xlabel(\"Residuals\")\n",
+ "plt.ylabel(\"Frequency\")\n",
+ "plt.title(\"Distribution of Residuals\")\n",
"\n",
"plt.show()"
]
@@ -540,10 +594,12 @@
}
],
"source": [
- "print(f\"The selected features: {pipeline.named_steps['selector'].get_feature_names_out()}\")\n",
+ "print(\n",
+ " f\"The selected features: {pipeline.named_steps['selector'].get_feature_names_out()}\"\n",
+ ")\n",
"print(f\"The agnostic ranking: {pipeline.named_steps['selector'].ranking_}\")\n",
"print(f\"The naive ranking: {pipeline.named_steps['selector'].ranking_absolutes_}\")\n",
- "fig = pipeline.named_steps['selector'].plot_importance(n_feat_per_inch=5)\n",
+ "fig = pipeline.named_steps[\"selector\"].plot_importance(n_feat_per_inch=5)\n",
"\n",
"# highlight synthetic random variable\n",
"for name in true_coef.index:\n",
@@ -596,14 +652,28 @@
"source": [
"model = LGBMRegressor(random_state=42, verbose=-1, objective=\"poisson\")\n",
"\n",
- "pipeline = Pipeline([\n",
- " ('scaler', StandardScaler().set_output(transform=\"pandas\")),\n",
- " ('selector', BoostAGroota(estimator=model, cutoff=1, iters=10, max_rounds=10, delta=0.1, importance=\"fastshap\")),\n",
- " ('glm', PoissonRegressor())\n",
- "])\n",
+ "pipeline = Pipeline(\n",
+ " [\n",
+ " (\"scaler\", StandardScaler().set_output(transform=\"pandas\")),\n",
+ " (\n",
+ " \"selector\",\n",
+ " BoostAGroota(\n",
+ " estimator=model,\n",
+ " cutoff=1,\n",
+ " iters=10,\n",
+ " max_rounds=10,\n",
+ " delta=0.1,\n",
+ " importance=\"fastshap\",\n",
+ " ),\n",
+ " ),\n",
+ " (\"glm\", PoissonRegressor()),\n",
+ " ]\n",
+ ")\n",
"\n",
"# Fit the pipeline to the training data\n",
- "pipeline.fit(X_train, y_train, selector__sample_weight=w_train, glm__sample_weight=w_train)\n",
+ "pipeline.fit(\n",
+ " X_train, y_train, selector__sample_weight=w_train, glm__sample_weight=w_train\n",
+ ")\n",
"\n",
"# Make predictions on the test set\n",
"y_pred = pipeline.predict(X_test)\n",
@@ -617,16 +687,21 @@
"# Plot predictions\n",
"plt.subplot(1, 2, 1)\n",
"plt.scatter(y_test, y_pred, alpha=0.05)\n",
- "plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='#d1ae11', linestyle='--')\n",
- "plt.xlabel('True Values')\n",
- "plt.ylabel('Predictions')\n",
- "plt.title('True Values vs. Predictions')\n",
+ "plt.plot(\n",
+ " [min(y_test), max(y_test)],\n",
+ " [min(y_test), max(y_test)],\n",
+ " color=\"#d1ae11\",\n",
+ " linestyle=\"--\",\n",
+ ")\n",
+ "plt.xlabel(\"True Values\")\n",
+ "plt.ylabel(\"Predictions\")\n",
+ "plt.title(\"True Values vs. Predictions\")\n",
"\n",
"# Inset zoom of the first panel\n",
"ax = plt.gca()\n",
- "axins = inset_axes(ax, width=\"40%\", height=\"40%\", loc='upper right')\n",
+ "axins = inset_axes(ax, width=\"40%\", height=\"40%\", loc=\"upper right\")\n",
"axins.scatter(y_test, y_pred, alpha=0.1)\n",
- "axins.plot([0, 10], [0, 10], color='#d1ae11', linestyle='--')\n",
+ "axins.plot([0, 10], [0, 10], color=\"#d1ae11\", linestyle=\"--\")\n",
"axins.set_xlim(0, 10)\n",
"axins.set_ylim(0, 10)\n",
"ax.indicate_inset_zoom(axins, edgecolor=\"#c40f06\")\n",
@@ -634,9 +709,9 @@
"# Plot residuals\n",
"plt.subplot(1, 2, 2)\n",
"sns.histplot(residuals, kde=True)\n",
- "plt.xlabel('Residuals')\n",
- "plt.ylabel('Frequency')\n",
- "plt.title('Distribution of Residuals')\n",
+ "plt.xlabel(\"Residuals\")\n",
+ "plt.ylabel(\"Frequency\")\n",
+ "plt.title(\"Distribution of Residuals\")\n",
"\n",
"plt.show()"
]
@@ -667,10 +742,12 @@
}
],
"source": [
- "print(f\"The selected features: {pipeline.named_steps['selector'].get_feature_names_out()}\")\n",
+ "print(\n",
+ " f\"The selected features: {pipeline.named_steps['selector'].get_feature_names_out()}\"\n",
+ ")\n",
"print(f\"The agnostic ranking: {pipeline.named_steps['selector'].ranking_}\")\n",
"print(f\"The naive ranking: {pipeline.named_steps['selector'].ranking_absolutes_}\")\n",
- "fig = pipeline.named_steps['selector'].plot_importance(n_feat_per_inch=5)\n",
+ "fig = pipeline.named_steps[\"selector\"].plot_importance(n_feat_per_inch=5)\n",
"\n",
"# highlight synthetic random variable\n",
"for name in true_coef.index:\n",
@@ -732,14 +809,21 @@
" n_jobs=-1,\n",
")\n",
"\n",
- "pipeline = Pipeline([\n",
- " ('selector', mrmr),\n",
- " ('scaler', StandardScaler().set_output(transform=\"pandas\")),\n",
- " ('glm', PoissonRegressor())\n",
- "])\n",
+ "pipeline = Pipeline(\n",
+ " [\n",
+ " (\"selector\", mrmr),\n",
+ " (\"scaler\", StandardScaler().set_output(transform=\"pandas\")),\n",
+ " (\"glm\", PoissonRegressor()),\n",
+ " ]\n",
+ ")\n",
"\n",
"# Fit the pipeline to the training data\n",
- "pipeline.fit(X_train, pd.Series(y_train), selector__sample_weight=pd.Series(w_train), glm__sample_weight=w_train)\n",
+ "pipeline.fit(\n",
+ " X_train,\n",
+ " pd.Series(y_train),\n",
+ " selector__sample_weight=pd.Series(w_train),\n",
+ " glm__sample_weight=w_train,\n",
+ ")\n",
"\n",
"# Make predictions on the test set\n",
"y_pred = pipeline.predict(X_test)\n",
@@ -753,16 +837,21 @@
"# Plot predictions\n",
"plt.subplot(1, 2, 1)\n",
"plt.scatter(y_test, y_pred, alpha=0.05)\n",
- "plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='#d1ae11', linestyle='--')\n",
- "plt.xlabel('True Values')\n",
- "plt.ylabel('Predictions')\n",
- "plt.title('True Values vs. Predictions')\n",
+ "plt.plot(\n",
+ " [min(y_test), max(y_test)],\n",
+ " [min(y_test), max(y_test)],\n",
+ " color=\"#d1ae11\",\n",
+ " linestyle=\"--\",\n",
+ ")\n",
+ "plt.xlabel(\"True Values\")\n",
+ "plt.ylabel(\"Predictions\")\n",
+ "plt.title(\"True Values vs. Predictions\")\n",
"\n",
"# Inset zoom of the first panel\n",
"ax = plt.gca()\n",
- "axins = inset_axes(ax, width=\"40%\", height=\"40%\", loc='upper right')\n",
+ "axins = inset_axes(ax, width=\"40%\", height=\"40%\", loc=\"upper right\")\n",
"axins.scatter(y_test, y_pred, alpha=0.1)\n",
- "axins.plot([0, 10], [0, 10], color='#d1ae11', linestyle='--')\n",
+ "axins.plot([0, 10], [0, 10], color=\"#d1ae11\", linestyle=\"--\")\n",
"axins.set_xlim(0, 10)\n",
"axins.set_ylim(0, 10)\n",
"ax.indicate_inset_zoom(axins, edgecolor=\"#c40f06\")\n",
@@ -770,9 +859,9 @@
"# Plot residuals\n",
"plt.subplot(1, 2, 2)\n",
"sns.histplot(residuals, kde=True)\n",
- "plt.xlabel('Residuals')\n",
- "plt.ylabel('Frequency')\n",
- "plt.title('Distribution of Residuals')\n",
+ "plt.xlabel(\"Residuals\")\n",
+ "plt.ylabel(\"Frequency\")\n",
+ "plt.title(\"Distribution of Residuals\")\n",
"\n",
"plt.show()"
]
@@ -817,7 +906,9 @@
}
],
"source": [
- "print(f\"The selected features: {pipeline.named_steps['selector'].get_feature_names_out()}\")"
+ "print(\n",
+ " f\"The selected features: {pipeline.named_steps['selector'].get_feature_names_out()}\"\n",
+ ")"
]
},
{
@@ -901,7 +992,7 @@
}
],
"source": [
- "pipeline.named_steps['selector'].ranking_"
+ "pipeline.named_steps[\"selector\"].ranking_"
]
}
],
diff --git a/docs/notebooks/arfs_on_GPU.ipynb b/docs/notebooks/arfs_on_GPU.ipynb
index eaee1d7..84c42d9 100644
--- a/docs/notebooks/arfs_on_GPU.ipynb
+++ b/docs/notebooks/arfs_on_GPU.ipynb
@@ -80,20 +80,30 @@
"n_features = 100\n",
"n_informative = 20\n",
"\n",
- "X, y, true_coef = make_regression(n_samples=n_samples, n_features=n_features, n_informative=n_informative, noise=1, random_state=8, bias=bias, coef=True)\n",
- "y = (y-y.mean())/y.std()\n",
+ "X, y, true_coef = make_regression(\n",
+ " n_samples=n_samples,\n",
+ " n_features=n_features,\n",
+ " n_informative=n_informative,\n",
+ " noise=1,\n",
+ " random_state=8,\n",
+ " bias=bias,\n",
+ " coef=True,\n",
+ ")\n",
+ "y = (y - y.mean()) / y.std()\n",
"y = np.exp(y) # Transform to positive values for Poisson distribution\n",
"y = np.random.poisson(y) # Add Poisson noise to the target variable\n",
"# dummy sample weight (e.g. exposure), smallest being 30 days\n",
- "w = np.random.uniform(30/365, 1, size=len(y))\n",
+ "w = np.random.uniform(30 / 365, 1, size=len(y))\n",
"# make the count a Poisson rate (frequency)\n",
- "y = y/w\n",
+ "y = y / w\n",
"\n",
"X = pd.DataFrame(X)\n",
"X.columns = [f\"pred_{i}\" for i in range(X.shape[1])]\n",
"\n",
"# Split the data into training and testing sets\n",
- "X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(X, y, w, test_size=0.5, random_state=42)\n",
+ "X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(\n",
+ " X, y, w, test_size=0.5, random_state=42\n",
+ ")\n",
"\n",
"true_coef = pd.Series(true_coef)\n",
"true_coef.index = X.columns\n",
@@ -166,7 +176,14 @@
"source": [
"%%time\n",
"feat_selector = GrootCV(\n",
- " objective=\"rmse\", cutoff=1, n_folds=3, n_iter=3, silent=True, fastshap=True, n_jobs=0, lgbm_params={\"device\": \"gpu\"}\n",
+ " objective=\"rmse\",\n",
+ " cutoff=1,\n",
+ " n_folds=3,\n",
+ " n_iter=3,\n",
+ " silent=True,\n",
+ " fastshap=True,\n",
+ " n_jobs=0,\n",
+ " lgbm_params={\"device\": \"gpu\"},\n",
")\n",
"feat_selector.fit(X_train, y_train, sample_weight=None)\n",
"print(f\"The selected features: {feat_selector.get_feature_names_out()}\")\n",
@@ -245,7 +262,14 @@
"source": [
"%%time\n",
"feat_selector = GrootCV(\n",
- " objective=\"rmse\", cutoff=1, n_folds=3, n_iter=3, silent=True, fastshap=True, n_jobs=0, lgbm_params={\"device\": \"cpu\"}\n",
+ " objective=\"rmse\",\n",
+ " cutoff=1,\n",
+ " n_folds=3,\n",
+ " n_iter=3,\n",
+ " silent=True,\n",
+ " fastshap=True,\n",
+ " n_jobs=0,\n",
+ " lgbm_params={\"device\": \"cpu\"},\n",
")\n",
"feat_selector.fit(X_train, y_train, sample_weight=None)\n",
"print(f\"The selected features: {feat_selector.get_feature_names_out()}\")\n",
@@ -330,7 +354,14 @@
"source": [
"%%time\n",
"feat_selector = GrootCV(\n",
- " objective=\"rmse\", cutoff=1, n_folds=5, n_iter=5, silent=True, fastshap=True, n_jobs=0, lgbm_params={\"device\": \"cpu\"}\n",
+ " objective=\"rmse\",\n",
+ " cutoff=1,\n",
+ " n_folds=5,\n",
+ " n_iter=5,\n",
+ " silent=True,\n",
+ " fastshap=True,\n",
+ " n_jobs=0,\n",
+ " lgbm_params={\"device\": \"cpu\"},\n",
")\n",
"feat_selector.fit(X, y, sample_weight=None)\n",
"print(f\"The selected features: {feat_selector.get_feature_names_out()}\")\n",
@@ -394,7 +425,14 @@
"source": [
"%%time\n",
"feat_selector = GrootCV(\n",
- " objective=\"rmse\", cutoff=1, n_folds=5, n_iter=5, silent=True, fastshap=True, n_jobs=0, lgbm_params={\"device\": \"gpu\"}\n",
+ " objective=\"rmse\",\n",
+ " cutoff=1,\n",
+ " n_folds=5,\n",
+ " n_iter=5,\n",
+ " silent=True,\n",
+ " fastshap=True,\n",
+ " n_jobs=0,\n",
+ " lgbm_params={\"device\": \"gpu\"},\n",
")\n",
"feat_selector.fit(X, y, sample_weight=None)\n",
"print(f\"The selected features: {feat_selector.get_feature_names_out()}\")\n",
@@ -448,7 +486,14 @@
"source": [
"%%time\n",
"feat_selector = GrootCV(\n",
- " objective=\"rmse\", cutoff=1, n_folds=5, n_iter=5, silent=True, fastshap=True, n_jobs=0, lgbm_params={\"device\": \"cuda\"}\n",
+ " objective=\"rmse\",\n",
+ " cutoff=1,\n",
+ " n_folds=5,\n",
+ " n_iter=5,\n",
+ " silent=True,\n",
+ " fastshap=True,\n",
+ " n_jobs=0,\n",
+ " lgbm_params={\"device\": \"cuda\"},\n",
")\n",
"feat_selector.fit(X, y, sample_weight=None)\n",
"print(f\"The selected features: {feat_selector.get_feature_names_out()}\")\n",
diff --git a/docs/notebooks/arfs_regression.ipynb b/docs/notebooks/arfs_regression.ipynb
index 0f8a7da..5f19ca7 100644
--- a/docs/notebooks/arfs_regression.ipynb
+++ b/docs/notebooks/arfs_regression.ipynb
@@ -605,7 +605,12 @@
"\n",
"# Leshy\n",
"feat_selector = arfsgroot.Leshy(\n",
- " model, n_estimators=20, verbose=1, max_iter=10, random_state=42, importance=\"fastshap\"\n",
+ " model,\n",
+ " n_estimators=20,\n",
+ " verbose=1,\n",
+ " max_iter=10,\n",
+ " random_state=42,\n",
+ " importance=\"fastshap\",\n",
")\n",
"feat_selector.fit(X, y, sample_weight=None)\n",
"print(f\"The selected features: {feat_selector.get_feature_names_out()}\")\n",
@@ -965,7 +970,13 @@
"%%time\n",
"# GrootCV\n",
"feat_selector = arfsgroot.GrootCV(\n",
- " objective=\"rmse\", cutoff=1, n_folds=5, n_iter=5, silent=True, fastshap=False, n_jobs=0\n",
+ " objective=\"rmse\",\n",
+ " cutoff=1,\n",
+ " n_folds=5,\n",
+ " n_iter=5,\n",
+ " silent=True,\n",
+ " fastshap=False,\n",
+ " n_jobs=0,\n",
")\n",
"feat_selector.fit(X, y, sample_weight=None)\n",
"print(f\"The selected features: {feat_selector.get_feature_names_out()}\")\n",
@@ -1037,7 +1048,13 @@
"%%time\n",
"# GrootCV\n",
"feat_selector = arfsgroot.GrootCV(\n",
- " objective=\"rmse\", cutoff=1, n_folds=5, n_iter=5, silent=True, fastshap=True, n_jobs=0\n",
+ " objective=\"rmse\",\n",
+ " cutoff=1,\n",
+ " n_folds=5,\n",
+ " n_iter=5,\n",
+ " silent=True,\n",
+ " fastshap=True,\n",
+ " n_jobs=0,\n",
")\n",
"feat_selector.fit(X, y, sample_weight=None)\n",
"print(f\"The selected features: {feat_selector.get_feature_names_out()}\")\n",
@@ -2318,10 +2335,14 @@
"from lightgbm import LGBMRegressor\n",
"from fasttreeshap import TreeExplainer as FastTreeExplainer\n",
"\n",
- "X, y = make_regression(n_samples=1000, n_features=10, n_informative=8, noise=1, random_state=8)\n",
- "model = XGBRegressor() #LGBMRegressor()\n",
+ "X, y = make_regression(\n",
+ " n_samples=1000, n_features=10, n_informative=8, noise=1, random_state=8\n",
+ ")\n",
+ "model = XGBRegressor() # LGBMRegressor()\n",
"model.fit(X, y)\n",
- "explainer = FastTreeExplainer(model, algorithm=\"auto\", shortcut=False, feature_perturbation=\"tree_path_dependent\")\n",
+ "explainer = FastTreeExplainer(\n",
+ " model, algorithm=\"auto\", shortcut=False, feature_perturbation=\"tree_path_dependent\"\n",
+ ")\n",
"shap_matrix = explainer.shap_values(X)"
]
},
@@ -2350,7 +2371,9 @@
"import shap\n",
"import xgboost\n",
"\n",
- "print(f\"Using xgboost {xgboost.__version__}, shap {shap.__version__} and fasttreeshap {fasttreeshap.__version__}\")"
+ "print(\n",
+ " f\"Using xgboost {xgboost.__version__}, shap {shap.__version__} and fasttreeshap {fasttreeshap.__version__}\"\n",
+ ")"
]
},
{
@@ -2401,6 +2424,7 @@
"source": [
"# Testing the changes with rnd cat. and num. predictors added to the set of genuine predictors\n",
"\n",
+ "\n",
"def testing_estimators(X, y, sample_weight=None, objective=\"rmse\"):\n",
" feat_selector = arfsgroot.GrootCV(\n",
" objective=objective, cutoff=1, n_folds=5, n_iter=5, fastshap=False\n",
@@ -3327,7 +3351,7 @@
" CatBoostRegressor(random_state=42, verbose=0),\n",
" LGBMRegressor(random_state=42, verbose=-1),\n",
" LightForestRegressor(n_feat=X.shape[1]),\n",
- " XGBRegressor(random_state=42, verbosity=0)\n",
+ " XGBRegressor(random_state=42, verbosity=0),\n",
"]\n",
"\n",
"feat_selector = arfsgroot.BoostAGroota(\n",
diff --git a/docs/notebooks/arfs_shap_vs_fastshap.ipynb b/docs/notebooks/arfs_shap_vs_fastshap.ipynb
index 5b82864..2ed4b6f 100644
--- a/docs/notebooks/arfs_shap_vs_fastshap.ipynb
+++ b/docs/notebooks/arfs_shap_vs_fastshap.ipynb
@@ -64,20 +64,30 @@
"n_features = 100\n",
"n_informative = 20\n",
"\n",
- "X, y, true_coef = make_regression(n_samples=n_samples, n_features=n_features, n_informative=n_informative, noise=1, random_state=8, bias=bias, coef=True)\n",
- "y = (y-y.mean())/y.std()\n",
+ "X, y, true_coef = make_regression(\n",
+ " n_samples=n_samples,\n",
+ " n_features=n_features,\n",
+ " n_informative=n_informative,\n",
+ " noise=1,\n",
+ " random_state=8,\n",
+ " bias=bias,\n",
+ " coef=True,\n",
+ ")\n",
+ "y = (y - y.mean()) / y.std()\n",
"y = np.exp(y) # Transform to positive values for Poisson distribution\n",
"y = np.random.poisson(y) # Add Poisson noise to the target variable\n",
"# dummy sample weight (e.g. exposure), smallest being 30 days\n",
- "w = np.random.uniform(30/365, 1, size=len(y))\n",
+ "w = np.random.uniform(30 / 365, 1, size=len(y))\n",
"# make the count a Poisson rate (frequency)\n",
- "y = y/w\n",
+ "y = y / w\n",
"\n",
"X = pd.DataFrame(X)\n",
"X.columns = [f\"pred_{i}\" for i in range(X.shape[1])]\n",
"\n",
"# Split the data into training and testing sets\n",
- "X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(X, y, w, test_size=0.5, random_state=42)\n",
+ "X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(\n",
+ " X, y, w, test_size=0.5, random_state=42\n",
+ ")\n",
"\n",
"true_coef = pd.Series(true_coef)\n",
"true_coef.index = X.columns\n",
@@ -151,7 +161,14 @@
"source": [
"%%time\n",
"feat_selector = GrootCV(\n",
- " objective=\"rmse\", cutoff=1, n_folds=3, n_iter=3, silent=True, fastshap=True, n_jobs=0, lgbm_params={\"device\": \"cpu\"}\n",
+ " objective=\"rmse\",\n",
+ " cutoff=1,\n",
+ " n_folds=3,\n",
+ " n_iter=3,\n",
+ " silent=True,\n",
+ " fastshap=True,\n",
+ " n_jobs=0,\n",
+ " lgbm_params={\"device\": \"cpu\"},\n",
")\n",
"feat_selector.fit(X_train, y_train, sample_weight=None)"
]
@@ -249,7 +266,14 @@
"source": [
"%%time\n",
"feat_selector = GrootCV(\n",
- " objective=\"rmse\", cutoff=1, n_folds=3, n_iter=3, silent=True, fastshap=False, n_jobs=0, lgbm_params={\"device\": \"cpu\"}\n",
+ " objective=\"rmse\",\n",
+ " cutoff=1,\n",
+ " n_folds=3,\n",
+ " n_iter=3,\n",
+ " silent=True,\n",
+ " fastshap=False,\n",
+ " n_jobs=0,\n",
+ " lgbm_params={\"device\": \"cpu\"},\n",
")\n",
"feat_selector.fit(X_train, y_train, sample_weight=None)"
]
diff --git a/docs/notebooks/basic_feature_selection.ipynb b/docs/notebooks/basic_feature_selection.ipynb
index cdace8a..5f2c4f9 100644
--- a/docs/notebooks/basic_feature_selection.ipynb
+++ b/docs/notebooks/basic_feature_selection.ipynb
@@ -1920,7 +1920,9 @@
],
"source": [
"lgb_kwargs = {\"objective\": \"rmse\", \"zero_as_missing\": False}\n",
- "selector = arfsfs.VariableImportance(verbose=2, threshold=0.99, lgb_kwargs=lgb_kwargs, fastshap=False)\n",
+ "selector = arfsfs.VariableImportance(\n",
+ " verbose=2, threshold=0.99, lgb_kwargs=lgb_kwargs, fastshap=False\n",
+ ")\n",
"X_trans = selector.fit_transform(X=X, y=y, sample_weight=w)\n",
"print(f\"The features going in the selector are : {selector.feature_names_in_}\")\n",
"print(f\"The support is : {selector.support_}\")\n",
@@ -1987,7 +1989,9 @@
],
"source": [
"lgb_kwargs = {\"objective\": \"rmse\", \"zero_as_missing\": False}\n",
- "selector = arfsfs.VariableImportance(verbose=2, threshold=0.99, lgb_kwargs=lgb_kwargs, fastshap=True)\n",
+ "selector = arfsfs.VariableImportance(\n",
+ " verbose=2, threshold=0.99, lgb_kwargs=lgb_kwargs, fastshap=True\n",
+ ")\n",
"X_trans = selector.fit_transform(X=X, y=y, sample_weight=w)\n",
"print(f\"The features going in the selector are : {selector.feature_names_in_}\")\n",
"print(f\"The support is : {selector.support_}\")\n",
diff --git a/docs/notebooks/issue_categoricals.ipynb b/docs/notebooks/issue_categoricals.ipynb
index c2adeae..136bc5c 100644
--- a/docs/notebooks/issue_categoricals.ipynb
+++ b/docs/notebooks/issue_categoricals.ipynb
@@ -529,7 +529,12 @@
"\n",
"# Leshy, all the predictors, no-preprocessing\n",
"feat_selector = arfsgroot.Leshy(\n",
- " model, n_estimators=1000, verbose=1, max_iter=10, random_state=42, importance=\"fastshap\"\n",
+ " model,\n",
+ " n_estimators=1000,\n",
+ " verbose=1,\n",
+ " max_iter=10,\n",
+ " random_state=42,\n",
+ " importance=\"fastshap\",\n",
")\n",
"\n",
"feat_selector.fit(X, y, sample_weight=None)\n",
@@ -628,7 +633,12 @@
"\n",
"# Leshy, all the predictors, no-preprocessing\n",
"feat_selector = arfsgroot.Leshy(\n",
- " model, n_estimators=100, verbose=1, max_iter=10, random_state=42, importance=\"fastshap\"\n",
+ " model,\n",
+ " n_estimators=100,\n",
+ " verbose=1,\n",
+ " max_iter=10,\n",
+ " random_state=42,\n",
+ " importance=\"fastshap\",\n",
")\n",
"\n",
"feat_selector.fit(X, y, sample_weight=None)\n",
diff --git a/docs/notebooks/issue_collinearity.ipynb b/docs/notebooks/issue_collinearity.ipynb
index 83280d0..8b2fdce 100644
--- a/docs/notebooks/issue_collinearity.ipynb
+++ b/docs/notebooks/issue_collinearity.ipynb
@@ -669,7 +669,12 @@
"model = clone(model)\n",
"# Leshy\n",
"feat_selector = arfsgroot.Leshy(\n",
- " model, n_estimators=100, verbose=1, max_iter=10, random_state=42, importance=\"fastshap\"\n",
+ " model,\n",
+ " n_estimators=100,\n",
+ " verbose=1,\n",
+ " max_iter=10,\n",
+ " random_state=42,\n",
+ " importance=\"fastshap\",\n",
")\n",
"feat_selector.fit(X, y, sample_weight=None)\n",
"print(f\"The selected features: {feat_selector.get_feature_names_out()}\")\n",
@@ -754,7 +759,12 @@
"model = clone(model)\n",
"# Leshy\n",
"feat_selector = arfsgroot.Leshy(\n",
- " model, n_estimators=100, verbose=1, max_iter=10, random_state=42, importance=\"fastshap\"\n",
+ " model,\n",
+ " n_estimators=100,\n",
+ " verbose=1,\n",
+ " max_iter=10,\n",
+ " random_state=42,\n",
+ " importance=\"fastshap\",\n",
")\n",
"feat_selector.fit(X_filtered, y, sample_weight=None)\n",
"print(f\"The selected features: {feat_selector.get_feature_names_out()}\")\n",
diff --git a/docs/notebooks/lasso_feature_selection.ipynb b/docs/notebooks/lasso_feature_selection.ipynb
index 898c0d0..e7090fd 100644
--- a/docs/notebooks/lasso_feature_selection.ipynb
+++ b/docs/notebooks/lasso_feature_selection.ipynb
@@ -53,7 +53,9 @@
"import arfs.feature_selection as arfsfs\n",
"\n",
"\n",
- "def plot_y_vs_X(X: pd.DataFrame, y: pd.Series, ncols: int = 2, figsize: tuple = (10, 10)) -> plt.Figure:\n",
+ "def plot_y_vs_X(\n",
+ " X: pd.DataFrame, y: pd.Series, ncols: int = 2, figsize: tuple = (10, 10)\n",
+ ") -> plt.Figure:\n",
" \"\"\"\n",
" Create subplots of scatter plots showing the relationship between each column in X and the target variable y.\n",
"\n",
@@ -86,7 +88,7 @@
" ax.set_title(col)\n",
"\n",
" # Remove any unused subplots\n",
- " for ax in axs.flat[len(X.columns):]:\n",
+ " for ax in axs.flat[len(X.columns) :]:\n",
" ax.set_axis_off()\n",
"\n",
" # Display the figure\n",
@@ -130,11 +132,21 @@
],
"source": [
"bias = 7.0\n",
- "X, y, true_coef = make_regression(n_samples=2_000, n_features=10, n_informative=5, noise=1, random_state=8, bias=bias, coef=True)\n",
+ "X, y, true_coef = make_regression(\n",
+ " n_samples=2_000,\n",
+ " n_features=10,\n",
+ " n_informative=5,\n",
+ " noise=1,\n",
+ " random_state=8,\n",
+ " bias=bias,\n",
+ " coef=True,\n",
+ ")\n",
"X = pd.DataFrame(X)\n",
"X.columns = [f\"pred_{i}\" for i in range(X.shape[1])]\n",
"\n",
- "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)\n",
+ "X_train, X_test, y_train, y_test = train_test_split(\n",
+ " X, y, test_size=0.5, random_state=42\n",
+ ")\n",
"\n",
"print(f\"The true coefficient of the linear data generating process are:\\n {true_coef}\")\n",
"f = plot_y_vs_X(X_train, y_train, ncols=5, figsize=(15, 5))"
@@ -550,11 +562,13 @@
],
"source": [
"# Create a pipeline with LassoFeatureSelection and LinearRegression\n",
- "pipeline = Pipeline([\n",
- " ('scaler', StandardScaler()),\n",
- " ('selector', LassoFeatureSelection(n_iterations=10, score=\"bic\")),\n",
- " ('linear_regression', LinearRegression())\n",
- "])\n",
+ "pipeline = Pipeline(\n",
+ " [\n",
+ " (\"scaler\", StandardScaler()),\n",
+ " (\"selector\", LassoFeatureSelection(n_iterations=10, score=\"bic\")),\n",
+ " (\"linear_regression\", LinearRegression()),\n",
+ " ]\n",
+ ")\n",
"\n",
"# Fit the pipeline to the training data\n",
"pipeline.fit(X_train, y_train)\n",
@@ -564,7 +578,7 @@
"\n",
"# Calculate the mean squared error\n",
"mse = mean_squared_error(y_test, y_pred)\n",
- "print(f\"Mean Squared Error: {mse}\")\n"
+ "print(f\"Mean Squared Error: {mse}\")"
]
},
{
@@ -729,24 +743,30 @@
],
"source": [
"import seaborn as sns\n",
+ "\n",
"# Plot the predictions and residuals\n",
"plt.figure(figsize=(10, 5))\n",
"\n",
"# Plot predictions\n",
"plt.subplot(1, 2, 1)\n",
"plt.scatter(y_test, y_pred, alpha=0.5)\n",
- "plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='#d1ae11', linestyle='--')\n",
- "plt.xlabel('True Values')\n",
- "plt.ylabel('Predictions')\n",
- "plt.title('True Values vs. Predictions')\n",
+ "plt.plot(\n",
+ " [min(y_test), max(y_test)],\n",
+ " [min(y_test), max(y_test)],\n",
+ " color=\"#d1ae11\",\n",
+ " linestyle=\"--\",\n",
+ ")\n",
+ "plt.xlabel(\"True Values\")\n",
+ "plt.ylabel(\"Predictions\")\n",
+ "plt.title(\"True Values vs. Predictions\")\n",
"\n",
"# Plot residuals\n",
"residuals = y_test - y_pred\n",
"plt.subplot(1, 2, 2)\n",
"plt.scatter(y=residuals, x=y_pred, alpha=0.5)\n",
- "plt.ylabel('Residuals')\n",
- "plt.xlabel('Prediction')\n",
- "plt.title('Distribution of Residuals')\n",
+ "plt.ylabel(\"Residuals\")\n",
+ "plt.xlabel(\"Prediction\")\n",
+ "plt.title(\"Distribution of Residuals\")\n",
"\n",
"plt.tight_layout()\n",
"plt.show();"
@@ -789,20 +809,30 @@
"source": [
"# Generate synthetic data with Poisson-distributed target variable\n",
"bias = 1\n",
- "X, y, true_coef = make_regression(n_samples=2_000, n_features=10, n_informative=5, noise=1, random_state=8, bias=bias, coef=True)\n",
- "y = (y-y.mean())/y.std()\n",
+ "X, y, true_coef = make_regression(\n",
+ " n_samples=2_000,\n",
+ " n_features=10,\n",
+ " n_informative=5,\n",
+ " noise=1,\n",
+ " random_state=8,\n",
+ " bias=bias,\n",
+ " coef=True,\n",
+ ")\n",
+ "y = (y - y.mean()) / y.std()\n",
"y = np.exp(y) # Transform to positive values for Poisson distribution\n",
"y = np.random.poisson(y) # Add Poisson noise to the target variable\n",
"# dummy sample weight (e.g. exposure), smallest being 30 days\n",
- "w = np.random.uniform(30/365, 1, size=len(y))\n",
+ "w = np.random.uniform(30 / 365, 1, size=len(y))\n",
"# make the count a Poisson rate (frequency)\n",
- "y = y/w\n",
+ "y = y / w\n",
"\n",
"X = pd.DataFrame(X)\n",
"X.columns = [f\"pred_{i}\" for i in range(X.shape[1])]\n",
"\n",
"# Split the data into training and testing sets\n",
- "X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(X, y, w, test_size=0.5, random_state=42)\n",
+ "X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(\n",
+ " X, y, w, test_size=0.5, random_state=42\n",
+ ")\n",
"\n",
"print(f\"The true coefficient of the linear data generating process are:\\n {true_coef}\")\n",
"\n",
@@ -848,14 +878,23 @@
],
"source": [
"# Create a pipeline with LassoFeatureSelection and LinearRegression\n",
- "pipeline = Pipeline([\n",
- " ('scaler', StandardScaler()),\n",
- " ('selector', LassoFeatureSelection(n_iterations=10, score=\"bic\", family=\"poisson\", fit_intercept=True)),\n",
- " ('glm', PoissonRegressor())\n",
- "])\n",
+ "pipeline = Pipeline(\n",
+ " [\n",
+ " (\"scaler\", StandardScaler()),\n",
+ " (\n",
+ " \"selector\",\n",
+ " LassoFeatureSelection(\n",
+ " n_iterations=10, score=\"bic\", family=\"poisson\", fit_intercept=True\n",
+ " ),\n",
+ " ),\n",
+ " (\"glm\", PoissonRegressor()),\n",
+ " ]\n",
+ ")\n",
"\n",
"# Fit the pipeline to the training data\n",
- "pipeline.fit(X_train, y_train, selector__sample_weight=w_train, glm__sample_weight=w_train)\n",
+ "pipeline.fit(\n",
+ " X_train, y_train, selector__sample_weight=w_train, glm__sample_weight=w_train\n",
+ ")\n",
"\n",
"# Make predictions on the test set\n",
"y_pred = pipeline.predict(X_test)\n",
@@ -869,16 +908,21 @@
"# Plot predictions\n",
"plt.subplot(1, 2, 1)\n",
"plt.scatter(y_test, y_pred, alpha=0.05)\n",
- "plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='#d1ae11', linestyle='--')\n",
- "plt.xlabel('True Values')\n",
- "plt.ylabel('Predictions')\n",
- "plt.title('True Values vs. Predictions')\n",
+ "plt.plot(\n",
+ " [min(y_test), max(y_test)],\n",
+ " [min(y_test), max(y_test)],\n",
+ " color=\"#d1ae11\",\n",
+ " linestyle=\"--\",\n",
+ ")\n",
+ "plt.xlabel(\"True Values\")\n",
+ "plt.ylabel(\"Predictions\")\n",
+ "plt.title(\"True Values vs. Predictions\")\n",
"\n",
"# Inset zoom of the first panel\n",
"ax = plt.gca()\n",
- "axins = inset_axes(ax, width=\"40%\", height=\"40%\", loc='upper right')\n",
+ "axins = inset_axes(ax, width=\"40%\", height=\"40%\", loc=\"upper right\")\n",
"axins.scatter(y_test, y_pred, alpha=0.1)\n",
- "axins.plot([0, 10], [0, 10], color='#d1ae11', linestyle='--')\n",
+ "axins.plot([0, 10], [0, 10], color=\"#d1ae11\", linestyle=\"--\")\n",
"axins.set_xlim(0, 10)\n",
"axins.set_ylim(0, 10)\n",
"ax.indicate_inset_zoom(axins, edgecolor=\"#c40f06\")\n",
@@ -886,9 +930,9 @@
"# Plot residuals\n",
"plt.subplot(1, 2, 2)\n",
"sns.histplot(residuals, kde=True)\n",
- "plt.xlabel('Residuals')\n",
- "plt.ylabel('Frequency')\n",
- "plt.title('Distribution of Residuals')\n",
+ "plt.xlabel(\"Residuals\")\n",
+ "plt.ylabel(\"Frequency\")\n",
+ "plt.title(\"Distribution of Residuals\")\n",
"\n",
"plt.show()"
]
@@ -1255,20 +1299,24 @@
"# either drop the NA or impute them\n",
"column_transformer = ColumnTransformer(\n",
" transformers=[\n",
- " ('cat', SimpleImputer(missing_values=np.nan, strategy=\"constant\", fill_value=\"Missing_Value\"), cat_features),\n",
- " ('num', SimpleImputer(missing_values=np.nan, strategy=\"median\"), num_features)\n",
+ " (\n",
+ " \"cat\",\n",
+ " SimpleImputer(\n",
+ " missing_values=np.nan, strategy=\"constant\", fill_value=\"Missing_Value\"\n",
+ " ),\n",
+ " cat_features,\n",
+ " ),\n",
+ " (\"num\", SimpleImputer(missing_values=np.nan, strategy=\"median\"), num_features),\n",
" ],\n",
- " remainder='passthrough',\n",
- " verbose_feature_names_out=False\n",
+ " remainder=\"passthrough\",\n",
+ " verbose_feature_names_out=False,\n",
").set_output(transform=\"pandas\")\n",
"\n",
"scaler = ColumnTransformer(\n",
- " transformers=[\n",
- " ('num', StandardScaler(), num_features)\n",
- " ],\n",
- " remainder='passthrough',\n",
- " verbose_feature_names_out=False\n",
- ").set_output(transform=\"pandas\")\n"
+ " transformers=[(\"num\", StandardScaler(), num_features)],\n",
+ " remainder=\"passthrough\",\n",
+ " verbose_feature_names_out=False,\n",
+ ").set_output(transform=\"pandas\")"
]
},
{
@@ -1369,15 +1417,22 @@
"X = X.drop(columns=[\"var10\", \"var11\"])\n",
"\n",
"# Create the pipeline\n",
- "pipeline = Pipeline([\n",
- " ('imputer', column_transformer),\n",
- " ('scaler', scaler),\n",
- " ('preprocess', PatsyTransformer()), # column_transformer PatsyTransformer()\n",
- " ('selector', LassoFeatureSelection(n_iterations=10, score=\"bic\", family=\"gaussian\", fit_intercept=True))\n",
- "])\n",
+ "pipeline = Pipeline(\n",
+ " [\n",
+ " (\"imputer\", column_transformer),\n",
+ " (\"scaler\", scaler),\n",
+ " (\"preprocess\", PatsyTransformer()), # column_transformer PatsyTransformer()\n",
+ " (\n",
+ " \"selector\",\n",
+ " LassoFeatureSelection(\n",
+ " n_iterations=10, score=\"bic\", family=\"gaussian\", fit_intercept=True\n",
+ " ),\n",
+ " ),\n",
+ " ]\n",
+ ")\n",
"\n",
"# Fit the model\n",
- "pipeline.fit(X, y) #, selector__sample_weight=w)\n"
+ "pipeline.fit(X, y) # , selector__sample_weight=w)"
]
},
{
@@ -2342,18 +2397,28 @@
"\n",
"# main parameter controlling how agressive will be the auto-grouping\n",
"lgb_params = {\"min_split_gain\": 0.05}\n",
- "pipeline = Pipeline([\n",
- " (\"disctretizer\", TreeDiscretizer(bin_features=\"all\", n_bins=10, boost_params=lgb_params)),\n",
- " (\"midpointer\", IntervalToMidpoint()),\n",
- " (\"zero_variance\", UniqueValuesThreshold()),\n",
- " # the treediscretization might introduce NaN for pure noise columns\n",
- " (\"missing\", MissingValueThreshold(0.05)),\n",
- " ('scaler', scaler),\n",
- " ('preprocess', PatsyTransformer()), # column_transformer PatsyTransformer()\n",
- " ('selector', LassoFeatureSelection(n_iterations=10, score=\"bic\", family=\"gaussian\", fit_intercept=True))\n",
- "])\n",
+ "pipeline = Pipeline(\n",
+ " [\n",
+ " (\n",
+ " \"disctretizer\",\n",
+ " TreeDiscretizer(bin_features=\"all\", n_bins=10, boost_params=lgb_params),\n",
+ " ),\n",
+ " (\"midpointer\", IntervalToMidpoint()),\n",
+ " (\"zero_variance\", UniqueValuesThreshold()),\n",
+ " # the treediscretization might introduce NaN for pure noise columns\n",
+ " (\"missing\", MissingValueThreshold(0.05)),\n",
+ " (\"scaler\", scaler),\n",
+ " (\"preprocess\", PatsyTransformer()), # column_transformer PatsyTransformer()\n",
+ " (\n",
+ " \"selector\",\n",
+ " LassoFeatureSelection(\n",
+ " n_iterations=10, score=\"bic\", family=\"gaussian\", fit_intercept=True\n",
+ " ),\n",
+ " ),\n",
+ " ]\n",
+ ")\n",
"\n",
- "pipeline.fit(X, y)\n"
+ "pipeline.fit(X, y)"
]
},
{
diff --git a/src/arfs/association.py b/src/arfs/association.py
index 5fb1048..3f23094 100644
--- a/src/arfs/association.py
+++ b/src/arfs/association.py
@@ -17,7 +17,6 @@
import scipy.stats as ss
-
from mpl_toolkits.axes_grid1 import make_axes_locatable
from sklearn.utils import as_float_array, safe_sqr, safe_mask
from multiprocessing import cpu_count
@@ -1683,14 +1682,14 @@ def f_stat_regression_parallel(
handle_na : str, optional
Either drop rows with NA, fill NA with 0, or do nothing, by default "drop".
force_finite : bool, optional
- Whether or not to force the F-statistics and associated p-values to be finite.
+ Whether or not to force the F-statistics and associated p-values to be finite.
There are two cases where the F-statistic is expected to not be finite:
- - When the target `y` or some features in `X` are constant. In this case,
- the Pearson's R correlation is not defined leading to obtain `np.nan`
- values in the F-statistic and p-value. When `force_finite=True`, the
+ - When the target `y` or some features in `X` are constant. In this case,
+ the Pearson's R correlation is not defined leading to obtain `np.nan`
+ values in the F-statistic and p-value. When `force_finite=True`, the
F-statistic is set to `0.0` and the associated p-value is set to `1.0`.
- - When a feature in `X` is perfectly correlated (or anti-correlated)
- with the target `y`. In this case, the F-statistic is expected to be `np.inf`.
+ - When a feature in `X` is perfectly correlated (or anti-correlated)
+ with the target `y`. In this case, the F-statistic is expected to be `np.inf`.
When `force_finite=True`, the F-statistic is set to `np.finfo(dtype).max`.
Returns
@@ -1924,14 +1923,14 @@ def f_stat_classification_parallel(
handle_na : str, optional
Either drop rows with NA, fill NA with 0, or do nothing, by default "drop".
force_finite : bool, optional
- Whether or not to force the F-statistics and associated p-values to be finite.
+ Whether or not to force the F-statistics and associated p-values to be finite.
There are two cases where the F-statistic is expected to not be finite:
- - When the target `y` or some features in `X` are constant. In this case,
- the Pearson's R correlation is not defined leading to obtain `np.nan`
- values in the F-statistic and p-value. When `force_finite=True`, the
+ - When the target `y` or some features in `X` are constant. In this case,
+ the Pearson's R correlation is not defined leading to obtain `np.nan`
+ values in the F-statistic and p-value. When `force_finite=True`, the
F-statistic is set to `0.0` and the associated p-value is set to `1.0`.
- - When a feature in `X` is perfectly correlated (or anti-correlated)
- with the target `y`. In this case, the F-statistic is expected to be `np.inf`.
+ - When a feature in `X` is perfectly correlated (or anti-correlated)
+ with the target `y`. In this case, the F-statistic is expected to be `np.inf`.
When `force_finite=True`, the F-statistic is set to `np.finfo(dtype).max`.
Returns
@@ -2402,12 +2401,16 @@ def plot_association_matrix_int(
try:
import holoviews as hv
except ImportError:
- raise ImportError("Holoviews is not installed. Please install it using 'pip install holoviews'.")
-
+ raise ImportError(
+ "Holoviews is not installed. Please install it using 'pip install holoviews'."
+ )
+
try:
import panel as pn
except ImportError:
- raise ImportError("Panel is not installed. Please install it using 'pip install panel'.")
+ raise ImportError(
+ "Panel is not installed. Please install it using 'pip install panel'."
+ )
cmap = cmap if cmap is not None else "coolwarm"
diff --git a/src/arfs/feature_selection/allrelevant.py b/src/arfs/feature_selection/allrelevant.py
index eda0cfb..52bb129 100644
--- a/src/arfs/feature_selection/allrelevant.py
+++ b/src/arfs/feature_selection/allrelevant.py
@@ -319,7 +319,7 @@ def fit(self, X, y, sample_weight=None):
except ImportError:
warnings.warn("fasttreeshap is not installed. Fallback to shap.")
self.importance = "shap"
-
+
if isinstance(X, pd.DataFrame):
self.feature_names_in_ = X.columns.to_numpy()
else:
@@ -438,7 +438,7 @@ def _fit(self, X_raw, y, sample_weight=None):
# only sklearn requires to fillna data
# modern GBM implementations can handle this
- #X = X.fillna(0)
+ # X = X.fillna(0)
y = pd.Series(y).fillna(0) if not isinstance(y, pd.Series) else y.fillna(0)
# check input params
@@ -501,7 +501,11 @@ def _get_tree_num(self, n_feat):
n_estimators : int
the number of trees
"""
- depth = self.estimator.get_params()["max_depth"] if not self.is_cat else self.estimator.get_param("max_depth")
+ depth = (
+ self.estimator.get_params()["max_depth"]
+ if not self.is_cat
+ else self.estimator.get_param("max_depth")
+ )
if depth is None:
depth = 10
# how many times a feature should be considered on average
@@ -1259,7 +1263,8 @@ def _get_shap_imp(estimator, X, y, sample_weight=None, cat_feature=None):
else:
shap_imp = np.abs(shap_values).mean(0)
return shap_imp
-
+
+
def _get_shap_imp_fast(estimator, X, y, sample_weight=None, cat_feature=None):
"""Get the SHAP feature importance using the fasttreeshap implementation
@@ -1289,7 +1294,12 @@ def _get_shap_imp_fast(estimator, X, y, sample_weight=None, cat_feature=None):
model, X_tt, y_tt, w_tt = _split_fit_estimator(
estimator, X, y, sample_weight=sample_weight, cat_feature=cat_feature
)
- explainer = FastTreeExplainer(model, algorithm="auto", shortcut=False, feature_perturbation="tree_path_dependent")
+ explainer = FastTreeExplainer(
+ model,
+ algorithm="auto",
+ shortcut=False,
+ feature_perturbation="tree_path_dependent",
+ )
shap_matrix = explainer.shap_values(X_tt)
# multiclass returns a list
# for binary and for some models, shap is still returning a list
@@ -1376,7 +1386,6 @@ def _get_imp(estimator, X, y, sample_weight=None, cat_feature=None):
X, _, cat_idx = get_pandas_cat_codes(X)
else:
cat_idx = cat_feature
-
# handle catboost and cat features
if is_catboost(estimator) or (
@@ -1551,7 +1560,7 @@ def fit(self, X, y, sample_weight=None):
except ImportError:
warnings.warn("fasttreeshap is not installed. Fallback to shap.")
self.importance = "shap"
-
+
if isinstance(X, pd.DataFrame):
self.feature_names_in_ = X.columns.to_numpy()
else:
@@ -1559,7 +1568,6 @@ def fit(self, X, y, sample_weight=None):
if sample_weight is not None:
sample_weight = pd.Series(_check_sample_weight(sample_weight, X))
-
# crit, keep_vars, df_vimp, mean_shadow
_, self.selected_features_, self.sha_cutoff_df, self.mean_shadow = _boostaroota(
@@ -1755,7 +1763,12 @@ def _reduce_vars_sklearn(
for i in range(1, n_iterations + 1):
# Create the shadow variables and run the model to obtain importances
new_x, shadow_names = _create_shadow(X)
- imp_func = {"shap": _get_shap_imp, "fastshap": _get_shap_imp, "pimp": _get_perm_imp, "native": _get_imp}
+ imp_func = {
+ "shap": _get_shap_imp,
+ "fastshap": _get_shap_imp,
+ "pimp": _get_perm_imp,
+ "native": _get_imp,
+ }
importance = imp_func[imp_kind](
estimator, new_x, y, sample_weight=weight, cat_feature=cat_feature
)
@@ -1915,7 +1928,7 @@ class GrootCV(SelectorMixin, BaseEstimator):
it improves the convergence (needs less evaluation to find a threshold)
- Not based on a given percentage of cols needed to be deleted
- Plot method for var. imp
-
+
Parameters
----------
objective: str
@@ -1939,8 +1952,8 @@ class GrootCV(SelectorMixin, BaseEstimator):
n_jobs: int, default 0
0 means default number of threads in OpenMP
for the best speed, set this to the number of real CPU cores, not the number of threads
-
-
+
+
Attributes
----------
selected_features_: list of str
@@ -1970,7 +1983,16 @@ class GrootCV(SelectorMixin, BaseEstimator):
"""
def __init__(
- self, objective=None, cutoff=1, n_folds=5, n_iter=5, silent=True, rf=False, fastshap=False, n_jobs=0, lgbm_params=None
+ self,
+ objective=None,
+ cutoff=1,
+ n_folds=5,
+ n_iter=5,
+ silent=True,
+ rf=False,
+ fastshap=False,
+ n_jobs=0,
+ lgbm_params=None,
):
self.objective = objective
self.cutoff = cutoff
@@ -1978,13 +2000,13 @@ def __init__(
self.n_iter = n_iter
self.silent = silent
self.rf = rf
- self.fastshap = fastshap
+ self.fastshap = fastshap
self.cv_df = None
self.sha_cutoff = None
self.ranking_absolutes_ = None
self.ranking_ = None
self.lgbm_params = lgbm_params
- self.n_jobs=n_jobs
+ self.n_jobs = n_jobs
# Throw errors if the inputted parameters don't meet the necessary criteria
# Ensure parameters meet necessary criteria
@@ -2033,7 +2055,7 @@ def fit(self, X, y, sample_weight=None):
rf=self.rf,
fastshap=self.fastshap,
lgbm_params=self.lgbm_params,
- n_jobs=self.n_jobs
+ n_jobs=self.n_jobs,
)
self.selected_features_ = self.selected_features_.values
@@ -2139,7 +2161,20 @@ def plot_importance(self, n_feat_per_inch=5):
########################################################################################
-def _reduce_vars_lgb_cv(X, y, objective, n_folds, cutoff, n_iter, silent, weight, rf, fastshap, lgbm_params=None, n_jobs=0):
+def _reduce_vars_lgb_cv(
+ X,
+ y,
+ objective,
+ n_folds,
+ cutoff,
+ n_iter,
+ silent,
+ weight,
+ rf,
+ fastshap,
+ lgbm_params=None,
+ n_jobs=0,
+):
"""
Reduce the number of predictors using a lightgbm (python API)
@@ -2180,7 +2215,15 @@ def _reduce_vars_lgb_cv(X, y, objective, n_folds, cutoff, n_iter, silent, weight
the feature importance threshold, to reject or not the predictors
"""
- params = _set_lgb_parameters(X=X, y=y, objective=objective, rf=rf, silent=silent, n_jobs=n_jobs, lgbm_params=lgbm_params)
+ params = _set_lgb_parameters(
+ X=X,
+ y=y,
+ objective=objective,
+ rf=rf,
+ silent=silent,
+ n_jobs=n_jobs,
+ lgbm_params=lgbm_params,
+ )
dtypes_dic = create_dtype_dict(X, dic_keys="dtypes")
category_cols = dtypes_dic["cat"] + dtypes_dic["time"] + dtypes_dic["unk"]
@@ -2213,7 +2256,9 @@ def _reduce_vars_lgb_cv(X, y, objective, n_folds, cutoff, n_iter, silent, weight
**params,
)
- importance = _compute_importance(new_x_tr, shap_matrix, params, objective, fastshap)
+ importance = _compute_importance(
+ new_x_tr, shap_matrix, params, objective, fastshap
+ )
df = _merge_importance_df(
df=df,
importance=importance,
@@ -2239,7 +2284,13 @@ def _reduce_vars_lgb_cv(X, y, objective, n_folds, cutoff, n_iter, silent, weight
def _set_lgb_parameters(
- X: np.ndarray, y: np.ndarray, objective: str, rf: bool, silent: bool, n_jobs: int = 0, lgbm_params: dict = None
+ X: np.ndarray,
+ y: np.ndarray,
+ objective: str,
+ rf: bool,
+ silent: bool,
+ n_jobs: int = 0,
+ lgbm_params: dict = None,
) -> dict:
"""Set parameters for a LightGBM model based on the input features and the objective.
@@ -2290,7 +2341,17 @@ def _set_lgb_parameters(
}
)
- clf_losses = [ "binary", "softmax", "multi_logloss", "multiclassova", "multiclass", "multiclass_ova", "ova", "ovr", "binary_logloss"]
+ clf_losses = [
+ "binary",
+ "softmax",
+ "multi_logloss",
+ "multiclassova",
+ "multiclass",
+ "multiclass_ova",
+ "ova",
+ "ovr",
+ "binary_logloss",
+ ]
if objective in clf_losses:
y = y.astype(int)
y_freq_table = pd.Series(y.fillna(0)).value_counts(normalize=True)
@@ -2307,17 +2368,26 @@ def _set_lgb_parameters(
params.update({"is_unbalance": True})
params.update({"num_threads": n_jobs})
-
+
# we are using early_stopping
# we prevent the overridding of it by popping the n_iterations
- keys_to_pop = ['num_iterations', 'num_iteration', 'n_iter', 'num_tree', 'num_trees',
- 'num_round', 'num_rounds', 'nrounds', 'num_boost_round', 'n_estimators', 'max_iter']
+ keys_to_pop = [
+ "num_iterations",
+ "num_iteration",
+ "n_iter",
+ "num_tree",
+ "num_trees",
+ "num_round",
+ "num_rounds",
+ "nrounds",
+ "num_boost_round",
+ "n_estimators",
+ "max_iter",
+ ]
for key in keys_to_pop:
params.pop(key, None)
-
- return params
-
+ return params
def _split_data(X, y, tridx, validx, weight=None):
@@ -2427,13 +2497,20 @@ def _train_lgb_model(
try:
from fasttreeshap import TreeExplainer as FastTreeExplainer
except ImportError:
- raise ImportError("fasttreeshap is not installed. Please install it using 'pip/conda install fasttreeshap'.")
-
- explainer = FastTreeExplainer(bst, algorithm="auto", shortcut=False, feature_perturbation="tree_path_dependent")
+ raise ImportError(
+ "fasttreeshap is not installed. Please install it using 'pip/conda install fasttreeshap'."
+ )
+
+ explainer = FastTreeExplainer(
+ bst,
+ algorithm="auto",
+ shortcut=False,
+ feature_perturbation="tree_path_dependent",
+ )
shap_matrix = explainer.shap_values(X_train)
else:
shap_matrix = bst.predict(X_train, pred_contrib=True)
-
+
return bst, shap_matrix, bst.best_iteration
diff --git a/src/arfs/feature_selection/lasso.py b/src/arfs/feature_selection/lasso.py
index df0d545..f532719 100644
--- a/src/arfs/feature_selection/lasso.py
+++ b/src/arfs/feature_selection/lasso.py
@@ -8,29 +8,31 @@
from joblib import Parallel, delayed
from typing import Any, Callable, Union, List, Tuple, Optional, Dict, Literal
-def _map_family_link(family: str = "gaussian", link: Optional[str]=None):
+
+def _map_family_link(family: str = "gaussian", link: Optional[str] = None):
family_mapping = {
"gaussian": sm.families.Gaussian,
"binomial": sm.families.Binomial,
"poisson": sm.families.Poisson,
"gamma": sm.families.Gamma,
"negativebinomial": sm.families.NegativeBinomial,
- "tweedie": sm.families.Tweedie
- }
+ "tweedie": sm.families.Tweedie,
+ }
link_mapping = {
"identity": sm.genmod.families.links.Identity(),
"log": sm.genmod.families.links.Log(),
"logit": sm.genmod.families.links.Logit(),
"probit": sm.genmod.families.links.Probit(),
"cloglog": sm.genmod.families.links.CLogLog(),
- "inverse_squared": sm.genmod.families.links.InverseSquared()
- }
+ "inverse_squared": sm.genmod.families.links.InverseSquared(),
+ }
if link is not None:
objective = family_mapping[family](link_mapping[link])
else:
objective = family_mapping[family]()
return objective
+
class EnetGLM(BaseEstimator, RegressorMixin):
"""
Elastic Net Generalized Linear Model.
@@ -49,23 +51,30 @@ class EnetGLM(BaseEstimator, RegressorMixin):
Whether to fit an intercept term in the model.
"""
- def __init__(self, family: str = "gaussian", link: Optional[str] = None, alpha: float = 0.0, L1_wt: float = 0.0, fit_intercept: bool = True):
+ def __init__(
+ self,
+ family: str = "gaussian",
+ link: Optional[str] = None,
+ alpha: float = 0.0,
+ L1_wt: float = 0.0,
+ fit_intercept: bool = True,
+ ):
"""
Initialize self.
Parameters
----------
- family :
+ family :
The distributional assumption of the model.
link:
the GLM link function
- alpha :
+ alpha :
The elastic net mixing parameter. 0 <= alpha <= 1.
alpha = 0 is equivalent to ridge regression, alpha = 1 is equivalent to lasso regression.
- L1_wt :
+ L1_wt :
The weight of the L1 penalty term. 0 <= L1_wt <= 1.
L1_wt = 0 is equivalent to ridge regression, L1_wt = 1 is equivalent to lasso regression.
- fit_intercept :
+ fit_intercept :
Whether to fit an intercept term in the model.
"""
self.family = family
@@ -77,36 +86,41 @@ def __init__(self, family: str = "gaussian", link: Optional[str] = None, alpha:
self.fit_intercept = fit_intercept
self.objective = _map_family_link(family=family, link=link)
- def fit(self, X: pd.DataFrame, y: Union[np.ndarray, pd.Series], sample_weight: Optional[Union[np.ndarray, pd.Series]] = None):
+ def fit(
+ self,
+ X: pd.DataFrame,
+ y: Union[np.ndarray, pd.Series],
+ sample_weight: Optional[Union[np.ndarray, pd.Series]] = None,
+ ):
"""
Fit the model to the data.
-
+
Notes
-----
- In statsmodels and GLMs in general, you can use either an offset or a weight to account for
- differences in exposure between observations. However, if you choose to use an offset,
- you need to pass the number of cases (ncl) instead of the frequency and set the offset to
- the logarithm of the exposure due to the log link function. It is recommended to use the frequency
- and the weights instead of the offset because this ensures that all models have the same inputs.
+ In statsmodels and GLMs in general, you can use either an offset or a weight to account for
+ differences in exposure between observations. However, if you choose to use an offset,
+ you need to pass the number of cases (ncl) instead of the frequency and set the offset to
+ the logarithm of the exposure due to the log link function. It is recommended to use the frequency
+ and the weights instead of the offset because this ensures that all models have the same inputs.
To use the frequency and the weights, you can fit the model using the following code:
-
+
```python
self.model = sm.GLM(endog=y, exog=X, var_weights=sample_weight, family=self.family)
```
-
+
This is equivalent to using the exposure and the log of the exposure internally, which can be done using the following code:
-
+
```python
self.model = sm.GLM(endog=y, exog=sm.add_constant(X), exposure=sample_weight, family=sm.families.Poisson())
self.result = self.model.fit()
```
-
+
Parameters
----------
- X :
+ X :
array-like, shape (n_samples, n_features)
The input data.
- y :
+ y :
array-like, shape (n_samples,)
The target values.
sample_weight : array-like, shape (n_samples,), optional (default=None)
@@ -120,24 +134,25 @@ def fit(self, X: pd.DataFrame, y: Union[np.ndarray, pd.Series], sample_weight: O
if not isinstance(X, pd.DataFrame):
X = pd.DataFrame(X)
X.columns = [f"pred_{i}" for i in range(X.shape[1])]
-
+
if self.fit_intercept:
X = sm.add_constant(X)
- X = X.rename(columns={'const': 'Intercept'})
+ X = X.rename(columns={"const": "Intercept"})
else:
X = drop_existing_sm_constant_from_df(X)
self.n_features_in_ = X.shape[1]
-
+
self.model = sm.GLM(
endog=y,
exog=X,
var_weights=sample_weight,
family=self.objective,
)
-
-
- self.result = self.model.fit_regularized(method="elastic_net", alpha=self.alpha, L1_wt=self.L1_wt, refit=True)
+
+ self.result = self.model.fit_regularized(
+ method="elastic_net", alpha=self.alpha, L1_wt=self.L1_wt, refit=True
+ )
self.coef_ = self.result.params
self.bse_ = self.result.bse
self.deviance_ = self.result.deviance
@@ -148,17 +163,16 @@ def fit(self, X: pd.DataFrame, y: Union[np.ndarray, pd.Series], sample_weight: O
self.tvalues_ = self.result.tvalues
self.pearson_chi2_ = self.result.pearson_chi2
-
def predict(self, X):
"""
Predict using the fitted model.
Parameters
----------
- X :
+ X :
array-like, shape (n_samples, n_features)
The input data.
-
+
Returns
-------
y : array-like, shape (n_samples,)
@@ -171,14 +185,14 @@ def predict(self, X):
"""
if self.model is None:
raise ValueError("Fit the model first.")
-
+
if not isinstance(X, pd.DataFrame):
X = pd.DataFrame(X)
X.columns = [f"pred_{i}" for i in range(X.shape[1])]
-
+
if self.fit_intercept:
X = sm.add_constant(X)
- X = X.rename(columns={'const': 'Intercept'})
+ X = X.rename(columns={"const": "Intercept"})
return self.result.predict()
@@ -193,13 +207,18 @@ def get_coef(self):
"""
return self.coef_
- def score(self, X: pd.DataFrame, y: pd.Series, sample_weight: Optional[Union[np.ndarray, pd.Series]] = None):
+ def score(
+ self,
+ X: pd.DataFrame,
+ y: pd.Series,
+ sample_weight: Optional[Union[np.ndarray, pd.Series]] = None,
+ ):
"""
Return the deviance of the fitted model.
Parameters
----------
- X :
+ X :
array-like, shape (n_samples, n_features)
The input data.
sample_weight : array-like, shape (n_samples,), optional (default=None)
@@ -214,7 +233,7 @@ def score(self, X: pd.DataFrame, y: pd.Series, sample_weight: Optional[Union[np.
var_weights = sample_weight if sample_weight is not None else 1.0
return self.objective.deviance(endog=y, mu=mu, var_weights=var_weights)
-
+
def summary(self):
"""
Print a summary of the fitted model.
@@ -226,9 +245,8 @@ def summary(self):
"""
return self.result.summary()
-def weighted_cross_val_score(
- estimator, X, y, sample_weight=None, cv=5, n_jobs=-1
-):
+
+def weighted_cross_val_score(estimator, X, y, sample_weight=None, cv=5, n_jobs=-1):
"""
Perform cross-validation for a scikit-learn estimator with a score function that requires sample_weight.
@@ -244,7 +262,7 @@ def weighted_cross_val_score(
The sample weights for each data point.
cv : int, default=5
The number of cross-validation folds.
- n_jobs:
+ n_jobs:
the number of processes
Returns
@@ -258,22 +276,27 @@ def weighted_cross_val_score(
# logging.info("Starting cross-validation...")
- splitter = KFold(n_splits=cv) if len(np.unique(y)) > 2 else StratifiedKFold(n_splits=cv)
-
- if not hasattr(estimator, 'score') or not callable(getattr(estimator, 'score')):
+ splitter = (
+ KFold(n_splits=cv) if len(np.unique(y)) > 2 else StratifiedKFold(n_splits=cv)
+ )
+
+ if not hasattr(estimator, "score") or not callable(getattr(estimator, "score")):
raise ValueError(
"The estimator does not have a score method that takes a sample_weight argument."
)
with Parallel(n_jobs=-1) as parallel:
scores = parallel(
- delayed(_fit_and_score)(estimator, X, y, train_index, test_index, sample_weight)
+ delayed(_fit_and_score)(
+ estimator, X, y, train_index, test_index, sample_weight
+ )
for train_index, test_index in splitter.split(X)
)
# logging.info("Finished cross-validation.")
return scores
+
def _fit_and_score(
estimator: BaseEstimator,
X: Union[pd.DataFrame, np.ndarray],
@@ -316,7 +339,11 @@ def _fit_and_score(
y_train, y_test = y[train_index], y[test_index]
if sample_weight is not None:
- sample_weight = sample_weight.values if isinstance(sample_weight, pd.Series) else sample_weight
+ sample_weight = (
+ sample_weight.values
+ if isinstance(sample_weight, pd.Series)
+ else sample_weight
+ )
sample_weight_train = sample_weight[train_index]
sample_weight_test = sample_weight[test_index]
estimator.fit(X_train, y_train, sample_weight=sample_weight_train)
@@ -328,7 +355,6 @@ def _fit_and_score(
return score
-
def grid_search_cv(
X: Union[pd.DataFrame, np.ndarray],
y: Union[pd.Series, np.ndarray],
@@ -380,22 +406,39 @@ def grid_search_cv(
for param in grid:
estimator = clone(estimator)
- estimator.set_params(**{'alpha': param, 'L1_wt': 1.0, 'fit_intercept': fit_intercept, 'family': family})
+ estimator.set_params(
+ **{
+ "alpha": param,
+ "L1_wt": 1.0,
+ "fit_intercept": fit_intercept,
+ "family": family,
+ }
+ )
if score == "bic":
estimator.fit(X=X, y=y, sample_weight=sample_weight)
param_score.append(estimator.bic_)
else:
- scores = weighted_cross_val_score(estimator, X, y, sample_weight=sample_weight, cv=5, n_jobs=-1)
+ scores = weighted_cross_val_score(
+ estimator, X, y, sample_weight=sample_weight, cv=5, n_jobs=-1
+ )
param_score.append(np.mean(scores))
# min deviance or min BIC
best_alpha_value = grid[np.argmin(param_score)]
best_estimator = clone(estimator)
- best_estimator.set_params(**{'alpha': best_alpha_value, 'L1_wt': 1.0, 'fit_intercept': fit_intercept, 'family': family})
+ best_estimator.set_params(
+ **{
+ "alpha": best_alpha_value,
+ "L1_wt": 1.0,
+ "fit_intercept": fit_intercept,
+ "family": family,
+ }
+ )
best_estimator.fit(X, y, sample_weight=sample_weight)
return best_estimator
+
class LassoFeatureSelection(BaseEstimator, TransformerMixin):
"""
LassoFeatureSelection performs feature selection using GLM Lasso regularization.
@@ -437,7 +480,13 @@ class LassoFeatureSelection(BaseEstimator, TransformerMixin):
"""
- def __init__(self, family: str = "gaussian", n_iterations: int = 10, score: str = "bic", fit_intercept: bool = True):
+ def __init__(
+ self,
+ family: str = "gaussian",
+ n_iterations: int = 10,
+ score: str = "bic",
+ fit_intercept: bool = True,
+ ):
self.family = family
self.n_iterations = n_iterations
self.best_estimator_ = None
@@ -474,13 +523,25 @@ def fit(
if not isinstance(X, pd.DataFrame):
X = pd.DataFrame(X)
X.columns = [f"pred_{i}" for i in range(X.shape[1])]
-
+
if not self.fit_intercept:
X = drop_existing_sm_constant_from_df(X)
-
- self.feature_names_in_ = X.columns.insert(0, "Intercept") if self.fit_intercept and "Intercept" not in X.columns else X.columns
- self.best_estimator_ = grid_search_cv(family=self.family, X=X, y=y, sample_weight=sample_weight, n_iterations=self.n_iterations, score=self.score, fit_intercept=self.fit_intercept)
+ self.feature_names_in_ = (
+ X.columns.insert(0, "Intercept")
+ if self.fit_intercept and "Intercept" not in X.columns
+ else X.columns
+ )
+
+ self.best_estimator_ = grid_search_cv(
+ family=self.family,
+ X=X,
+ y=y,
+ sample_weight=sample_weight,
+ n_iterations=self.n_iterations,
+ score=self.score,
+ fit_intercept=self.fit_intercept,
+ )
self.support_ = self.best_estimator_.coef_ != 0
self.selected_features_ = self.feature_names_in_[self.support_]
return self
@@ -501,17 +562,16 @@ def transform(self, X: Union[pd.DataFrame, np.ndarray]) -> pd.DataFrame:
"""
-
if self.fit_intercept:
X = sm.add_constant(X)
-
+
if not isinstance(X, pd.DataFrame):
X = pd.DataFrame(X)
- # if not a DF, assuming the col orders is
+ # if not a DF, assuming the col orders is
# the same, as required anyway
X.columns = self.feature_names_in_
-
- X = X.rename(columns={'const': 'Intercept'}) if "const" in X.columns else X
+
+ X = X.rename(columns={"const": "Intercept"}) if "const" in X.columns else X
return X[self.selected_features_]
def get_feature_names_out(self) -> np.ndarray:
@@ -525,9 +585,10 @@ def get_feature_names_out(self) -> np.ndarray:
"""
return self.feature_names_in_[self.support_]
-
+
+
def drop_existing_sm_constant_from_df(X):
- X = X.drop(columns=['Intercept']) if 'Intercept' in X.columns else X
- X = X.drop(columns=['const']) if 'const' in X.columns else X
- X = X.drop(columns=['intercept']) if 'intercept' in X.columns else X
- return X
\ No newline at end of file
+ X = X.drop(columns=["Intercept"]) if "Intercept" in X.columns else X
+ X = X.drop(columns=["const"]) if "const" in X.columns else X
+ X = X.drop(columns=["intercept"]) if "intercept" in X.columns else X
+ return X
diff --git a/src/arfs/feature_selection/mrmr.py b/src/arfs/feature_selection/mrmr.py
index 44dd4d4..5a30546 100644
--- a/src/arfs/feature_selection/mrmr.py
+++ b/src/arfs/feature_selection/mrmr.py
@@ -162,7 +162,7 @@ def fit(self, X, y, sample_weight=None):
if not isinstance(y, pd.Series):
y = pd.Series(y)
-
+
y.name = "target"
target = y.copy()
diff --git a/src/arfs/feature_selection/unsupervised.py b/src/arfs/feature_selection/unsupervised.py
index d4681f4..468566b 100644
--- a/src/arfs/feature_selection/unsupervised.py
+++ b/src/arfs/feature_selection/unsupervised.py
@@ -39,13 +39,15 @@ def _missing_ratio(df):
raise TypeError("df should be a pandas DataFrame")
numeric_columns = df.select_dtypes(np.number).columns
n_samples = len(df)
-
+
missing_counts = {}
for column in df.columns:
if column in numeric_columns:
- missing_counts[column] = (df[column].isnull().sum() + np.isinf(df[column]).sum())/n_samples
+ missing_counts[column] = (
+ df[column].isnull().sum() + np.isinf(df[column]).sum()
+ ) / n_samples
else:
- missing_counts[column] = df[column].isnull().sum()/n_samples
+ missing_counts[column] = df[column].isnull().sum() / n_samples
return pd.Series(missing_counts)
diff --git a/src/arfs/feature_selection/variable_importance.py b/src/arfs/feature_selection/variable_importance.py
index a1eca7c..35fe214 100644
--- a/src/arfs/feature_selection/variable_importance.py
+++ b/src/arfs/feature_selection/variable_importance.py
@@ -155,7 +155,7 @@ def fit(self, X, y, sample_weight=None):
verbose=self.verbose,
encoder_kwargs=self.encoder_kwargs,
lgb_kwargs=self.lgb_kwargs,
- fastshap=self.fastshap
+ fastshap=self.fastshap,
)
self.feature_importances_summary_ = feature_importances
@@ -372,7 +372,12 @@ def _compute_varimp_lgb(
# )
# perm_imp = perm_imp.importances_mean
if fastshap:
- explainer = FastTreeExplainer(gbm_model.model, algorithm="auto", shortcut=False, feature_perturbation="tree_path_dependent")
+ explainer = FastTreeExplainer(
+ gbm_model.model,
+ algorithm="auto",
+ shortcut=False,
+ feature_perturbation="tree_path_dependent",
+ )
shap_matrix = explainer.shap_values(gbm_model.valid_features)
if isinstance(shap_matrix, list):
# For LightGBM classifier, RF, in sklearn API, SHAP returns a list of arrays
@@ -381,7 +386,9 @@ def _compute_varimp_lgb(
else:
shap_imp = np.abs(shap_matrix).mean(0)
else:
- shap_matrix = gbm_model.model.predict(gbm_model.valid_features, pred_contrib=True)
+ shap_matrix = gbm_model.model.predict(
+ gbm_model.valid_features, pred_contrib=True
+ )
# the dim changed in lightGBM >= 3.0.0
if task == "multiclass":
# X_SHAP_values (array-like of shape = [n_samples, n_features + 1]
diff --git a/src/arfs/preprocessing.py b/src/arfs/preprocessing.py
index 88b6711..b852ae2 100644
--- a/src/arfs/preprocessing.py
+++ b/src/arfs/preprocessing.py
@@ -397,33 +397,33 @@ def cat_var(data, col_excl=None, return_cat=True):
class TreeDiscretizer(BaseEstimator, TransformerMixin):
- """The purpose of the function is to discretize continuous and/or categorical data, returning a pandas DataFrame.
- It is designed to support regression and binary classification tasks. Discretization, also known as quantization or binning,
- allows for the partitioning of continuous features into discrete values. In certain datasets with continuous attributes,
- discretization can be beneficial as it transforms the dataset into one with only nominal attributes.
+ """The purpose of the function is to discretize continuous and/or categorical data, returning a pandas DataFrame.
+ It is designed to support regression and binary classification tasks. Discretization, also known as quantization or binning,
+ allows for the partitioning of continuous features into discrete values. In certain datasets with continuous attributes,
+ discretization can be beneficial as it transforms the dataset into one with only nominal attributes.
Additionally, for categorical predictors, grouping levels can help reduce overfitting and create meaningful clusters.
- By encoding discretized features, a model can become more expressive while maintaining interpretability.
- For example, preprocessing with a discretizer can introduce nonlinearity to linear models.
+ By encoding discretized features, a model can become more expressive while maintaining interpretability.
+ For example, preprocessing with a discretizer can introduce nonlinearity to linear models.
For more advanced possibilities, particularly smooth ones, you can refer to the section on generating polynomial features.
- The TreeDiscretizer function utilizes univariate regularized trees, with one tree per column to be binned.
- It finds the optimal partition and returns numerical intervals for numerical continuous columns and pd.Categorical for categorical columns.
+ The TreeDiscretizer function utilizes univariate regularized trees, with one tree per column to be binned.
+ It finds the optimal partition and returns numerical intervals for numerical continuous columns and pd.Categorical for categorical columns.
This approach groups similar levels together, reducing dimensionality and regularizing the model.
-
- TreeDiscretizer handles missing values for both numerical and categorical predictors,
+
+ TreeDiscretizer handles missing values for both numerical and categorical predictors,
eliminating the need for encoding categorical predictors separately.
Notes
-----
This is a substitution to proper regularization schemes such as:
- - GroupLasso: Categorical predictors, which are usually encoded as multiple dummy variables,
+ - GroupLasso: Categorical predictors, which are usually encoded as multiple dummy variables,
are considered together rather than separately.
- FusedLasso: Takes into account the ordering of the features.
Parameters
----------
bin_features : List of string or None
- The list of names of the variable that has to be binned, or "all", "numerical" or "categorical"
+ The list of names of the variable that has to be binned, or "all", "numerical" or "categorical"
for splitting and grouping all, only numerical or only categorical columns.
n_bins : int
The number of bins that has to be created while binning the variables in the "bin_features" list.
@@ -526,13 +526,13 @@ def fit(self, X, y, sample_weight=None):
DataFrame with the binned and grouped columns.
"""
X = X.copy()
-
+
if not isinstance(X, pd.DataFrame):
X = pd.DataFrame(X)
X.columns = [f"pred_{i}" for i in range(X.shape[1])]
-
+
self.feature_names_in_ = X.columns.to_numpy()
-
+
if self.bin_features is None:
self.bin_features = list(X.select_dtypes("number").columns)
self.cat_features = []
@@ -553,10 +553,14 @@ def fit(self, X, y, sample_weight=None):
):
self.bin_features = list(X.select_dtypes(["category", "object"]).columns)
self.cat_features = self.bin_features
-
+
self.n_unique_table_ = X[self.bin_features].nunique()
# transform only the columns with more than n_bins_max
- self.bin_features = self.n_unique_table_[self.n_unique_table_ > self.n_bins_max].index.to_list() if self.n_bins_max else self.bin_features
+ self.bin_features = (
+ self.n_unique_table_[self.n_unique_table_ > self.n_bins_max].index.to_list()
+ if self.n_bins_max
+ else self.bin_features
+ )
for col in self.bin_features:
is_categorical = (self.cat_features is not None) and (
@@ -580,8 +584,7 @@ def fit(self, X, y, sample_weight=None):
X[col] = dum.values.ravel()
else:
X[col] = dum.ravel()
-
-
+
else:
encoder = None
@@ -606,7 +609,7 @@ def fit(self, X, y, sample_weight=None):
X[col] = dum.values.ravel()
else:
X[col] = dum.ravel()
-
+
self.cat_bin_dict[col] = (
X[[f"{col}_g", col]]
.groupby(f"{col}_g")
@@ -644,7 +647,7 @@ def fit(self, X, y, sample_weight=None):
def transform(self, X):
"""Apply the discretizer on `X`. Only the columns with more than n_bins_max unique values will be transformed.
-
+
Parameters
----------
X : array-like of shape (n_samples, n_features)
@@ -686,7 +689,7 @@ def transform(self, X):
include_lowest=True,
precision=2,
)
-
+
if not self.num_bins_as_category:
X[col] = X[col].astype(IntervalDtype())
return X
@@ -756,6 +759,7 @@ def make_fs_summary(selector_pipe):
)
return tag_df
+
class IntervalToMidpoint(BaseEstimator, TransformerMixin):
"""
IntervalToMidpoint is a transformer that converts numerical intervals in a pandas DataFrame to their midpoints.
@@ -784,8 +788,7 @@ class IntervalToMidpoint(BaseEstimator, TransformerMixin):
Inverse transform is not implemented for this transformer.
"""
- def __init__(self, cols: Union[List[str], str]="all"):
-
+ def __init__(self, cols: Union[List[str], str] = "all"):
self.cols = cols
def fit(self, X: pd.DataFrame = None, y: pd.Series = None):
@@ -794,9 +797,9 @@ def fit(self, X: pd.DataFrame = None, y: pd.Series = None):
Parameters
----------
- X :
+ X :
The input data to fit the transformer on.
- y :
+ y :
Ignored parameter.
Returns
@@ -805,12 +808,16 @@ def fit(self, X: pd.DataFrame = None, y: pd.Series = None):
The fitted transformer object.
"""
data = X.copy()
-
+
if self.cols == "all":
self.cols = data.columns
-
- self.float_interval_cols_ = create_dtype_dict(X, dic_keys="dtypes")["num_interval"]
- self.columns_to_transform_ = list(set(self.cols).intersection(set(self.float_interval_cols_)))
+
+ self.float_interval_cols_ = create_dtype_dict(X, dic_keys="dtypes")[
+ "num_interval"
+ ]
+ self.columns_to_transform_ = list(
+ set(self.cols).intersection(set(self.float_interval_cols_))
+ )
return self
def transform(self, X: pd.DataFrame):
@@ -850,11 +857,14 @@ def inverse_transform(self, X: pd.DataFrame):
raise NotImplementedError(
"inverse_transform is not implemented for this transformer."
)
-
-def transform_interval_to_midpoint(X: pd.DataFrame, cols: Union[List[str], str] = "all") -> pd.DataFrame:
+
+
+def transform_interval_to_midpoint(
+ X: pd.DataFrame, cols: Union[List[str], str] = "all"
+) -> pd.DataFrame:
"""
Transforms interval columns in a pandas DataFrame to their midpoint values.
-
+
Notes
-----
Equivalent function to ``IntervalToMidpoint`` without the estimator API
@@ -868,17 +878,17 @@ def transform_interval_to_midpoint(X: pd.DataFrame, cols: Union[List[str], str]
Returns
-------
- pd.DataFrame :
+ pd.DataFrame :
The transformed DataFrame with interval columns replaced by their midpoint values.
Raises
------
- TypeError :
+ TypeError :
If the input data is not a pandas DataFrame.
"""
if cols == "all":
cols = X.columns
-
+
X = X.copy()
float_interval_cols_ = create_dtype_dict(X, dic_keys="dtypes")["num_interval"]
columns_to_transform_ = list(set(cols).intersection(set(float_interval_cols_)))
@@ -906,10 +916,13 @@ def find_interval_midpoint(interval_series: pd.Series) -> np.ndarray:
left_inf = np.isinf(left)
right_inf = np.isinf(right)
- return np.where(left_inf & right_inf, np.inf,
- np.where(left_inf, right,
- np.where(right_inf, left, mid)))
-
+ return np.where(
+ left_inf & right_inf,
+ np.inf,
+ np.where(left_inf, right, np.where(right_inf, left, mid)),
+ )
+
+
class PatsyTransformer(BaseEstimator, TransformerMixin):
"""Transformer using patsy-formulas.
@@ -956,8 +969,15 @@ class PatsyTransformer(BaseEstimator, TransformerMixin):
should not contain a left hand side. If you need to transform both
features and targets, use PatsyModel.
"""
- def __init__(self, formula=None, add_intercept=True, eval_env=0, NA_action="drop",
- return_type='dataframe'):
+
+ def __init__(
+ self,
+ formula=None,
+ add_intercept=True,
+ eval_env=0,
+ NA_action="drop",
+ return_type="dataframe",
+ ):
self.formula = formula
self.eval_env = eval_env
self.add_intercept = add_intercept
@@ -991,24 +1011,31 @@ def fit_transform(self, data, y=None):
return self._fit_transform(data, y)
def _fit_transform(self, data, y=None):
-
if not isinstance(data, pd.DataFrame):
data = pd.DataFrame(data)
data.columns = [f"pred_{i}" for i in range(data.shape[1])]
-
+
if not isinstance(y, pd.Series):
y = pd.Series(y)
y.name = "target"
-
+
target_name = y.name if y is not None else "y"
- self.formula = self.formula or " + ".join(data.columns.difference([target_name]))
+ self.formula = self.formula or " + ".join(
+ data.columns.difference([target_name])
+ )
eval_env = EvalEnvironment.capture(self.eval_env, reference=2)
# self.formula = _drop_intercept(self.formula, self.add_intercept)
- design = dmatrix(self.formula, data, NA_action=self.NA_action, return_type='dataframe', eval_env=eval_env)
+ design = dmatrix(
+ self.formula,
+ data,
+ NA_action=self.NA_action,
+ return_type="dataframe",
+ eval_env=eval_env,
+ )
self.design_ = design.design_info
- if self.return_type == 'dataframe':
+ if self.return_type == "dataframe":
return design
else:
return np.array(design)
@@ -1024,8 +1051,8 @@ def transform(self, data):
data : dict-like (pandas dataframe)
Input data. Column names need to match variables in formula.
"""
- if self.return_type == 'dataframe':
- return dmatrix(self.design_, data, return_type='dataframe')
+ if self.return_type == "dataframe":
+ return dmatrix(self.design_, data, return_type="dataframe")
else:
return np.array(dmatrix(self.design_, data))
@@ -1038,4 +1065,4 @@ def _drop_intercept(formula, add_intercept):
if INTERCEPT in formula.rhs_termlist:
formula.rhs_termlist.remove(INTERCEPT)
return formula
- return formula
\ No newline at end of file
+ return formula
diff --git a/src/arfs/utils.py b/src/arfs/utils.py
index f0de783..0ced7f0 100644
--- a/src/arfs/utils.py
+++ b/src/arfs/utils.py
@@ -108,10 +108,15 @@ def create_dtype_dict(df: pd.DataFrame, dic_keys: str = "col_names") -> dict:
time_cols = df.select_dtypes(
include=["datetime", "timedelta", "datetimetz"]
).columns
- numerical_interval_cols = df.select_dtypes(["Interval[float]", "Interval[int]"]).columns
+ numerical_interval_cols = df.select_dtypes(
+ ["Interval[float]", "Interval[int]"]
+ ).columns
numerical_cols = df.select_dtypes(include=np.number).columns
remaining_cols = (
- df.columns.difference(categorical_cols).difference(numerical_cols).difference(time_cols).difference(numerical_interval_cols)
+ df.columns.difference(categorical_cols)
+ .difference(numerical_cols)
+ .difference(time_cols)
+ .difference(numerical_interval_cols)
)
if dic_keys == "col_names":
@@ -120,7 +125,13 @@ def create_dtype_dict(df: pd.DataFrame, dic_keys: str = "col_names") -> dict:
num_interval_dict = dict.fromkeys(numerical_interval_cols, "num_interval")
time_dict = dict.fromkeys(time_cols, "time")
remaining_dict = dict.fromkeys(remaining_cols, "unk")
- return {**cat_dict, **num_dict, **num_interval_dict, **time_dict, **remaining_dict}
+ return {
+ **cat_dict,
+ **num_dict,
+ **num_interval_dict,
+ **time_dict,
+ **remaining_dict,
+ }
if dic_keys == "dtypes":
return {
@@ -859,5 +870,5 @@ def _make_corr_dataset_classification(size=1000):
"Bane",
"MarkZ",
]
-
+
return X, y, w