diff --git a/README.md b/README.md
index 6da484f..e18fa3e 100644
--- a/README.md
+++ b/README.md
@@ -198,7 +198,7 @@ Selected intervals:
 ```
 
 ## References
-* [Original Safe algorithm](https://mi2datalab.github.io/SAFE/index.html), implemented in R
+* [Original Safe algorithm](https://ModelOriented.github.io/SAFE/index.html), implemented in R
 * [ruptures library](https://github.com/deepcharles/ruptures), used for finding changepoints
 * [kneed library](https://github.com/arvkevi/kneed), used for cutting hierarchical tree
 * [SAFE article](https://arxiv.org/abs/1902.11035) - article about SAFE algorithm, including benchmark results using SAFE library
diff --git a/SafeTransformer/SafeTransformer.py b/SafeTransformer/SafeTransformer.py
index 45a8462..aa58cfd 100644
--- a/SafeTransformer/SafeTransformer.py
+++ b/SafeTransformer/SafeTransformer.py
@@ -1,11 +1,13 @@
 import numpy as np
 import ruptures as rpt
-from sklearn.base import TransformerMixin
 import pandas as pd
+import sys
+import shap
+
 from sklearn.exceptions import NotFittedError
 from scipy.cluster.hierarchy import ward, cut_tree
 from kneed import KneeLocator
-import sys
+from sklearn.base import TransformerMixin
 
 
 class Variable():
@@ -18,36 +20,50 @@ def __init__(self, name, index):
 
 
 class NumericVariable(Variable):
-    def __init__(self, name, index, penalty, pelt_model, no_changepoint_strategy='median'):
+    def __init__(self, name, index, penalty, pelt_model, no_changepoint_strategy='median', dependence_method='pdp'):
         super().__init__(name, index)
         self.changepoints = []
         self.penalty = penalty
         self.pelt_model = pelt_model
         self.changepoint_values = []
         self.no_changepoint_strategy = no_changepoint_strategy
+        self.dependence_method = dependence_method
 
-    def _get_partial_dependence(self, model, X, grid_resolution=1000):
+    def _get_partial_dependence(self, model, X, grid_resolution=1000, dependence_method='pdp'):
         axes = []
-        pdp = []
-        points = np.linspace(min(X.loc[:, self.original_name]), max(
-            X.loc[:, self.original_name]), grid_resolution)
-        X_copy = X.copy()
-        for point in points:
-            axes.append(point)
-            X_copy.loc[:, self.original_name] = point
-            if(hasattr(model, 'predict_proba')):
-                predictions = model.predict_proba(X_copy)
-            else:
-                predictions = model.predict(X_copy)
-            val = np.mean(predictions, axis=0)
-            pdp.append(val)
+        pdp = []
+        if dependence_method == 'pdp':
+            points = np.linspace(min(X.loc[:, self.original_name]), max(
+                X.loc[:, self.original_name]), grid_resolution)
+            X_copy = X.copy()
+            for point in points:
+                axes.append(point)
+                X_copy.loc[:, self.original_name] = point
+                if hasattr(model, 'predict_proba'):
+                    predictions = model.predict_proba(X_copy)
+                else:
+                    predictions = model.predict(X_copy)
+                val = np.mean(predictions, axis=0)
+                pdp.append(val)
+        elif dependence_method == "shap":
+            explainer = shap.Explainer(model)
+            shap_values = explainer.shap_values(X)
+            if isinstance(shap_values, list):
+                shap_values = shap_values[0]
+            feature_values = X.loc[:, self.original_name].to_numpy()
+            axes = np.unique(feature_values)
+            for value in axes:
+                mean_shap = np.mean(shap_values[feature_values == value, self.original_index])
+                pdp.append(mean_shap)
+        else:
+            raise ValueError("Unknown dependence method, use 'pdp' or 'shap'.")
         return np.array(pdp), axes
 
     def fit(self, model, X, verbose):
         if verbose:
             print('Fitting variable:' + str(self.original_name))
-        pdp, axis = self._get_partial_dependence(model, X, grid_resolution=1000)
-        algo = rpt.Pelt(model=self.pelt_model).fit(pdp)
+        pdp, axis = self._get_partial_dependence(model, X, grid_resolution=1000, dependence_method=self.dependence_method)
+        algo = rpt.Pelt(model=self.pelt_model, min_size=2, jump=2).fit(pdp)
         self.changepoints = algo.predict(pen=self.penalty)
         self.changepoint_values = [axis[i] for i in self.changepoints[:-1]]
         if not self.changepoint_values and self.no_changepoint_strategy == 'median':
@@ -145,7 +161,7 @@ def transform(self, X, verbose):
         ret = np.zeros([X.shape[0], ret_len])
         for row_num in range(dummies.shape[0]):
             if not np.sum(dummies.iloc[row_num, :]) == 0:
-                idx = np.argwhere(dummies.iloc[row_num, :] == 1)[0]
+                idx = np.argwhere(dummies.iloc[row_num, :].to_numpy() == 1)[0]
                 if self.clusters[idx + 1] > 0:
                     ret[row_num, self.clusters[idx + 1] - 1] = 1
         return pd.DataFrame(ret, columns=self.new_names[1:])
@@ -203,12 +219,13 @@ class SafeTransformer(TransformerMixin):
     :param penalty: Penalty corresponding to adding a new changepoint. The higher the value of the penalty, the smaller the number of levels of the transformed variables will be (Default value = 3)
     :param pelt_model: Cost function used in the pelt algorithm, possible values: 'l2', 'l1', 'rbf' (Default value = 'l2')
     :param model_params: Dictionary of parameters passed to the fit method of the surrogate model. Only used if the passed surrogate model is not already fitted.
-
+    :param dependence_method: Method of partial dependence fitting, possible values: 'pdp', 'shap' (Default value = 'pdp').
+
     """
 
     categorical_dtypes = ['category', 'object']
 
-    def __init__(self, model, penalty=3, pelt_model='l2', model_params={}, no_changepoint_strategy='median'):
+    def __init__(self, model, penalty=3, pelt_model='l2', model_params={}, no_changepoint_strategy='median', dependence_method='pdp'):
         """
 
         Initialize new transformer instance
@@ -217,6 +234,8 @@ def __init__(self, model, penalty=3, pelt_model='l2', model_params={}, no_change
         :param pelt_model: Cost function used in the pelt algorithm, possible values: 'l2', 'l1', 'rbf' (Default value = 'l2')
         :param model_params: Dictionary of parameters passed to the fit method of the surrogate model. Only used if the passed surrogate model is not already fitted.
         :param no_changepoint_strategy: String specifying the strategy to take when no changepoint was detected. Should be one of: 'median', 'no_value'. If 'median' is chosen, there will be one changepoint set to the median value of a column. If 'no_value' is specified, the column will be removed. Default value = 'median'.
+        :param dependence_method: Method of partial dependence fitting, possible values: 'pdp', 'shap' (Default value = 'pdp').
+
         """
         self.variables = []
         self.model = model
@@ -224,6 +243,7 @@ def __init__(self, model, penalty=3, pelt_model='l2', model_params={}, no_change
         self.pelt_model = pelt_model
         self.model_params = model_params
         self.is_fitted = False
+        self.dependence_method = dependence_method
         if no_changepoint_strategy != 'median' and no_changepoint_strategy != 'no_value':
             raise ValueError('Incorrect no changepoint strategy value. Should be one of: median or no_value.')
         self.no_changepoint_strategy = no_changepoint_strategy
@@ -259,7 +279,7 @@ def fit(self, X, y=None, verbose=False):
                 X = pd.concat([X.iloc[:,range(dummy_index)], dummies, X.iloc[:, range(dummy_index+1, len(X.columns))]], axis=1)
                 self.variables.append(CategoricalVariable(name, idx, list(dummies), levels=levels))
             else:
-                self.variables.append(NumericVariable(name, idx, self.penalty, self.pelt_model, self.no_changepoint_strategy))
+                self.variables.append(NumericVariable(name, idx, self.penalty, self.pelt_model, self.no_changepoint_strategy, self.dependence_method))
         if not self._is_model_fitted(X):
             self.model.fit(X, y, **self.model_params)
         for variable in self.variables:
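The functional core of the SafeTransformer.py change is the new `'shap'` branch of `_get_partial_dependence`: instead of sweeping a value grid and averaging model predictions (classic PDP), it averages SHAP contributions per unique feature value and feeds the resulting curve to the same Pelt change-point step. A minimal standalone sketch of that aggregation, assuming a fitted tree-based model and a pandas DataFrame; the helper name `shap_dependence_curve` is illustrative, not part of the library:

```python
import numpy as np
import shap

def shap_dependence_curve(model, X, feature):
    """Mean SHAP contribution per unique value of `feature` (illustrative helper)."""
    explainer = shap.Explainer(model)    # dispatches to a tree explainer for GBMs
    shap_values = explainer.shap_values(X)
    if isinstance(shap_values, list):    # multi-output models return a list
        shap_values = shap_values[0]
    col = X.loc[:, feature].to_numpy()
    feature_idx = X.columns.get_loc(feature)
    axes = np.unique(col)
    # One curve point per unique value: average over all rows sharing that value.
    curve = np.array([shap_values[col == value, feature_idx].mean() for value in axes])
    return curve, axes
```

Passing `curve` to `rpt.Pelt(...).fit(curve)` then reproduces the `fit` step in the hunk above.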
diff --git a/examples/SHAP_dependence.ipynb b/examples/SHAP_dependence.ipynb
new file mode 100644
index 0000000..1d9bef1
--- /dev/null
+++ b/examples/SHAP_dependence.ipynb
@@ -0,0 +1,140 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from SafeTransformer import SafeTransformer\n",
+    "from sklearn.ensemble import GradientBoostingRegressor\n",
+    "from sklearn.linear_model import LinearRegression\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.metrics import mean_squared_error\n",
+    "from sklearn.pipeline import Pipeline"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "apartments = pd.read_csv('apartments.csv', index_col=0)\n",
+    "X_ap = apartments.drop(columns='m2.price')\n",
+    "y = apartments['m2.price']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X = X_ap.copy()\n",
+    "colnames = list(X)\n",
+    "for idx, name in enumerate(colnames):\n",
+    "    if str(X.loc[:, name].dtype) in ['category', 'object']:\n",
+    "        dummies = pd.get_dummies(X.loc[:, name], prefix=name, drop_first=True)\n",
+    "        dummy_index = X.columns.get_loc(name)\n",
+    "        X = pd.concat([X.iloc[:,range(dummy_index)], dummies, X.iloc[:, range(dummy_index+1, len(X.columns))]], axis=1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X_train, X_test, X_lin_train, X_lin_test, y_train, y_test = train_test_split(X_ap, X, y)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": [
+      "90\n",
+      "90\n",
+      "130\n",
+      "130\n",
+      "10\n",
+      "10\n",
+      "6\n",
+      "6\n"
+     ]
+    },
+    {
+     "output_type": "execute_result",
+     "data": {
+      "text/plain": [
+       ""
+      ]
+     },
+     "metadata": {},
+     "execution_count": 5
+    }
+   ],
+   "source": [
+    "surrogate_model = GradientBoostingRegressor(\n",
+    "    n_estimators=1000,\n",
+    "    max_depth=4,\n",
+    "    learning_rate=0.1,\n",
+    "    loss='huber'\n",
+    ")\n",
+    "safe_transformer = SafeTransformer(surrogate_model, penalty = 10, dependence_method = \"shap\")\n",
+    "safe_transformer.fit(X_train, y_train)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": [
+      "Numerical Variable construction.year\nSelected intervals:\n\t[-Inf, 1922.00)\n\t[1922.00, 1924.00)\n\t[1924.00, 1926.00)\n\t[1926.00, 1929.00)\n\t[1929.00, 1933.00)\n\t[1933.00, 1935.00)\n\t[1935.00, 1938.00)\n\t[1938.00, 1940.00)\n\t[1940.00, 1942.00)\n\t[1942.00, 1944.00)\n\t[1944.00, 1947.00)\n\t[1947.00, 1949.00)\n\t[1949.00, 1951.00)\n\t[1951.00, 1953.00)\n\t[1953.00, 1956.00)\n\t[1956.00, 1958.00)\n\t[1958.00, 1961.00)\n\t[1961.00, 1964.00)\n\t[1964.00, 1967.00)\n\t[1967.00, 1969.00)\n\t[1969.00, 1971.00)\n\t[1971.00, 1973.00)\n\t[1973.00, 1975.00)\n\t[1975.00, 1978.00)\n\t[1978.00, 1982.00)\n\t[1982.00, 1984.00)\n\t[1984.00, 1986.00)\n\t[1986.00, 1988.00)\n\t[1988.00, 1990.00)\n\t[1990.00, 1992.00)\n\t[1992.00, 1995.00)\n\t[1995.00, 1997.00)\n\t[1997.00, 1999.00)\n\t[1999.00, 2001.00)\n\t[2001.00, 2003.00)\n\t[2003.00, 2005.00)\n\t[2005.00, 2007.00)\n\t[2007.00, 2009.00)\n\t[2009.00, Inf)\nNumerical Variable surface\nSelected intervals:\n\t[-Inf, 23.00)\n\t[23.00, 25.00)\n\t[25.00, 28.00)\n\t[28.00, 31.00)\n\t[31.00, 34.00)\n\t[34.00, 37.00)\n\t[37.00, 39.00)\n\t[39.00, 41.00)\n\t[41.00, 44.00)\n\t[44.00, 46.00)\n\t[46.00, 48.00)\n\t[48.00, 50.00)\n\t[50.00, 52.00)\n\t[52.00, 54.00)\n\t[54.00, 56.00)\n\t[56.00, 58.00)\n\t[58.00, 60.00)\n\t[60.00, 63.00)\n\t[63.00, 66.00)\n\t[66.00, 68.00)\n\t[68.00, 70.00)\n\t[70.00, 73.00)\n\t[73.00, 75.00)\n\t[75.00, 78.00)\n\t[78.00, 80.00)\n\t[80.00, 83.00)\n\t[83.00, 85.00)\n\t[85.00, 87.00)\n\t[87.00, 89.00)\n\t[89.00, 91.00)\n\t[91.00, 93.00)\n\t[93.00, 95.00)\n\t[95.00, 98.00)\n\t[98.00, 100.00)\n\t[100.00, 102.00)\n\t[102.00, 105.00)\n\t[105.00, 107.00)\n\t[107.00, 109.00)\n\t[109.00, 112.00)\n\t[112.00, 114.00)\n\t[114.00, 116.00)\n\t[116.00, 118.00)\n\t[118.00, 121.00)\n\t[121.00, 124.00)\n\t[124.00, 126.00)\n\t[126.00, 128.00)\n\t[128.00, 131.00)\n\t[131.00, 133.00)\n\t[133.00, 135.00)\n\t[135.00, 137.00)\n\t[137.00, 139.00)\n\t[139.00, 141.00)\n\t[141.00, 144.00)\n\t[144.00, 146.00)\n\t[146.00, 148.00)\n\t[148.00, Inf)\nNumerical Variable floor\nSelected intervals:\n\t[-Inf, 4.00)\n\t[4.00, 6.00)\n\t[6.00, 8.00)\n\t[8.00, Inf)\nNumerical Variable no.rooms\nSelected intervals:\n\t[-Inf, 5.00)\n\t[5.00, Inf)\nCategorical Variable district\nCreated variable levels:\n\tBemowo, Bielany, Praga, Ursus, Ursynow, Wola -> Bemowo_Bielany_Praga_Ursus_Ursynow_Wola\n\tMokotow, Ochota, Zoliborz -> Mokotow_Ochota_Zoliborz\n\tSrodmiescie -> Srodmiescie\n"
+     ]
+    }
+   ],
+   "source": [
+    "safe_transformer.summary()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "name": "python371jvsc74a57bd04a14d88addbefcb3d7fbbd2032e02fafd8dd0b3bd5861453f44d6a559c03a0d0",
+   "display_name": "Python 3.7.1 64-bit ('venv-safe')"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.1"
+  },
+  "metadata": {
+   "interpreter": {
+    "hash": "4a14d88addbefcb3d7fbbd2032e02fafd8dd0b3bd5861453f44d6a559c03a0d0"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
\ No newline at end of file
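The notebook's first cell imports `LinearRegression`, `mean_squared_error`, and `Pipeline`, but the cells stop after `summary()`. A plausible continuation (hypothetical, not part of the diff) fits an interpretable model on the interval-encoded features, which is the usual final step of the SAFE workflow:

```python
# Hypothetical next cell: train a linear model on the transformed features
# produced by the already fitted safe_transformer from the cells above.
X_train_safe = safe_transformer.transform(X_train)
X_test_safe = safe_transformer.transform(X_test)

linear_model = LinearRegression()
linear_model.fit(X_train_safe, y_train)
predictions = linear_model.predict(X_test_safe)
print(mean_squared_error(y_test, predictions))
```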
diff --git a/setup.py b/setup.py
index b3e5419..7d53968 100644
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,7 @@
 
 setuptools.setup(
     name="safe-transformer",
-    version="0.0.4",
+    version="0.0.6",
     author="Aleksandra Gacek, Piotr Luboń",
     author_email="lubonp@student.mini.pw.edu.pl, gaceka@student.mini.pw.edu.pl",
     description="Build explainable ML models using surrogate models.",
@@ -19,7 +19,8 @@
         'sklearn',
         'pandas',
         'scipy',
-        'kneed'
+        'kneed',
+        'shap'
     ],
     classifiers=[
         "Programming Language :: Python :: 3",
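Besides the SHAP option, the SafeTransformer.py diff also tightens the change-point call to `rpt.Pelt(model=self.pelt_model, min_size=2, jump=2)`, which bounds the minimum segment length and coarsens the search grid. A toy illustration of what that detector does on a dependence curve, independent of SAFE itself (the signal and penalty below are made up for the example):

```python
import numpy as np
import ruptures as rpt

# Two flat regimes: Pelt should report one interior changepoint at index 50.
curve = np.concatenate([np.zeros(50), np.ones(50)])
algo = rpt.Pelt(model='l2', min_size=2, jump=2).fit(curve)
print(algo.predict(pen=3))  # e.g. [50, 100]; the final index closes the signal
```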