From 445d0a20a80ef79af16ac8417558acb6675ea51e Mon Sep 17 00:00:00 2001 From: agosiewska <alicjagosiewska@gmail.com> Date: Wed, 11 Aug 2021 22:12:08 +0200 Subject: [PATCH 1/3] Add shap partial dependence --- README.md | 2 +- SafeTransformer/SafeTransformer.py | 63 ++++++---- examples/SHAP_dependence.ipynb | 177 +++++++++++++++++++++++++++++ setup.py | 5 +- 4 files changed, 222 insertions(+), 25 deletions(-) create mode 100644 examples/SHAP_dependence.ipynb diff --git a/README.md b/README.md index 6da484f..e18fa3e 100644 --- a/README.md +++ b/README.md @@ -198,7 +198,7 @@ Selected intervals: ``` ## References -* [Original Safe algorithm](https://mi2datalab.github.io/SAFE/index.html), implemented in R +* [Original Safe algorithm](https://ModelOriented.github.io/SAFE/index.html), implemented in R * [ruptures library](https://github.com/deepcharles/ruptures), used for finding changepoints * [kneed library](https://github.com/arvkevi/kneed), used for cutting hierarchical tree * [SAFE article](https://arxiv.org/abs/1902.11035) - article about SAFE algorithm, including benchmark results using SAFE library diff --git a/SafeTransformer/SafeTransformer.py b/SafeTransformer/SafeTransformer.py index 45a8462..11de918 100644 --- a/SafeTransformer/SafeTransformer.py +++ b/SafeTransformer/SafeTransformer.py @@ -1,11 +1,14 @@ import numpy as np import ruptures as rpt -from sklearn.base import TransformerMixin import pandas as pd +import sys +import shap + from sklearn.exceptions import NotFittedError from scipy.cluster.hierarchy import ward, cut_tree from kneed import KneeLocator -import sys +from sklearn.base import TransformerMixin +from os import error class Variable(): @@ -18,36 +21,48 @@ def __init__(self, name, index): class NumericVariable(Variable): - def __init__(self, name, index, penalty, pelt_model, no_changepoint_strategy='median'): + def __init__(self, name, index, penalty, pelt_model, no_changepoint_strategy='median', dependence_method = 'pdp'): super().__init__(name, 
index) self.changepoints = [] self.penalty = penalty self.pelt_model = pelt_model self.changepoint_values = [] self.no_changepoint_strategy = no_changepoint_strategy + self.dependence_method = dependence_method - def _get_partial_dependence(self, model, X, grid_resolution=1000): + def _get_partial_dependence(self, model, X, grid_resolution=1000, dependence_method='pdp'): axes = [] - pdp = [] - points = np.linspace(min(X.loc[:, self.original_name]), max( - X.loc[:, self.original_name]), grid_resolution) - X_copy = X.copy() - for point in points: - axes.append(point) - X_copy.loc[:, self.original_name] = point - if(hasattr(model, 'predict_proba')): - predictions = model.predict_proba(X_copy) - else: - predictions = model.predict(X_copy) - val = np.mean(predictions, axis=0) - pdp.append(val) + pdp = [] + if dependence_method == 'pdp': + points = np.linspace(min(X.loc[:, self.original_name]), max( + X.loc[:, self.original_name]), grid_resolution) + X_copy = X.copy() + for point in points: + axes.append(point) + X_copy.loc[:, self.original_name] = point + if(hasattr(model, 'predict_proba')): + predictions = model.predict_proba(X_copy) + else: + predictions = model.predict(X_copy) + val = np.mean(predictions, axis=0) + pdp.append(val) + elif dependence_method == "shap": + explainer = shap.Explainer(model) + shap_values = explainer.shap_values(X) + axes = np.unique(X.values[:, self.original_index]) + for value in axes: + mean_shap = np.mean([shap_values[idx, self.original_index] for idx in range(len(axes)) if axes[idx] == value]) + pdp.append(mean_shap) + + else: + raise ValueError("Unknown dependence method, use 'pdp' or 'shap'.") return np.array(pdp), axes def fit(self, model, X, verbose): if verbose: print('Fitting variable:' + str(self.original_name)) - pdp, axis = self._get_partial_dependence(model, X, grid_resolution=1000) - algo = rpt.Pelt(model=self.pelt_model).fit(pdp) + pdp, axis = self._get_partial_dependence(model, X, grid_resolution=1000, 
dependence_method=self.dependence_method) - algo = rpt.Pelt(model=self.pelt_model).fit(pdp) + algo = rpt.Pelt(model=self.pelt_model, min_size=2, jump=2).fit(pdp) self.changepoints = algo.predict(pen=self.penalty) self.changepoint_values = [axis[i] for i in self.changepoints[:-1]] if not self.changepoint_values and self.no_changepoint_strategy == 'median': @@ -203,12 +218,13 @@ class SafeTransformer(TransformerMixin): :param penalty: Penalty corresponding to adding a new changepoint. The higher the value of penalty the smaller nunber of levels of transformed variableswill be created (Default value = 3) :param pelt_model: Cost function used in pelt algorith, possible values: 'l2', 'l1', 'rbf' (Default value = 'l2') :param model_params: Dictionary of paramters passed to fit method of surrogate model. Only used if passed surrogate model is not alreedy fitted. - + :param dependence_method: Method of partial dependence fitting, possible values: 'pdp', 'shap' (Default value = 'pdp'). + """ categorical_dtypes = ['category', 'object'] - def __init__(self, model, penalty=3, pelt_model='l2', model_params={}, no_changepoint_strategy='median'): + def __init__(self, model, penalty=3, pelt_model='l2', model_params={}, no_changepoint_strategy='median', dependence_method = 'pdp'): """ Initialize new transformer instance @@ -217,6 +233,8 @@ def __init__(self, model, penalty=3, pelt_model='l2', model_params={}, no_change :param pelt_model: Cost function used in pelt algorith, possible values: 'l2', 'l1', 'rbf' (Default value = 'l2') :param model_params: Dictionary of parameters passed to fit method of surrogate model. Only used if passed surrogate model is not alreedy fitted. :param no_changepoint_strategy: String specifying strategy to take, when no changepoint was detected. Should be one of: 'median', 'no_value'. If median is chosen, then there will be one changepoint set to 'median' value of a column. If 'no_value' is specified column will be removed. Default value = 'median'.
+ :param dependence_method: Method of partial dependence fitting, possible values: 'pdp', 'shap' (Default value = 'pdp'). + """ self.variables = [] self.model = model @@ -224,6 +242,7 @@ def __init__(self, model, penalty=3, pelt_model='l2', model_params={}, no_change self.pelt_model = pelt_model self.model_params = model_params self.is_fitted = False + self.dependence_method = dependence_method if no_changepoint_strategy != 'median' and no_changepoint_strategy != 'no_value': raise ValueError('Incorrect no changepoint strategy value. Should be one of: median or no_value.') self.no_changepoint_strategy = no_changepoint_strategy @@ -259,7 +278,7 @@ def fit(self, X, y=None, verbose=False): X = pd.concat([X.iloc[:,range(dummy_index)], dummies, X.iloc[:, range(dummy_index+1, len(X.columns))]], axis=1) self.variables.append(CategoricalVariable(name, idx, list(dummies), levels=levels)) else: - self.variables.append(NumericVariable(name, idx, self.penalty, self.pelt_model, self.no_changepoint_strategy)) + self.variables.append(NumericVariable(name, idx, self.penalty, self.pelt_model, self.no_changepoint_strategy, self.dependence_method)) if not self._is_model_fitted(X): self.model.fit(X, y, **self.model_params) for variable in self.variables: diff --git a/examples/SHAP_dependence.ipynb b/examples/SHAP_dependence.ipynb new file mode 100644 index 0000000..db1451d --- /dev/null +++ b/examples/SHAP_dependence.ipynb @@ -0,0 +1,177 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from SafeTransformer import SafeTransformer\n", + "from sklearn.ensemble import GradientBoostingRegressor\n", + "from sklearn.linear_model import LinearRegression\n", + "import numpy as np\n", + "import pandas as pd\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import mean_squared_error\n", + "from sklearn.pipeline import Pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": 2,
+ "metadata": {}, + "outputs": [], + "source": [ + "apartments = pd.read_csv('apartments.csv', index_col=0)\n", + "X_ap = apartments.drop(columns='m2.price')\n", + "y = apartments['m2.price']" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "X = X_ap.copy()\n", + "colnames = list(X)\n", + "for idx, name in enumerate(colnames):\n", + " if str(X.loc[:, name].dtype) in ['category', 'object']:\n", + " dummies = pd.get_dummies(X.loc[:, name], prefix=name, drop_first=True)\n", + " dummy_index = X.columns.get_loc(name)\n", + " X = pd.concat([X.iloc[:,range(dummy_index)], dummies, X.iloc[:, range(dummy_index+1, len(X.columns))]], axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "X_train, X_test, X_lin_train, X_lin_test, y_train, y_test = train_test_split(X_ap, X, y)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "90\n", + "90\n", + "130\n", + "130\n", + "10\n", + "10\n", + "6\n", + "6\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "<SafeTransformer.SafeTransformer.SafeTransformer at 0x7f58f0243da0>" + ] + }, + "metadata": {}, + "execution_count": 5 + } + ], + "source": [ + "surrogate_model = GradientBoostingRegressor(\n", + " n_estimators=1000,\n", + " max_depth=4,\n", + " learning_rate=0.1,\n", + " loss='huber'\n", + ")\n", + "safe_transformer = SafeTransformer(surrogate_model, penalty = 10, dependence_method = \"shap\")\n", + "safe_transformer.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Numerical Variable construction.year\nSelected intervals:\n\t[-Inf, 1922.00)\n\t[1922.00, 1924.00)\n\t[1924.00, 1926.00)\n\t[1926.00, 1929.00)\n\t[1929.00, 
1933.00)\n\t[1933.00, 1935.00)\n\t[1935.00, 1938.00)\n\t[1938.00, 1940.00)\n\t[1940.00, 1942.00)\n\t[1942.00, 1944.00)\n\t[1944.00, 1947.00)\n\t[1947.00, 1949.00)\n\t[1949.00, 1951.00)\n\t[1951.00, 1953.00)\n\t[1953.00, 1956.00)\n\t[1956.00, 1958.00)\n\t[1958.00, 1961.00)\n\t[1961.00, 1964.00)\n\t[1964.00, 1967.00)\n\t[1967.00, 1969.00)\n\t[1969.00, 1971.00)\n\t[1971.00, 1973.00)\n\t[1973.00, 1975.00)\n\t[1975.00, 1978.00)\n\t[1978.00, 1982.00)\n\t[1982.00, 1984.00)\n\t[1984.00, 1986.00)\n\t[1986.00, 1988.00)\n\t[1988.00, 1990.00)\n\t[1990.00, 1992.00)\n\t[1992.00, 1995.00)\n\t[1995.00, 1997.00)\n\t[1997.00, 1999.00)\n\t[1999.00, 2001.00)\n\t[2001.00, 2003.00)\n\t[2003.00, 2005.00)\n\t[2005.00, 2007.00)\n\t[2007.00, 2009.00)\n\t[2009.00, Inf)\nNumerical Variable surface\nSelected intervals:\n\t[-Inf, 23.00)\n\t[23.00, 25.00)\n\t[25.00, 28.00)\n\t[28.00, 31.00)\n\t[31.00, 34.00)\n\t[34.00, 37.00)\n\t[37.00, 39.00)\n\t[39.00, 41.00)\n\t[41.00, 44.00)\n\t[44.00, 46.00)\n\t[46.00, 48.00)\n\t[48.00, 50.00)\n\t[50.00, 52.00)\n\t[52.00, 54.00)\n\t[54.00, 56.00)\n\t[56.00, 58.00)\n\t[58.00, 60.00)\n\t[60.00, 63.00)\n\t[63.00, 66.00)\n\t[66.00, 68.00)\n\t[68.00, 70.00)\n\t[70.00, 73.00)\n\t[73.00, 75.00)\n\t[75.00, 78.00)\n\t[78.00, 80.00)\n\t[80.00, 83.00)\n\t[83.00, 85.00)\n\t[85.00, 87.00)\n\t[87.00, 89.00)\n\t[89.00, 91.00)\n\t[91.00, 93.00)\n\t[93.00, 95.00)\n\t[95.00, 98.00)\n\t[98.00, 100.00)\n\t[100.00, 102.00)\n\t[102.00, 105.00)\n\t[105.00, 107.00)\n\t[107.00, 109.00)\n\t[109.00, 112.00)\n\t[112.00, 114.00)\n\t[114.00, 116.00)\n\t[116.00, 118.00)\n\t[118.00, 121.00)\n\t[121.00, 124.00)\n\t[124.00, 126.00)\n\t[126.00, 128.00)\n\t[128.00, 131.00)\n\t[131.00, 133.00)\n\t[133.00, 135.00)\n\t[135.00, 137.00)\n\t[137.00, 139.00)\n\t[139.00, 141.00)\n\t[141.00, 144.00)\n\t[144.00, 146.00)\n\t[146.00, 148.00)\n\t[148.00, Inf)\nNumerical Variable floor\nSelected intervals:\n\t[-Inf, 4.00)\n\t[4.00, 6.00)\n\t[6.00, 8.00)\n\t[8.00, Inf)\nNumerical Variable no.rooms\nSelected 
intervals:\n\t[-Inf, 5.00)\n\t[5.00, Inf)\nCategorical Variable district\nCreated variable levels:\n\tBemowo, Bielany, Praga, Ursus, Ursynow, Wola -> Bemowo_Bielany_Praga_Ursus_Ursynow_Wola\n\tMokotow, Ochota, Zoliborz -> Mokotow_Ochota_Zoliborz\n\tSrodmiescie -> Srodmiescie\n" + ] + } + ], + "source": [ + "safe_transformer.summary()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "array([1920, 1921, 1922, 1923, 1924, 1925, 1926, 1927, 1928, 1929, 1930,\n", + " 1931, 1932, 1933, 1934, 1935, 1936, 1937, 1938, 1939, 1940, 1941,\n", + " 1942, 1943, 1944, 1945, 1946, 1947, 1948, 1949, 1950, 1951, 1952,\n", + " 1953, 1954, 1955, 1956, 1957, 1958, 1959, 1960, 1961, 1962, 1963,\n", + " 1964, 1965, 1966, 1967, 1968, 1969, 1970, 1971, 1972, 1973, 1974,\n", + " 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985,\n", + " 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996,\n", + " 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007,\n", + " 2008, 2009, 2010])" + ] + }, + "metadata": {}, + "execution_count": 7 + } + ], + "source": [ + "np.unique(X.iloc[:,0])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "safe_transformer." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "name": "python371jvsc74a57bd04a14d88addbefcb3d7fbbd2032e02fafd8dd0b3bd5861453f44d6a559c03a0d0", + "display_name": "Python 3.7.1 64-bit ('venv-safe')" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.1" + }, + "metadata": { + "interpreter": { + "hash": "4a14d88addbefcb3d7fbbd2032e02fafd8dd0b3bd5861453f44d6a559c03a0d0" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/setup.py b/setup.py index b3e5419..0a46bac 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setuptools.setup( name="safe-transformer", - version="0.0.4", + version="0.0.5", author="Aleksandra Gacek, Piotr Luboń", author_email="lubonp@student.mini.pw.edu.pl, gaceka@student.mini.pw.edu.pl", description="Build explainable ML models using surrogate models.", @@ -19,7 +19,8 @@ 'sklearn', 'pandas', 'scipy', - 'kneed' + 'kneed', + 'shap' ], classifiers=[ "Programming Language :: Python :: 3", From c1150e4d7a75ea4b000e377cc10331f1d479da66 Mon Sep 17 00:00:00 2001 From: agosiewska <alicjagosiewska@gmail.com> Date: Wed, 11 Aug 2021 22:45:29 +0200 Subject: [PATCH 2/3] Cleaning --- SafeTransformer/SafeTransformer.py | 4 +++- examples/SHAP_dependence.ipynb | 37 ------------------------------ 2 files changed, 3 insertions(+), 38 deletions(-) diff --git a/SafeTransformer/SafeTransformer.py b/SafeTransformer/SafeTransformer.py index 11de918..adecf70 100644 --- a/SafeTransformer/SafeTransformer.py +++ b/SafeTransformer/SafeTransformer.py @@ -49,7 +49,9 @@ def _get_partial_dependence(self, model, X, grid_resolution=1000, dependence_met elif dependence_method == "shap": explainer = shap.Explainer(model) shap_values = explainer.shap_values(X) - axes = np.unique(X.values[:, self.original_index]) + if type(shap_values) is list: 
+ shap_values = shap_values[0] + axes = np.unique(X.loc[:, self.original_name]) for value in axes: mean_shap = np.mean([shap_values[idx, self.original_index] for idx in range(len(axes)) if axes[idx] == value]) pdp.append(mean_shap) diff --git a/examples/SHAP_dependence.ipynb b/examples/SHAP_dependence.ipynb index db1451d..1d9bef1 100644 --- a/examples/SHAP_dependence.ipynb +++ b/examples/SHAP_dependence.ipynb @@ -110,43 +110,6 @@ "source": [ "safe_transformer.summary()" ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "array([1920, 1921, 1922, 1923, 1924, 1925, 1926, 1927, 1928, 1929, 1930,\n", - " 1931, 1932, 1933, 1934, 1935, 1936, 1937, 1938, 1939, 1940, 1941,\n", - " 1942, 1943, 1944, 1945, 1946, 1947, 1948, 1949, 1950, 1951, 1952,\n", - " 1953, 1954, 1955, 1956, 1957, 1958, 1959, 1960, 1961, 1962, 1963,\n", - " 1964, 1965, 1966, 1967, 1968, 1969, 1970, 1971, 1972, 1973, 1974,\n", - " 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985,\n", - " 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996,\n", - " 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007,\n", - " 2008, 2009, 2010])" - ] - }, - "metadata": {}, - "execution_count": 7 - } - ], - "source": [ - "np.unique(X.iloc[:,0])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "safe_transformer." 
- ] } ], "metadata": { From 99c48c42ea6f73e098a4dc3ab4d1fabfc0958af8 Mon Sep 17 00:00:00 2001 From: agosiewska <alicjagosiewska@gmail.com> Date: Thu, 19 Aug 2021 13:18:44 +0200 Subject: [PATCH 3/3] Fix pandas error for categorical variables #10 --- SafeTransformer/SafeTransformer.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/SafeTransformer/SafeTransformer.py b/SafeTransformer/SafeTransformer.py index adecf70..aa58cfd 100644 --- a/SafeTransformer/SafeTransformer.py +++ b/SafeTransformer/SafeTransformer.py @@ -162,7 +162,7 @@ def transform(self, X, verbose): ret = np.zeros([X.shape[0], ret_len]) for row_num in range(dummies.shape[0]): if not np.sum(dummies.iloc[row_num, :]) == 0: - idx = np.argwhere(dummies.iloc[row_num, :] == 1)[0] + idx = np.argwhere(dummies.iloc[row_num, :].to_numpy() == 1)[0] if self.clusters[idx + 1] > 0: ret[row_num, self.clusters[idx + 1] - 1] = 1 return pd.DataFrame(ret, columns=self.new_names[1:]) diff --git a/setup.py b/setup.py index 0a46bac..7d53968 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setuptools.setup( name="safe-transformer", - version="0.0.5", + version="0.0.6", author="Aleksandra Gacek, Piotr Luboń", author_email="lubonp@student.mini.pw.edu.pl, gaceka@student.mini.pw.edu.pl", description="Build explainable ML models using surrogate models.",