Skip to content

Commit

Permalink
add association collinearity (categorical) filtering
Browse files Browse the repository at this point in the history
  • Loading branch information
Thomas Bury committed Jul 11, 2021
1 parent e71d8d2 commit 986ac82
Show file tree
Hide file tree
Showing 5 changed files with 1,080 additions and 787 deletions.
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,8 @@ fs_df
fs = arfsfs.FeatureSelector(X = X[filtered_features], y = y, sample_weight = w)
# identify highly correlated columns (here corr_coef >= 0.75)
# set encode to True if there are categorical/string cols (takes a bit of time)
# the identify collinear method now supports continuous and categorical filtering,
# using associations (con-con, con-cat, cat-cat) without encoding the variables.
fs.identify_collinear(correlation_threshold=0.5, encode=False)
# tag the discarded predictors and store the results
fs_df = fs_df.merge(fs.tag_df, how='left')
Expand Down Expand Up @@ -273,6 +275,10 @@ In the spirit, the same heuristic than Boruta but using Boosting (originally Bor

## Changes

### 0.2.0

- Prefilters now support the filtering of continuous and nominal (categorical) collinear variables

### 0.1.6

- improve the plot_y_vs_X function
Expand Down
2 changes: 1 addition & 1 deletion arfs/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.1.6"
__version__ = "0.2.0"
128 changes: 108 additions & 20 deletions arfs/featselect.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
# numpy and pandas for data manipulation
import pandas as pd
import numpy as np
from dython.nominal import compute_associations
# model used for feature importance, Shapley values are builtin
import lightgbm as lgb
# visualizations
Expand All @@ -32,6 +33,7 @@
from palettable.cmocean.diverging import Curl_5_r
from palettable.cartocolors.qualitative import Bold_10
import holoviews as hv
import panel as pn
# progress bar
from tqdm.autonotebook import trange, tqdm
# ML
Expand Down Expand Up @@ -173,6 +175,49 @@ def plot_corr(df, size=10):
return fig


def plot_associations(df, features=None, size=1200, theil_u=False):

if features is None:
features = df.columns

# continuous features
con_features = set(features).intersection(set(list(df.select_dtypes(include=[np.number]))))

# nominal features
nom_features = set(features) - set(con_features)

assoc_df = compute_associations(df,
nominal_columns=nom_features,
mark_columns=True,
theil_u=theil_u,
clustering=True,
bias_correction=True,
nan_strategy='drop_samples')

heatmap = hv.HeatMap((assoc_df.columns, assoc_df.index, assoc_df)).redim.range(z=(-1, 1))

heatmap.opts(tools=['tap', 'hover'], height=size, width=size + 50, toolbar='left', colorbar=True,
cmap=Curl_5_r.mpl_colormap, fontsize={'title': 12, 'ticks': 12, 'minor_ticks': 12}, xrotation=90,
invert_xaxis=False, invert_yaxis=True, # title=title_str,
xlabel='', ylabel=''
)
title_str = "**Continuous (con) and Categorical (nom) Associations **"
sub_title_str = "*Categorical(nom): uncertainty coefficient & correlation ratio from 0 to 1. The uncertainty " \
"coefficient is assymmetrical, (approximating how much the elements on the " \
"left PROVIDE INFORMATION on elements in the row). Continuous(con): symmetrical numerical " \
"correlations (Pearson's) from -1 to 1*"
panel_layout = pn.Column(
pn.pane.Markdown(title_str, align="start"), # bold
pn.pane.Markdown(sub_title_str, align="start"), # italic
heatmap, background='#ebebeb'
)

gc.enable()
del assoc_df
gc.collect()
return panel_layout


######################
# #
# main class #
Expand Down Expand Up @@ -459,6 +504,7 @@ def __init__(self, X, y=None, sample_weight=None):
self.n_identified = None
self.encoded = False
self.mapper = None
self.collinear_method = None
self.tag_df = pd.DataFrame({'predictor': self.base_features})

# Dictionary to hold removal operations
Expand Down Expand Up @@ -626,7 +672,7 @@ def encode_cat_var(self, df=None, col_excl=None, return_cat=False):

return df

def identify_collinear(self, correlation_threshold, encode=False, method='spearman'):
def identify_collinear(self, correlation_threshold, encode=False, method='association'):
"""
Finds collinear features based on the correlation coefficient between features.
For each pair of features with a correlation coefficient greather than `correlation_threshold`,
Expand All @@ -645,7 +691,7 @@ def identify_collinear(self, correlation_threshold, encode=False, method='spearm
Whether to encode the features before calculating the correlation coefficients
method: str
spearman or pearson correlation coefficient
association, spearman or pearson correlation coefficient. If you have categorical variables, use "association"
"""

Expand All @@ -660,7 +706,31 @@ def identify_collinear(self, correlation_threshold, encode=False, method='spearm
print('Encoding done, {0:4.0f} min'.format(round((tic - time.time()) / 60)))

tic_corr = time.time()
if method == 'spearman':

self.collinear_method = method
if method == 'association':
features = self.data.columns

# continuous features
con_features = set(features).intersection(set(list(self.data.select_dtypes(include=[np.number]))))

# nominal features
nom_features = set(features) - set(con_features)

self.corr_matrix = compute_associations(self.data,
nominal_columns=nom_features,
mark_columns=True,
theil_u=True,
clustering=True,
bias_correction=True,
nan_strategy='drop_samples')

upper = self.corr_matrix.where(np.triu(np.ones(self.corr_matrix.shape), k=1).astype(np.bool))
to_drop = [column for column in upper.columns if
any(upper[column].abs() > correlation_threshold)]


elif method == 'spearman':
self.corr_matrix = self.data.corr(method="spearman").fillna(0)
# Extract the upper triangle of the correlation matrix
upper = self.corr_matrix.where(np.triu(np.ones(self.corr_matrix.shape), k=1).astype(np.bool))
Expand Down Expand Up @@ -1100,31 +1170,49 @@ def plot_collinear(self, plot_all=False, size=1000):

if plot_all:
corr_matrix_plot = self.corr_matrix
title = 'All Correlations'
subtitle_str = 'All Correlations'

else:
# Identify the correlations that were above the threshold
# columns (x-axis) are features to drop and rows (y_axis) are correlated pairs
corr_matrix_plot = self.corr_matrix.loc[list(set(self.record_collinear['corr_feature'])),
list(set(self.record_collinear['drop_feature']))]
title = "Correlations Above Threshold"

d = sch.distance.pdist(corr_matrix_plot)
L = sch.linkage(d, method='ward')
ind = sch.fcluster(L, 0.5 * np.nanmax(d), 'distance')
columns = [corr_matrix_plot.columns.tolist()[i] for i in list((np.argsort(ind)))]
corr_matrix_plot = corr_matrix_plot.reindex(columns, axis=1)
corr_matrix_plot = corr_matrix_plot.reindex(columns, axis=0)

subtitle_str = "Correlations Above Threshold"

d = sch.distance.pdist(corr_matrix_plot)
L = sch.linkage(d, method='ward')
ind = sch.fcluster(L, 0.5 * np.nanmax(d), 'distance')
columns = [corr_matrix_plot.columns.tolist()[i] for i in list((np.argsort(ind)))]
corr_matrix_plot = corr_matrix_plot.reindex(columns, axis=1)
corr_matrix_plot = corr_matrix_plot.reindex(columns, axis=0)
# interactive plot of the correlation matrix
heatmap = hv.HeatMap((corr_matrix_plot.columns, corr_matrix_plot.index, corr_matrix_plot)).redim.range(
z=(-1, 1))
heatmap.opts(tools=['tap', 'hover'], height=size, width=size + 50, toolbar='above', colorbar=True,
cmap=Curl_5_r.mpl_colormap,
fontsize={'title': 20, 'ticks': 12, 'minor_ticks': 12}, xrotation=90,
invert_xaxis=False, invert_yaxis=True, title=title, xlabel='',
ylabel='', xaxis=None, yaxis=None)
return heatmap
z=(-1, 1))

heatmap.opts(tools=['tap', 'hover'], height=size, width=size + 50, toolbar='left', colorbar=True,
cmap=Curl_5_r.mpl_colormap, fontsize={'title': 12, 'ticks': 12, 'minor_ticks': 12}, xrotation=90,
invert_xaxis=False, invert_yaxis=True, # title=title_str,
xlabel='', ylabel=''
)
if self.collinear_method == 'association':
title_str = "**Continuous (con) and Categorical (nom) Associations **"
sub_title_str = "*Categorical(nom): uncertainty coefficient & correlation ratio from 0 to 1. The uncertainty " \
"coefficient is assymmetrical, (approximating how much the elements on the " \
"left PROVIDE INFORMATION on elements in the row). Continuous(con): symmetrical numerical " \
"correlations (Pearson's) from -1 to 1*"
else:
title_str = "**Correlations (continuous variables only) **"
subtitle_str = "Use `method='association'` if there are categorical variables. " + subtitle_str


panel_layout = pn.Column(
pn.pane.Markdown(title_str, align="start"), # bold
pn.pane.Markdown(sub_title_str, align="start"), # italic
heatmap, background='#ebebeb'
)

return panel_layout


def plot_feature_importances(self, plot_n=15, threshold=None, figsize=None):
"""
Expand Down
Loading

0 comments on commit 986ac82

Please sign in to comment.