Skip to content

Commit

Permalink
feature: adds threshold and max_categories parameter to RobustOrdinal…
Browse files Browse the repository at this point in the history
…Encoder (#26)

* feature: adds ThresholdOrdinalEncoder to preprocessors

ThresholdOrdinalEncoder is based on sklearn.preprocessing.OrdinalEncoder. This encoder only encodes categories whose frequencies are above the given `threshold` and only encodes up to `max_categories`.

* feature: adds 'threshold' and 'max_categories' parameters to 'RobustOrdinalEncoder'

* change: removes ThresholdOrdinalEncoder

* change: modifies order of fit and category manipulation in RobustOrdinalEncoder

* fix: changes single quotes to double quotes to match style

* change: moves self.features_under_threshold_ initialization to fit in RobustOrdinalEncoder

* change: updated doc string and changed attribute name to `feature_idxs_no_categories_`

Co-authored-by: Isabel Panepento <[email protected]>
  • Loading branch information
ipanepen and Isabel Panepento authored Oct 15, 2020
1 parent c95ede5 commit 7c49356
Show file tree
Hide file tree
Showing 2 changed files with 133 additions and 10 deletions.
74 changes: 67 additions & 7 deletions src/sagemaker_sklearn_extension/preprocessing/encoders.py
Original file line number Diff line number Diff line change
Expand Up @@ -477,6 +477,26 @@ class RobustOrdinalEncoder(OrdinalEncoder):
When unknown_as_nan is false, unknown values are transformed to n, where n-1 is the last category
When unknown_as_nan is true, unknown values are transformed to np.nan
threshold : 'auto' or float, default = 1
The threshold for encoding a value as its own label in the result. Default value 1. If `threshold='auto'`, the
maximum of `10` or `n_samples / 1000` where `n_samples` is the number of rows of input X is calculated as
the `threshold`. How this parameter is interpreted depends on whether it is more than or equal to or less
than 1.
- If `threshold` is more than or equal to one, it represents the number of times a value must appear to be
encoded as its own label in the result.
- If `threshold` is less than one, it represents the fraction of rows which must contain the value for it to be
encoded as its own label in the result. The value is rounded up, so if `threshold` is 0.255 and there are 100 rows, a
value must appear at least 26 times to be included.
max_categories : int or np.inf, default = np.inf
Maximum number of categories to encode per feature. Default value is np.inf and does not place an upper bound on
the number of categories. If the number of observed categories is greater than ``max_categories``, the encoder
will take the top ``max_categories`` observed categories, sorted by count. All remaining values will be
encoded as the last category. Note this means that the number of categories will be ``max_categories + 1``.
In the case of a tie between categories, the category whose label is higher will be chosen.
Attributes
----------
Expand All @@ -485,6 +505,10 @@ class RobustOrdinalEncoder(OrdinalEncoder):
(in order of the features in X and corresponding with the output
of ``transform``).
feature_idxs_no_categories_ : list of ints
A list of indexes of features that have no categories with a frequency
greater than or equal to the value of ``threshold``.
Examples
--------
Given a dataset with two features, we let the encoder find the unique
Expand Down Expand Up @@ -513,11 +537,13 @@ class RobustOrdinalEncoder(OrdinalEncoder):
"""

def __init__(self, categories="auto", dtype=np.float32, unknown_as_nan=False):
def __init__(self, categories="auto", dtype=np.float32, unknown_as_nan=False, threshold=1, max_categories=np.inf):
super(RobustOrdinalEncoder, self).__init__(categories=categories, dtype=dtype)
self.categories = categories
self.dtype = dtype
self.unknown_as_nan = unknown_as_nan
self.threshold = threshold
self.max_categories = max_categories

def fit(self, X, y=None):
"""Fit the RobustOrdinalEncoder to X.
Expand All @@ -532,10 +558,41 @@ def fit(self, X, y=None):
self
"""
# sklearn.preprocessing._BaseEncoder uses _categories due to deprecations in other classes
# can be removed once deprecations are removed
self._categories = self.categories
self._fit(X, handle_unknown="unknown")
self._fit(X, handle_unknown="ignore")

assert self.max_categories >= 1

self.feature_idxs_no_categories_ = []

if isinstance(self.max_categories, int) or self.threshold != 1:
X_columns, n_samples, n_features = self._check_X(X)

if self.threshold == "auto":
threshold = max(10, n_samples / 1000)
elif self.threshold >= 1:
threshold = self.threshold
else:
threshold = ceil(self.threshold * n_samples)

for i in range(n_features):
dtype = X_columns[i].dtype
items, counts = np.unique(X_columns[i].astype(str), return_counts=True)
categories_to_encode = items[counts >= threshold].astype("O")
if categories_to_encode.size == 0:
warnings.warn(
"feature at index {} does not have any categories appearing more than {} {}".format(
i, threshold, "time" if threshold == 1 else "times"
)
)
# If no category is above the threshold, create an unknown category to prevent
# self._transform() from raising an IndexError
categories_to_encode = np.array(["unknown"])
self.feature_idxs_no_categories_.append(i)
if len(categories_to_encode) > self.max_categories:
most_freq_idxs = np.argsort(counts)[len(counts) - self.max_categories :]
categories_to_encode = items[most_freq_idxs]
self.categories_[i] = np.sort(categories_to_encode.astype(dtype))

return self

def transform(self, X):
Expand All @@ -552,19 +609,20 @@ def transform(self, X):
Transformed input.
"""
X_int, X_mask = self._transform(X, handle_unknown="unknown")
X_int, X_mask = self._transform(X, handle_unknown="ignore")
if self.unknown_as_nan:
# assign the unknowns np.nan
X_int = X_int.astype(self.dtype, copy=False)
X_int[~X_mask] = np.nan
X_int[:, self.feature_idxs_no_categories_] = np.nan
else:
# assign the unknowns an integer indicating they are unknown. The largest integer is always reserved for
# unknowns
for col in range(X_int.shape[1]):
mask = X_mask[:, col]
X_int[~mask, col] = self.categories_[col].shape[0]
X_int = X_int.astype(self.dtype, copy=False)

X_int[:, self.feature_idxs_no_categories_] = 0
return X_int

def inverse_transform(self, X):
Expand Down Expand Up @@ -620,6 +678,8 @@ def inverse_transform(self, X):
for idx, unknown_mask in found_unknown.items():
X_tr[unknown_mask, idx] = None

X_tr[:, self.feature_idxs_no_categories_] = None

return X_tr


Expand Down
69 changes: 66 additions & 3 deletions test/test_preprocessing_encoders.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,11 +214,14 @@ def test_na_label_encoder(y, y_expected):
np.testing.assert_array_equal(y_transform, y_expected)


def test_robust_ordinal_encoding_categories():
encoder = RobustOrdinalEncoder()
@pytest.mark.parametrize(
"threshold, expected", ([1, ordinal_expected_categories_], [2, [{"hot dog"}, {"1", "3"}, {"a", "b"}]])
)
def test_robust_ordinal_encoding_categories(threshold, expected):
encoder = RobustOrdinalEncoder(threshold=threshold)
encoder.fit(ordinal_data)
for i, cat in enumerate(encoder.categories_):
assert set(cat) == set(ordinal_expected_categories_[i])
assert set(cat) == set(expected[i])


@pytest.mark.parametrize("unknown_as_nan", (True, False))
Expand All @@ -234,6 +237,50 @@ def test_robust_ordinal_encoding_transform(unknown_as_nan):
assert all(list(encoded[-1] == 3))


def test_robust_ordinal_encoding_transform_threshold():
    """Check the codes produced by ``transform`` when ``threshold`` is set."""

    def check_code_bounds(codes):
        # First column must stay below 2, every other column below 3.
        assert all(codes[:, 0] < 2)
        assert all((codes[:, 1:] < 3).ravel())

    # Some categories fall below the threshold.
    pruned = RobustOrdinalEncoder(threshold=2)
    pruned.fit(ordinal_data)
    check_code_bounds(pruned.transform(ordinal_data))

    # Same encoder, but the input now carries an extra row of values that
    # were not part of the fitted data.
    unseen_row = np.array([["waffle", 1213, np.nan]])
    test_data = np.concatenate([ordinal_data, unseen_row], axis=0)
    check_code_bounds(pruned.transform(test_data))

    # Every category falls below the threshold.
    strict = RobustOrdinalEncoder(threshold=10)
    strict.fit(ordinal_data)
    assert len(strict.feature_idxs_no_categories_) == 3
    all_codes = strict.transform(test_data)
    assert np.all(all_codes == 0)


def test_robust_ordinal_encoding_transform_max_categories():
    """Check that ``max_categories`` caps the categories kept per feature."""
    # Far more distinct values than max_categories: in this single column
    # 0-99 appear three times, 100-149 twice and 150-199 once.
    column = list(range(200)) + list(range(150)) + list(range(100))
    data = np.array([column]).T
    capped = RobustOrdinalEncoder(max_categories=100)
    capped.fit(data)
    kept = capped.categories_[0]
    assert len(kept) == 100
    assert all(kept <= 100)
    codes = capped.transform(data)
    distinct_codes, code_counts = np.unique(codes, return_counts=True)
    assert len(distinct_codes) == capped.max_categories + 1
    assert (code_counts == 3).sum() == 100

    # Number of distinct values equal to max_categories.
    exact = RobustOrdinalEncoder(max_categories=2)
    exact.fit(np.array([["x", "y"], ["y", "x"]]))
    for feature_idx in (0, 1):
        assert len(exact.categories_[feature_idx]) == 2
    codes = exact.transform([["x", "y"], ["z", "z"]])
    assert np.all(codes[1] == 2)
    assert np.all(codes[0] == [0, 1])


@pytest.mark.parametrize("unknown_as_nan", (True, False))
def test_robust_ordinal_encoding_inverse_transform(unknown_as_nan):
encoder = RobustOrdinalEncoder(unknown_as_nan=unknown_as_nan)
Expand All @@ -244,6 +291,22 @@ def test_robust_ordinal_encoding_inverse_transform(unknown_as_nan):
assert np.array_equal(ordinal_data, reverse[:-1])
assert all([x is None for x in reverse[-1]])

# Test where some categories are below the threshold
encoder = RobustOrdinalEncoder(unknown_as_nan=unknown_as_nan, threshold=2)
encoder.fit(ordinal_data)
encoded = encoder.transform(test_data)
reverse = encoder.inverse_transform(encoded)
assert sum([i is None for i in reverse[:, 0]]) == 3
assert sum([i is None for i in reverse[:, 1]]) == 2
assert sum([i is None for i in reverse[:, 2]]) == 2

# Test where all categories are below the threshold
encoder = RobustOrdinalEncoder(unknown_as_nan=unknown_as_nan, threshold=10)
encoder.fit(ordinal_data)
encoded = encoder.transform(test_data)
reverse = encoder.inverse_transform(encoded)
assert sum(([i is None for i in reverse.flatten()])) == reverse.size


def test_robust_ordinal_encoding_inverse_transform_floatkeys():
encoder = RobustOrdinalEncoder()
Expand Down

0 comments on commit 7c49356

Please sign in to comment.