feature: adds threshold and max_categories parameter to RobustOrdinal…

…Encoder (#26) * feature: adds ThresholdOrdinalEncoder to preprocessors ThresholdOrdinalEncoder is based on sklearn.preprocessing.OrdinalEncoder. This encoder only encodes categories whose frequencies are above the given `threshold` and only encodes up to `max_categories`. * feature: adds 'threshold' and 'max_categories' parameters to 'RobustOrdinalEncoder' * change: removes ThresholdOrdinalEncoder * change: modifies order of fit and category manipulation in RobustOrdinalEncoder * fix: changes single quotes to double quotes to match style * change: moves self.features_under_threshold_ initialization to fit in RobustOrdinalEncoder * change: updated doc string and changed attribute name to `feature_idxs_no_categories_` Co-authored-by: Isabel Panepento <[email protected]>
aws · Oct 15, 2020 · 7c49356 · 7c49356
1 parent c95ede5
commit 7c49356
Show file tree

Hide file tree

Showing 2 changed files with 133 additions and 10 deletions.
diff --git a/src/sagemaker_sklearn_extension/preprocessing/encoders.py b/src/sagemaker_sklearn_extension/preprocessing/encoders.py
@@ -477,6 +477,26 @@ class RobustOrdinalEncoder(OrdinalEncoder):
         When unknown_as_nan is false, unknown values are transformed to n, where n-1 is the last category
         When unknown_as_nan is true, unknown values are transformed to np.nan
 
+    threshold : 'auto' or float, default = 1
+        The threshold for encoding a value as its own label in the result. Default value 1. If `threshold='auto'`, the
+        maximum of `10` or `n_samples / 1000`, where `n_samples` is the number of rows of input X, is calculated as
+        the `threshold`. How this parameter is interpreted depends on whether it is more than or equal to or less
+        than 1.
+
+        - If `threshold` is more than or equal to one, it represents the number of times a value must appear to be
+          encoded as its own category in the result.
+
+        - If `threshold` is less than one, it represents the fraction of rows which must contain the value for it to be
+          encoded as its own category in the result. The value is rounded up, so if `threshold` is 0.255 and there are
+          100 rows, a value must appear at least 26 times to be included.
+
+    max_categories : int or np.inf, default = np.inf
+        Maximum number of categories to encode per feature. Default value is np.inf and does not place an upper bound on
+        the number of categories. If the number of observed categories is greater than ``max_categories``, the encoder
+        will take the top ``max_categories`` observed categories, sorted by count. All remaining values will be
+        encoded as the last category. Note this means that the number of categories will be ``max_categories + 1``.
+        In the case of a tie between categories, the category whose label is higher will be chosen.
+
 
     Attributes
     ----------
@@ -485,6 +505,10 @@ class RobustOrdinalEncoder(OrdinalEncoder):
         (in order of the features in X and corresponding with the output
         of ``transform``).
 
+    feature_idxs_no_categories_ : list of ints
+        A list of indexes of features that have no categories with a frequency
+        greater than or equal to the value of ``threshold``.
+
     Examples
     --------
     Given a dataset with two features, we let the encoder find the unique
@@ -513,11 +537,13 @@ class RobustOrdinalEncoder(OrdinalEncoder):
 
     """
 
-    def __init__(self, categories="auto", dtype=np.float32, unknown_as_nan=False):
+    def __init__(self, categories="auto", dtype=np.float32, unknown_as_nan=False, threshold=1, max_categories=np.inf):
         super(RobustOrdinalEncoder, self).__init__(categories=categories, dtype=dtype)
         self.categories = categories
         self.dtype = dtype
         self.unknown_as_nan = unknown_as_nan
+        self.threshold = threshold
+        self.max_categories = max_categories
 
     def fit(self, X, y=None):
         """Fit the RobustOrdinalEncoder to X.
@@ -532,10 +558,41 @@ def fit(self, X, y=None):
         self
 
         """
-        # sklearn.preprocessing._BaseEncoder uses _categories due to deprecations in other classes
-        # can be removed once deprecations are removed
-        self._categories = self.categories
-        self._fit(X, handle_unknown="unknown")
+        self._fit(X, handle_unknown="ignore")
+
+        assert self.max_categories >= 1
+
+        self.feature_idxs_no_categories_ = []
+
+        if isinstance(self.max_categories, int) or self.threshold != 1:
+            X_columns, n_samples, n_features = self._check_X(X)
+
+            if self.threshold == "auto":
+                threshold = max(10, n_samples / 1000)
+            elif self.threshold >= 1:
+                threshold = self.threshold
+            else:
+                threshold = ceil(self.threshold * n_samples)
+
+            for i in range(n_features):
+                dtype = X_columns[i].dtype
+                items, counts = np.unique(X_columns[i].astype(str), return_counts=True)
+                categories_to_encode = items[counts >= threshold].astype("O")
+                if categories_to_encode.size == 0:
+                    warnings.warn(
+                        "feature at index {} does not have any categories appearing more than {} {}".format(
+                            i, threshold, "time" if threshold == 1 else "times"
+                        )
+                    )
+                    # If no category is above the threshold, create an unknown category to prevent
+                    # self._transform() from raising an IndexError
+                    categories_to_encode = np.array(["unknown"])
+                    self.feature_idxs_no_categories_.append(i)
+                if len(categories_to_encode) > self.max_categories:
+                    most_freq_idxs = np.argsort(counts)[len(counts) - self.max_categories :]
+                    categories_to_encode = items[most_freq_idxs]
+                self.categories_[i] = np.sort(categories_to_encode.astype(dtype))
+
         return self
 
     def transform(self, X):
@@ -552,19 +609,20 @@ def transform(self, X):
             Transformed input.
 
         """
-        X_int, X_mask = self._transform(X, handle_unknown="unknown")
+        X_int, X_mask = self._transform(X, handle_unknown="ignore")
         if self.unknown_as_nan:
             # assign the unknowns np.nan
             X_int = X_int.astype(self.dtype, copy=False)
             X_int[~X_mask] = np.nan
+            X_int[:, self.feature_idxs_no_categories_] = np.nan
         else:
             # assign the unknowns an integer indicating they are unknown. The largest integer is always reserved for
             # unknowns
             for col in range(X_int.shape[1]):
                 mask = X_mask[:, col]
                 X_int[~mask, col] = self.categories_[col].shape[0]
             X_int = X_int.astype(self.dtype, copy=False)
-
+            X_int[:, self.feature_idxs_no_categories_] = 0
         return X_int
 
     def inverse_transform(self, X):
@@ -620,6 +678,8 @@ def inverse_transform(self, X):
             for idx, unknown_mask in found_unknown.items():
                 X_tr[unknown_mask, idx] = None
 
+        X_tr[:, self.feature_idxs_no_categories_] = None
+
         return X_tr
 
 

diff --git a/test/test_preprocessing_encoders.py b/test/test_preprocessing_encoders.py
@@ -214,11 +214,14 @@ def test_na_label_encoder(y, y_expected):
     np.testing.assert_array_equal(y_transform, y_expected)
 
 
-def test_robust_ordinal_encoding_categories():
-    encoder = RobustOrdinalEncoder()
+@pytest.mark.parametrize(
+    "threshold, expected", ([1, ordinal_expected_categories_], [2, [{"hot dog"}, {"1", "3"}, {"a", "b"}]])
+)
+def test_robust_ordinal_encoding_categories(threshold, expected):
+    encoder = RobustOrdinalEncoder(threshold=threshold)
     encoder.fit(ordinal_data)
     for i, cat in enumerate(encoder.categories_):
-        assert set(cat) == set(ordinal_expected_categories_[i])
+        assert set(cat) == set(expected[i])
 
 
 @pytest.mark.parametrize("unknown_as_nan", (True, False))
@@ -234,6 +237,50 @@ def test_robust_ordinal_encoding_transform(unknown_as_nan):
         assert all(list(encoded[-1] == 3))
 
 
+def test_robust_ordinal_encoding_transform_threshold():
+    # Test where some categories are below the threshold
+    encoder = RobustOrdinalEncoder(threshold=2)
+    encoder.fit(ordinal_data)
+    encoded = encoder.transform(ordinal_data)
+    assert all(list(encoded[:, 0] < 2))
+    assert all(list((encoded[:, 1:] < 3).reshape((-1,))))
+
+    # Test where some categories are below the threshold and new categories are introduced in transformation
+    test_data = np.concatenate([ordinal_data, np.array([["waffle", 1213, np.nan]])], axis=0)
+    encoded = encoder.transform(test_data)
+    assert all(list(encoded[:, 0] < 2))
+    assert all(list((encoded[:, 1:] < 3).reshape((-1,))))
+
+    # Test where all categories are below the threshold
+    encoder = RobustOrdinalEncoder(threshold=10)
+    encoder.fit(ordinal_data)
+    assert len(encoder.feature_idxs_no_categories_) == 3
+    encoded = encoder.transform(test_data)
+    assert np.all(encoded == 0)
+
+
+def test_robust_ordinal_encoding_transform_max_categories():
+    # Test where number of categories is much larger than max_categories
+    data = np.array([[i for i in range(200)] + [i for i in range(150)] + [i for i in range(100)]]).T
+    encoder = RobustOrdinalEncoder(max_categories=100)
+    encoder.fit(data)
+    assert len(encoder.categories_[0]) == 100
+    assert all(list(encoder.categories_[0] <= 100))
+    encoded = encoder.transform(data)
+    cats, frequencies = np.unique(encoded, return_counts=True)
+    assert len(cats) == encoder.max_categories + 1
+    assert sum(frequencies == 3) == 100
+
+    # Test where number of categories is equal to max categories
+    encoder = RobustOrdinalEncoder(max_categories=2)
+    encoder.fit(np.array([["x", "y"], ["y", "x"]]))
+    assert len(encoder.categories_[0]) == 2
+    assert len(encoder.categories_[1]) == 2
+    encoded = encoder.transform([["x", "y"], ["z", "z"]])
+    assert np.all(encoded[1] == 2)
+    assert np.all(encoded[0] == [0, 1])
+
+
 @pytest.mark.parametrize("unknown_as_nan", (True, False))
 def test_robust_ordinal_encoding_inverse_transform(unknown_as_nan):
     encoder = RobustOrdinalEncoder(unknown_as_nan=unknown_as_nan)
@@ -244,6 +291,22 @@ def test_robust_ordinal_encoding_inverse_transform(unknown_as_nan):
     assert np.array_equal(ordinal_data, reverse[:-1])
     assert all([x is None for x in reverse[-1]])
 
+    # Test where some categories are below the threshold
+    encoder = RobustOrdinalEncoder(unknown_as_nan=unknown_as_nan, threshold=2)
+    encoder.fit(ordinal_data)
+    encoded = encoder.transform(test_data)
+    reverse = encoder.inverse_transform(encoded)
+    assert sum([i is None for i in reverse[:, 0]]) == 3
+    assert sum([i is None for i in reverse[:, 1]]) == 2
+    assert sum([i is None for i in reverse[:, 2]]) == 2
+
+    # Test where all categories are below the threshold
+    encoder = RobustOrdinalEncoder(unknown_as_nan=unknown_as_nan, threshold=10)
+    encoder.fit(ordinal_data)
+    encoded = encoder.transform(test_data)
+    reverse = encoder.inverse_transform(encoded)
+    assert sum(([i is None for i in reverse.flatten()])) == reverse.size
+
 
 def test_robust_ordinal_encoding_inverse_transform_floatkeys():
     encoder = RobustOrdinalEncoder()