Commit

Markus Semmler committed Apr 11, 2023
1 parent 2aed995 commit 65599cc
Showing 3 changed files with 68 additions and 5 deletions.
5 changes: 3 additions & 2 deletions src/pydvl/utils/numeric.py
@@ -156,14 +156,15 @@ def random_powerset_group_conditional(
     if n_samples is None:
         n_samples = np.iinfo(np.int32).max
 
+    unique_labels = np.unique(labels)
     while total <= n_samples:
 
         subsets: List[NDArray[T]] = []
-        for label in labels:
+        for label in unique_labels:
             label_indices = np.asarray(np.where(labels == label)[0])
             subset_length = int(
                 rng.integers(
-                    min(min_elements, len(label_indices) - 1), len(label_indices)
+                    min(min_elements, len(label_indices)), len(label_indices) + 1
                 )
             )
             subsets.append(random_subset_of_size(s[label_indices], subset_length))
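For context (not part of the commit): np.random.Generator.integers(low, high) draws from the half-open interval [low, high), so the old bounds could never yield the full group size and always produced an empty subset for single-element groups, while the corrected bounds allow any size from min_elements (capped at the group size) up to and including the whole group. A minimal sketch of the difference:

import numpy as np

rng = np.random.default_rng(42)
label_indices = np.arange(5)  # indices of a group with five members
min_elements = 1

# Old bounds: the exclusive upper limit means the full group (size 5) is never drawn.
old_size = rng.integers(min(min_elements, len(label_indices) - 1), len(label_indices))

# New bounds: sizes range from min_elements up to and including the group size.
new_size = rng.integers(min(min_elements, len(label_indices)), len(label_indices) + 1)

print(old_size, new_size)  # old_size is in {1, ..., 4}, new_size is in {1, ..., 5}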
2 changes: 1 addition & 1 deletion src/pydvl/value/shapley/classwise.py
@@ -151,7 +151,7 @@ def _class_wise_shapley_worker(
     final_score = u(train_set)
     prev_score = 0.0
 
-    for i, _ in enumerate(label_set):
+    for i in range(len(permutation_label_set)):
 
         if np.abs(prev_score - final_score) < eps:
             score = prev_score
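The loop being corrected here follows the usual truncation pattern from permutation-based Shapley sampling: walk along the permuted indices and stop evaluating the utility once the running score is within eps of the score on the full training set, since further marginal contributions are negligible. A self-contained sketch of that pattern, with a hypothetical utility callable standing in for pydvl's internal machinery:

import numpy as np

def marginals_along_permutation(utility, permutation, full_score, eps=1e-4):
    # Marginal contributions of each index along one permutation, truncated
    # once the running score is within eps of the utility of the full set.
    values = np.zeros(len(permutation), dtype=float)
    prev_score = 0.0
    for i, idx in enumerate(permutation):
        if np.abs(prev_score - full_score) < eps:
            score = prev_score  # truncate: adding more points barely changes the utility
        else:
            score = utility(permutation[: i + 1])
        values[idx] = score - prev_score
        prev_score = score
    return values

Here utility would be any callable mapping a set of training indices to a score; the commit's fix only changes which index set the real worker iterates over, not the pattern itself.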
66 changes: 64 additions & 2 deletions tests/value/shapley/test_classwise.py
@@ -9,9 +9,15 @@
 import numpy as np
 import pytest
 from numpy._typing import NDArray
+from sklearn.metrics import accuracy_score
 
-from pydvl.utils import SupervisedModel
-from pydvl.value.shapley.classwise import _estimate_in_out_cls_accuracy
+from pydvl.utils import Dataset, SupervisedModel, Utility
+from pydvl.value import MaxChecks
+from pydvl.value.shapley.classwise import (
+    CSScorer,
+    _class_wise_shapley_worker,
+    _estimate_in_out_cls_accuracy,
+)
 
 
 @pytest.fixture(scope="function")
@@ -50,3 +56,59 @@ def test_estimate_in_out_cls_accuracy(
     in_cls_acc_1, out_of_cls_acc_1 = _estimate_in_out_cls_accuracy(mock_model, x, y, 1)
     assert in_cls_acc_1 == out_of_cls_acc_0
     assert in_cls_acc_0 == out_of_cls_acc_1
+
+
+@pytest.fixture(scope="function")
+def dataset_cs_shapley() -> Dataset:
+    """
+    A simple dataset for testing the class wise shapley value.
+    """
+    x_train = np.arange(1, 5).reshape([-1, 1])
+    y_train = np.array([0, 0, 1, 1])
+    x_test = x_train
+    y_test = np.array([0, 0, 0, 1])
+    return Dataset(x_train, y_train, x_test, y_test)
+
+
+@pytest.fixture(scope="function")
+def linear_regression_classifier() -> SupervisedModel:
+    """
+    A classifier based on linear regression, so that a closed form solution exists
+    """
+
+    class _LinearRegressionBasedClassifier(SupervisedModel):
+        def __init__(self):
+            self._beta = None
+
+        def fit(self, x: NDArray, y: NDArray) -> float:
+            v = x[:, 0]
+            self._beta = np.dot(v, y) / np.dot(v, v)
+            return -1
+
+        def predict(self, x: NDArray) -> NDArray:
+            if self._beta is None:
+                raise AttributeError("Model not fitted")
+
+            x = x[:, 0]
+            probs = self._beta * x
+            return np.clip(np.round(probs), 0, 1).astype(int)
+
+        def score(self, x: NDArray, y: NDArray) -> float:
+            pred_y = self.predict(x)
+            return np.sum(pred_y == y) / 4
+
+    return _LinearRegressionBasedClassifier()
+
+
+def test_cs_shapley_exact_solution(
+    dataset_cs_shapley: Dataset, linear_regression_classifier: SupervisedModel
+):
+    n_samples = 100
+    scorer = CSScorer()
+    utility = Utility(
+        linear_regression_classifier, dataset_cs_shapley, scorer, catch_errors=False
+    )
+    valuation_result = _class_wise_shapley_worker(
+        dataset_cs_shapley.indices, utility, done=MaxChecks(n_samples)
+    )
+    print(valuation_result)
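As a quick sanity check on the new fixtures (own arithmetic, not part of the commit): with no intercept term, the closed-form slope fitted on the fixture's training data is beta = x·y / x·x = 7/30, and the resulting predictions can be verified by hand against the test labels:

import numpy as np

x = np.arange(1, 5)                    # fixture inputs: [1, 2, 3, 4]
y_train = np.array([0, 0, 1, 1])
y_test = np.array([0, 0, 0, 1])

beta = np.dot(x, y_train) / np.dot(x, x)               # 7 / 30 ≈ 0.2333
preds = np.clip(np.round(beta * x), 0, 1).astype(int)  # [0, 0, 1, 1]

print(np.mean(preds == y_test))  # 0.75, the score the fixture classifier reports on the test split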
