From 4bd92ec7cf3aafca1e656787429d59fda57d999f Mon Sep 17 00:00:00 2001
From: Markus Semmler <markus.semmler@unternehmertum.de>
Date: Sat, 12 Aug 2023 23:21:28 +0200
Subject: [PATCH] Implement algorithm from paper `CS-Shapley: Class-wise
 Shapley Values for Data Valuation in Classification`
 (https://arxiv.org/abs/2211.06800)

---
 CHANGELOG.md                          |   2 +
 docs/30-data-valuation.rst            |  46 ++
 src/pydvl/utils/config.py             |   1 +
 src/pydvl/utils/dataset.py            |   9 +-
 src/pydvl/utils/numeric.py            |  84 ++-
 src/pydvl/utils/score.py              | 151 ++++-
 src/pydvl/utils/util.py               |  14 +
 src/pydvl/value/result.py             |  26 +-
 src/pydvl/value/shapley/__init__.py   |   1 +
 src/pydvl/value/shapley/classwise.py  | 251 ++++++++
 src/pydvl/value/shapley/montecarlo.py | 187 +++++-
 src/pydvl/value/shapley/truncated.py  |  22 +-
 src/pydvl/value/stopping.py           |  10 +-
 tests/conftest.py                     |  24 +-
 tests/misc.py                         |  36 ++
 tests/utils/conftest.py               |  22 +
 tests/utils/test_numeric.py           |  27 +
 tests/utils/test_score.py             | 126 ++++-
 tests/value/shapley/test_classwise.py | 786 ++++++++++++++++++++++++++
 19 files changed, 1780 insertions(+), 45 deletions(-)
 create mode 100644 src/pydvl/utils/util.py
 create mode 100644 src/pydvl/value/shapley/classwise.py
 create mode 100644 tests/misc.py
 create mode 100644 tests/value/shapley/test_classwise.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 14aca878a..f625f407c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -27,6 +27,8 @@
   [PR #382](https://github.com/appliedAI-Initiative/pyDVL/pull/382)
 - Decouple ray.init from ParallelConfig 
   [PR #373](https://github.com/appliedAI-Initiative/pyDVL/pull/383)
+- **New Method**: Add classwise Shapley algorithm. 
+  [PR #338](https://github.com/appliedAI-Initiative/pyDVL/pull/338)
 
 ## 0.6.1 - 🏗 Bug fixes and small improvement
 
diff --git a/docs/30-data-valuation.rst b/docs/30-data-valuation.rst
index b3fd5018e..565f17906 100644
--- a/docs/30-data-valuation.rst
+++ b/docs/30-data-valuation.rst
@@ -359,6 +359,52 @@ useful in applications.
        u=utility, mode="truncated_montecarlo", done=MaxUpdates(1000)
    )
 
+Classwise Shapley
+^^^^^^^^^^^^^^^^^^
+
+A different schema applicable for classification problems first appeared in
+:footcite:t:`schoch_csshapley_2022`. The key insight is that samples can be beneficial
+for overall performance, while being detrimental for their own class. This could be an
+indication of some problem with the data. CS-Shapley changes the utility to account for
+this effect by decomposing it into a product of two functions: one gives
+priority to in-class accuracy, while the other adds a slight discount which
+increases as the out-of-class accuracy increases.
+
+The value is computed as:
+
+$$
+v_u(x_i) \approx \frac{1}{K \cdot L}
+\sum_{S^{(k)}_{-y_i} \subseteq T_{-y_i} \setminus \{i\}}
+\sum_{\sigma^{(l)} \in \Pi(T_{y_i} \setminus \{i\})}
+[u( \sigma^{(l)}_{\colon i} \cup \{i\} | S^{(k)}_{-y_i} )
+- u( \sigma^{(l)}_{\colon i} | S^{(k)}_{-y_i})]
+$$
+
+where $K$ is the number of subsets $S^{(k)}_{-y_i}$ sampled from the class complement
+set $T_{-y_i}$ of class $y_i$ and $L$ is the number of permutations sampled from the
+class indices set $T_{y_i}$. The scoring function used has the form
+
+$$u(S_{y_i}|S_{-y_i}) = a_S(D_{y_i}) \exp\{a_S(D_{-y_i})\}.$$
+
+This can be further customised, but that form is shown by the authors to have certain
+desirable properties.
+
+.. code-block:: python
+
+   from pydvl.utils import ClasswiseScorer, Dataset, Utility
+   from pydvl.value.shapley import HistoryDeviation, compute_classwise_shapley_values
+
+   model = ...
+   scoring = ClasswiseScorer("accuracy")
+   data = Dataset(...)
+   utility = Utility(model, data, scoring)
+   values = compute_classwise_shapley_values(
+       utility,
+       done=HistoryDeviation(n_steps=500, rtol=1e-3),
+       truncation=...,  # a TruncationPolicy; this keyword argument is required
+       n_resample_complement_sets=10, normalize_values=True,
+   )
+
 
 Exact Shapley for KNN
 ^^^^^^^^^^^^^^^^^^^^^
diff --git a/src/pydvl/utils/config.py b/src/pydvl/utils/config.py
index 36b9ab647..675c1df02 100644
--- a/src/pydvl/utils/config.py
+++ b/src/pydvl/utils/config.py
@@ -25,6 +25,7 @@ class ParallelConfig:
     address: Optional[Union[str, Tuple[str, int]]] = None
     n_cpus_local: Optional[int] = None
     logging_level: int = logging.WARNING
+    _temp_dir: Optional[str] = None
 
     def __post_init__(self) -> None:
         if self.address is not None and self.n_cpus_local is not None:
diff --git a/src/pydvl/utils/dataset.py b/src/pydvl/utils/dataset.py
index 980957cbc..cab59416b 100644
--- a/src/pydvl/utils/dataset.py
+++ b/src/pydvl/utils/dataset.py
@@ -222,6 +222,10 @@ def indices(self):
         """
         return self._indices
 
+    @indices.setter
+    def indices(self, indices: np.ndarray) -> None:
+        self._indices = indices
+
     @property
     def data_names(self):
         """Names of each individual datapoint.
@@ -410,11 +414,6 @@ def __init__(
     def __len__(self):
         return len(self.groups)
 
-    @property
-    def indices(self):
-        """Indices of the groups."""
-        return self._indices
-
     # FIXME this is a misnomer, should be `names` in `Dataset` so that here it
     #  makes sense
     @property
diff --git a/src/pydvl/utils/numeric.py b/src/pydvl/utils/numeric.py
index c639da82b..5e5904c56 100644
--- a/src/pydvl/utils/numeric.py
+++ b/src/pydvl/utils/numeric.py
@@ -4,8 +4,22 @@
 """
 from __future__ import annotations
 
+import logging
+import os
+import random
+import time
 from itertools import chain, combinations
-from typing import Collection, Generator, Iterator, Optional, Tuple, TypeVar, overload
+from typing import (
+    Collection,
+    Generator,
+    Iterator,
+    List,
+    Optional,
+    Tuple,
+    TypeVar,
+    cast,
+    overload,
+)
 
 import numpy as np
 from numpy.typing import NDArray
@@ -17,10 +31,15 @@
     "random_matrix_with_condition_number",
     "random_subset",
     "random_powerset",
+    "random_powerset_group_conditional",
     "random_subset_of_size",
     "top_k_value_accuracy",
 ]
 
+
+logger = logging.getLogger(__name__)
+
+
 T = TypeVar("T", bound=np.generic)
 
 
@@ -110,6 +129,69 @@ def random_powerset(
         total += 1
 
 
+def random_powerset_group_conditional(
+    s: NDArray[T],
+    groups: NDArray[np.int_],
+    min_elements_per_group: int = 1,
+) -> Generator[NDArray[T], None, None]:
+    """
+    Draw infinite random group-conditional subsets from the passed set s. Each sampled
+    set contains at least ``min_elements_per_group`` elements of every unique group
+    (capped at the group size). Groups are given as one integer per element of s.
+
+    :param s: Vector of size N representing the set to sample elements from.
+    :param groups: Vector of size N containing the group as an integer for each element.
+    :param min_elements_per_group: The minimum number of elements for each group.
+    :return: Generated draw from the power set of s with ``min_elements_per_group`` of
+        each group. An empty draw is an empty array with the dtype of ``s``.
+    :raises: TypeError: If the data ``s`` or ``groups`` is not a NumPy array.
+    :raises: ValueError: If the lengths of ``s`` and ``groups`` differ or
+        ``min_elements_per_group`` is smaller than 0.
+    """
+    if not isinstance(s, np.ndarray):
+        raise TypeError("Set must be an NDArray")
+
+    if not isinstance(groups, np.ndarray):
+        raise TypeError("Labels must be an NDArray")
+
+    if len(groups) != len(s):
+        raise ValueError("Set and labels have to be of same size.")
+
+    if min_elements_per_group < 0:
+        raise ValueError(
+            f"Parameter min_elements={min_elements_per_group} needs to be bigger or equal to 0."
+        )
+
+    if min_elements_per_group == 0:
+        logger.warning(
+            "It is recommended to ensure at least one element of each group is"
+            " contained in the sampled and yielded set."
+        )
+
+    rng = np.random.default_rng()
+    unique_labels = np.unique(groups)
+
+    while True:
+        subsets: List[NDArray[T]] = []
+        for label in unique_labels:
+            label_indices = np.asarray(np.where(groups == label)[0])
+            subset_length = int(
+                rng.integers(
+                    min(min_elements_per_group, len(label_indices)),
+                    len(label_indices) + 1,
+                )
+            )
+            if subset_length > 0:
+                subsets.append(random_subset_of_size(s[label_indices], subset_length))
+
+        if len(subsets) > 0:
+            subset = np.concatenate(tuple(subsets))
+            rng.shuffle(subset)
+            yield subset
+        else:
+            yield np.array([], dtype=s.dtype)  # empty draw keeps the dtype of ``s``
+
+
 def random_subset_of_size(s: NDArray[T], size: int) -> NDArray[T]:
     """Samples a random subset of given size uniformly from the powerset
     of ``s``.
diff --git a/src/pydvl/utils/score.py b/src/pydvl/utils/score.py
index 933706d98..ca02b1ccb 100644
--- a/src/pydvl/utils/score.py
+++ b/src/pydvl/utils/score.py
@@ -2,7 +2,7 @@
 This module provides a :class:`Scorer` class that wraps scoring functions with
 additional information.
 
-Scorers can be constructed in the same way as in scikit-learn: either from 
+Scorers can be constructed in the same way as in scikit-learn: either from
 known strings or from a callable. Greater values must be better. If they are not,
 a negated version can be used, see scikit-learn's `make_scorer()
 <https://scikit-learn.org/stable/modules/generated/sklearn.metrics.make_scorer.html>`_.
@@ -17,11 +17,17 @@
 import numpy as np
 from numpy.typing import NDArray
 from scipy.special import expit
-from sklearn.metrics import get_scorer
+from sklearn.metrics import accuracy_score, get_scorer, make_scorer
 
 from pydvl.utils.types import SupervisedModel
 
-__all__ = ["Scorer", "compose_score", "squashed_r2", "squashed_variance"]
+__all__ = [
+    "Scorer",
+    "ClasswiseScorer",
+    "compose_score",
+    "squashed_r2",
+    "squashed_variance",
+]
 
 
 class ScorerCallable(Protocol):
@@ -58,7 +64,7 @@ class Scorer:
     def __init__(
         self,
         scoring: Union[str, ScorerCallable],
-        default: float = np.nan,
+        default: float = 0.0,
         range: Tuple = (-np.inf, np.inf),
         name: Optional[str] = None,
     ):
@@ -81,6 +87,143 @@ def __repr__(self):
         return f"{capitalized_name} (scorer={self._scorer})"
 
 
+class ClasswiseScorer(Scorer):
+    """A Scorer which is applicable for valuation in classification problems. Its value
+    is based on in-cls and out-of-cls score :footcite:t:`schoch_csshapley_2022`. For
+    each class ``label`` it separates the elements into two groups, namely in-cls
+    instances and out-of-cls instances. The value function itself then estimates the
+    in-cls metric discounted by the out-of-cls metric. In other words the value function
+    for each element of one class is conditioned on the out-of-cls instances (or a
+    subset of it). The form of the value function can be written as
+
+    .. math::
+        v_{y_i}(D) = f(a_S(D_{y_i})) * g(a_S(D_{-y_i}))
+
+    where D is the test set and f and g have to be continuous, monotonic functions in
+    order to produce meaningful results. For further reference see also section four
+    of :footcite:t:`schoch_csshapley_2022`.
+
+    :param scoring: Name of the scoring function, forwarded to :class:`Scorer`.
+    :param default: Score used when a model cannot be fit, e.g. when too little data is
+        passed, or errors arise.
+    :param range: Numerical range of the score function. Some Monte Carlo methods can
+        use this to estimate the number of samples required for a certain quality of
+        approximation. If not provided, it can be read from the ``scoring`` object if it
+        provides it, for instance if it was constructed with
+        :func:`~pydvl.utils.score.compose_score`.
+    :param in_class_discount_fn: Continuous, monotonic increasing function used to
+        discount the in-class score.
+    :param out_of_class_discount_fn: Continuous, monotonic increasing function used to
+        discount the out-of-class score.
+    :param initial_label: Set initial label (then the attribute ``label`` of the
+        ``ClasswiseScorer`` does not have to be set before the first call)
+    :param name: Name of the scorer. If not provided, the name of the passed
+        function will be prefixed by 'classwise '.
+
+    .. versionadded:: 0.7.0
+    """
+
+    def __init__(
+        self,
+        scoring: str = "accuracy",
+        default: float = 0.0,
+        range: Tuple[float, float] = (-np.inf, np.inf),
+        in_class_discount_fn: Callable[[float], float] = lambda x: x,
+        out_of_class_discount_fn: Callable[[float], float] = np.exp,
+        initial_label: Optional[int] = None,
+        name: Optional[str] = None,
+    ):
+        disc_score_in_cls = in_class_discount_fn(range[1])
+        disc_score_out_of_cls = out_of_class_discount_fn(range[1])
+        transformed_range = (0, disc_score_in_cls * disc_score_out_of_cls)
+        super().__init__(
+            scoring,
+            range=transformed_range,
+            default=default,
+            name=name or f"classwise {scoring}",
+        )
+        self._in_cls_discount_fn = in_class_discount_fn
+        self._out_of_cls_discount_fn = out_of_class_discount_fn
+        self.label = initial_label
+
+    def __str__(self):
+        return self._name
+
+    def __call__(
+        self: "ClasswiseScorer",
+        model: SupervisedModel,
+        x_test: NDArray[np.float_],
+        y_test: NDArray[np.int_],
+    ) -> float:
+        """
+        :param model: Model used for computing the score on the validation set.
+        :param x_test: Array containing the features of the classification problem.
+        :param y_test: Array containing the labels of the classification problem.
+        :return: Calculated score.
+        """
+        in_cls_score, out_of_cls_score = self.estimate_in_cls_and_out_of_cls_score(
+            model, x_test, y_test
+        )
+        disc_score_in_cls = self._in_cls_discount_fn(in_cls_score)
+        disc_score_out_of_cls = self._out_of_cls_discount_fn(out_of_cls_score)
+        return disc_score_in_cls * disc_score_out_of_cls
+
+    def estimate_in_cls_and_out_of_cls_score(
+        self,
+        model: SupervisedModel,
+        x_test: NDArray[np.float_],
+        y_test: NDArray[np.int_],
+        rescale_scores: bool = True,
+    ) -> Tuple[float, float]:
+        r"""
+        Computes in-class and out-of-class scores using the provided scoring function,
+        which can be expressed as:
+
+        .. math::
+            a_S(D=\{(\hat{x}_1, \hat{y}_1), \dots, (\hat{x}_K, \hat{y}_K)\}) &=
+            \frac{1}{K} \sum_k s(y(\hat{x}_k), \hat{y}_k)
+
+        In this context, the computation is performed twice: once on D_i and once on D_o
+        to calculate the in-class and out-of-class scores. Here, D_i contains only
+        samples with the specified 'label' from the validation set, while D_o contains
+        all other samples. By default, the scores are scaled to have the same order of
+        magnitude. In such cases, the raw scores are multiplied by:
+
+        .. math::
+            N_{y_i} = \frac{|D_{y_i}|}{|D_{y_i}|+|D_{-y_i}|} \quad \text{and}
+            \quad N_{-y_i} = \frac{|D_{-y_i}|}{|D_{y_i}|+|D_{-y_i}|}
+
+        :param model: Model used for computing the score on the validation set.
+        :param x_test: Array containing the features of the classification problem.
+        :param y_test: Array containing the labels of the classification problem.
+        :param rescale_scores: If set to True, the scores will be denormalized. This is
+            useful when the inner score is an estimator of the form 1/N sum_i x_i.
+        :return: Tuple containing the in-class and out-of-class scores. If no test sample
+            has the current label, ``(0, 1 / max(1, num_classes - 1))`` is returned.
+        """
+        scorer = self._scorer
+        label_set_match = y_test == self.label
+        label_set = np.where(label_set_match)[0]
+        num_classes = len(np.unique(y_test))
+
+        if len(label_set) == 0:
+            return 0, 1 / max(1, num_classes - 1)  # max() avoids a division by zero
+
+        complement_label_set = np.where(~label_set_match)[0]
+        in_cls_score = scorer(model, x_test[label_set], y_test[label_set])
+        out_of_cls_score = scorer(
+            model, x_test[complement_label_set], y_test[complement_label_set]
+        )
+
+        if rescale_scores:
+            n_in_cls = np.count_nonzero(y_test == self.label)
+            n_out_of_cls = len(y_test) - n_in_cls
+            in_cls_score *= n_in_cls / (n_in_cls + n_out_of_cls)
+            out_of_cls_score *= n_out_of_cls / (n_in_cls + n_out_of_cls)
+
+        return in_cls_score, out_of_cls_score
+
+
 def compose_score(
     scorer: Scorer,
     transformation: Callable[[float], float],
diff --git a/src/pydvl/utils/util.py b/src/pydvl/utils/util.py
new file mode 100644
index 000000000..d556b4d28
--- /dev/null
+++ b/src/pydvl/utils/util.py
@@ -0,0 +1,14 @@
+import numpy as np
+from numpy.typing import NDArray
+
+
+def arr_or_writeable_copy(arr: NDArray) -> NDArray:
+    """Hand back ``arr`` itself when it can be written to, else a fresh copy.
+
+    :param arr: Array whose ``writeable`` flag is inspected.
+    :return: ``arr`` unchanged if it is writeable, otherwise ``np.copy(arr)``.
+    """
+    if arr.flags.writeable:
+        return arr
+
+    return np.copy(arr)
diff --git a/src/pydvl/value/result.py b/src/pydvl/value/result.py
index 219b8ea90..f66a514e7 100644
--- a/src/pydvl/value/result.py
+++ b/src/pydvl/value/result.py
@@ -66,6 +66,7 @@
 from pydvl.utils.dataset import Dataset
 from pydvl.utils.numeric import running_moments
 from pydvl.utils.status import Status
+from pydvl.utils.util import arr_or_writeable_copy
 
 try:
     import pandas  # Try to import here for the benefit of mypy
@@ -234,8 +235,12 @@ def __init__(
 
         self._algorithm = algorithm
         self._status = Status(status)  # Just in case we are given a string
-        self._values = values
-        self._variances = np.zeros_like(values) if variances is None else variances
+        self._values = arr_or_writeable_copy(values)
+        self._variances = (
+            np.zeros_like(values)
+            if variances is None
+            else arr_or_writeable_copy(variances)
+        )
         self._counts = np.ones_like(values) if counts is None else counts
         self._sort_order = None
         self._extra_values = extra_values or {}
@@ -526,10 +531,14 @@ def __add__(self, other: "ValuationResult") -> "ValuationResult":
         xm[other_pos] = other._values
         vm[other_pos] = other._variances
 
+        # np.maximum(1, n + m) avoids a division by zero in the case n = m = 0
+        n_m_sum = np.maximum(1, n + m)
+
         # Sample mean of n+m samples from two means of n and m samples
-        xnm = (n * xn + m * xm) / (n + m)
+        xnm = (n * xn + m * xm) / n_m_sum
+
         # Sample variance of n+m samples from two sample variances of n and m samples
-        vnm = (n * (vn + xn**2) + m * (vm + xm**2)) / (n + m) - xnm**2
+        vnm = (n * (vn + xn**2) + m * (vm + xm**2)) / n_m_sum - xnm**2
 
         if np.any(vnm < 0):
             if np.any(vnm < -1e-6):
@@ -610,6 +619,15 @@ def update(self, idx: int, new_value: float) -> "ValuationResult":
         )
         return self
 
+    def scale(self, coefficient: float, indices: Optional[NDArray[IndexT]] = None):
+        """Scale values by ``coefficient`` and variances by ``coefficient**2`` in place.
+
+        :param coefficient: Coefficient to scale by.
+        :param indices: Indices to scale. If None, all values are scaled.
+        """
+        self._values[self._sort_positions[indices]] *= coefficient
+        self._variances[self._sort_positions[indices]] *= coefficient**2
+
     def get(self, idx: Integral) -> ValueItem:
         """Retrieves a ValueItem by data index, as opposed to sort index, like
         the indexing operator.
diff --git a/src/pydvl/value/shapley/__init__.py b/src/pydvl/value/shapley/__init__.py
index 6f93cd60e..db5802f25 100644
--- a/src/pydvl/value/shapley/__init__.py
+++ b/src/pydvl/value/shapley/__init__.py
@@ -8,6 +8,7 @@
 
 from ..result import *
 from ..stopping import *
+from .classwise import *
 from .common import *
 from .gt import *
 from .knn import *
diff --git a/src/pydvl/value/shapley/classwise.py b/src/pydvl/value/shapley/classwise.py
new file mode 100644
index 000000000..9dd9f5ad6
--- /dev/null
+++ b/src/pydvl/value/shapley/classwise.py
@@ -0,0 +1,251 @@
+"""
+Implementation of the algorithm :footcite:t:`schoch_csshapley_2022`.
+"""
+import logging
+import numbers
+from concurrent.futures import FIRST_COMPLETED, wait
+from copy import copy
+from typing import cast
+
+import numpy as np
+
+from pydvl.utils import (
+    ParallelConfig,
+    Utility,
+    effective_n_jobs,
+    init_executor,
+    init_parallel_backend,
+)
+
+__all__ = [
+    "compute_classwise_shapley_values",
+]
+
+from tqdm import tqdm
+
+from pydvl.utils.score import ClasswiseScorer
+from pydvl.value.result import ValuationResult
+from pydvl.value.shapley.montecarlo import permutation_montecarlo_classwise_shapley
+from pydvl.value.shapley.truncated import TruncationPolicy
+from pydvl.value.stopping import MaxChecks, StoppingCriterion
+
+logger = logging.getLogger(__name__)
+
+
+def compute_classwise_shapley_values(
+    u: Utility,
+    *,
+    done: StoppingCriterion,
+    truncation: TruncationPolicy,
+    normalize_values: bool = True,
+    n_resample_complement_sets: int = 1,
+    use_default_scorer_value: bool = True,
+    min_elements_per_label: int = 1,
+    n_jobs: int = 1,
+    config: ParallelConfig = ParallelConfig(),
+    progress: bool = False,
+) -> ValuationResult:
+    """
+    Computes the classwise Shapley value by parallel processing. Independent workers
+    are spawned to process the data in parallel. Once the data is aggregated, the values
+    can be optionally normalized, depending on ``normalize_values``.
+
+    :param u: Utility object containing model, data, and scoring function. The scoring
+        function has to be of type :class:`~pydvl.utils.score.ClasswiseScorer`.
+    :param done: Function that checks whether the computation needs to stop.
+    :param truncation: Callable function that decides whether to interrupt processing a
+        permutation and set subsequent marginals to zero.
+    :param normalize_values: Indicates whether to normalize the values by the variation
+        in each class times their in-class accuracy.
+    :param n_resample_complement_sets: Number of times to resample the complement set
+        for each permutation.
+    :param use_default_scorer_value: Use default scorer value even if additional_indices
+        is not None.
+    :param min_elements_per_label: The minimum number of elements for each opposite
+        label.
+    :param n_jobs: Number of parallel jobs to run.
+    :param config: Parallel configuration.
+    :param progress: Whether to display progress bars for each job.
+    :return: ValuationResult object containing computed data values.
+    """
+
+    _check_classwise_shapley_utility(u)
+
+    parallel_backend = init_parallel_backend(config)
+    u_ref = parallel_backend.put(u)
+    # This represents the number of jobs that are running
+    n_jobs = effective_n_jobs(n_jobs, config)
+    # This determines the total number of submitted jobs
+    # including the ones that are running
+    n_submitted_jobs = 2 * n_jobs
+
+    pbar = tqdm(disable=not progress, position=0, total=100, unit="%")
+    accumulated_result = ValuationResult.zeros(
+        algorithm="classwise_shapley",
+        indices=u.data.indices,
+        data_names=u.data.data_names,
+    )
+    terminate_exec = False
+    with init_executor(max_workers=n_jobs, config=config) as executor:
+        futures = set()
+        # Initial batch of computations
+        for _ in range(n_submitted_jobs):
+            future = executor.submit(
+                _classwise_shapley_one_step,
+                u_ref,
+                truncation=truncation,
+                n_resample_complement_sets=n_resample_complement_sets,
+                use_default_scorer_value=use_default_scorer_value,
+                min_elements_per_label=min_elements_per_label,
+            )
+            futures.add(future)
+        while futures:
+            # Wait for the next futures to complete.
+            completed_futures, futures = wait(
+                futures, timeout=60, return_when=FIRST_COMPLETED
+            )
+            for future in completed_futures:
+                accumulated_result += future.result()
+                if done(accumulated_result):
+                    terminate_exec = True
+                    break
+
+            pbar.n = 100 * done.completion()
+            pbar.refresh()
+            if terminate_exec:
+                break
+
+            # Submit more computations
+            # The goal is to always have `n_submitted_jobs`
+            # futures pending (running or queued)
+            for _ in range(n_submitted_jobs - len(futures)):
+                future = executor.submit(
+                    _classwise_shapley_one_step,
+                    u_ref,
+                    truncation=truncation,
+                    n_resample_complement_sets=n_resample_complement_sets,
+                    use_default_scorer_value=use_default_scorer_value,
+                    min_elements_per_label=min_elements_per_label,
+                )
+                futures.add(future)
+
+    result = accumulated_result
+    if normalize_values:
+        result = _normalize_classwise_shapley_values(result, u)
+
+    return result
+
+
+def _classwise_shapley_one_step(
+    u: Utility,
+    *,
+    truncation: TruncationPolicy,
+    n_resample_complement_sets: int = 1,
+    use_default_scorer_value: bool = True,
+    min_elements_per_label: int = 1,
+) -> ValuationResult:
+    """Computes classwise Shapley value using truncated Monte Carlo permutation
+    sampling for the subsets.
+
+    :param u: Utility object containing model, data, and scoring function. The scoring
+        function should be of type :class:`~pydvl.utils.score.ClasswiseScorer`.
+    :param truncation: Callable function that decides whether to interrupt processing a
+        permutation and set subsequent marginals to zero.
+    :param n_resample_complement_sets: Number of times to resample the complement set
+        for each permutation.
+    :param use_default_scorer_value: Use default scorer value even if additional_indices
+        is not None.
+    :param min_elements_per_label: The minimum number of elements for each opposite
+        label.
+    :return: ValuationResult object containing computed data values.
+    """
+    result = ValuationResult.zeros(
+        algorithm="classwise_shapley",
+        indices=u.data.indices,
+        data_names=u.data.data_names,
+    )
+    x_train, y_train = u.data.get_training_data(u.data.indices)
+    unique_labels = np.unique(y_train)
+    scorer = cast(ClasswiseScorer, copy(u.scorer))  # shallow copy; label set per class
+    u.scorer = scorer
+
+    for label in unique_labels:
+        u.scorer.label = label
+        result += permutation_montecarlo_classwise_shapley(
+            u,
+            label,
+            done=MaxChecks(n_resample_complement_sets - 1),
+            truncation=truncation,
+            use_default_scorer_value=use_default_scorer_value,
+            min_elements_per_label=min_elements_per_label,
+        )
+
+    return result
+
+
+def _check_classwise_shapley_utility(u: Utility):
+    """
+    Verifies if the provided utility object supports classwise Shapley values.
+
+    :param u: Utility object containing model, data, and scoring function. The scoring
+        function has to be of type :class:`~pydvl.utils.score.ClasswiseScorer`.
+    :raises: ValueError: If the labels in ``u.data`` are not 1-dimensional integers.
+    :raises: ValueError: If ``u.scorer`` is not an instance of
+        :class:`~pydvl.utils.score.ClasswiseScorer`
+    """
+
+    dim_correct = u.data.y_train.ndim == 1 and u.data.y_test.ndim == 1
+    is_integral = all(
+        map(
+            lambda v: isinstance(v, numbers.Integral), (*u.data.y_train, *u.data.y_test)
+        )
+    )
+    if not dim_correct or not is_integral:
+        raise ValueError(
+            "The supplied dataset has to be a 1-dimensional classification dataset."
+        )
+
+    if not isinstance(u.scorer, ClasswiseScorer):
+        raise ValueError(
+            "Please set a subclass of ClassWiseScorer object as scorer object of the"
+            " utility. See scoring argument of Utility."
+        )
+
+
+def _normalize_classwise_shapley_values(
+    result: ValuationResult,
+    u: Utility,
+) -> ValuationResult:
+    """
+    Normalize a valuation result specific to classwise Shapley.
+
+    Each value corresponds to a class c and gets normalized by multiplying
+    `in-class-score / sigma`. In this context `sigma` is the sum of all values
+    belonging to the currently viewed class. See :footcite:t:`schoch_csshapley_2022` for
+    more details.
+
+    :param result: ValuationResult object to be normalized.
+    :param u: Utility object containing model, data, and scoring function. The scoring
+        function should be of type :class:`~pydvl.utils.score.ClasswiseScorer`.
+    :return: The passed ``result``, whose values and variances were scaled in place.
+    """
+    y_train = u.data.y_train
+    unique_labels = np.unique(np.concatenate((y_train, u.data.y_test)))
+    scorer = cast(ClasswiseScorer, u.scorer)
+
+    # The full training set is the same for every label, so fit only once.
+    u.model.fit(u.data.x_train, u.data.y_train)
+
+    for label in unique_labels:
+        scorer.label = label
+        indices_label_set = u.data.indices[np.where(y_train == label)[0]]
+        in_cls_acc, _ = scorer.estimate_in_cls_and_out_of_cls_score(
+            u.model, u.data.x_test, u.data.y_test
+        )
+
+        sigma = np.sum(result.values[indices_label_set])
+        # A class whose values sum to zero cannot be rescaled; leave it as is.
+        if sigma != 0:
+            result.scale(in_cls_acc / sigma, indices=indices_label_set)
+
+    return result
diff --git a/src/pydvl/value/shapley/montecarlo.py b/src/pydvl/value/shapley/montecarlo.py
index ad43edad1..7eef96032 100644
--- a/src/pydvl/value/shapley/montecarlo.py
+++ b/src/pydvl/value/shapley/montecarlo.py
@@ -35,12 +35,14 @@
 import operator
 from functools import reduce
 from itertools import cycle, takewhile
-from typing import Sequence
+from typing import Optional, Sequence, Tuple
 
 import numpy as np
+from numpy._typing import NDArray
 from numpy.typing import NDArray
 from tqdm import tqdm
 
+from pydvl.utils import Utility, random_powerset_group_conditional
 from pydvl.utils.config import ParallelConfig
 from pydvl.utils.numeric import random_powerset
 from pydvl.utils.parallel import MapReduceJob
@@ -51,7 +53,11 @@
 
 logger = logging.getLogger(__name__)
 
-__all__ = ["permutation_montecarlo_shapley", "combinatorial_montecarlo_shapley"]
+__all__ = [
+    "permutation_montecarlo_shapley",
+    "permutation_montecarlo_classwise_shapley",
+    "combinatorial_montecarlo_shapley",
+]
 
 
 def _permutation_montecarlo_shapley(
@@ -87,20 +93,11 @@ def _permutation_montecarlo_shapley(
     while not done(result):
         pbar.n = 100 * done.completion()
         pbar.refresh()
-        prev_score = 0.0
         permutation = np.random.permutation(u.data.indices)
-        permutation_done = False
-        truncation.reset()
-        for i, idx in enumerate(permutation):
-            if permutation_done:
-                score = prev_score
-            else:
-                score = u(permutation[: i + 1])
-            marginal = score - prev_score
-            result.update(idx, marginal)
-            prev_score = score
-            if not permutation_done and truncation(i, score):
-                permutation_done = True
+        result += _permutation_montecarlo_shapley_rollout(
+            u, permutation, truncation=truncation, algorithm_name=algorithm_name
+        )
+
     return result
 
 
@@ -152,6 +149,146 @@ def permutation_montecarlo_shapley(
     return map_reduce_job()
 
 
+def permutation_montecarlo_classwise_shapley(
+    u: Utility,
+    label: int,
+    *,
+    done: StoppingCriterion,
+    truncation: TruncationPolicy,
+    use_default_scorer_value: bool = True,
+    min_elements_per_label: int = 1,
+) -> ValuationResult:
+    """
+    Computes truncated Monte Carlo class-wise Shapley values for one label: a single
+    random permutation of the samples with ``label`` is rolled out against randomly
+    sampled subsets of the complement until ``done`` signals to stop.
+
+    :param u: Utility object containing model, data, and scoring function. The scoring
+        function should be of type :class:`~pydvl.utils.score.ClasswiseScorer`.
+    :param label: The label whose samples are valued; the complement set is made of
+        the samples with any other label.
+    :param done: Function checking whether computation needs to stop.
+    :param truncation: Callable which decides whether to interrupt processing a
+        permutation and set all subsequent marginals to zero.
+    :param use_default_scorer_value: Use default scorer value even if additional_indices
+        is not None.
+    :param min_elements_per_label: The minimum number of elements for each opposite
+        label.
+    :return: ValuationResult object containing computed data values.
+    """
+
+    algorithm_name = "classwise_shapley"
+    result = ValuationResult.zeros(
+        algorithm=algorithm_name,
+        indices=u.data.indices,
+        data_names=u.data.data_names,
+    )
+
+    _, y_train = u.data.get_training_data(u.data.indices)
+    class_indices_set, class_complement_indices_set = split_indices_by_label(
+        u.data.indices,
+        y_train,
+        label,
+    )
+    _, complement_y_train = u.data.get_training_data(class_complement_indices_set)
+    indices_permutation = np.random.permutation(class_indices_set)
+
+    for subset_complement in random_powerset_group_conditional(
+        class_complement_indices_set,
+        complement_y_train,
+        min_elements_per_group=min_elements_per_label,
+    ):
+        result += _permutation_montecarlo_shapley_rollout(
+            u,
+            indices_permutation,
+            additional_indices=subset_complement,
+            truncation=truncation,
+            algorithm_name=algorithm_name,
+            use_default_scorer_value=use_default_scorer_value,
+        )
+        if done(result):
+            break
+
+    return result
+
+
+def _permutation_montecarlo_shapley_rollout(
+    u: Utility,
+    permutation: NDArray[np.int_],
+    *,
+    truncation: TruncationPolicy,
+    algorithm_name: str,
+    additional_indices: Optional[NDArray[np.int_]] = None,
+    use_default_scorer_value: bool = True,
+) -> ValuationResult:
+    """
+    Rolls out one permutation of a truncated, permutation-based MC estimator of Shapley
+    values: prefixes of ``permutation`` are scored along with ``additional_indices``.
+
+    :param u: Utility object containing model, data, and scoring function. The scoring
+        function should be of type :class:`~pydvl.utils.score.ClasswiseScorer`.
+    :param permutation: Permutation of indices to be considered.
+    :param truncation: Callable which decides whether to interrupt processing a
+        permutation and set all subsequent marginals to zero.
+    :param algorithm_name: Name of the algorithm to store in the returned result.
+    :param additional_indices: Set of additional indices for data points which should be
+        always considered.
+    :param use_default_scorer_value: Use default scorer value even if additional_indices
+        is not None.
+    :return: ValuationResult object containing computed data values.
+    :raises ValueError: If ``permutation`` and ``additional_indices`` overlap.
+    """
+    if (
+        additional_indices is not None
+        and len(np.intersect1d(permutation, additional_indices)) > 0
+    ):
+        raise ValueError(
+            "The class label set and the complement set have to be disjoint."
+        )
+
+    result = ValuationResult.zeros(
+        algorithm=algorithm_name,
+        indices=u.data.indices,
+        data_names=u.data.data_names,
+    )
+
+    if use_default_scorer_value or additional_indices is None:
+        prev_score = u.default_score
+    elif len(additional_indices) == 0:
+        prev_score = u.default_score
+    else:
+        prev_score = u(additional_indices)
+
+    # HACK: narrow the data to the evaluated indices so that policies which recompute
+    # the total utility in reset() use this subset; the indices are always restored.
+    if additional_indices is not None:
+        old_indices = u.data.indices
+        u.data.indices = np.sort(np.concatenate((permutation, additional_indices)))
+        try:
+            truncation.reset(u)
+        finally:
+            u.data.indices = old_indices
+    else:
+        truncation.reset(u)
+
+    is_terminated = False
+    for i, idx in enumerate(permutation):
+        if is_terminated or (is_terminated := truncation(i, prev_score)):
+            score = prev_score
+        else:
+            score = u(
+                np.concatenate((permutation[: i + 1], additional_indices))
+                if additional_indices is not None and len(additional_indices) > 0
+                else permutation[: i + 1]
+            )
+
+        marginal = score - prev_score
+        result.update(idx, marginal)
+        prev_score = score
+
+    return result
+
+
 def _combinatorial_montecarlo_shapley(
     indices: Sequence[int],
     u: Utility,
@@ -246,3 +383,23 @@ def combinatorial_montecarlo_shapley(
         config=config,
     )
     return map_reduce_job()
+
+
+def split_indices_by_label(
+    indices: NDArray[np.int_], labels: NDArray[np.int_], label: int
+) -> Tuple[NDArray[np.int_], NDArray[np.int_]]:
+    """
+    Partitions ``indices`` according to whether the matching entry of ``labels``
+    equals ``label``.
+
+    :param indices: The indices to be used for referring to the data.
+    :param labels: Corresponding labels for the indices.
+    :param label: Label to be used for splitting.
+    :return: Tuple ``(with_label, without_label)`` of index arrays.
+    """
+    mask = labels == label
+    # np.where yields positions into ``labels``; map them back to dataset indices.
+    in_class_positions = np.where(mask)[0]
+    out_of_class_positions = np.where(~mask)[0]
+
+    return indices[in_class_positions], indices[out_of_class_positions]
diff --git a/src/pydvl/value/shapley/truncated.py b/src/pydvl/value/shapley/truncated.py
index 23b871699..2945c95bf 100644
--- a/src/pydvl/value/shapley/truncated.py
+++ b/src/pydvl/value/shapley/truncated.py
@@ -1,6 +1,7 @@
 import abc
 import logging
 from concurrent.futures import FIRST_COMPLETED, wait
+from typing import Optional
 
 import numpy as np
 from deprecate import deprecated
@@ -48,7 +49,7 @@ def _check(self, idx: int, score: float) -> bool:
         ...
 
     @abc.abstractmethod
-    def reset(self):
+    def reset(self, u: Optional[Utility] = None):
         """Reset the policy to a state ready for a new permutation."""
         ...
 
@@ -71,7 +72,7 @@ class NoTruncation(TruncationPolicy):
     def _check(self, idx: int, score: float) -> bool:
         return False
 
-    def reset(self):
+    def reset(self, u: Optional[Utility] = None):
         pass
 
 
@@ -94,7 +95,7 @@ def _check(self, idx: int, score: float) -> bool:
         self.count += 1
         return self.count >= self.max_marginals
 
-    def reset(self):
+    def reset(self, u: Optional[Utility] = None):
         self.count = 0
 
 
@@ -111,14 +112,18 @@ class RelativeTruncation(TruncationPolicy):
     def __init__(self, u: Utility, rtol: float):
         super().__init__()
         self.rtol = rtol
-        logger.info("Computing total utility for permutation truncation.")
-        self.total_utility = u(u.data.indices)
+        self.total_utility = self.reset(u)
+        self._u = u
 
     def _check(self, idx: int, score: float) -> bool:
         return np.allclose(score, self.total_utility, rtol=self.rtol)
 
-    def reset(self):
-        pass
+    def reset(self, u: Optional[Utility] = None) -> float:
+        if u is None:
+            u = self._u
+
+        self.total_utility = u(u.data.indices)
+        return self.total_utility
 
 
 class BootstrapTruncation(TruncationPolicy):
@@ -134,7 +139,6 @@ class BootstrapTruncation(TruncationPolicy):
     def __init__(self, u: Utility, n_samples: int, sigmas: float = 1):
         super().__init__()
         self.n_samples = n_samples
-        logger.info("Computing total utility for permutation truncation.")
         self.total_utility = u(u.data.indices)
         self.count: int = 0
         self.variance: float = 0
@@ -155,7 +159,7 @@ def _check(self, idx: int, score: float) -> bool:
             self.sigmas * np.sqrt(self.variance)
         )
 
-    def reset(self):
+    def reset(self, u: Optional[Utility] = None):
         self.count = 0
         self.variance = self.mean = 0
 
diff --git a/src/pydvl/value/stopping.py b/src/pydvl/value/stopping.py
index 09ba84475..b235d2067 100644
--- a/src/pydvl/value/stopping.py
+++ b/src/pydvl/value/stopping.py
@@ -279,13 +279,13 @@ class MaxChecks(StoppingCriterion):
 
     def __init__(self, n_checks: Optional[int], modify_result: bool = True):
         super().__init__(modify_result=modify_result)
-        if n_checks is not None and n_checks < 1:
-            raise ValueError("n_iterations must be at least 1 or None")
+        if n_checks is not None and n_checks < 0:
+            raise ValueError("n_iterations must be at least 0 or None")
         self.n_checks = n_checks
         self._count = 0
 
     def _check(self, result: ValuationResult) -> Status:
-        if self.n_checks:
+        if self.n_checks is not None:
             self._count += 1
             if self._count > self.n_checks:
                 self._converged = np.ones_like(result.values, dtype=bool)
@@ -293,7 +293,7 @@ def _check(self, result: ValuationResult) -> Status:
         return Status.Pending
 
     def completion(self) -> float:
-        if self.n_checks:
-            return min(1.0, self._count / self.n_checks)
+        if self.n_checks is not None:
+            return min(1.0, self._count / max(1, self.n_checks))  # n_checks may be 0
         return 0.0
 
@@ -476,7 +476,7 @@ def _check(self, r: ValuationResult) -> Status:
             quots = np.divide(diffs, curr[ii], out=diffs, where=curr[ii] != 0)
             # quots holds the quotients when the denominator is non-zero, and
             # the absolute difference, which is just the memory, otherwise.
-            if np.mean(quots) < self.rtol:
+            if len(quots) > 0 and np.mean(quots) < self.rtol:
                 self._converged = self.update_op(
                     self._converged, r.counts > self.n_steps
                 )  # type: ignore
diff --git a/tests/conftest.py b/tests/conftest.py
index 41244d275..d03779214 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -11,8 +11,9 @@
 from sklearn import datasets
 from sklearn.utils import Bunch
 
-from pydvl.utils import Dataset, MemcachedClientConfig
+from pydvl.utils import ClasswiseScorer, Dataset, MemcachedClientConfig, Utility
 from pydvl.utils.parallel.backend import available_cpus
+from tests.misc import ClosedFormLinearClassifier
 
 if TYPE_CHECKING:
     from _pytest.config import Config
@@ -411,3 +412,24 @@ def pytest_terminal_summary(
 ):
     tolerate_session = terminalreporter.config._tolerate_session
     tolerate_session.display(terminalreporter)
+
+
+@pytest.fixture(scope="function")
+def dataset_alt_seq_full() -> Dataset:
+    """Four 1-d samples x=1..4; train and test labels differ only for x=3."""
+    features = np.arange(1, 5).reshape([-1, 1])
+    train_labels = np.array([0, 0, 1, 1])
+    test_labels = np.array([0, 0, 0, 1])
+    return Dataset(features, train_labels, features, test_labels)
+
+
+@pytest.fixture(scope="function")
+def linear_classifier_cs_scorer(dataset_alt_seq_full: Dataset) -> Utility:
+    """
+    Utility pairing the closed-form linear classifier with classwise accuracy.
+    """
+    model = ClosedFormLinearClassifier()
+    scorer = ClasswiseScorer("accuracy")
+    return Utility(
+        model, dataset_alt_seq_full, scorer, catch_errors=False
+    )
diff --git a/tests/misc.py b/tests/misc.py
new file mode 100644
index 000000000..2d12fb673
--- /dev/null
+++ b/tests/misc.py
@@ -0,0 +1,44 @@
+import numpy as np
+from numpy.typing import NDArray
+
+
+class ThresholdClassifier:
+    """Mock model that predicts class 1 for inputs above 0.5; it cannot be fitted."""
+
+    def fit(self, x: NDArray, y: NDArray) -> float:
+        raise NotImplementedError("Mock model")
+
+    def predict(self, x: NDArray) -> NDArray:
+        y = 0.5 < x
+        return y[:, 0].astype(int)
+
+    def score(self, x: NDArray, y: NDArray) -> float:
+        raise NotImplementedError("Mock model")
+
+
+class ClosedFormLinearClassifier:
+    """
+    One-dimensional least-squares classifier without intercept. The slope is obtained
+    in closed form and predictions are the rounded, clipped values of ``beta * x``.
+    """
+
+    def __init__(self):
+        self._beta = None
+
+    def fit(self, x: NDArray, y: NDArray) -> float:
+        v = x[:, 0]
+        self._beta = np.dot(v, y) / np.dot(v, v)
+        return -1
+
+    def predict(self, x: NDArray) -> NDArray:
+        if self._beta is None:
+            raise AttributeError("Model not fitted")
+
+        x = x[:, 0]
+        probs = self._beta * x
+        return np.clip(np.round(probs + 1e-10), 0, 1).astype(int)
+
+    def score(self, x: NDArray, y: NDArray) -> float:
+        # Accuracy over the samples actually passed in (was hard-coded to 4 samples).
+        pred_y = self.predict(x)
+        return float(np.mean(pred_y == y))
diff --git a/tests/utils/conftest.py b/tests/utils/conftest.py
index f64330777..5783aba8a 100644
--- a/tests/utils/conftest.py
+++ b/tests/utils/conftest.py
@@ -1,5 +1,9 @@
+from typing import Dict, Tuple
+
+import numpy as np
 import pytest
 import ray
+from numpy.typing import NDArray
 from ray.cluster_utils import Cluster
 
 from pydvl.utils.config import ParallelConfig
@@ -23,3 +27,21 @@ def parallel_config(request, num_workers):
         yield ParallelConfig(backend="ray", address=cluster.address)
         ray.shutdown()
         cluster.shutdown()
+
+
+@pytest.fixture(scope="function")
+def dataset_alt_seq_simple(
+    request,
+) -> Tuple[NDArray[np.float64], NDArray[np.int_], Dict[str, float]]:
+    """
+    The label set is represented as 0000011100011111, with adjustable left and right
+    margins. The left margin denotes the fraction of zeros at the beginning, while the
+    right margin denotes the fraction of ones at the end. Accuracy can be efficiently
+    calculated using a closed-form solution.
+    """
+    n_element, left_margin, right_margin = request.param
+    x = np.linspace(0, 1, n_element)
+    y = ((left_margin <= x) & (x < 0.5)) | ((1 - right_margin) <= x)
+    y = y.astype(int)
+    x = np.expand_dims(x, -1)
+    return x, y, {"left_margin": left_margin, "right_margin": right_margin}
diff --git a/tests/utils/test_numeric.py b/tests/utils/test_numeric.py
index e6101defb..5b3d2582e 100644
--- a/tests/utils/test_numeric.py
+++ b/tests/utils/test_numeric.py
@@ -5,6 +5,7 @@
     powerset,
     random_matrix_with_condition_number,
     random_powerset,
+    random_powerset_group_conditional,
     random_subset_of_size,
     running_moments,
 )
@@ -138,3 +139,29 @@ def test_running_moments():
         true_variances = [np.var(vv) for vv in values]
         assert np.allclose(means, true_means)
         assert np.allclose(variances, true_variances)
+
+
+@pytest.mark.parametrize("min_elements", [1, 2])
+@pytest.mark.parametrize("elements_per_group", [10])
+@pytest.mark.parametrize("num_groups", [3])
+@pytest.mark.parametrize("check_num_samples", [10])
+def test_random_powerset_group_conditional(
+    min_elements: int,
+    elements_per_group: int,
+    num_groups: int,
+    check_num_samples: int,
+):
+    """Every sampled subset must contain at least ``min_elements`` of each group."""
+    s = np.arange(num_groups * elements_per_group)
+    groups = np.arange(num_groups).repeat(elements_per_group)
+    all_groups = np.unique(groups)
+
+    sampler = random_powerset_group_conditional(s, groups, min_elements)
+    for idx, subset in enumerate(sampler):
+        assert np.all(np.isin(subset, s))
+        assert np.all(np.unique(groups[subset]) == all_groups)
+        for group in all_groups:
+            assert np.sum(group == groups[subset]) >= min_elements
+
+        if idx == check_num_samples:
+            break
diff --git a/tests/utils/test_score.py b/tests/utils/test_score.py
index 078775240..15bd91d1e 100644
--- a/tests/utils/test_score.py
+++ b/tests/utils/test_score.py
@@ -1,7 +1,19 @@
+from typing import Dict, Tuple, cast
+
 import numpy as np
+import pandas as pd
+import pytest
 from numpy.typing import NDArray
 
-from pydvl.utils.score import Scorer, compose_score, squashed_r2, squashed_variance
+from pydvl.utils import Utility, powerset
+from pydvl.utils.score import (
+    ClasswiseScorer,
+    Scorer,
+    compose_score,
+    squashed_r2,
+    squashed_variance,
+)
+from tests.misc import ThresholdClassifier
 
 sigmoid = lambda x: 1 / (1 + np.exp(-x))
 
@@ -69,3 +81,115 @@ def test_squashed_variance():
     X = np.array([[1, 2], [3, 4]])
     model = FittedLinearModel(coef)
     assert sigmoid(1.0) == squashed_variance(model, X, X @ coef)
+
+
+@pytest.mark.parametrize(
+    "dataset_alt_seq_simple",
+    [((101, 0.3, 0.4))],
+    indirect=True,
+)
+def test_cs_scorer_on_dataset_alt_seq_simple(dataset_alt_seq_simple):
+    """
+    Checks the in-class and out-of-class accuracies of the classwise scorer and that
+    its value equals ``in_cls_acc * exp(out_of_cls_acc)`` for both labels.
+    """
+    scorer = ClasswiseScorer("accuracy", initial_label=0)
+    assert str(scorer) == "classwise accuracy"
+    assert repr(scorer) == "ClasswiseAccuracy (scorer=make_scorer(accuracy_score))"
+
+    x, y, info = dataset_alt_seq_simple
+    n_element = len(x)
+    target_in_cls_acc_0 = (info["left_margin"] * 100 + 1) / n_element
+    target_out_of_cls_acc_0 = (info["right_margin"] * 100 + 1) / n_element
+
+    model = ThresholdClassifier()
+    in_cls_acc_0, out_of_cls_acc_0 = scorer.estimate_in_cls_and_out_of_cls_score(
+        model, x, y
+    )
+    assert np.isclose(in_cls_acc_0, target_in_cls_acc_0)
+    assert np.isclose(out_of_cls_acc_0, target_out_of_cls_acc_0)
+
+    scorer.label = 1  # in-class and out-of-class swap roles
+    in_cls_acc_1, out_of_cls_acc_1 = scorer.estimate_in_cls_and_out_of_cls_score(
+        model, x, y
+    )
+    assert in_cls_acc_1 == out_of_cls_acc_0
+    assert in_cls_acc_0 == out_of_cls_acc_1
+
+    scorer.label = 0
+    value = scorer(model, x, y)
+    assert np.isclose(value, in_cls_acc_0 * np.exp(out_of_cls_acc_0))
+
+    scorer.label = 1
+    value = scorer(model, x, y)
+    assert np.isclose(value, in_cls_acc_1 * np.exp(out_of_cls_acc_1))
+
+
+def test_cs_scorer_on_alt_seq_cf_linear_classifier_cs_score(
+    linear_classifier_cs_scorer: Utility,
+):
+    """Compares fitted betas and per-class accuracies for all training subsets."""
+    subsets_zero = list(powerset(np.array((0, 1))))
+    subsets_one = list(powerset(np.array((2, 3))))
+    subsets_zero = [tuple(s) for s in subsets_zero]
+    subsets_one = [tuple(s) for s in subsets_one]
+    target_betas = pd.DataFrame(  # rows: label-0 subsets, columns: label-1 subsets
+        [
+            [np.nan, 1 / 3, 1 / 4, 7 / 25],
+            [0, 3 / 10, 4 / 17, 7 / 26],
+            [0, 3 / 13, 1 / 5, 7 / 29],
+            [0, 3 / 14, 4 / 21, 7 / 30],
+        ],
+        index=subsets_zero,
+        columns=subsets_one,
+    )
+    target_accuracies_zero = pd.DataFrame(
+        [
+            [0, 1 / 4, 1 / 4, 1 / 4],
+            [3 / 4, 1 / 4, 1 / 2, 1 / 4],
+            [3 / 4, 1 / 2, 1 / 2, 1 / 2],
+            [3 / 4, 1 / 2, 1 / 2, 1 / 2],
+        ],
+        index=subsets_zero,
+        columns=subsets_one,
+    )
+    target_accuracies_one = pd.DataFrame(
+        [
+            [0, 1 / 4, 1 / 4, 1 / 4],
+            [0, 1 / 4, 1 / 4, 1 / 4],
+            [0, 1 / 4, 1 / 4, 1 / 4],
+            [0, 1 / 4, 1 / 4, 1 / 4],
+        ],
+        index=subsets_zero,
+        columns=subsets_one,
+    )
+    model = linear_classifier_cs_scorer.model
+    scorer = cast(ClasswiseScorer, linear_classifier_cs_scorer.scorer)
+    scorer.label = 0
+
+    for set_zero_idx in range(len(subsets_zero)):
+        for set_one_idx in range(len(subsets_one)):
+            indices = list(subsets_zero[set_zero_idx] + subsets_one[set_one_idx])
+            (
+                x_train,
+                y_train,
+            ) = linear_classifier_cs_scorer.data.get_training_data(indices)
+            linear_classifier_cs_scorer.model.fit(x_train, y_train)
+            fitted_beta = linear_classifier_cs_scorer.model._beta  # noqa
+            target_beta = target_betas.iloc[set_zero_idx, set_one_idx]
+            assert (
+                np.isnan(fitted_beta)
+                if np.isnan(target_beta)
+                else fitted_beta == target_beta
+            )
+            (
+                x_test,
+                y_test,
+            ) = linear_classifier_cs_scorer.data.get_test_data()
+            in_cls_acc_0, in_cls_acc_1 = scorer.estimate_in_cls_and_out_of_cls_score(
+                model, x_test, y_test
+            )
+            assert (
+                in_cls_acc_0 == target_accuracies_zero.iloc[set_zero_idx, set_one_idx]
+            )
+            assert in_cls_acc_1 == target_accuracies_one.iloc[set_zero_idx, set_one_idx]
diff --git a/tests/value/shapley/test_classwise.py b/tests/value/shapley/test_classwise.py
new file mode 100644
index 000000000..1d263a7d3
--- /dev/null
+++ b/tests/value/shapley/test_classwise.py
@@ -0,0 +1,786 @@
+"""
+Test cases for the class wise shapley value.
+"""
+import random
+from random import seed
+from typing import Dict, Tuple
+
+import numpy as np
+import pytest
+
+from pydvl.utils import Utility
+from pydvl.value import MaxChecks, ValuationResult
+from pydvl.value.shapley.classwise import compute_classwise_shapley_values
+from pydvl.value.shapley.truncated import NoTruncation
+from tests.value import check_values
+
+
+@pytest.fixture(scope="function")
+def linear_classifier_cs_scorer_args_exact_solution_use_default_score() -> Tuple[
+    Dict, ValuationResult, Dict
+]:
+    r"""
+    Returns the exact solution for the class wise shapley value of the training and
+    validation set of the `linear_classifier_cs_scorer` fixture.
+
+    ============================
+    CS-Shapley Manual Derivation
+    ============================
+
+    :Author: Markus Semmler
+    :Date:   August 2023
+
+    Dataset description
+    ===================
+
+    We have a training and a test dataset. We want to model a simple XOR dataset. The
+    development set :math:`D` is given by
+
+    .. math::
+        \begin{aligned}
+            \hat{x}_0 &= 1 \quad &\hat{y}_0 = 0 \\
+            \hat{x}_1 &= 2 \quad &\hat{y}_1 = 0 \\
+            \hat{x}_2 &= 3 \quad &\hat{y}_2 = 0 \\
+            \hat{x}_3 &= 4 \quad &\hat{y}_3 = 1 \\
+        \end{aligned}
+
+    and the training set :math:`T` is given by
+
+    .. math::
+        \begin{aligned}
+            x_0 &= 1 \quad &y_0 = 0 \\
+            x_1 &= 2 \quad &y_1 = 0 \\
+            x_2 &= 3 \quad &y_2 = 1 \\
+            x_3 &= 4 \quad &y_3 = 1 \\
+        \end{aligned}
+
+    Note that the training set and the development set contain the same
+    inputs x, but differ in the label :math:`\hat{y}_2 \neq y_2`
+
+    Model
+    =====
+
+    We use an adapted version of linear regression
+
+    .. math:: y = \max(0, \min(1, \text{round}(\beta^T x)))
+
+    for classification, with the closed form solution
+
+    .. math:: \beta = \frac{\text{dot}(x, y)}{\text{dot}(x, x)}
+
+    Fitted model
+    ============
+
+    The hyperparameters for all combinations are
+
+    .. container:: tabular
+
+       | \|c||Sc \| Sc \| Sc \| Sc \| :math:`S_1 \cup S_2` &
+         :math:`\emptyset` & :math:`\{x_2\}` & :math:`\{x_3\}` &
+         :math:`\{x_2, x_3\}`
+       | :math:`\emptyset` & nan & :math:`\frac{1}{3}` & :math:`\frac{1}{4}`
+         & :math:`\frac{7}{25}`
+       | :math:`\{x_0\}` & :math:`0` & :math:`\frac{3}{10}` &
+         :math:`\frac{4}{17}` & :math:`\frac{7}{26}`
+       | :math:`\{x_1\}` & :math:`0` & :math:`\frac{3}{13}` &
+         :math:`\frac{1}{5}` &\ :math:`\frac{7}{29}`
+       | :math:`\{x_0, x_1 \}` & :math:`0` & :math:`\frac{3}{14}` &
+         :math:`\frac{4}{21}` & :math:`\frac{7}{30}`
+
+    Accuracy tables on development set :math:`D`
+    ============================================
+
+    (*) Note that the algorithm described in the paper overwrites these
+    values with 0.
+
+    .. container:: tabular
+
+       | \|c||Sc \| Sc \| Sc \| Sc \| :math:`S_1 \cup S_2` &
+         :math:`\emptyset` & :math:`\{x_2\}` & :math:`\{x_3\}` &
+         :math:`\{x_2, x_3\}`
+       | :math:`\emptyset` & :math:`0` & :math:`\frac{1}{4}` &
+         :math:`\frac{1}{4}` & :math:`\frac{1}{4}`
+       | :math:`\{x_0\}` & :math:`\frac{3}{4}` & :math:`\frac{1}{4}` &
+         :math:`\frac{1}{2}` & :math:`\frac{1}{4}`
+       | :math:`\{x_1\}` & :math:`\frac{3}{4}` & :math:`\frac{1}{2}` &
+         :math:`\frac{1}{2}` &\ :math:`\frac{1}{2}`
+       | :math:`\{x_0, x_1 \}` & :math:`\frac{3}{4}` & :math:`\frac{1}{2}` &
+         :math:`\frac{1}{2}` & :math:`\frac{1}{2}`
+
+    .. container:: tabular
+
+       | \|c||Sc \| Sc \| Sc \| Sc \| :math:`S_1 \cup S_2` &
+         :math:`\emptyset` & :math:`\{x_2\}` & :math:`\{x_3\}` &
+         :math:`\{x_2, x_3\}`
+       | :math:`\emptyset` & :math:`0` & :math:`\frac{1}{4}` &
+         :math:`\frac{1}{4}` & :math:`\frac{1}{4}`
+       | :math:`\{x_0\}` & :math:`0` & :math:`\frac{1}{4}` &
+         :math:`\frac{1}{4}` & :math:`\frac{1}{4}`
+       | :math:`\{x_1\}` & :math:`0` & :math:`\frac{1}{4}` &
+         :math:`\frac{1}{4}` &\ :math:`\frac{1}{4}`
+       | :math:`\{x_0, x_1 \}` & :math:`0` & :math:`\frac{1}{4}` &
+         :math:`\frac{1}{4}` & :math:`\frac{1}{4}`
+
+    CS-Shapley
+    ==========
+
+    The formulas of the algorithm are given by
+
+    .. math::
+
+        \begin{aligned}
+            \delta(\pi, S_{-y_i}, i) &= v_{y_i}(\pi_{:i} \cup \{ i \} | S_{-y_i})
+                - v_{y_i}(\pi_{:i} | S_{-y_i}) \\
+            \left [ \phi_i | S_{-y_i} \right ] &= \frac{1}{|T_{y_i}|!}
+                \sum_{\pi \in \Pi(T_{y_i})} \delta(\pi, S_{-y_i}, i) \\
+            \phi_i &= \frac{1}{2^{|T_{-y_i}|}-1} \left [\sum_{\emptyset \subset S_{-y_i}
+                \subseteq T_{-y_i}} \left [ \phi_i | S_{-y_i} \right ] \right ]
+        \end{aligned}
+
+    Valuation of :math:`x_0`
+    ========================
+
+    .. math::
+        \begin{aligned}
+            \delta((x_0, x_1), \{ x_2 \}, 0) &= \frac{1}{4} e^\frac{1}{4} &\quad
+                \delta((x_1, x_0), \{ x_2 \}, 0) &= 0 \\
+            \delta((x_0, x_1), \{ x_3 \}, 0) &= \frac{1}{2} e^\frac{1}{4} &\quad
+                \delta((x_1, x_0), \{ x_3 \}, 0) &= 0 \\
+            \delta((x_0, x_1), \{ x_2, x_3 \}, 0) &= \frac{1}{4} e^\frac{1}{4} &\quad
+                \delta((x_1, x_0), \{ x_2, x_3 \}, 0) &= 0
+        \end{aligned}
+
+    .. math::
+        \begin{aligned}
+            \left [ \phi_0 | \{ x_2 \} \right] &= \frac{1}{8} e^\frac{1}{4} \\
+            \left [ \phi_0 | \{ x_3 \} \right] &= \frac{1}{4} e^\frac{1}{4} \\
+            \left [ \phi_0 | \{ x_2, x_3 \} \right] &= \frac{1}{8} e^\frac{1}{4}
+        \end{aligned}
+
+    .. math:: \phi_0 = \frac{1}{6} e^\frac{1}{4} \approx 0.214
+
+    Valuation of :math:`x_1`
+    ========================
+
+    .. math::
+        \begin{aligned}
+            \delta((x_0, x_1), \{ x_2 \}, 1) &= \frac{1}{4} e^\frac{1}{4} &\quad
+                \delta((x_1, x_0), \{ x_2 \}, 1) &= \frac{1}{2} e^\frac{1}{4} \\
+            \delta((x_0, x_1), \{ x_3 \}, 1) &= 0 &\quad
+                \delta((x_1, x_0), \{ x_3 \}, 1) &= \frac{1}{2} e^\frac{1}{4} \\
+            \delta((x_0, x_1), \{ x_2, x_3 \}, 1) &= \frac{1}{4} e^\frac{1}{4} &\quad
+                \delta((x_1, x_0), \{ x_2, x_3 \}, 1) &= \frac{1}{2} e^\frac{1}{4}
+        \end{aligned}
+
+    .. math::
+        \begin{aligned}
+            \left [ \phi_1 | \{ x_2 \} \right] &= \frac{3}{8} e^\frac{1}{4} \\
+            \left [ \phi_1 | \{ x_3 \} \right] &= \frac{1}{4} e^\frac{1}{4} \\
+            \left [ \phi_1 | \{ x_2, x_3 \} \right] &= \frac{3}{8} e^\frac{1}{4}
+        \end{aligned}
+
+    .. math:: \phi_1 = \frac{1}{3} e^\frac{1}{4} \approx 0.428
+
+    Valuation of :math:`x_2`
+    ========================
+
+    .. math::
+        \begin{aligned}
+            \delta((x_2, x_3), \{ x_0 \}, 2) &= \frac{1}{4} e^\frac{1}{4} &\quad
+                \delta((x_3, x_2), \{ x_0 \}, 2)
+                &= \frac{1}{4} e^\frac{1}{4} - \frac{1}{4} e^\frac{1}{2} \\
+            \delta((x_2, x_3), \{ x_1 \}, 2) &= \frac{1}{4} e^\frac{1}{2} &\quad
+                \delta((x_3, x_2), \{ x_1 \}, 2) &= 0 \\
+            \delta((x_2, x_3), \{ x_0, x_1 \}, 2) &= \frac{1}{4} e^\frac{1}{2} &\quad
+                \delta((x_3, x_2), \{ x_0, x_1 \}, 2) &= 0
+        \end{aligned}
+
+    .. math::
+        \begin{aligned}
+            \left [ \phi_2 | \{ x_0 \} \right]
+                &= \frac{1}{4} e^\frac{1}{4} - \frac{1}{8} e^\frac{1}{2} \\
+            \left [ \phi_2 | \{ x_1 \} \right] &= \frac{1}{8} e^\frac{1}{2} \\
+            \left [ \phi_2 | \{ x_0, x_1 \} \right] &= \frac{1}{8} e^\frac{1}{2}
+        \end{aligned}
+
+    .. math:: \phi_2 = \frac{1}{12} e^\frac{1}{4} + \frac{1}{24} e^\frac{1}{2} \approx 0.1757
+
+    Valuation of :math:`x_3`
+    ========================
+
+    .. math::
+        \begin{aligned}
+            \delta((x_2, x_3), \{ x_0 \}, 3) &= 0 &\quad
+                \delta((x_3, x_2), \{ x_0 \}, 3) &= \frac{1}{4} e^\frac{1}{2} \\
+            \delta((x_2, x_3), \{ x_1 \}, 3) &= 0 &\quad
+                \delta((x_3, x_2), \{ x_1 \}, 3) &= \frac{1}{4} e^\frac{1}{2} \\
+            \delta((x_2, x_3), \{ x_0, x_1 \}, 3) &= 0 &\quad
+                \delta((x_3, x_2), \{ x_0, x_1 \}, 3) &= \frac{1}{4} e^\frac{1}{2}
+        \end{aligned}
+
+    .. math::
+        \begin{aligned}
+            \left [ \phi_3 | \{ x_0 \} \right] &= \frac{1}{8} e^\frac{1}{2} \\
+            \left [ \phi_3 | \{ x_1 \} \right] &= \frac{1}{8} e^\frac{1}{2} \\
+            \left [ \phi_3 | \{ x_0, x_1 \} \right] &= \frac{1}{8} e^\frac{1}{2}
+        \end{aligned}
+
+    .. math:: \phi_3 = \frac{1}{8} e^\frac{1}{2} \approx 0.2061
+    """
+    return (
+        {
+            "normalize_values": False,
+        },
+        ValuationResult(
+            values=np.array(
+                [
+                    1 / 6 * np.exp(1 / 4),
+                    1 / 3 * np.exp(1 / 4),
+                    1 / 12 * np.exp(1 / 4) + 1 / 24 * np.exp(1 / 2),
+                    1 / 8 * np.exp(1 / 2),
+                ]
+            )
+        ),
+        {"atol": 0.05},
+    )
+
+
+@pytest.fixture(scope="function")
+def linear_classifier_cs_scorer_args_exact_solution_use_default_score_norm(
+    linear_classifier_cs_scorer_args_exact_solution_use_default_score: Tuple[
+        Dict, ValuationResult, Dict
+    ]
+) -> Tuple[Dict, ValuationResult, Dict]:
+    """
+    Normalized counterpart of
+    :func:`linear_classifier_cs_scorer_args_exact_solution_use_default_score`. The
+    values of label c are rescaled by the in-class score of label c divided by the sum
+    of the values of that label.
+    """
+    values = linear_classifier_cs_scorer_args_exact_solution_use_default_score[1].values
+    exp_quarter, exp_half = np.exp(1 / 4), np.exp(1 / 2)
+    label_zero_coefficient = 1 / exp_quarter
+    label_one_coefficient = 1 / (1 / 3 * exp_quarter + 2 / 3 * exp_half)
+
+    normalized_values = np.array(
+        [
+            values[0] * label_zero_coefficient,
+            values[1] * label_zero_coefficient,
+            values[2] * label_one_coefficient,
+            values[3] * label_one_coefficient,
+        ]
+    )
+
+    return (
+        {"normalize_values": True},
+        ValuationResult(values=normalized_values),
+        {"atol": 0.05},
+    )
+
+
+@pytest.fixture(scope="function")
+def linear_classifier_cs_scorer_args_exact_solution_use_add_idx() -> Tuple[
+    Dict, ValuationResult, Dict
+]:
+    r"""
+    Returns the exact solution for the class wise shapley value of the training and
+    validation set of the `linear_classifier_cs_scorer` fixture.
+
+    ============================
+    CS-Shapley Manual Derivation
+    ============================
+
+    :Author: Markus Semmler
+    :Date:   August 2023
+
+    Dataset description
+    ===================
+
+    We have a training and a test dataset. We want to model a simple XOR dataset. The
+    development set :math:`D` is given by
+
+    .. math::
+        \begin{aligned}
+            \hat{x}_0 &= 1 \quad &\hat{y}_0 = 0 \\
+            \hat{x}_1 &= 2 \quad &\hat{y}_1 = 0 \\
+            \hat{x}_2 &= 3 \quad &\hat{y}_2 = 0 \\
+            \hat{x}_3 &= 4 \quad &\hat{y}_3 = 1 \\
+        \end{aligned}
+
+    and the training set :math:`T` is given by
+
+    .. math::
+        \begin{aligned}
+            x_0 &= 1 \quad &y_0 = 0 \\
+            x_1 &= 2 \quad &y_1 = 0 \\
+            x_2 &= 3 \quad &y_2 = 1 \\
+            x_3 &= 4 \quad &y_3 = 1 \\
+        \end{aligned}
+
+    Note that the training set and the development set contain the same
+    inputs x, but differ in the label :math:`\hat{y}_2 \neq y_2`
+
+    Model
+    =====
+
+    We use an adapted version of linear regression
+
+    .. math:: y = \max(0, \min(1, \text{round}(\beta^T x)))
+
+    for classification, with the closed form solution
+
+    .. math:: \beta = \frac{\text{dot}(x, y)}{\text{dot}(x, x)}
+
+    Fitted model
+    ============
+
+    The hyperparameters for all combinations are
+
+    .. container:: tabular
+
+       | \|c||Sc \| Sc \| Sc \| Sc \| :math:`S_1 \cup S_2` &
+         :math:`\emptyset` & :math:`\{x_2\}` & :math:`\{x_3\}` &
+         :math:`\{x_2, x_3\}`
+       | :math:`\emptyset` & nan & :math:`\frac{1}{3}` & :math:`\frac{1}{4}`
+         & :math:`\frac{7}{25}`
+       | :math:`\{x_0\}` & :math:`0` & :math:`\frac{3}{10}` &
+         :math:`\frac{4}{17}` & :math:`\frac{7}{26}`
+       | :math:`\{x_1\}` & :math:`0` & :math:`\frac{3}{13}` &
+         :math:`\frac{1}{5}` &\ :math:`\frac{7}{29}`
+       | :math:`\{x_0, x_1 \}` & :math:`0` & :math:`\frac{3}{14}` &
+         :math:`\frac{4}{21}` & :math:`\frac{7}{30}`
+
+    Accuracy tables on development set :math:`D`
+    ============================================
+
+    .. container:: tabular
+
+       | \|c||Sc \| Sc \| Sc \| Sc \| :math:`S_1 \cup S_2` &
+         :math:`\emptyset` & :math:`\{x_2\}` & :math:`\{x_3\}` &
+         :math:`\{x_2, x_3\}`
+       | :math:`\emptyset` & :math:`0` & :math:`\frac{1}{4}` &
+         :math:`\frac{1}{4}` & :math:`\frac{1}{4}`
+       | :math:`\{x_0\}` & :math:`\frac{3}{4}` & :math:`\frac{1}{4}` &
+         :math:`\frac{1}{2}` & :math:`\frac{1}{4}`
+       | :math:`\{x_1\}` & :math:`\frac{3}{4}` & :math:`\frac{1}{2}` &
+         :math:`\frac{1}{2}` &\ :math:`\frac{1}{2}`
+       | :math:`\{x_0, x_1 \}` & :math:`\frac{3}{4}` & :math:`\frac{1}{2}` &
+         :math:`\frac{1}{2}` & :math:`\frac{1}{2}`
+
+    .. container:: tabular
+
+       | \|c||Sc \| Sc \| Sc \| Sc \| :math:`S_1 \cup S_2` &
+         :math:`\emptyset` & :math:`\{x_2\}` & :math:`\{x_3\}` &
+         :math:`\{x_2, x_3\}`
+       | :math:`\emptyset` & :math:`0` & :math:`\frac{1}{4}` &
+         :math:`\frac{1}{4}` & :math:`\frac{1}{4}`
+       | :math:`\{x_0\}` & :math:`0` & :math:`\frac{1}{4}` &
+         :math:`\frac{1}{4}` & :math:`\frac{1}{4}`
+       | :math:`\{x_1\}` & :math:`0` & :math:`\frac{1}{4}` &
+         :math:`\frac{1}{4}` &\ :math:`\frac{1}{4}`
+       | :math:`\{x_0, x_1 \}` & :math:`0` & :math:`\frac{1}{4}` &
+         :math:`\frac{1}{4}` & :math:`\frac{1}{4}`
+
+    CS-Shapley
+    ==========
+
+    The formulas of the algorithm are given by
+
+    .. math::
+
+        \begin{aligned}
+            \delta(\pi, S_{-y_i}, i) &= v_{y_i}(\pi_{:i} \cup \{ i \} | S_{-y_i})
+                - v_{y_i}(\pi_{:i} | S_{-y_i}) \\
+            \left [ \phi_i | S_{-y_i} \right ] &= \frac{1}{|T_{y_i}|!}
+                \sum_{\pi \in \Pi(T_{y_i})} \delta(\pi, S_{-y_i}, i) \\
+            \phi_i &= \frac{1}{2^{|T_{-y_i}|}-1} \left [\sum_{\emptyset \subset S_{-y_i}
+                \subseteq T_{-y_i}} \left [ \phi_i | S_{-y_i} \right ] \right ]
+        \end{aligned}
+
+    Valuation of :math:`x_0`
+    ========================
+
+    .. math::
+        \begin{aligned}
+            \delta((x_0, x_1), \{ x_2 \}, 0) &= 0 &\quad
+                \delta((x_1, x_0), \{ x_2 \}, 0) &= 0 \\
+            \delta((x_0, x_1), \{ x_3 \}, 0) &= \frac{1}{4} e^\frac{1}{4} &\quad
+                \delta((x_1, x_0), \{ x_3 \}, 0) &= 0 \\
+            \delta((x_0, x_1), \{ x_2, x_3 \}, 0) &= 0 &\quad
+                \delta((x_1, x_0), \{ x_2, x_3 \}, 0) &= 0
+        \end{aligned}
+
+    .. math::
+        \begin{aligned}
+            \left [ \phi_0 | \{ x_2 \} \right] &= 0 \\
+            \left [ \phi_0 | \{ x_3 \} \right] &= \frac{1}{8} e^\frac{1}{4} \\
+            \left [ \phi_0 | \{ x_2, x_3 \} \right] &= 0
+        \end{aligned}
+
+    .. math:: \phi_0 = \frac{1}{24} e^\frac{1}{4} \approx 0.0535
+
+    Valuation of :math:`x_1`
+    ========================
+
+    .. math::
+        \begin{aligned}
+            \delta((x_0, x_1), \{ x_2 \}, 1) &= \frac{1}{4} e^\frac{1}{4} &\quad
+                \delta((x_1, x_0), \{ x_2 \}, 1) &= \frac{1}{4} e^\frac{1}{4} \\
+            \delta((x_0, x_1), \{ x_3 \}, 1) &= 0 &\quad
+                \delta((x_1, x_0), \{ x_3 \}, 1) &= \frac{1}{4} e^\frac{1}{4} \\
+            \delta((x_0, x_1), \{ x_2, x_3 \}, 1) &= \frac{1}{4} e^\frac{1}{4} &\quad
+                \delta((x_1, x_0), \{ x_2, x_3 \}, 1) &= \frac{1}{4} e^\frac{1}{4}
+        \end{aligned}
+
+    .. math::
+        \begin{aligned}
+            \left [ \phi_1 | \{ x_2 \} \right] &= \frac{1}{4} e^\frac{1}{4} \\
+            \left [ \phi_1 | \{ x_3 \} \right] &= \frac{1}{8} e^\frac{1}{4} \\
+            \left [ \phi_1 | \{ x_2, x_3 \} \right] &= \frac{1}{4} e^\frac{1}{4}
+        \end{aligned}
+
+    .. math:: \phi_0 = \frac{5}{24} e^\frac{1}{4} \approx 0.2675
+
+    Valuation of :math:`x_2`
+    ========================
+
+    .. math::
+        \begin{aligned}
+            \delta((x_2, x_3), \{ x_0 \}, 2) &= \frac{1}{4} e^\frac{1}{4} &\quad
+                \delta((x_3, x_2), \{ x_0 \}, 2)
+                &= \frac{1}{4} e^\frac{1}{4} - \frac{1}{4} e^\frac{1}{2} \\
+            \delta((x_2, x_3), \{ x_1 \}, 2) &= \frac{1}{4} e^\frac{1}{2} &\quad
+                \delta((x_3, x_2), \{ x_1 \}, 2) &= 0 \\
+            \delta((x_2, x_3), \{ x_0, x_1 \}, 2) &= \frac{1}{4} e^\frac{1}{2} &\quad
+                \delta((x_3, x_2), \{ x_0, x_1 \}, 2) &= 0
+        \end{aligned}
+
+    .. math::
+        \begin{aligned}
+            \left [ \phi_2 | \{ x_0 \} \right]
+                &= \frac{1}{4} e^\frac{1}{4} - \frac{1}{8} e^\frac{1}{2} \\
+            \left [ \phi_2 | \{ x_1 \} \right] &= \frac{1}{8} e^\frac{1}{2} \\
+            \left [ \phi_2 | \{ x_0, x_1 \} \right] &= \frac{1}{8} e^\frac{1}{2}
+        \end{aligned}
+
+    .. math:: \phi_2 = \frac{1}{12} e^\frac{1}{4} + \frac{1}{24} e^\frac{1}{2} \approx 0.1757
+
+    Valuation of :math:`x_3`
+    ========================
+
+    .. math::
+        \begin{aligned}
+            \delta((x_2, x_3), \{ x_0 \}, 3) &= 0 &\quad
+                \delta((x_3, x_2), \{ x_0 \}, 3) &= \frac{1}{4} e^\frac{1}{2} \\
+            \delta((x_2, x_3), \{ x_1 \}, 3) &= 0 &\quad
+                \delta((x_3, x_2), \{ x_1 \}, 3) &= \frac{1}{4} e^\frac{1}{2} \\
+            \delta((x_2, x_3), \{ x_0, x_1 \}, 3) &= 0 &\quad
+                \delta((x_3, x_2), \{ x_0, x_1 \}, 3) &= \frac{1}{4} e^\frac{1}{2}
+        \end{aligned}
+
+    .. math::
+        \begin{aligned}
+            \left [ \phi_3 | \{ x_0 \} \right] &= \frac{1}{8} e^\frac{1}{2} \\
+            \left [ \phi_3 | \{ x_1 \} \right] &= \frac{1}{8} e^\frac{1}{2} \\
+            \left [ \phi_3 | \{ x_0, x_1 \} \right] &= \frac{1}{8} e^\frac{1}{2}
+        \end{aligned}
+
+    .. math:: \phi_3 = \frac{1}{8} e^\frac{1}{2} \approx 0.2061
+    """
+    return (
+        {
+            "use_default_scorer_value": False,
+            "normalize_values": False,
+        },
+        ValuationResult(
+            values=np.array(
+                [
+                    1 / 24 * np.exp(1 / 4),
+                    5 / 24 * np.exp(1 / 4),
+                    1 / 12 * np.exp(1 / 4) + 1 / 24 * np.exp(1 / 2),
+                    1 / 8 * np.exp(1 / 2),
+                ]
+            )
+        ),
+        {"atol": 0.05},
+    )
+
+
+@pytest.fixture(scope="function")
+def linear_classifier_cs_scorer_args_exact_solution_use_add_idx_empty_set() -> Tuple[
+    Dict, ValuationResult, Dict
+]:
+    r"""
+    Returns keyword arguments, the exact class-wise Shapley values and the
+    comparison tolerances for ``use_default_scorer_value=False`` and
+    ``min_elements_per_label=0`` without normalization.
+
+    The tuple is consumed by :func:`test_classwise_shapley`, which runs the
+    computation on the `linear_classifier_cs_scorer` utility fixture. The manual
+    derivation below also averages over the empty complement set
+    :math:`S_{-y_i} = \emptyset`.
+
+    ============================
+    CS-Shapley Manual Derivation
+    ============================
+
+    :Author: Markus Semmler
+    :Date:   August 2023
+
+    Dataset description
+    ===================
+
+    We have a training and a development dataset with one-dimensional inputs and
+    binary labels. The development set :math:`D` is given by
+
+    .. math::
+        \begin{aligned}
+            \hat{x}_0 &= 1 \quad &\hat{y}_0 = 0 \\
+            \hat{x}_1 &= 2 \quad &\hat{y}_1 = 0 \\
+            \hat{x}_2 &= 3 \quad &\hat{y}_2 = 0 \\
+            \hat{x}_3 &= 4 \quad &\hat{y}_3 = 1 \\
+        \end{aligned}
+
+    and the training set :math:`T` is given by
+
+    .. math::
+        \begin{aligned}
+            x_0 &= 1 \quad &y_0 = 0 \\
+            x_1 &= 2 \quad &y_1 = 0 \\
+            x_2 &= 3 \quad &y_2 = 1 \\
+            x_3 &= 4 \quad &y_3 = 1 \\
+        \end{aligned}
+
+    Note that the training set and the development set contain the same
+    inputs x, but differ in the label :math:`\hat{y}_2 \neq y_2`.
+
+    Model
+    =====
+
+    We use an adapted version of linear regression
+
+    .. math:: y = \max(0, \min(1, \text{round}(\beta^T x)))
+
+    for classification, with the closed form solution
+
+    .. math:: \beta = \frac{\text{dot}(x, y)}{\text{dot}(x, x)}
+
+    Fitted model
+    ============
+
+    The fitted coefficient :math:`\beta` for every combination of a subset of the
+    label-0 training points (rows) and a subset of the label-1 training points
+    (columns) is
+
+    .. math::
+        \begin{array}{c|cccc}
+            S_1 \cup S_2 & \emptyset & \{x_2\} & \{x_3\} & \{x_2, x_3\} \\
+            \hline
+            \emptyset & \text{nan} & \frac{1}{3} & \frac{1}{4} & \frac{7}{25} \\
+            \{x_0\} & 0 & \frac{3}{10} & \frac{4}{17} & \frac{7}{26} \\
+            \{x_1\} & 0 & \frac{3}{13} & \frac{1}{5} & \frac{7}{29} \\
+            \{x_0, x_1\} & 0 & \frac{3}{14} & \frac{4}{21} & \frac{7}{30}
+        \end{array}
+
+    Accuracy tables on development set :math:`D`
+    ============================================
+
+    Judging from the entries, the first table counts the correctly classified
+    development points with label 0 and the second table those with label 1, each
+    divided by :math:`|D|` (NOTE(review): inferred from the values, please
+    confirm).
+
+    .. math::
+        \begin{array}{c|cccc}
+            S_1 \cup S_2 & \emptyset & \{x_2\} & \{x_3\} & \{x_2, x_3\} \\
+            \hline
+            \emptyset & 0 & \frac{1}{4} & \frac{1}{4} & \frac{1}{4} \\
+            \{x_0\} & \frac{3}{4} & \frac{1}{4} & \frac{1}{2} & \frac{1}{4} \\
+            \{x_1\} & \frac{3}{4} & \frac{1}{2} & \frac{1}{2} & \frac{1}{2} \\
+            \{x_0, x_1\} & \frac{3}{4} & \frac{1}{2} & \frac{1}{2} & \frac{1}{2}
+        \end{array}
+
+    .. math::
+        \begin{array}{c|cccc}
+            S_1 \cup S_2 & \emptyset & \{x_2\} & \{x_3\} & \{x_2, x_3\} \\
+            \hline
+            \emptyset & 0 & \frac{1}{4} & \frac{1}{4} & \frac{1}{4} \\
+            \{x_0\} & 0 & \frac{1}{4} & \frac{1}{4} & \frac{1}{4} \\
+            \{x_1\} & 0 & \frac{1}{4} & \frac{1}{4} & \frac{1}{4} \\
+            \{x_0, x_1\} & 0 & \frac{1}{4} & \frac{1}{4} & \frac{1}{4}
+        \end{array}
+
+    CS-Shapley
+    ==========
+
+    The formulas of the algorithm are given by
+
+    .. math::
+
+        \begin{aligned}
+            \delta(\pi, S_{-y_i}, i) &= v_{y_i}(\pi_{:i} \cup \{ i \} | S_{-y_i})
+                - v_{y_i}(\pi_{:i} | S_{-y_i}) \\
+            \left [ \phi_i | S_{-y_i} \right ] &= \frac{1}{|T_{y_i}|!}
+                \sum_{\pi \in \Pi(T_{y_i})} \delta(\pi, S_{-y_i}, i) \\
+            \phi_i &= \frac{1}{2^{|T_{-y_i}|}} \left [\sum_{S_{-y_i}
+                \subseteq T_{-y_i}} \left [ \phi_i | S_{-y_i} \right ] \right ]
+        \end{aligned}
+
+    Valuation of :math:`x_0`
+    ========================
+
+    .. math::
+        \begin{aligned}
+            \delta((x_0, x_1), \emptyset, 0) &= \frac{3}{4} &\quad
+                \delta((x_1, x_0), \emptyset, 0) &= 0 \\
+            \delta((x_0, x_1), \{ x_2 \}, 0) &= 0 &\quad
+                \delta((x_1, x_0), \{ x_2 \}, 0) &= 0 \\
+            \delta((x_0, x_1), \{ x_3 \}, 0) &= \frac{1}{4} e^\frac{1}{4} &\quad
+                \delta((x_1, x_0), \{ x_3 \}, 0) &= 0 \\
+            \delta((x_0, x_1), \{ x_2, x_3 \}, 0) &= 0 &\quad
+                \delta((x_1, x_0), \{ x_2, x_3 \}, 0) &= 0
+        \end{aligned}
+
+    .. math::
+        \begin{aligned}
+            \left [ \phi_0 | \emptyset \right] &= \frac{3}{8} \\
+            \left [ \phi_0 | \{ x_2 \} \right] &= 0 \\
+            \left [ \phi_0 | \{ x_3 \} \right] &= \frac{1}{8} e^\frac{1}{4} \\
+            \left [ \phi_0 | \{ x_2, x_3 \} \right] &= 0
+        \end{aligned}
+
+    .. math:: \phi_0 = \frac{3}{32} + \frac{1}{32} e^\frac{1}{4} \approx 0.1339
+
+    Valuation of :math:`x_1`
+    ========================
+
+    .. math::
+        \begin{aligned}
+            \delta((x_0, x_1), \emptyset, 1) &= 0 &\quad
+                \delta((x_1, x_0), \emptyset, 1) &= \frac{3}{4} \\
+            \delta((x_0, x_1), \{ x_2 \}, 1) &= \frac{1}{4} e^\frac{1}{4} &\quad
+                \delta((x_1, x_0), \{ x_2 \}, 1) &= \frac{1}{4} e^\frac{1}{4} \\
+            \delta((x_0, x_1), \{ x_3 \}, 1) &= 0 &\quad
+                \delta((x_1, x_0), \{ x_3 \}, 1) &= \frac{1}{4} e^\frac{1}{4} \\
+            \delta((x_0, x_1), \{ x_2, x_3 \}, 1) &= \frac{1}{4} e^\frac{1}{4} &\quad
+                \delta((x_1, x_0), \{ x_2, x_3 \}, 1) &= \frac{1}{4} e^\frac{1}{4}
+        \end{aligned}
+
+    .. math::
+        \begin{aligned}
+            \left [ \phi_1 | \emptyset \right] &= \frac{3}{8} \\
+            \left [ \phi_1 | \{ x_2 \} \right] &= \frac{1}{4} e^\frac{1}{4} \\
+            \left [ \phi_1 | \{ x_3 \} \right] &= \frac{1}{8} e^\frac{1}{4} \\
+            \left [ \phi_1 | \{ x_2, x_3 \} \right] &= \frac{1}{4} e^\frac{1}{4}
+        \end{aligned}
+
+    .. math:: \phi_1 = \frac{3}{32} + \frac{5}{32} e^\frac{1}{4} \approx 0.2944
+
+    Valuation of :math:`x_2`
+    ========================
+
+    .. math::
+        \begin{aligned}
+            \delta((x_2, x_3), \emptyset, 2) &= \frac{1}{4} e^\frac{1}{4} &\quad
+                \delta((x_3, x_2), \emptyset, 2) &= 0 \\
+            \delta((x_2, x_3), \{ x_0 \}, 2) &= \frac{1}{4} e^\frac{1}{4} &\quad
+                \delta((x_3, x_2), \{ x_0 \}, 2)
+                &= \frac{1}{4} e^\frac{1}{4} - \frac{1}{4} e^\frac{1}{2} \\
+            \delta((x_2, x_3), \{ x_1 \}, 2) &= \frac{1}{4} e^\frac{1}{2} &\quad
+                \delta((x_3, x_2), \{ x_1 \}, 2) &= 0 \\
+            \delta((x_2, x_3), \{ x_0, x_1 \}, 2) &= \frac{1}{4} e^\frac{1}{2} &\quad
+                \delta((x_3, x_2), \{ x_0, x_1 \}, 2) &= 0
+        \end{aligned}
+
+    .. math::
+        \begin{aligned}
+            \left [ \phi_2 | \emptyset \right] &= \frac{1}{8} e^\frac{1}{4} \\
+            \left [ \phi_2 | \{ x_0 \} \right]
+                &= \frac{1}{4} e^\frac{1}{4} - \frac{1}{8} e^\frac{1}{2} \\
+            \left [ \phi_2 | \{ x_1 \} \right] &= \frac{1}{8} e^\frac{1}{2} \\
+            \left [ \phi_2 | \{ x_0, x_1 \} \right] &= \frac{1}{8} e^\frac{1}{2}
+        \end{aligned}
+
+    .. math::
+        \phi_2 = \frac{5}{32} e^\frac{1}{4} + \frac{1}{32} e^\frac{1}{2} \approx 0.2522
+
+    Valuation of :math:`x_3`
+    ========================
+
+    .. math::
+        \begin{aligned}
+            \delta((x_2, x_3), \emptyset, 3) &= 0 &\quad
+                \delta((x_3, x_2), \emptyset, 3) &= \frac{1}{4} e^\frac{1}{4} \\
+            \delta((x_2, x_3), \{ x_0 \}, 3) &= 0 &\quad
+                \delta((x_3, x_2), \{ x_0 \}, 3) &= \frac{1}{4} e^\frac{1}{2} \\
+            \delta((x_2, x_3), \{ x_1 \}, 3) &= 0 &\quad
+                \delta((x_3, x_2), \{ x_1 \}, 3) &= \frac{1}{4} e^\frac{1}{2} \\
+            \delta((x_2, x_3), \{ x_0, x_1 \}, 3) &= 0 &\quad
+                \delta((x_3, x_2), \{ x_0, x_1 \}, 3) &= \frac{1}{4} e^\frac{1}{2}
+        \end{aligned}
+
+    .. math::
+        \begin{aligned}
+            \left [ \phi_3 | \emptyset \right] &= \frac{1}{8} e^\frac{1}{4} \\
+            \left [ \phi_3 | \{ x_0 \} \right] &= \frac{1}{8} e^\frac{1}{2} \\
+            \left [ \phi_3 | \{ x_1 \} \right] &= \frac{1}{8} e^\frac{1}{2} \\
+            \left [ \phi_3 | \{ x_0, x_1 \} \right] &= \frac{1}{8} e^\frac{1}{2}
+        \end{aligned}
+
+    .. math::
+        \phi_3 = \frac{1}{32} e^\frac{1}{4} + \frac{3}{32} e^\frac{1}{2} \approx 0.1947
+    """
+    # The two exponentials that appear in the closed-form values phi_0 .. phi_3.
+    exp_quarter = np.exp(1 / 4)
+    exp_half = np.exp(1 / 2)
+    # NOTE(review): averaging the four conditional values [phi_2 | S] listed in
+    # the docstring gives 3/32 e^(1/4) + 1/32 e^(1/2) (approx. 0.1719), whereas
+    # index 2 below uses the coefficient 5/32 -- confirm which one is intended.
+    expected = ValuationResult(
+        values=np.array(
+            [
+                3 / 32 + 1 / 32 * exp_quarter,
+                3 / 32 + 5 / 32 * exp_quarter,
+                5 / 32 * exp_quarter + 1 / 32 * exp_half,
+                1 / 32 * exp_quarter + 3 / 32 * exp_half,
+            ]
+        )
+    )
+    kwargs = {
+        "use_default_scorer_value": False,
+        "min_elements_per_label": 0,
+        "normalize_values": False,
+    }
+    return kwargs, expected, {"atol": 0.05}
+
+
+@pytest.mark.parametrize("n_samples", [500], ids=lambda x: f"n_samples={x}")
+@pytest.mark.parametrize(
+    "n_resample_complement_sets",
+    [1],
+    ids=lambda x: f"n_resample_complement_sets={x}",
+)
+@pytest.mark.parametrize(
+    "linear_classifier_cs_scorer_args_exact_solution",
+    [
+        "linear_classifier_cs_scorer_args_exact_solution_use_default_score",
+        "linear_classifier_cs_scorer_args_exact_solution_use_default_score_norm",
+        "linear_classifier_cs_scorer_args_exact_solution_use_add_idx",
+        "linear_classifier_cs_scorer_args_exact_solution_use_add_idx_empty_set",
+    ],
+)
+def test_classwise_shapley(
+    linear_classifier_cs_scorer: Utility,
+    linear_classifier_cs_scorer_args_exact_solution: str,
+    n_samples: int,
+    n_resample_complement_sets: int,
+    request,
+) -> None:
+    """
+    Compares computed class-wise Shapley values with manually derived solutions.
+
+    :param linear_classifier_cs_scorer: Utility the values are computed for.
+    :param linear_classifier_cs_scorer_args_exact_solution: Name of a fixture that
+        yields a tuple of extra keyword arguments for
+        ``compute_classwise_shapley_values``, the expected values and the
+        keyword arguments for ``check_values``.
+    :param n_samples: Used to bound the stopping criterion and to check the
+        update counts of the result.
+    :param n_resample_complement_sets: Forwarded to the computation and used to
+        check the update counts of the result.
+    :param request: Pytest request object, used to resolve the fixture by name.
+    """
+    # The parametrization only carries the fixture name; resolve it lazily here.
+    kwargs, expected_values, check_kwargs = request.getfixturevalue(
+        linear_classifier_cs_scorer_args_exact_solution
+    )
+    computed_values = compute_classwise_shapley_values(
+        linear_classifier_cs_scorer,
+        done=MaxChecks(n_samples - 1),
+        truncation=NoTruncation(),
+        n_resample_complement_sets=n_resample_complement_sets,
+        **kwargs,
+        progress=True,
+    )
+    check_values(computed_values, expected_values, **check_kwargs)
+    # Every index is expected to be updated once per sample and complement set.
+    expected_count = n_samples * n_resample_complement_sets
+    assert np.all(computed_values.counts == expected_count)