diff --git a/CHANGELOG.md b/CHANGELOG.md
index f083e7aee..c9c068667 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,8 @@
- Using pytest-xdist for faster local tests
[PR #440](https://github.com/aai-institute/pyDVL/pull/440)
+- Added `AntitheticPermutationSampler`
+ [PR #439](https://github.com/aai-institute/pyDVL/pull/439)
- Implementation of Data-OOB by @BastienZim
[PR #426](https://github.com/aai-institute/pyDVL/pull/426),
[PR #431](https://github.com/aai-institute/pyDVL/pull/431)
diff --git a/docs/assets/pydvl.bib b/docs/assets/pydvl.bib
index c622e5541..e87ad3484 100644
--- a/docs/assets/pydvl.bib
+++ b/docs/assets/pydvl.bib
@@ -24,7 +24,7 @@ @article{benmerzoug_re_2023
doi = {10.5281/zenodo.8173733},
url = {https://zenodo.org/record/8173733},
urldate = {2023-08-27},
- abstract = {Replication}
+ abstract = {We investigate the results of [1] in the field of data valuation. We repeat their experiments and conclude that the (Monte Carlo) Least Core is sensitive to important characteristics of the ML problem of interest, making it difficult to apply.},
}
@article{castro_polynomial_2009,
@@ -198,6 +198,21 @@ @inproceedings{kwon_efficient_2021
langid = {english}
}
+@article{mitchell_sampling_2022,
+ title = {Sampling {{Permutations}} for {{Shapley Value Estimation}}},
+ author = {Mitchell, Rory and Cooper, Joshua and Frank, Eibe and Holmes, Geoffrey},
+ date = {2022},
+ journaltitle = {Journal of Machine Learning Research},
+ shortjournal = {J. Mach. Learn. Res.},
+ volume = {23},
+ number = {43},
+ pages = {1--46},
+ issn = {1533-7928},
+ url = {http://jmlr.org/papers/v23/21-0439.html},
+ urldate = {2022-10-23},
+ abstract = {Game-theoretic attribution techniques based on Shapley values are used to interpret black-box machine learning models, but their exact calculation is generally NP-hard, requiring approximation methods for non-trivial models. As the computation of Shapley values can be expressed as a summation over a set of permutations, a common approach is to sample a subset of these permutations for approximation. Unfortunately, standard Monte Carlo sampling methods can exhibit slow convergence, and more sophisticated quasi-Monte Carlo methods have not yet been applied to the space of permutations. To address this, we investigate new approaches based on two classes of approximation methods and compare them empirically. First, we demonstrate quadrature techniques in a RKHS containing functions of permutations, using the Mallows kernel in combination with kernel herding and sequential Bayesian quadrature. The RKHS perspective also leads to quasi-Monte Carlo type error bounds, with a tractable discrepancy measure defined on permutations. Second, we exploit connections between the hypersphere $\mathbb{S}^{d-2}$ and permutations to create practical algorithms for generating permutation samples with good properties. Experiments show the above techniques provide significant improvements for Shapley value estimates over existing methods, converging to a smaller RMSE in the same number of model evaluations.}
+}
+
@inproceedings{okhrati_multilinear_2021,
title = {A {{Multilinear Sampling Algorithm}} to {{Estimate Shapley Values}}},
booktitle = {2020 25th {{International Conference}} on {{Pattern Recognition}} ({{ICPR}})},
diff --git a/src/pydvl/value/sampler.py b/src/pydvl/value/sampler.py
index a51dbfb79..58961dcb8 100644
--- a/src/pydvl/value/sampler.py
+++ b/src/pydvl/value/sampler.py
@@ -27,7 +27,7 @@
compute any semi-value, in particular Shapley and Beta values, and Banzhaf
indices.
-# Slicing of samplers
+## Slicing of samplers
The samplers can be sliced for parallel computation. For those which are
embarrassingly parallel, this is done by slicing the set of "outer" indices and
@@ -36,6 +36,15 @@
and [UniformSampler][pydvl.value.sampler.UniformSampler]. In contrast, slicing a
[PermutationSampler][pydvl.value.sampler.PermutationSampler] creates a new
sampler which iterates over the same indices.
+
+
+## References
+
+[^1]: Mitchell, Rory, Joshua Cooper, Eibe
+ Frank, and Geoffrey Holmes. [Sampling Permutations for Shapley Value
+ Estimation](http://jmlr.org/papers/v23/21-0439.html). Journal of Machine
+ Learning Research 23, no. 43 (2022): 1–46.
+
"""
from __future__ import annotations
@@ -315,18 +324,19 @@ class AntitheticSampler(StochasticSamplerMixin, PowersetSampler[IndexT]):
"""An iterator to perform uniform random sampling of subsets, and their
complements.
- Works as :class:`~pydvl.value.sampler.UniformSampler`, but for every tuple
- $(i,S)$, it subsequently returns $(i,S^c)$, where $S^c$ is the complement of
- the set $S$, including the index $i$ itself.
+ Works as [UniformSampler][pydvl.value.sampler.UniformSampler], but for every
+ tuple $(i,S)$, it subsequently returns $(i,S^c)$, where $S^c$ is the
+ complement of the set $S$ in the set of indices, excluding $i$.
"""
def __iter__(self) -> Iterator[SampleT]:
while True:
for idx in self.iterindices():
- subset = random_subset(self.complement([idx]), seed=self._rng)
+ _complement = self.complement([idx])
+ subset = random_subset(_complement, seed=self._rng)
yield idx, subset
self._n_samples += 1
- yield idx, self.complement(np.concatenate((subset, np.array([idx]))))
+ yield idx, np.setxor1d(_complement, subset)
self._n_samples += 1
if self._n_samples == 0: # Empty index set
break
@@ -372,6 +382,29 @@ def weight(cls, n: int, subset_len: int) -> float:
return n * math.comb(n - 1, subset_len) if n > 0 else 1.0
+class AntitheticPermutationSampler(PermutationSampler[IndexT]):
+ """Samples permutations like
+ [PermutationSampler][pydvl.value.sampler.PermutationSampler], but after
+ each permutation, it returns the same permutation in reverse order.
+
+ This sampler was suggested in (Mitchell et al. 2022)[^1].
+
+ !!! tip "New in version 0.7.1"
+ """
+
+ def __iter__(self) -> Iterator[SampleT]:
+ while True:
+ permutation = self._rng.permutation(self._indices)
+ for perm in permutation, permutation[::-1]:  # the sampled order, then its reverse
+ for i, idx in enumerate(perm):
+ yield idx, perm[:i]  # idx paired with the indices preceding it in perm
+ self._n_samples += 1
+
+ if self._n_samples == 0: # Empty index set: nothing was yielded, so stop instead of looping forever
+ break
+
+
class DeterministicPermutationSampler(PermutationSampler[IndexT]):
"""Samples all n! permutations of the indices deterministically, and
iterates through them, returning sets as required for the permutation-based
diff --git a/tests/value/test_semivalues.py b/tests/value/test_semivalues.py
index 0a71bcd54..23694e6ac 100644
--- a/tests/value/test_semivalues.py
+++ b/tests/value/test_semivalues.py
@@ -7,6 +7,7 @@
from pydvl.parallel.config import ParallelConfig
from pydvl.utils.types import Seed
from pydvl.value.sampler import (
+ AntitheticPermutationSampler,
AntitheticSampler,
DeterministicPermutationSampler,
DeterministicUniformSampler,
@@ -36,6 +37,7 @@
UniformSampler,
PermutationSampler,
AntitheticSampler,
+ AntitheticPermutationSampler,
],
)
@pytest.mark.parametrize("coefficient", [shapley_coefficient, beta_coefficient(1, 1)])
@@ -112,6 +114,7 @@ def test_shapley_batch_size(
UniformSampler,
PermutationSampler,
AntitheticSampler,
+ AntitheticPermutationSampler,
],
)
def test_banzhaf(