Merge pull request #439 from aai-institute/feature/antithetic-sampling
Improve antithetic sampling
mdbenito authored Sep 30, 2023
2 parents 3d0e579 + c804a4a commit 23316c2
Showing 4 changed files with 60 additions and 7 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -4,6 +4,8 @@

- Using pytest-xdist for faster local tests
[PR #440](https://github.com/aai-institute/pyDVL/pull/440)
- Added `AntitheticPermutationSampler`
[PR #439](https://github.com/aai-institute/pyDVL/pull/439)
- Implementation of Data-OOB by @BastienZim
[PR #426](https://github.com/aai-institute/pyDVL/pull/426),
[PR #431](https://github.com/aai-institute/pyDVL/pull/431)
17 changes: 16 additions & 1 deletion docs/assets/pydvl.bib
@@ -24,7 +24,7 @@ @article{benmerzoug_re_2023
doi = {10.5281/zenodo.8173733},
url = {https://zenodo.org/record/8173733},
urldate = {2023-08-27},
abstract = {Replication}
abstract = {We investigate the results of [1] in the field of data valuation. We repeat their experiments and conclude that the (Monte Carlo) Least Core is sensitive to important characteristics of the ML problem of interest, making it difficult to apply.},
}

@article{castro_polynomial_2009,
@@ -198,6 +198,21 @@ @inproceedings{kwon_efficient_2021
langid = {english}
}

@article{mitchell_sampling_2022,
title = {Sampling {{Permutations}} for {{Shapley Value Estimation}}},
author = {Mitchell, Rory and Cooper, Joshua and Frank, Eibe and Holmes, Geoffrey},
date = {2022},
journaltitle = {Journal of Machine Learning Research},
shortjournal = {J. Mach. Learn. Res.},
volume = {23},
number = {43},
pages = {1--46},
issn = {1533-7928},
url = {http://jmlr.org/papers/v23/21-0439.html},
urldate = {2022-10-23},
abstract = {Game-theoretic attribution techniques based on Shapley values are used to interpret black-box machine learning models, but their exact calculation is generally NP-hard, requiring approximation methods for non-trivial models. As the computation of Shapley values can be expressed as a summation over a set of permutations, a common approach is to sample a subset of these permutations for approximation. Unfortunately, standard Monte Carlo sampling methods can exhibit slow convergence, and more sophisticated quasi-Monte Carlo methods have not yet been applied to the space of permutations. To address this, we investigate new approaches based on two classes of approximation methods and compare them empirically. First, we demonstrate quadrature techniques in a RKHS containing functions of permutations, using the Mallows kernel in combination with kernel herding and sequential Bayesian quadrature. The RKHS perspective also leads to quasi-Monte Carlo type error bounds, with a tractable discrepancy measure defined on permutations. Second, we exploit connections between the hypersphere $S^{d-2}$ and permutations to create practical algorithms for generating permutation samples with good properties. Experiments show the above techniques provide significant improvements for Shapley value estimates over existing methods, converging to a smaller RMSE in the same number of model evaluations.}
}

@inproceedings{okhrati_multilinear_2021,
title = {A {{Multilinear Sampling Algorithm}} to {{Estimate Shapley Values}}},
booktitle = {2020 25th {{International Conference}} on {{Pattern Recognition}} ({{ICPR}})},
45 changes: 39 additions & 6 deletions src/pydvl/value/sampler.py
@@ -27,7 +27,7 @@
compute any semi-value, in particular Shapley and Beta values, and Banzhaf
indices.
# Slicing of samplers
## Slicing of samplers
The samplers can be sliced for parallel computation. For those which are
embarrassingly parallel, this is done by slicing the set of "outer" indices and
@@ -36,6 +36,15 @@
and [UniformSampler][pydvl.value.sampler.UniformSampler]. In contrast, slicing a
[PermutationSampler][pydvl.value.sampler.PermutationSampler] creates a new
sampler which iterates over the same indices.
## References
[^1]: <a name="mitchell_sampling_2022"></a>Mitchell, Rory, Joshua Cooper, Eibe
Frank, and Geoffrey Holmes. [Sampling Permutations for Shapley Value
Estimation](http://jmlr.org/papers/v23/21-0439.html). Journal of Machine
Learning Research 23, no. 43 (2022): 1–46.
"""

from __future__ import annotations
@@ -315,18 +324,19 @@ class AntitheticSampler(StochasticSamplerMixin, PowersetSampler[IndexT]):
"""An iterator to perform uniform random sampling of subsets, and their
complements.
Works as :class:`~pydvl.value.sampler.UniformSampler`, but for every tuple
$(i,S)$, it subsequently returns $(i,S^c)$, where $S^c$ is the complement of
the set $S$, including the index $i$ itself.
Works as [UniformSampler][pydvl.value.sampler.UniformSampler], but for every
tuple $(i,S)$, it subsequently returns $(i,S^c)$, where $S^c$ is the
complement of the set $S$ in the set of indices, excluding $i$.
"""

def __iter__(self) -> Iterator[SampleT]:
while True:
for idx in self.iterindices():
subset = random_subset(self.complement([idx]), seed=self._rng)
_complement = self.complement([idx])
subset = random_subset(_complement, seed=self._rng)
yield idx, subset
self._n_samples += 1
yield idx, self.complement(np.concatenate((subset, np.array([idx]))))
yield idx, np.setxor1d(_complement, subset)
self._n_samples += 1
if self._n_samples == 0: # Empty index set
break
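
For intuition, the pairing produced by the fixed `__iter__` above can be reproduced outside the library with plain NumPy. This is only an illustrative sketch, not the pyDVL API: `antithetic_pairs` is a made-up helper, whereas the real sampler uses its own `random_subset` and `complement` utilities.

```python
import numpy as np

rng = np.random.default_rng(42)
indices = np.arange(5)

def antithetic_pairs(indices, rng):
    """Yield (i, S) followed by (i, S^c), both drawn from the indices other than i."""
    for i in indices:
        others = np.setdiff1d(indices, [i])                   # all indices except i
        mask = rng.integers(0, 2, size=len(others)).astype(bool)
        subset = others[mask]                                  # uniformly random subset S
        yield i, subset
        yield i, np.setxor1d(others, subset)                   # antithetic complement S^c

for i, s in antithetic_pairs(indices, rng):
    print(i, s)
```

As in the corrected sampler, the second sample is the complement taken within the remaining indices, so $i$ itself never appears in either subset.
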
@@ -372,6 +382,29 @@ def weight(cls, n: int, subset_len: int) -> float:
return n * math.comb(n - 1, subset_len) if n > 0 else 1.0


class AntitheticPermutationSampler(PermutationSampler[IndexT]):
"""Samples permutations like
[PermutationSampler][pydvl.value.sampler.PermutationSampler], but after
each permutation, it returns the same permutation in reverse order.
This sampler was suggested in (Mitchell et al. 2022)<sup><a
href="#mitchell_sampling_2022">1</a></sup>
!!! tip "New in version 0.7.1"
"""

def __iter__(self) -> Iterator[SampleT]:
while True:
permutation = self._rng.permutation(self._indices)
for perm in permutation, permutation[::-1]:
for i, idx in enumerate(perm):
yield idx, perm[:i]
self._n_samples += 1

if self._n_samples == 0: # Empty index set
break
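
To see what the new class yields, here is a standalone sketch of the antithetic permutation scheme (plain NumPy; `antithetic_permutation_samples` is a hypothetical name, not part of pyDVL). Each drawn permutation is immediately followed by its reverse, and every position contributes the index together with the prefix that precedes it, mirroring `__iter__` above.

```python
import numpy as np

rng = np.random.default_rng(0)
indices = np.arange(4)

def antithetic_permutation_samples(indices, rng, n_permutations=2):
    """Yield (idx, prefix) for each random permutation and then for its reverse."""
    for _ in range(n_permutations):
        permutation = rng.permutation(indices)
        for perm in (permutation, permutation[::-1]):
            for i, idx in enumerate(perm):
                yield idx, perm[:i]   # prefix preceding idx in this ordering

for idx, prefix in antithetic_permutation_samples(indices, rng):
    print(idx, prefix)
```

Evaluating both orderings of the same permutation reuses the drawn randomness, which is the variance-reduction idea behind the antithetic scheme proposed in Mitchell et al. (2022).
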


class DeterministicPermutationSampler(PermutationSampler[IndexT]):
"""Samples all n! permutations of the indices deterministically, and
iterates through them, returning sets as required for the permutation-based
3 changes: 3 additions & 0 deletions tests/value/test_semivalues.py
@@ -7,6 +7,7 @@
from pydvl.parallel.config import ParallelConfig
from pydvl.utils.types import Seed
from pydvl.value.sampler import (
AntitheticPermutationSampler,
AntitheticSampler,
DeterministicPermutationSampler,
DeterministicUniformSampler,
@@ -36,6 +37,7 @@
UniformSampler,
PermutationSampler,
AntitheticSampler,
AntitheticPermutationSampler,
],
)
@pytest.mark.parametrize("coefficient", [shapley_coefficient, beta_coefficient(1, 1)])
@@ -112,6 +114,7 @@ def test_shapley_batch_size(
UniformSampler,
PermutationSampler,
AntitheticSampler,
AntitheticPermutationSampler,
],
)
def test_banzhaf(
