Add first test case and add minor refactorings.
Markus Semmler committed Apr 12, 2023
1 parent 2aed995 commit 0c51d75
Showing 6 changed files with 269 additions and 81 deletions.
5 changes: 3 additions & 2 deletions src/pydvl/utils/numeric.py
@@ -156,14 +156,15 @@ def random_powerset_group_conditional(
     if n_samples is None:
         n_samples = np.iinfo(np.int32).max
 
+    unique_labels = np.unique(labels)
     while total <= n_samples:
 
         subsets: List[NDArray[T]] = []
-        for label in labels:
+        for label in unique_labels:
             label_indices = np.asarray(np.where(labels == label)[0])
             subset_length = int(
                 rng.integers(
-                    min(min_elements, len(label_indices) - 1), len(label_indices)
+                    min(min_elements, len(label_indices)), len(label_indices) + 1
                 )
             )
             subsets.append(random_subset_of_size(s[label_indices], subset_length))
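Aside on the bounds change: NumPy's Generator.integers(low, high) samples from the half-open interval [low, high), so the old call could never draw a subset containing all of a label's indices, and for a singleton class the old bounds (0, 1) always produced the empty subset. A minimal sketch of the corrected bounds (seed and sizes arbitrary):

import numpy as np

rng = np.random.default_rng(0)
n = 3  # size of one label's index set
min_elements = 1
draws = {int(rng.integers(min(min_elements, n), n + 1)) for _ in range(200)}
assert min(draws) >= min_elements
assert max(draws) == n  # the full subset is now a possible draw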
8 changes: 7 additions & 1 deletion src/pydvl/utils/score.py
@@ -21,7 +21,13 @@
 
 from pydvl.utils.types import SupervisedModel
 
-__all__ = ["Scorer", "compose_score", "squashed_r2", "squashed_variance"]
+__all__ = [
+    "Scorer",
+    "ScorerCallable",
+    "compose_score",
+    "squashed_r2",
+    "squashed_variance",
+]
 
 
 class ScorerCallable(Protocol):
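Since ScorerCallable's body is collapsed above, here is a hedged sketch of how such a Protocol is satisfied structurally: any plain callable with a matching signature type-checks, which is what the widened Utility annotation below relies on. The (model, X, y) -> float shape is an assumption based on how Scorer objects are invoked.

from typing import Protocol

class _ScorerLike(Protocol):  # illustrative stand-in for ScorerCallable
    def __call__(self, model, X, y) -> float:
        ...

def accuracy(model, X, y) -> float:
    return float(model.score(X, y))

scorer: _ScorerLike = accuracy  # OK: structural match, no subclassing needed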
4 changes: 2 additions & 2 deletions src/pydvl/utils/utility.py
@@ -27,7 +27,7 @@
 from pydvl.utils import Dataset
 from pydvl.utils.caching import CacheStats, memcached, serialize
 from pydvl.utils.config import MemcachedConfig
-from pydvl.utils.score import Scorer
+from pydvl.utils.score import Scorer, ScorerCallable
 from pydvl.utils.types import SupervisedModel
 
 __all__ = ["Utility", "DataUtilityLearning", "MinerGameUtility", "GlovesGameUtility"]
@@ -110,7 +110,7 @@ def __init__(
         self,
         model: SupervisedModel,
         data: Dataset,
-        scorer: Optional[Union[str, Scorer]] = None,
+        scorer: Optional[Union[str, ScorerCallable]] = None,
         *,
         default_score: float = 0.0,
         score_range: Tuple[float, float] = (-np.inf, np.inf),
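Usage sketch for the widened parameter (illustrative names; the model and Dataset are assumed to be built elsewhere): a bare function can now be passed where previously only a str or a Scorer instance was annotated.

from pydvl.utils import Utility

def my_scorer(model, X, y) -> float:
    # Any callable matching ScorerCallable is accepted by Utility.
    return float(model.score(X, y))

# u = Utility(model=some_model, data=some_dataset, scorer=my_scorer)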
4 changes: 2 additions & 2 deletions src/pydvl/value/result.py
@@ -499,9 +499,9 @@ def __add__(self, other: "ValuationResult") -> "ValuationResult":
         """
         # empty results
-        if len(self.values) == 0:
+        if len(self.values) == 0 or np.all(self.counts == 0):
             return other
-        if len(other.values) == 0:
+        if len(other.values) == 0 or np.all(other.counts == 0):
             return self
 
         self._check_compatible(other)
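The second clause covers results that are allocated but never updated: their values array is non-empty, yet all counts are zero, so they carry no information and should act as the additive identity. A sketch, assuming pydvl's ValuationResult.zeros() constructor (its exact signature may differ):

import numpy as np
from pydvl.value import ValuationResult

empty = ValuationResult.zeros(indices=np.arange(3))  # non-empty values, counts all zero
assert np.all(empty.counts == 0)
# With this change, `empty + other` returns `other` unchanged instead of
# averaging zeros into its values.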
235 changes: 166 additions & 69 deletions src/pydvl/value/shapley/classwise.py
@@ -14,45 +14,20 @@
 import numpy as np
 from numpy._typing import NDArray
 
-from pydvl.utils import MapReduceJob, ParallelConfig, SupervisedModel, Utility
+from pydvl.utils import MapReduceJob, ParallelConfig, Scorer, SupervisedModel, Utility
 
-__all__ = ["class_wise_shapley", "CSScorer"]
+__all__ = [
+    "class_wise_shapley",
+    "CSScorer",
+]
 
 from sklearn.metrics import accuracy_score
 from tqdm import tqdm
 
 from pydvl.utils.numeric import random_powerset_group_conditional
 from pydvl.value import StoppingCriterion, ValuationResult
 
 
-def _estimate_in_out_cls_accuracy(
-    model: SupervisedModel, x: np.ndarray, labels: np.ndarray, label: np.int_
-) -> Tuple[float, float]:
-    """
-    Estimate the in and out of class accuracy as defined in [1], Equation 3.
-    :param model: A model to be used for predicting the labels.
-    :param x: The inputs to be used for measuring the accuracies. Has to match the labels.
-    :param labels: The labels ot be used for measuring the accuracies. It is divided further by the passed label.
-    :param label: The label of the class, which is currently viewed.
-    :return: A tuple, containing the in class accuracy as well as the out of class accuracy.
-    """
-    n = len(x)
-    y_pred = model.predict(x)
-    label_set_match = labels == label
-    label_set = np.where(label_set_match)[0]
-    complement_label_set = np.where(~label_set_match)[0]
-
-    acc_in_cls = (
-        accuracy_score(labels[label_set], y_pred[label_set], normalize=False) / n
-    )
-    acc_out_of_cls = (
-        accuracy_score(
-            labels[complement_label_set], y_pred[complement_label_set], normalize=False
-        )
-        / n
-    )
-    return acc_in_cls, acc_out_of_cls
+IntArray = NDArray[np.int_]
 
 
 class CSScorer:
@@ -87,13 +62,13 @@ def __call__(
 
 
 def _class_wise_shapley_worker(
-    indices: Sequence[int],
+    indices: np.ndarray,
     u: Utility,
     *,
     progress: bool = True,
-    num_resample_complement_sets: int = 1,
     done: StoppingCriterion,
     eps: float = 1e-4,
+    num_resample_complement_sets: int = 1,
 ) -> ValuationResult:
     r"""Computes the class-wise Shapley value using the formulation with permutations:
@@ -119,52 +94,26 @@
         data_names=u.data.data_names[indices],
     )
 
-    x_train, y_train = u.data.get_training_data(indices)
+    _, y_train = u.data.get_training_data(indices)
     unique_labels = np.unique(y_train)
     pbar = tqdm(disable=not progress, position=0, total=100, unit="%")
 
     while not done(result):
 
         pbar.n = 100 * done.completion()
         pbar.refresh()
 
         for idx_label, label in enumerate(unique_labels):
 
             u.scorer.label = label
-            active_elements = y_train == label
-            label_set = np.where(active_elements)[0]
-            complement_label_set = np.where(~active_elements)[0]
-            label_set = indices[label_set]
-            complement_label_set = indices[complement_label_set]
-
-            _, complement_y_train = u.data.get_training_data(complement_label_set)
-            permutation_label_set = np.random.permutation(label_set)
-
-            for kl, subset_complement in enumerate(
-                random_powerset_group_conditional(
-                    complement_label_set,
-                    complement_y_train,
-                    n_samples=num_resample_complement_sets,
-                )
-            ):
-
-                train_set = np.concatenate((label_set, subset_complement))
-                final_score = u(train_set)
-                prev_score = 0.0
-
-                for i, _ in enumerate(label_set):
-
-                    if np.abs(prev_score - final_score) < eps:
-                        score = prev_score
-
-                    else:
-                        train_set = np.concatenate(
-                            (permutation_label_set[: i + 1], subset_complement)
-                        )
-                        score = u(train_set)
-
-                    marginal = score - prev_score
-                    result.update(permutation_label_set[i], marginal)
-                    prev_score = score
+            result = __class_complement_conditional_sampler(
+                u,
+                label,
+                result,
+                active_indices=indices,
+                eps=eps,
+                num_samples=num_resample_complement_sets,
+            )
 
     return result
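Note that the refactored worker's outer loop is driven entirely by the StoppingCriterion: done(result) decides termination and done.completion() feeds the tqdm bar. A hedged construction sketch (MaxUpdates is one of pydvl's stock criteria; see pydvl.value.stopping for its exact semantics):

from pydvl.value import MaxUpdates

done = MaxUpdates(500)  # stop once the configured number of value updates is reached
# result = _class_wise_shapley_worker(u.data.indices, u, done=done)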

Expand All @@ -179,6 +128,18 @@ def class_wise_shapley(
n_jobs: int = 4,
config: ParallelConfig = ParallelConfig(),
) -> ValuationResult:
"""
Computes the class-wise Shapley value using the formulation with permutations using map reduce.
:param u: Utility object with model, data, and scoring function. The scoring function has to be of type CSScorer.
:param progress: Whether to display progress bars for each job.
:param done: Criterion on when no new permutation shall be sampled.
:param normalize_score: Whether to normalize the score by the number of classes.
:param eps: The threshold when updating using the truncated monte carlo estimator.
:param n_jobs: Number of jobs to run in parallel.
:param config: Parallel configuration.
:return: ValuationResult object with the data values.
"""

map_reduce_job: MapReduceJob[NDArray, ValuationResult] = MapReduceJob(
u.data.indices,
@@ -209,3 +170,139 @@
         result.values[label_set] *= in_cls_acc / sigma
 
     return result
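Reading of the normalization line above: if sigma is the raw sum of values over the class's label_set (its definition sits in the collapsed lines, so this reading is inferred from the visible code), then rescaling by in_cls_acc / sigma makes each class's values sum exactly to its in-class accuracy, i.e. sum_{i in class c} v_i = a_in(D). Each class's total value is thus anchored to what the model actually achieves on that class.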


+def __class_complement_conditional_sampler(
+    u: Utility,
+    label: int,
+    result: ValuationResult,
+    *,
+    active_indices: Optional[IntArray] = None,
+    eps: float = 1e-4,
+    num_samples: int = 10,
+) -> ValuationResult:
+    """
+    Samples random subsets of the complement set and runs the truncated Monte Carlo estimator on each.
+    :param u: Utility object with model, data, and scoring function. The scoring function has to be of type CSScorer.
+    :param label: The label whose complement set shall be sampled.
+    :param result: The current result object.
+    :param active_indices: The indices of the active elements. Defaults to all indices of the dataset.
+    :param eps: The threshold below which marginal updates are truncated in the truncated Monte Carlo estimator.
+    :param num_samples: The number of subset samples to be drawn from the complement set.
+    :return: The updated result object.
+    """
+    if active_indices is None:
+        active_indices = u.data.indices
+
+    _, y_train = u.data.get_training_data(active_indices)
+    label_set, complement_label_set = __split_into_label_sets(
+        y_train, label, active_indices
+    )
+    _, complement_y_train = u.data.get_training_data(complement_label_set)
+
+    for kl, subset_complement in enumerate(
+        random_powerset_group_conditional(
+            complement_label_set,
+            complement_y_train,
+            n_samples=num_samples,
+        )
+    ):
+        result = __truncated_permutation_mcmc_sampler(
+            u, label_set, subset_complement, result, eps
+        )
+    return result
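For intuition, a sketch of what the loop above iterates over: random_powerset_group_conditional yields n_samples random subsets of the complement indices, drawn label group by label group as patched in numeric.py above (illustrative arrays; the min_elements default comes from the elided signature):

import numpy as np
from pydvl.utils.numeric import random_powerset_group_conditional

complement = np.array([4, 5, 6, 7])
complement_labels = np.array([0, 0, 1, 1])
for subset in random_powerset_group_conditional(
    complement, complement_labels, n_samples=3
):
    print(subset)  # a random subset of `complement`, sampled per label group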


+def _estimate_in_out_cls_accuracy(
+    model: SupervisedModel, x: np.ndarray, labels: np.ndarray, label: np.int_
+) -> Tuple[float, float]:
+    """
+    Estimate the in- and out-of-class accuracies as defined in [1], Equation 3.
+    :param model: A model to be used for predicting the labels.
+    :param x: The inputs to be used for measuring the accuracies. Have to match the labels.
+    :param labels: The labels to be used for measuring the accuracies. They are further partitioned by the passed label.
+    :param label: The label of the class currently under consideration.
+    :return: A tuple containing the in-class accuracy as well as the out-of-class accuracy.
+    """
+    n = len(x)
+    y_pred = model.predict(x)
+    label_set_match = labels == label
+    label_set = np.where(label_set_match)[0]
+    complement_label_set = np.where(~label_set_match)[0]
+
+    acc_in_cls = (
+        accuracy_score(labels[label_set], y_pred[label_set], normalize=False) / n
+    )
+    acc_out_of_cls = (
+        accuracy_score(
+            labels[complement_label_set], y_pred[complement_label_set], normalize=False
+        )
+        / n
+    )
+    return acc_in_cls, acc_out_of_cls
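For reference, the two estimated quantities ([1], Equation 3), with f the model, D the evaluation set of size n, and c the class under consideration:

    a_in(D)  = (1/n) * |{ i : y_i = c  and f(x_i) = y_i }|
    a_out(D) = (1/n) * |{ i : y_i != c and f(x_i) = y_i }|

Both share the denominator n = |D|, which is why accuracy_score is called with normalize=False (raw counts of correct predictions) and divided by n rather than by the per-class set sizes.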


+def __split_into_label_sets(
+    labels: IntArray, label: int, indices: IntArray
+) -> Tuple[IntArray, IntArray]:
+    """
+    Splits the indices into two sets, one containing the indices with the label and the other the remaining indices.
+    :param labels: The labels of the indices.
+    :param label: The label to be used for splitting.
+    :param indices: The indices to be split.
+    :return: The two sets of indices.
+    """
+    active_elements = labels == label
+    label_set = np.where(active_elements)[0]
+    complement_label_set = np.where(~active_elements)[0]
+    label_set = indices[label_set]
+    complement_label_set = indices[complement_label_set]
+    return label_set, complement_label_set
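A concrete example of the split (values chosen for illustration):

labels  = np.array([0, 1, 0, 1])
indices = np.array([10, 11, 12, 13])
# __split_into_label_sets(labels, 0, indices) == (array([10, 12]), array([11, 13]))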


+def __truncated_permutation_mcmc_sampler(
+    u: Utility,
+    class_label_set: np.ndarray,
+    complement_set: np.ndarray,
+    result: ValuationResult,
+    atol: float,
+) -> ValuationResult:
+    """
+    A truncated, permutation-based Monte Carlo estimator for class-wise Shapley values. It generates a
+    permutation p of the class label set and adds one element at a time to estimate the utility on the
+    newly created training set.
+    :param u: Utility object with model, data, and scoring function. The scoring function has to be of type CSScorer.
+    :param class_label_set: The set of indices of the class label set.
+    :param complement_set: The set of indices of the complement set.
+    :param result: The current result object, which receives one marginal update per permutation position.
+    :param atol: The threshold below which marginal updates are truncated.
+    :return: ValuationResult object with the data values.
+    """
+    if len(np.intersect1d(class_label_set, complement_set)) > 0:
+        raise ValueError(
+            "The class label set and the complement set have to be disjoint."
+        )
+
+    class_label_set = np.random.permutation(class_label_set)
+    train_set = np.concatenate((class_label_set, complement_set))
+    final_score = u(train_set)
+    prev_score = None
+
+    for i in range(len(class_label_set)):
+
+        if prev_score is not None and np.abs(prev_score - final_score) < atol:
+            score = prev_score
+
+        else:
+            train_set = np.concatenate((class_label_set[: i + 1], complement_set))
+            score = u(train_set)
+
+        if prev_score is not None:
+            marginal = score - prev_score
+            result.update(class_label_set[i], marginal)
+
+        prev_score = score
+
+    return result
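Two properties of the sampler above are worth spelling out. First, the truncation: final_score is the utility of the full permutation, and once prev_score comes within atol of it, the remaining positions reuse prev_score, so their marginals are recorded as zero instead of costing another model fit; this is the same device as in truncated Monte Carlo (TMC) Shapley. Second, with prev_score initialized to None, no marginal is recorded for the first permutation position: utilities are only differenced between consecutive non-empty prefixes, so the baseline utility of complement_set alone is never evaluated.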