Skip to content

Commit

Permalink
Merge pull request #428 from aai-institute/feature/427-batch-parallel…
Browse files Browse the repository at this point in the history
…ized-calculations-of-beta-shapley

Batch (parallelized) calculations of beta Shapley
  • Loading branch information
mdbenito authored Sep 18, 2023
2 parents 62ff516 + a3b1d4a commit 55f0760
Show file tree
Hide file tree
Showing 11 changed files with 315 additions and 113 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@ randomness.
`pydvl.value.semivalues`. Introduced new type `Seed` and conversion function
`ensure_seed_sequence`.
[PR #396](https://github.com/aai-institute/pyDVL/pull/396)
- Added `batch_size` parameter to `compute_banzhaf_semivalues`,
`compute_beta_shapley_semivalues`, `compute_shapley_semivalues` and
`compute_generic_semivalues`.
[PR #428](https://github.com/aai-institute/pyDVL/pull/428)

### Changed

Expand Down Expand Up @@ -247,3 +251,4 @@ It contains:
- Parallelization of computations with Ray
- Documentation
- Notebooks containing examples of different use cases

38 changes: 22 additions & 16 deletions src/pydvl/utils/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,10 @@ class Dataset:

def __init__(
self,
x_train: Union[np.ndarray, pd.DataFrame],
y_train: Union[np.ndarray, pd.DataFrame],
x_test: Union[np.ndarray, pd.DataFrame],
y_test: Union[np.ndarray, pd.DataFrame],
x_train: Union[NDArray, pd.DataFrame],
y_train: Union[NDArray, pd.DataFrame],
x_test: Union[NDArray, pd.DataFrame],
y_test: Union[NDArray, pd.DataFrame],
feature_names: Optional[Sequence[str]] = None,
target_names: Optional[Sequence[str]] = None,
data_names: Optional[Sequence[str]] = None,
Expand Down Expand Up @@ -124,8 +124,12 @@ def make_names(s: str, a: np.ndarray) -> List[str]:
raise ValueError("Mismatching number of targets and names")

self.description = description or "No description"
self._indices = np.arange(len(self.x_train))
self._data_names = data_names if data_names is not None else self._indices
self._indices = np.arange(len(self.x_train), dtype=np.int_)
self._data_names = (
np.array(data_names, dtype=object)
if data_names is not None
else self._indices.astype(object)
)

def __getitem__(self, idx: Union[int, slice, Iterable]) -> Tuple:
return self.x_train[idx], self.y_train[idx]
Expand Down Expand Up @@ -220,25 +224,25 @@ def target(self, name: str) -> Tuple[slice, int]:
raise ValueError(f"Target {name} is not in {self.target_names}")

@property
def indices(self):
def indices(self) -> NDArray[np.int_]:
"""Index of positions in data.x_train.
Contiguous integers from 0 to len(Dataset) - 1.
"""
return self._indices

@property
def data_names(self):
def data_names(self) -> NDArray[np.object_]:
"""Names of each individual datapoint.
Used for reporting Shapley values.
"""
return self._data_names

@property
def dim(self):
def dim(self) -> int:
"""Returns the number of dimensions of a sample."""
return self.x_train.shape[1] if len(self.x_train.shape) > 1 else 1
return int(self.x_train.shape[1]) if len(self.x_train.shape) > 1 else 1

def __str__(self):
return self.description
Expand All @@ -256,7 +260,7 @@ def from_sklearn(
**kwargs,
) -> "Dataset":
"""Constructs a [Dataset][pydvl.utils.Dataset] object from a
[sklearn.utils.Bunch][sklearn.utils.Bunch], as returned by the `load_*`
[sklearn.utils.Bunch][], as returned by the `load_*`
functions in [scikit-learn toy datasets](https://scikit-learn.org/stable/datasets/toy_dataset.html).
??? Example
Expand Down Expand Up @@ -360,10 +364,10 @@ def from_arrays(
class GroupedDataset(Dataset):
def __init__(
self,
x_train: np.ndarray,
y_train: np.ndarray,
x_test: np.ndarray,
y_test: np.ndarray,
x_train: NDArray,
y_train: NDArray,
x_test: NDArray,
y_test: NDArray,
data_groups: Sequence,
feature_names: Optional[Sequence[str]] = None,
target_names: Optional[Sequence[str]] = None,
Expand Down Expand Up @@ -423,7 +427,9 @@ def __init__(
self.group_items = list(self.groups.items())
self._indices = np.arange(len(self.groups.keys()))
self._data_names = (
group_names if group_names is not None else list(self.groups.keys())
np.array(group_names, dtype=object)
if group_names is not None
else np.array(list(self.groups.keys()), dtype=object)
)

def __len__(self):
Expand Down
9 changes: 6 additions & 3 deletions src/pydvl/utils/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,19 +6,25 @@
from abc import ABCMeta
from typing import Any, Optional, Protocol, TypeVar, Union, cast

import numpy as np
from numpy.random import Generator, SeedSequence
from numpy.typing import NDArray

__all__ = [
"ensure_seed_sequence",
"IndexT",
"NameT",
"MapFunction",
"NoPublicConstructor",
"ReduceFunction",
"Seed",
"SupervisedModel",
]

IndexT = TypeVar("IndexT", bound=np.int_)
NameT = TypeVar("NameT", bound=np.object_)
R = TypeVar("R", covariant=True)
Seed = Union[int, Generator]


class MapFunction(Protocol[R]):
Expand Down Expand Up @@ -74,9 +80,6 @@ def create(cls, *args: Any, **kwargs: Any):
return super().__call__(*args, **kwargs)


Seed = Union[int, Generator]


def ensure_seed_sequence(
seed: Optional[Union[Seed, SeedSequence]] = None
) -> SeedSequence:
Expand Down
2 changes: 1 addition & 1 deletion src/pydvl/value/oob/oob.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def compute_data_oob(
Object with the data values.
"""

result: ValuationResult[np.int_, np.float_] = ValuationResult.empty(
result: ValuationResult[np.int_, np.object_] = ValuationResult.empty(
algorithm="data_oob", indices=u.data.indices, data_names=u.data.data_names
)

Expand Down
28 changes: 12 additions & 16 deletions src/pydvl/value/result.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,6 @@
Literal,
Optional,
Sequence,
Tuple,
TypeVar,
Union,
cast,
overload,
Expand All @@ -71,21 +69,17 @@
from pydvl.utils.dataset import Dataset
from pydvl.utils.numeric import running_moments
from pydvl.utils.status import Status
from pydvl.utils.types import Seed
from pydvl.utils.types import IndexT, NameT, Seed

try:
import pandas # Try to import here for the benefit of mypy
except ImportError:
pass

__all__ = ["ValuationResult", "ValueItem", "IndexT", "NameT"]
__all__ = ["ValuationResult", "ValueItem"]

logger = logging.getLogger(__name__)

# TODO: Move to value.types once it's there
IndexT = TypeVar("IndexT", bound=np.int_)
NameT = TypeVar("NameT", bound=Any)


@total_ordering
@dataclass
Expand Down Expand Up @@ -484,15 +478,17 @@ def __repr__(self) -> str:
repr_string += ")"
return repr_string

def _check_compatible(self, other: "ValuationResult"):
def _check_compatible(self, other: ValuationResult):
if not isinstance(other, ValuationResult):
raise NotImplementedError(
f"Cannot combine ValuationResult with {type(other)}"
)
if self.algorithm and self.algorithm != other.algorithm:
raise ValueError("Cannot combine results from different algorithms")

def __add__(self, other: "ValuationResult") -> "ValuationResult":
def __add__(
self, other: ValuationResult[IndexT, NameT]
) -> ValuationResult[IndexT, NameT]:
"""Adds two ValuationResults.
The values must have been computed with the same algorithm. An exception
Expand Down Expand Up @@ -601,7 +597,7 @@ def __add__(self, other: "ValuationResult") -> "ValuationResult":
# extra_values=self._extra_values.update(other._extra_values),
)

def update(self, idx: int, new_value: float) -> "ValuationResult":
def update(self, idx: int, new_value: float) -> ValuationResult[IndexT, NameT]:
"""Updates the result in place with a new value, using running mean
and variance.
Expand All @@ -623,7 +619,7 @@ def update(self, idx: int, new_value: float) -> "ValuationResult":
self._values[pos], self._variances[pos], self._counts[pos], new_value
)
self[pos] = ValueItem(
index=cast(IndexT, idx),
index=cast(IndexT, idx), # FIXME
name=self._names[pos],
value=val,
variance=var,
Expand Down Expand Up @@ -738,7 +734,7 @@ def empty(
indices: Optional[Sequence[IndexT] | NDArray[IndexT]] = None,
data_names: Optional[Sequence[NameT] | NDArray[NameT]] = None,
n_samples: int = 0,
) -> "ValuationResult":
) -> ValuationResult:
"""Creates an empty [ValuationResult][pydvl.value.result.ValuationResult] object.
Empty results are characterised by having an empty array of values. When
Expand Down Expand Up @@ -766,7 +762,7 @@ def zeros(
indices: Optional[Sequence[IndexT] | NDArray[IndexT]] = None,
data_names: Optional[Sequence[NameT] | NDArray[NameT]] = None,
n_samples: int = 0,
) -> "ValuationResult":
) -> ValuationResult:
"""Creates an empty [ValuationResult][pydvl.value.result.ValuationResult] object.
Empty results are characterised by having an empty array of values. When
Expand All @@ -787,12 +783,12 @@ def zeros(
if indices is None:
indices = np.arange(n_samples, dtype=np.int_)
else:
indices = np.array(indices)
indices = np.array(indices, dtype=np.int_)
return cls(
algorithm=algorithm,
status=Status.Pending,
indices=indices,
data_names=data_names
data_names=np.array(data_names, dtype=object)
if data_names is not None
else np.empty_like(indices, dtype=object),
values=np.zeros(len(indices)),
Expand Down
Loading

0 comments on commit 55f0760

Please sign in to comment.