Merge pull request #428 from aai-institute/feature/427-batch-parallelized-calculations-of-beta-shapley

Batch (parallelized) calculations of beta Shapley
aai-institute · Sep 18, 2023 · 55f0760 · 55f0760
2 parents 62ff516 + a3b1d4a
commit 55f0760
Show file tree

Hide file tree

Showing 11 changed files with 315 additions and 113 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -26,6 +26,10 @@ randomness.
   `pydvl.value.semivalues`. Introduced new type `Seed` and conversion function 
   `ensure_seed_sequence`.
   [PR #396](https://github.com/aai-institute/pyDVL/pull/396)
+- Added `batch_size` parameter to `compute_banzhaf_semivalues`,
+  `compute_beta_shapley_semivalues`, `compute_shapley_semivalues` and
+  `compute_generic_semivalues`.
+  [PR #428](https://github.com/aai-institute/pyDVL/pull/428)
 
 ### Changed
 
@@ -247,3 +251,4 @@ It contains:
 - Parallelization of computations with Ray
 - Documentation
 - Notebooks containing examples of different use cases
+
diff --git a/src/pydvl/utils/dataset.py b/src/pydvl/utils/dataset.py
@@ -40,10 +40,10 @@ class Dataset:
 
     def __init__(
         self,
-        x_train: Union[np.ndarray, pd.DataFrame],
-        y_train: Union[np.ndarray, pd.DataFrame],
-        x_test: Union[np.ndarray, pd.DataFrame],
-        y_test: Union[np.ndarray, pd.DataFrame],
+        x_train: Union[NDArray, pd.DataFrame],
+        y_train: Union[NDArray, pd.DataFrame],
+        x_test: Union[NDArray, pd.DataFrame],
+        y_test: Union[NDArray, pd.DataFrame],
         feature_names: Optional[Sequence[str]] = None,
         target_names: Optional[Sequence[str]] = None,
         data_names: Optional[Sequence[str]] = None,
@@ -124,8 +124,12 @@ def make_names(s: str, a: np.ndarray) -> List[str]:
                 raise ValueError("Mismatching number of targets and names")
 
         self.description = description or "No description"
-        self._indices = np.arange(len(self.x_train))
-        self._data_names = data_names if data_names is not None else self._indices
+        self._indices = np.arange(len(self.x_train), dtype=np.int_)
+        self._data_names = (
+            np.array(data_names, dtype=object)
+            if data_names is not None
+            else self._indices.astype(object)
+        )
 
     def __getitem__(self, idx: Union[int, slice, Iterable]) -> Tuple:
         return self.x_train[idx], self.y_train[idx]
@@ -220,25 +224,25 @@ def target(self, name: str) -> Tuple[slice, int]:
             raise ValueError(f"Target {name} is not in {self.target_names}")
 
     @property
-    def indices(self):
+    def indices(self) -> NDArray[np.int_]:
         """Index of positions in data.x_train.
 
         Contiguous integers from 0 to len(Dataset).
         """
         return self._indices
 
     @property
-    def data_names(self):
+    def data_names(self) -> NDArray[np.object_]:
         """Names of each individual datapoint.
 
         Used for reporting Shapley values.
         """
         return self._data_names
 
     @property
-    def dim(self):
+    def dim(self) -> int:
         """Returns the number of dimensions of a sample."""
-        return self.x_train.shape[1] if len(self.x_train.shape) > 1 else 1
+        return int(self.x_train.shape[1]) if len(self.x_train.shape) > 1 else 1
 
     def __str__(self):
         return self.description
@@ -256,7 +260,7 @@ def from_sklearn(
         **kwargs,
     ) -> "Dataset":
         """Constructs a [Dataset][pydvl.utils.Dataset] object from a
-        [sklearn.utils.Bunch][sklearn.utils.Bunch], as returned by the `load_*`
+        [sklearn.utils.Bunch][], as returned by the `load_*`
         functions in [scikit-learn toy datasets](https://scikit-learn.org/stable/datasets/toy_dataset.html).
 
         ??? Example
@@ -360,10 +364,10 @@ def from_arrays(
 class GroupedDataset(Dataset):
     def __init__(
         self,
-        x_train: np.ndarray,
-        y_train: np.ndarray,
-        x_test: np.ndarray,
-        y_test: np.ndarray,
+        x_train: NDArray,
+        y_train: NDArray,
+        x_test: NDArray,
+        y_test: NDArray,
         data_groups: Sequence,
         feature_names: Optional[Sequence[str]] = None,
         target_names: Optional[Sequence[str]] = None,
@@ -423,7 +427,9 @@ def __init__(
         self.group_items = list(self.groups.items())
         self._indices = np.arange(len(self.groups.keys()))
         self._data_names = (
-            group_names if group_names is not None else list(self.groups.keys())
+            np.array(group_names, dtype=object)
+            if group_names is not None
+            else np.array(list(self.groups.keys()), dtype=object)
         )
 
     def __len__(self):

diff --git a/src/pydvl/utils/types.py b/src/pydvl/utils/types.py
@@ -6,19 +6,25 @@
 from abc import ABCMeta
 from typing import Any, Optional, Protocol, TypeVar, Union, cast
 
+import numpy as np
 from numpy.random import Generator, SeedSequence
 from numpy.typing import NDArray
 
 __all__ = [
     "ensure_seed_sequence",
+    "IndexT",
+    "NameT",
     "MapFunction",
     "NoPublicConstructor",
     "ReduceFunction",
     "Seed",
     "SupervisedModel",
 ]
 
+IndexT = TypeVar("IndexT", bound=np.int_)
+NameT = TypeVar("NameT", bound=np.object_)
 R = TypeVar("R", covariant=True)
+Seed = Union[int, Generator]
 
 
 class MapFunction(Protocol[R]):
@@ -74,9 +80,6 @@ def create(cls, *args: Any, **kwargs: Any):
         return super().__call__(*args, **kwargs)
 
 
-Seed = Union[int, Generator]
-
-
 def ensure_seed_sequence(
     seed: Optional[Union[Seed, SeedSequence]] = None
 ) -> SeedSequence:

diff --git a/src/pydvl/value/oob/oob.py b/src/pydvl/value/oob/oob.py
@@ -70,7 +70,7 @@ def compute_data_oob(
         Object with the data values.
     """
 
-    result: ValuationResult[np.int_, np.float_] = ValuationResult.empty(
+    result: ValuationResult[np.int_, np.object_] = ValuationResult.empty(
         algorithm="data_oob", indices=u.data.indices, data_names=u.data.data_names
     )
 

diff --git a/src/pydvl/value/result.py b/src/pydvl/value/result.py
@@ -57,8 +57,6 @@
     Literal,
     Optional,
     Sequence,
-    Tuple,
-    TypeVar,
     Union,
     cast,
     overload,
@@ -71,21 +69,17 @@
 from pydvl.utils.dataset import Dataset
 from pydvl.utils.numeric import running_moments
 from pydvl.utils.status import Status
-from pydvl.utils.types import Seed
+from pydvl.utils.types import IndexT, NameT, Seed
 
 try:
     import pandas  # Try to import here for the benefit of mypy
 except ImportError:
     pass
 
-__all__ = ["ValuationResult", "ValueItem", "IndexT", "NameT"]
+__all__ = ["ValuationResult", "ValueItem"]
 
 logger = logging.getLogger(__name__)
 
-# TODO: Move to value.types once it's there
-IndexT = TypeVar("IndexT", bound=np.int_)
-NameT = TypeVar("NameT", bound=Any)
-
 
 @total_ordering
 @dataclass
@@ -484,15 +478,17 @@ def __repr__(self) -> str:
         repr_string += ")"
         return repr_string
 
-    def _check_compatible(self, other: "ValuationResult"):
+    def _check_compatible(self, other: ValuationResult):
         if not isinstance(other, ValuationResult):
             raise NotImplementedError(
                 f"Cannot combine ValuationResult with {type(other)}"
             )
         if self.algorithm and self.algorithm != other.algorithm:
             raise ValueError("Cannot combine results from different algorithms")
 
-    def __add__(self, other: "ValuationResult") -> "ValuationResult":
+    def __add__(
+        self, other: ValuationResult[IndexT, NameT]
+    ) -> ValuationResult[IndexT, NameT]:
         """Adds two ValuationResults.
 
         The values must have been computed with the same algorithm. An exception
@@ -601,7 +597,7 @@ def __add__(self, other: "ValuationResult") -> "ValuationResult":
             # extra_values=self._extra_values.update(other._extra_values),
         )
 
-    def update(self, idx: int, new_value: float) -> "ValuationResult":
+    def update(self, idx: int, new_value: float) -> ValuationResult[IndexT, NameT]:
         """Updates the result in place with a new value, using running mean
         and variance.
 
@@ -623,7 +619,7 @@ def update(self, idx: int, new_value: float) -> "ValuationResult":
             self._values[pos], self._variances[pos], self._counts[pos], new_value
         )
         self[pos] = ValueItem(
-            index=cast(IndexT, idx),
+            index=cast(IndexT, idx),  # FIXME
             name=self._names[pos],
             value=val,
             variance=var,
@@ -738,7 +734,7 @@ def empty(
         indices: Optional[Sequence[IndexT] | NDArray[IndexT]] = None,
         data_names: Optional[Sequence[NameT] | NDArray[NameT]] = None,
         n_samples: int = 0,
-    ) -> "ValuationResult":
+    ) -> ValuationResult:
         """Creates an empty [ValuationResult][pydvl.value.result.ValuationResult] object.
 
         Empty results are characterised by having an empty array of values. When
@@ -766,7 +762,7 @@ def zeros(
         indices: Optional[Sequence[IndexT] | NDArray[IndexT]] = None,
         data_names: Optional[Sequence[NameT] | NDArray[NameT]] = None,
         n_samples: int = 0,
-    ) -> "ValuationResult":
+    ) -> ValuationResult:
         """Creates an empty [ValuationResult][pydvl.value.result.ValuationResult] object.
 
         Empty results are characterised by having an empty array of values. When
@@ -787,12 +783,12 @@ def zeros(
         if indices is None:
             indices = np.arange(n_samples, dtype=np.int_)
         else:
-            indices = np.array(indices)
+            indices = np.array(indices, dtype=np.int_)
         return cls(
             algorithm=algorithm,
             status=Status.Pending,
             indices=indices,
-            data_names=data_names
+            data_names=np.array(data_names, dtype=object)
             if data_names is not None
             else np.empty_like(indices, dtype=object),
             values=np.zeros(len(indices)),