From ad71a0adf41da808bd353e43c322cc1f7441db68 Mon Sep 17 00:00:00 2001 From: Markus Semmler Date: Thu, 12 Oct 2023 23:55:33 +0200 Subject: [PATCH] Experimental: Store history. --- src/pydvl/value/semivalues.py | 11 +++++++++++ src/pydvl/value/shapley/classwise.py | 14 ++++++++++++++ src/pydvl/value/shapley/montecarlo.py | 12 ++++++++++++ 3 files changed, 37 insertions(+) diff --git a/src/pydvl/value/semivalues.py b/src/pydvl/value/semivalues.py index cabe3c3ba..f1670143d 100644 --- a/src/pydvl/value/semivalues.py +++ b/src/pydvl/value/semivalues.py @@ -94,6 +94,7 @@ from itertools import islice from typing import Iterable, List, Optional, Protocol, Tuple, Type, cast +import numpy as np import scipy as sp from deprecate import deprecated from tqdm import tqdm @@ -182,6 +183,7 @@ def compute_generic_semivalues( n_jobs: int = 1, config: ParallelConfig = ParallelConfig(), progress: bool = False, + log_folder: Optional[Path] = None, ) -> ValuationResult: """Computes semi-values for a given utility function and subset sampler. @@ -204,6 +206,8 @@ def compute_generic_semivalues( config: Object configuring parallel computation, with cluster address, number of cpus, etc. progress: Whether to display a progress bar. + log_folder: If set, the history of each valuation result is stored as + `history.txt` in the specified folder. Returns: Object with the results. 
@@ -246,6 +250,7 @@ def compute_generic_semivalues( sampler_it = iter(sampler) pbar = tqdm(disable=not progress, total=100, unit="%") + history = [] if log_folder else None with init_executor( max_workers=max_workers, config=config, cancel_futures=True @@ -259,7 +264,13 @@ def compute_generic_semivalues( for future in completed: for idx, marginal in future.result(): result.update(idx, marginal) + if log_folder is not None: + history.append(result.values[result.indices]) + if done(result): + if log_folder: + np.savetxt(log_folder / "history.txt", np.array(history)) + return result # Ensure that we always have n_submitted_jobs running diff --git a/src/pydvl/value/shapley/classwise.py b/src/pydvl/value/shapley/classwise.py index 438d953c8..c57c5f524 100644 --- a/src/pydvl/value/shapley/classwise.py +++ b/src/pydvl/value/shapley/classwise.py @@ -60,8 +60,10 @@ """ import logging import numbers +import os from concurrent.futures import FIRST_COMPLETED, Future, wait from copy import copy +from pathlib import Path from typing import Callable, Optional, Set, Tuple, Union, cast import numpy as np @@ -252,6 +254,7 @@ def compute_classwise_shapley_values( config: ParallelConfig = ParallelConfig(), progress: bool = False, seed: Optional[Seed] = None, + log_folder: Optional[Path] = None, ) -> ValuationResult: r""" Computes an approximate Class-wise Shapley value by sampling independent @@ -291,6 +294,8 @@ def compute_classwise_shapley_values( config: Parallel configuration. progress: Whether to display a progress bar. seed: Either an instance of a numpy random number generator or a seed for it. + log_folder: If set, the history of each valuation result is stored as + `history.txt` in the specified folder. Returns: ValuationResult object containing computed data values. 
@@ -326,6 +331,7 @@ def compute_classwise_shapley_values( ) terminate_exec = False seed_sequence = ensure_seed_sequence(seed) + history = [] if log_folder else None with init_executor(max_workers=n_jobs, config=config) as executor: pending: Set[Future] = set() @@ -335,6 +341,11 @@ def compute_classwise_shapley_values( ) for future in completed_futures: accumulated_result += future.result() + if log_folder is not None: + history.append( + accumulated_result.values[accumulated_result.indices] + ) + if done(accumulated_result): terminate_exec = True break @@ -363,6 +374,9 @@ def compute_classwise_shapley_values( if normalize_values: result = _normalize_classwise_shapley_values(result, u) + if log_folder: + np.savetxt(log_folder / "history.txt", np.array(history)) + return result diff --git a/src/pydvl/value/shapley/montecarlo.py b/src/pydvl/value/shapley/montecarlo.py index 6471a964d..e613d253a 100644 --- a/src/pydvl/value/shapley/montecarlo.py +++ b/src/pydvl/value/shapley/montecarlo.py @@ -48,6 +48,7 @@ from concurrent.futures import FIRST_COMPLETED, Future, wait from functools import reduce from itertools import cycle, takewhile +from pathlib import Path from typing import Optional, Sequence, Union import numpy as np @@ -142,6 +143,7 @@ def permutation_montecarlo_shapley( config: ParallelConfig = ParallelConfig(), progress: bool = False, seed: Seed = None, + log_folder: Optional[Path] = None, ) -> ValuationResult: r"""Computes an approximate Shapley value by sampling independent permutations of the index set, approximating the sum: @@ -189,6 +191,8 @@ def permutation_montecarlo_shapley( number of cpus, etc. progress: Whether to display a progress bar. seed: Either an instance of a numpy random number generator or a seed for it. + log_folder: If set, the history of each valuation result is stored as + `history.txt` in the specified folder. Returns: Object with the data values. 
@@ -206,6 +210,7 @@ def permutation_montecarlo_shapley( ) pbar = tqdm(disable=not progress, total=100, unit="%") + history = [] if log_folder else None with init_executor( max_workers=max_workers, config=config, cancel_futures=CancellationPolicy.ALL @@ -222,7 +227,14 @@ def permutation_montecarlo_shapley( result += future.result() # we could check outside the loop, but that means more # submissions if the stopping criterion is unstable + + if log_folder is not None: + history.append(result.values[result.indices]) + if done(result): + if log_folder: + np.savetxt(log_folder / "history.txt", np.array(history)) + return result # Ensure that we always have n_submitted_jobs in the queue or running