Skip to content

Commit

Permalink
Merge pull request #466 from aai-institute/feature/269-simplify-progr…
Browse files Browse the repository at this point in the history
…ess-bars

Simplify progress bars
  • Loading branch information
AnesBenmerzoug authored Dec 18, 2023
2 parents 6a37624 + 504a172 commit 3076b61
Show file tree
Hide file tree
Showing 15 changed files with 81 additions and 112 deletions.
4 changes: 3 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
## Unreleased

### Added

- New influence function interface `InfluenceFunctionModel`
- Data parallel computation with `DaskInfluenceCalculator`
[PR #26](https://github.com/aai-institute/pyDVL/issues/26)
Expand All @@ -14,11 +15,12 @@

### Changed

- Simplify display of computation progress
[PR #466](https://github.com/aai-institute/pyDVL/pull/466)
- Improve the readme and better explain the examples
[PR #465](https://github.com/aai-institute/pyDVL/pull/465)
- Simplify and improve tests, add CodeCov code coverage
[PR #429](https://github.com/aai-institute/pyDVL/pull/429)
-
- **Breaking Changes**
- Removed `compute_influences` and all related code.
Replaced by new `InfluenceFunctionModel` interface. Removed modules:
Expand Down
5 changes: 2 additions & 3 deletions notebooks/support/torch.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,7 @@
from torch.optim.lr_scheduler import _LRScheduler
from torch.utils.data import DataLoader
from torchvision.models import ResNet18_Weights, resnet18

from pydvl.utils import maybe_progress
from tqdm.auto import tqdm

from .types import Losses

Expand Down Expand Up @@ -123,7 +122,7 @@ def fit_torch_model(
train_loss = []
val_loss = []

for epoch in maybe_progress(range(num_epochs), progress, desc="Model fitting"):
for epoch in tqdm(range(num_epochs), disable=not progress, desc="Model fitting"):
batch_loss = []
for train_batch in training_data:
batch_x, batch_y = train_batch
Expand Down
8 changes: 5 additions & 3 deletions src/pydvl/influence/torch/influence_function_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,10 @@
import torch
from torch import nn as nn
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

from pydvl.utils.progress import log_duration

from ...utils import log_duration, maybe_progress
from ..base_influence_function_model import (
InfluenceFunctionModel,
InfluenceMode,
Expand Down Expand Up @@ -522,7 +524,7 @@ def reg_hvp(v: torch.Tensor):
batch_cg = torch.zeros_like(rhs)

for idx, bi in enumerate(
maybe_progress(rhs, self.progress, desc="Conjugate gradient")
tqdm(rhs, disable=not self.progress, desc="Conjugate gradient")
):
batch_result = self._solve_cg(
reg_hvp,
Expand Down Expand Up @@ -689,7 +691,7 @@ def lissa_step(
create_batch_hvp_function(self.model, self.loss),
in_dims=(None, None, None, 0),
)
for _ in maybe_progress(range(self.maxiter), self.progress, desc="Lissa"):
for _ in tqdm(range(self.maxiter), disable=not self.progress, desc="Lissa"):
x, y = next(iter(shuffled_training_data))
# grad_xy = model.grad(x, y, create_graph=True)
reg_hvp = (
Expand Down
5 changes: 3 additions & 2 deletions src/pydvl/reporting/scores.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@

import numpy as np
from numpy.typing import NDArray
from tqdm.auto import tqdm

from pydvl.utils import Utility, maybe_progress
from pydvl.utils import Utility
from pydvl.value.result import ValuationResult

__all__ = ["compute_removal_score"]
Expand Down Expand Up @@ -44,7 +45,7 @@ def compute_removal_score(
# We sort in descending order if we want to remove the best values
values.sort(reverse=remove_best)

for pct in maybe_progress(percentages, display=progress, desc="Removal Scores"):
for pct in tqdm(percentages, disable=not progress, desc="Removal Scores"):
n_removal = int(pct * len(u.data))
indices = values.indices[n_removal:]
score = u(indices)
Expand Down
92 changes: 26 additions & 66 deletions src/pydvl/utils/progress.py
Original file line number Diff line number Diff line change
@@ -1,82 +1,42 @@
"""
!!! Warning
This module is deprecated and will be removed in a future release.
It implements a wrapper for the [tqdm](https://tqdm.github.io/) progress bar
iterator for easy toggling, but this functionality is already provided by
the `disable` argument of `tqdm`.
"""
import collections.abc
import logging
from functools import wraps
from itertools import cycle, takewhile
from time import time
from typing import Iterable, Iterator, Union
from typing import TYPE_CHECKING, Collection, Iterator

from tqdm.auto import tqdm

__all__ = ["maybe_progress", "log_duration"]
# This is needed to avoid circular import errors
if TYPE_CHECKING:
from pydvl.value.result import ValuationResult
from pydvl.value.stopping import StoppingCriterion

logger = logging.getLogger(__name__)


class MockProgress(collections.abc.Iterator):
"""A Naive mock class to use with maybe_progress and tqdm.
Mocked methods don't support return values.
Mocked properties don't do anything
"""

class MiniMock:
def __call__(self, *args, **kwargs):
pass

def __add__(self, other):
pass

def __sub__(self, other):
pass

def __mul__(self, other):
pass
__all__ = ["repeat_indices", "log_duration"]

def __floordiv__(self, other):
pass

def __truediv__(self, other):
pass

def __init__(self, iterator: Union[Iterator, Iterable]):
# Since there is no _it in __dict__ at this point, doing here
# self._it = iterator
# results in a call to __getattr__() and the assignment fails, so we
# use __dict__ instead
self.__dict__["_it"] = iterator

def __iter__(self):
return iter(self._it)

def __next__(self):
return next(self._it)

def __getattr__(self, key):
return self.MiniMock()

def __setattr__(self, key, value):
pass
logger = logging.getLogger(__name__)


def maybe_progress(
it: Union[int, Iterable, Iterator], display: bool = False, **kwargs
) -> Union[tqdm, MockProgress]:
"""Returns either a tqdm progress bar or a mock object which wraps the
iterator as well, but ignores any accesses to methods or properties.
def repeat_indices(
indices: Collection[int],
result: "ValuationResult",
done: "StoppingCriterion",
**kwargs,
) -> Iterator[int]:
"""Helper function to cycle indefinitely over a collection of indices
until the stopping criterion is satisfied while displaying progress.
Args:
it: the iterator to wrap
display: set to True to return a tqdm bar
kwargs: Keyword arguments that will be forwarded to tqdm
indices: Collection of indices that will be cycled until done.
result: Object containing the current results.
done: Stopping criterion.
kwargs: Keyword arguments passed to tqdm.
"""
if isinstance(it, int):
it = range(it) # type: ignore
return tqdm(it, **kwargs) if display else MockProgress(it)
with tqdm(total=100, unit="%", **kwargs) as pbar:
it = takewhile(lambda _: not done(result), cycle(indices))
for i in it:
yield i
pbar.update(100 * done.completion() - pbar.n)
pbar.refresh()


def log_duration(func):
Expand Down
2 changes: 1 addition & 1 deletion src/pydvl/value/least_core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ def compute_least_core_values(
solver_options.update(kwargs)

if mode == LeastCoreMode.MonteCarlo:
# TODO fix progress showing and maybe_progress in remote case
# TODO fix progress showing in remote case
progress = False
if n_iterations is None:
raise ValueError("n_iterations cannot be None for Monte Carlo Least Core")
Expand Down
4 changes: 2 additions & 2 deletions src/pydvl/value/least_core/montecarlo.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@

import numpy as np
from numpy.typing import NDArray
from tqdm.auto import tqdm

from pydvl.parallel import MapReduceJob, ParallelConfig, effective_n_jobs
from pydvl.utils.numeric import random_powerset
from pydvl.utils.progress import maybe_progress
from pydvl.utils.types import Seed
from pydvl.utils.utility import Utility
from pydvl.value.least_core.common import LeastCoreProblem, lc_solve_problem
Expand Down Expand Up @@ -175,7 +175,7 @@ def _montecarlo_least_core(
A_lb = np.zeros((n_iterations, n))

for i, subset in enumerate(
maybe_progress(power_set, progress, total=n_iterations, position=job_id)
tqdm(power_set, disable=not progress, total=n_iterations, position=job_id)
):
indices: NDArray[np.bool_] = np.zeros(n, dtype=bool)
indices[list(subset)] = True
Expand Down
14 changes: 9 additions & 5 deletions src/pydvl/value/least_core/naive.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,9 @@

import numpy as np
from numpy.typing import NDArray
from tqdm.auto import tqdm

from pydvl.utils import Utility, maybe_progress, powerset
from pydvl.utils import Utility, powerset
from pydvl.value.least_core.common import LeastCoreProblem, lc_solve_problem
from pydvl.value.result import ValuationResult

Expand Down Expand Up @@ -103,14 +104,17 @@ def lc_prepare_problem(u: Utility, progress: bool = False) -> LeastCoreProblem:

logger.debug("Iterating over all subsets")
utility_values = np.zeros(powerset_size)
for i, subset in enumerate(
maybe_progress(
powerset(u.data.indices), progress, total=powerset_size - 1, position=0
for i, subset in enumerate( # type: ignore
tqdm(
powerset(u.data.indices),
disable=not progress,
total=powerset_size - 1,
position=0,
)
):
indices: NDArray[np.bool_] = np.zeros(n, dtype=bool)
indices[list(subset)] = True
A_lb[i, indices] = 1
utility_values[i] = u(subset)
utility_values[i] = u(subset) # type: ignore

return LeastCoreProblem(utility_values, A_lb)
7 changes: 4 additions & 3 deletions src/pydvl/value/oob/oob.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,9 @@
from numpy.typing import NDArray
from sklearn.base import is_classifier, is_regressor
from sklearn.ensemble import BaggingClassifier, BaggingRegressor
from tqdm.auto import tqdm

from pydvl.utils import Seed, Utility, maybe_progress
from pydvl.utils import Seed, Utility
from pydvl.utils.types import LossFunction
from pydvl.value.result import ValuationResult

Expand Down Expand Up @@ -112,8 +113,8 @@ def compute_data_oob(

bag.fit(u.data.x_train, u.data.y_train)

for est, samples in maybe_progress(
zip(bag.estimators_, bag.estimators_samples_), progress, total=n_est
for est, samples in tqdm(
zip(bag.estimators_, bag.estimators_samples_), disable=not progress, total=n_est
): # The bottleneck is fitting the bag, not this loop, so a tqdm progress bar is not very useful here
oob_idx = np.setxor1d(u.data.indices, np.unique(samples))
array_loss = loss(
Expand Down
5 changes: 3 additions & 2 deletions src/pydvl/value/shapley/gt.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,10 @@
import numpy as np
from numpy.random import SeedSequence
from numpy.typing import NDArray
from tqdm.auto import trange

from pydvl.parallel import MapReduceJob, ParallelConfig, effective_n_jobs
from pydvl.utils import Utility, maybe_progress
from pydvl.utils import Utility
from pydvl.utils.numeric import random_subset_of_size
from pydvl.utils.status import Status
from pydvl.utils.types import Seed, ensure_seed_sequence
Expand Down Expand Up @@ -155,7 +156,7 @@ def _group_testing_shapley(
) # indicator vars
uu = np.empty(n_samples) # utilities

for t in maybe_progress(n_samples, progress=progress, position=job_id):
for t in trange(n_samples, disable=not progress, position=job_id):
k = rng.choice(const.kk, size=1, p=const.q).item()
s = random_subset_of_size(u.data.indices, k, seed=rng)
uu[t] = u(s)
Expand Down
5 changes: 3 additions & 2 deletions src/pydvl/value/shapley/knn.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,9 @@
import numpy as np
from numpy.typing import NDArray
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors
from tqdm.auto import tqdm

from pydvl.utils import Utility, maybe_progress
from pydvl.utils import Utility
from pydvl.utils.status import Status
from pydvl.value.result import ValuationResult

Expand Down Expand Up @@ -76,7 +77,7 @@ def knn_shapley(u: Utility, *, progress: bool = True) -> ValuationResult:
n = len(u.data)
yt = u.data.y_train
iterator = enumerate(zip(u.data.y_test, indices), start=1)
for j, (y, ii) in maybe_progress(iterator, progress):
for j, (y, ii) in tqdm(iterator, disable=not progress):
value_at_x = int(yt[ii[-1]] == y) / n
values[ii[-1]] += (value_at_x - values[ii[-1]]) / j
for i in range(n - 2, n_neighbors, -1): # farthest to closest
Expand Down
13 changes: 6 additions & 7 deletions src/pydvl/value/shapley/montecarlo.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,14 +47,13 @@
import operator
from concurrent.futures import FIRST_COMPLETED, Future, wait
from functools import reduce
from itertools import cycle, takewhile
from typing import Optional, Sequence, Union

import numpy as np
from deprecate import deprecated
from numpy.random import SeedSequence
from numpy.typing import NDArray
from tqdm import tqdm
from tqdm.auto import tqdm

from pydvl.parallel import (
CancellationPolicy,
Expand All @@ -65,6 +64,7 @@
init_parallel_backend,
)
from pydvl.utils.numeric import random_powerset
from pydvl.utils.progress import repeat_indices
from pydvl.utils.types import Seed, ensure_seed_sequence
from pydvl.utils.utility import Utility
from pydvl.value.result import ValuationResult
Expand Down Expand Up @@ -281,11 +281,10 @@ def _combinatorial_montecarlo_shapley(
)

rng = np.random.default_rng(seed)
repeat_indices = takewhile(lambda _: not done(result), cycle(indices))
pbar = tqdm(disable=not progress, position=job_id, total=100, unit="%")
for idx in repeat_indices:
pbar.n = 100 * done.completion()
pbar.refresh()

for idx in repeat_indices(
indices, result=result, done=done, disable=not progress, position=job_id
):
# Randomly sample subsets of full dataset without idx
subset = np.setxor1d(u.data.indices, [idx], assume_unique=True)
s = next(random_powerset(subset, n_samples=1, seed=rng))
Expand Down
Loading

0 comments on commit 3076b61

Please sign in to comment.