Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Simplify progress bars #466

Merged
13 commits merged on Dec 18, 2023
4 changes: 2 additions & 2 deletions notebooks/support/torch.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,9 @@
from torch.optim.lr_scheduler import _LRScheduler
from torch.utils.data import DataLoader
from torchvision.models import ResNet18_Weights, resnet18
from tqdm.auto import tqdm

from pydvl.influence.torch import as_tensor
from pydvl.utils import maybe_progress

from .types import Losses

Expand Down Expand Up @@ -124,7 +124,7 @@ def fit_torch_model(
train_loss = []
val_loss = []

for epoch in maybe_progress(range(num_epochs), progress, desc="Model fitting"):
for epoch in tqdm(range(num_epochs), disable=not progress, desc="Model fitting"):
batch_loss = []
for train_batch in training_data:
batch_x, batch_y = train_batch
Expand Down
15 changes: 8 additions & 7 deletions src/pydvl/influence/general.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@
from enum import Enum
from typing import Any, Callable, Dict, Generator, Optional, Type

from ..utils import maybe_progress
from tqdm.auto import tqdm

from .inversion import InversionMethod, solve_hvp
from .twice_differentiable import (
DataLoaderType,
Expand Down Expand Up @@ -93,8 +94,8 @@ def compute_influence_factors(
cat = tensor_util.cat

def test_grads() -> Generator[TensorType, None, None]:
for x_test, y_test in maybe_progress(
test_data, progress, desc="Batch Test Gradients"
for x_test, y_test in tqdm(
test_data, disable=not progress, desc="Batch Test Gradients"
):
yield stack(
[
Expand Down Expand Up @@ -167,8 +168,8 @@ def compute_influences_up(
einsum = tensor_util.einsum

def train_grads() -> Generator[TensorType, None, None]:
for x, y in maybe_progress(
input_data, progress, desc="Batch Split Input Gradients"
for x, y in tqdm(
input_data, disable=not progress, desc="Batch Split Input Gradients"
):
yield stack(
[model.grad(inpt, target) for inpt, target in zip(unsqueeze(x, 1), y)]
Expand Down Expand Up @@ -232,9 +233,9 @@ def compute_influences_pert(
shape = tensor_util.shape

all_pert_influences = []
for x, y in maybe_progress(
for x, y in tqdm(
input_data,
progress,
disable=not progress,
desc="Batch Influence Perturbation",
):
for i in range(len(x)):
Expand Down
10 changes: 5 additions & 5 deletions src/pydvl/influence/torch/torch_differentiable.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@
from torch import autograd
from torch.autograd import Variable
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

from ...utils import maybe_progress
from ..inversion import InversionMethod, InversionRegistry
from ..twice_differentiable import (
InverseHvpResult,
Expand Down Expand Up @@ -192,7 +192,7 @@ def mvp(
z = (grad_xy * Variable(v)).sum(dim=1)

mvp = []
for i in maybe_progress(range(len(z)), progress, desc="MVP"):
for i in tqdm(range(len(z)), disable=not progress, desc="MVP"):
mvp.append(
flatten_tensors_to_vector(
autograd.grad(z[i], backprop_on, retain_graph=True)
Expand Down Expand Up @@ -578,7 +578,7 @@ def solve_batch_cg(
total_grad_xy = torch.empty(0)
total_points = 0

for x, y in maybe_progress(training_data, progress, desc="Batch Train Gradients"):
for x, y in tqdm(training_data, disable=not progress, desc="Batch Train Gradients"):
grad_xy = model.grad(x, y, create_graph=True)
if total_grad_xy.nelement() == 0:
total_grad_xy = torch.zeros_like(grad_xy)
Expand All @@ -592,7 +592,7 @@ def solve_batch_cg(
batch_cg = torch.zeros_like(b)
info = {}

for idx, bi in enumerate(maybe_progress(b, progress, desc="Conjugate gradient")):
for idx, bi in enumerate(tqdm(b, disable=not progress, desc="Conjugate gradient")):
batch_result, batch_info = solve_cg(
reg_hvp, bi, x0=x0, rtol=rtol, atol=atol, maxiter=maxiter
)
Expand Down Expand Up @@ -724,7 +724,7 @@ def lissa_step(
"""
return b + (1 - dampen) * h - reg_hvp(h) / scale

for _ in maybe_progress(range(maxiter), progress, desc="Lissa"):
for _ in tqdm(range(maxiter), disable=not progress, desc="Lissa"):
x, y = next(iter(shuffled_training_data))
grad_xy = model.grad(x, y, create_graph=True)
reg_hvp = (
Expand Down
5 changes: 3 additions & 2 deletions src/pydvl/reporting/scores.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@

import numpy as np
from numpy.typing import NDArray
from tqdm.auto import tqdm

from pydvl.utils import Utility, maybe_progress
from pydvl.utils import Utility
from pydvl.value.result import ValuationResult

__all__ = ["compute_removal_score"]
Expand Down Expand Up @@ -44,7 +45,7 @@ def compute_removal_score(
# We sort in descending order if we want to remove the best values
values.sort(reverse=remove_best)

for pct in maybe_progress(percentages, display=progress, desc="Removal Scores"):
for pct in tqdm(percentages, disable=not progress, desc="Removal Scores"):
n_removal = int(pct * len(u.data))
indices = values.indices[n_removal:]
score = u(indices)
Expand Down
89 changes: 24 additions & 65 deletions src/pydvl/utils/progress.py
Original file line number Diff line number Diff line change
@@ -1,74 +1,33 @@
"""
!!! Warning
This module is deprecated and will be removed in a future release.
It implements a wrapper for the [tqdm](https://tqdm.github.io/) progress bar
iterator for easy toggling, but this functionality is already provided by
the `disable` argument of `tqdm`.
"""
import collections.abc
from typing import Iterable, Iterator, Union
from itertools import cycle, takewhile
from typing import TYPE_CHECKING, Collection, Iterator

from tqdm.auto import tqdm

__all__ = ["maybe_progress"]
if TYPE_CHECKING:
from pydvl.value.result import ValuationResult
from pydvl.value.stopping import StoppingCriterion


class MockProgress(collections.abc.Iterator):
"""A Naive mock class to use with maybe_progress and tqdm.
Mocked methods don't support return values.
Mocked properties don't do anything
"""

class MiniMock:
def __call__(self, *args, **kwargs):
pass

def __add__(self, other):
pass

def __sub__(self, other):
pass

def __mul__(self, other):
pass

def __floordiv__(self, other):
pass

def __truediv__(self, other):
pass

def __init__(self, iterator: Union[Iterator, Iterable]):
# Since there is no _it in __dict__ at this point, doing here
# self._it = iterator
# results in a call to __getattr__() and the assignment fails, so we
# use __dict__ instead
self.__dict__["_it"] = iterator

def __iter__(self):
return iter(self._it)

def __next__(self):
return next(self._it)

def __getattr__(self, key):
return self.MiniMock()

def __setattr__(self, key, value):
pass
__all__ = ["repeat_indices"]


def maybe_progress(
it: Union[int, Iterable, Iterator], display: bool = False, **kwargs
) -> Union[tqdm, MockProgress]:
"""Returns either a tqdm progress bar or a mock object which wraps the
iterator as well, but ignores any accesses to methods or properties.
def repeat_indices(
indices: Collection[int],
result: "ValuationResult",
done: "StoppingCriterion",
**kwargs
) -> Iterator[int]:
"""Helper function to cycle indefinitely over a collection of indices
until the stopping criterion is satisfied while displaying progress.

Args:
it: the iterator to wrap
display: set to True to return a tqdm bar
kwargs: Keyword arguments that will be forwarded to tqdm
indices: Collection of indices that will be cycled until done.
result: Object containing the current results.
done: Stopping criterion.
kwargs: Keyword arguments passed to tqdm.
"""
if isinstance(it, int):
it = range(it) # type: ignore
return tqdm(it, **kwargs) if display else MockProgress(it)
with tqdm(total=100, unit="%", **kwargs) as pbar:
it = takewhile(lambda _: not done(result), cycle(indices))
for i in it:
yield i
pbar.update(100 * done.completion() - pbar.n)
pbar.refresh()
2 changes: 1 addition & 1 deletion src/pydvl/value/least_core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ def compute_least_core_values(
solver_options.update(kwargs)

if mode == LeastCoreMode.MonteCarlo:
# TODO fix progress showing and maybe_progress in remote case
# TODO fix progress showing in remote case
progress = False
if n_iterations is None:
raise ValueError("n_iterations cannot be None for Monte Carlo Least Core")
Expand Down
4 changes: 2 additions & 2 deletions src/pydvl/value/least_core/montecarlo.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@

import numpy as np
from numpy.typing import NDArray
from tqdm.auto import tqdm

from pydvl.parallel import MapReduceJob, ParallelConfig, effective_n_jobs
from pydvl.utils.numeric import random_powerset
from pydvl.utils.progress import maybe_progress
from pydvl.utils.types import Seed
from pydvl.utils.utility import Utility
from pydvl.value.least_core.common import LeastCoreProblem, lc_solve_problem
Expand Down Expand Up @@ -175,7 +175,7 @@ def _montecarlo_least_core(
A_lb = np.zeros((n_iterations, n))

for i, subset in enumerate(
maybe_progress(power_set, progress, total=n_iterations, position=job_id)
tqdm(power_set, disable=not progress, total=n_iterations, position=job_id)
):
indices: NDArray[np.bool_] = np.zeros(n, dtype=bool)
indices[list(subset)] = True
Expand Down
14 changes: 9 additions & 5 deletions src/pydvl/value/least_core/naive.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,9 @@

import numpy as np
from numpy.typing import NDArray
from tqdm.auto import tqdm

from pydvl.utils import Utility, maybe_progress, powerset
from pydvl.utils import Utility, powerset
from pydvl.value.least_core.common import LeastCoreProblem, lc_solve_problem
from pydvl.value.result import ValuationResult

Expand Down Expand Up @@ -103,14 +104,17 @@ def lc_prepare_problem(u: Utility, progress: bool = False) -> LeastCoreProblem:

logger.debug("Iterating over all subsets")
utility_values = np.zeros(powerset_size)
for i, subset in enumerate(
maybe_progress(
powerset(u.data.indices), progress, total=powerset_size - 1, position=0
for i, subset in enumerate( # type: ignore
tqdm(
powerset(u.data.indices),
disable=not progress,
total=powerset_size - 1,
position=0,
)
):
indices: NDArray[np.bool_] = np.zeros(n, dtype=bool)
indices[list(subset)] = True
A_lb[i, indices] = 1
utility_values[i] = u(subset)
utility_values[i] = u(subset) # type: ignore

return LeastCoreProblem(utility_values, A_lb)
7 changes: 4 additions & 3 deletions src/pydvl/value/oob/oob.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,9 @@
from numpy.typing import NDArray
from sklearn.base import is_classifier, is_regressor
from sklearn.ensemble import BaggingClassifier, BaggingRegressor
from tqdm.auto import tqdm

from pydvl.utils import Seed, Utility, maybe_progress
from pydvl.utils import Seed, Utility
from pydvl.utils.types import LossFunction
from pydvl.value.result import ValuationResult

Expand Down Expand Up @@ -112,8 +113,8 @@ def compute_data_oob(

bag.fit(u.data.x_train, u.data.y_train)

for est, samples in maybe_progress(
zip(bag.estimators_, bag.estimators_samples_), progress, total=n_est
for est, samples in tqdm(
zip(bag.estimators_, bag.estimators_samples_), disable=not progress, total=n_est
): # The bottleneck is the bag fitting not this part so TQDM is not very useful here
oob_idx = np.setxor1d(u.data.indices, np.unique(samples))
array_loss = loss(
Expand Down
5 changes: 3 additions & 2 deletions src/pydvl/value/shapley/gt.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,10 @@
import numpy as np
from numpy.random import SeedSequence
from numpy.typing import NDArray
from tqdm.auto import trange

from pydvl.parallel import MapReduceJob, ParallelConfig, effective_n_jobs
from pydvl.utils import Utility, maybe_progress
from pydvl.utils import Utility
from pydvl.utils.numeric import random_subset_of_size
from pydvl.utils.status import Status
from pydvl.utils.types import Seed, ensure_seed_sequence
Expand Down Expand Up @@ -155,7 +156,7 @@ def _group_testing_shapley(
) # indicator vars
uu = np.empty(n_samples) # utilities

for t in maybe_progress(n_samples, progress=progress, position=job_id):
for t in trange(n_samples, disable=not progress, position=job_id):
k = rng.choice(const.kk, size=1, p=const.q).item()
s = random_subset_of_size(u.data.indices, k, seed=rng)
uu[t] = u(s)
Expand Down
5 changes: 3 additions & 2 deletions src/pydvl/value/shapley/knn.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,9 @@
import numpy as np
from numpy.typing import NDArray
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors
from tqdm.auto import tqdm

from pydvl.utils import Utility, maybe_progress
from pydvl.utils import Utility
from pydvl.utils.status import Status
from pydvl.value.result import ValuationResult

Expand Down Expand Up @@ -76,7 +77,7 @@ def knn_shapley(u: Utility, *, progress: bool = True) -> ValuationResult:
n = len(u.data)
yt = u.data.y_train
iterator = enumerate(zip(u.data.y_test, indices), start=1)
for j, (y, ii) in maybe_progress(iterator, progress):
for j, (y, ii) in tqdm(iterator, disable=not progress):
value_at_x = int(yt[ii[-1]] == y) / n
values[ii[-1]] += (value_at_x - values[ii[-1]]) / j
for i in range(n - 2, n_neighbors, -1): # farthest to closest
Expand Down
13 changes: 6 additions & 7 deletions src/pydvl/value/shapley/montecarlo.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,14 +47,13 @@
import operator
from concurrent.futures import FIRST_COMPLETED, Future, wait
from functools import reduce
from itertools import cycle, takewhile
from typing import Optional, Sequence, Union

import numpy as np
from deprecate import deprecated
from numpy.random import SeedSequence
from numpy.typing import NDArray
from tqdm import tqdm
from tqdm.auto import tqdm

from pydvl.parallel import (
CancellationPolicy,
Expand All @@ -65,6 +64,7 @@
init_parallel_backend,
)
from pydvl.utils.numeric import random_powerset
from pydvl.utils.progress import repeat_indices
from pydvl.utils.types import Seed, ensure_seed_sequence
from pydvl.utils.utility import Utility
from pydvl.value.result import ValuationResult
Expand Down Expand Up @@ -281,11 +281,10 @@ def _combinatorial_montecarlo_shapley(
)

rng = np.random.default_rng(seed)
repeat_indices = takewhile(lambda _: not done(result), cycle(indices))
pbar = tqdm(disable=not progress, position=job_id, total=100, unit="%")
for idx in repeat_indices:
pbar.n = 100 * done.completion()
pbar.refresh()

for idx in repeat_indices(
indices, result=result, done=done, disable=not progress, position=job_id
):
# Randomly sample subsets of full dataset without idx
subset = np.setxor1d(u.data.indices, [idx], assume_unique=True)
s = next(random_powerset(subset, n_samples=1, seed=rng))
Expand Down
Loading
Loading