Merge pull request #466 from aai-institute/feature/269-simplify-progress-bars

Simplify progress bars
aai-institute · Dec 18, 2023 · 3076b61
2 parents 6a37624 + 504a172
commit 3076b61
Show file tree

Hide file tree

Showing 15 changed files with 81 additions and 112 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,7 @@
 ## Unreleased
 
 ### Added
+
 - New influence function interface `InfluenceFunctionModel`
 - Data parallel computation with `DaskInfluenceCalculator`
   [PR #26](https://github.com/aai-institute/pyDVL/issues/26)
@@ -14,11 +15,12 @@
 
 ### Changed
 
+- Simplify display of computation progress
+  [PR #466](https://github.com/aai-institute/pyDVL/pull/466)
 - Improve readme and explain better the examples
   [PR #465](https://github.com/aai-institute/pyDVL/pull/465)
 - Simplify and improve tests, add CodeCov code coverage
   [PR #429](https://github.com/aai-institute/pyDVL/pull/429)
-- 
 - **Breaking Changes**
   - Removed `compute_influences` and all related code.
     Replaced by new `InfluenceFunctionModel` interface. Removed modules:

diff --git a/notebooks/support/torch.py b/notebooks/support/torch.py
@@ -12,8 +12,7 @@
 from torch.optim.lr_scheduler import _LRScheduler
 from torch.utils.data import DataLoader
 from torchvision.models import ResNet18_Weights, resnet18
-
-from pydvl.utils import maybe_progress
+from tqdm.auto import tqdm
 
 from .types import Losses
 
@@ -123,7 +122,7 @@ def fit_torch_model(
     train_loss = []
     val_loss = []
 
-    for epoch in maybe_progress(range(num_epochs), progress, desc="Model fitting"):
+    for epoch in tqdm(range(num_epochs), disable=not progress, desc="Model fitting"):
         batch_loss = []
         for train_batch in training_data:
             batch_x, batch_y = train_batch

diff --git a/src/pydvl/influence/torch/influence_function_model.py b/src/pydvl/influence/torch/influence_function_model.py
@@ -13,8 +13,10 @@
 import torch
 from torch import nn as nn
 from torch.utils.data import DataLoader
+from tqdm.auto import tqdm
+
+from pydvl.utils.progress import log_duration
 
-from ...utils import log_duration, maybe_progress
 from ..base_influence_function_model import (
     InfluenceFunctionModel,
     InfluenceMode,
@@ -522,7 +524,7 @@ def reg_hvp(v: torch.Tensor):
         batch_cg = torch.zeros_like(rhs)
 
         for idx, bi in enumerate(
-            maybe_progress(rhs, self.progress, desc="Conjugate gradient")
+            tqdm(rhs, disable=not self.progress, desc="Conjugate gradient")
         ):
             batch_result = self._solve_cg(
                 reg_hvp,
@@ -689,7 +691,7 @@ def lissa_step(
             create_batch_hvp_function(self.model, self.loss),
             in_dims=(None, None, None, 0),
         )
-        for _ in maybe_progress(range(self.maxiter), self.progress, desc="Lissa"):
+        for _ in tqdm(range(self.maxiter), disable=not self.progress, desc="Lissa"):
             x, y = next(iter(shuffled_training_data))
             # grad_xy = model.grad(x, y, create_graph=True)
             reg_hvp = (

diff --git a/src/pydvl/reporting/scores.py b/src/pydvl/reporting/scores.py
@@ -2,8 +2,9 @@
 
 import numpy as np
 from numpy.typing import NDArray
+from tqdm.auto import tqdm
 
-from pydvl.utils import Utility, maybe_progress
+from pydvl.utils import Utility
 from pydvl.value.result import ValuationResult
 
 __all__ = ["compute_removal_score"]
@@ -44,7 +45,7 @@ def compute_removal_score(
     # We sort in descending order if we want to remove the best values
     values.sort(reverse=remove_best)
 
-    for pct in maybe_progress(percentages, display=progress, desc="Removal Scores"):
+    for pct in tqdm(percentages, disable=not progress, desc="Removal Scores"):
         n_removal = int(pct * len(u.data))
         indices = values.indices[n_removal:]
         score = u(indices)

diff --git a/src/pydvl/utils/progress.py b/src/pydvl/utils/progress.py
@@ -1,82 +1,42 @@
-"""
-!!! Warning
-    This module is deprecated and will be removed in a future release.
-    It implements a wrapper for the [tqdm](https://tqdm.github.io/) progress bar
-    iterator for easy toggling, but this functionality is already provided by
-    the `disable` argument of `tqdm`.
-"""
-import collections.abc
 import logging
 from functools import wraps
+from itertools import cycle, takewhile
 from time import time
-from typing import Iterable, Iterator, Union
+from typing import TYPE_CHECKING, Collection, Iterator
 
 from tqdm.auto import tqdm
 
-__all__ = ["maybe_progress", "log_duration"]
+# This is needed to avoid circular import errors
+if TYPE_CHECKING:
+    from pydvl.value.result import ValuationResult
+    from pydvl.value.stopping import StoppingCriterion
 
-logger = logging.getLogger(__name__)
-
-
-class MockProgress(collections.abc.Iterator):
-    """A Naive mock class to use with maybe_progress and tqdm.
-    Mocked methods don't support return values.
-    Mocked properties don't do anything
-    """
-
-    class MiniMock:
-        def __call__(self, *args, **kwargs):
-            pass
-
-        def __add__(self, other):
-            pass
-
-        def __sub__(self, other):
-            pass
-
-        def __mul__(self, other):
-            pass
+__all__ = ["repeat_indices", "log_duration"]
 
-        def __floordiv__(self, other):
-            pass
-
-        def __truediv__(self, other):
-            pass
-
-    def __init__(self, iterator: Union[Iterator, Iterable]):
-        # Since there is no _it in __dict__ at this point, doing here
-        # self._it = iterator
-        # results in a call to __getattr__() and the assignment fails, so we
-        # use __dict__ instead
-        self.__dict__["_it"] = iterator
-
-    def __iter__(self):
-        return iter(self._it)
-
-    def __next__(self):
-        return next(self._it)
-
-    def __getattr__(self, key):
-        return self.MiniMock()
-
-    def __setattr__(self, key, value):
-        pass
+logger = logging.getLogger(__name__)
 
 
-def maybe_progress(
-    it: Union[int, Iterable, Iterator], display: bool = False, **kwargs
-) -> Union[tqdm, MockProgress]:
-    """Returns either a tqdm progress bar or a mock object which wraps the
-    iterator as well, but ignores any accesses to methods or properties.
+def repeat_indices(
+    indices: Collection[int],
+    result: "ValuationResult",
+    done: "StoppingCriterion",
+    **kwargs,
+) -> Iterator[int]:
+    """Helper function to cycle indefinitely over a collection of indices
+    until the stopping criterion is satisfied while displaying progress.
 
     Args:
-        it: the iterator to wrap
-        display: set to True to return a tqdm bar
-        kwargs: Keyword arguments that will be forwarded to tqdm
+        indices: Collection of indices that will be cycled until done.
+        result: Object containing the current results.
+        done: Stopping criterion.
+        kwargs: Keyword arguments passed to tqdm.
     """
-    if isinstance(it, int):
-        it = range(it)  # type: ignore
-    return tqdm(it, **kwargs) if display else MockProgress(it)
+    with tqdm(total=100, unit="%", **kwargs) as pbar:
+        it = takewhile(lambda _: not done(result), cycle(indices))
+        for i in it:
+            yield i
+            pbar.update(100 * done.completion() - pbar.n)
+            pbar.refresh()
 
 
 def log_duration(func):

diff --git a/src/pydvl/value/least_core/__init__.py b/src/pydvl/value/least_core/__init__.py
@@ -96,7 +96,7 @@ def compute_least_core_values(
             solver_options.update(kwargs)
 
     if mode == LeastCoreMode.MonteCarlo:
-        # TODO fix progress showing and maybe_progress in remote case
+        # TODO fix progress showing in remote case
         progress = False
         if n_iterations is None:
             raise ValueError("n_iterations cannot be None for Monte Carlo Least Core")

diff --git a/src/pydvl/value/least_core/montecarlo.py b/src/pydvl/value/least_core/montecarlo.py
@@ -4,10 +4,10 @@
 
 import numpy as np
 from numpy.typing import NDArray
+from tqdm.auto import tqdm
 
 from pydvl.parallel import MapReduceJob, ParallelConfig, effective_n_jobs
 from pydvl.utils.numeric import random_powerset
-from pydvl.utils.progress import maybe_progress
 from pydvl.utils.types import Seed
 from pydvl.utils.utility import Utility
 from pydvl.value.least_core.common import LeastCoreProblem, lc_solve_problem
@@ -175,7 +175,7 @@ def _montecarlo_least_core(
     A_lb = np.zeros((n_iterations, n))
 
     for i, subset in enumerate(
-        maybe_progress(power_set, progress, total=n_iterations, position=job_id)
+        tqdm(power_set, disable=not progress, total=n_iterations, position=job_id)
     ):
         indices: NDArray[np.bool_] = np.zeros(n, dtype=bool)
         indices[list(subset)] = True

diff --git a/src/pydvl/value/least_core/naive.py b/src/pydvl/value/least_core/naive.py
@@ -4,8 +4,9 @@
 
 import numpy as np
 from numpy.typing import NDArray
+from tqdm.auto import tqdm
 
-from pydvl.utils import Utility, maybe_progress, powerset
+from pydvl.utils import Utility, powerset
 from pydvl.value.least_core.common import LeastCoreProblem, lc_solve_problem
 from pydvl.value.result import ValuationResult
 
@@ -103,14 +104,17 @@ def lc_prepare_problem(u: Utility, progress: bool = False) -> LeastCoreProblem:
 
     logger.debug("Iterating over all subsets")
     utility_values = np.zeros(powerset_size)
-    for i, subset in enumerate(
-        maybe_progress(
-            powerset(u.data.indices), progress, total=powerset_size - 1, position=0
+    for i, subset in enumerate(  # type: ignore
+        tqdm(
+            powerset(u.data.indices),
+            disable=not progress,
+            total=powerset_size - 1,
+            position=0,
         )
     ):
         indices: NDArray[np.bool_] = np.zeros(n, dtype=bool)
         indices[list(subset)] = True
         A_lb[i, indices] = 1
-        utility_values[i] = u(subset)
+        utility_values[i] = u(subset)  # type: ignore
 
     return LeastCoreProblem(utility_values, A_lb)
diff --git a/src/pydvl/value/oob/oob.py b/src/pydvl/value/oob/oob.py
@@ -12,8 +12,9 @@
 from numpy.typing import NDArray
 from sklearn.base import is_classifier, is_regressor
 from sklearn.ensemble import BaggingClassifier, BaggingRegressor
+from tqdm.auto import tqdm
 
-from pydvl.utils import Seed, Utility, maybe_progress
+from pydvl.utils import Seed, Utility
 from pydvl.utils.types import LossFunction
 from pydvl.value.result import ValuationResult
 
@@ -112,8 +113,8 @@ def compute_data_oob(
 
     bag.fit(u.data.x_train, u.data.y_train)
 
-    for est, samples in maybe_progress(
-        zip(bag.estimators_, bag.estimators_samples_), progress, total=n_est
+    for est, samples in tqdm(
+        zip(bag.estimators_, bag.estimators_samples_), disable=not progress, total=n_est
     ):  # The bottleneck is the bag fitting not this part so TQDM is not very useful here
         oob_idx = np.setxor1d(u.data.indices, np.unique(samples))
         array_loss = loss(

diff --git a/src/pydvl/value/shapley/gt.py b/src/pydvl/value/shapley/gt.py
@@ -30,9 +30,10 @@
 import numpy as np
 from numpy.random import SeedSequence
 from numpy.typing import NDArray
+from tqdm.auto import trange
 
 from pydvl.parallel import MapReduceJob, ParallelConfig, effective_n_jobs
-from pydvl.utils import Utility, maybe_progress
+from pydvl.utils import Utility
 from pydvl.utils.numeric import random_subset_of_size
 from pydvl.utils.status import Status
 from pydvl.utils.types import Seed, ensure_seed_sequence
@@ -155,7 +156,7 @@ def _group_testing_shapley(
     )  # indicator vars
     uu = np.empty(n_samples)  # utilities
 
-    for t in maybe_progress(n_samples, progress=progress, position=job_id):
+    for t in trange(n_samples, disable=not progress, position=job_id):
         k = rng.choice(const.kk, size=1, p=const.q).item()
         s = random_subset_of_size(u.data.indices, k, seed=rng)
         uu[t] = u(s)

diff --git a/src/pydvl/value/shapley/knn.py b/src/pydvl/value/shapley/knn.py
@@ -19,8 +19,9 @@
 import numpy as np
 from numpy.typing import NDArray
 from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors
+from tqdm.auto import tqdm
 
-from pydvl.utils import Utility, maybe_progress
+from pydvl.utils import Utility
 from pydvl.utils.status import Status
 from pydvl.value.result import ValuationResult
 
@@ -76,7 +77,7 @@ def knn_shapley(u: Utility, *, progress: bool = True) -> ValuationResult:
     n = len(u.data)
     yt = u.data.y_train
     iterator = enumerate(zip(u.data.y_test, indices), start=1)
-    for j, (y, ii) in maybe_progress(iterator, progress):
+    for j, (y, ii) in tqdm(iterator, disable=not progress):
         value_at_x = int(yt[ii[-1]] == y) / n
         values[ii[-1]] += (value_at_x - values[ii[-1]]) / j
         for i in range(n - 2, n_neighbors, -1):  # farthest to closest

diff --git a/src/pydvl/value/shapley/montecarlo.py b/src/pydvl/value/shapley/montecarlo.py
@@ -47,14 +47,13 @@
 import operator
 from concurrent.futures import FIRST_COMPLETED, Future, wait
 from functools import reduce
-from itertools import cycle, takewhile
 from typing import Optional, Sequence, Union
 
 import numpy as np
 from deprecate import deprecated
 from numpy.random import SeedSequence
 from numpy.typing import NDArray
-from tqdm import tqdm
+from tqdm.auto import tqdm
 
 from pydvl.parallel import (
     CancellationPolicy,
@@ -65,6 +64,7 @@
     init_parallel_backend,
 )
 from pydvl.utils.numeric import random_powerset
+from pydvl.utils.progress import repeat_indices
 from pydvl.utils.types import Seed, ensure_seed_sequence
 from pydvl.utils.utility import Utility
 from pydvl.value.result import ValuationResult
@@ -281,11 +281,10 @@ def _combinatorial_montecarlo_shapley(
     )
 
     rng = np.random.default_rng(seed)
-    repeat_indices = takewhile(lambda _: not done(result), cycle(indices))
-    pbar = tqdm(disable=not progress, position=job_id, total=100, unit="%")
-    for idx in repeat_indices:
-        pbar.n = 100 * done.completion()
-        pbar.refresh()
+
+    for idx in repeat_indices(
+        indices, result=result, done=done, disable=not progress, position=job_id
+    ):
         # Randomly sample subsets of full dataset without idx
         subset = np.setxor1d(u.data.indices, [idx], assume_unique=True)
         s = next(random_powerset(subset, n_samples=1, seed=rng))