From cafa32a117a76cb47d57c70c9460505c93f75d3d Mon Sep 17 00:00:00 2001
From: Kristof Schroeder
Date: Fri, 3 May 2024 01:28:19 +0200
Subject: [PATCH 1/7] Fix missing move to model device for EkfacInfluence
 implementation

---
 .../influence/torch/influence_function_model.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/src/pydvl/influence/torch/influence_function_model.py b/src/pydvl/influence/torch/influence_function_model.py
index 46a5fa16e..4a6cb638c 100644
--- a/src/pydvl/influence/torch/influence_function_model.py
+++ b/src/pydvl/influence/torch/influence_function_model.py
@@ -1195,7 +1195,7 @@ def _get_kfac_blocks(
             data, disable=not self.progress, desc="K-FAC blocks - batch progress"
         ):
             data_len += x.shape[0]
-            pred_y = self.model(x)
+            pred_y = self.model(x.to(self.model_device))
             loss = empirical_cross_entropy_loss_fn(pred_y)
             loss.backward()
@@ -1319,7 +1319,7 @@ def _update_diag(
             data, disable=not self.progress, desc="Update Diagonal - batch progress"
         ):
             data_len += x.shape[0]
-            pred_y = self.model(x)
+            pred_y = self.model(x.to(self.model_device))
             loss = empirical_cross_entropy_loss_fn(pred_y)
             loss.backward()
@@ -1526,7 +1526,10 @@ def influences_from_factors_by_layer(
             influences = {}
             for layer_id, layer_z_test in z_test_factors.items():
                 end_idx = start_idx + layer_z_test.shape[1]
-                influences[layer_id] = layer_z_test @ total_grad[:, start_idx:end_idx].T
+                influences[layer_id] = (
+                    layer_z_test.to(self.model_device)
+                    @ total_grad[:, start_idx:end_idx].T
+                )
                 start_idx = end_idx
             return influences
         elif mode == InfluenceMode.Perturbation:
@@ -1539,7 +1542,7 @@
                 end_idx = start_idx + layer_z_test.shape[1]
                 influences[layer_id] = torch.einsum(
                     "ia,j...a->ij...",
-                    layer_z_test,
+                    layer_z_test.to(self.model_device),
                     total_mixed_grad[:, start_idx:end_idx],
                 )
                 start_idx = end_idx
             return influences
@@ -1626,7 +1629,7 @@ def explore_hessian_regularization(
             being dictionaries containing the influences for each layer of the model,
             with the layer name as key.
""" - grad = self._loss_grad(x, y) + grad = self._loss_grad(x.to(self.model_device), y.to(self.model_device)) influences_by_reg_value = {} for reg_value in regularization_values: reg_factors = self._solve_hvp_by_layer( From a151422d31dff79624b5750a42d280b5ab7ee061 Mon Sep 17 00:00:00 2001 From: Kristof Schroeder Date: Fri, 3 May 2024 01:35:01 +0200 Subject: [PATCH 2/7] Update CHANGELOG.md --- CHANGELOG.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 52bc910a4..abea5f5ac 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,12 @@ # Changelog +## Unreleased + +### Fixed + +- Fixed missing move of tensors to model device in `EkfacInfluence` + implementation [PR #570](https://github.com/aai-institute/pyDVL/pull/570) + ## 0.9.1 - Bug fixes, logging improvement ### Fixed From 36ea3bada2221febe1b1cb75ff687ad122ce2fdb Mon Sep 17 00:00:00 2001 From: Kristof Schroeder Date: Fri, 3 May 2024 11:31:39 +0200 Subject: [PATCH 3/7] Add device move in influence_from_factors method in base class TorchInfluenceFunctionModel --- src/pydvl/influence/torch/influence_function_model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pydvl/influence/torch/influence_function_model.py b/src/pydvl/influence/torch/influence_function_model.py index 4a6cb638c..08fd64a14 100644 --- a/src/pydvl/influence/torch/influence_function_model.py +++ b/src/pydvl/influence/torch/influence_function_model.py @@ -303,13 +303,13 @@ def influences_from_factors( """ if mode == InfluenceMode.Up: return ( - z_test_factors + z_test_factors.to(self.model_device) @ self._loss_grad(x.to(self.model_device), y.to(self.model_device)).T ) elif mode == InfluenceMode.Perturbation: return torch.einsum( "ia,j...a->ij...", - z_test_factors, + z_test_factors.to(self.model_device), self._flat_loss_mixed_grad( x.to(self.model_device), y.to(self.model_device) ), From 919e73f17063aaaa00515a67ec3b3d17338f51bb Mon Sep 17 00:00:00 2001 From: Kristof Schroeder Date: Fri, 3 May 2024 12:16:53 +0200 Subject: [PATCH 4/7] Overwrite `to` method of `CgInfluence`, add `to` method to preconditoners, fix wrong device for indices array in block CG implementation --- .../influence/torch/influence_function_model.py | 9 ++++++++- src/pydvl/influence/torch/pre_conditioner.py | 17 +++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/src/pydvl/influence/torch/influence_function_model.py b/src/pydvl/influence/torch/influence_function_model.py index 46a5fa16e..b4ec964cc 100644 --- a/src/pydvl/influence/torch/influence_function_model.py +++ b/src/pydvl/influence/torch/influence_function_model.py @@ -706,7 +706,9 @@ def mat_mat(x: torch.Tensor): R = (rhs - mat_mat(X)).T Z = R if self.pre_conditioner is None else self.pre_conditioner.solve(R) P, _, _ = torch.linalg.svd(Z, full_matrices=False) - active_indices = torch.as_tensor(list(range(X.shape[-1])), dtype=torch.long) + active_indices = torch.as_tensor( + list(range(X.shape[-1])), dtype=torch.long, device=self.model_device + ) maxiter = self.maxiter if self.maxiter is not None else len(rhs) * 10 y_norm = torch.linalg.norm(rhs, dim=1) @@ -758,6 +760,11 @@ def mat_mat(x: torch.Tensor): return X.T + def to(self, device: torch.device): + if self.pre_conditioner is not None: + self.pre_conditioner = self.pre_conditioner.to(device) + return super().to(device) + class LissaInfluence(TorchInfluenceFunctionModel): r""" diff --git a/src/pydvl/influence/torch/pre_conditioner.py b/src/pydvl/influence/torch/pre_conditioner.py index 
--- a/src/pydvl/influence/torch/pre_conditioner.py
+++ b/src/pydvl/influence/torch/pre_conditioner.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 from abc import ABC, abstractmethod
 from typing import Callable, Optional
 
@@ -70,6 +72,11 @@ def solve(self, rhs: torch.Tensor):
     def _solve(self, rhs: torch.Tensor):
         pass
 
+    @abstractmethod
+    def to(self, device: torch.device) -> PreConditioner:
+        """Implement this to move the (potentially fitted) preconditioner to a
+        specific device"""
+
 
 class JacobiPreConditioner(PreConditioner):
     r"""
@@ -141,6 +148,11 @@ def _solve(self, rhs: torch.Tensor):
 
         return rhs * inv_diag.unsqueeze(-1)
 
+    def to(self, device: torch.device) -> JacobiPreConditioner:
+        if self._diag is not None:
+            self._diag = self._diag.to(device)
+        return self
+
 
 class NystroemPreConditioner(PreConditioner):
     r"""
@@ -233,3 +245,8 @@ def _solve(self, rhs: torch.Tensor):
             result = result.squeeze()
 
         return result
+
+    def to(self, device: torch.device) -> NystroemPreConditioner:
+        if self._low_rank_approx is not None:
+            self._low_rank_approx = self._low_rank_approx.to(device)
+        return self

From 6375afe31d900698bb30ac42a69657d65fbdc31c Mon Sep 17 00:00:00 2001
From: Kristof Schroeder
Date: Fri, 3 May 2024 12:24:26 +0200
Subject: [PATCH 5/7] Update CHANGELOG.md

---
 CHANGELOG.md | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 52bc910a4..e2d4bf923 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,12 @@
 # Changelog
 
+## Unreleased
+
+### Fixed
+
+- Missing move to device of `preconditioner` in `CgInfluence` implementation
+  [PR #572](https://github.com/aai-institute/pyDVL/pull/572)
+
 ## 0.9.1 - Bug fixes, logging improvement
 
 ### Fixed

From 18d4fb8ebd1acf0ab12a81bc5543e2619e313cc2 Mon Sep 17 00:00:00 2001
From: Kristof Schroeder
Date: Fri, 3 May 2024 13:25:22 +0200
Subject: [PATCH 6/7] Add functionality to set a device fixture depending on
 the availability of cuda and user input (pytest --with-cuda)

---
 CONTRIBUTING.md                               |   7 +
 tests/conftest.py                             |   6 +
 tests/influence/torch/conftest.py             |  12 ++
 tests/influence/torch/test_influence_model.py | 143 ++++++++++++------
 4 files changed, 121 insertions(+), 47 deletions(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 56d8ead7b..ecd1288de 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -131,6 +131,13 @@ There are a few important arguments:
 - `--slow-tests` enables running slow tests. See below for a description
   of slow tests.
 
+- `--with-cuda` sets the device fixture in [tests/influence/torch/conftest.py](
+  tests/influence/torch/conftest.py) to `cuda` if it is available.
+  Using this fixture within tests, you can run parts of your tests on a `cuda`
+  device. Be aware that you still have to handle device placement manually
+  within each specific test; setting this flag does not automatically run
+  all tests on a GPU.
+
 ### Markers
 
 We use a few different markers to differentiate between tests and runs
diff --git a/tests/conftest.py b/tests/conftest.py
index b08f09377..d8594c314 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -48,6 +48,12 @@ def pytest_addoption(parser):
         default=False,
         help="Disable reporting. Verbose mode takes precedence.",
     )
+    parser.addoption(
+        "--with-cuda",
+        action="store_true",
+        default=False,
+        help="Set device fixture to 'cuda' if available",
+    )
 
 
 @pytest.fixture
diff --git a/tests/influence/torch/conftest.py b/tests/influence/torch/conftest.py
index b16a2d856..37459f1cc 100644
--- a/tests/influence/torch/conftest.py
+++ b/tests/influence/torch/conftest.py
@@ -1,5 +1,6 @@
 from typing import Tuple
 
+import pytest
 import torch
 from numpy.typing import NDArray
 from torch.optim import LBFGS
@@ -59,3 +60,14 @@ def closure():
 def torch_linear_model_to_numpy(model: torch.nn.Linear) -> Tuple[NDArray, NDArray]:
     model.eval()
     return model.weight.data.numpy(), model.bias.data.numpy()
+
+
+@pytest.fixture(scope="session")
+def device(request):
+    import torch
+
+    use_cuda = request.config.getoption("--with-cuda")
+    if use_cuda and torch.cuda.is_available():
+        return torch.device("cuda")
+    else:
+        return torch.device("cpu")
diff --git a/tests/influence/torch/test_influence_model.py b/tests/influence/torch/test_influence_model.py
index 0631c60fc..d2203a84e 100644
--- a/tests/influence/torch/test_influence_model.py
+++ b/tests/influence/torch/test_influence_model.py
@@ -340,6 +340,7 @@ def test_influence_linear_model(
     rtol,
     mode: InfluenceMode,
     train_set_size: int,
+    device: torch.device,
     hessian_reg: float = 0.1,
     test_set_size: int = 20,
     problem_dimension: Tuple[int, int] = (4, 20),
@@ -373,16 +374,20 @@ def test_influence_linear_model(
     train_data_set = TensorDataset(*list(map(torch.from_numpy, train_data)))
     train_data_loader = DataLoader(train_data_set, batch_size=40, num_workers=0)
-    influence = influence_factory(linear_layer, loss, train_data_loader, hessian_reg)
+    influence = influence_factory(
+        linear_layer.to(device), loss, train_data_loader, hessian_reg
+    )
 
     x_train, y_train = tuple(map(torch.from_numpy, train_data))
     x_test, y_test = tuple(map(torch.from_numpy, test_data))
-    influence_values = influence.influences(
-        x_test, y_test, x_train, y_train, mode=mode
-    ).numpy()
-    sym_influence_values = influence.influences(
-        x_train, y_train, x_train, y_train, mode=mode
-    ).numpy()
+    influence_values = (
+        influence.influences(x_test, y_test, x_train, y_train, mode=mode).cpu().numpy()
+    )
+    sym_influence_values = (
+        influence.influences(x_train, y_train, x_train, y_train, mode=mode)
+        .cpu()
+        .numpy()
+    )
 
     with pytest.raises(ValueError):
         influence.influences(x_test, y_test, x=x_train, mode=mode)
@@ -431,6 +436,7 @@ def test_influences_lissa(
     ],
     direct_influences,
     influence_factory,
+    device,
 ):
     model, loss, x_train, y_train, x_test, y_test = model_and_data
 
@@ -438,11 +444,15 @@ def test_influences_lissa(
     train_dataloader = DataLoader(
         TensorDataset(x_train, y_train), batch_size=test_case.batch_size
     )
     influence_model = influence_factory(
-        model, loss, train_dataloader, test_case.hessian_reg
+        model.to(device), loss, train_dataloader, test_case.hessian_reg
+    )
+    approx_influences = (
+        influence_model.influences(
+            x_test, y_test, x_train, y_train, mode=test_case.mode
+        )
+        .cpu()
+        .numpy()
     )
-    approx_influences = influence_model.influences(
-        x_test, y_test, x_train, y_train, mode=test_case.mode
-    ).numpy()
 
     assert not np.any(np.isnan(approx_influences))
@@ -497,9 +507,10 @@ def test_influences_low_rank(
     direct_sym_influences,
     direct_factors,
     influence_factory,
+    device: torch.device,
 ):
-    atol = 1e-8
-    rtol = 1e-5
+    atol = 1e-7
+    rtol = 1e-4
     model, loss, x_train, y_train, x_test, y_test = model_and_data
 
     num_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
@@ -509,7 +520,7 @@ def test_influences_low_rank(
     )
 
     influence_func_model = influence_factory(
-        model,
+        model.to(device),
         loss,
         test_case.hessian_reg,
         num_parameters - 1,
@@ -525,33 +536,47 @@ def test_influences_low_rank(
     influence_func_model = influence_func_model.fit(train_dataloader)
-    low_rank_influence = influence_func_model.influences(
-        x_test, y_test, x_train, y_train, mode=test_case.mode
-    ).numpy()
+    low_rank_influence = (
+        influence_func_model.influences(
+            x_test, y_test, x_train, y_train, mode=test_case.mode
+        )
+        .cpu()
+        .numpy()
+    )
 
-    sym_low_rank_influence = influence_func_model.influences(
-        x_train, y_train, mode=test_case.mode
-    ).numpy()
+    sym_low_rank_influence = (
+        influence_func_model.influences(x_train, y_train, mode=test_case.mode)
+        .cpu()
+        .numpy()
+    )
 
     low_rank_factors = influence_func_model.influence_factors(x_test, y_test)
     assert np.allclose(
         direct_factors,
-        influence_func_model.influence_factors(x_train, y_train).numpy(),
+        influence_func_model.influence_factors(x_train, y_train).cpu().numpy(),
         atol=atol,
         rtol=rtol,
     )
 
     if test_case.mode is InfluenceMode.Up:
-        low_rank_influence_transpose = influence_func_model.influences(
-            x_train, y_train, x_test, y_test, mode=test_case.mode
-        ).numpy()
+        low_rank_influence_transpose = (
+            influence_func_model.influences(
+                x_train, y_train, x_test, y_test, mode=test_case.mode
+            )
+            .cpu()
+            .numpy()
+        )
         assert np.allclose(
             low_rank_influence_transpose, low_rank_influence.swapaxes(0, 1)
         )
 
-    low_rank_values_from_factors = influence_func_model.influences_from_factors(
-        low_rank_factors, x_train, y_train, mode=test_case.mode
-    ).numpy()
+    low_rank_values_from_factors = (
+        influence_func_model.influences_from_factors(
+            low_rank_factors, x_train, y_train, mode=test_case.mode
+        )
+        .cpu()
+        .numpy()
+    )
     assert np.allclose(direct_influences, low_rank_influence, atol=atol, rtol=rtol)
     assert np.allclose(
         direct_sym_influences, sym_low_rank_influence, atol=atol, rtol=rtol
     )
@@ -578,6 +603,7 @@ def test_influences_ekfac(
     ],
     direct_influences,
     direct_sym_influences,
+    device: torch.device,
 ):
     model, loss, x_train, y_train, x_test, y_test = model_and_data
 
@@ -589,7 +615,7 @@ def test_influences_ekfac(
     ekfac_influence = EkfacInfluence(
         model,
         update_diagonal=True,
        hessian_regularization=test_case.hessian_reg,
-    )
+    ).to(device)
 
     with pytest.raises(NotFittedException):
         ekfac_influence.influences(
@@ -604,9 +630,13 @@ def test_influences_ekfac(
             ekfac_influence.fit(train_dataloader)
     elif isinstance(loss, nn.CrossEntropyLoss):
         ekfac_influence = ekfac_influence.fit(train_dataloader)
-    ekfac_influence_values = ekfac_influence.influences(
-        x_test, y_test, x_train, y_train, mode=test_case.mode
-    ).numpy()
+    ekfac_influence_values = (
+        ekfac_influence.influences(
+            x_test, y_test, x_train, y_train, mode=test_case.mode
+        )
+        .cpu()
+        .numpy()
+    )
 
     ekfac_influences_by_layer = ekfac_influence.influences_by_layer(
         x_test, y_test, x_train, y_train, mode=test_case.mode
     )
 
     accumulated_inf_by_layer = np.zeros_like(ekfac_influence_values)
     for layer, infl in ekfac_influences_by_layer.items():
-        accumulated_inf_by_layer += infl.detach().numpy()
+        accumulated_inf_by_layer += infl.detach().cpu().numpy()
 
-    ekfac_self_influence = ekfac_influence.influences(
-        x_train, y_train, mode=test_case.mode
-    ).numpy()
+    ekfac_self_influence = (
+        ekfac_influence.influences(x_train, y_train, mode=test_case.mode)
+        .cpu()
+        .numpy()
+    )
 
     ekfac_factors = ekfac_influence.influence_factors(x_test, y_test)
-    influence_from_factors = ekfac_influence.influences_from_factors(
-        ekfac_factors, x_train, y_train, mode=test_case.mode
-    ).numpy()
+    influence_from_factors = (
+        ekfac_influence.influences_from_factors(
+            ekfac_factors, x_train, y_train, mode=test_case.mode
+        )
+        .cpu()
+        .numpy()
+    )
 
     assert np.allclose(ekfac_influence_values, influence_from_factors)
     assert np.allclose(ekfac_influence_values, accumulated_inf_by_layer)
-    check_influence_correlations(direct_influences, ekfac_influence_values)
-    check_influence_correlations(direct_sym_influences, ekfac_self_influence)
+    check_influence_correlations(
+        direct_influences, ekfac_influence_values, threshold=0.94
+    )
+    check_influence_correlations(
+        direct_sym_influences, ekfac_self_influence, threshold=0.94
+    )
 
 
 @pytest.mark.torch
@@ -656,6 +696,7 @@ def test_influences_cg(
     direct_factors,
     use_block_cg: bool,
     pre_conditioner: PreConditioner,
+    device: torch.device,
 ):
     model, loss, x_train, y_train, x_test, y_test = model_and_data
 
@@ -663,7 +704,7 @@ def test_influences_cg(
     train_dataloader = DataLoader(
         TensorDataset(x_train, y_train), batch_size=test_case.batch_size
     )
     influence_model = CgInfluence(
-        model,
+        model.to(device),
         loss,
         test_case.hessian_reg,
         maxiter=5,
@@ -672,9 +713,13 @@ def test_influences_cg(
     )
     influence_model = influence_model.fit(train_dataloader)
-    approx_influences = influence_model.influences(
-        x_test, y_test, x_train, y_train, mode=test_case.mode
-    ).numpy()
+    approx_influences = (
+        influence_model.influences(
+            x_test, y_test, x_train, y_train, mode=test_case.mode
+        )
+        .cpu()
+        .numpy()
+    )
 
     assert not np.any(np.isnan(approx_influences))
@@ -701,7 +746,11 @@ def test_influences_cg(
     # check that block variant returns the correct vector, if only one right hand side
     # is provided
     if use_block_cg:
-        single_influence = influence_model.influence_factors(
-            x_train[0].unsqueeze(0), y_train[0].unsqueeze(0)
-        ).numpy()
+        single_influence = (
+            influence_model.influence_factors(
+                x_train[0].unsqueeze(0), y_train[0].unsqueeze(0)
+            )
+            .cpu()
+            .numpy()
+        )
         assert np.allclose(single_influence, direct_factors[0], atol=1e-6, rtol=1e-4)

From 4cf4ac2a8cf4a8a86dbc3c5caa9a25b59c775faa Mon Sep 17 00:00:00 2001
From: Kristof Schroeder
Date: Fri, 3 May 2024 13:32:45 +0200
Subject: [PATCH 7/7] Update CHANGELOG.md

---
 CHANGELOG.md | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 52bc910a4..a0e27a8d3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,13 @@
 # Changelog
 
+## Unreleased
+
+### Added
+
+- Add a device fixture for `pytest` which, depending on availability and
+  user input (`pytest --with-cuda`), resolves to a `cuda` device
+  [PR #574](https://github.com/aai-institute/pyDVL/pull/574)
+
 ## 0.9.1 - Bug fixes, logging improvement
 
 ### Fixed
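
For illustration, a minimal sketch (not part of the patches) of how the
session-scoped `device` fixture from tests/influence/torch/conftest.py is
meant to be used inside a test; the model, test name and assertion here are
hypothetical placeholders:

import torch
import torch.nn as nn


def test_model_runs_on_device(device: torch.device):
    # `device` resolves to "cuda" only when `pytest --with-cuda` is passed
    # and a GPU is actually available; otherwise it falls back to "cpu".
    model = nn.Linear(4, 2).to(device)
    x = torch.randn(8, 4, device=device)

    # Tensors computed on a GPU must be moved back to the CPU before
    # converting to numpy, mirroring the `.cpu().numpy()` calls added
    # throughout test_influence_model.py above.
    out = model(x).detach().cpu().numpy()
    assert out.shape == (8, 2)

As the CONTRIBUTING.md note stresses, the fixture only supplies the device;
each test remains responsible for moving its models and tensors with
`.to(device)` and for calling `.cpu()` before any numpy conversion.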