diff --git a/src/LinearSolvers/preconditioners.jl b/src/LinearSolvers/preconditioners.jl
index 80ec9ca1..a0133747 100644
--- a/src/LinearSolvers/preconditioners.jl
+++ b/src/LinearSolvers/preconditioners.jl
@@ -185,18 +185,21 @@ Base.eltype(::BlockJacobiPreconditioner) = Float64
 # of the blocks, gemm_strided is performing too many unecessary operations,
 # impairing its performance.
 @kernel function mblock_kernel!(y, b, p_len, rp_len, part, blocks)
+    p = size(b, 2)
     i, j = @index(Global, NTuple)
     len = p_len[i]
     rlen = rp_len[i]
 
     if j <= rlen
-        accum = 0.0
-        idxA = @inbounds part[j, i]
-        for k=1:len
-            idxB = @inbounds part[k, i]
-            @inbounds accum = accum + blocks[j, k, i]*b[idxB]
+        for ℓ=1:p
+            accum = 0.0
+            idxA = @inbounds part[j, i]
+            for k=1:len
+                idxB = @inbounds part[k, i]
+                @inbounds accum = accum + blocks[j, k, i]*b[idxB,ℓ]
+            end
+            y[idxA,ℓ] = accum
         end
-        y[idxA] = accum
     end
 end
 
@@ -223,7 +226,7 @@ function mul!(Y, C::BlockJacobiPreconditioner, B::Matrix{T}) where T
         blck = view(C.blocks, 1:rlen, 1:rlen, i)
         for j=1:C.rest_size[i]
             idx = part[j]
-            mul!(view(Y,idx,:), view(blck, j, :)', view(B, part, :), one(T), one(T))
+            mul!(view(Y,idx,:), view(blck, :, j), view(B, part, :), one(T), one(T))
         end
     end
 end
@@ -241,6 +244,19 @@ function mul!(y, C::BlockJacobiPreconditioner, b::CuVector{T}) where T
     KA.synchronize(CUDABackend())
 end
 
+function mul!(Y, C::BlockJacobiPreconditioner, B::CuMatrix{T}) where T
+    n, p = size(B)
+    fill!(Y, zero(T))
+    max_rlen = maximum(C.rest_size)
+    ndrange = (C.nblocks, max_rlen)
+    mblock_kernel!(CUDABackend())(
+        Y, B, C.culpartitions, C.curest_size,
+        C.cupartitions, C.cublocks,
+        ndrange=ndrange,
+    )
+    KA.synchronize(CUDABackend())
+end
+
 """
     build_adjmatrix
 
diff --git a/test/TestLinearSolvers.jl b/test/TestLinearSolvers.jl
index d36a99b2..0e32461a 100644
--- a/test/TestLinearSolvers.jl
+++ b/test/TestLinearSolvers.jl
@@ -30,15 +30,28 @@ function test_preconditioners(device, AT, SMT)
     nblocks = 2
     precond = LS.BlockJacobiPreconditioner(A, nblocks, device)
     LS.update(precond, A, device)
-    Iₙ = Matrix{Float64}(I, n, n)
-    Y = zeros(Float64, n, n)
-    Y2 = zeros(Float64, n, n)
-    mul!(Y, precond, Iₙ)
-    for i=1:n
-        eᵢ = Iₙ[:,i]
-        mul!(view(Y2,:,i), precond, eᵢ)
+    if device == ExaPF.CPU()
+        Iₙ = Matrix{Float64}(I, n, n)
+        Y = zeros(Float64, n, n)
+        Y2 = zeros(Float64, n, n)
+        mul!(Y, precond, Iₙ)
+        for i=1:n
+            eᵢ = Vector(Iₙ[:,i])
+            mul!(view(Y2,:,i), precond, eᵢ)
+        end
+        @test Y ≈ Y2
+    end
+    if device == ExaPF.GPU()
+        Iₙ = CuMatrix{Float64}(I, n, n)
+        Y = CUDA.zeros(Float64, n, n)
+        Y2 = CUDA.zeros(Float64, n, n)
+        mul!(Y, precond, Iₙ)
+        for i=1:n
+            eᵢ = CuVector(Iₙ[:,i])
+            mul!(view(Y2,:,i), precond, eᵢ)
+        end
+        @test Y ≈ Y2
     end
-    @test Y ≈ Y2
 end
 
 function test_custom_bicgstab(device, AT, SMT)