Skip to content

Commit

Permalink
fix missing Event in KA kernels (#244)
Browse files Browse the repository at this point in the history
  • Loading branch information
frapac authored and michel2323 committed May 10, 2022
1 parent 58bbb14 commit 12daa4c
Show file tree
Hide file tree
Showing 5 changed files with 62 additions and 28 deletions.
12 changes: 10 additions & 2 deletions src/LinearSolvers/preconditioners.jl
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,9 @@ function mul!(y, C::BlockJacobiPreconditioner, b::CuVector{T}) where T
ndrange = (C.nblocks, max_rlen)
ev = mblock_kernel!(CUDADevice())(
y, b, C.culpartitions, C.curest_size,
C.cupartitions, C.cublocks, ndrange=ndrange,
C.cupartitions, C.cublocks,
ndrange=ndrange,
dependencies=Event(CUDADevice()),
)
wait(ev)
end
Expand Down Expand Up @@ -287,7 +289,13 @@ function _update_gpu(p, j_rowptr, j_colval, j_nzval, device)
nblocks = p.nblocks
fillblock_gpu_kernel! = fillblock_gpu!(device)
# Fill Block Jacobi begin
ev = fillblock_gpu_kernel!(p.cublocks, size(p.id,1), p.cupartitions, p.cumap, j_rowptr, j_colval, j_nzval, p.cupart, p.culpartitions, p.id, ndrange=nblocks, dependencies=Event(device))
ev = fillblock_gpu_kernel!(
p.cublocks, size(p.id,1),
p.cupartitions, p.cumap,
j_rowptr, j_colval, j_nzval,
p.cupart, p.culpartitions, p.id,
ndrange=nblocks, dependencies=Event(device),
)
wait(ev)
# Invert blocks begin
blocklist = Array{CuArray{Float64,2}}(undef, nblocks)
Expand Down
4 changes: 2 additions & 2 deletions src/Polar/first_order.jl
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,7 @@ end
end
end

@kernel function _arrowhead_partials_csr_kernelll!(J_rowptr, J_colval, J_nzval, duals, coloring, nx, nu, nblock)
@kernel function _arrowhead_partials_csr_kernel!(J_rowptr, J_colval, J_nzval, duals, coloring, nx, nu, nblock)
i = @index(Global, Linear)

for c in J_rowptr[i]:J_rowptr[i+1]-1
Expand Down Expand Up @@ -272,7 +272,7 @@ function AutoDiff.partials!(jac::ArrowheadJacobian)
)
elseif isa(J, CuSparseMatrixCSR)
ndrange = (size(J, 1), )
ev = _arrowhead_partials_csr_kernelll!(device)(
ev = _arrowhead_partials_csr_kernel!(device)(
J.rowPtr, J.colVal, J.nzVal, duals_, coloring, jac.nx, jac.nu, jac.nblocks;
ndrange=ndrange, dependencies=Event(device),
)
Expand Down
8 changes: 4 additions & 4 deletions src/Polar/functions.jl
Original file line number Diff line number Diff line change
Expand Up @@ -253,7 +253,7 @@ function (func::CostFunction)(output::AbstractArray, stack::AbstractNetworkStack
ndrange = (ngen, nbatches(stack))
ev = _quadratic_cost_kernel(func.device)(
costs, stack.pgen, func.c0, func.c1, func.c2, ngen;
ndrange=ndrange,
ndrange=ndrange, dependencies=Event(func.device),
)
wait(ev)
# Sum costs across all generators
Expand All @@ -274,7 +274,7 @@ function adjoint!(func::CostFunction, ∂stack, stack, ∂v)
ndrange = (ngen, nbatches(stack))
ev = _adj_quadratic_cost_kernel(func.device)(
∂stack.pgen, stack.pgen, ∂v, func.c0, func.c1, func.c2, ngen;
ndrange=ndrange,
ndrange=ndrange, dependencies=Event(func.device),
)
wait(ev)
blockmul!(∂stack.ψ, func.M', ∂stack.pgen, 1.0, 1.0)
Expand Down Expand Up @@ -558,7 +558,7 @@ function (func::LineFlows)(cons::AbstractVector, stack::AbstractNetworkStack)
ndrange = (func.nlines, nbatches(stack))
ev = _line_flow_kernel(func.device)(
cons, sfp, sfq, stp, stq, func.nlines;
ndrange=ndrange,
ndrange=ndrange, dependencies=Event(func.device),
)
wait(ev)
return
Expand Down Expand Up @@ -589,7 +589,7 @@ function adjoint!(func::LineFlows, ∂stack, stack, ∂v)
stack.intermediate.sfp, stack.intermediate.sfq,
stack.intermediate.stp, stack.intermediate.stq,
∂v, nlines;
ndrange=ndrange,
ndrange=ndrange, dependencies=Event(func.device),
)
wait(ev)

Expand Down
43 changes: 30 additions & 13 deletions src/autodiff.jl
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,9 @@ function seed!(
dest_ = reshape(reinterpret(T, dest), 2, n)
ndrange = length(map)
ev = _seed_kernel!(device)(
dest_, v, map, ndrange=ndrange, dependencies=Event(device))
dest_, v, map;
ndrange=ndrange, dependencies=Event(device),
)
wait(ev)
end

Expand All @@ -158,7 +160,9 @@ function _seed_coloring!(
dest_ = reshape(reinterpret(T, dest), N+1, n)
ndrange = (length(map), N)
ev = _seed_coloring_kernel!(device)(
dest_, coloring, map, ndrange=ndrange, dependencies=Event(device))
dest_, coloring, map;
ndrange=ndrange, dependencies=Event(device),
)
wait(ev)
end

Expand Down Expand Up @@ -199,8 +203,11 @@ function getpartials_kernel!(hv::AbstractVector, H::AbstractHessianProd)
device = H.model.device
map = H.map
adj_t1sx = H.∂stack.input
kernel! = getpartials_hv_kernel!(device)
ev = kernel!(hv, adj_t1sx, map, ndrange=length(hv), dependencies=Event(device))
ev = getpartials_hv_kernel!(device)(
hv, adj_t1sx, map;
ndrange=length(hv), dependencies=Event(device),
)
wait(ev)
end

Expand Down Expand Up @@ -241,11 +248,15 @@ function partials!(jac::AbstractJacobian)
duals_ = reshape(reinterpret(T, duals), N+1, n)

if isa(device, CPU)
kernel! = partials_kernel_csc!(device)
ev = kernel!(J.colptr, J.rowval, J.nzval, duals_, coloring, ndrange=size(J,2), dependencies=Event(device))
ev = partials_kernel_csc!(device)(
J.colptr, J.rowval, J.nzval, duals_, coloring;
ndrange=size(J,2), dependencies=Event(device),
)
elseif isa(device, GPU)
kernel! = partials_kernel_csr!(device)
ev = kernel!(J.rowPtr, J.colVal, J.nzVal, duals_, coloring, ndrange=size(J,1), dependencies=Event(device))
ev = partials_kernel_csr!(device)(
J.rowPtr, J.colVal, J.nzVal, duals_, coloring;
ndrange=size(J,1), dependencies=Event(device),
)
else
error("Unknown device $device")
end
Expand Down Expand Up @@ -290,11 +301,15 @@ function partials!(hess::AbstractFullHessian)
duals_ = reshape(reinterpret(T, duals), N+1, n)

if isa(device, CPU)
kernel! = partials_kernel_cpu!(device)
ev = kernel!(H.colptr, H.rowval, H.nzval, duals_, map, coloring, ndrange=size(H,2), dependencies=Event(device))
ev = partials_kernel_cpu!(device)(
H.colptr, H.rowval, H.nzval, duals_, map, coloring;
ndrange=size(H,2), dependencies=Event(device),
)
elseif isa(device, GPU)
kernel! = partials_kernel_gpu!(device)
ev = kernel!(H.rowPtr, H.colVal, H.nzVal, duals_, map, coloring, ndrange=size(H,1), dependencies=Event(device))
ev = partials_kernel_gpu!(device)(
H.rowPtr, H.colVal, H.nzVal, duals_, map, coloring;
ndrange=size(H,1), dependencies=Event(device),
)
else
error("Unknown device $device")
end
Expand Down Expand Up @@ -327,7 +342,9 @@ function set_value!(
N = jac.ncolors
duals_ = reshape(reinterpret(T, duals), N+1, n)
ev = _set_value_kernel!(device)(
duals_, primals, ndrange=n, dependencies=Event(device))
duals_, primals;
ndrange=n, dependencies=Event(device),
)
wait(ev)
end

Expand Down
23 changes: 16 additions & 7 deletions src/cuda_wrapper.jl
Original file line number Diff line number Diff line change
Expand Up @@ -35,14 +35,20 @@ CuSparseMatrixCSR{Tv, Int32}(A::SparseMatrixCSC{Tv, Ti}) where {Tv, Ti} = CuSpar
"""
    Base.copyto!(stack::AutoDiff.AbstractStack, map::AbstractVector{Int}, vals::VT) where {VT <: CuArray}

Scatter the device values `vals` into `stack.input` at the positions listed in
`map`, using the `_transfer_to_input!` KernelAbstractions kernel on the CUDA
device. Blocks until the kernel has completed, then returns `stack` (the
destination, per the `Base.copyto!` convention).

Throws `DimensionMismatch` if `map` and `vals` have different lengths.
"""
function Base.copyto!(stack::AutoDiff.AbstractStack, map::AbstractVector{Int}, vals::VT) where {VT <: CuArray}
    # Validate with a real exception: @assert may be disabled at higher
    # optimization levels and must not guard user input.
    length(map) == length(vals) || throw(DimensionMismatch(
        "length(map) = $(length(map)) does not match length(vals) = $(length(vals))"
    ))
    # Instantiate the device once and reuse it for both the kernel launch
    # and the dependency Event (the original constructed it twice).
    device = CUDADevice()
    ndrange = (length(map),)
    ev = _transfer_to_input!(device)(
        stack.input, map, vals;
        ndrange=ndrange, dependencies=Event(device),
    )
    wait(ev)
    return stack
end

"""
    Base.copyto!(dest::VT, stack::AutoDiff.AbstractStack, map::AbstractVector{Int}) where {VT <: CuArray}

Gather the entries of `stack.input` listed in `map` into `dest`, using the
`_transfer_fr_input!` KernelAbstractions kernel on the CUDA device. Blocks
until the kernel has completed, then returns `dest` (the destination, per the
`Base.copyto!` convention).

Throws `DimensionMismatch` if `map` and `dest` have different lengths.
"""
function Base.copyto!(dest::VT, stack::AutoDiff.AbstractStack, map::AbstractVector{Int}) where {VT <: CuArray}
    # Validate with a real exception: @assert may be disabled at higher
    # optimization levels and must not guard user input.
    length(map) == length(dest) || throw(DimensionMismatch(
        "length(map) = $(length(map)) does not match length(dest) = $(length(dest))"
    ))
    # Instantiate the device once and reuse it for both the kernel launch
    # and the dependency Event (the original constructed it twice).
    device = CUDADevice()
    ndrange = (length(map),)
    ev = _transfer_fr_input!(device)(
        dest, stack.input, map;
        ndrange=ndrange, dependencies=Event(device),
    )
    wait(ev)
    return dest
end

Expand Down Expand Up @@ -81,7 +87,7 @@ function LinearAlgebra.mul!(

ndrange = (n, p)
ev = _spmv_csr_kernel!(CUDADevice())(
Ys, Xs, A.colVal, A.rowPtr, A.nzVal, alpha, beta, n, m,
Ys, Xs, A.colVal, A.rowPtr, A.nzVal, alpha, beta, n, m;
ndrange=ndrange, dependencies=Event(CUDADevice()),
)
wait(ev)
Expand All @@ -108,7 +114,7 @@ function LinearAlgebra.mul!(

ndrange = (n, p, k)
ev = _spmv_blk_csr_kernel!(CUDADevice())(
Ys, Xs, A.colVal, A.rowPtr, A.nzVal, alpha, beta, n, m,
Ys, Xs, A.colVal, A.rowPtr, A.nzVal, alpha, beta, n, m;
ndrange=ndrange, dependencies=Event(CUDADevice()),
)
wait(ev)
Expand All @@ -132,7 +138,7 @@ function LinearAlgebra.mul!(

ndrange = (n, )
ev = _spmv_csr_kernel_double!(CUDADevice())(
Ys, X, A.colVal, A.rowPtr, A.nzVal, alpha, beta, n, m,
Ys, X, A.colVal, A.rowPtr, A.nzVal, alpha, beta, n, m;
ndrange=ndrange, dependencies=Event(CUDADevice()),
)
wait(ev)
Expand All @@ -158,7 +164,7 @@ function LinearAlgebra.mul!(

ndrange = (n, k)
ev = _spmv_blk_csr_kernel_double!(CUDADevice())(
Ys, X, A.colVal, A.rowPtr, A.nzVal, alpha, beta, n, m,
Ys, X, A.colVal, A.rowPtr, A.nzVal, alpha, beta, n, m;
ndrange=ndrange, dependencies=Event(CUDADevice()),
)
wait(ev)
Expand Down Expand Up @@ -268,7 +274,10 @@ function blockcopy!(stack::BlockNetworkStack, map::CuArray{Int}, x::CuArray{Floa
@assert length(map) % nx == 0
nb = div(length(map), nx)
ndrange = (nx, nb)
ev = _blk_transfer_to_input!(CUDADevice())(stack.input, map, x, nx, ndrange=ndrange)
ev = _blk_transfer_to_input!(CUDADevice())(
stack.input, map, x, nx;
ndrange=ndrange, dependencies=Event(CUDADevice()),
)
wait(ev)
end

0 comments on commit 12daa4c

Please sign in to comment.