From fd82b5a678a49e2574fb10decceee65f6639f53c Mon Sep 17 00:00:00 2001 From: Kali Uday Date: Tue, 1 Nov 2022 22:16:47 +0530 Subject: [PATCH 01/21] added nrm2 base structure --- deps/onemkl.cpp | 15 +++++++++++++++ deps/onemkl.h | 6 ++++++ lib/mkl/libonemkl.jl | 18 ++++++++++++++++++ lib/mkl/wrappers.jl | 18 +++++++++++++++++- 4 files changed, 56 insertions(+), 1 deletion(-) diff --git a/deps/onemkl.cpp b/deps/onemkl.cpp index ba9654ba..b7c79194 100644 --- a/deps/onemkl.cpp +++ b/deps/onemkl.cpp @@ -81,6 +81,21 @@ extern "C" int onemklZgemm(syclQueue_t device_queue, onemklTranspose transA, return 0; } +extern "C" void onemklDnrm2(syclQueue_t device_queue, int64_t n, const double *x, int64_t incx, double *result) { + oneapi::mkl::blas::column_major::nrm2(device_queue->val, n, x, incx, result); +} + +extern "C" void onemklSnrm2(syclQueue_t device_queue, int64_t n, const float *x, int64_t incx, float *result) { + oneapi::mkl::blas::column_major::nrm2(device_queue->val, n, x, incx, result); +} + +extern "C" void onemklCnrm2(syclQueue_t device_queue, int64_t n, const std::complex *x, int64_t incx, float *result) { + oneapi::mkl::blas::column_major::nrm2(device_queue->val, n, x, incx, result); +} + +extern "C" void onemklZnrm2(syclQueue_t device_queue, int64_t n, const std::complex *x, int64_t incx, double *result) { + oneapi::mkl::blas::column_major::nrm2(device_queue->val, n, x, incx, result); +} // other diff --git a/deps/onemkl.h b/deps/onemkl.h index 7e7e065b..de6aa9a9 100644 --- a/deps/onemkl.h +++ b/deps/onemkl.h @@ -39,6 +39,12 @@ int onemklZgemm(syclQueue_t device_queue, onemklTranspose transA, const double _Complex *B, int64_t ldb, double _Complex beta, double _Complex *C, int64_t ldc); +# level-1: nrm2 +void onemklDnrm2(syclQueue_t device_queue, int64_t n, const double *x, int64_t incx, double *result); +void onemklSnrm2(syclQueue_t device_queue, int64_t n, const float *x, int64_t incx, float *result); +void onemklCnrm2(syclQueue_t device_queue, int64_t n, const std::complex *x, int64_t incx, float *result); +void onemklZnrm2(syclQueue_t device_queue, int64_t n, const std::complex *x, int64_t incx, double *result); + void onemklDestroy(); #ifdef __cplusplus } diff --git a/lib/mkl/libonemkl.jl b/lib/mkl/libonemkl.jl index 042b5e3a..70e862d3 100644 --- a/lib/mkl/libonemkl.jl +++ b/lib/mkl/libonemkl.jl @@ -41,3 +41,21 @@ function onemklZgemm(device_queue, transA, transB, m, n, k, alpha, A, lda, B, ld B::ZePtr{ComplexF64}, ldb::Int64, beta::ComplexF64, C::ZePtr{ComplexF64}, ldc::Int64)::Cint end + +function onemklDnrm2(device_queue, n, x, incx, result) + @ccall liboneapi_support.onemklDnrm2(device_queue::syclQueue_t, n::Int64, x::ZePtr{Cdouble}, incx::Int64, result::ZePtr{Cdouble}) +end + +function onemklSnrm2(device_queue, n, x, incx, result) + @ccall liboneapi_support.onemklDnrm2(device_queue::syclQueue_t, n::Int64, x::ZePtr{Cfloat}, incx::Int64, result::ZePtr{Cfloat}) +end + +function onemklCnrm2(device_queue, n, x, incx, result) + @ccall liboneapi_support.onemklDnrm2(device_queue::syclQueue_t, n::Int64, x::ZePtr{ComplexF32}, incx::Int64, result::ZePtr{ComplexF32}) +end + +function onemklZnrm2(device_queue, n, x, incx, result) + @ccall liboneapi_support.onemklDnrm2(device_queue::syclQueue_t, n::Int64, x::ZePtr{ComplexF64}, incx::Int64, result::ZePtr{ComplexF64}) +end + + diff --git a/lib/mkl/wrappers.jl b/lib/mkl/wrappers.jl index 64da7c37..ca89e3b1 100644 --- a/lib/mkl/wrappers.jl +++ b/lib/mkl/wrappers.jl @@ -14,7 +14,23 @@ function Base.convert(::Type{onemklTranspose}, trans::Char) end end - +# level 1 +## nrm2 +for (fname, elty) in + ((:onemklDnrm2, :Float64), + (:onemklSnrm2, :Float32), + (:onemklZnrm2, :ComplexF32), + (:onemklZnrm2, :ComplexF64)) + @eval begin + function nrm2!(n::Integer, + x::StridedArray{$elty}, + result::StridedArray{$elty}) + queue = global_queue(context(x), device(x)) + $fname(sycl_queue(queue), n, x, stride(x,1), result) + result + end + end +end # # BLAS From e29ef0344615620151f6527fcbf496357d4ec21c Mon Sep 17 00:00:00 2001 From: Kali Uday Date: Wed, 2 Nov 2022 00:49:23 +0530 Subject: [PATCH 02/21] * build_local was successful but runtime errors observed * precompile() fails with similar error --- deps/onemkl.cpp | 8 ++++---- deps/onemkl.h | 6 ++---- lib/mkl/libonemkl.jl | 4 ++-- test/onemkl.jl | 18 ++++++++++++++++++ 4 files changed, 26 insertions(+), 10 deletions(-) create mode 100644 test/onemkl.jl diff --git a/deps/onemkl.cpp b/deps/onemkl.cpp index b7c79194..e94bca9c 100644 --- a/deps/onemkl.cpp +++ b/deps/onemkl.cpp @@ -89,12 +89,12 @@ extern "C" void onemklSnrm2(syclQueue_t device_queue, int64_t n, const float *x, oneapi::mkl::blas::column_major::nrm2(device_queue->val, n, x, incx, result); } -extern "C" void onemklCnrm2(syclQueue_t device_queue, int64_t n, const std::complex *x, int64_t incx, float *result) { - oneapi::mkl::blas::column_major::nrm2(device_queue->val, n, x, incx, result); +extern "C" void onemklCnrm2(syclQueue_t device_queue, int64_t n, const float _Complex *x, int64_t incx, float *result) { + oneapi::mkl::blas::column_major::nrm2(device_queue->val, n, reinterpret_cast *>(x), incx, result); } -extern "C" void onemklZnrm2(syclQueue_t device_queue, int64_t n, const std::complex *x, int64_t incx, double *result) { - oneapi::mkl::blas::column_major::nrm2(device_queue->val, n, x, incx, result); +extern "C" void onemklZnrm2(syclQueue_t device_queue, int64_t n, const double _Complex *x, int64_t incx, double *result) { + oneapi::mkl::blas::column_major::nrm2(device_queue->val, n, reinterpret_cast *>(x), incx, result); } // other diff --git a/deps/onemkl.h b/deps/onemkl.h index de6aa9a9..dbaa2031 100644 --- a/deps/onemkl.h +++ b/deps/onemkl.h @@ -38,12 +38,10 @@ int onemklZgemm(syclQueue_t device_queue, onemklTranspose transA, double _Complex alpha, const double _Complex *A, int64_t lda, const double _Complex *B, int64_t ldb, double _Complex beta, double _Complex *C, int64_t ldc); - -# level-1: nrm2 void onemklDnrm2(syclQueue_t device_queue, int64_t n, const double *x, int64_t incx, double *result); void onemklSnrm2(syclQueue_t device_queue, int64_t n, const float *x, int64_t incx, float *result); -void onemklCnrm2(syclQueue_t device_queue, int64_t n, const std::complex *x, int64_t incx, float *result); -void onemklZnrm2(syclQueue_t device_queue, int64_t n, const std::complex *x, int64_t incx, double *result); +void onemklCnrm2(syclQueue_t device_queue, int64_t n, const float _Complex *x, int64_t incx, float *result); +void onemklZnrm2(syclQueue_t device_queue, int64_t n, const double _Complex *x, int64_t incx, double *result); void onemklDestroy(); #ifdef __cplusplus diff --git a/lib/mkl/libonemkl.jl b/lib/mkl/libonemkl.jl index 70e862d3..fc314c6d 100644 --- a/lib/mkl/libonemkl.jl +++ b/lib/mkl/libonemkl.jl @@ -51,11 +51,11 @@ function onemklSnrm2(device_queue, n, x, incx, result) end function onemklCnrm2(device_queue, n, x, incx, result) - @ccall liboneapi_support.onemklDnrm2(device_queue::syclQueue_t, n::Int64, x::ZePtr{ComplexF32}, incx::Int64, result::ZePtr{ComplexF32}) + @ccall liboneapi_support.onemklDnrm2(device_queue::syclQueue_t, n::Int64, x::ZePtr{ComplexF32}, incx::Int64, result::ZePtr{Cfloat}) end function onemklZnrm2(device_queue, n, x, incx, result) - @ccall liboneapi_support.onemklDnrm2(device_queue::syclQueue_t, n::Int64, x::ZePtr{ComplexF64}, incx::Int64, result::ZePtr{ComplexF64}) + @ccall liboneapi_support.onemklDnrm2(device_queue::syclQueue_t, n::Int64, x::ZePtr{ComplexF64}, incx::Int64, result::ZePtr{Cdouble}) end diff --git a/test/onemkl.jl b/test/onemkl.jl new file mode 100644 index 00000000..ebf80cc1 --- /dev/null +++ b/test/onemkl.jl @@ -0,0 +1,18 @@ +using oneAPI +using oneAPI.oneMKL +using LinearAlgebra + +m = 20 +n = 35 +k = 13 + +################ +@testset "level 1" begin + @testset for T in intersect(eltypes, [Float32, Float64, ComplexF32, ComplexF64]) + A = rand(T,m) + gpuA = oneArray(A) + res = oneArray(A) + #oneMKL.nrm2(m, gpuA, res) + @show res + end +end From a7f9678fa3e21b1c2b79dac5c08dcd414e1f19ac Mon Sep 17 00:00:00 2001 From: Kali Uday Date: Wed, 2 Nov 2022 15:30:19 +0530 Subject: [PATCH 03/21] compilation successful --- deps/onemkl.h | 2 ++ lib/mkl/libonemkl.jl | 8 ++++---- lib/mkl/oneMKL.jl | 2 +- lib/mkl/wrappers.jl | 13 ++++--------- test/onemkl.jl | 10 ++++------ 5 files changed, 15 insertions(+), 20 deletions(-) diff --git a/deps/onemkl.h b/deps/onemkl.h index dbaa2031..7febe777 100644 --- a/deps/onemkl.h +++ b/deps/onemkl.h @@ -38,6 +38,8 @@ int onemklZgemm(syclQueue_t device_queue, onemklTranspose transA, double _Complex alpha, const double _Complex *A, int64_t lda, const double _Complex *B, int64_t ldb, double _Complex beta, double _Complex *C, int64_t ldc); + +// Supported Level-1: Nrm2 void onemklDnrm2(syclQueue_t device_queue, int64_t n, const double *x, int64_t incx, double *result); void onemklSnrm2(syclQueue_t device_queue, int64_t n, const float *x, int64_t incx, float *result); void onemklCnrm2(syclQueue_t device_queue, int64_t n, const float _Complex *x, int64_t incx, float *result); diff --git a/lib/mkl/libonemkl.jl b/lib/mkl/libonemkl.jl index fc314c6d..a9228036 100644 --- a/lib/mkl/libonemkl.jl +++ b/lib/mkl/libonemkl.jl @@ -43,19 +43,19 @@ function onemklZgemm(device_queue, transA, transB, m, n, k, alpha, A, lda, B, ld end function onemklDnrm2(device_queue, n, x, incx, result) - @ccall liboneapi_support.onemklDnrm2(device_queue::syclQueue_t, n::Int64, x::ZePtr{Cdouble}, incx::Int64, result::ZePtr{Cdouble}) + @ccall liboneapi_support.onemklDnrm2(device_queue::syclQueue_t, n::Int64, x::ZePtr{Cdouble}, incx::Int64, result::ZePtr{Cdouble})::Cvoid end function onemklSnrm2(device_queue, n, x, incx, result) - @ccall liboneapi_support.onemklDnrm2(device_queue::syclQueue_t, n::Int64, x::ZePtr{Cfloat}, incx::Int64, result::ZePtr{Cfloat}) + @ccall liboneapi_support.onemklSnrm2(device_queue::syclQueue_t, n::Int64, x::ZePtr{Cfloat}, incx::Int64, result::ZePtr{Cfloat})::Cvoid end function onemklCnrm2(device_queue, n, x, incx, result) - @ccall liboneapi_support.onemklDnrm2(device_queue::syclQueue_t, n::Int64, x::ZePtr{ComplexF32}, incx::Int64, result::ZePtr{Cfloat}) + @ccall liboneapi_support.onemklCnrm2(device_queue::syclQueue_t, n::Int64, x::ZePtr{ComplexF32}, incx::Int64, result::ZePtr{Cfloat})::Cvoid end function onemklZnrm2(device_queue, n, x, incx, result) - @ccall liboneapi_support.onemklDnrm2(device_queue::syclQueue_t, n::Int64, x::ZePtr{ComplexF64}, incx::Int64, result::ZePtr{Cdouble}) + @ccall liboneapi_support.onemklZnrm2(device_queue::syclQueue_t, n::Int64, x::ZePtr{ComplexF64}, incx::Int64, result::ZePtr{Cdouble})::Cvoid end diff --git a/lib/mkl/oneMKL.jl b/lib/mkl/oneMKL.jl index d83f2141..41dfbea0 100644 --- a/lib/mkl/oneMKL.jl +++ b/lib/mkl/oneMKL.jl @@ -12,7 +12,7 @@ using GPUArrays include("libonemkl.jl") -const onemklFloat = Union{Float64,Float32,Float16,ComplexF64,ComplexF32} +const onemklFloat = Union{Float64,Float32,ComplexF64,ComplexF32} include("wrappers.jl") include("linalg.jl") diff --git a/lib/mkl/wrappers.jl b/lib/mkl/wrappers.jl index ca89e3b1..81c1306f 100644 --- a/lib/mkl/wrappers.jl +++ b/lib/mkl/wrappers.jl @@ -19,25 +19,20 @@ end for (fname, elty) in ((:onemklDnrm2, :Float64), (:onemklSnrm2, :Float32), - (:onemklZnrm2, :ComplexF32), + (:onemklCnrm2, :ComplexF32), (:onemklZnrm2, :ComplexF64)) @eval begin function nrm2!(n::Integer, x::StridedArray{$elty}, - result::StridedArray{$elty}) + result::StridedArray{$elty}) queue = global_queue(context(x), device(x)) + #result = Ref{$ret_type}() $fname(sycl_queue(queue), n, x, stride(x,1), result) - result + result end end end -# -# BLAS -# - -# level 3 - for (fname, elty) in ((:onemklDgemm,:Float64), (:onemklSgemm,:Float32), diff --git a/test/onemkl.jl b/test/onemkl.jl index ebf80cc1..65207988 100644 --- a/test/onemkl.jl +++ b/test/onemkl.jl @@ -8,11 +8,9 @@ k = 13 ################ @testset "level 1" begin - @testset for T in intersect(eltypes, [Float32, Float64, ComplexF32, ComplexF64]) - A = rand(T,m) - gpuA = oneArray(A) - res = oneArray(A) - #oneMKL.nrm2(m, gpuA, res) - @show res + @testset for T in eltypes + if T <:oneMKL.onemklFloat + println(T) + end end end From f41d48055d997c4b83c4e4318fdd55b1459af04b Mon Sep 17 00:00:00 2001 From: Kali Uday Date: Wed, 2 Nov 2022 17:58:42 +0530 Subject: [PATCH 04/21] bug fixes and linalg.jl updated facing oneMKL native crash with wrong args --- lib/mkl/libonemkl.jl | 8 ++++---- lib/mkl/linalg.jl | 6 ++++++ lib/mkl/wrappers.jl | 20 ++++++++++---------- test/onemkl.jl | 3 ++- 4 files changed, 22 insertions(+), 15 deletions(-) diff --git a/lib/mkl/libonemkl.jl b/lib/mkl/libonemkl.jl index a9228036..01366836 100644 --- a/lib/mkl/libonemkl.jl +++ b/lib/mkl/libonemkl.jl @@ -43,19 +43,19 @@ function onemklZgemm(device_queue, transA, transB, m, n, k, alpha, A, lda, B, ld end function onemklDnrm2(device_queue, n, x, incx, result) - @ccall liboneapi_support.onemklDnrm2(device_queue::syclQueue_t, n::Int64, x::ZePtr{Cdouble}, incx::Int64, result::ZePtr{Cdouble})::Cvoid + @ccall liboneapi_support.onemklDnrm2(device_queue::syclQueue_t, n::Int64, x::ZePtr{Cdouble}, incx::Int64, result::Ptr{Cdouble})::Cvoid end function onemklSnrm2(device_queue, n, x, incx, result) - @ccall liboneapi_support.onemklSnrm2(device_queue::syclQueue_t, n::Int64, x::ZePtr{Cfloat}, incx::Int64, result::ZePtr{Cfloat})::Cvoid + @ccall liboneapi_support.onemklSnrm2(device_queue::syclQueue_t, n::Int64, x::ZePtr{Cfloat}, incx::Int64, result::Ptr{Cfloat})::Cvoid end function onemklCnrm2(device_queue, n, x, incx, result) - @ccall liboneapi_support.onemklCnrm2(device_queue::syclQueue_t, n::Int64, x::ZePtr{ComplexF32}, incx::Int64, result::ZePtr{Cfloat})::Cvoid + @ccall liboneapi_support.onemklCnrm2(device_queue::syclQueue_t, n::Int64, x::ZePtr{ComplexF32}, incx::Int64, result::Ptr{Cfloat})::Cvoid end function onemklZnrm2(device_queue, n, x, incx, result) - @ccall liboneapi_support.onemklZnrm2(device_queue::syclQueue_t, n::Int64, x::ZePtr{ComplexF64}, incx::Int64, result::ZePtr{Cdouble})::Cvoid + @ccall liboneapi_support.onemklZnrm2(device_queue::syclQueue_t, n::Int64, x::ZePtr{ComplexF64}, incx::Int64, result::Ptr{Cdouble})::Cvoid end diff --git a/lib/mkl/linalg.jl b/lib/mkl/linalg.jl index d1d6ae6b..b20ef7b0 100644 --- a/lib/mkl/linalg.jl +++ b/lib/mkl/linalg.jl @@ -49,6 +49,12 @@ function gemm_dispatch!(C::oneStridedVecOrMat, A, B, alpha::Number=true, beta::N end end +LinearAlgebra.norm(x::oneStridedVecOrMat{<:onemklFloat}, n::Number) = + oneMKL.nrm2!(length(x), x) + +LinearAlgebra.norm(x::oneStridedVecOrMat{<:onemklFloat}, n::Real) = + invoke(norm, Tuple{typeof(x), Number}, x, n) + for NT in (Number, Real) # NOTE: alpha/beta also ::Real to avoid ambiguities with certain Base methods @eval begin diff --git a/lib/mkl/wrappers.jl b/lib/mkl/wrappers.jl index 81c1306f..4e1a199d 100644 --- a/lib/mkl/wrappers.jl +++ b/lib/mkl/wrappers.jl @@ -16,23 +16,23 @@ end # level 1 ## nrm2 -for (fname, elty) in - ((:onemklDnrm2, :Float64), - (:onemklSnrm2, :Float32), - (:onemklCnrm2, :ComplexF32), - (:onemklZnrm2, :ComplexF64)) +for (fname, elty, ret_type) in + ((:onemklDnrm2, :Float64,:Float64), + (:onemklSnrm2, :Float32,:Float32), + (:onemklCnrm2, :ComplexF32,:Float32), + (:onemklZnrm2, :ComplexF64,:Float64)) @eval begin function nrm2!(n::Integer, - x::StridedArray{$elty}, - result::StridedArray{$elty}) + x::StridedArray{$elty}) queue = global_queue(context(x), device(x)) - #result = Ref{$ret_type}() - $fname(sycl_queue(queue), n, x, stride(x,1), result) - result + result = Ref{$ret_type}() + $fname(sycl_queue(queue), n, x, stride(x,1), result) + return result[] end end end + for (fname, elty) in ((:onemklDgemm,:Float64), (:onemklSgemm,:Float32), diff --git a/test/onemkl.jl b/test/onemkl.jl index 65207988..16e57fd5 100644 --- a/test/onemkl.jl +++ b/test/onemkl.jl @@ -10,7 +10,8 @@ k = 13 @testset "level 1" begin @testset for T in eltypes if T <:oneMKL.onemklFloat - println(T) + A = rand(T,m) + @test testf(norm, A, m) end end end From cf3923622c425452c822be32d0b70157f3285ed3 Mon Sep 17 00:00:00 2001 From: Kali Uday Date: Wed, 2 Nov 2022 18:04:16 +0530 Subject: [PATCH 05/21] NITS --- lib/mkl/wrappers.jl | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/lib/mkl/wrappers.jl b/lib/mkl/wrappers.jl index 4e1a199d..921de8e9 100644 --- a/lib/mkl/wrappers.jl +++ b/lib/mkl/wrappers.jl @@ -32,6 +32,11 @@ for (fname, elty, ret_type) in end end +# +# BLAS +# + +# level 3 for (fname, elty) in ((:onemklDgemm,:Float64), From 4ace1b4168419fa659013ea51865ce3a727dc4fa Mon Sep 17 00:00:00 2001 From: Kali Uday Date: Thu, 3 Nov 2022 17:51:03 +0530 Subject: [PATCH 06/21] create sycl ptr manually for result --- deps/onemkl.cpp | 14 ++++++++++---- deps/onemkl.h | 4 ++-- lib/mkl/libonemkl.jl | 9 +++++---- lib/mkl/linalg.jl | 2 +- lib/mkl/wrappers.jl | 4 ++-- test/onemkl.jl | 2 ++ 6 files changed, 22 insertions(+), 13 deletions(-) diff --git a/deps/onemkl.cpp b/deps/onemkl.cpp index e94bca9c..e8f0732b 100644 --- a/deps/onemkl.cpp +++ b/deps/onemkl.cpp @@ -81,12 +81,18 @@ extern "C" int onemklZgemm(syclQueue_t device_queue, onemklTranspose transA, return 0; } -extern "C" void onemklDnrm2(syclQueue_t device_queue, int64_t n, const double *x, int64_t incx, double *result) { - oneapi::mkl::blas::column_major::nrm2(device_queue->val, n, x, incx, result); +extern "C" void onemklDnrm2(syclQueue_t device_queue, syclContext_t ctx, syclDevice_t dev, int64_t n, const double *x, int64_t incx, double *result) { + auto result_p = sycl::malloc_shared(1, dev->val, ctx->val); + auto status = oneapi::mkl::blas::column_major::nrm2(device_queue->val, n, x, incx, result_p); + status.wait(); + *result = *result_p; } -extern "C" void onemklSnrm2(syclQueue_t device_queue, int64_t n, const float *x, int64_t incx, float *result) { - oneapi::mkl::blas::column_major::nrm2(device_queue->val, n, x, incx, result); +extern "C" void onemklSnrm2(syclQueue_t device_queue, syclContext_t ctx, syclDevice_t dev, int64_t n, const float *x, int64_t incx, float *result) { + auto result_p = sycl::malloc_shared(1, dev->val, ctx->val); + auto status = oneapi::mkl::blas::column_major::nrm2(device_queue->val, n, x, incx, result_p); + status.wait(); + *result = *result_p; } extern "C" void onemklCnrm2(syclQueue_t device_queue, int64_t n, const float _Complex *x, int64_t incx, float *result) { diff --git a/deps/onemkl.h b/deps/onemkl.h index 7febe777..dfe59981 100644 --- a/deps/onemkl.h +++ b/deps/onemkl.h @@ -40,8 +40,8 @@ int onemklZgemm(syclQueue_t device_queue, onemklTranspose transA, double _Complex *C, int64_t ldc); // Supported Level-1: Nrm2 -void onemklDnrm2(syclQueue_t device_queue, int64_t n, const double *x, int64_t incx, double *result); -void onemklSnrm2(syclQueue_t device_queue, int64_t n, const float *x, int64_t incx, float *result); +void onemklDnrm2(syclQueue_t device_queue, syclContext_t ctx, syclDevice_t dev, int64_t n, const double *x, int64_t incx, double *result); +void onemklSnrm2(syclQueue_t device_queue, syclContext_t ctx, syclDevice_t dev, int64_t n, const float *x, int64_t incx, float *result); void onemklCnrm2(syclQueue_t device_queue, int64_t n, const float _Complex *x, int64_t incx, float *result); void onemklZnrm2(syclQueue_t device_queue, int64_t n, const double _Complex *x, int64_t incx, double *result); diff --git a/lib/mkl/libonemkl.jl b/lib/mkl/libonemkl.jl index 01366836..da85c2f8 100644 --- a/lib/mkl/libonemkl.jl +++ b/lib/mkl/libonemkl.jl @@ -1,4 +1,5 @@ using CEnum +using oneAPI.SYCL: syclQueue_t, syclContext_t, syclDevice_t @cenum onemklTranspose::UInt32 begin ONEMKL_TRANSPOSE_NONTRANS = 0 @@ -42,12 +43,12 @@ function onemklZgemm(device_queue, transA, transB, m, n, k, alpha, A, lda, B, ld C::ZePtr{ComplexF64}, ldc::Int64)::Cint end -function onemklDnrm2(device_queue, n, x, incx, result) - @ccall liboneapi_support.onemklDnrm2(device_queue::syclQueue_t, n::Int64, x::ZePtr{Cdouble}, incx::Int64, result::Ptr{Cdouble})::Cvoid +function onemklDnrm2(device_queue, ctx, dev, n, x, incx, result) + @ccall liboneapi_support.onemklDnrm2(device_queue::syclQueue_t, ctx::syclContext_t, dev::syclDevice_t, n::Int64, x::ZePtr{Cdouble}, incx::Int64, result::Ref{Cdouble})::Cvoid end -function onemklSnrm2(device_queue, n, x, incx, result) - @ccall liboneapi_support.onemklSnrm2(device_queue::syclQueue_t, n::Int64, x::ZePtr{Cfloat}, incx::Int64, result::Ptr{Cfloat})::Cvoid +function onemklSnrm2(device_queue, ctx, dev, n, x, incx, result) + @ccall liboneapi_support.onemklSnrm2(device_queue::syclQueue_t, ctx::syclContext_t, dev::syclDevice_t, n::Int64, x::ZePtr{Cfloat}, incx::Int64, result::Ref{Cfloat})::Cvoid end function onemklCnrm2(device_queue, n, x, incx, result) diff --git a/lib/mkl/linalg.jl b/lib/mkl/linalg.jl index b20ef7b0..a46c4d21 100644 --- a/lib/mkl/linalg.jl +++ b/lib/mkl/linalg.jl @@ -50,7 +50,7 @@ function gemm_dispatch!(C::oneStridedVecOrMat, A, B, alpha::Number=true, beta::N end LinearAlgebra.norm(x::oneStridedVecOrMat{<:onemklFloat}, n::Number) = - oneMKL.nrm2!(length(x), x) + oneMKL.nrm2(length(x), x) LinearAlgebra.norm(x::oneStridedVecOrMat{<:onemklFloat}, n::Real) = invoke(norm, Tuple{typeof(x), Number}, x, n) diff --git a/lib/mkl/wrappers.jl b/lib/mkl/wrappers.jl index 921de8e9..34af55ac 100644 --- a/lib/mkl/wrappers.jl +++ b/lib/mkl/wrappers.jl @@ -22,11 +22,11 @@ for (fname, elty, ret_type) in (:onemklCnrm2, :ComplexF32,:Float32), (:onemklZnrm2, :ComplexF64,:Float64)) @eval begin - function nrm2!(n::Integer, + function nrm2(n::Integer, x::StridedArray{$elty}) queue = global_queue(context(x), device(x)) result = Ref{$ret_type}() - $fname(sycl_queue(queue), n, x, stride(x,1), result) + $fname(sycl_queue(queue), sycl_context(context(x), device(x)), sycl_device(device(x)), n, x, stride(x,1), result) return result[] end end diff --git a/test/onemkl.jl b/test/onemkl.jl index 16e57fd5..90bd3cb2 100644 --- a/test/onemkl.jl +++ b/test/onemkl.jl @@ -11,6 +11,8 @@ k = 13 @testset for T in eltypes if T <:oneMKL.onemklFloat A = rand(T,m) + gpuA = oneArray(A) + #println(oneMKL.nrm2!(m, gpuA)) @test testf(norm, A, m) end end From 52518696eb7fbff27d2f5a9cfa2b79ba231775a6 Mon Sep 17 00:00:00 2001 From: Kali Uday Date: Thu, 3 Nov 2022 21:38:51 +0530 Subject: [PATCH 07/21] testf enabled --- test/onemkl.jl | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/test/onemkl.jl b/test/onemkl.jl index 90bd3cb2..28145af4 100644 --- a/test/onemkl.jl +++ b/test/onemkl.jl @@ -11,9 +11,7 @@ k = 13 @testset for T in eltypes if T <:oneMKL.onemklFloat A = rand(T,m) - gpuA = oneArray(A) - #println(oneMKL.nrm2!(m, gpuA)) - @test testf(norm, A, m) + @test testf(norm, A) end end end From aea8ead5bba4641d8157a0a971e962d15443d039 Mon Sep 17 00:00:00 2001 From: Kali Uday Date: Thu, 3 Nov 2022 21:57:23 +0530 Subject: [PATCH 08/21] enabled compleF32 --- deps/onemkl.cpp | 7 +++++-- deps/onemkl.h | 2 +- lib/mkl/libonemkl.jl | 4 ++-- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/deps/onemkl.cpp b/deps/onemkl.cpp index e8f0732b..ad11fba1 100644 --- a/deps/onemkl.cpp +++ b/deps/onemkl.cpp @@ -95,8 +95,11 @@ extern "C" void onemklSnrm2(syclQueue_t device_queue, syclContext_t ctx, syclDev *result = *result_p; } -extern "C" void onemklCnrm2(syclQueue_t device_queue, int64_t n, const float _Complex *x, int64_t incx, float *result) { - oneapi::mkl::blas::column_major::nrm2(device_queue->val, n, reinterpret_cast *>(x), incx, result); +extern "C" void onemklCnrm2(syclQueue_t device_queue, syclContext_t ctx, syclDevice_t dev, int64_t n, const float _Complex *x, int64_t incx, float *result) { + auto result_p = sycl::malloc_shared(1, dev->val, ctx->val); + auto status = oneapi::mkl::blas::column_major::nrm2(device_queue->val, n, reinterpret_cast *>(x), incx, result_p); + status.wait(); + *result = *result_p; } extern "C" void onemklZnrm2(syclQueue_t device_queue, int64_t n, const double _Complex *x, int64_t incx, double *result) { diff --git a/deps/onemkl.h b/deps/onemkl.h index dfe59981..6cd82043 100644 --- a/deps/onemkl.h +++ b/deps/onemkl.h @@ -42,7 +42,7 @@ int onemklZgemm(syclQueue_t device_queue, onemklTranspose transA, // Supported Level-1: Nrm2 void onemklDnrm2(syclQueue_t device_queue, syclContext_t ctx, syclDevice_t dev, int64_t n, const double *x, int64_t incx, double *result); void onemklSnrm2(syclQueue_t device_queue, syclContext_t ctx, syclDevice_t dev, int64_t n, const float *x, int64_t incx, float *result); -void onemklCnrm2(syclQueue_t device_queue, int64_t n, const float _Complex *x, int64_t incx, float *result); +void onemklCnrm2(syclQueue_t device_queue, syclContext_t ctx, syclDevice_t dev, int64_t n, const float _Complex *x, int64_t incx, float *result); void onemklZnrm2(syclQueue_t device_queue, int64_t n, const double _Complex *x, int64_t incx, double *result); void onemklDestroy(); diff --git a/lib/mkl/libonemkl.jl b/lib/mkl/libonemkl.jl index da85c2f8..b19968d0 100644 --- a/lib/mkl/libonemkl.jl +++ b/lib/mkl/libonemkl.jl @@ -51,8 +51,8 @@ function onemklSnrm2(device_queue, ctx, dev, n, x, incx, result) @ccall liboneapi_support.onemklSnrm2(device_queue::syclQueue_t, ctx::syclContext_t, dev::syclDevice_t, n::Int64, x::ZePtr{Cfloat}, incx::Int64, result::Ref{Cfloat})::Cvoid end -function onemklCnrm2(device_queue, n, x, incx, result) - @ccall liboneapi_support.onemklCnrm2(device_queue::syclQueue_t, n::Int64, x::ZePtr{ComplexF32}, incx::Int64, result::Ptr{Cfloat})::Cvoid +function onemklCnrm2(device_queue, ctx, dev, n, x, incx, result) + @ccall liboneapi_support.onemklCnrm2(device_queue::syclQueue_t, ctx::syclContext_t, dev::syclDevice_t, n::Int64, x::ZePtr{ComplexF32}, incx::Int64, result::Ref{Cfloat})::Cvoid end function onemklZnrm2(device_queue, n, x, incx, result) From d6072b3508c1f747625cc3d35daa9bac7fac038c Mon Sep 17 00:00:00 2001 From: Kali Uday Date: Thu, 3 Nov 2022 22:00:45 +0530 Subject: [PATCH 09/21] enabled rest of the methods --- deps/onemkl.cpp | 7 +++++-- deps/onemkl.h | 2 +- lib/mkl/libonemkl.jl | 4 ++-- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/deps/onemkl.cpp b/deps/onemkl.cpp index ad11fba1..fbdf74a2 100644 --- a/deps/onemkl.cpp +++ b/deps/onemkl.cpp @@ -102,8 +102,11 @@ extern "C" void onemklCnrm2(syclQueue_t device_queue, syclContext_t ctx, syclDev *result = *result_p; } -extern "C" void onemklZnrm2(syclQueue_t device_queue, int64_t n, const double _Complex *x, int64_t incx, double *result) { - oneapi::mkl::blas::column_major::nrm2(device_queue->val, n, reinterpret_cast *>(x), incx, result); +extern "C" void onemklZnrm2(syclQueue_t device_queue, syclContext_t ctx, syclDevice_t dev, int64_t n, const double _Complex *x, int64_t incx, double *result) { + auto result_p = sycl::malloc_shared(1, dev->val, ctx->val); + auto status = oneapi::mkl::blas::column_major::nrm2(device_queue->val, n, reinterpret_cast *>(x), incx, result_p); + status.wait(); + *result = *result_p; } // other diff --git a/deps/onemkl.h b/deps/onemkl.h index 6cd82043..f3f92e8f 100644 --- a/deps/onemkl.h +++ b/deps/onemkl.h @@ -43,7 +43,7 @@ int onemklZgemm(syclQueue_t device_queue, onemklTranspose transA, void onemklDnrm2(syclQueue_t device_queue, syclContext_t ctx, syclDevice_t dev, int64_t n, const double *x, int64_t incx, double *result); void onemklSnrm2(syclQueue_t device_queue, syclContext_t ctx, syclDevice_t dev, int64_t n, const float *x, int64_t incx, float *result); void onemklCnrm2(syclQueue_t device_queue, syclContext_t ctx, syclDevice_t dev, int64_t n, const float _Complex *x, int64_t incx, float *result); -void onemklZnrm2(syclQueue_t device_queue, int64_t n, const double _Complex *x, int64_t incx, double *result); +void onemklZnrm2(syclQueue_t device_queue, syclContext_t ctx, syclDevice_t dev, int64_t n, const double _Complex *x, int64_t incx, double *result); void onemklDestroy(); #ifdef __cplusplus diff --git a/lib/mkl/libonemkl.jl b/lib/mkl/libonemkl.jl index b19968d0..62f9b705 100644 --- a/lib/mkl/libonemkl.jl +++ b/lib/mkl/libonemkl.jl @@ -55,8 +55,8 @@ function onemklCnrm2(device_queue, ctx, dev, n, x, incx, result) @ccall liboneapi_support.onemklCnrm2(device_queue::syclQueue_t, ctx::syclContext_t, dev::syclDevice_t, n::Int64, x::ZePtr{ComplexF32}, incx::Int64, result::Ref{Cfloat})::Cvoid end -function onemklZnrm2(device_queue, n, x, incx, result) - @ccall liboneapi_support.onemklZnrm2(device_queue::syclQueue_t, n::Int64, x::ZePtr{ComplexF64}, incx::Int64, result::Ptr{Cdouble})::Cvoid +function onemklZnrm2(device_queue, ctx, dev, n, x, incx, result) + @ccall liboneapi_support.onemklZnrm2(device_queue::syclQueue_t, ctx::syclContext_t, dev::syclDevice_t, n::Int64, x::ZePtr{ComplexF64}, incx::Int64, result::Ref{Cdouble})::Cvoid end From a27f95bee56d7d598203b926113df14c29e488fd Mon Sep 17 00:00:00 2001 From: Kali Uday Date: Fri, 4 Nov 2022 11:19:23 +0530 Subject: [PATCH 10/21] 1. results dev array created at wrapper 2. onestrided array used --- deps/onemkl.cpp | 16 ++++------------ lib/mkl/libonemkl.jl | 8 ++++---- lib/mkl/wrappers.jl | 7 ++++--- 3 files changed, 12 insertions(+), 19 deletions(-) diff --git a/deps/onemkl.cpp b/deps/onemkl.cpp index fbdf74a2..2f95d845 100644 --- a/deps/onemkl.cpp +++ b/deps/onemkl.cpp @@ -82,31 +82,23 @@ extern "C" int onemklZgemm(syclQueue_t device_queue, onemklTranspose transA, } extern "C" void onemklDnrm2(syclQueue_t device_queue, syclContext_t ctx, syclDevice_t dev, int64_t n, const double *x, int64_t incx, double *result) { - auto result_p = sycl::malloc_shared(1, dev->val, ctx->val); - auto status = oneapi::mkl::blas::column_major::nrm2(device_queue->val, n, x, incx, result_p); + auto status = oneapi::mkl::blas::column_major::nrm2(device_queue->val, n, x, incx, result); status.wait(); - *result = *result_p; } extern "C" void onemklSnrm2(syclQueue_t device_queue, syclContext_t ctx, syclDevice_t dev, int64_t n, const float *x, int64_t incx, float *result) { - auto result_p = sycl::malloc_shared(1, dev->val, ctx->val); - auto status = oneapi::mkl::blas::column_major::nrm2(device_queue->val, n, x, incx, result_p); + auto status = oneapi::mkl::blas::column_major::nrm2(device_queue->val, n, x, incx, result); status.wait(); - *result = *result_p; } extern "C" void onemklCnrm2(syclQueue_t device_queue, syclContext_t ctx, syclDevice_t dev, int64_t n, const float _Complex *x, int64_t incx, float *result) { - auto result_p = sycl::malloc_shared(1, dev->val, ctx->val); - auto status = oneapi::mkl::blas::column_major::nrm2(device_queue->val, n, reinterpret_cast *>(x), incx, result_p); + auto status = oneapi::mkl::blas::column_major::nrm2(device_queue->val, n, reinterpret_cast *>(x), incx, result); status.wait(); - *result = *result_p; } extern "C" void onemklZnrm2(syclQueue_t device_queue, syclContext_t ctx, syclDevice_t dev, int64_t n, const double _Complex *x, int64_t incx, double *result) { - auto result_p = sycl::malloc_shared(1, dev->val, ctx->val); - auto status = oneapi::mkl::blas::column_major::nrm2(device_queue->val, n, reinterpret_cast *>(x), incx, result_p); + auto status = oneapi::mkl::blas::column_major::nrm2(device_queue->val, n, reinterpret_cast *>(x), incx, result); status.wait(); - *result = *result_p; } // other diff --git a/lib/mkl/libonemkl.jl b/lib/mkl/libonemkl.jl index 62f9b705..8b0c2c9a 100644 --- a/lib/mkl/libonemkl.jl +++ b/lib/mkl/libonemkl.jl @@ -44,19 +44,19 @@ function onemklZgemm(device_queue, transA, transB, m, n, k, alpha, A, lda, B, ld end function onemklDnrm2(device_queue, ctx, dev, n, x, incx, result) - @ccall liboneapi_support.onemklDnrm2(device_queue::syclQueue_t, ctx::syclContext_t, dev::syclDevice_t, n::Int64, x::ZePtr{Cdouble}, incx::Int64, result::Ref{Cdouble})::Cvoid + @ccall liboneapi_support.onemklDnrm2(device_queue::syclQueue_t, ctx::syclContext_t, dev::syclDevice_t, n::Int64, x::ZePtr{Cdouble}, incx::Int64, result::RefOrZeRef{Cdouble})::Cvoid end function onemklSnrm2(device_queue, ctx, dev, n, x, incx, result) - @ccall liboneapi_support.onemklSnrm2(device_queue::syclQueue_t, ctx::syclContext_t, dev::syclDevice_t, n::Int64, x::ZePtr{Cfloat}, incx::Int64, result::Ref{Cfloat})::Cvoid + @ccall liboneapi_support.onemklSnrm2(device_queue::syclQueue_t, ctx::syclContext_t, dev::syclDevice_t, n::Int64, x::ZePtr{Cfloat}, incx::Int64, result::RefOrZeRef{Cfloat})::Cvoid end function onemklCnrm2(device_queue, ctx, dev, n, x, incx, result) - @ccall liboneapi_support.onemklCnrm2(device_queue::syclQueue_t, ctx::syclContext_t, dev::syclDevice_t, n::Int64, x::ZePtr{ComplexF32}, incx::Int64, result::Ref{Cfloat})::Cvoid + @ccall liboneapi_support.onemklCnrm2(device_queue::syclQueue_t, ctx::syclContext_t, dev::syclDevice_t, n::Int64, x::ZePtr{ComplexF32}, incx::Int64, result::RefOrZeRef{Cfloat})::Cvoid end function onemklZnrm2(device_queue, ctx, dev, n, x, incx, result) - @ccall liboneapi_support.onemklZnrm2(device_queue::syclQueue_t, ctx::syclContext_t, dev::syclDevice_t, n::Int64, x::ZePtr{ComplexF64}, incx::Int64, result::Ref{Cdouble})::Cvoid + @ccall liboneapi_support.onemklZnrm2(device_queue::syclQueue_t, ctx::syclContext_t, dev::syclDevice_t, n::Int64, x::ZePtr{ComplexF64}, incx::Int64, result::RefOrZeRef{Cdouble})::Cvoid end diff --git a/lib/mkl/wrappers.jl b/lib/mkl/wrappers.jl index 34af55ac..b5d6d02a 100644 --- a/lib/mkl/wrappers.jl +++ b/lib/mkl/wrappers.jl @@ -23,11 +23,12 @@ for (fname, elty, ret_type) in (:onemklZnrm2, :ComplexF64,:Float64)) @eval begin function nrm2(n::Integer, - x::StridedArray{$elty}) + x::oneStridedArray{$elty}) queue = global_queue(context(x), device(x)) - result = Ref{$ret_type}() + result = oneArray{$ret_type}([0]); $fname(sycl_queue(queue), sycl_context(context(x), device(x)), sycl_device(device(x)), n, x, stride(x,1), result) - return result[] + res = Array(result) + return res[1] end end end From 6204f73e8a6de8d1782b6053ff9d0f88f9d5dfb5 Mon Sep 17 00:00:00 2001 From: Kali Uday Date: Mon, 7 Nov 2022 16:10:20 +0530 Subject: [PATCH 11/21] NITS --- lib/mkl/oneMKL.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/mkl/oneMKL.jl b/lib/mkl/oneMKL.jl index 41dfbea0..d83f2141 100644 --- a/lib/mkl/oneMKL.jl +++ b/lib/mkl/oneMKL.jl @@ -12,7 +12,7 @@ using GPUArrays include("libonemkl.jl") -const onemklFloat = Union{Float64,Float32,ComplexF64,ComplexF32} +const onemklFloat = Union{Float64,Float32,Float16,ComplexF64,ComplexF32} include("wrappers.jl") include("linalg.jl") From 77a16c85365ef5826b957fa71f88369db39c68c8 Mon Sep 17 00:00:00 2001 From: Kali Uday Date: Mon, 7 Nov 2022 17:20:20 +0530 Subject: [PATCH 12/21] alignment corrections --- lib/mkl/libonemkl.jl | 22 ++++++++++++++++++---- lib/mkl/wrappers.jl | 4 ++-- test/onemkl.jl | 4 ++-- 3 files changed, 22 insertions(+), 8 deletions(-) diff --git a/lib/mkl/libonemkl.jl b/lib/mkl/libonemkl.jl index 1562a105..63d78c00 100644 --- a/lib/mkl/libonemkl.jl +++ b/lib/mkl/libonemkl.jl @@ -44,19 +44,33 @@ function onemklZgemm(device_queue, transA, transB, m, n, k, alpha, A, lda, B, ld end function onemklDnrm2(device_queue, ctx, dev, n, x, incx, result) - @ccall liboneapi_support.onemklDnrm2(device_queue::syclQueue_t, ctx::syclContext_t, dev::syclDevice_t, n::Int64, x::ZePtr{Cdouble}, incx::Int64, result::RefOrZeRef{Cdouble})::Cvoid + @ccall liboneapi_support.onemklDnrm2(device_queue::syclQueue_t, + ctx::syclContext_t, dev::syclDevice_t, + n::Int64, x::ZePtr{Cdouble}, incx::Int64, + result::RefOrZeRef{Cdouble})::Cvoid end function onemklSnrm2(device_queue, ctx, dev, n, x, incx, result) - @ccall liboneapi_support.onemklSnrm2(device_queue::syclQueue_t, ctx::syclContext_t, dev::syclDevice_t, n::Int64, x::ZePtr{Cfloat}, incx::Int64, result::RefOrZeRef{Cfloat})::Cvoid + @ccall liboneapi_support.onemklSnrm2(device_queue::syclQueue_t, + ctx::syclContext_t, dev::syclDevice_t, + n::Int64, x::ZePtr{Cfloat}, incx::Int64, + result::RefOrZeRef{Cfloat})::Cvoid end function onemklCnrm2(device_queue, ctx, dev, n, x, incx, result) - @ccall liboneapi_support.onemklCnrm2(device_queue::syclQueue_t, ctx::syclContext_t, dev::syclDevice_t, n::Int64, x::ZePtr{ComplexF32}, incx::Int64, result::RefOrZeRef{Cfloat})::Cvoid + @ccall liboneapi_support.onemklCnrm2(device_queue::syclQueue_t, + ctx::syclContext_t, + dev::syclDevice_t, n::Int64, + x::ZePtr{ComplexF32}, incx::Int64, + result::RefOrZeRef{Cfloat})::Cvoid end function onemklZnrm2(device_queue, ctx, dev, n, x, incx, result) - @ccall liboneapi_support.onemklZnrm2(device_queue::syclQueue_t, ctx::syclContext_t, dev::syclDevice_t, n::Int64, x::ZePtr{ComplexF64}, incx::Int64, result::RefOrZeRef{Cdouble})::Cvoid + @ccall liboneapi_support.onemklZnrm2(device_queue::syclQueue_t, + ctx::syclContext_t, + dev::syclDevice_t, n::Int64, + x::ZePtr{ComplexF64}, incx::Int64, + result::RefOrZeRef{Cdouble})::Cvoid end diff --git a/lib/mkl/wrappers.jl b/lib/mkl/wrappers.jl index 4001246f..2e66fcf6 100644 --- a/lib/mkl/wrappers.jl +++ b/lib/mkl/wrappers.jl @@ -23,8 +23,8 @@ for (fname, elty, ret_type) in (:onemklZnrm2, :ComplexF64,:Float64)) @eval begin function nrm2(n::Integer, - x::oneStridedArray{$elty}) - queue = global_queue(context(x), device(x)) + x::oneStridedArray{$elty}) + queue = global_queue(context(x), device(x)) result = oneArray{$ret_type}([0]); $fname(sycl_queue(queue), sycl_context(context(x), device(x)), sycl_device(device(x)), n, x, stride(x,1), result) res = Array(result) diff --git a/test/onemkl.jl b/test/onemkl.jl index ccf96fe7..76506319 100644 --- a/test/onemkl.jl +++ b/test/onemkl.jl @@ -15,7 +15,7 @@ k = 13 oneMKL.copy!(m,A,B) @test Array(A) == Array(B) - # Test nrm2 primitive - @test testf(norm, rand(T,m)) + # Test nrm2 primitive + @test testf(norm, rand(T,m)) end # level 1 testset end From aefb2e73a0a06c36c63899a39bc63069d6cef206 Mon Sep 17 00:00:00 2001 From: Kali Uday Date: Mon, 7 Nov 2022 17:28:35 +0530 Subject: [PATCH 13/21] NITS --- deps/onemkl.cpp | 18 ++++++++++++------ deps/onemkl.h | 16 ++++++++++++---- lib/mkl/wrappers.jl | 12 ++++++------ 3 files changed, 30 insertions(+), 16 deletions(-) diff --git a/deps/onemkl.cpp b/deps/onemkl.cpp index 38e44521..4f8afdf4 100644 --- a/deps/onemkl.cpp +++ b/deps/onemkl.cpp @@ -81,23 +81,29 @@ extern "C" int onemklZgemm(syclQueue_t device_queue, onemklTranspose transA, return 0; } -extern "C" void onemklDnrm2(syclQueue_t device_queue, syclContext_t ctx, syclDevice_t dev, int64_t n, const double *x, int64_t incx, double *result) { +extern "C" void onemklDnrm2(syclQueue_t device_queue, syclContext_t ctx, syclDevice_t dev, + int64_t n, const double *x, int64_t incx, double *result) { auto status = oneapi::mkl::blas::column_major::nrm2(device_queue->val, n, x, incx, result); status.wait(); } -extern "C" void onemklSnrm2(syclQueue_t device_queue, syclContext_t ctx, syclDevice_t dev, int64_t n, const float *x, int64_t incx, float *result) { +extern "C" void onemklSnrm2(syclQueue_t device_queue, syclContext_t ctx, syclDevice_t dev, + int64_t n, const float *x, int64_t incx, float *result) { auto status = oneapi::mkl::blas::column_major::nrm2(device_queue->val, n, x, incx, result); status.wait(); } -extern "C" void onemklCnrm2(syclQueue_t device_queue, syclContext_t ctx, syclDevice_t dev, int64_t n, const float _Complex *x, int64_t incx, float *result) { - auto status = oneapi::mkl::blas::column_major::nrm2(device_queue->val, n, reinterpret_cast *>(x), incx, result); +extern "C" void onemklCnrm2(syclQueue_t device_queue, syclContext_t ctx, syclDevice_t dev, + int64_t n, const float _Complex *x, int64_t incx, float *result) { + auto status = oneapi::mkl::blas::column_major::nrm2(device_queue->val, n, + reinterpret_cast *>(x), incx, result); status.wait(); } -extern "C" void onemklZnrm2(syclQueue_t device_queue, syclContext_t ctx, syclDevice_t dev, int64_t n, const double _Complex *x, int64_t incx, double *result) { - auto status = oneapi::mkl::blas::column_major::nrm2(device_queue->val, n, reinterpret_cast *>(x), incx, result); +extern "C" void onemklZnrm2(syclQueue_t device_queue, syclContext_t ctx, syclDevice_t dev, + int64_t n, const double _Complex *x, int64_t incx, double *result) { + auto status = oneapi::mkl::blas::column_major::nrm2(device_queue->val, n, + reinterpret_cast *>(x), incx, result); status.wait(); } diff --git a/deps/onemkl.h b/deps/onemkl.h index 358edd48..6a5978be 100644 --- a/deps/onemkl.h +++ b/deps/onemkl.h @@ -40,10 +40,18 @@ int onemklZgemm(syclQueue_t device_queue, onemklTranspose transA, double _Complex *C, int64_t ldc); // Supported Level-1: Nrm2 -void onemklDnrm2(syclQueue_t device_queue, syclContext_t ctx, syclDevice_t dev, int64_t n, const double *x, int64_t incx, double *result); -void onemklSnrm2(syclQueue_t device_queue, syclContext_t ctx, syclDevice_t dev, int64_t n, const float *x, int64_t incx, float *result); -void onemklCnrm2(syclQueue_t device_queue, syclContext_t ctx, syclDevice_t dev, int64_t n, const float _Complex *x, int64_t incx, float *result); -void onemklZnrm2(syclQueue_t device_queue, syclContext_t ctx, syclDevice_t dev, int64_t n, const double _Complex *x, int64_t incx, double *result); +void onemklDnrm2(syclQueue_t device_queue, syclContext_t ctx, + syclDevice_t dev, int64_t n, const double *x, + int64_t incx, double *result); +void onemklSnrm2(syclQueue_t device_queue, syclContext_t ctx, + syclDevice_t dev, int64_t n, const float *x, + int64_t incx, float *result); +void onemklCnrm2(syclQueue_t device_queue, syclContext_t ctx, + syclDevice_t dev, int64_t n, const float _Complex *x, + int64_t incx, float *result); +void onemklZnrm2(syclQueue_t device_queue, syclContext_t ctx, + syclDevice_t dev, int64_t n, const double _Complex *x, + int64_t incx, double *result); void onemklDcopy(syclQueue_t device_queue, int64_t n, const double *x, int64_t incx, double *y, int64_t incy); diff --git a/lib/mkl/wrappers.jl b/lib/mkl/wrappers.jl index 2e66fcf6..666a30f9 100644 --- a/lib/mkl/wrappers.jl +++ b/lib/mkl/wrappers.jl @@ -23,12 +23,12 @@ for (fname, elty, ret_type) in (:onemklZnrm2, :ComplexF64,:Float64)) @eval begin function nrm2(n::Integer, - x::oneStridedArray{$elty}) - queue = global_queue(context(x), device(x)) - result = oneArray{$ret_type}([0]); - $fname(sycl_queue(queue), sycl_context(context(x), device(x)), sycl_device(device(x)), n, x, stride(x,1), result) - res = Array(result) - return res[1] + x::oneStridedArray{$elty}) + queue = global_queue(context(x), device(x)) + result = oneArray{$ret_type}([0]); + $fname(sycl_queue(queue), sycl_context(context(x), device(x)), sycl_device(device(x)), n, x, stride(x,1), result) + res = Array(result) + return res[1] end end end From 1ca1f82e75acedb7244841788264d9be3e8faeb2 Mon Sep 17 00:00:00 2001 From: Kali Uday Date: Mon, 7 Nov 2022 17:34:35 +0530 Subject: [PATCH 14/21] alignment wrappers --- lib/mkl/wrappers.jl | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/lib/mkl/wrappers.jl b/lib/mkl/wrappers.jl index 666a30f9..521c727b 100644 --- a/lib/mkl/wrappers.jl +++ b/lib/mkl/wrappers.jl @@ -17,13 +17,12 @@ end # level 1 ## nrm2 for (fname, elty, ret_type) in - ((:onemklDnrm2, :Float64,:Float64), - (:onemklSnrm2, :Float32,:Float32), - (:onemklCnrm2, :ComplexF32,:Float32), - (:onemklZnrm2, :ComplexF64,:Float64)) - @eval begin - function nrm2(n::Integer, - x::oneStridedArray{$elty}) + ((:onemklDnrm2, :Float64,:Float64), + (:onemklSnrm2, :Float32,:Float32), + (:onemklCnrm2, :ComplexF32,:Float32), + (:onemklZnrm2, :ComplexF64,:Float64)) + @eval begin + function nrm2(n::Integer, x::oneStridedArray{$elty}) queue = global_queue(context(x), device(x)) result = oneArray{$ret_type}([0]); $fname(sycl_queue(queue), sycl_context(context(x), device(x)), sycl_device(device(x)), n, x, stride(x,1), result) From 77cd3cabe225f5a32ba6a49c09c1d22795979704 Mon Sep 17 00:00:00 2001 From: Kali Uday Date: Mon, 7 Nov 2022 17:36:02 +0530 Subject: [PATCH 15/21] NITS --- lib/mkl/wrappers.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/mkl/wrappers.jl b/lib/mkl/wrappers.jl index 521c727b..b69b93a2 100644 --- a/lib/mkl/wrappers.jl +++ b/lib/mkl/wrappers.jl @@ -28,8 +28,8 @@ for (fname, elty, ret_type) in $fname(sycl_queue(queue), sycl_context(context(x), device(x)), sycl_device(device(x)), n, x, stride(x,1), result) res = Array(result) return res[1] - end - end + end + end end # From 9e9b5fbd659ad91100ead37cf50123a8420bc558 Mon Sep 17 00:00:00 2001 From: Kali Uday Date: Mon, 7 Nov 2022 17:44:42 +0530 Subject: [PATCH 16/21] wait not required at cpp level --- deps/onemkl.cpp | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/deps/onemkl.cpp b/deps/onemkl.cpp index 4f8afdf4..c5502106 100644 --- a/deps/onemkl.cpp +++ b/deps/onemkl.cpp @@ -83,28 +83,24 @@ extern "C" int onemklZgemm(syclQueue_t device_queue, onemklTranspose transA, extern "C" void onemklDnrm2(syclQueue_t device_queue, syclContext_t ctx, syclDevice_t dev, int64_t n, const double *x, int64_t incx, double *result) { - auto status = oneapi::mkl::blas::column_major::nrm2(device_queue->val, n, x, incx, result); - status.wait(); + oneapi::mkl::blas::column_major::nrm2(device_queue->val, n, x, incx, result); } extern "C" void onemklSnrm2(syclQueue_t device_queue, syclContext_t ctx, syclDevice_t dev, int64_t n, const float *x, int64_t incx, float *result) { - auto status = oneapi::mkl::blas::column_major::nrm2(device_queue->val, n, x, incx, result); - status.wait(); + oneapi::mkl::blas::column_major::nrm2(device_queue->val, n, x, incx, result); } extern "C" void onemklCnrm2(syclQueue_t device_queue, syclContext_t ctx, syclDevice_t dev, int64_t n, const float _Complex *x, int64_t incx, float *result) { - auto status = oneapi::mkl::blas::column_major::nrm2(device_queue->val, n, + oneapi::mkl::blas::column_major::nrm2(device_queue->val, n, reinterpret_cast *>(x), incx, result); - status.wait(); } extern "C" void onemklZnrm2(syclQueue_t device_queue, syclContext_t ctx, syclDevice_t dev, int64_t n, const double _Complex *x, int64_t incx, double *result) { - auto status = oneapi::mkl::blas::column_major::nrm2(device_queue->val, n, + oneapi::mkl::blas::column_major::nrm2(device_queue->val, n, reinterpret_cast *>(x), incx, result); - status.wait(); } extern "C" void onemklDcopy(syclQueue_t device_queue, int64_t n, const double *x, From 9a1826035955b5bd7b5d106f574e460dc236e849 Mon Sep 17 00:00:00 2001 From: Kali Uday Date: Mon, 7 Nov 2022 19:31:04 +0530 Subject: [PATCH 17/21] clean up device/context --- deps/onemkl.cpp | 16 ++++++++-------- deps/onemkl.h | 12 ++++-------- lib/mkl/libonemkl.jl | 18 ++++++------------ lib/mkl/wrappers.jl | 2 +- 4 files changed, 19 insertions(+), 29 deletions(-) diff --git a/deps/onemkl.cpp b/deps/onemkl.cpp index c5502106..05f595cf 100644 --- a/deps/onemkl.cpp +++ b/deps/onemkl.cpp @@ -81,24 +81,24 @@ extern "C" int onemklZgemm(syclQueue_t device_queue, onemklTranspose transA, return 0; } -extern "C" void onemklDnrm2(syclQueue_t device_queue, syclContext_t ctx, syclDevice_t dev, - int64_t n, const double *x, int64_t incx, double *result) { +extern "C" void onemklDnrm2(syclQueue_t device_queue, int64_t n, const double *x, + int64_t incx, double *result) { oneapi::mkl::blas::column_major::nrm2(device_queue->val, n, x, incx, result); } -extern "C" void onemklSnrm2(syclQueue_t device_queue, syclContext_t ctx, syclDevice_t dev, - int64_t n, const float *x, int64_t incx, float *result) { +extern "C" void onemklSnrm2(syclQueue_t device_queue, int64_t n, const float *x, + int64_t incx, float *result) { oneapi::mkl::blas::column_major::nrm2(device_queue->val, n, x, incx, result); } -extern "C" void onemklCnrm2(syclQueue_t device_queue, syclContext_t ctx, syclDevice_t dev, - int64_t n, const float _Complex *x, int64_t incx, float *result) { +extern "C" void onemklCnrm2(syclQueue_t device_queue, int64_t n, const float _Complex *x, + int64_t incx, float *result) { oneapi::mkl::blas::column_major::nrm2(device_queue->val, n, reinterpret_cast *>(x), incx, result); } -extern "C" void onemklZnrm2(syclQueue_t device_queue, syclContext_t ctx, syclDevice_t dev, - int64_t n, const double _Complex *x, int64_t incx, double *result) { +extern "C" void onemklZnrm2(syclQueue_t device_queue, int64_t n, const double _Complex *x, + int64_t incx, double *result) { oneapi::mkl::blas::column_major::nrm2(device_queue->val, n, reinterpret_cast *>(x), incx, result); } diff --git a/deps/onemkl.h b/deps/onemkl.h index 6a5978be..b7c30976 100644 --- a/deps/onemkl.h +++ b/deps/onemkl.h @@ -40,17 +40,13 @@ int onemklZgemm(syclQueue_t device_queue, onemklTranspose transA, double _Complex *C, int64_t ldc); // Supported Level-1: Nrm2 -void onemklDnrm2(syclQueue_t device_queue, syclContext_t ctx, - syclDevice_t dev, int64_t n, const double *x, +void onemklDnrm2(syclQueue_t device_queue, int64_t n, const double *x, int64_t incx, double *result); -void onemklSnrm2(syclQueue_t device_queue, syclContext_t ctx, - syclDevice_t dev, int64_t n, const float *x, +void onemklSnrm2(syclQueue_t device_queue, int64_t n, const float *x, int64_t incx, float *result); -void onemklCnrm2(syclQueue_t device_queue, syclContext_t ctx, - syclDevice_t dev, int64_t n, const float _Complex *x, +void onemklCnrm2(syclQueue_t device_queue, int64_t n, const float _Complex *x, int64_t incx, float *result); -void onemklZnrm2(syclQueue_t device_queue, syclContext_t ctx, - syclDevice_t dev, int64_t n, const double _Complex *x, +void onemklZnrm2(syclQueue_t device_queue, int64_t n, const double _Complex *x, int64_t incx, double *result); void onemklDcopy(syclQueue_t device_queue, int64_t n, const double *x, diff --git a/lib/mkl/libonemkl.jl b/lib/mkl/libonemkl.jl index 63d78c00..3a9bc3cc 100644 --- a/lib/mkl/libonemkl.jl +++ b/lib/mkl/libonemkl.jl @@ -43,33 +43,27 @@ function onemklZgemm(device_queue, transA, transB, m, n, k, alpha, A, lda, B, ld C::ZePtr{ComplexF64}, ldc::Int64)::Cint end -function onemklDnrm2(device_queue, ctx, dev, n, x, incx, result) +function onemklDnrm2(device_queue, n, x, incx, result) @ccall liboneapi_support.onemklDnrm2(device_queue::syclQueue_t, - ctx::syclContext_t, dev::syclDevice_t, n::Int64, x::ZePtr{Cdouble}, incx::Int64, result::RefOrZeRef{Cdouble})::Cvoid end -function onemklSnrm2(device_queue, ctx, dev, n, x, incx, result) +function onemklSnrm2(device_queue, n, x, incx, result) @ccall liboneapi_support.onemklSnrm2(device_queue::syclQueue_t, - ctx::syclContext_t, dev::syclDevice_t, n::Int64, x::ZePtr{Cfloat}, incx::Int64, result::RefOrZeRef{Cfloat})::Cvoid end -function onemklCnrm2(device_queue, ctx, dev, n, x, incx, result) +function onemklCnrm2(device_queue, n, x, incx, result) @ccall liboneapi_support.onemklCnrm2(device_queue::syclQueue_t, - ctx::syclContext_t, - dev::syclDevice_t, n::Int64, - x::ZePtr{ComplexF32}, incx::Int64, + n::Int64, x::ZePtr{ComplexF32}, incx::Int64, result::RefOrZeRef{Cfloat})::Cvoid end -function onemklZnrm2(device_queue, ctx, dev, n, x, incx, result) +function onemklZnrm2(device_queue, n, x, incx, result) @ccall liboneapi_support.onemklZnrm2(device_queue::syclQueue_t, - ctx::syclContext_t, - dev::syclDevice_t, n::Int64, - x::ZePtr{ComplexF64}, incx::Int64, + n::Int64, x::ZePtr{ComplexF64}, incx::Int64, result::RefOrZeRef{Cdouble})::Cvoid end diff --git a/lib/mkl/wrappers.jl b/lib/mkl/wrappers.jl index b69b93a2..10681984 100644 --- a/lib/mkl/wrappers.jl +++ b/lib/mkl/wrappers.jl @@ -25,7 +25,7 @@ for (fname, elty, ret_type) in function nrm2(n::Integer, x::oneStridedArray{$elty}) queue = global_queue(context(x), device(x)) result = oneArray{$ret_type}([0]); - $fname(sycl_queue(queue), sycl_context(context(x), device(x)), sycl_device(device(x)), n, x, stride(x,1), result) + $fname(sycl_queue(queue), n, x, stride(x,1), result) res = Array(result) return res[1] end From 98edc69ab7cf39d0e9aa3f181c9263e26d30370a Mon Sep 17 00:00:00 2001 From: Kali Uday Date: Tue, 8 Nov 2022 10:01:13 +0530 Subject: [PATCH 18/21] reverted wait status, as ci fails with segfaults --- deps/src/onemkl.cpp | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/deps/src/onemkl.cpp b/deps/src/onemkl.cpp index 05f595cf..299be7b9 100644 --- a/deps/src/onemkl.cpp +++ b/deps/src/onemkl.cpp @@ -83,24 +83,28 @@ extern "C" int onemklZgemm(syclQueue_t device_queue, onemklTranspose transA, extern "C" void onemklDnrm2(syclQueue_t device_queue, int64_t n, const double *x, int64_t incx, double *result) { - oneapi::mkl::blas::column_major::nrm2(device_queue->val, n, x, incx, result); + auto status = oneapi::mkl::blas::column_major::nrm2(device_queue->val, n, x, incx, result); + status.wait(); } extern "C" void onemklSnrm2(syclQueue_t device_queue, int64_t n, const float *x, int64_t incx, float *result) { - oneapi::mkl::blas::column_major::nrm2(device_queue->val, n, x, incx, result); + auto status = oneapi::mkl::blas::column_major::nrm2(device_queue->val, n, x, incx, result); + status.wait(); } extern "C" void onemklCnrm2(syclQueue_t device_queue, int64_t n, const float _Complex *x, int64_t incx, float *result) { - oneapi::mkl::blas::column_major::nrm2(device_queue->val, n, + auto status = oneapi::mkl::blas::column_major::nrm2(device_queue->val, n, reinterpret_cast *>(x), incx, result); + status.wait(); } extern "C" void onemklZnrm2(syclQueue_t device_queue, int64_t n, const double _Complex *x, int64_t incx, double *result) { - oneapi::mkl::blas::column_major::nrm2(device_queue->val, n, + auto status = oneapi::mkl::blas::column_major::nrm2(device_queue->val, n, reinterpret_cast *>(x), incx, result); + status.wait(); } extern "C" void onemklDcopy(syclQueue_t device_queue, int64_t n, const double *x, From cc6a79219770514230028bb380713a68a45db058 Mon Sep 17 00:00:00 2001 From: Kali Uday Date: Tue, 8 Nov 2022 14:19:44 +0530 Subject: [PATCH 19/21] nrm2 - linalg cleanup --- lib/mkl/linalg.jl | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/lib/mkl/linalg.jl b/lib/mkl/linalg.jl index a46c4d21..94905416 100644 --- a/lib/mkl/linalg.jl +++ b/lib/mkl/linalg.jl @@ -49,11 +49,7 @@ function gemm_dispatch!(C::oneStridedVecOrMat, A, B, alpha::Number=true, beta::N end end -LinearAlgebra.norm(x::oneStridedVecOrMat{<:onemklFloat}, n::Number) = - oneMKL.nrm2(length(x), x) - -LinearAlgebra.norm(x::oneStridedVecOrMat{<:onemklFloat}, n::Real) = - invoke(norm, Tuple{typeof(x), Number}, x, n) +LinearAlgebra.norm(x::oneStridedVecOrMat{<:onemklFloat}) = oneMKL.nrm2(length(x), x) for NT in (Number, Real) # NOTE: alpha/beta also ::Real to avoid ambiguities with certain Base methods From 872fa11356e95c34a4ad82ae5331498e9feb15c7 Mon Sep 17 00:00:00 2001 From: Kali Uday Date: Tue, 8 Nov 2022 14:21:47 +0530 Subject: [PATCH 20/21] NITS --- test/onemkl.jl | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/test/onemkl.jl b/test/onemkl.jl index 76506319..1a6f95f9 100644 --- a/test/onemkl.jl +++ b/test/onemkl.jl @@ -15,7 +15,9 @@ k = 13 oneMKL.copy!(m,A,B) @test Array(A) == Array(B) - # Test nrm2 primitive - @test testf(norm, rand(T,m)) + @testset "nrm2" begin + # Test nrm2 primitive + @test testf(norm, rand(T,m)) + end end # level 1 testset end From f1f60e6d2a154f2214755326aef7e5a230e410bf Mon Sep 17 00:00:00 2001 From: Kali Uday Date: Wed, 9 Nov 2022 07:37:34 +0530 Subject: [PATCH 21/21] sycl queue/device/context creation not required --- lib/mkl/libonemkl.jl | 1 - test/onemkl.jl | 8 ++++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/lib/mkl/libonemkl.jl b/lib/mkl/libonemkl.jl index 3a9bc3cc..cc0ce69b 100644 --- a/lib/mkl/libonemkl.jl +++ b/lib/mkl/libonemkl.jl @@ -1,5 +1,4 @@ using CEnum -using oneAPI.SYCL: syclQueue_t, syclContext_t, syclDevice_t @cenum onemklTranspose::UInt32 begin ONEMKL_TRANSPOSE_NONTRANS = 0 diff --git a/test/onemkl.jl b/test/onemkl.jl index 1a6f95f9..39721b12 100644 --- a/test/onemkl.jl +++ b/test/onemkl.jl @@ -15,9 +15,9 @@ k = 13 oneMKL.copy!(m,A,B) @test Array(A) == Array(B) - @testset "nrm2" begin - # Test nrm2 primitive - @test testf(norm, rand(T,m)) - end + @testset "nrm2" begin + # Test nrm2 primitive + @test testf(norm, rand(T,m)) + end end # level 1 testset end