From 3737eabfe23fff498eeb0a717afa03c473c99bf0 Mon Sep 17 00:00:00 2001
From: Carlo Lucibello
Date: Thu, 17 Oct 2024 08:02:06 +0200
Subject: [PATCH] extended testing

Extend the test coverage of the recurrent layers and apply the source
changes needed to support it.

src/layers/recurrent.jl:
- add a single-argument GRUCell method (no initial state given)
- let GRUCell and GRUv3Cell run when the bias is `false`
- chunk `Wh * h` into 3 parts in GRUv3Cell
- drop the unused local `o` in the LSTMCell forward pass
- rename the `show` argument from `l` to `m`

test:
- add test/ext_common/recurrent_gpu_ad.jl, included by the CUDA, AMDGPU
  and Metal suites; each suite sets the global `BROKEN_TESTS` first
- remove test/ext_cuda/recurrent.jl, replaced by the shared file
- add CPU tests for GRUCell, GRU, GRUv3Cell and GRUv3
- test_gradients: also run the forward pass on the GPU, and move the
  model and inputs to the device once instead of once per section

---
 src/layers/recurrent.jl             |  47 ++++----
 test/ext_amdgpu/runtests.jl         |   5 +
 test/ext_common/recurrent_gpu_ad.jl | 163 ++++++++++++++++++++++++++++
 test/ext_cuda/recurrent.jl          |   6 -
 test/ext_cuda/runtests.jl           |   5 +-
 test/ext_metal/runtests.jl          |   5 +
 test/layers/recurrent.jl            | 110 +++++++++++++++++++
 test/test_utils.jl                  |  21 ++--
 8 files changed, 324 insertions(+), 38 deletions(-)
 create mode 100644 test/ext_common/recurrent_gpu_ad.jl
 delete mode 100644 test/ext_cuda/recurrent.jl

diff --git a/src/layers/recurrent.jl b/src/layers/recurrent.jl
index 37cef2fb2c..b93e3a80c3 100644
--- a/src/layers/recurrent.jl
+++ b/src/layers/recurrent.jl
@@ -83,9 +83,9 @@ function (m::RNNCell)(x::AbstractVecOrMat, h::AbstractVecOrMat)
     return h
 end
 
-function Base.show(io::IO, l::RNNCell)
-    print(io, "RNNCell(", size(l.Wi, 2), " => ", size(l.Wi, 1))
-    print(io, ", ", l.σ)
+function Base.show(io::IO, m::RNNCell)
+    print(io, "RNNCell(", size(m.Wi, 2), " => ", size(m.Wi, 1))
+    print(io, ", ", m.σ)
     print(io, ")")
 end
 
@@ -262,7 +262,7 @@ end
 
 function (m::LSTMCell)(x::AbstractVecOrMat, (h, c))
     _size_check(m, x, 1 => size(m.Wi, 2))
-    b, o = m.bias, size(h, 1)
+    b = m.bias
     g = m.Wi * x .+ m.Wh * h .+ b
     input, forget, cell, output = chunk(g, 4; dims=1)
     c′ = @. sigmoid_fast(forget) * c + sigmoid_fast(input) * tanh_fast(cell)
@@ -270,8 +270,8 @@ function (m::LSTMCell)(x::AbstractVecOrMat, (h, c))
     return h′, c′
 end
 
-Base.show(io::IO, l::LSTMCell) =
-    print(io, "LSTMCell(", size(l.Wi, 2), " => ", size(l.Wi, 1)÷4, ")")
+Base.show(io::IO, m::LSTMCell) =
+    print(io, "LSTMCell(", size(m.Wi, 2), " => ", size(m.Wi, 1)÷4, ")")
 
 
 @doc raw""""
@@ -431,12 +431,17 @@ function GRUCell((in, out)::Pair; init = glorot_uniform, bias = true)
     return GRUCell(Wi, Wh, b)
 end
 
+(m::GRUCell)(x::AbstractVecOrMat) = m(x, zeros_like(x, size(m.Wh, 2)))
+
 function (m::GRUCell)(x::AbstractVecOrMat, h)
     _size_check(m, x, 1 => size(m.Wi,2))
-    Wi, Wh, b = m.Wi, m.Wh, m.b
-    gxs = chunk(Wi * x, 3, dims=1)
-    ghs = chunk(Wh * h, 3, dims=1)
-    bs = chunk(b, 3, dims=1)
+    gxs = chunk(m.Wi * x, 3, dims=1)
+    ghs = chunk(m.Wh * h, 3, dims=1)
+    if m.b isa AbstractArray
+        bs = chunk(m.b, 3, dims=1)
+    else # m.b == false
+        bs = [false, false, false]
+    end
     r = @. sigmoid_fast(gxs[1] + ghs[1] + bs[1])
     z = @. sigmoid_fast(gxs[2] + ghs[2] + bs[2])
     h̃ = @. tanh_fast(gxs[3] + r * ghs[3] + bs[3])
@@ -444,8 +449,8 @@ function (m::GRUCell)(x::AbstractVecOrMat, h)
     return h′
 end
 
-Base.show(io::IO, l::GRUCell) =
-    print(io, "GRUCell(", size(l.Wi, 2), " => ", size(l.Wi, 1)÷3, ")")
+Base.show(io::IO, m::GRUCell) =
+    print(io, "GRUCell(", size(m.Wi, 2), " => ", size(m.Wi, 1)÷3, ")")
 
 @doc raw"""
     GRU(in => out; init = glorot_uniform, bias = true)
@@ -507,6 +512,7 @@ end
 function (m::GRU)(x, h)
     @assert ndims(x) == 2 || ndims(x) == 3
     h′ = []
+    # [x] = [in, L] or [in, L, B]
     for x_t in eachslice(x, dims=2)
         h = m.cell(x_t, h)
         h′ = vcat(h′, [h])
@@ -573,19 +579,22 @@ end
 
 function (m::GRUv3Cell)(x::AbstractVecOrMat, h)
     _size_check(m, x, 1 => size(m.Wi,2))
-    Wi, Wh, b, Wh_h̃ = m.Wi, m.Wh, m.b, m.Wh_h̃
-    gxs = chunk(Wi * x, 3, dims=1)
-    ghs = chunk(Wh * h, 2, dims=1)
-    bs = chunk(b, 3, dims=1)
+    gxs = chunk(m.Wi * x, 3, dims=1)
+    ghs = chunk(m.Wh * h, 3, dims=1)
+    if m.b isa AbstractArray
+        bs = chunk(m.b, 3, dims=1)
+    else # m.b == false
+        bs = [false, false, false]
+    end
     r = @. sigmoid_fast(gxs[1] + ghs[1] + bs[1])
     z = @. sigmoid_fast(gxs[2] + ghs[2] + bs[2])
-    h̃ = tanh_fast.(gxs[3] .+ (Wh_h̃ * (r .* h)) .+ bs[3])
+    h̃ = tanh_fast.(gxs[3] .+ (m.Wh_h̃ * (r .* h)) .+ bs[3])
     h′ = @. (1 - z) * h̃ + z * h
     return h′
 end
 
-Base.show(io::IO, l::GRUv3Cell) =
-    print(io, "GRUv3Cell(", size(l.Wi, 2), " => ", size(l.Wi, 1)÷3, ")")
+Base.show(io::IO, m::GRUv3Cell) =
+    print(io, "GRUv3Cell(", size(m.Wi, 2), " => ", size(m.Wi, 1)÷3, ")")
 
 
 @doc raw"""
diff --git a/test/ext_amdgpu/runtests.jl b/test/ext_amdgpu/runtests.jl
index ec779dedea..9dfbb41577 100644
--- a/test/ext_amdgpu/runtests.jl
+++ b/test/ext_amdgpu/runtests.jl
@@ -9,3 +9,8 @@ end
 @testset "Basic" begin
     include("basic.jl")
 end
+
+@testset "Recurrent" begin
+    global BROKEN_TESTS = Symbol[]  # read at module scope by the included file
+    include("../ext_common/recurrent_gpu_ad.jl")
+end
diff --git a/test/ext_common/recurrent_gpu_ad.jl b/test/ext_common/recurrent_gpu_ad.jl
new file mode 100644
index 0000000000..d2ef3fe34b
--- /dev/null
+++ b/test/ext_common/recurrent_gpu_ad.jl
@@ -0,0 +1,163 @@
+
+@testset "RNNCell GPU AD" begin
+    function loss(r, x, h)
+        y = []
+        for x_t in x
+            h = r(x_t, h)
+            y = vcat(y, [h])
+        end
+        # return mean(h)
+        y = stack(y, dims=2) # [D, L] or [D, L, B]
+        return mean(y)
+    end
+
+    d_in, d_out, len, batch_size = 2, 3, 4, 5
+    r = RNNCell(d_in => d_out)
+    x = [randn(Float32, d_in, batch_size) for _ in 1:len]
+    h = zeros(Float32, d_out)
+    # Single Step
+    @test test_gradients(r, x[1], h; test_gpu=true, compare_finite_diff=false) broken = :rnncell_single ∈ BROKEN_TESTS
+    # Multiple Steps
+    @test test_gradients(r, x, h; test_gpu=true, compare_finite_diff=false, loss) broken = :rnncell_multiple ∈ BROKEN_TESTS
+end
+
+@testset "RNN GPU AD" begin
+    struct ModelRNN
+        rnn::RNN
+        h0::AbstractVector
+    end
+
+    Flux.@layer :expand ModelRNN
+
+    (m::ModelRNN)(x) = m.rnn(x, m.h0)
+
+    d_in, d_out, len, batch_size = 2, 3, 4, 5
+    model = ModelRNN(RNN(d_in => d_out), zeros(Float32, d_out))
+    x_nobatch = randn(Float32, d_in, len)
+    @test test_gradients(model, x_nobatch; test_gpu=true, compare_finite_diff=false) broken = :rnn_nobatch ∈ BROKEN_TESTS
+    x = randn(Float32, d_in, len, batch_size) # [in, L, B]
+    @test test_gradients(model, x; test_gpu=true, compare_finite_diff=false) broken = :rnn ∈ BROKEN_TESTS
+end
+
+@testset "LSTMCell GPU AD" begin
+
+    function loss(r, x, hc)
+        h, c = hc
+        h′ = []
+        c′ = []
+        for x_t in x
+            h, c = r(x_t, (h, c))
+            h′ = vcat(h′, [h])
+            c′ = vcat(c′, [c])
+        end
+        hnew = stack(h′, dims=2)
+        cnew = stack(c′, dims=2)
+        return mean(hnew) + mean(cnew)
+    end
+
+    d_in, d_out, len, batch_size = 2, 3, 4, 5
+    cell = LSTMCell(d_in => d_out)
+    x = [randn(Float32, d_in, batch_size) for _ in 1:len]
+    h = zeros(Float32, d_out)
+    c = zeros(Float32, d_out)
+    # Single Step
+    @test test_gradients(cell, x[1], (h, c); test_gpu=true, compare_finite_diff=false,
+        loss = (m, x, (h, c)) -> mean(m(x, (h, c))[1])) broken = :lstmcell_single ∈ BROKEN_TESTS
+    # Multiple Steps
+    @test test_gradients(cell, x, (h, c); test_gpu=true, compare_finite_diff=false, loss) broken = :lstmcell_multiple ∈ BROKEN_TESTS
+end
+
+@testset "LSTM GPU AD" begin
+    struct ModelLSTM
+        lstm::LSTM
+        h0::AbstractVector
+        c0::AbstractVector
+    end
+
+    Flux.@layer :expand ModelLSTM
+
+    (m::ModelLSTM)(x) = m.lstm(x, (m.h0, m.c0))
+
+    d_in, d_out, len, batch_size = 2, 3, 4, 5
+    model = ModelLSTM(LSTM(d_in => d_out), zeros(Float32, d_out), zeros(Float32, d_out))
+    x_nobatch = randn(Float32, d_in, len)
+    @test test_gradients(model, x_nobatch; test_gpu=true, compare_finite_diff=false,
+        loss = (m, x) -> mean(m(x)[1])) broken = :lstm_nobatch ∈ BROKEN_TESTS
+    x = randn(Float32, d_in, len, batch_size)
+    @test test_gradients(model, x; test_gpu=true, compare_finite_diff=false,
+        loss = (m, x) -> mean(m(x)[1])) broken = :lstm ∈ BROKEN_TESTS
+end
+
+@testset "GRUCell GPU AD" begin
+    function loss(r, x, h)
+        y = []
+        for x_t in x
+            h = r(x_t, h)
+            y = vcat(y, [h])
+        end
+        y = stack(y, dims=2) # [D, L] or [D, L, B]
+        return mean(y)
+    end
+
+    d_in, d_out, len, batch_size = 2, 3, 4, 5
+    r = GRUCell(d_in => d_out)
+    x = [randn(Float32, d_in, batch_size) for _ in 1:len]
+    h = zeros(Float32, d_out)
+    @test test_gradients(r, x[1], h; test_gpu=true, compare_finite_diff=false) broken = :grucell_single ∈ BROKEN_TESTS
+    @test test_gradients(r, x, h; test_gpu=true, compare_finite_diff=false, loss) broken = :grucell_multiple ∈ BROKEN_TESTS
+end
+
+@testset "GRU GPU AD" begin
+    struct ModelGRU
+        gru::GRU
+        h0::AbstractVector
+    end
+
+    Flux.@layer :expand ModelGRU
+
+    (m::ModelGRU)(x) = m.gru(x, m.h0)
+
+    d_in, d_out, len, batch_size = 2, 3, 4, 5
+    model = ModelGRU(GRU(d_in => d_out), zeros(Float32, d_out))
+    x_nobatch = randn(Float32, d_in, len)
+    @test test_gradients(model, x_nobatch; test_gpu=true, compare_finite_diff=false) broken = :gru_nobatch ∈ BROKEN_TESTS
+    x = randn(Float32, d_in, len, batch_size)
+    @test test_gradients(model, x; test_gpu=true, compare_finite_diff=false) broken = :gru ∈ BROKEN_TESTS
+end
+
+@testset "GRUv3Cell GPU AD" begin
+    function loss(r, x, h)
+        y = []
+        for x_t in x
+            h = r(x_t, h)
+            y = vcat(y, [h])
+        end
+        y = stack(y, dims=2) # [D, L] or [D, L, B]
+        return mean(y)
+    end
+
+    d_in, d_out, len, batch_size = 2, 3, 4, 5
+    r = GRUv3Cell(d_in => d_out)
+    x = [randn(Float32, d_in, batch_size) for _ in 1:len]
+    h = zeros(Float32, d_out)
+    @test test_gradients(r, x[1], h; test_gpu=true, compare_finite_diff=false) broken = :gruv3cell_single ∈ BROKEN_TESTS
+    @test test_gradients(r, x, h; test_gpu=true, compare_finite_diff=false, loss) broken = :gruv3cell_multiple ∈ BROKEN_TESTS
+end
+
+@testset "GRUv3 GPU AD" begin
+    struct ModelGRUv3
+        gru::GRUv3
+        h0::AbstractVector
+    end
+
+    Flux.@layer :expand ModelGRUv3
+
+    (m::ModelGRUv3)(x) = m.gru(x, m.h0)
+
+    d_in, d_out, len, batch_size = 2, 3, 4, 5
+    model = ModelGRUv3(GRUv3(d_in => d_out), zeros(Float32, d_out))
+    x_nobatch = randn(Float32, d_in, len)
+    @test test_gradients(model, x_nobatch; test_gpu=true, compare_finite_diff=false) broken = :gruv3_nobatch ∈ BROKEN_TESTS
+    x = randn(Float32, d_in, len, batch_size)
+    @test test_gradients(model, x; test_gpu=true, compare_finite_diff=false) broken = :gruv3 ∈ BROKEN_TESTS
+end
diff --git a/test/ext_cuda/recurrent.jl b/test/ext_cuda/recurrent.jl
deleted file mode 100644
index 913e9d5269..0000000000
--- a/test/ext_cuda/recurrent.jl
+++ /dev/null
@@ -1,6 +0,0 @@
-@testset for R in (RNN,)
-    m = R(3 => 5)
-    x = randn(Float32, 3, 4)
-    h = randn(Float32, 5)
-    test_gradients(m, x, h, test_gpu=true, compare_finite_diff=false)
-end
diff --git a/test/ext_cuda/runtests.jl b/test/ext_cuda/runtests.jl
index d9802762c0..d7a6ba63c9 100644
--- a/test/ext_cuda/runtests.jl
+++ b/test/ext_cuda/runtests.jl
@@ -22,8 +22,9 @@ end
 @testset "cudnn" begin
     include("cudnn.jl")
 end
-@testset "recurrent" begin
-    include("recurrent.jl")
+@testset "Recurrent" begin
+    global BROKEN_TESTS = Symbol[]  # read at module scope by the included file
+    include("../ext_common/recurrent_gpu_ad.jl")
 end
 @testset "ctc" begin
     include("ctc.jl")
diff --git a/test/ext_metal/runtests.jl b/test/ext_metal/runtests.jl
index cb9532390e..5bb34caeb7 100644
--- a/test/ext_metal/runtests.jl
+++ b/test/ext_metal/runtests.jl
@@ -32,6 +32,11 @@ end
     include("basic.jl")
 end
 
+@testset "Recurrent" begin
+    global BROKEN_TESTS = Symbol[:lstm, :gru, :gruv3]  # read by the included file
+    include("../ext_common/recurrent_gpu_ad.jl")
+end
+
 @testset "Huber Loss test" begin
     X = Flux.gpu(Float32[0,1])
     Y = Flux.gpu(Float32[1,0])
diff --git a/test/layers/recurrent.jl b/test/layers/recurrent.jl
index 249aa3623c..98e072cdb1 100644
--- a/test/layers/recurrent.jl
+++ b/test/layers/recurrent.jl
@@ -160,3 +160,113 @@ end
     @test size(c) == (4, 3)
     test_gradients(model, x, loss = (m, x) -> mean(m(x)[1]))
 end
+
+@testset "GRUCell" begin
+    function loss(r, x, h)
+        y = []
+        for x_t in x
+            h = r(x_t, h)
+            y = vcat(y, [h])
+        end
+        y = stack(y, dims=2) # [D, L] or [D, L, B]
+        return mean(y.^2)
+    end
+
+    r = GRUCell(3 => 5)
+    @test length(Flux.trainables(r)) == 3
+    # An input sequence of length 6 and batch size 4.
+    x = [rand(Float32, 3, 4) for _ in 1:6]
+
+    # Initial State is a single vector
+    h = randn(Float32, 5)
+    test_gradients(r, x, h; loss)
+
+    # no initial state same as zero initial state
+    @test r(x[1]) ≈ r(x[1], zeros(Float32, 5))
+
+    # Now initial state has a batch dimension.
+    h = randn(Float32, 5, 4)
+    test_gradients(r, x, h; loss)
+
+    # The input sequence has no batch dimension.
+    x = [rand(Float32, 3) for _ in 1:6]
+    h = randn(Float32, 5)
+    test_gradients(r, x, h; loss)
+
+    # No Bias
+    r = GRUCell(3 => 5, bias=false)
+    @test length(Flux.trainables(r)) == 2
+end
+
+@testset "GRU" begin
+    struct ModelGRU
+        gru::GRU
+        h0::AbstractVector
+    end
+
+    Flux.@layer :expand ModelGRU
+
+    (m::ModelGRU)(x) = m.gru(x, m.h0)
+
+    model = ModelGRU(GRU(2 => 4), zeros(Float32, 4))
+
+    x = rand(Float32, 2, 3, 1)
+    y = model(x)
+    @test y isa Array{Float32, 3}
+    @test size(y) == (4, 3, 1)
+    test_gradients(model, x)
+
+    # no initial state same as zero initial state
+    gru = model.gru
+    @test gru(x) ≈ gru(x, zeros(Float32, 4))
+
+    # No Bias
+    gru = GRU(2 => 4, bias=false)
+    @test length(Flux.trainables(gru)) == 2
+    test_gradients(gru, x)
+end
+
+@testset "GRUv3Cell" begin
+    r = GRUv3Cell(3 => 5)
+    @test length(Flux.trainables(r)) == 4
+    x = rand(Float32, 3, 4) # a single step with batch size 4
+
+    # Initial State is a single vector
+    h = randn(Float32, 5)
+    test_gradients(r, x, h)
+
+    # no initial state same as zero initial state
+    @test r(x) ≈ r(x, zeros(Float32, 5))
+
+    # Now initial state has a batch dimension.
+    h = randn(Float32, 5, 4)
+    test_gradients(r, x, h)
+
+    # The input sequence has no batch dimension.
+    x = rand(Float32, 3)
+    h = randn(Float32, 5)
+    test_gradients(r, x, h)
+end
+
+@testset "GRUv3" begin
+    struct ModelGRUv3
+        gru::GRUv3
+        h0::AbstractVector
+    end
+
+    Flux.@layer :expand ModelGRUv3
+
+    (m::ModelGRUv3)(x) = m.gru(x, m.h0)
+
+    model = ModelGRUv3(GRUv3(2 => 4), zeros(Float32, 4))
+
+    x = rand(Float32, 2, 3, 1)
+    y = model(x)
+    @test y isa Array{Float32, 3}
+    @test size(y) == (4, 3, 1)
+    test_gradients(model, x)
+
+    # no initial state same as zero initial state
+    gru = model.gru
+    @test gru(x) ≈ gru(x, zeros(Float32, 4))
+end
diff --git a/test/test_utils.jl b/test/test_utils.jl
index da55ebca03..25a4f1af47 100644
--- a/test/test_utils.jl
+++ b/test/test_utils.jl
@@ -48,7 +48,16 @@ function test_gradients(
     end
 
     ## Let's make sure first that the forward pass works.
-    @test loss(f, xs...) isa Number
+    l = loss(f, xs...)
+    @test l isa Number
+    if test_gpu
+        gpu_dev = gpu_device(force=true)
+        cpu_dev = cpu_device()
+        xs_gpu = xs |> gpu_dev
+        f_gpu = f |> gpu_dev
+        l_gpu = loss(f_gpu, xs_gpu...)
+        @test l_gpu isa Number
+    end
 
     if test_grad_x
         # Zygote gradient with respect to input.
@@ -64,11 +73,6 @@ function test_gradients(
         end
 
         if test_gpu
-            gpu_dev = gpu_device(force=true)
-            cpu_dev = cpu_device()
-            xs_gpu = xs |> gpu_dev
-            f_gpu = f |> gpu_dev
-
             # Zygote gradient with respect to input on GPU.
             y_gpu, g_gpu = Zygote.withgradient((xs...) -> loss(f_gpu, xs...), xs_gpu...)
             @test get_device(g_gpu) == get_device(xs_gpu)
@@ -92,11 +96,6 @@ function test_gradients(
         end
 
         if test_gpu
-            gpu_dev = gpu_device(force=true)
-            cpu_dev = cpu_device()
-            xs_gpu = xs |> gpu_dev
-            f_gpu = f |> gpu_dev
-
             # Zygote gradient with respect to f on GPU.
             y_gpu, g_gpu = Zygote.withgradient(f -> loss(f, xs_gpu...), f_gpu)
             # @test get_device(g_gpu) == get_device(xs_gpu)