From 3737eabfe23fff498eeb0a717afa03c473c99bf0 Mon Sep 17 00:00:00 2001
From: Carlo Lucibello <carlo.lucibello@gmail.com>
Date: Thu, 17 Oct 2024 08:02:06 +0200
Subject: [PATCH] extended testing

---
 src/layers/recurrent.jl             |  47 ++++----
 test/ext_amdgpu/runtests.jl         |   5 +
 test/ext_common/recurrent_gpu_ad.jl | 163 ++++++++++++++++++++++++++++
 test/ext_cuda/recurrent.jl          |   6 -
 test/ext_cuda/runtests.jl           |   5 +-
 test/ext_metal/runtests.jl          |   5 +
 test/layers/recurrent.jl            | 110 +++++++++++++++++++
 test/test_utils.jl                  |  21 ++--
 8 files changed, 324 insertions(+), 38 deletions(-)
 create mode 100644 test/ext_common/recurrent_gpu_ad.jl
 delete mode 100644 test/ext_cuda/recurrent.jl

diff --git a/src/layers/recurrent.jl b/src/layers/recurrent.jl
index 37cef2fb2c..b93e3a80c3 100644
--- a/src/layers/recurrent.jl
+++ b/src/layers/recurrent.jl
@@ -83,9 +83,9 @@ function (m::RNNCell)(x::AbstractVecOrMat, h::AbstractVecOrMat)
   return h
 end
 
-function Base.show(io::IO, l::RNNCell)
-  print(io, "RNNCell(", size(l.Wi, 2), " => ", size(l.Wi, 1))
-  print(io, ", ", l.σ)
+function Base.show(io::IO, m::RNNCell)  # one-line summary: RNNCell(in => out, σ)
+  print(io, "RNNCell(", size(m.Wi, 2), " => ", size(m.Wi, 1))  # columns of Wi => rows of Wi
+  print(io, ", ", m.σ)  # the cell's σ field
   print(io, ")")
 end
 
@@ -262,7 +262,7 @@ end
 
 function (m::LSTMCell)(x::AbstractVecOrMat, (h, c))
   _size_check(m, x, 1 => size(m.Wi, 2))
-  b, o = m.bias, size(h, 1)
+  b = m.bias
   g = m.Wi * x .+ m.Wh * h .+ b
   input, forget, cell, output = chunk(g, 4; dims=1)
   c′ = @. sigmoid_fast(forget) * c + sigmoid_fast(input) * tanh_fast(cell)
@@ -270,8 +270,8 @@ function (m::LSTMCell)(x::AbstractVecOrMat, (h, c))
   return h′, c′
 end
 
-Base.show(io::IO, l::LSTMCell) =
-  print(io, "LSTMCell(", size(l.Wi, 2), " => ", size(l.Wi, 1)÷4, ")")
+Base.show(io::IO, m::LSTMCell) =  # one-line summary: LSTMCell(in => out)
+  print(io, "LSTMCell(", size(m.Wi, 2), " => ", size(m.Wi, 1)÷4, ")")  # ÷4: the forward pass chunks Wi's output into 4 row blocks
 
 
 @doc raw""""
@@ -431,12 +431,17 @@ function GRUCell((in, out)::Pair; init = glorot_uniform, bias = true)
   return GRUCell(Wi, Wh, b)
 end
 
+(m::GRUCell)(x::AbstractVecOrMat) = m(x, zeros_like(x, size(m.Wh, 2)))  # no state supplied: use zeros_like(x, size(m.Wh, 2)) as the initial state
+
 function (m::GRUCell)(x::AbstractVecOrMat, h)
   _size_check(m, x, 1 => size(m.Wi,2))
-  Wi, Wh, b = m.Wi, m.Wh, m.b
-  gxs = chunk(Wi * x, 3, dims=1)
-  ghs = chunk(Wh * h, 3, dims=1)
-  bs = chunk(b, 3, dims=1)
+  gxs = chunk(m.Wi * x, 3, dims=1)
+  ghs = chunk(m.Wh * h, 3, dims=1)
+  if m.b isa AbstractArray
+    bs = chunk(m.b, 3, dims=1)
+  else # b == false
+    bs = [false, false, false]
+  end
   r = @. sigmoid_fast(gxs[1] + ghs[1] + bs[1])
   z = @. sigmoid_fast(gxs[2] + ghs[2] + bs[2])
   h̃ = @. tanh_fast(gxs[3] + r * ghs[3] + bs[3])
@@ -444,8 +449,8 @@ function (m::GRUCell)(x::AbstractVecOrMat, h)
   return h′
 end
 
-Base.show(io::IO, l::GRUCell) =
-  print(io, "GRUCell(", size(l.Wi, 2), " => ", size(l.Wi, 1)÷3, ")")
+Base.show(io::IO, m::GRUCell) =  # one-line summary: GRUCell(in => out)
+  print(io, "GRUCell(", size(m.Wi, 2), " => ", size(m.Wi, 1)÷3, ")")  # ÷3: the forward pass chunks Wi * x into 3 row blocks
 
 @doc raw"""
     GRU(in => out; init = glorot_uniform, bias = true)
@@ -507,6 +512,7 @@ end
 function (m::GRU)(x, h)
   @assert ndims(x) == 2 || ndims(x) == 3
   h′ = []
+  # [x] = [in, L] or [in, L, B]
   for x_t in eachslice(x, dims=2)
     h = m.cell(x_t, h)
     h′ = vcat(h′, [h])
@@ -573,19 +579,22 @@ end
 
 function (m::GRUv3Cell)(x::AbstractVecOrMat, h)
   _size_check(m, x, 1 => size(m.Wi,2))
-  Wi, Wh, b, Wh_h̃ = m.Wi, m.Wh, m.b, m.Wh_h̃
-  gxs = chunk(Wi * x, 3, dims=1)
-  ghs = chunk(Wh * h, 2, dims=1)
-  bs = chunk(b, 3, dims=1)
+  gxs = chunk(m.Wi * x, 3, dims=1)  # input projection split into 3 row blocks (used for r, z and h̃)
+  ghs = chunk(m.Wh * h, 3, dims=1)  # NOTE(review): was 2 chunks before; only ghs[1] and ghs[2] are read below — confirm Wh's row count
+  if m.b isa AbstractArray  # bias present: split it the same way as gxs
+    bs = chunk(m.b, 3, dims=1)
+  else # m.b == false
+    bs = [false, false, false]  # bias disabled: adding `false` leaves each sum unchanged
+  end
   r = @. sigmoid_fast(gxs[1] + ghs[1] + bs[1])
   z = @. sigmoid_fast(gxs[2] + ghs[2] + bs[2])
-  h̃ = tanh_fast.(gxs[3] .+ (Wh_h̃ * (r .* h)) .+ bs[3])
+  h̃ = tanh_fast.(gxs[3] .+ (m.Wh_h̃ * (r .* h)) .+ bs[3])  # r scales h before the Wh_h̃ product
   h′ = @. (1 - z) * h̃ + z * h
   return h′
 end
 
-Base.show(io::IO, l::GRUv3Cell) =
-  print(io, "GRUv3Cell(", size(l.Wi, 2), " => ", size(l.Wi, 1)÷3, ")")
+Base.show(io::IO, m::GRUv3Cell) =  # one-line summary: GRUv3Cell(in => out)
+  print(io, "GRUv3Cell(", size(m.Wi, 2), " => ", size(m.Wi, 1)÷3, ")")  # ÷3: the forward pass chunks Wi * x into 3 row blocks
 
 
 @doc raw"""
diff --git a/test/ext_amdgpu/runtests.jl b/test/ext_amdgpu/runtests.jl
index ec779dedea..9dfbb41577 100644
--- a/test/ext_amdgpu/runtests.jl
+++ b/test/ext_amdgpu/runtests.jl
@@ -9,3 +9,8 @@ end
 @testset "Basic" begin
     include("basic.jl")
 end
+
+@testset "Recurrent" begin
+    global BROKEN_TESTS = Symbol[]  # must be global: `include` evaluates the file at module scope, not in this testset's local scope
+    include("../ext_common/recurrent_gpu_ad.jl")
+end
diff --git a/test/ext_common/recurrent_gpu_ad.jl b/test/ext_common/recurrent_gpu_ad.jl
new file mode 100644
index 0000000000..d2ef3fe34b
--- /dev/null
+++ b/test/ext_common/recurrent_gpu_ad.jl
@@ -0,0 +1,163 @@
+
+@testset "RNNCell GPU AD" begin
+    function loss(r, x, h)  # unroll cell `r` over the sequence `x`, starting from state `h`
+        y = []
+        for x_t in x
+            h = r(x_t, h)
+            y = vcat(y, [h])  # rebuilt with vcat instead of push! — presumably to avoid array mutation under Zygote; confirm
+        end
+        # Reduce over every step's state rather than only the final one.
+        y = stack(y, dims=2) # [D, L] or [D, L, B]
+        return mean(y)
+    end
+
+    d_in, d_out, len, batch_size = 2, 3, 4, 5
+    r = RNNCell(d_in => d_out)
+    x = [randn(Float32, d_in, batch_size) for _ in 1:len]  # `len` steps, each of size (d_in, batch_size)
+    h = zeros(Float32, d_out)  # initial state is a plain vector, without a batch dimension
+    # Single step (BROKEN_TESTS is not defined in this file; the including runtests.jl provides it)
+    @test test_gradients(r, x[1], h; test_gpu=true, compare_finite_diff=false) broken = :rnncell_single ∈ BROKEN_TESTS
+    # Multiple steps, using the unrolled loss above
+    @test test_gradients(r, x, h; test_gpu=true, compare_finite_diff=false, loss)  broken = :rnncell_multiple ∈ BROKEN_TESTS
+end
+
+@testset "RNN GPU AD" begin
+    struct ModelRNN  # bundles the RNN layer with the initial state that is fed to it
+        rnn::RNN
+        h0::AbstractVector
+    end
+
+    Flux.@layer :expand ModelRNN
+
+    (m::ModelRNN)(x) = m.rnn(x, m.h0)
+
+    d_in, d_out, len, batch_size = 2, 3, 4, 5
+    model = ModelRNN(RNN(d_in => d_out), zeros(Float32, d_out))
+    x_nobatch = randn(Float32, d_in, len)  # unbatched sequence: (d_in, len)
+    @test test_gradients(model, x_nobatch; test_gpu=true, compare_finite_diff=false)  broken = :rnn_nobatch ∈ BROKEN_TESTS
+    x = randn(Float32, d_in, len, batch_size)  # batched sequence: (d_in, len, batch_size), as in the LSTM/GRU tests of this file
+    @test test_gradients(model, x; test_gpu=true, compare_finite_diff=false)  broken = :rnn ∈ BROKEN_TESTS
+end
+
+@testset "LSTMCell GPU AD" begin
+
+    function loss(r, x, hc)  # unroll cell `r` over the sequence `x`, carrying the (h, c) state pair
+        h, c = hc
+        h′ = []
+        c′ = []
+        for x_t in x
+            h, c = r(x_t, (h, c))
+            h′ = vcat(h′, [h])
+            c′ = [c′..., c]
+        end
+        hnew = stack(h′, dims=2)
+        cnew = stack(c′, dims=2)
+        return mean(hnew) + mean(cnew)  # both state sequences contribute to the loss
+    end
+
+    d_in, d_out, len, batch_size = 2, 3, 4, 5
+    cell = LSTMCell(d_in => d_out)
+    x = [randn(Float32, d_in, batch_size) for _ in 1:len]  # `len` steps, each of size (d_in, batch_size)
+    h = zeros(Float32, d_out)
+    c = zeros(Float32, d_out)
+    # Single step: the loss only looks at the first element of the returned pair
+    @test test_gradients(cell, x[1], (h, c); test_gpu=true, compare_finite_diff=false,
+        loss = (m, x, (h, c)) -> mean(m(x, (h, c))[1]))  broken = :lstmcell_single ∈ BROKEN_TESTS
+    # Multiple steps, using the unrolled loss above
+    @test test_gradients(cell, x, (h, c); test_gpu=true, compare_finite_diff=false, loss)  broken = :lstmcell_multiple ∈ BROKEN_TESTS
+end
+
+@testset "LSTM GPU AD" begin
+    struct ModelLSTM  # bundles the LSTM layer with the initial (h0, c0) state that is fed to it
+        lstm::LSTM
+        h0::AbstractVector
+        c0::AbstractVector
+    end
+
+    Flux.@layer :expand ModelLSTM
+
+    (m::ModelLSTM)(x) = m.lstm(x, (m.h0, m.c0))
+
+    d_in, d_out, len, batch_size = 2, 3, 4, 5
+    model = ModelLSTM(LSTM(d_in => d_out), zeros(Float32, d_out), zeros(Float32, d_out))
+    x_nobatch = randn(Float32, d_in, len)  # unbatched sequence: (d_in, len)
+    @test test_gradients(model, x_nobatch; test_gpu=true, compare_finite_diff=false,
+        loss = (m, x) -> mean(m(x)[1])) broken = :lstm_nobatch ∈ BROKEN_TESTS
+    x = randn(Float32, d_in, len, batch_size)  # batched sequence: (d_in, len, batch_size)
+    @test test_gradients(model, x; test_gpu=true, compare_finite_diff=false,
+        loss = (m, x) -> mean(m(x)[1])) broken = :lstm ∈ BROKEN_TESTS
+end
+
+@testset "GRUCell GPU AD" begin
+    function loss(r, x, h)  # unroll cell `r` over the sequence `x`, starting from state `h`
+        y = []
+        for x_t in x
+            h = r(x_t, h)
+            y = vcat(y, [h])
+        end
+        y = stack(y, dims=2) # [D, L] or [D, L, B]
+        return mean(y)
+    end
+
+    d_in, d_out, len, batch_size = 2, 3, 4, 5
+    r = GRUCell(d_in => d_out)
+    x = [randn(Float32, d_in, batch_size) for _ in 1:len]  # `len` steps, each of size (d_in, batch_size)
+    h = zeros(Float32, d_out)  # initial state is a plain vector, without a batch dimension
+    @test test_gradients(r, x[1], h; test_gpu=true, compare_finite_diff=false) broken = :grucell_single ∈ BROKEN_TESTS
+    @test test_gradients(r, x, h; test_gpu=true, compare_finite_diff=false, loss) broken = :grucell_multiple ∈ BROKEN_TESTS
+end
+
+@testset "GRU GPU AD" begin
+    struct ModelGRU  # bundles the GRU layer with the initial state that is fed to it
+        gru::GRU
+        h0::AbstractVector
+    end
+
+    Flux.@layer :expand ModelGRU
+
+    (m::ModelGRU)(x) = m.gru(x, m.h0)
+
+    d_in, d_out, len, batch_size = 2, 3, 4, 5
+    model = ModelGRU(GRU(d_in => d_out), zeros(Float32, d_out))
+    x_nobatch = randn(Float32, d_in, len)  # unbatched sequence: (d_in, len)
+    @test test_gradients(model, x_nobatch; test_gpu=true, compare_finite_diff=false) broken = :gru_nobatch ∈ BROKEN_TESTS
+    x = randn(Float32, d_in, len, batch_size)  # batched sequence: (d_in, len, batch_size)
+    @test test_gradients(model, x; test_gpu=true, compare_finite_diff=false) broken = :gru ∈ BROKEN_TESTS
+end
+
+@testset "GRUv3Cell GPU AD" begin
+    function loss(r, x, h)  # unroll cell `r` over the sequence `x`, starting from state `h`
+        y = []
+        for x_t in x
+            h = r(x_t, h)
+            y = vcat(y, [h])
+        end
+        y = stack(y, dims=2) # [D, L] or [D, L, B]
+        return mean(y)
+    end
+
+    d_in, d_out, len, batch_size = 2, 3, 4, 5
+    r = GRUv3Cell(d_in => d_out)
+    x = [randn(Float32, d_in, batch_size) for _ in 1:len]  # `len` steps, each of size (d_in, batch_size)
+    h = zeros(Float32, d_out)  # initial state is a plain vector, without a batch dimension
+    @test test_gradients(r, x[1], h; test_gpu=true, compare_finite_diff=false) broken = :gruv3cell_single ∈ BROKEN_TESTS
+    @test test_gradients(r, x, h; test_gpu=true, compare_finite_diff=false, loss) broken = :gruv3cell_multiple ∈ BROKEN_TESTS
+end
+
+@testset "GRUv3 GPU AD" begin
+    struct ModelGRUv3  # bundles the GRUv3 layer with the initial state that is fed to it
+        gru::GRUv3
+        h0::AbstractVector
+    end
+
+    Flux.@layer :expand ModelGRUv3
+
+    (m::ModelGRUv3)(x) = m.gru(x, m.h0)
+    
+    d_in, d_out, len, batch_size = 2, 3, 4, 5
+    model = ModelGRUv3(GRUv3(d_in => d_out), zeros(Float32, d_out))
+    x_nobatch = randn(Float32, d_in, len)  # unbatched sequence: (d_in, len)
+    @test test_gradients(model, x_nobatch; test_gpu=true, compare_finite_diff=false) broken = :gruv3_nobatch ∈ BROKEN_TESTS
+    x = randn(Float32, d_in, len, batch_size)  # batched sequence: (d_in, len, batch_size)
+    @test test_gradients(model, x; test_gpu=true, compare_finite_diff=false) broken = :gruv3 ∈ BROKEN_TESTS
+end
diff --git a/test/ext_cuda/recurrent.jl b/test/ext_cuda/recurrent.jl
deleted file mode 100644
index 913e9d5269..0000000000
--- a/test/ext_cuda/recurrent.jl
+++ /dev/null
@@ -1,6 +0,0 @@
-@testset for R in (RNN,)
-  m = R(3 => 5)
-  x = randn(Float32, 3, 4)
-  h = randn(Float32, 5)
-  test_gradients(m, x, h, test_gpu=true, compare_finite_diff=false)
-end
diff --git a/test/ext_cuda/runtests.jl b/test/ext_cuda/runtests.jl
index d9802762c0..d7a6ba63c9 100644
--- a/test/ext_cuda/runtests.jl
+++ b/test/ext_cuda/runtests.jl
@@ -22,8 +22,9 @@ end
 @testset "cudnn" begin
   include("cudnn.jl")
 end
-@testset "recurrent" begin
-  include("recurrent.jl")
+@testset "Recurrent" begin
+  global BROKEN_TESTS = Symbol[]  # must be global: `include` evaluates the file at module scope, not in this testset's local scope
+  include("../ext_common/recurrent_gpu_ad.jl")
 end
 @testset "ctc" begin
   include("ctc.jl")
diff --git a/test/ext_metal/runtests.jl b/test/ext_metal/runtests.jl
index cb9532390e..5bb34caeb7 100644
--- a/test/ext_metal/runtests.jl
+++ b/test/ext_metal/runtests.jl
@@ -32,6 +32,11 @@ end
     include("basic.jl")
 end
 
+@testset "Recurrent" begin
+    global BROKEN_TESTS = [:lstm, :gru, :gruv3]  # must be global: `include` evaluates the file at module scope, not in this testset's local scope
+    include("../ext_common/recurrent_gpu_ad.jl")
+end
+
 @testset "Huber Loss test" begin
     X = Flux.gpu(Float32[0,1])
     Y = Flux.gpu(Float32[1,0])
diff --git a/test/layers/recurrent.jl b/test/layers/recurrent.jl
index 249aa3623c..98e072cdb1 100644
--- a/test/layers/recurrent.jl
+++ b/test/layers/recurrent.jl
@@ -160,3 +160,113 @@ end
     @test size(c) == (4, 3)
     test_gradients(model, x, loss = (m, x) -> mean(m(x)[1]))
 end
+
+@testset "GRUCell" begin
+    function loss(r, x, h)  # unroll cell `r` over the sequence `x`, starting from state `h`
+        y = []
+        for x_t in x
+            h = r(x_t, h)
+            y = vcat(y, [h])
+        end
+        y = stack(y, dims=2) # [D, L] or [D, L, B]
+        return mean(y.^2)
+    end
+
+    r = GRUCell(3 => 5)
+    @test length(Flux.trainables(r)) == 3
+    # An input sequence of length 6 and batch size 4.
+    x = [rand(Float32, 3, 4) for _ in 1:6]
+
+    # The initial state is a single vector.
+    h = randn(Float32, 5)
+    test_gradients(r, x, h; loss)
+
+    # Calling without a state must match calling with an explicit zero state.
+    @test r(x[1]) ≈ r(x[1], zeros(Float32, 5))
+
+    # Now initial state has a batch dimension.
+    h = randn(Float32, 5, 4)
+    test_gradients(r, x, h; loss)
+
+    # The input sequence has no batch dimension.
+    x = [rand(Float32, 3) for _ in 1:6]
+    h = randn(Float32, 5)
+    test_gradients(r, x, h; loss)
+
+    # Without a bias, one fewer trainable array is expected.
+    r = GRUCell(3 => 5, bias=false)
+    @test length(Flux.trainables(r)) == 2
+end
+
+@testset "GRU" begin
+    struct ModelGRU  # bundles the GRU layer with the initial state that is fed to it
+        gru::GRU
+        h0::AbstractVector
+    end
+
+    Flux.@layer :expand ModelGRU
+
+    (m::ModelGRU)(x) = m.gru(x, m.h0)
+
+    model = ModelGRU(GRU(2 => 4), zeros(Float32, 4))
+
+    x = rand(Float32, 2, 3, 1)  # 3-D input; the output below is expected to be (4, 3, 1)
+    y = model(x)
+    @test y isa Array{Float32, 3}
+    @test size(y) == (4, 3, 1)
+    test_gradients(model, x)
+
+    # Calling without a state must match calling with an explicit zero state.
+    gru = model.gru
+    @test gru(x) ≈ gru(x, zeros(Float32, 4))
+    
+    # Without a bias, only two trainable arrays are expected.
+    gru = GRU(2 => 4, bias=false)
+    @test length(Flux.trainables(gru)) == 2
+    test_gradients(gru, x)
+end
+
+@testset "GRUv3Cell" begin
+    r = GRUv3Cell(3 => 5)
+    @test length(Flux.trainables(r)) == 4
+    x = rand(Float32, 3, 4)  # a single input step with batch size 4
+
+    # The initial state is a single vector.
+    h = randn(Float32, 5)
+    test_gradients(r, x, h)
+
+    # Calling without a state must match calling with an explicit zero state.
+    @test r(x) ≈ r(x, zeros(Float32, 5))
+
+    # Now initial state has a batch dimension.
+    h = randn(Float32, 5, 4)
+    test_gradients(r, x, h)
+
+    # The input has no batch dimension.
+    x = rand(Float32, 3)
+    h = randn(Float32, 5)
+    test_gradients(r, x, h)
+end
+
+@testset "GRUv3" begin
+    struct ModelGRUv3  # bundles the GRUv3 layer with the initial state that is fed to it
+        gru::GRUv3
+        h0::AbstractVector
+    end
+
+    Flux.@layer :expand ModelGRUv3
+
+    (m::ModelGRUv3)(x) = m.gru(x, m.h0)
+
+    model = ModelGRUv3(GRUv3(2 => 4), zeros(Float32, 4))
+
+    x = rand(Float32, 2, 3, 1)  # 3-D input; the output below is expected to be (4, 3, 1)
+    y = model(x)
+    @test y isa Array{Float32, 3}
+    @test size(y) == (4, 3, 1)
+    test_gradients(model, x)
+
+    # Calling without a state must match calling with an explicit zero state.
+    gru = model.gru
+    @test gru(x) ≈ gru(x, zeros(Float32, 4))
+end
diff --git a/test/test_utils.jl b/test/test_utils.jl
index da55ebca03..25a4f1af47 100644
--- a/test/test_utils.jl
+++ b/test/test_utils.jl
@@ -48,7 +48,16 @@ function test_gradients(
     end
 
     ## Let's make sure first that the forward pass works.
-    @test loss(f, xs...) isa Number
+    l = loss(f, xs...)
+    @test l isa Number
+    if test_gpu
+        gpu_dev = gpu_device(force=true)
+        cpu_dev = cpu_device()
+        xs_gpu = xs |> gpu_dev
+        f_gpu = f |> gpu_dev
+        l_gpu = loss(f_gpu, xs_gpu...)
+        @test l_gpu isa Number
+    end
 
     if test_grad_x
         # Zygote gradient with respect to input.
@@ -64,11 +73,6 @@ function test_gradients(
         end
 
         if test_gpu
-            gpu_dev = gpu_device(force=true)
-            cpu_dev = cpu_device()
-            xs_gpu = xs |> gpu_dev
-            f_gpu = f |> gpu_dev
-
             # Zygote gradient with respect to input on GPU.
             y_gpu, g_gpu = Zygote.withgradient((xs...) -> loss(f_gpu, xs...), xs_gpu...)
             @test get_device(g_gpu) == get_device(xs_gpu)
@@ -92,11 +96,6 @@ function test_gradients(
         end
 
         if test_gpu
-            gpu_dev = gpu_device(force=true)
-            cpu_dev = cpu_device()
-            xs_gpu = xs |> gpu_dev
-            f_gpu = f |> gpu_dev
-
             # Zygote gradient with respect to f on GPU.
             y_gpu, g_gpu = Zygote.withgradient(f -> loss(f, xs_gpu...), f_gpu)
             # @test get_device(g_gpu) == get_device(xs_gpu)