From 344c15542db3458dc639f127e6371b37ccfbc70f Mon Sep 17 00:00:00 2001 From: christopher5106 Date: Mon, 26 Sep 2016 21:37:32 +0200 Subject: [PATCH 1/7] add grid lstm --- Grid2DLSTM.lua | 221 +++++++++++++++++++++++++++++++++++++++++++++++++ init.lua | 2 + 2 files changed, 223 insertions(+) create mode 100644 Grid2DLSTM.lua diff --git a/Grid2DLSTM.lua b/Grid2DLSTM.lua new file mode 100644 index 0000000..05224f1 --- /dev/null +++ b/Grid2DLSTM.lua @@ -0,0 +1,221 @@ +local Grid2DLSTM, parent = torch.class("nn.Grid2DLSTM", 'nn.AbstractRecurrent') + +function lstm(h_t, h_d, prev_c, rnn_size) + local all_input_sums = nn.CAddTable()({h_t, h_d}) + local reshaped = nn.Reshape(4, rnn_size)(all_input_sums) + local n1, n2, n3, n4 = nn.SplitTable(2)(reshaped):split(4) + -- decode the gates + local in_gate = nn.Sigmoid()(n1) + local forget_gate = nn.Sigmoid()(n2) + local out_gate = nn.Sigmoid()(n3) + -- decode the write inputs + local in_transform = nn.Tanh()(n4) + -- perform the LSTM update + local next_c = nn.CAddTable()({ + nn.CMulTable()({forget_gate, prev_c}), + nn.CMulTable()({in_gate, in_transform}) + }) + -- gated cells form the output + local next_h = nn.CMulTable()({out_gate, nn.Tanh()(next_c)}) + return next_c, next_h +end + + +function Grid2DLSTM:__init(inputSize, outputSize, nb_layers, dropout, tie_weights, rho, cell2gate) + parent.__init(self, rho or 9999) + self.inputSize = inputSize + self.outputSize = outputSize or inputSize + self.should_tie_weights = tie_weights or true + self.dropout = dropout or 0 + self.nb_layers = nb_layers + -- build the model + self.cell2gate = (cell2gate == nil) and true or cell2gate + self.recurrentModule = self:buildModel() + -- make it work with nn.Container + self.modules[1] = self.recurrentModule + self.sharedClones[1] = self.recurrentModule + + -- for output(0), cell(0) and gradCell(T) + self.zeroTensor = torch.Tensor() + + self.cells = {} + self.gradCells = {} + +end + +function Grid2DLSTM:buildModel() + require 'nngraph' + assert(nngraph, "Missing nngraph package") + + -- There will be 2*n+1 inputs + local inputs = {} + table.insert(inputs, nn.Identity()()) -- input c for depth dimension + table.insert(inputs, nn.Identity()()) -- input h for depth dimension + for L = 1,self.nb_layers do + table.insert(inputs, nn.Identity()()) -- prev_c[L] for time dimension + table.insert(inputs, nn.Identity()()) -- prev_h[L] for time dimension + end + + local shared_weights + if self.should_tie_weights == true then shared_weights = {nn.Linear(self.outputSize, 4 * self.outputSize), nn.Linear(self.outputSize, 4 * self.outputSize)} end + + local outputs_t = {} -- Outputs being handed to the next time step along the time dimension + local outputs_d = {} -- Outputs being handed from one layer to the next along the depth dimension + + for L = 1,self.nb_layers do + -- Take hidden and memory cell from previous time steps + local prev_c_t = inputs[L*2+1] + local prev_h_t = inputs[L*2+2] + + if L == 1 then + -- We're in the first layer + prev_c_d = inputs[1] -- input_c_d: the starting depth dimension memory cell, just a zero vec. + prev_h_d = nn.LookupTable(self.inputSize, self.outputSize)(inputs[2]) -- input_h_d: the starting depth dimension hidden state. We map a char into hidden space via a lookup table + else + -- We're in the higher layers 2...N + -- Take hidden and memory cell from layers below + prev_c_d = outputs_d[((L-1)*2)-1] + prev_h_d = outputs_d[((L-1)*2)] + if self.dropout > 0 then prev_h_d = nn.Dropout(self.dropout)(prev_h_d):annotate{name='drop_' .. 
L} end -- apply dropout, if any + end + + -- Evaluate the input sums at once for efficiency + local t2h_t = nn.Linear(self.outputSize, 4 * self.outputSize)(prev_h_t):annotate{name='i2h_'..L} + local d2h_t = nn.Linear(self.outputSize, 4 * self.outputSize)(prev_h_d):annotate{name='h2h_'..L} + + -- Get transformed memory and hidden states pointing in the time direction first + local next_c_t, next_h_t = lstm(t2h_t, d2h_t, prev_c_t, self.outputSize) + + -- Pass memory cell and hidden state to next timestep + table.insert(outputs_t, next_c_t) + table.insert(outputs_t, next_h_t) + + -- Evaluate the input sums at once for efficiency + local t2h_d = nn.Linear(self.outputSize, 4 * self.outputSize)(next_h_t):annotate{name='i2h_'..L} + local d2h_d = nn.Linear(self.outputSize, 4 * self.outputSize)(prev_h_d):annotate{name='h2h_'..L} + + -- See section 3.5, "Weight Sharing" of http://arxiv.org/pdf/1507.01526.pdf + -- The weights along the temporal dimension are already tied (cloned many times in train.lua) + -- Here we can tie the weights along the depth dimension. Having invariance in computation + -- along the depth appears to be critical to solving the 15 digit addition problem w/ high accy. + -- See fig 4. to compare tied vs untied grid lstms on this task. + if self.should_tie_weights == true then + print("tying weights along the depth dimension") + t2h_d.data.module:share(shared_weights[1], 'weight', 'bias', 'gradWeight', 'gradBias') + d2h_d.data.module:share(shared_weights[2], 'weight', 'bias', 'gradWeight', 'gradBias') + end + + -- Create the lstm gated update pointing in the depth direction. + -- We 'prioritize' the depth dimension by using the updated temporal hidden state as input + -- instead of the previous temporal hidden state. This implements Section 3.2, "Priority Dimensions" + local next_c_d, next_h_d = lstm(t2h_d, d2h_d, prev_c_d, self.outputSize) + + -- Pass the depth dimension memory cell and hidden state to layer above + table.insert(outputs_d, next_c_d) + table.insert(outputs_d, next_h_d) + end + + -- set up the decoder + local top_h = outputs_d[#outputs_d] + table.insert(outputs_t, top_h) + + -- outputs_h contains + -- nb_layers x (next_c_t, next_h_t) + -- next-h + + return nn.gModule(inputs, outputs_t) + +end + +function Grid2DLSTM:updateOutput(input) + if self.step == 1 then + -- the initial state of the cell/hidden states + self.cells = {[0] = {}} + + for L=1,self.nb_layers do + local h_init = torch.zeros(input:size(1), self.outputSize):cuda() + table.insert(self.cells[0], h_init:clone()) + table.insert(self.cells[0], h_init:clone()) -- extra initial state for prev_c + end + end + + local input_mem_cell = torch.zeros(input:size(1), self.outputSize):float():cuda() + local rnn_inputs = {input_mem_cell, input, unpack(self.cells[self.step-1])} + local lst + if self.train ~= false then + self:recycle() + local recurrentModule = self:getStepModule(self.step) + -- the actual forward propagation + lst = recurrentModule:updateOutput(rnn_inputs) + else + lst = self.recurrentModule:updateOutput(rnn_inputs) + end + + self.cells[self.step] = {} + for i=1,#(self.cells[0]) do table.insert(self.cells[self.step], lst[i]) end -- extract the state, without output + + self.outputs[self.step] = lst[#lst] + self.output = lst[#lst] + self.cell = cell + + self.step = self.step + 1 + self.gradPrevOutput = nil + self.updateGradInputStep = nil + self.accGradParametersStep = nil + -- note that we don't return the cell, just the output + return self.output + +end + +function 
Grid2DLSTM:_updateGradInput(input, gradOutput) + assert(self.step > 1, "expecting at least one updateOutput") + local step = self.updateGradInputStep - 1 + assert(step >= 1) + + -- set the output/gradOutput states of current Module + local recurrentModule = self:getStepModule(step) + + -- backward propagate through this step + local gradCell = (step == self.step-1) and self.zeroTensor or self.gradCells[step] + + if (step == self.step-1) then + self.gradCells = {[step] = {}} + for L=1,self.nb_layers do + local h_init = torch.zeros(input:size(1), self.outputSize):cuda() + table.insert(self.gradCells[step], h_init:clone()) + table.insert(self.gradCells[step], h_init:clone()) -- extra initial state for prev_c + end + end + + table.insert(self.gradCells[step], gradOutput) + + local input_mem_cell = torch.zeros(input:size(1), self.outputSize):float():cuda() + local rnn_inputs = {input_mem_cell, input, unpack(self.cells[step-1])} + + local dlst = recurrentModule:updateGradInput(rnn_inputs, self.gradCells[step]) + self.gradCells[step-1] = {} + for k,v in pairs(dlst) do + if k > 2 then -- k <= skip_index is gradient on inputs, which we dont need + -- note we do k-1 because first item is dembeddings, and then follow the + -- derivatives of the state, starting at index 2. I know... + self.gradCells[step-1][k-2] = v + end + if k == 2 then gradInput = v end + end + return gradInput +end + +function Grid2DLSTM:_accGradParameters(input, gradOutput, scale) + local step = self.accGradParametersStep - 1 + assert(step >= 1) + + -- set the output/gradOutput states of current Module + local recurrentModule = self:getStepModule(step) + + -- backward propagate through this step + local input_mem_cell = torch.zeros(input:size(1), self.outputSize):float():cuda() + local rnn_inputs = {input_mem_cell, input, unpack(self.cells[step-1])} + + recurrentModule:accGradParameters( rnn_inputs, self.gradCells[step], scale) + +end diff --git a/init.lua b/init.lua index bd49ef1..ac902af 100644 --- a/init.lua +++ b/init.lua @@ -35,6 +35,8 @@ torch.include('rnn', 'AbstractRecurrent.lua') torch.include('rnn', 'Recurrent.lua') torch.include('rnn', 'LSTM.lua') torch.include('rnn', 'FastLSTM.lua') +torch.include('rnn', 'Grid2DLSTM.lua') + torch.include('rnn', 'GRU.lua') torch.include('rnn', 'Recursor.lua') torch.include('rnn', 'Recurrence.lua') From 32bd83e0ee211935c2f3938969d7171e6cd3d1cf Mon Sep 17 00:00:00 2001 From: christopher5106 Date: Tue, 27 Sep 2016 11:00:48 +0200 Subject: [PATCH 2/7] some corrections --- Grid2DLSTM.lua | 57 +++++++++++++++++++++++++++++++++----------------- 1 file changed, 38 insertions(+), 19 deletions(-) diff --git a/Grid2DLSTM.lua b/Grid2DLSTM.lua index 05224f1..a96068f 100644 --- a/Grid2DLSTM.lua +++ b/Grid2DLSTM.lua @@ -31,6 +31,7 @@ function Grid2DLSTM:__init(inputSize, outputSize, nb_layers, dropout, tie_weight -- build the model self.cell2gate = (cell2gate == nil) and true or cell2gate self.recurrentModule = self:buildModel() + -- make it work with nn.Container self.modules[1] = self.recurrentModule self.sharedClones[1] = self.recurrentModule @@ -41,6 +42,24 @@ function Grid2DLSTM:__init(inputSize, outputSize, nb_layers, dropout, tie_weight self.cells = {} self.gradCells = {} + -- initialization + -- local net_params = self.recurrentModule:parameters() + -- + -- for _, p in pairs(net_params) do + -- p:uniform(-0.08, 0.08) + -- end + -- + -- initialize the LSTM forget gates with slightly higher biases to encourage remembering in the beginning + for layer_idx = 1, opt.num_layers do + for _,node in 
ipairs(self.recurrentModule.forwardnodes) do + if node.data.annotations.name == "i2h_" .. layer_idx then + print('setting forget gate biases to 1 in LSTM layer ' .. layer_idx) + -- the gates are, in order, i,f,o,g, so f is the 2nd block of weights + node.data.module.bias[{{self.outputSize+1, 2*self.outputSize}}]:fill(1.0) + end + end + end + end function Grid2DLSTM:buildModel() @@ -128,18 +147,18 @@ function Grid2DLSTM:buildModel() end function Grid2DLSTM:updateOutput(input) - if self.step == 1 then - -- the initial state of the cell/hidden states - self.cells = {[0] = {}} - - for L=1,self.nb_layers do - local h_init = torch.zeros(input:size(1), self.outputSize):cuda() - table.insert(self.cells[0], h_init:clone()) - table.insert(self.cells[0], h_init:clone()) -- extra initial state for prev_c - end - end - + -- if self.step == 1 then + -- -- the initial state of the cell/hidden states + -- self.cells = {[0] = {}} + -- + -- for L=1,self.nb_layers do + -- local h_init = torch.zeros(input:size(1), self.outputSize):cuda() + -- table.insert(self.cells[0], h_init:clone()) + -- table.insert(self.cells[0], h_init:clone()) -- extra initial state for prev_c + -- end + -- end local input_mem_cell = torch.zeros(input:size(1), self.outputSize):float():cuda() + -- print(self.cells[self.step-1]) local rnn_inputs = {input_mem_cell, input, unpack(self.cells[self.step-1])} local lst if self.train ~= false then @@ -152,11 +171,10 @@ function Grid2DLSTM:updateOutput(input) end self.cells[self.step] = {} - for i=1,#(self.cells[0]) do table.insert(self.cells[self.step], lst[i]) end -- extract the state, without output + for i=1,#(self.cells[0]) do table.insert(self.cells[self.step], lst[i]) end self.outputs[self.step] = lst[#lst] self.output = lst[#lst] - self.cell = cell self.step = self.step + 1 self.gradPrevOutput = nil @@ -176,8 +194,6 @@ function Grid2DLSTM:_updateGradInput(input, gradOutput) local recurrentModule = self:getStepModule(step) -- backward propagate through this step - local gradCell = (step == self.step-1) and self.zeroTensor or self.gradCells[step] - if (step == self.step-1) then self.gradCells = {[step] = {}} for L=1,self.nb_layers do @@ -185,22 +201,25 @@ function Grid2DLSTM:_updateGradInput(input, gradOutput) table.insert(self.gradCells[step], h_init:clone()) table.insert(self.gradCells[step], h_init:clone()) -- extra initial state for prev_c end + local input_mem_cell = torch.zeros(input:size(1), self.outputSize):float():cuda() + self.rnn_inputs = {input_mem_cell, input, unpack(self.cells[step-1])} end table.insert(self.gradCells[step], gradOutput) - local input_mem_cell = torch.zeros(input:size(1), self.outputSize):float():cuda() - local rnn_inputs = {input_mem_cell, input, unpack(self.cells[step-1])} - local dlst = recurrentModule:updateGradInput(rnn_inputs, self.gradCells[step]) + + local dlst = recurrentModule:updateGradInput(self.rnn_inputs, self.gradCells[step]) self.gradCells[step-1] = {} + local gradInput = {} for k,v in pairs(dlst) do if k > 2 then -- k <= skip_index is gradient on inputs, which we dont need -- note we do k-1 because first item is dembeddings, and then follow the -- derivatives of the state, starting at index 2. I know... 
self.gradCells[step-1][k-2] = v + else + table.insert(gradInput, v) end - if k == 2 then gradInput = v end end return gradInput end From a89a7283e91b631c37a584d6bb76e17e5cfc69fd Mon Sep 17 00:00:00 2001 From: christopher5106 Date: Wed, 28 Sep 2016 15:12:30 +0200 Subject: [PATCH 3/7] n --- Grid2DLSTM.lua | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid2DLSTM.lua b/Grid2DLSTM.lua index a96068f..b89df1a 100644 --- a/Grid2DLSTM.lua +++ b/Grid2DLSTM.lua @@ -50,7 +50,7 @@ function Grid2DLSTM:__init(inputSize, outputSize, nb_layers, dropout, tie_weight -- end -- -- initialize the LSTM forget gates with slightly higher biases to encourage remembering in the beginning - for layer_idx = 1, opt.num_layers do + for layer_idx = 1, self.nb_layers do for _,node in ipairs(self.recurrentModule.forwardnodes) do if node.data.annotations.name == "i2h_" .. layer_idx then print('setting forget gate biases to 1 in LSTM layer ' .. layer_idx) From 61ae656b4a769aff07df2c3572453fda962a2ea9 Mon Sep 17 00:00:00 2001 From: christopher5106 Date: Wed, 28 Sep 2016 15:24:29 +0200 Subject: [PATCH 4/7] initialization of hidden and cell states in the grid module --- Grid2DLSTM.lua | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/Grid2DLSTM.lua b/Grid2DLSTM.lua index b89df1a..06b064f 100644 --- a/Grid2DLSTM.lua +++ b/Grid2DLSTM.lua @@ -147,18 +147,19 @@ function Grid2DLSTM:buildModel() end function Grid2DLSTM:updateOutput(input) - -- if self.step == 1 then - -- -- the initial state of the cell/hidden states - -- self.cells = {[0] = {}} - -- - -- for L=1,self.nb_layers do - -- local h_init = torch.zeros(input:size(1), self.outputSize):cuda() - -- table.insert(self.cells[0], h_init:clone()) - -- table.insert(self.cells[0], h_init:clone()) -- extra initial state for prev_c - -- end - -- end + if (self.step == 1) and not self.cells[0] then + -- the initial state of the cell/hidden states + print("Initializing the cell/hidden states") + self.cells = {[0] = {}} + + for L=1,self.nb_layers do + local h_init = torch.zeros(input:size(1), self.outputSize):cuda() + table.insert(self.cells[0], h_init:clone()) + table.insert(self.cells[0], h_init:clone()) -- extra initial state for prev_c + end + end local input_mem_cell = torch.zeros(input:size(1), self.outputSize):float():cuda() - -- print(self.cells[self.step-1]) + local rnn_inputs = {input_mem_cell, input, unpack(self.cells[self.step-1])} local lst if self.train ~= false then From 97da470e94545ce6a9c64938487c273a4a71090e Mon Sep 17 00:00:00 2001 From: christopher5106 Date: Wed, 28 Sep 2016 16:20:23 +0200 Subject: [PATCH 5/7] taking lookup table out of module to be more generic --- Grid2DLSTM.lua | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/Grid2DLSTM.lua b/Grid2DLSTM.lua index 06b064f..502330e 100644 --- a/Grid2DLSTM.lua +++ b/Grid2DLSTM.lua @@ -21,10 +21,9 @@ function lstm(h_t, h_d, prev_c, rnn_size) end -function Grid2DLSTM:__init(inputSize, outputSize, nb_layers, dropout, tie_weights, rho, cell2gate) +function Grid2DLSTM:__init( outputSize, nb_layers, dropout, tie_weights, rho, cell2gate) parent.__init(self, rho or 9999) - self.inputSize = inputSize - self.outputSize = outputSize or inputSize + self.outputSize = outputSize self.should_tie_weights = tie_weights or true self.dropout = dropout or 0 self.nb_layers = nb_layers @@ -89,7 +88,7 @@ function Grid2DLSTM:buildModel() if L == 1 then -- We're in the first layer prev_c_d = inputs[1] -- input_c_d: the starting depth dimension memory 
cell, just a zero vec. - prev_h_d = nn.LookupTable(self.inputSize, self.outputSize)(inputs[2]) -- input_h_d: the starting depth dimension hidden state. We map a char into hidden space via a lookup table + prev_h_d = inputs[2] -- input_h_d: the starting depth dimension hidden state. else -- We're in the higher layers 2...N -- Take hidden and memory cell from layers below @@ -161,6 +160,7 @@ function Grid2DLSTM:updateOutput(input) local input_mem_cell = torch.zeros(input:size(1), self.outputSize):float():cuda() local rnn_inputs = {input_mem_cell, input, unpack(self.cells[self.step-1])} +-- print(input:size()) local lst if self.train ~= false then self:recycle() @@ -209,7 +209,6 @@ function Grid2DLSTM:_updateGradInput(input, gradOutput) table.insert(self.gradCells[step], gradOutput) - local dlst = recurrentModule:updateGradInput(self.rnn_inputs, self.gradCells[step]) self.gradCells[step-1] = {} local gradInput = {} @@ -222,7 +221,7 @@ function Grid2DLSTM:_updateGradInput(input, gradOutput) table.insert(gradInput, v) end end - return gradInput + return gradInput[2] end function Grid2DLSTM:_accGradParameters(input, gradOutput, scale) From 05c188ff488c123be0420919379ddbd9f215a038 Mon Sep 17 00:00:00 2001 From: christopher5106 Date: Wed, 28 Sep 2016 16:28:36 +0200 Subject: [PATCH 6/7] for printing --- Grid2DLSTM.lua | 1 + 1 file changed, 1 insertion(+) diff --git a/Grid2DLSTM.lua b/Grid2DLSTM.lua index 502330e..14de67b 100644 --- a/Grid2DLSTM.lua +++ b/Grid2DLSTM.lua @@ -23,6 +23,7 @@ end function Grid2DLSTM:__init( outputSize, nb_layers, dropout, tie_weights, rho, cell2gate) parent.__init(self, rho or 9999) + self.inputSize = outputSize self.outputSize = outputSize self.should_tie_weights = tie_weights or true self.dropout = dropout or 0 From 60656898def3fbeb21925979e15bdb8288ef5744 Mon Sep 17 00:00:00 2001 From: christopher5106 Date: Wed, 28 Sep 2016 16:56:39 +0200 Subject: [PATCH 7/7] using userPrevCell to initialize cells after forget --- Grid2DLSTM.lua | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/Grid2DLSTM.lua b/Grid2DLSTM.lua index 14de67b..b78be4e 100644 --- a/Grid2DLSTM.lua +++ b/Grid2DLSTM.lua @@ -23,7 +23,7 @@ end function Grid2DLSTM:__init( outputSize, nb_layers, dropout, tie_weights, rho, cell2gate) parent.__init(self, rho or 9999) - self.inputSize = outputSize + self.inputSize = outputSize -- for compatibility with tostring function in AbstractRecurrent self.outputSize = outputSize self.should_tie_weights = tie_weights or true self.dropout = dropout or 0 @@ -147,17 +147,24 @@ function Grid2DLSTM:buildModel() end function Grid2DLSTM:updateOutput(input) - if (self.step == 1) and not self.cells[0] then - -- the initial state of the cell/hidden states - print("Initializing the cell/hidden states") - self.cells = {[0] = {}} - for L=1,self.nb_layers do - local h_init = torch.zeros(input:size(1), self.outputSize):cuda() - table.insert(self.cells[0], h_init:clone()) - table.insert(self.cells[0], h_init:clone()) -- extra initial state for prev_c + if self.step == 1 then + -- the initial state of the cell/hidden states + if self.userPrevCell then + -- print("Initializing the cell/hidden states with user previous cell") + self.cells = { [0] = self.userPrevCell } + else + -- print("Initializing the cell/hidden states with zero") + self.cells = {[0] = {}} + + for L=1,self.nb_layers do + local h_init = torch.zeros(input:size(1), self.outputSize):cuda() + table.insert(self.cells[0], h_init:clone()) + table.insert(self.cells[0], h_init:clone()) -- 
extra initial state for prev_c + end end end + local input_mem_cell = torch.zeros(input:size(1), self.outputSize):float():cuda() local rnn_inputs = {input_mem_cell, input, unpack(self.cells[self.step-1])}
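
A rough usage sketch for the nn.Grid2DLSTM module added by this series, assuming
the constructor signature from the final patches, nn.Grid2DLSTM(outputSize,
nb_layers, dropout, tie_weights, rho, cell2gate). It is illustrative and untested:
the LookupTable front-end, the sizes and the nn.Sequencer wrapping are assumptions,
not part of the patches, and cunn/cutorch are required because the module
hard-codes :cuda() on its internal cell/hidden states.

  require 'rnn'
  require 'cunn'  -- Grid2DLSTM allocates its zero states with :cuda()

  local vocabSize, hiddenSize, nbLayers = 65, 128, 2  -- hypothetical sizes

  -- Patch 5 moved the LookupTable out of the module, so the caller maps
  -- discrete inputs to a batchSize x hiddenSize tensor before the grid LSTM.
  local step = nn.Sequential()
     :add(nn.LookupTable(vocabSize, hiddenSize))
     :add(nn.Grid2DLSTM(hiddenSize, nbLayers, 0.25, true, 50))
     :add(nn.Linear(hiddenSize, vocabSize))
     :add(nn.LogSoftMax())

  -- nn.Sequencer applies the step module over a table of per-timestep inputs.
  local model = nn.Sequencer(step):cuda()

  -- A toy batch: seqLen tensors of batchSize character indices.
  local batchSize, seqLen = 32, 50
  local inputs = {}
  for t = 1, seqLen do
     inputs[t] = torch.LongTensor(batchSize):random(1, vocabSize):cuda()
  end
  -- outputs is a table of seqLen tensors, each batchSize x vocabSize log-probabilities
  local outputs = model:forward(inputs)

The dropout, rho and tie_weights values above are placeholders; tie_weights=true
mirrors the depth-wise weight sharing of section 3.5 of the Grid LSTM paper that
buildModel implements.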