From 344c15542db3458dc639f127e6371b37ccfbc70f Mon Sep 17 00:00:00 2001 From: christopher5106 Date: Mon, 26 Sep 2016 21:37:32 +0200 Subject: [PATCH 1/7] add grid lstm --- Grid2DLSTM.lua | 221 +++++++++++++++++++++++++++++++++++++++++++++++++ init.lua | 2 + 2 files changed, 223 insertions(+) create mode 100644 Grid2DLSTM.lua diff --git a/Grid2DLSTM.lua b/Grid2DLSTM.lua new file mode 100644 index 0000000..05224f1 --- /dev/null +++ b/Grid2DLSTM.lua @@ -0,0 +1,221 @@ +local Grid2DLSTM, parent = torch.class("nn.Grid2DLSTM", 'nn.AbstractRecurrent') + +function lstm(h_t, h_d, prev_c, rnn_size) + local all_input_sums = nn.CAddTable()({h_t, h_d}) + local reshaped = nn.Reshape(4, rnn_size)(all_input_sums) + local n1, n2, n3, n4 = nn.SplitTable(2)(reshaped):split(4) + -- decode the gates + local in_gate = nn.Sigmoid()(n1) + local forget_gate = nn.Sigmoid()(n2) + local out_gate = nn.Sigmoid()(n3) + -- decode the write inputs + local in_transform = nn.Tanh()(n4) + -- perform the LSTM update + local next_c = nn.CAddTable()({ + nn.CMulTable()({forget_gate, prev_c}), + nn.CMulTable()({in_gate, in_transform}) + }) + -- gated cells form the output + local next_h = nn.CMulTable()({out_gate, nn.Tanh()(next_c)}) + return next_c, next_h +end + + +function Grid2DLSTM:__init(inputSize, outputSize, nb_layers, dropout, tie_weights, rho, cell2gate) + parent.__init(self, rho or 9999) + self.inputSize = inputSize + self.outputSize = outputSize or inputSize + self.should_tie_weights = tie_weights or true + self.dropout = dropout or 0 + self.nb_layers = nb_layers + -- build the model + self.cell2gate = (cell2gate == nil) and true or cell2gate + self.recurrentModule = self:buildModel() + -- make it work with nn.Container + self.modules[1] = self.recurrentModule + self.sharedClones[1] = self.recurrentModule + + -- for output(0), cell(0) and gradCell(T) + self.zeroTensor = torch.Tensor() + + self.cells = {} + self.gradCells = {} + +end + +function Grid2DLSTM:buildModel() + require 'nngraph' + assert(nngraph, "Missing nngraph package") + + -- There will be 2*n+1 inputs + local inputs = {} + table.insert(inputs, nn.Identity()()) -- input c for depth dimension + table.insert(inputs, nn.Identity()()) -- input h for depth dimension + for L = 1,self.nb_layers do + table.insert(inputs, nn.Identity()()) -- prev_c[L] for time dimension + table.insert(inputs, nn.Identity()()) -- prev_h[L] for time dimension + end + + local shared_weights + if self.should_tie_weights == true then shared_weights = {nn.Linear(self.outputSize, 4 * self.outputSize), nn.Linear(self.outputSize, 4 * self.outputSize)} end + + local outputs_t = {} -- Outputs being handed to the next time step along the time dimension + local outputs_d = {} -- Outputs being handed from one layer to the next along the depth dimension + + for L = 1,self.nb_layers do + -- Take hidden and memory cell from previous time steps + local prev_c_t = inputs[L*2+1] + local prev_h_t = inputs[L*2+2] + + if L == 1 then + -- We're in the first layer + prev_c_d = inputs[1] -- input_c_d: the starting depth dimension memory cell, just a zero vec. + prev_h_d = nn.LookupTable(self.inputSize, self.outputSize)(inputs[2]) -- input_h_d: the starting depth dimension hidden state. We map a char into hidden space via a lookup table + else + -- We're in the higher layers 2...N + -- Take hidden and memory cell from layers below + prev_c_d = outputs_d[((L-1)*2)-1] + prev_h_d = outputs_d[((L-1)*2)] + if self.dropout > 0 then prev_h_d = nn.Dropout(self.dropout)(prev_h_d):annotate{name='drop_' .. 
L} end -- apply dropout, if any + end + + -- Evaluate the input sums at once for efficiency + local t2h_t = nn.Linear(self.outputSize, 4 * self.outputSize)(prev_h_t):annotate{name='i2h_'..L} + local d2h_t = nn.Linear(self.outputSize, 4 * self.outputSize)(prev_h_d):annotate{name='h2h_'..L} + + -- Get transformed memory and hidden states pointing in the time direction first + local next_c_t, next_h_t = lstm(t2h_t, d2h_t, prev_c_t, self.outputSize) + + -- Pass memory cell and hidden state to next timestep + table.insert(outputs_t, next_c_t) + table.insert(outputs_t, next_h_t) + + -- Evaluate the input sums at once for efficiency + local t2h_d = nn.Linear(self.outputSize, 4 * self.outputSize)(next_h_t):annotate{name='i2h_'..L} + local d2h_d = nn.Linear(self.outputSize, 4 * self.outputSize)(prev_h_d):annotate{name='h2h_'..L} + + -- See section 3.5, "Weight Sharing" of http://arxiv.org/pdf/1507.01526.pdf + -- The weights along the temporal dimension are already tied (cloned many times in train.lua) + -- Here we can tie the weights along the depth dimension. Having invariance in computation + -- along the depth appears to be critical to solving the 15 digit addition problem w/ high accy. + -- See fig 4. to compare tied vs untied grid lstms on this task. + if self.should_tie_weights == true then + print("tying weights along the depth dimension") + t2h_d.data.module:share(shared_weights[1], 'weight', 'bias', 'gradWeight', 'gradBias') + d2h_d.data.module:share(shared_weights[2], 'weight', 'bias', 'gradWeight', 'gradBias') + end + + -- Create the lstm gated update pointing in the depth direction. + -- We 'prioritize' the depth dimension by using the updated temporal hidden state as input + -- instead of the previous temporal hidden state. This implements Section 3.2, "Priority Dimensions" + local next_c_d, next_h_d = lstm(t2h_d, d2h_d, prev_c_d, self.outputSize) + + -- Pass the depth dimension memory cell and hidden state to layer above + table.insert(outputs_d, next_c_d) + table.insert(outputs_d, next_h_d) + end + + -- set up the decoder + local top_h = outputs_d[#outputs_d] + table.insert(outputs_t, top_h) + + -- outputs_h contains + -- nb_layers x (next_c_t, next_h_t) + -- next-h + + return nn.gModule(inputs, outputs_t) + +end + +function Grid2DLSTM:updateOutput(input) + if self.step == 1 then + -- the initial state of the cell/hidden states + self.cells = {[0] = {}} + + for L=1,self.nb_layers do + local h_init = torch.zeros(input:size(1), self.outputSize):cuda() + table.insert(self.cells[0], h_init:clone()) + table.insert(self.cells[0], h_init:clone()) -- extra initial state for prev_c + end + end + + local input_mem_cell = torch.zeros(input:size(1), self.outputSize):float():cuda() + local rnn_inputs = {input_mem_cell, input, unpack(self.cells[self.step-1])} + local lst + if self.train ~= false then + self:recycle() + local recurrentModule = self:getStepModule(self.step) + -- the actual forward propagation + lst = recurrentModule:updateOutput(rnn_inputs) + else + lst = self.recurrentModule:updateOutput(rnn_inputs) + end + + self.cells[self.step] = {} + for i=1,#(self.cells[0]) do table.insert(self.cells[self.step], lst[i]) end -- extract the state, without output + + self.outputs[self.step] = lst[#lst] + self.output = lst[#lst] + self.cell = cell + + self.step = self.step + 1 + self.gradPrevOutput = nil + self.updateGradInputStep = nil + self.accGradParametersStep = nil + -- note that we don't return the cell, just the output + return self.output + +end + +function 
Grid2DLSTM:_updateGradInput(input, gradOutput) + assert(self.step > 1, "expecting at least one updateOutput") + local step = self.updateGradInputStep - 1 + assert(step >= 1) + + -- set the output/gradOutput states of current Module + local recurrentModule = self:getStepModule(step) + + -- backward propagate through this step + local gradCell = (step == self.step-1) and self.zeroTensor or self.gradCells[step] + + if (step == self.step-1) then + self.gradCells = {[step] = {}} + for L=1,self.nb_layers do + local h_init = torch.zeros(input:size(1), self.outputSize):cuda() + table.insert(self.gradCells[step], h_init:clone()) + table.insert(self.gradCells[step], h_init:clone()) -- extra initial state for prev_c + end + end + + table.insert(self.gradCells[step], gradOutput) + + local input_mem_cell = torch.zeros(input:size(1), self.outputSize):float():cuda() + local rnn_inputs = {input_mem_cell, input, unpack(self.cells[step-1])} + + local dlst = recurrentModule:updateGradInput(rnn_inputs, self.gradCells[step]) + self.gradCells[step-1] = {} + for k,v in pairs(dlst) do + if k > 2 then -- k <= skip_index is gradient on inputs, which we dont need + -- note we do k-1 because first item is dembeddings, and then follow the + -- derivatives of the state, starting at index 2. I know... + self.gradCells[step-1][k-2] = v + end + if k == 2 then gradInput = v end + end + return gradInput +end + +function Grid2DLSTM:_accGradParameters(input, gradOutput, scale) + local step = self.accGradParametersStep - 1 + assert(step >= 1) + + -- set the output/gradOutput states of current Module + local recurrentModule = self:getStepModule(step) + + -- backward propagate through this step + local input_mem_cell = torch.zeros(input:size(1), self.outputSize):float():cuda() + local rnn_inputs = {input_mem_cell, input, unpack(self.cells[step-1])} + + recurrentModule:accGradParameters( rnn_inputs, self.gradCells[step], scale) + +end diff --git a/init.lua b/init.lua index bd49ef1..ac902af 100644 --- a/init.lua +++ b/init.lua @@ -35,6 +35,8 @@ torch.include('rnn', 'AbstractRecurrent.lua') torch.include('rnn', 'Recurrent.lua') torch.include('rnn', 'LSTM.lua') torch.include('rnn', 'FastLSTM.lua') +torch.include('rnn', 'Grid2DLSTM.lua') + torch.include('rnn', 'GRU.lua') torch.include('rnn', 'Recursor.lua') torch.include('rnn', 'Recurrence.lua') From 32bd83e0ee211935c2f3938969d7171e6cd3d1cf Mon Sep 17 00:00:00 2001 From: christopher5106 Date: Tue, 27 Sep 2016 11:00:48 +0200 Subject: [PATCH 2/7] some corrections --- Grid2DLSTM.lua | 57 +++++++++++++++++++++++++++++++++----------------- 1 file changed, 38 insertions(+), 19 deletions(-) diff --git a/Grid2DLSTM.lua b/Grid2DLSTM.lua index 05224f1..a96068f 100644 --- a/Grid2DLSTM.lua +++ b/Grid2DLSTM.lua @@ -31,6 +31,7 @@ function Grid2DLSTM:__init(inputSize, outputSize, nb_layers, dropout, tie_weight -- build the model self.cell2gate = (cell2gate == nil) and true or cell2gate self.recurrentModule = self:buildModel() + -- make it work with nn.Container self.modules[1] = self.recurrentModule self.sharedClones[1] = self.recurrentModule @@ -41,6 +42,24 @@ function Grid2DLSTM:__init(inputSize, outputSize, nb_layers, dropout, tie_weight self.cells = {} self.gradCells = {} + -- initialization + -- local net_params = self.recurrentModule:parameters() + -- + -- for _, p in pairs(net_params) do + -- p:uniform(-0.08, 0.08) + -- end + -- + -- initialize the LSTM forget gates with slightly higher biases to encourage remembering in the beginning + for layer_idx = 1, opt.num_layers do + for _,node in 
ipairs(self.recurrentModule.forwardnodes) do + if node.data.annotations.name == "i2h_" .. layer_idx then + print('setting forget gate biases to 1 in LSTM layer ' .. layer_idx) + -- the gates are, in order, i,f,o,g, so f is the 2nd block of weights + node.data.module.bias[{{self.outputSize+1, 2*self.outputSize}}]:fill(1.0) + end + end + end + end function Grid2DLSTM:buildModel() @@ -128,18 +147,18 @@ function Grid2DLSTM:buildModel() end function Grid2DLSTM:updateOutput(input) - if self.step == 1 then - -- the initial state of the cell/hidden states - self.cells = {[0] = {}} - - for L=1,self.nb_layers do - local h_init = torch.zeros(input:size(1), self.outputSize):cuda() - table.insert(self.cells[0], h_init:clone()) - table.insert(self.cells[0], h_init:clone()) -- extra initial state for prev_c - end - end - + -- if self.step == 1 then + -- -- the initial state of the cell/hidden states + -- self.cells = {[0] = {}} + -- + -- for L=1,self.nb_layers do + -- local h_init = torch.zeros(input:size(1), self.outputSize):cuda() + -- table.insert(self.cells[0], h_init:clone()) + -- table.insert(self.cells[0], h_init:clone()) -- extra initial state for prev_c + -- end + -- end local input_mem_cell = torch.zeros(input:size(1), self.outputSize):float():cuda() + -- print(self.cells[self.step-1]) local rnn_inputs = {input_mem_cell, input, unpack(self.cells[self.step-1])} local lst if self.train ~= false then @@ -152,11 +171,10 @@ function Grid2DLSTM:updateOutput(input) end self.cells[self.step] = {} - for i=1,#(self.cells[0]) do table.insert(self.cells[self.step], lst[i]) end -- extract the state, without output + for i=1,#(self.cells[0]) do table.insert(self.cells[self.step], lst[i]) end self.outputs[self.step] = lst[#lst] self.output = lst[#lst] - self.cell = cell self.step = self.step + 1 self.gradPrevOutput = nil @@ -176,8 +194,6 @@ function Grid2DLSTM:_updateGradInput(input, gradOutput) local recurrentModule = self:getStepModule(step) -- backward propagate through this step - local gradCell = (step == self.step-1) and self.zeroTensor or self.gradCells[step] - if (step == self.step-1) then self.gradCells = {[step] = {}} for L=1,self.nb_layers do @@ -185,22 +201,25 @@ function Grid2DLSTM:_updateGradInput(input, gradOutput) table.insert(self.gradCells[step], h_init:clone()) table.insert(self.gradCells[step], h_init:clone()) -- extra initial state for prev_c end + local input_mem_cell = torch.zeros(input:size(1), self.outputSize):float():cuda() + self.rnn_inputs = {input_mem_cell, input, unpack(self.cells[step-1])} end table.insert(self.gradCells[step], gradOutput) - local input_mem_cell = torch.zeros(input:size(1), self.outputSize):float():cuda() - local rnn_inputs = {input_mem_cell, input, unpack(self.cells[step-1])} - local dlst = recurrentModule:updateGradInput(rnn_inputs, self.gradCells[step]) + + local dlst = recurrentModule:updateGradInput(self.rnn_inputs, self.gradCells[step]) self.gradCells[step-1] = {} + local gradInput = {} for k,v in pairs(dlst) do if k > 2 then -- k <= skip_index is gradient on inputs, which we dont need -- note we do k-1 because first item is dembeddings, and then follow the -- derivatives of the state, starting at index 2. I know... 
self.gradCells[step-1][k-2] = v + else + table.insert(gradInput, v) end - if k == 2 then gradInput = v end end return gradInput end From a89a7283e91b631c37a584d6bb76e17e5cfc69fd Mon Sep 17 00:00:00 2001 From: christopher5106 Date: Wed, 28 Sep 2016 15:12:30 +0200 Subject: [PATCH 3/7] n --- Grid2DLSTM.lua | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid2DLSTM.lua b/Grid2DLSTM.lua index a96068f..b89df1a 100644 --- a/Grid2DLSTM.lua +++ b/Grid2DLSTM.lua @@ -50,7 +50,7 @@ function Grid2DLSTM:__init(inputSize, outputSize, nb_layers, dropout, tie_weight -- end -- -- initialize the LSTM forget gates with slightly higher biases to encourage remembering in the beginning - for layer_idx = 1, opt.num_layers do + for layer_idx = 1, self.nb_layers do for _,node in ipairs(self.recurrentModule.forwardnodes) do if node.data.annotations.name == "i2h_" .. layer_idx then print('setting forget gate biases to 1 in LSTM layer ' .. layer_idx) From 61ae656b4a769aff07df2c3572453fda962a2ea9 Mon Sep 17 00:00:00 2001 From: christopher5106 Date: Wed, 28 Sep 2016 15:24:29 +0200 Subject: [PATCH 4/7] initialization of hidden and cell states in the grid module --- Grid2DLSTM.lua | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/Grid2DLSTM.lua b/Grid2DLSTM.lua index b89df1a..06b064f 100644 --- a/Grid2DLSTM.lua +++ b/Grid2DLSTM.lua @@ -147,18 +147,19 @@ function Grid2DLSTM:buildModel() end function Grid2DLSTM:updateOutput(input) - -- if self.step == 1 then - -- -- the initial state of the cell/hidden states - -- self.cells = {[0] = {}} - -- - -- for L=1,self.nb_layers do - -- local h_init = torch.zeros(input:size(1), self.outputSize):cuda() - -- table.insert(self.cells[0], h_init:clone()) - -- table.insert(self.cells[0], h_init:clone()) -- extra initial state for prev_c - -- end - -- end + if (self.step == 1) and not self.cells[0] then + -- the initial state of the cell/hidden states + print("Initializing the cell/hidden states") + self.cells = {[0] = {}} + + for L=1,self.nb_layers do + local h_init = torch.zeros(input:size(1), self.outputSize):cuda() + table.insert(self.cells[0], h_init:clone()) + table.insert(self.cells[0], h_init:clone()) -- extra initial state for prev_c + end + end local input_mem_cell = torch.zeros(input:size(1), self.outputSize):float():cuda() - -- print(self.cells[self.step-1]) + local rnn_inputs = {input_mem_cell, input, unpack(self.cells[self.step-1])} local lst if self.train ~= false then From 97da470e94545ce6a9c64938487c273a4a71090e Mon Sep 17 00:00:00 2001 From: christopher5106 Date: Wed, 28 Sep 2016 16:20:23 +0200 Subject: [PATCH 5/7] taking lookup table out of module to be more generic --- Grid2DLSTM.lua | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/Grid2DLSTM.lua b/Grid2DLSTM.lua index 06b064f..502330e 100644 --- a/Grid2DLSTM.lua +++ b/Grid2DLSTM.lua @@ -21,10 +21,9 @@ function lstm(h_t, h_d, prev_c, rnn_size) end -function Grid2DLSTM:__init(inputSize, outputSize, nb_layers, dropout, tie_weights, rho, cell2gate) +function Grid2DLSTM:__init( outputSize, nb_layers, dropout, tie_weights, rho, cell2gate) parent.__init(self, rho or 9999) - self.inputSize = inputSize - self.outputSize = outputSize or inputSize + self.outputSize = outputSize self.should_tie_weights = tie_weights or true self.dropout = dropout or 0 self.nb_layers = nb_layers @@ -89,7 +88,7 @@ function Grid2DLSTM:buildModel() if L == 1 then -- We're in the first layer prev_c_d = inputs[1] -- input_c_d: the starting depth dimension memory 
cell, just a zero vec. - prev_h_d = nn.LookupTable(self.inputSize, self.outputSize)(inputs[2]) -- input_h_d: the starting depth dimension hidden state. We map a char into hidden space via a lookup table + prev_h_d = inputs[2] -- input_h_d: the starting depth dimension hidden state. else -- We're in the higher layers 2...N -- Take hidden and memory cell from layers below @@ -161,6 +160,7 @@ function Grid2DLSTM:updateOutput(input) local input_mem_cell = torch.zeros(input:size(1), self.outputSize):float():cuda() local rnn_inputs = {input_mem_cell, input, unpack(self.cells[self.step-1])} +-- print(input:size()) local lst if self.train ~= false then self:recycle() @@ -209,7 +209,6 @@ function Grid2DLSTM:_updateGradInput(input, gradOutput) table.insert(self.gradCells[step], gradOutput) - local dlst = recurrentModule:updateGradInput(self.rnn_inputs, self.gradCells[step]) self.gradCells[step-1] = {} local gradInput = {} @@ -222,7 +221,7 @@ function Grid2DLSTM:_updateGradInput(input, gradOutput) table.insert(gradInput, v) end end - return gradInput + return gradInput[2] end function Grid2DLSTM:_accGradParameters(input, gradOutput, scale) From 05c188ff488c123be0420919379ddbd9f215a038 Mon Sep 17 00:00:00 2001 From: christopher5106 Date: Wed, 28 Sep 2016 16:28:36 +0200 Subject: [PATCH 6/7] for printing --- Grid2DLSTM.lua | 1 + 1 file changed, 1 insertion(+) diff --git a/Grid2DLSTM.lua b/Grid2DLSTM.lua index 502330e..14de67b 100644 --- a/Grid2DLSTM.lua +++ b/Grid2DLSTM.lua @@ -23,6 +23,7 @@ end function Grid2DLSTM:__init( outputSize, nb_layers, dropout, tie_weights, rho, cell2gate) parent.__init(self, rho or 9999) + self.inputSize = outputSize self.outputSize = outputSize self.should_tie_weights = tie_weights or true self.dropout = dropout or 0 From 60656898def3fbeb21925979e15bdb8288ef5744 Mon Sep 17 00:00:00 2001 From: christopher5106 Date: Wed, 28 Sep 2016 16:56:39 +0200 Subject: [PATCH 7/7] using userPrevCell to initialize cells after forget --- Grid2DLSTM.lua | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/Grid2DLSTM.lua b/Grid2DLSTM.lua index 14de67b..b78be4e 100644 --- a/Grid2DLSTM.lua +++ b/Grid2DLSTM.lua @@ -23,7 +23,7 @@ end function Grid2DLSTM:__init( outputSize, nb_layers, dropout, tie_weights, rho, cell2gate) parent.__init(self, rho or 9999) - self.inputSize = outputSize + self.inputSize = outputSize -- for compatibility with tostring function in AbstractRecurrent self.outputSize = outputSize self.should_tie_weights = tie_weights or true self.dropout = dropout or 0 @@ -147,17 +147,24 @@ function Grid2DLSTM:buildModel() end function Grid2DLSTM:updateOutput(input) - if (self.step == 1) and not self.cells[0] then - -- the initial state of the cell/hidden states - print("Initializing the cell/hidden states") - self.cells = {[0] = {}} - for L=1,self.nb_layers do - local h_init = torch.zeros(input:size(1), self.outputSize):cuda() - table.insert(self.cells[0], h_init:clone()) - table.insert(self.cells[0], h_init:clone()) -- extra initial state for prev_c + if self.step == 1 then + -- the initial state of the cell/hidden states + if self.userPrevCell then + -- print("Initializing the cell/hidden states with user previous cell") + self.cells = { [0] = self.userPrevCell } + else + -- print("Initializing the cell/hidden states with zero") + self.cells = {[0] = {}} + + for L=1,self.nb_layers do + local h_init = torch.zeros(input:size(1), self.outputSize):cuda() + table.insert(self.cells[0], h_init:clone()) + table.insert(self.cells[0], h_init:clone()) -- 
extra initial state for prev_c + end end end + local input_mem_cell = torch.zeros(input:size(1), self.outputSize):float():cuda() local rnn_inputs = {input_mem_cell, input, unpack(self.cells[self.step-1])}
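
A rough usage sketch for the nn.Grid2DLSTM module added by this series, assuming
the constructor signature from the final patches, nn.Grid2DLSTM(outputSize,
nb_layers, dropout, tie_weights, rho, cell2gate). It is illustrative and untested:
the LookupTable front-end, the sizes and the nn.Sequencer wrapping are assumptions,
not part of the patches, and cunn/cutorch are required because the module
hard-codes :cuda() on its internal cell/hidden states.

  require 'rnn'
  require 'cunn'  -- Grid2DLSTM allocates its zero states with :cuda()

  local vocabSize, hiddenSize, nbLayers = 65, 128, 2  -- hypothetical sizes

  -- Patch 5 moved the LookupTable out of the module, so the caller maps
  -- discrete inputs to a batchSize x hiddenSize tensor before the grid LSTM.
  local step = nn.Sequential()
     :add(nn.LookupTable(vocabSize, hiddenSize))
     :add(nn.Grid2DLSTM(hiddenSize, nbLayers, 0.25, true, 50))
     :add(nn.Linear(hiddenSize, vocabSize))
     :add(nn.LogSoftMax())

  -- nn.Sequencer applies the step module over a table of per-timestep inputs.
  local model = nn.Sequencer(step):cuda()

  -- A toy batch: seqLen tensors of batchSize character indices.
  local batchSize, seqLen = 32, 50
  local inputs = {}
  for t = 1, seqLen do
     inputs[t] = torch.LongTensor(batchSize):random(1, vocabSize):cuda()
  end
  -- outputs is a table of seqLen tensors, each batchSize x vocabSize log-probabilities
  local outputs = model:forward(inputs)

The dropout, rho and tie_weights values above are placeholders; tie_weights=true
mirrors the depth-wise weight sharing of section 3.5 of the Grid LSTM paper that
buildModel implements.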