require 'rnn' -- this imports 'nn' as well, and adds the rnn modules to the 'nn' namespace

n = 3     -- sequence length
d_in = 5  -- size of input vectors
d_hid = 5 -- size of RNN hidden state

lstm = nn.LSTM(d_in, d_hid) -- inherits from AbstractRecurrent; expects inputs in R^{d_in} and produces outputs in R^{d_hid}

data = torch.randn(n, d_in) -- a sequence of n random vectors x_1, x_2, x_3, each in R^{d_in}
outputs = torch.zeros(n, d_hid)
for i = 1, data:size(1) do
  outputs[i] = lstm:forward(data[i]) -- note that we don't need to keep track of the hidden states s_i ourselves
end
print(outputs)

-- the LSTM keeps its per-time-step states and cells around for us
print(lstm.outputs)
print(lstm.cells)
print(lstm.outputs[1])
print(lstm.outputs[2])
print(lstm.outputs[3])

-- note: this next bit doesn't actually work! it's just meant to convey how you might have to implement
-- backprop-through-time (BPTT) by hand, stepping backward through the sequence
dLdh_i = gradOutForFinalH()
dLdc_i = gradOutForFinalC()
for i = data:size(1), 1, -1 do
  dLdh_iminus1, dLdc_iminus1 = lstm:backward(data[i], {dLdh_i, dLdc_i})
  dLdh_i, dLdc_i = dLdh_iminus1, dLdc_iminus1
end

-- a Sequencer applies its wrapped module to every element of an input table
seqLin = nn.Sequencer(nn.Linear(d_in, d_hid))
t_i = torch.split(data, 1) -- split our n x d_in tensor into a table of n tensors, each 1 x d_in
t_o = seqLin:forward(t_i)
print(t_o) -- t_o gives the output of the Linear layer applied to each element in t_i

seqLSTM = nn.Sequencer(nn.LSTM(d_in, d_hid))
t_o = seqLSTM:forward(t_i)
print(t_o)

-- gradients w.r.t. the output at each time-step; all zero except the last, which we fill in below
gradOutput = torch.split(torch.zeros(n, d_hid), 1)
-- we generally get the nonzero part of gradOutput from a criterion, so for illustration we'll use an
-- MSECriterion and some random target data
mseCrit = nn.MSECriterion()
randTarget = torch.randn(t_o[n]:size())
mseCrit:forward(t_o[n], randTarget)
finalGradOut = mseCrit:backward(t_o[n], randTarget)
gradOutput[n] = finalGradOut

-- now we can do BPTT with a single call!
seqLSTM:backward(t_i, gradOutput)

-- we can fold the table-splitting into the network itself with a SplitTable
seqLSTM2 = nn.Sequential():add(nn.SplitTable(1)):add(nn.Sequencer(nn.LSTM(d_in, d_hid)))
print(seqLSTM2:forward(data))

seqLSTM3 = nn.Sequential():add(nn.SplitTable(1)):add(nn.Sequencer(nn.LSTM(d_in, d_hid)))
seqLSTM3:add(nn.SelectTable(-1)) -- select the last element in the output table
print(seqLSTM3:forward(data))
gradOutFinal = gradOutput[#gradOutput] -- note that gradOutFinal is just a tensor
seqLSTM3:backward(data, gradOutFinal)

-- our vocabulary
V = {["I"]=1, ["you"]=2, ["the"]=3, ["this"]=4, ["to"]=5, ["fire"]=6, ["Hey"]=7, ["is"]=8,
     ["just"]=9, ["zee"]=10, ["And"]=11, ["rain"]=13, ["cray"]=14, ["met"]=15, ["Set"]=16}

-- get indices of the words in each 5-gram
songData = torch.LongTensor({
  { V["Hey"], V["I"], V["just"], V["met"], V["you"] },
  { V["And"], V["this"], V["is"], V["cray"], V["zee"] },
  { V["Set"], V["fire"], V["to"], V["the"], V["rain"] }
})
masterpieceOrNot = torch.Tensor({{1}, -- #carlyrae4ever
                                 {1},
                                 {0}})
print(songData)

-- we'll use a LookupTable to map word indices into vectors in R^6
vocab_size = 16 -- (index 12 is unused)
embed_dim = 6
LT = nn.LookupTable(vocab_size, embed_dim)

-- using a Sequencer, let's make an LSTM that consumes a sequence of song-word embeddings
songLSTM = nn.Sequential()
songLSTM:add(LT) -- for a single sequence, will return a sequence-length x embedDim tensor
songLSTM:add(nn.SplitTable(1)) -- splits the tensor into a sequence-length table containing vectors of size embedDim
songLSTM:add(nn.Sequencer(nn.LSTM(embed_dim, embed_dim)))
songLSTM:add(nn.SelectTable(-1)) -- selects the last state of the LSTM
songLSTM:add(nn.Linear(embed_dim, 1)) -- map the last state to a score for classification
songLSTM:add(nn.Sigmoid()) -- convert the score to a probability

firstSongPred = songLSTM:forward(songData[1])
print(firstSongPred)

-- we can then use a simple BCE criterion for backprop
bceCrit = nn.BCECriterion()
loss = bceCrit:forward(firstSongPred, masterpieceOrNot[1])
dLdPred = bceCrit:backward(firstSongPred, masterpieceOrNot[1])
songLSTM:backward(songData[1], dLdPred)
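-- as a minimal sketch (the learning rate and the :zeroGradParameters()/:updateParameters() calls are
-- standard nn idioms chosen here purely for illustration, not something the library requires), a
-- complete SGD update step for this model might look like:
lr = 0.1                                              -- illustrative learning rate
songLSTM:zeroGradParameters()                         -- clear any previously accumulated gradients
pred = songLSTM:forward(songData[1])                  -- forward pass over one song
loss = bceCrit:forward(pred, masterpieceOrNot[1])     -- scalar loss
dLdPred = bceCrit:backward(pred, masterpieceOrNot[1]) -- gradient of the loss w.r.t. the prediction
songLSTM:backward(songData[1], dLdPred)               -- BPTT through the whole model
songLSTM:updateParameters(lr)                         -- vanilla SGD: params <- params - lr * gradParams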
-- data representing a sequence of length 3, with vectors in R^5 and a batch size of 2
batch_size = 2
batchSeqData = {torch.randn(batch_size, d_in), torch.randn(batch_size, d_in), torch.randn(batch_size, d_in)}
print(batchSeqData)
-- do a batched :forward() call
print(nn.Sequencer(nn.LSTM(d_in, d_hid)):forward(batchSeqData))

-- for batch inputs, it's a little easier to start with a sequence-length x batch-size tensor, so we transpose songData
songDataT = songData:t()
batchSongLSTM = nn.Sequential()
batchSongLSTM:add(LT) -- will return a sequence-length x batch-size x embedDim tensor
batchSongLSTM:add(nn.SplitTable(1, 3)) -- splits into a sequence-length table with batch-size x embedDim entries
print(batchSongLSTM:forward(songDataT)) -- sanity check

-- now let's add the LSTM stuff
batchSongLSTM:add(nn.Sequencer(nn.LSTM(embed_dim, embed_dim)))
batchSongLSTM:add(nn.SelectTable(-1)) -- selects the last state of the LSTM
batchSongLSTM:add(nn.Linear(embed_dim, 1)) -- map the last state to a score for classification
batchSongLSTM:add(nn.Sigmoid()) -- convert the score to a probability
songPreds = batchSongLSTM:forward(songDataT)
print(songPreds)

-- we can now call :backward() as follows
loss = bceCrit:forward(songPreds, masterpieceOrNot)
dLdPreds = bceCrit:backward(songPreds, masterpieceOrNot)
batchSongLSTM:backward(songDataT, dLdPreds)

-- you can use FastLSTMs just like ordinary LSTMs
print(nn.Sequencer(nn.FastLSTM(d_in, d_hid)):forward({torch.randn(batch_size, d_in), torch.randn(batch_size, d_in)}))

stackedSongLSTM = nn.Sequential():add(LT):add(nn.SplitTable(1, 3))
stackedSongLSTM:add(nn.Sequencer(nn.LSTM(embed_dim, embed_dim))) -- add the first layer
stackedSongLSTM:add(nn.Sequencer(nn.LSTM(embed_dim, embed_dim))) -- add the second layer
print(stackedSongLSTM:forward(songDataT))

-- let's make sure the recurrences happened as we expect
-- as a sanity check, first print out the embeddings we sent into the LSTM
print(LT:forward(songDataT))
-- now let's look at the first-layer LSTM's input at the third time-step; it should match the 3rd entry above!
firstLayerLSTM = stackedSongLSTM:get(3):get(1) -- the Sequencer was the 3rd module added, and its first child is the LSTM
print(firstLayerLSTM.inputs[3])
-- now let's look at the first-layer LSTM's output at the 3rd time-step
print(firstLayerLSTM.outputs[3])

-- let's now examine the second-layer LSTM and its input
secondLayerLSTM = stackedSongLSTM:get(4):get(1)
print(secondLayerLSTM.inputs[3]) -- should match the OUTPUT of firstLayerLSTM at the 3rd time-step

-- we can put Dropout between the stacked layers by wrapping it in a Sequencer as well
stackedSongLSTMDO = nn.Sequential():add(LT):add(nn.SplitTable(1, 3))
stackedSongLSTMDO:add(nn.Sequencer(nn.LSTM(embed_dim, embed_dim))) -- add the first layer
stackedSongLSTMDO:add(nn.Sequencer(nn.Dropout(0.5)))
stackedSongLSTMDO:add(nn.Sequencer(nn.LSTM(embed_dim, embed_dim))) -- add the second layer
print(stackedSongLSTMDO:forward(songDataT))

-- in general, a training epoch looks something like the following (pseudocode, not runnable as-is):
-- while stillInEpoch do
--   batch = next batch of sequence-length x batch-size inputs
--   lstm:forward(batch)
--   gradOuts = gradOutput for each time-step (for each element in the batch)
--   lstm:backward(batch, gradOuts)
-- end

-- let's make another LSTM, this time one that remembers its state across :forward() calls
rememberLSTM = nn.Sequential():add(LT):add(nn.SplitTable(1, 3))
seqLSTM = nn.Sequencer(nn.LSTM(embed_dim, embed_dim))
-- possible arguments for :remember() are 'eval', 'train', 'both', or 'neither', which tell it whether to
-- remember the hidden state only during evaluation (but not training), only during training, both, or neither
seqLSTM:remember('both') -- :remember() typically only needs to be called once
rememberLSTM:add(seqLSTM)

-- since we're remembering, we expect feeding in the same sequence twice in a row to give different outputs
-- (the first time the initial state is the zero vector; the second time it's the final state from the previous pass)
print(rememberLSTM:forward(songDataT)[5]) -- printing out just the final time-step
-- let's do it again with the same sequence!
print(rememberLSTM:forward(songDataT)[5]) -- printing out just the final time-step

-- we can forget our history, though, by calling :forget()
seqLSTM:forget()
print(rememberLSTM:forward(songDataT)[5]) -- printing out just the final time-step

-- if we use :remember('neither') (or :remember('eval') while in training mode), :forget() is called internally before each :forward()
seqLSTM:remember('neither')
print(rememberLSTM:forward(songDataT)[5]) -- printing out just the final time-step
-- now the output doesn't change if we call :forward() twice
print(rememberLSTM:forward(songDataT)[5]) -- printing out just the final time-step
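-- as a minimal sketch of where :remember()/:forget() pay off, here is one way truncated BPTT over a long
-- sequence processed in consecutive chunks might look; the names (chunkLSTM, longSeq, ...), the chunk size,
-- and the all-zero gradOutputs are illustrative assumptions (in a real model the per-chunk gradients would
-- come from a criterion)
chunkLSTM = nn.Sequencer(nn.LSTM(d_in, d_hid))
chunkLSTM:remember('both')                    -- carry the hidden state from one chunk to the next
longSeq = torch.randn(4 * n, d_in)            -- a longer sequence, processed n steps at a time
chunks = torch.split(longSeq, n)              -- a table of 4 chunks, each n x d_in
chunkLSTM:forget()                            -- start the new sequence from the zero state
for _, chunk in ipairs(chunks) do
  chunkIn = torch.split(chunk, 1)             -- table of n tensors, each 1 x d_in
  chunkOut = chunkLSTM:forward(chunkIn)       -- the first step of each chunk sees the previous chunk's final state
  chunkGradOut = torch.split(torch.zeros(n, d_hid), 1) -- placeholder gradients, for illustration only
  chunkLSTM:backward(chunkIn, chunkGradOut)   -- gradients only flow within this chunk (truncated BPTT)
end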