require 'rnn' -- this imports 'nn' as well, and adds the rnn modules to the 'nn' namespace

n = 3     -- sequence length
d_in = 5  -- size of input vectors
d_hid = 5 -- size of RNN hidden state

lstm = nn.LSTM(d_in, d_hid) -- inherits from AbstractRecurrent; expects inputs in R^{d_in} and produces outputs in R^{d_hid}

data = torch.randn(n, d_in) -- a sequence of n random vectors x_1, x_2, x_3, each in R^{d_in}
outputs = torch.zeros(n, d_hid)
for i = 1, data:size(1) do
  outputs[i] = lstm:forward(data[i]) -- note that we don't need to keep track of the hidden states s_i ourselves
end
print(outputs)

-- the LSTM keeps its per-time-step states and cells around for us
print(lstm.outputs)
print(lstm.cells)
print(lstm.outputs[1])
print(lstm.outputs[2])
print(lstm.outputs[3])

-- note: this next bit doesn't actually work! it's just meant to convey how you might have to implement
-- backprop-through-time (BPTT) by hand, stepping backward through the sequence
dLdh_i = gradOutForFinalH()
dLdc_i = gradOutForFinalC()
for i = data:size(1), 1, -1 do
  dLdh_iminus1, dLdc_iminus1 = lstm:backward(data[i], {dLdh_i, dLdc_i})
  dLdh_i, dLdc_i = dLdh_iminus1, dLdc_iminus1
end

-- a Sequencer applies its wrapped module to every element of an input table
seqLin = nn.Sequencer(nn.Linear(d_in, d_hid))
t_i = torch.split(data, 1) -- split our n x d_in tensor into a table of n tensors, each 1 x d_in
t_o = seqLin:forward(t_i)
print(t_o) -- t_o gives the output of the Linear layer applied to each element in t_i

seqLSTM = nn.Sequencer(nn.LSTM(d_in, d_hid))
t_o = seqLSTM:forward(t_i)
print(t_o)

-- gradients w.r.t. the output at each time-step; all zero except the last, which we fill in below
gradOutput = torch.split(torch.zeros(n, d_hid), 1)
-- we generally get the nonzero part of gradOutput from a criterion, so for illustration we'll use an
-- MSECriterion and some random target data
mseCrit = nn.MSECriterion()
randTarget = torch.randn(t_o[n]:size())
mseCrit:forward(t_o[n], randTarget)
finalGradOut = mseCrit:backward(t_o[n], randTarget)
gradOutput[n] = finalGradOut

-- now we can do BPTT with a single call!
seqLSTM:backward(t_i, gradOutput)

-- we can fold the table-splitting into the network itself with a SplitTable
seqLSTM2 = nn.Sequential():add(nn.SplitTable(1)):add(nn.Sequencer(nn.LSTM(d_in, d_hid)))
print(seqLSTM2:forward(data))

seqLSTM3 = nn.Sequential():add(nn.SplitTable(1)):add(nn.Sequencer(nn.LSTM(d_in, d_hid)))
seqLSTM3:add(nn.SelectTable(-1)) -- select the last element in the output table
print(seqLSTM3:forward(data))
gradOutFinal = gradOutput[#gradOutput] -- note that gradOutFinal is just a tensor
seqLSTM3:backward(data, gradOutFinal)

-- our vocabulary
V = {["I"]=1, ["you"]=2, ["the"]=3, ["this"]=4, ["to"]=5, ["fire"]=6, ["Hey"]=7, ["is"]=8,
     ["just"]=9, ["zee"]=10, ["And"]=11, ["rain"]=13, ["cray"]=14, ["met"]=15, ["Set"]=16}

-- get indices of the words in each 5-gram
songData = torch.LongTensor({
  { V["Hey"], V["I"], V["just"], V["met"], V["you"] },
  { V["And"], V["this"], V["is"], V["cray"], V["zee"] },
  { V["Set"], V["fire"], V["to"], V["the"], V["rain"] }
})
masterpieceOrNot = torch.Tensor({{1}, -- #carlyrae4ever
                                 {1},
                                 {0}})
print(songData)

-- we'll use a LookupTable to map word indices into vectors in R^6
vocab_size = 16 -- (index 12 is unused)
embed_dim = 6
LT = nn.LookupTable(vocab_size, embed_dim)

-- using a Sequencer, let's make an LSTM that consumes a sequence of song-word embeddings
songLSTM = nn.Sequential()
songLSTM:add(LT) -- for a single sequence, will return a sequence-length x embedDim tensor
songLSTM:add(nn.SplitTable(1)) -- splits the tensor into a sequence-length table containing vectors of size embedDim
songLSTM:add(nn.Sequencer(nn.LSTM(embed_dim, embed_dim)))
songLSTM:add(nn.SelectTable(-1)) -- selects the last state of the LSTM
songLSTM:add(nn.Linear(embed_dim, 1)) -- map the last state to a score for classification
songLSTM:add(nn.Sigmoid()) -- convert the score to a probability

firstSongPred = songLSTM:forward(songData[1])
print(firstSongPred)

-- we can then use a simple BCE criterion for backprop
bceCrit = nn.BCECriterion()
loss = bceCrit:forward(firstSongPred, masterpieceOrNot[1])
dLdPred = bceCrit:backward(firstSongPred, masterpieceOrNot[1])
songLSTM:backward(songData[1], dLdPred)
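-- as a minimal sketch (the learning rate and the :zeroGradParameters()/:updateParameters() calls are
-- standard nn idioms chosen here purely for illustration, not something the library requires), a
-- complete SGD update step for this model might look like:
lr = 0.1                                              -- illustrative learning rate
songLSTM:zeroGradParameters()                         -- clear any previously accumulated gradients
pred = songLSTM:forward(songData[1])                  -- forward pass over one song
loss = bceCrit:forward(pred, masterpieceOrNot[1])     -- scalar loss
dLdPred = bceCrit:backward(pred, masterpieceOrNot[1]) -- gradient of the loss w.r.t. the prediction
songLSTM:backward(songData[1], dLdPred)               -- BPTT through the whole model
songLSTM:updateParameters(lr)                         -- vanilla SGD: params <- params - lr * gradParams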
-- data representing a sequence of length 3, with vectors in R^5 and a batch size of 2
batch_size = 2
batchSeqData = {torch.randn(batch_size, d_in), torch.randn(batch_size, d_in), torch.randn(batch_size, d_in)}
print(batchSeqData)
-- do a batched :forward() call
print(nn.Sequencer(nn.LSTM(d_in, d_hid)):forward(batchSeqData))

-- for batch inputs, it's a little easier to start with a sequence-length x batch-size tensor, so we transpose songData
songDataT = songData:t()
batchSongLSTM = nn.Sequential()
batchSongLSTM:add(LT) -- will return a sequence-length x batch-size x embedDim tensor
batchSongLSTM:add(nn.SplitTable(1, 3)) -- splits into a sequence-length table with batch-size x embedDim entries
print(batchSongLSTM:forward(songDataT)) -- sanity check

-- now let's add the LSTM stuff
batchSongLSTM:add(nn.Sequencer(nn.LSTM(embed_dim, embed_dim)))
batchSongLSTM:add(nn.SelectTable(-1)) -- selects the last state of the LSTM
batchSongLSTM:add(nn.Linear(embed_dim, 1)) -- map the last state to a score for classification
batchSongLSTM:add(nn.Sigmoid()) -- convert the score to a probability
songPreds = batchSongLSTM:forward(songDataT)
print(songPreds)

-- we can now call :backward() as follows
loss = bceCrit:forward(songPreds, masterpieceOrNot)
dLdPreds = bceCrit:backward(songPreds, masterpieceOrNot)
batchSongLSTM:backward(songDataT, dLdPreds)

-- you can use FastLSTMs just like ordinary LSTMs
print(nn.Sequencer(nn.FastLSTM(d_in, d_hid)):forward({torch.randn(batch_size, d_in), torch.randn(batch_size, d_in)}))

stackedSongLSTM = nn.Sequential():add(LT):add(nn.SplitTable(1, 3))
stackedSongLSTM:add(nn.Sequencer(nn.LSTM(embed_dim, embed_dim))) -- add the first layer
stackedSongLSTM:add(nn.Sequencer(nn.LSTM(embed_dim, embed_dim))) -- add the second layer
print(stackedSongLSTM:forward(songDataT))

-- let's make sure the recurrences happened as we expect
-- as a sanity check, first print out the embeddings we sent into the LSTM
print(LT:forward(songDataT))
-- now let's look at the first-layer LSTM's input at the third time-step; it should match the 3rd entry above!
firstLayerLSTM = stackedSongLSTM:get(3):get(1) -- the Sequencer was the 3rd module added, and its first child is the LSTM
print(firstLayerLSTM.inputs[3])
-- now let's look at the first-layer LSTM's output at the 3rd time-step
print(firstLayerLSTM.outputs[3])

-- let's now examine the second-layer LSTM and its input
secondLayerLSTM = stackedSongLSTM:get(4):get(1)
print(secondLayerLSTM.inputs[3]) -- should match the OUTPUT of firstLayerLSTM at the 3rd time-step

-- we can put Dropout between the stacked layers by wrapping it in a Sequencer as well
stackedSongLSTMDO = nn.Sequential():add(LT):add(nn.SplitTable(1, 3))
stackedSongLSTMDO:add(nn.Sequencer(nn.LSTM(embed_dim, embed_dim))) -- add the first layer
stackedSongLSTMDO:add(nn.Sequencer(nn.Dropout(0.5)))
stackedSongLSTMDO:add(nn.Sequencer(nn.LSTM(embed_dim, embed_dim))) -- add the second layer
print(stackedSongLSTMDO:forward(songDataT))

-- in general, a training epoch looks something like the following (pseudocode, not runnable as-is):
-- while stillInEpoch do
--   batch = next batch of sequence-length x batch-size inputs
--   lstm:forward(batch)
--   gradOuts = gradOutput for each time-step (for each element in the batch)
--   lstm:backward(batch, gradOuts)
-- end

-- let's make another LSTM, this time one that remembers its state across :forward() calls
rememberLSTM = nn.Sequential():add(LT):add(nn.SplitTable(1, 3))
seqLSTM = nn.Sequencer(nn.LSTM(embed_dim, embed_dim))
-- possible arguments for :remember() are 'eval', 'train', 'both', or 'neither', which tell it whether to
-- remember the hidden state only during evaluation (but not training), only during training, both, or neither
seqLSTM:remember('both') -- :remember() typically only needs to be called once
rememberLSTM:add(seqLSTM)

-- since we're remembering, we expect feeding in the same sequence twice in a row to give different outputs
-- (the first time the initial state is the zero vector; the second time it's the final state from the previous pass)
print(rememberLSTM:forward(songDataT)[5]) -- printing out just the final time-step
-- let's do it again with the same sequence!
print(rememberLSTM:forward(songDataT)[5]) -- printing out just the final time-step

-- we can forget our history, though, by calling :forget()
seqLSTM:forget()
print(rememberLSTM:forward(songDataT)[5]) -- printing out just the final time-step

-- if we use :remember('neither') (or :remember('eval') while in training mode), :forget() is called internally before each :forward()
seqLSTM:remember('neither')
print(rememberLSTM:forward(songDataT)[5]) -- printing out just the final time-step
-- now the output doesn't change if we call :forward() twice
print(rememberLSTM:forward(songDataT)[5]) -- printing out just the final time-step
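-- as a minimal sketch of where :remember()/:forget() pay off, here is one way truncated BPTT over a long
-- sequence processed in consecutive chunks might look; the names (chunkLSTM, longSeq, ...), the chunk size,
-- and the all-zero gradOutputs are illustrative assumptions (in a real model the per-chunk gradients would
-- come from a criterion)
chunkLSTM = nn.Sequencer(nn.LSTM(d_in, d_hid))
chunkLSTM:remember('both')                    -- carry the hidden state from one chunk to the next
longSeq = torch.randn(4 * n, d_in)            -- a longer sequence, processed n steps at a time
chunks = torch.split(longSeq, n)              -- a table of 4 chunks, each n x d_in
chunkLSTM:forget()                            -- start the new sequence from the zero state
for _, chunk in ipairs(chunks) do
  chunkIn = torch.split(chunk, 1)             -- table of n tensors, each 1 x d_in
  chunkOut = chunkLSTM:forward(chunkIn)       -- the first step of each chunk sees the previous chunk's final state
  chunkGradOut = torch.split(torch.zeros(n, d_hid), 1) -- placeholder gradients, for illustration only
  chunkLSTM:backward(chunkIn, chunkGradOut)   -- gradients only flow within this chunk (truncated BPTT)
end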