#!/usr/bin/env luajit

--[[

   dyncnn is a deep-learning algorithm for the prediction of
   interacting object dynamics

   Copyright (c) 2016 Idiap Research Institute, http://www.idiap.ch/
   Written by Francois Fleuret <francois.fleuret@idiap.ch>

   This file is part of dyncnn.

   dyncnn is free software: you can redistribute it and/or modify it
   under the terms of the GNU General Public License version 3 as
   published by the Free Software Foundation.

   dyncnn is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with dyncnn.  If not, see <http://www.gnu.org/licenses/>.

]]--

require 'torch'
require 'nn'
require 'optim'
require 'image'

require 'fftb'

----------------------------------------------------------------------
-- Command line arguments

-- Declaration of every command line option of the script. The
-- cmd:text() calls only insert section titles between the groups of
-- options; the order of the calls is therefore meaningful and should
-- be preserved.
local cmd = torch.CmdLine()

cmd:text('General setup')

-- NOTE(review): defaultNbThreads and defaultUseGPU are not defined in
-- this file. They are presumably globals set by the 'fftb' module
-- required above, from the environment variables named in the help
-- strings -- TODO confirm.
cmd:option('-seed', 1, 'initial random seed')
cmd:option('-nbThreads', defaultNbThreads, 'how many threads (environment variable TORCH_NB_THREADS)')
cmd:option('-useGPU', defaultUseGPU, 'should we use cuda (environment variable TORCH_USE_GPU)')
cmd:option('-fastGPU', true, 'should we go as fast as possible, possibly non-deterministically')

cmd:text('')
cmd:text('Log')

-- -resultFreq is tested in trainModel(): a value of 0 or less
-- disables the periodic model copies and result images
cmd:option('-resultFreq', 100, 'at which epoch frequency should we save result images')
-- When -exampleInternals is not empty and a saved model is found, the
-- main section only writes the internal activation images and exits
cmd:option('-exampleInternals', '', 'list of comma-separated indices for inner activation images')
cmd:option('-noLog', false, 'should we prevent logging')
cmd:option('-rundir', '', 'the directory for results')
cmd:option('-deltaImages', false, 'should we highlight the difference in result images')

cmd:text('')
cmd:text('Network structure')

-- These three values are passed as is to createModel()
cmd:option('-filterSize', 5)
cmd:option('-nbChannels', 16)
cmd:option('-nbBlocks', 8)

cmd:text('')
cmd:text('Training')

cmd:option('-nbEpochs', 1000, 'nb of epochs for the heavy setting')
cmd:option('-learningRate', 0.1, 'learning rate')
cmd:option('-batchSize', 128, 'size of the mini-batches')
-- The train, validation and test samples are read, in that order, as
-- consecutive ranges of the same numbered image files (see the main
-- section at the end of the file)
cmd:option('-nbTrainSamples', 32768)
cmd:option('-nbValidationSamples', 1024)
cmd:option('-nbTestSamples', 1024)

cmd:text('')
cmd:text('Problem to solve')

cmd:option('-dataDir', './data/10p-mg', 'data directory')

-- NOTE(review): presumably prefixes what is printed with a 'DYNCNN'
-- tag and a time stamp -- confirm against the torch.CmdLine
-- documentation
cmd:addTime('DYNCNN','%F %T')

-- `params` is a global: the functions defined below read it as a free
-- variable. Helpers defined outside this file may rely on it too
-- (TODO confirm), so do not make it local without checking them.
params = cmd:parse(arg)

----------------------------------------------------------------------

fftbInit(cmd, params)

-- Shell commands describing the context of the run (date, machine,
-- current git commit). Each one is handed to logCommand(), which is
-- defined outside this file and presumably logs the command output.
local contextCommands = { 'date', 'uname -a', 'git log -1 --format=%H' }

for k = 1, #contextCommands do
   logCommand(contextCommands[k])
end

----------------------------------------------------------------------

-- Loads `nb` consecutive samples from params.dataDir, starting with
-- the sample of (1-based) index `first`, and returns them in a table
-- with the fields
--
--   name       the `name` argument, used later to build file names
--   nbSamples  the `nb` argument
--   width      64
--   height     64
--   input      ffnn.SlowTensor of size nb x 2 x height x width
--   target     ffnn.SlowTensor of size nb x 1 x height x width
--
-- Each sample is read from one png file which is cut into tiles of
-- size height x width: the two tiles of the first row of tiles are
-- the two input channels, and the second tile of the second row is
-- the target.
function loadData(first, nb, name)
   print('Loading data `' .. name .. '\'.')

   local height, width = 64, 64

   local data = {
      name = name,
      nbSamples = nb,
      width = width,
      height = height,
      input = ffnn.SlowTensor(nb, 2, height, width),
      target = ffnn.SlowTensor(nb, 1, height, width),
   }

   for i = 1, data.nbSamples do
      -- The files are numbered from 0 while `first` and `i` count from 1
      local fileIndex = (i - 1) + (first - 1)

      local path = string.format('%s/%03d/dyn_%06d.png',
                                 params.dataDir,
                                 math.floor(fileIndex / 1000), fileIndex)

      local frame = image.load(path)

      -- Invert the values, then keep the maximum over the channels
      frame:mul(-1.0):add(1.0)
      frame = frame:max(1):select(1, 1)

      local sample = data.input[i]

      -- First row of tiles, second column
      sample[1]:copy(frame:sub(1, height, width + 1, 2 * width))

      -- First row of tiles, first column
      sample[2]:copy(frame:sub(1, height, 1, width))

      -- Second row of tiles, second column
      data.target[i][1]:copy(frame:sub(height + 1, 2 * height, width + 1, 2 * width))
   end

   return data
end

----------------------------------------------------------------------

-- Appends to collection.outputs the `output` fields of the modules of
-- `model`, and keeps collection.nb equal to the number of stored
-- entries.
--
-- A module of type 'nn.Sequential' is never stored itself: its
-- sub-modules are visited recursively, in order. Any other module is
-- stored if its output is a tensor and if either `which` is nil or
-- `which` has a true entry for the torch type name of the module.
function collectAllOutputs(model, collection, which)
   if torch.type(model) == 'nn.Sequential' then
      for _, submodule in ipairs(model.modules) do
         collectAllOutputs(submodule, collection, which)
      end
      return
   end

   local selected = (not which) or which[torch.type(model)]

   if selected and torch.isTensor(model.output) then
      local slot = collection.nb + 1
      collection.nb = slot
      collection.outputs[slot] = model.output
   end
end

-- Runs `model` on the n-th sample of `data` and saves, in a single
-- image named <params.rundir>/internals_<data.name>_<n>.png, the
-- input, the outputs of all the ReLU modules and the final output of
-- the model.
function saveInternalsImage(model, data, n)
   -- The sample is copied explicitly so that the tensor given to the
   -- model is a ffnn.FastTensor
   local input = ffnn.FastTensor(1, 2, data.height, data.width)
   input:copy(data.input:narrow(1, n, 1))

   -- The result is read from the `output` fields of the modules
   model:forward(input)

   -- The first collected tensor is the input itself
   local collection = { outputs = { input }, nb = 1 }

   local reluTypes = {
      ['nn.ReLU'] = true,
      ['cunn.ReLU'] = true,
      ['cudnn.ReLU'] = true,
   }

   collectAllOutputs(model, collection, reluTypes)

   -- Append the final output, unless it was the last tensor collected
   local last = collection.outputs[collection.nb]

   if last ~= model.output then
      collection.nb = collection.nb + 1
      collection.outputs[collection.nb] = model.output
   end

   local fileName = string.format('%s/internals_%s_%06d.png',
                                  params.rundir,
                                  data.name, n)

   print('Saving ' .. fileName)
   image.save(fileName, imageFromTensors(collection.outputs))
end

----------------------------------------------------------------------

-- Returns the image `a`, possibly modulated to highlight where it
-- differs from the image `b`.
--
-- If params.deltaImages is not set, `a` itself is returned. Otherwise
-- the result is a new tensor equal to the component-wise product of
-- `a` with a weight h, where h is |a - b| divided by its maximum and
-- then mapped linearly to [0.1, 1.0]: the pixels where a and b differ
-- the most keep their value, the ones where they are equal are dimmed
-- by a factor 10.
--
-- `a` and `b` must be tensors of same size; neither is modified.
function highlightImage(a, b)
   if not params.deltaImages then
      return a
   end

   local h = torch.csub(a, b):abs()
   local peak = h:max()

   -- Normalize by the largest difference. This used to be
   -- h:div(1/peak), which multiplies by the peak instead of dividing
   -- by it. When a and b are identical the peak is 0 and h is left
   -- null, which avoids a 0/0 = NaN image.
   if peak > 0 then
      h:div(peak)
   end

   h:mul(0.9):add(0.1)

   return torch.cmul(a, h)
end

-- Runs `model` on the first samples of `data` and, for each of them,
-- saves in params.rundir an image result_<data.name>_<n>.png showing,
-- in a column, the two input channels, the target and the output of
-- the model (the last two going through highlightImage()).
--
-- The MSE loss of every sample is written, followed by the name of
-- the corresponding image, one sample per line, in the file
-- <params.rundir>/result_<data.name>_losses.dat
--
--   model   the network; it is switched to evaluation mode and left so
--   data    a table as returned by loadData()
--   nbMax   optional maximum number of samples to process, default 50
--
-- Raises an error if the losses file cannot be opened for writing.
function saveResultImage(model, data, nbMax)
   local criterion = nn.MSECriterion()

   if params.useGPU then
      print('Moving the criterion to the GPU.')
      criterion:cuda()
   end

   local input = ffnn.FastTensor(1, 2, data.height, data.width)
   local target = ffnn.FastTensor(1, 1, data.height, data.width)

   local nb = math.min(nbMax or 50, data.nbSamples)

   model:evaluate()

   printf('Write %d result images for `%s\'.', nb, data.name)

   -- Fail right away, with the message of io.open, if the file cannot
   -- be created
   local lossFileName = params.rundir .. '/result_' .. data.name .. '_losses.dat'
   local lossFile = assert(io.open(lossFileName, 'w'))

   for n = 1, nb do

      -- Explicitly copy to keep input as a ffnn.FastTensor
      input:copy(data.input:narrow(1, n, 1))
      target:copy(data.target:narrow(1, n, 1))

      local output = model:forward(input)
      local loss = criterion:forward(output, target)

      -- Copy the output into a ffnn.SlowTensor, like the tensors of
      -- `data` it is combined with below
      output = ffnn.SlowTensor(output:size()):copy(output)

      -- combineImages() is defined outside this file; it builds a
      -- single image from the description below

      local comp = {
         {
            vertical = true,
            { pad = 1, data.input[n][1] },
            { pad = 1, data.input[n][2] },
            { pad = 1, highlightImage(data.target[n][1], data.input[n][1]) },
            { pad = 1, highlightImage(output[1][1], data.input[n][1]) },
         }
      }

      local result = combineImages(1.0, comp)

      -- Invert the values (x -> 1 - x) before saving
      result:mul(-1.0):add(1.0)

      local fileName = string.format('result_%s_%06d.png', data.name, n)
      image.save(params.rundir .. '/' .. fileName, result)
      lossFile:write(string.format('%f %s\n', loss, fileName))
   end

   -- Flush the losses to disk and release the file handle, instead of
   -- leaving that to the garbage collector
   lossFile:close()
end

----------------------------------------------------------------------

-- Creates the stack of `nbBlocks` residual blocks of the model.
--
-- Each residual block computes
--
--   ReLU(BN(x + Conv(ReLU(BN(Conv(x))))))
--
-- where all the convolutions have `nbChannels` input and output
-- channels, filters of size filterSize x filterSize, a stride of 1
-- and a padding of (filterSize - 1) / 2.
--
-- Returns a nn.Identity if nbBlocks is 0, and a ffnn.Sequential
-- otherwise.
function createTower(filterSize, nbChannels, nbBlocks)

   if nbBlocks == 0 then
      return nn.Identity()
   end

   local padding = (filterSize - 1) / 2

   local function convolution()
      return ffnn.SpatialConvolution(nbChannels, nbChannels,
                                     filterSize, filterSize,
                                     1, 1,
                                     padding, padding)
   end

   local tower = ffnn.Sequential()

   for _ = 1, nbBlocks do
      -- The branch added to the identity. The modules are created in
      -- the same order as they are applied.
      local residual = ffnn.Sequential()
      residual:add(convolution())
      residual:add(ffnn.SpatialBatchNormalization(nbChannels))
      residual:add(ffnn.ReLU(true))
      residual:add(convolution())

      -- Apply both the residual branch and the identity to the input
      local branches = ffnn.ConcatTable()
      branches:add(residual)
      branches:add(ffnn.Identity())

      -- ... and sum the two results
      tower:add(branches)
      tower:add(ffnn.CAddTable(true))

      tower:add(ffnn.SpatialBatchNormalization(nbChannels))
      tower:add(ffnn.ReLU(true))
   end

   return tower
end

-- Creates the full network: an encoding convolution from the 2 input
-- channels to `nbChannels` channels followed by a batch normalization
-- and a ReLU, the tower of `nbBlocks` residual blocks built by
-- createTower(), and a decoding convolution down to 1 channel.
--
-- All the convolutions use filters of size filterSize x filterSize, a
-- stride of 1 and a padding of (filterSize - 1) / 2.
--
-- imageWidth and imageHeight are currently not used.
function createModel(imageWidth, imageHeight,
                     filterSize, nbChannels, nbBlocks)

   local padding = (filterSize - 1) / 2

   local function convolution(nbInputChannels, nbOutputChannels)
      return ffnn.SpatialConvolution(nbInputChannels, nbOutputChannels,
                                     filterSize, filterSize,
                                     1, 1,
                                     padding, padding)
   end

   local model = ffnn.Sequential()

   -- Encoder: from the two input channels (grasping image and starting
   -- configuration) to the internal number of channels
   model:add(convolution(2, nbChannels))
   model:add(ffnn.SpatialBatchNormalization(nbChannels))
   model:add(ffnn.ReLU(true))

   -- The resnet modules
   model:add(createTower(filterSize, nbChannels, nbBlocks))

   -- Decoder: down to the single channel of the final image
   model:add(convolution(nbChannels, 1))

   return model
end

----------------------------------------------------------------------

-- Trains `model` on `trainSet` with optim.sgd and a MSE criterion, for
-- the epochs from the one following model.epoch (or from 1 if that
-- field is not set) up to params.nbEpochs.
--
-- After every epoch, the average loss on `validationSet` is computed,
-- a summary line is printed, and the model is saved in
-- <params.rundir>/model_last.t7 together with the epoch number and
-- the state of the random generator, so that a later run can resume
-- from there. Every params.resultFreq epochs, a numbered copy of the
-- model and result images are saved too.
--
--   model          the network to train, modified in place and moved
--                  to the GPU if params.useGPU is set
--   trainSet       a table as returned by loadData()
--   validationSet  a table as returned by loadData()
--
-- Nothing is returned.
function trainModel(model, trainSet, validationSet)

   local criterion = nn.MSECriterion()
   local batchSize = params.batchSize

   local startingEpoch = 1

   -- model.epoch and model.RNGState are set at the end of every epoch
   -- below, so they are present in a model loaded from a previous run
   if model.epoch then
      startingEpoch = model.epoch + 1
   end

   if model.RNGState then
      printfc(colors.red, 'Using the RNG state from the loaded model.')
      torch.setRNGState(model.RNGState)
   end

   if params.useGPU then
      print('Moving the model and criterion to the GPU.')
      model:cuda()
      criterion:cuda()
   end

   print('Starting training.')

   -- `parameters` is the tensor updated by optim.sgd and
   -- `gradParameters` the one returned as gradient by opfunc below
   local parameters, gradParameters = model:getParameters()
   printf('The model has %d parameters.', parameters:storage():size(1))

   local averageTrainLoss, averageValidationLoss
   local trainTime, validationTime

   ----------------------------------------------------------------------

   -- Plain SGD: neither momentum nor learning rate decay
   local sgdState = {
      learningRate = params.learningRate,
      momentum = 0,
      learningRateDecay = 0
   }

   -- The same table is handed to fillBatch() for every mini-batch.
   -- NOTE(review): fillBatch() is defined outside this file; it
   -- presumably sets batch.input and batch.target from the samples
   -- starting at position b, taken through `permutation` when one is
   -- given -- TODO confirm
   local batch = {}

   for e = startingEpoch, params.nbEpochs do

      model:training()

      -- A new random order of the training samples at every epoch
      local permutation = torch.randperm(trainSet.nbSamples)

      local accLoss = 0.0
      local nbBatches = 0
      local startTime = sys.clock()

      for b = 1, trainSet.nbSamples, batchSize do

         fillBatch(trainSet, b, batch, permutation)

         -- Closure given to optim.sgd: computes the loss and its
         -- gradient at x on the current mini-batch, and accumulates
         -- the loss for the statistics of the epoch
         local opfunc = function(x)
            -- Surprisingly, copy() needs this check
            if x ~= parameters then
               parameters:copy(x)
            end

            local output = model:forward(batch.input)

            local loss = criterion:forward(output, batch.target)
            local dLossdOutput = criterion:backward(output, batch.target)

            -- Reset the gradient before the backward pass
            gradParameters:zero()
            model:backward(batch.input, dLossdOutput)

            accLoss = accLoss + loss
            nbBatches = nbBatches + 1

            return loss, gradParameters
         end

         optim.sgd(opfunc, parameters, sgdState)

      end

      trainTime = sys.clock() - startTime
      -- Average of the mini-batch losses, each computed with the
      -- parameters the model had when that mini-batch was processed
      averageTrainLoss = accLoss / nbBatches

      ----------------------------------------------------------------------
      -- Validation losses

      -- The locals of this do-block shadow the training ones of same
      -- names, they do not overwrite them
      do
         model:evaluate()

         local accLoss = 0.0
         local nbBatches = 0
         local startTime = sys.clock()

         -- No permutation here: the samples are visited in order
         for b = 1, validationSet.nbSamples, batchSize do
            fillBatch(validationSet, b, batch)
            local output = model:forward(batch.input)
            accLoss = accLoss + criterion:forward(output, batch.target)
            nbBatches = nbBatches + 1
         end

         validationTime = sys.clock() - startTime
         averageValidationLoss = accLoss / nbBatches;
      end

      ----------------------------------------------------------------------

      -- One summary line per epoch: losses, total times in seconds and
      -- times per sample in milliseconds
      printfc(colors.green,

              'epoch %d acc_train_loss %f validation_loss %f [train %.02fs total %.02fms / sample, validation %.02fs total %.02fms / sample]',

              e,

              averageTrainLoss,

              averageValidationLoss,

              trainTime,
              1000 * trainTime / trainSet.nbSamples,

              validationTime,
              1000 * validationTime / validationSet.nbSamples
      )

      ----------------------------------------------------------------------
      -- Save a persistent state so that we can restart from there

      -- NOTE(review): clearState() presumably drops the intermediate
      -- buffers of the modules so that they are not written to the
      -- file -- confirm against the nn documentation
      model:clearState()
      model.RNGState = torch.getRNGState()
      model.epoch = e
      torch.save(params.rundir .. '/model_last.t7', model)

      ----------------------------------------------------------------------
      -- Save a duplicate of the persistent state from time to time

      -- A value of params.resultFreq of 0 or less disables this
      if params.resultFreq > 0 and e%params.resultFreq == 0 then
         torch.save(string.format('%s/model_%04d.t7', params.rundir, e), model)
         saveResultImage(model, trainSet)
         saveResultImage(model, validationSet)
      end

   end

end

----------------------------------------------------------------------
-- main

-- The training samples come first in the data directory, directly
-- followed by the validation ones
local trainSet = loadData(1, params.nbTrainSamples, 'train')

local validationSet = loadData(params.nbTrainSamples + 1,
                               params.nbValidationSamples,
                               'validation')

local model

-- Try to resume from the model saved by a previous run. Any error
-- raised while loading it leaves `model` unset, and we then start
-- from a newly created network.
local function loadLastModel()
   model = torch.load(params.rundir .. '/model_last.t7')
end

local resumed = pcall(loadLastModel)

if not resumed then

   model = createModel(trainSet.width, trainSet.height,
                       params.filterSize, params.nbChannels,
                       params.nbBlocks)

else

   printfc(colors.red,
           'Found a model with %d epochs completed, starting from there.',
           model.epoch)

   -- When internal activation images are requested, we only generate
   -- them for the listed validation samples and stop there
   if params.exampleInternals ~= '' then
      local indices = string.split(params.exampleInternals, ',')
      for _, index in ipairs(indices) do
         saveInternalsImage(model, validationSet, tonumber(index))
      end
      os.exit(0)
   end

end

trainModel(model, trainSet, validationSet)

----------------------------------------------------------------------
-- Test

-- The test samples directly follow the training and validation ones
local firstTestSample = params.nbTrainSamples + params.nbValidationSamples + 1

local testSet = loadData(firstTestSample, params.nbTestSamples, 'test')

if params.useGPU then
   print('Moving the model and criterion to the GPU.')
   model:cuda()
end

-- Result images for the three sets. Without an explicit maximum,
-- saveResultImage() uses its own default number of images.
local resultJobs = {
   { set = trainSet },
   { set = validationSet },
   { set = testSet, nbMax = 1024 },
}

for _, job in ipairs(resultJobs) do
   saveResultImage(model, job.set, job.nbMax)
end