#!/usr/bin/env luajit

--[[

   dyncnn is a deep-learning algorithm for the prediction of
   interacting object dynamics

   Copyright (c) 2016 Idiap Research Institute, http://www.idiap.ch/
   Written by Francois Fleuret

   This file is part of dyncnn.

   dyncnn is free software: you can redistribute it and/or modify it
   under the terms of the GNU General Public License version 3 as
   published by the Free Software Foundation.

   dyncnn is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with dyncnn. If not, see <http://www.gnu.org/licenses/>.

]]--

require 'torch'
require 'nn'
require 'optim'
require 'image'
require 'sys'

require 'img'

----------------------------------------------------------------------

function printf(f, ...)
   print(string.format(f, unpack({...})))
end

colors = sys.COLORS

function printfc(c, f, ...)
   print(c .. string.format(f, unpack({...})) .. colors.black)
end

function logCommand(c)
   print(colors.blue .. '[' .. c .. '] -> [' .. sys.execute(c) .. ']' .. colors.black)
end

----------------------------------------------------------------------
-- Environment variables

local defaultNbThreads = 1
local defaultUseGPU = false

if os.getenv('TORCH_NB_THREADS') then
   defaultNbThreads = os.getenv('TORCH_NB_THREADS')
   print('Environment variable TORCH_NB_THREADS is set and equal to ' .. defaultNbThreads)
else
   print('Environment variable TORCH_NB_THREADS is not set, default is ' .. defaultNbThreads)
end

if os.getenv('TORCH_USE_GPU') then
   defaultUseGPU = os.getenv('TORCH_USE_GPU') == 'yes'
   print('Environment variable TORCH_USE_GPU is set and evaluated as ' .. tostring(defaultUseGPU))
else
   print('Environment variable TORCH_USE_GPU is not set, default is ' .. tostring(defaultUseGPU))
end
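-- Example invocation (hypothetical script name and paths), overriding
-- both environment defaults from the shell:
--
--   TORCH_NB_THREADS=4 TORCH_USE_GPU=yes ./dyncnn.lua -rundir ./results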
----------------------------------------------------------------------
-- Command line arguments

local cmd = torch.CmdLine()

cmd:text('General setup')

cmd:option('-seed', 1, 'initial random seed')
cmd:option('-nbThreads', defaultNbThreads, 'how many threads (environment variable TORCH_NB_THREADS)')
cmd:option('-useGPU', defaultUseGPU, 'should we use cuda (environment variable TORCH_USE_GPU)')

cmd:text('')
cmd:text('Log')

cmd:option('-resultFreq', 100, 'at which epoch frequency should we save result images')
cmd:option('-exampleInternals', '', 'list of comma-separated indices for inner activation images')
cmd:option('-noLog', false, 'should we prevent logging')
cmd:option('-rundir', '', 'the directory for results')
cmd:option('-deltaImages', false, 'should we highlight the difference in result images')

cmd:text('')
cmd:text('Network structure')

cmd:option('-filterSize', 5)
cmd:option('-nbChannels', 16)
cmd:option('-nbBlocks', 8)

cmd:text('')
cmd:text('Training')

cmd:option('-nbEpochs', 1000, 'nb of epochs for the heavy setting')
cmd:option('-learningRate', 0.1, 'learning rate')
cmd:option('-batchSize', 128, 'size of the mini-batches')
cmd:option('-nbTrainSamples', 32768)
cmd:option('-nbValidationSamples', 1024)
cmd:option('-nbTestSamples', 1024)

cmd:text('')
cmd:text('Problem to solve')

cmd:option('-dataDir', './data/10p-mg', 'data directory')

------------------------------
-- Log and stuff

cmd:addTime('DYNCNN','%F %T')

params = cmd:parse(arg)

if params.rundir == '' then
   params.rundir = cmd:string('exp', params, { })
end

paths.mkdir(params.rundir)

if not params.noLog then
   -- Append to the log if there is one
   cmd:log(io.open(params.rundir .. '/log', 'a'), params)
end

----------------------------------------------------------------------
-- The experiment per se

if params.predictGrasp then
   params.targetDepth = 2
else
   params.targetDepth = 1
end

----------------------------------------------------------------------
-- Initializations

torch.setnumthreads(params.nbThreads)
torch.setdefaulttensortype('torch.FloatTensor')
torch.manualSeed(params.seed)

----------------------------------------------------------------------
-- Dealing with the CPU/GPU

-- mynn will take entries in that order: mynn, cudnn, cunn, nn

mynn = {}

setmetatable(mynn,
             {
                __index = function(table, key)
                   return (cudnn and cudnn[key])
                      or (cunn and cunn[key])
                      or nn[key]
                end
             }
)

-- These are the tensors that can be kept on the CPU
mynn.SlowTensor = torch.Tensor

-- These are the tensors that should be moved to the GPU
mynn.FastTensor = torch.Tensor

if params.useGPU then
   require 'cutorch'
   require 'cunn'
   require 'cudnn'
   cudnn.benchmark = true
   cudnn.fastest = true
   mynn.FastTensor = torch.CudaTensor
end

----------------------------------------------------------------------
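-- The loader below assumes each dyn_%06d.png frame is a 2x2 grid (or
-- larger) of 64x64 tiles: the two input channels are copied from the
-- top-right and top-left tiles, and the target from the bottom-right
-- tile. Pixel intensities are inverted and the color channels are
-- collapsed with a max before the tiles are sliced out.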
function loadData(first, nb, name)
   print('Loading data `' .. name .. '\'.')

   local data = {}

   data.name = name
   data.nbSamples = nb
   data.width = 64
   data.height = 64

   data.input = mynn.SlowTensor(data.nbSamples, 2, data.height, data.width)
   data.target = mynn.SlowTensor(data.nbSamples, 1, data.height, data.width)

   for i = 1, data.nbSamples do
      local n = i - 1 + first - 1
      local frame = image.load(string.format('%s/%03d/dyn_%06d.png',
                                             params.dataDir,
                                             math.floor(n/1000), n))

      frame:mul(-1.0):add(1.0)
      frame = frame:max(1):select(1, 1)

      data.input[i][1]:copy(frame:sub(0 * data.height + 1, 1 * data.height,
                                      1 * data.width + 1, 2 * data.width))

      data.input[i][2]:copy(frame:sub(0 * data.height + 1, 1 * data.height,
                                      0 * data.width + 1, 1 * data.width))

      data.target[i][1]:copy(frame:sub(1 * data.height + 1, 2 * data.height,
                                       1 * data.width + 1, 2 * data.width))
   end

   return data
end

----------------------------------------------------------------------

function collectAllOutputs(model, collection, which)
   if torch.type(model) == 'nn.Sequential' then
      for i = 1, #model.modules do
         collectAllOutputs(model.modules[i], collection, which)
      end
   elseif not which or which[torch.type(model)] then
      if torch.isTensor(model.output) then
         collection.nb = collection.nb + 1
         collection.outputs[collection.nb] = model.output
      end
   end
end

function saveInternalsImage(model, data, n)
   -- Explicitly copy to keep input as a mynn.FastTensor
   local input = mynn.FastTensor(1, 2, data.height, data.width)
   input:copy(data.input:narrow(1, n, 1))

   local output = model:forward(input)

   local collection = {}
   collection.outputs = {}
   collection.nb = 1
   collection.outputs[collection.nb] = input

   collectAllOutputs(model, collection,
                     {
                        ['nn.ReLU'] = true,
                        ['cunn.ReLU'] = true,
                        ['cudnn.ReLU'] = true,
                     }
   )

   if collection.outputs[collection.nb] ~= model.output then
      collection.nb = collection.nb + 1
      collection.outputs[collection.nb] = model.output
   end

   local fileName = string.format('%s/internals_%s_%06d.png',
                                  params.rundir,
                                  data.name, n)

   print('Saving ' .. fileName)
   image.save(fileName, imageFromTensors(collection.outputs))
end

----------------------------------------------------------------------

function highlightImage(a, b)
   if params.deltaImages then
      local h = torch.csub(a, b):abs()
      h:div(h:max()):mul(0.9):add(0.1)
      return torch.cmul(a, h)
   else
      return a
   end
end

function saveResultImage(model, data, nbMax)
   local criterion = nn.MSECriterion()

   if params.useGPU then
      print('Moving the criterion to the GPU.')
      criterion:cuda()
   end

   local input = mynn.FastTensor(1, 2, data.height, data.width)
   local target = mynn.FastTensor(1, 1, data.height, data.width)

   local nbMax = nbMax or 50
   local nb = math.min(nbMax, data.nbSamples)

   model:evaluate()

   printf('Write %d result images for `%s\'.', nb, data.name)

   local lossFile = io.open(params.rundir .. '/result_' .. data.name .. '_losses.dat', 'w')

   for n = 1, nb do
      -- Explicitly copy to keep input as a mynn.FastTensor
      input:copy(data.input:narrow(1, n, 1))
      target:copy(data.target:narrow(1, n, 1))

      local output = model:forward(input)
      local loss = criterion:forward(output, target)

      output = mynn.SlowTensor(output:size()):copy(output)

      -- We use our magical img.lua to create the result images
      local comp

      comp = {
         {
            vertical = true,
            { pad = 1, data.input[n][1] },
            { pad = 1, data.input[n][2] },
            { pad = 1, highlightImage(data.target[n][1], data.input[n][1]) },
            { pad = 1, highlightImage(output[1][1], data.input[n][1]) },
         }
      }

      local result = combineImages(1.0, comp)

      result:mul(-1.0):add(1.0)

      local fileName = string.format('result_%s_%06d.png', data.name, n)
      image.save(params.rundir .. '/' .. fileName, result)
      lossFile:write(string.format('%f %s\n', loss, fileName))
   end

   lossFile:close()
end

----------------------------------------------------------------------
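-- createTower builds the residual trunk of the network: each block is
-- conv -> batchnorm -> ReLU -> conv, added back to its own input
-- through a ConcatTable/CAddTable pair, and followed by batchnorm and
-- ReLU. With -nbBlocks 0 the tower degenerates to an identity mapping.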
function createTower(filterSize, nbChannels, nbBlocks)

   local tower

   if nbBlocks == 0 then

      tower = nn.Identity()

   else

      tower = mynn.Sequential()

      for b = 1, nbBlocks do
         local block = mynn.Sequential()

         block:add(mynn.SpatialConvolution(nbChannels,
                                           nbChannels,
                                           filterSize, filterSize,
                                           1, 1,
                                           (filterSize - 1) / 2, (filterSize - 1) / 2))
         block:add(mynn.SpatialBatchNormalization(nbChannels))
         block:add(mynn.ReLU(true))
         block:add(mynn.SpatialConvolution(nbChannels,
                                           nbChannels,
                                           filterSize, filterSize,
                                           1, 1,
                                           (filterSize - 1) / 2, (filterSize - 1) / 2))

         local parallel = mynn.ConcatTable()
         parallel:add(block):add(mynn.Identity())

         tower:add(parallel):add(mynn.CAddTable(true))

         tower:add(mynn.SpatialBatchNormalization(nbChannels))
         tower:add(mynn.ReLU(true))
      end

   end

   return tower

end

function createModel(imageWidth, imageHeight,
                     filterSize, nbChannels, nbBlocks)

   local model = mynn.Sequential()

   -- Encode the two input channels (grasping image and starting
   -- configuration) into the internal number of channels
   model:add(mynn.SpatialConvolution(2,
                                     nbChannels,
                                     filterSize, filterSize,
                                     1, 1,
                                     (filterSize - 1) / 2, (filterSize - 1) / 2))

   model:add(mynn.SpatialBatchNormalization(nbChannels))
   model:add(mynn.ReLU(true))

   -- Add the resnet modules
   model:add(createTower(filterSize, nbChannels, nbBlocks))

   -- Decode down to a single channel, which is the final image
   model:add(mynn.SpatialConvolution(nbChannels,
                                     1,
                                     filterSize, filterSize,
                                     1, 1,
                                     (filterSize - 1) / 2, (filterSize - 1) / 2))

   return model
end

----------------------------------------------------------------------

function fillBatch(data, first, batch, permutation)
   local actualBatchSize = math.min(params.batchSize,
                                    data.input:size(1) - first + 1)

   if actualBatchSize ~= batch.input:size(1) then
      local size = batch.input:size()
      size[1] = actualBatchSize
      batch.input:resize(size)
   end

   if actualBatchSize ~= batch.target:size(1) then
      local size = batch.target:size()
      size[1] = actualBatchSize
      batch.target:resize(size)
   end

   for k = 1, batch.input:size(1) do
      local i
      if permutation then
         i = permutation[first + k - 1]
      else
         i = first + k - 1
      end
      batch.input[k] = data.input[i]
      batch.target[k] = data.target[i]
   end
end

function trainModel(model, trainSet, validationSet)

   local criterion = nn.MSECriterion()
   local batchSize = params.batchSize

   local batch = {}
   batch.input = mynn.FastTensor(batchSize, 2, trainSet.height, trainSet.width)
   batch.target = mynn.FastTensor(batchSize, 1, trainSet.height, trainSet.width)

   local startingEpoch = 1

   if model.epoch then
      startingEpoch = model.epoch + 1
   end

   if model.RNGState then
      printfc(colors.red, 'Using the RNG state from the loaded model.')
      torch.setRNGState(model.RNGState)
   end

   if params.useGPU then
      print('Moving the model and criterion to the GPU.')
      model:cuda()
      criterion:cuda()
   end

   print('Starting training.')

   local parameters, gradParameters = model:getParameters()

   printf('The model has %d parameters.', parameters:storage():size(1))

   local averageTrainLoss, averageValidationLoss
   local trainTime, validationTime

   ----------------------------------------------------------------------

   local sgdState = {
      learningRate = params.learningRate,
      momentum = 0,
      learningRateDecay = 0
   }
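   -- Main training loop: for every mini-batch, the opfunc closure
   -- evaluates the MSE loss and its gradient so that optim.sgd can
   -- update `parameters` in place; the same closure accumulates the
   -- average training loss reported at the end of the epoch.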
   for e = startingEpoch, params.nbEpochs do

      model:training()

      local permutation = torch.randperm(trainSet.nbSamples)

      local accLoss = 0.0
      local nbBatches = 0
      local startTime = sys.clock()

      for b = 1, trainSet.nbSamples, batchSize do

         fillBatch(trainSet, b, batch, permutation)

         local opfunc = function(x)
            -- Surprisingly, copy() needs this check
            if x ~= parameters then
               parameters:copy(x)
            end

            local output = model:forward(batch.input)

            local loss = criterion:forward(output, batch.target)
            local dLossdOutput = criterion:backward(output, batch.target)

            gradParameters:zero()
            model:backward(batch.input, dLossdOutput)

            accLoss = accLoss + loss
            nbBatches = nbBatches + 1

            return loss, gradParameters
         end

         optim.sgd(opfunc, parameters, sgdState)

      end

      trainTime = sys.clock() - startTime
      averageTrainLoss = accLoss / nbBatches

      ----------------------------------------------------------------------
      -- Validation losses

      do
         model:evaluate()

         local accLoss = 0.0
         local nbBatches = 0
         local startTime = sys.clock()

         for b = 1, validationSet.nbSamples, batchSize do
            fillBatch(validationSet, b, batch)
            local output = model:forward(batch.input)
            accLoss = accLoss + criterion:forward(output, batch.target)
            nbBatches = nbBatches + 1
         end

         validationTime = sys.clock() - startTime
         averageValidationLoss = accLoss / nbBatches
      end

      ----------------------------------------------------------------------

      printfc(colors.green,
              'epoch %d acc_train_loss %f validation_loss %f [train %.02fs total %.02fms / sample, validation %.02fs total %.02fms / sample]',
              e,
              averageTrainLoss,
              averageValidationLoss,
              trainTime,
              1000 * trainTime / trainSet.nbSamples,
              validationTime,
              1000 * validationTime / validationSet.nbSamples
      )

      ----------------------------------------------------------------------
      -- Save a persistent state so that we can restart from there

      model:clearState()
      model.RNGState = torch.getRNGState()
      model.epoch = e
      torch.save(params.rundir .. '/model_last.t7', model)

      ----------------------------------------------------------------------
      -- Save a duplicate of the persistent state from time to time

      if params.resultFreq > 0 and e % params.resultFreq == 0 then
         torch.save(string.format('%s/model_%04d.t7', params.rundir, e), model)
         saveResultImage(model, trainSet)
         saveResultImage(model, validationSet)
      end

   end

end

function createAndTrainModel(trainSet, validationSet)

   -- Load the current training state, or create a new model from
   -- scratch

   if pcall(function () model = torch.load(params.rundir .. '/model_last.t7') end) then

      printfc(colors.red,
              'Found a model with %d epochs completed, starting from there.',
              model.epoch)

      if params.exampleInternals ~= '' then
         for _, i in ipairs(string.split(params.exampleInternals, ',')) do
            saveInternalsImage(model, validationSet, tonumber(i))
         end
         os.exit(0)
      end

   else

      model = createModel(trainSet.width, trainSet.height,
                          params.filterSize, params.nbChannels,
                          params.nbBlocks)

   end

   trainModel(model, trainSet, validationSet)

   return model

end

----------------------------------------------------------------------
-- main

for _, c in pairs({
      'date',
      'uname -a',
      'git log -1 --format=%H'
}) do
   logCommand(c)
end

local trainSet = loadData(1, params.nbTrainSamples, 'train')
local validationSet = loadData(params.nbTrainSamples + 1, params.nbValidationSamples, 'validation')

local model = createAndTrainModel(trainSet, validationSet)

----------------------------------------------------------------------
-- Test

local testSet = loadData(params.nbTrainSamples + params.nbValidationSamples + 1,
                         params.nbTestSamples, 'test')

if params.useGPU then
   print('Moving the model and criterion to the GPU.')
   model:cuda()
end

saveResultImage(model, trainSet)
saveResultImage(model, validationSet)
saveResultImage(model, testSet, 1024)
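----------------------------------------------------------------------
-- Files written to params.rundir during a run: `log`, `model_last.t7`
-- (plus `model_XXXX.t7` snapshots every -resultFreq epochs),
-- `result_<set>_NNNNNN.png` images with their losses in
-- `result_<set>_losses.dat`, and `internals_<set>_NNNNNN.png` when
-- -exampleInternals is given.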