X-Git-Url: https://www.fleuret.org/cgi-bin/gitweb/gitweb.cgi?p=dyncnn.git;a=blobdiff_plain;f=dyncnn.lua;h=53625934e1cbfca2e779abf1b8ef245c91836feb;hp=e1043868c3c432849059f527c145131b5041a336;hb=fe5dee151313b6abd8ffee2c5fc5593f326e663f;hpb=be0c7d53f21ce96c70e7c13ef0ba2c9eca10ca23 diff --git a/dyncnn.lua b/dyncnn.lua index e104386..5362593 100755 --- a/dyncnn.lua +++ b/dyncnn.lua @@ -30,346 +30,195 @@ require 'optim' require 'image' require 'pl' ----------------------------------------------------------------------- - -local opt = lapp[[ - --seed (default 1) random seed - - --learningStateFile (default '') - --dataDir (default './data/10p-mg/') - --resultDir (default '/tmp/dyncnn') - - --learningRate (default -1) - --momentum (default -1) - --nbEpochs (default -1) nb of epochs for the heavy setting - - --heavy use the heavy configuration - --nbChannels (default -1) nb of channels in the internal layers - --resultFreq (default 100) - - --noLog supress logging - - --exampleInternals (default -1) -]] +require 'img' ---------------------------------------------------------------------- -commandLine='' -for i = 0, #arg do - commandLine = commandLine .. ' \'' .. arg[i] .. '\'' +function printf(f, ...) + print(string.format(f, unpack({...}))) end ----------------------------------------------------------------------- - colors = sys.COLORS -global = {} - -function logString(s, c) - if global.logFile then - global.logFile:write(s) - global.logFile:flush() - end - local c = c or colors.black - io.write(c .. s) - io.flush() +function printfc(c, f, ...) + printf(c .. string.format(f, unpack({...})) .. colors.black) end function logCommand(c) - logString('[' .. c .. '] -> [' .. sys.execute(c) .. ']\n', colors.blue) -end - -logString('commandline: ' .. commandLine .. '\n', colors.blue) - -logCommand('mkdir -v -p ' .. opt.resultDir) - -if not opt.noLog then - global.logName = opt.resultDir .. '/log' - global.logFile = io.open(global.logName, 'a') + print(colors.blue .. '[' .. c .. '] -> [' .. sys.execute(c) .. ']' .. colors.black) end ---------------------------------------------------------------------- +-- Environment and command line arguments -alreadyLoggedString = {} +local defaultNbThreads = 1 +local defaultUseGPU = false -function logOnce(s) - local l = debug.getinfo(1).currentline - if not alreadyLoggedString[l] then - logString('@line ' .. l .. ' ' .. s, colors.red) - alreadyLoggedString[l] = s - end +if os.getenv('TORCH_NB_THREADS') then + defaultNbThreads = os.getenv('TORCH_NB_THREADS') + print('Environment variable TORCH_NB_THREADS is set and equal to ' .. defaultNbThreads) +else + print('Environment variable TORCH_NB_THREADS is not set') end ----------------------------------------------------------------------- - -nbThreads = os.getenv('TORCH_NB_THREADS') or 1 - -useGPU = os.getenv('TORCH_USE_GPU') == 'yes' - -for _, c in pairs({ 'date', - 'uname -a', - 'git log -1 --format=%H' - }) -do - logCommand(c) +if os.getenv('TORCH_USE_GPU') then + defaultUseGPU = os.getenv('TORCH_USE_GPU') == 'yes' + print('Environment variable TORCH_USE_GPU is set and evaluated as ' .. tostring(defaultUseGPU)) +else + print('Environment variable TORCH_USE_GPU is not set.') end -logString('useGPU is \'' .. tostring(useGPU) .. '\'.\n') - -logString('nbThreads is \'' .. nbThreads .. '\'.\n') - ---------------------------------------------------------------------- -torch.setnumthreads(nbThreads) -torch.setdefaulttensortype('torch.FloatTensor') -torch.manualSeed(opt.seed) +local cmd = torch.CmdLine() -mynn = {} +cmd:text('') +cmd:text('General setup') --- To deal elegantly with CPU/GPU -local mt = {} -function mt.__index(table, key) - return (cudnn and cudnn[key]) or (cunn and cunn[key]) or nn[key] -end -setmetatable(mynn, mt) +cmd:option('-seed', 1, 'initial random seed') +cmd:option('-nbThreads', defaultNbThreads, 'how many threads (environment variable TORCH_NB_THREADS)') +cmd:option('-useGPU', defaultUseGPU, 'should we use cuda (environment variable TORCH_USE_GPU)') --- These are the tensors that can be kept on the CPU -mynn.SlowTensor = torch.Tensor --- These are the tensors that should be moved to the GPU -mynn.FastTensor = torch.Tensor +cmd:text('') +cmd:text('Log') ----------------------------------------------------------------------- +cmd:option('-resultFreq', 100, 'at which epoch frequency should we save result images') +cmd:option('-exampleInternals', -1, 'should we save inner activation images') +cmd:option('-noLog', false, 'should we prevent logging') +cmd:option('-rundir', '', 'the directory for results') -if useGPU then - require 'cutorch' - require 'cunn' - require 'cudnn' +cmd:text('') +cmd:text('Training') - mynn.FastTensor = torch.CudaTensor +cmd:option('-nbEpochs', 1000, 'nb of epochs for the heavy setting') +cmd:option('-learningRate', 0.1, 'learning rate') +cmd:option('-batchSize', 128, 'size of the mini-batches') +cmd:option('-filterSize', 5, 'convolution filter size') +cmd:option('-nbTrainSamples', 32768) +cmd:option('-nbValidationSamples', 1024) +cmd:option('-nbTestSamples', 1024) - if cudnn then - cudnn.benchmark = true - cudnn.fastest = true - end -end +cmd:text('') +cmd:text('Problem to solve') ----------------------------------------------------------------------- +cmd:option('-dataDir', './data/10p-mg', 'data directory') -config = {} -config.learningRate = 0.1 -config.momentum = 0 -config.batchSize = 128 -config.filterSize = 5 +cmd:text('') +cmd:text('Network structure') -if opt.heavy then +cmd:option('-nbChannels', 16) +cmd:option('-nbBlocks', 8) - logString('Using the heavy configuration.\n') - config.nbChannels = 16 - config.nbBlocks = 4 - config.nbEpochs = 250 - config.nbEpochsInit = 100 - config.nbTrainSamples = 32768 - config.nbValidationSamples = 1024 - config.nbTestSamples = 1024 +------------------------------ +-- Log and stuff -else +cmd:addTime('DYNCNN','%F %T') - logString('Using the light configuration.\n') - config.nbChannels = 2 - config.nbBlocks = 2 - config.nbEpochs = 6 - config.nbEpochsInit = 3 - config.nbTrainSamples = 1024 - config.nbValidationSamples = 1024 - config.nbTestSamples = 1024 +params = cmd:parse(arg) +if params.rundir == '' then + params.rundir = cmd:string('exp', params, { }) end -if opt.nbEpochs > 0 then - config.nbEpochs = opt.nbEpochs -end +paths.mkdir(params.rundir) -if opt.nbChannels > 0 then - config.nbChannels = opt.nbChannels +if not params.noLog then + -- Append to the log if there is one + cmd:log(io.open(params.rundir .. '/log', 'a'), params) end -if opt.learningRate > 0 then - config.learningRate = opt.learningRate -end +---------------------------------------------------------------------- +-- The experiment per se -if opt.momentum >= 0 then - config.momentum = opt.momentum +if params.predictGrasp then + params.targetDepth = 2 +else + params.targetDepth = 1 end ---------------------------------------------------------------------- +-- Initializations -function tensorCensus(tensorType, model) +torch.setnumthreads(params.nbThreads) +torch.setdefaulttensortype('torch.FloatTensor') +torch.manualSeed(params.seed) - local nb = {} +---------------------------------------------------------------------- +-- Dealing with the CPU/GPU - local function countThings(m) - for k, i in pairs(m) do - if torch.type(i) == tensorType then - nb[k] = (nb[k] or 0) + i:nElement() - end - end - end +-- mynn will take entries in that order: mynn, cudnn, cunn, nn - model:apply(countThings) +mynn = {} - return nb +setmetatable(mynn, + { + __index = function(table, key) + return (cudnn and cudnn[key]) or (cunn and cunn[key]) or nn[key] + end + } +) +-- These are the tensors that can be kept on the CPU +mynn.SlowTensor = torch.Tensor + +-- These are the tensors that should be moved to the GPU +mynn.FastTensor = torch.Tensor + +if params.useGPU then + require 'cutorch' + require 'cunn' + require 'cudnn' + cudnn.benchmark = true + cudnn.fastest = true + mynn.FastTensor = torch.CudaTensor end ---------------------------------------------------------------------- function loadData(first, nb, name) - logString('Loading data `' .. name .. '\'.\n') - - local persistentFileName = string.format('%s/persistent_%d_%d.dat', - opt.dataDir, - first, - nb) - - -- This is at what framerate we work. It is greater than 1 so that - -- we can keep on disk sequences at a higher frame rate for videos - -- and explaining materials - - local frameRate = 4 - - local data - - if not path.exists(persistentFileName) then - logString(string.format('No persistent data structure, creating it (%d samples).\n', nb)) - local data = {} - data.name = name - data.nbSamples = nb - data.width = 64 - data.height = 64 - data.input = mynn.SlowTensor(data.nbSamples, 2, data.height, data.width) - data.target = mynn.SlowTensor(data.nbSamples, 1, data.height, data.width) - - for i = 1, data.nbSamples do - local n = i-1 + first-1 - local prefix = string.format('%s/%03d/dyn_%06d', - opt.dataDir, - math.floor(n/1000), n) - - function localLoad(filename, tensor) - local tmp - tmp = image.load(filename) - tmp:mul(-1.0):add(1.0) - tensor:copy(torch.max(tmp, 1)) - end + print('Loading data `' .. name .. '\'.') - localLoad(prefix .. '_world_000.png', data.input[i][1]) - localLoad(prefix .. '_grab.png', data.input[i][2]) - localLoad(string.format('%s_world_%03d.png', prefix, frameRate), - data.target[i][1]) - end + local data = {} - data.persistentFileName = persistentFileName + data.name = name + data.nbSamples = nb + data.width = 64 + data.height = 64 - torch.save(persistentFileName, data) - end + data.input = mynn.SlowTensor(data.nbSamples, 2, data.height, data.width) + data.target = mynn.SlowTensor(data.nbSamples, 1, data.height, data.width) - logCommand('sha256sum -b ' .. persistentFileName) + for i = 1, data.nbSamples do + local n = i-1 + first-1 + local frame = image.load(string.format('%s/%03d/dyn_%06d.png', + params.dataDir, + math.floor(n/1000), n)) - data = torch.load(persistentFileName) + frame:mul(-1.0):add(1.0) + frame = frame:max(1):select(1, 1) - return data -end + data.input[i][1]:copy(frame:sub(0 * data.height + 1, 1 * data.height, + 1 * data.width + 1, 2 * data.width)) ----------------------------------------------------------------------- + data.input[i][2]:copy(frame:sub(0 * data.height + 1, 1 * data.height, + 0 * data.width + 1, 1 * data.width)) --- This function gets as input a list of tensors of arbitrary --- dimensions each, but whose two last dimension stands for height x --- width. It creates an image tensor (2d, one channel) with each --- argument tensor unfolded per row. - -function imageFromTensors(bt, signed) - local gap = 1 - local tgap = -1 - local width = 0 - local height = gap - - for _, t in pairs(bt) do - -- print(t:size()) - local d = t:dim() - local h, w = t:size(d - 1), t:size(d) - local n = t:nElement() / (w * h) - width = math.max(width, gap + n * (gap + w)) - height = height + gap + tgap + gap + h + data.target[i][1]:copy(frame:sub(1 * data.height + 1, 2 * data.height, + 1 * data.width + 1, 2 * data.width)) end - local e = torch.Tensor(3, height, width):fill(1.0) - local y0 = 1 + gap - - for _, t in pairs(bt) do - local d = t:dim() - local h, w = t:size(d - 1), t:size(d) - local n = t:nElement() / (w * h) - local z = t:norm() / math.sqrt(t:nElement()) - - local x0 = 1 + gap + math.floor( (width - n * (w + gap)) /2 ) - local u = torch.Tensor(t:size()):copy(t):resize(n, h, w) - for m = 1, n do - - for c = 1, 3 do - for y = 0, h+1 do - e[c][y0 + y - 1][x0 - 1] = 0.0 - e[c][y0 + y - 1][x0 + w ] = 0.0 - end - for x = 0, w+1 do - e[c][y0 - 1][x0 + x - 1] = 0.0 - e[c][y0 + h ][x0 + x - 1] = 0.0 - end - end - - for y = 1, h do - for x = 1, w do - local v = u[m][y][x] / z - local r, g, b - if signed then - if v < -1 then - r, g, b = 0.0, 0.0, 1.0 - elseif v > 1 then - r, g, b = 1.0, 0.0, 0.0 - elseif v >= 0 then - r, g, b = 1.0, 1.0 - v, 1.0 - v - else - r, g, b = 1.0 + v, 1.0 + v, 1.0 - end - else - if v <= 0 then - r, g, b = 1.0, 1.0, 1.0 - elseif v > 1 then - r, g, b = 0.0, 0.0, 0.0 - else - r, g, b = 1.0 - v, 1.0 - v, 1.0 - v - end - end - e[1][y0 + y - 1][x0 + x - 1] = r - e[2][y0 + y - 1][x0 + x - 1] = g - e[3][y0 + y - 1][x0 + x - 1] = b - end - end - x0 = x0 + w + gap - end - y0 = y0 + h + gap + tgap + gap - end - - return e + return data end +---------------------------------------------------------------------- + function collectAllOutputs(model, collection, which) if torch.type(model) == 'nn.Sequential' then for i = 1, #model.modules do collectAllOutputs(model.modules[i], collection, which) end elseif not which or which[torch.type(model)] then - local t = torch.type(model.output) - if t == 'torch.FloatTensor' or t == 'torch.CudaTensor' then + if torch.isTensor(model.output) then collection.nb = collection.nb + 1 collection.outputs[collection.nb] = model.output end @@ -388,9 +237,13 @@ function saveInternalsImage(model, data, n) collection.nb = 1 collection.outputs[collection.nb] = input - local which = {} - which['nn.ReLU'] = true - collectAllOutputs(model, collection, which) + collectAllOutputs(model, collection, + { + ['nn.ReLU'] = true, + ['cunn.ReLU'] = true, + ['cudnn.ReLU'] = true, + } + ) if collection.outputs[collection.nb] ~= model.output then collection.nb = collection.nb + 1 @@ -398,25 +251,23 @@ function saveInternalsImage(model, data, n) end local fileName = string.format('%s/internals_%s_%06d.png', - opt.resultDir, + params.rundir, data.name, n) - logString('Saving ' .. fileName .. '\n') + print('Saving ' .. fileName) image.save(fileName, imageFromTensors(collection.outputs)) end ---------------------------------------------------------------------- -function saveResultImage(model, data, prefix, nbMax, highlight) - local l2criterion = nn.MSECriterion() +function saveResultImage(model, data, nbMax) + local criterion = nn.MSECriterion() - if useGPU then - logString('Moving the criterion to the GPU.\n') - l2criterion:cuda() + if params.useGPU then + print('Moving the criterion to the GPU.') + criterion:cuda() end - local prefix = prefix or 'result' - local result = torch.Tensor(data.height * 4 + 5, data.width + 2) local input = mynn.FastTensor(1, 2, data.height, data.width) local target = mynn.FastTensor(1, 1, data.height, data.width) @@ -426,9 +277,9 @@ function saveResultImage(model, data, prefix, nbMax, highlight) model:evaluate() - logString(string.format('Write %d result images `%s\' for set `%s\' in %s.\n', - nb, prefix, data.name, - opt.resultDir)) + printf('Write %d result images for `%s\'.', nb, data.name) + + local lossFile = io.open(params.rundir .. '/result_' .. data.name .. '_losses.dat', 'w') for n = 1, nb do @@ -437,86 +288,101 @@ function saveResultImage(model, data, prefix, nbMax, highlight) target:copy(data.target:narrow(1, n, 1)) local output = model:forward(input) + local loss = criterion:forward(output, target) + + output = mynn.SlowTensor(output:size()):copy(output) + + -- We use our magical img.lua to create the result images + + local comp = { + { + { pad = 1, data.input[n][1] }, + { pad = 1, data.input[n][2] }, + { pad = 1, data.target[n][1] }, + { pad = 1, output[1][1] }, + } + } + + --[[ + local comp = { + { + vertical = true, + { pad = 1, data.input[n][1] }, + { pad = 1, data.input[n][2] } + }, + torch.Tensor(4, 4):fill(1.0), + { + vertical = true, + { pad = 1, data.target[n][1] }, + { pad = 1, output[1][1] }, + { pad = 1, torch.csub(data.target[n][1], output[1][1]):abs() } + } + } + ]]-- + +local result = combineImages(1.0, comp) + +result:mul(-1.0):add(1.0) + +local fileName = string.format('result_%s_%06d.png', data.name, n) +image.save(params.rundir .. '/' .. fileName, result) +lossFile:write(string.format('%f %s\n', loss, fileName)) +end +end - local loss = l2criterion:forward(output, target) - - result:fill(1.0) - - if highlight then - for i = 1, data.height do - for j = 1, data.width do - local v = data.input[n][1][i][j] - result[1 + i + 0 * (data.height + 1)][1 + j] = data.input[n][2][i][j] - result[1 + i + 1 * (data.height + 1)][1 + j] = v - local a = data.target[n][1][i][j] - local b = output[1][1][i][j] - result[1 + i + 2 * (data.height + 1)][1 + j] = - a * math.min(1, 0.1 + 2.0 * math.abs(a - v)) - result[1 + i + 3 * (data.height + 1)][1 + j] = - b * math.min(1, 0.1 + 2.0 * math.abs(b - v)) - end - end - else - for i = 1, data.height do - for j = 1, data.width do - result[1 + i + 0 * (data.height + 1)][1 + j] = data.input[n][2][i][j] - result[1 + i + 1 * (data.height + 1)][1 + j] = data.input[n][1][i][j] - result[1 + i + 2 * (data.height + 1)][1 + j] = data.target[n][1][i][j] - result[1 + i + 3 * (data.height + 1)][1 + j] = output[1][1][i][j] - end - end - end +---------------------------------------------------------------------- - result:mul(-1.0):add(1.0) +function createTower(filterSize, nbChannels, nbBlocks) - local fileName = string.format('%s/%s_%s_%06d.png', - opt.resultDir, - prefix, - data.name, n) + local tower - logString(string.format('LOSS_ON_SAMPLE %f %s\n', loss, fileName)) + if nbBlocks == 0 then - image.save(fileName, result) - end -end + tower = nn.Identity() ----------------------------------------------------------------------- + else -function createTower(filterSize, nbChannels, nbBlocks) - local tower = mynn.Sequential() + tower = mynn.Sequential() - for b = 1, nbBlocks do - local block = mynn.Sequential() + for b = 1, nbBlocks do + local block = mynn.Sequential() - block:add(mynn.SpatialConvolution(nbChannels, - nbChannels, - filterSize, filterSize, - 1, 1, - (filterSize - 1) / 2, (filterSize - 1) / 2)) - block:add(mynn.SpatialBatchNormalization(nbChannels)) - block:add(mynn.ReLU(true)) + block:add(mynn.SpatialConvolution(nbChannels, + nbChannels, + filterSize, filterSize, + 1, 1, + (filterSize - 1) / 2, (filterSize - 1) / 2)) + block:add(mynn.SpatialBatchNormalization(nbChannels)) + block:add(mynn.ReLU(true)) - block:add(mynn.SpatialConvolution(nbChannels, - nbChannels, - filterSize, filterSize, - 1, 1, - (filterSize - 1) / 2, (filterSize - 1) / 2)) + block:add(mynn.SpatialConvolution(nbChannels, + nbChannels, + filterSize, filterSize, + 1, 1, + (filterSize - 1) / 2, (filterSize - 1) / 2)) - local parallel = mynn.ConcatTable() - parallel:add(block):add(mynn.Identity()) + local parallel = mynn.ConcatTable() + parallel:add(block):add(mynn.Identity()) - tower:add(parallel):add(mynn.CAddTable(true)) + tower:add(parallel):add(mynn.CAddTable(true)) + + tower:add(mynn.SpatialBatchNormalization(nbChannels)) + tower:add(mynn.ReLU(true)) + end - tower:add(mynn.SpatialBatchNormalization(nbChannels)) - tower:add(mynn.ReLU(true)) end return tower + end -function createModel(filterSize, nbChannels, nbBlocks) +function createModel(imageWidth, imageHeight, + filterSize, nbChannels, nbBlocks) + local model = mynn.Sequential() + -- Encode the two input channels (grasping image and starting + -- configuration) into the internal number of channels model:add(mynn.SpatialConvolution(2, nbChannels, filterSize, filterSize, @@ -526,13 +392,10 @@ function createModel(filterSize, nbChannels, nbBlocks) model:add(mynn.SpatialBatchNormalization(nbChannels)) model:add(mynn.ReLU(true)) - local towerCode = createTower(filterSize, nbChannels, nbBlocks) - local towerDecode = createTower(filterSize, nbChannels, nbBlocks) + -- Add the resnet modules + model:add(createTower(filterSize, nbChannels, nbBlocks)) - model:add(towerCode) - model:add(towerDecode) - - -- Decode to a single channel, which is the final image + -- Decode down to a single channel, which is the final image model:add(mynn.SpatialConvolution(nbChannels, 1, filterSize, filterSize, @@ -544,8 +407,22 @@ end ---------------------------------------------------------------------- -function fillBatch(data, first, nb, batch, permutation) - for k = 1, nb do +function fillBatch(data, first, batch, permutation) + local actualBatchSize = math.min(params.batchSize, data.input:size(1) - first + 1) + + if actualBatchSize ~= batch.input:size(1) then + local size = batch.input:size() + size[1] = actualBatchSize + batch.input:resize(size) + end + + if actualBatchSize ~= batch.target:size(1) then + local size = batch.target:size() + size[1] = actualBatchSize + batch.target:resize(size) + end + + for k = 1, batch.input:size(1) do local i if permutation then i = permutation[first + k - 1] @@ -557,17 +434,10 @@ function fillBatch(data, first, nb, batch, permutation) end end -function trainModel(model, - trainData, validationData, nbEpochs, learningRate, - learningStateFile) +function trainModel(model, trainData, validationData) - local l2criterion = nn.MSECriterion() - local batchSize = config.batchSize - - if useGPU then - logString('Moving the criterion to the GPU.\n') - l2criterion:cuda() - end + local criterion = nn.MSECriterion() + local batchSize = params.batchSize local batch = {} batch.input = mynn.FastTensor(batchSize, 2, trainData.height, trainData.width) @@ -583,21 +453,29 @@ function trainModel(model, torch.setRNGState(model.RNGState) end - logString('Starting training.\n') + if params.useGPU then + print('Moving the model and criterion to the GPU.') + model:cuda() + criterion:cuda() + end + + print('Starting training.') local parameters, gradParameters = model:getParameters() - logString(string.format('model has %d parameters.\n', parameters:storage():size(1))) + printf('The model has %d parameters.', parameters:storage():size(1)) local averageTrainLoss, averageValidationLoss local trainTime, validationTime + ---------------------------------------------------------------------- + local sgdState = { - learningRate = config.learningRate, - momentum = config.momentum, + learningRate = params.learningRate, + momentum = 0, learningRateDecay = 0 } - for e = startingEpoch, nbEpochs do + for e = startingEpoch, params.nbEpochs do model:training() @@ -609,18 +487,19 @@ function trainModel(model, for b = 1, trainData.nbSamples, batchSize do - fillBatch(trainData, b, batchSize, batch, permutation) + fillBatch(trainData, b, batch, permutation) local opfunc = function(x) - -- Surprisingly copy() needs this check + -- Surprisingly, copy() needs this check if x ~= parameters then parameters:copy(x) end local output = model:forward(batch.input) - local loss = l2criterion:forward(output, batch.target) - local dLossdOutput = l2criterion:backward(output, batch.target) + local loss = criterion:forward(output, batch.target) + local dLossdOutput = criterion:backward(output, batch.target) + gradParameters:zero() model:backward(batch.input, dLossdOutput) @@ -639,6 +518,7 @@ function trainModel(model, ---------------------------------------------------------------------- -- Validation losses + do model:evaluate() @@ -647,9 +527,9 @@ function trainModel(model, local startTime = sys.clock() for b = 1, validationData.nbSamples, batchSize do - fillBatch(validationData, b, batchSize, batch) + fillBatch(validationData, b, batch) local output = model:forward(batch.input) - accLoss = accLoss + l2criterion:forward(output, batch.target) + accLoss = accLoss + criterion:forward(output, batch.target) nbBatches = nbBatches + 1 end @@ -657,31 +537,27 @@ function trainModel(model, averageValidationLoss = accLoss / nbBatches; end - logString(string.format('Epoch train %0.2fs (%0.2fms / sample), validation %0.2fs (%0.2fms / sample).\n', - trainTime, - 1000 * trainTime / trainData.nbSamples, - validationTime, - 1000 * validationTime / validationData.nbSamples)) + printf('Epoch train %0.2fs (%0.2fms / sample), validation %0.2fs (%0.2fms / sample).', + trainTime, + 1000 * trainTime / trainData.nbSamples, + validationTime, + 1000 * validationTime / validationData.nbSamples) - logString(string.format('LOSS %d %f %f\n', e, averageTrainLoss, averageValidationLoss), - colors.green) + printfc(colors.green, 'LOSS %d %f %f', e, averageTrainLoss, averageValidationLoss) ---------------------------------------------------------------------- -- Save a persistent state so that we can restart from there - if learningStateFile then - model.RNGState = torch.getRNGState() - model.epoch = e - model:clearState() - logString('Writing ' .. learningStateFile .. '.\n') - torch.save(learningStateFile, model) - end + model:clearState() + model.RNGState = torch.getRNGState() + model.epoch = e + torch.save(params.rundir .. '/model_last.t7', model) ---------------------------------------------------------------------- -- Save a duplicate of the persistent state from time to time - if opt.resultFreq > 0 and e%opt.resultFreq == 0 then - torch.save(string.format('%s/epoch_%05d_model', opt.resultDir, e), model) + if params.resultFreq > 0 and e%params.resultFreq == 0 then + torch.save(string.format('%s/model_%04d.t7', params.rundir, e), model) saveResultImage(model, trainData) saveResultImage(model, validationData) end @@ -692,64 +568,65 @@ end function createAndTrainModel(trainData, validationData) - local model + -- Load the current training state, or create a new model from + -- scratch - local learningStateFile = opt.learningStateFile - - if learningStateFile == '' then - learningStateFile = opt.resultDir .. '/learning.state' - end + if pcall(function () model = torch.load(params.rundir .. '/model_last.t7') end) then - local gotlearningStateFile + printfc(colors.red, + 'Found a learning state with %d epochs finished, starting from there.', + model.epoch) - logString('Using the learning state file ' .. learningStateFile .. '\n') - - if pcall(function () model = torch.load(learningStateFile) end) then - - gotlearningStateFile = true - - else - - model = createModel(config.filterSize, config.nbChannels, config.nbBlocks) - - if useGPU then - logString('Moving the model to the GPU.\n') - model:cuda() + if params.exampleInternals > 0 then + saveInternalsImage(model, validationData, params.exampleInternals) + os.exit(0) end - end + else - logString(tostring(model) .. '\n') + model = createModel(trainData.width, trainData.height, + params.filterSize, params.nbChannels, + params.nbBlocks) - if gotlearningStateFile then - logString(string.format('Found a learning state with %d epochs finished.\n', model.epoch), - colors.red) end - if opt.exampleInternals > 0 then - saveInternalsImage(model, validationData, opt.exampleInternals) - os.exit(0) - end - - trainModel(model, - trainData, validationData, - config.nbEpochs, config.learningRate, - learningStateFile) + trainModel(model, trainData, validationData) return model end -for i, j in pairs(config) do - logString('config ' .. i .. ' = \'' .. j ..'\'\n') +---------------------------------------------------------------------- +-- main + +for _, c in pairs({ + 'date', + 'uname -a', + 'git log -1 --format=%H' + }) +do + logCommand(c) end -local trainData = loadData(1, config.nbTrainSamples, 'train') -local validationData = loadData(config.nbTrainSamples + 1, config.nbValidationSamples, 'validation') -local testData = loadData(config.nbTrainSamples + config.nbValidationSamples + 1, config.nbTestSamples, 'test') +local trainData = loadData(1, + params.nbTrainSamples, 'train') + +local validationData = loadData(params.nbTrainSamples + 1, + params.nbValidationSamples, 'validation') local model = createAndTrainModel(trainData, validationData) +---------------------------------------------------------------------- +-- Test + +local testData = loadData(params.nbTrainSamples + params.nbValidationSamples + 1, + params.nbTestSamples, 'test') + +if params.useGPU then + print('Moving the model and criterion to the GPU.') + model:cuda() +end + saveResultImage(model, trainData) saveResultImage(model, validationData) -saveResultImage(model, testData, nil, testData.nbSamples) +saveResultImage(model, testData, 1024)