
Why does the perplexity go wrong after reloading a saved model?

Open · BigNewbiePlus opened this issue on Mar 6, 2017 · 0 comments

I want to save the trained model after every epoch. I save the local model defined in this code, plus some extra data (e.g. step). But every time I resume from the previously saved model, the training perplexity starts out very low (158) and then increases instead of decreasing, and the norm always stays at 0. It does not behave like the model did when it was saved. What am I missing? The only thing I changed is the main function:

local function main()
   --torch.setnumthreads(10)
   --print('threads: ', torch.getnumthreads())
   g_make_deterministic(1)
   
   state_train = {data=ptb.traindataset(params.batch_size)}
   state_valid = {data=ptb.validdataset(params.batch_size)}
   state_test  = {data=ptb.testdataset(params.batch_size)}
   params.vocab_size = ptb.vocab_size()
   print('Network parameters')
   print(params)
      
   local states = {state_train, state_valid, state_test}
   for _, state in pairs(states) do
      reset_state(state)
   end

   setup()

   -- load the saved model, if one exists, before training
   local saved_model
   local file = io.open(params.model_path, "rb")
   if file then
      file:close()
      saved_model = torch.load(params.model_path)
      print('load from previous saved model')
   end
   
   -- replace the freshly built model with the loaded one, if any
   model = saved_model or model
   collectgarbage()

   -- restore the training bookkeeping saved with the model (defaults for a fresh run)
   state_train.pos = model.state_train_pos or 1
   params.lr = model.lr or params.lr
   local step = model.step or 0
   local epoch = model.epoch or 0
   local total_cases = model.total_cases or 0
   local tics = model.tics or 0
   local beginning_time = torch.tic() - tics
   local start_time = torch.tic() - tics
   print('Starting training')
   local words_per_step = params.seq_length * params.batch_size
   local epoch_size = torch.floor(state_train.data:size(1) / params.seq_length)
   local perps
   while epoch < params.max_max_epoch do
      local perp = fp(state_train)
      if perps == nil then
         perps = torch.zeros(epoch_size):add(perp)
      end
      perps[step % epoch_size + 1] = perp
      step = step + 1
      bp(state_train)
      total_cases = total_cases + params.seq_length * params.batch_size
      epoch = step / epoch_size
      if step % torch.round(epoch_size / 10) == 10 then
         local wps = torch.floor(total_cases / torch.toc(start_time))
         local since_beginning = g_d(torch.toc(beginning_time) / 60)
         print('epoch = ' .. g_f3(epoch) ..
               ', train perp. = ' .. g_f3(torch.exp(perps:mean())) ..
               ', wps = ' .. wps ..
               ', dw:norm() = ' .. g_f3(model.norm_dw) ..
               ', lr = ' .. g_f3(params.lr) ..
               ', since beginning = ' .. since_beginning .. ' mins')

         -- attach the resume state to the model table and checkpoint it to model_path
         model.step = step
         model.epoch = epoch
         model.total_cases = total_cases
         model.tics = torch.tic() - beginning_time
         model.state_train_pos = state_train.pos
         model.lr = params.lr
         --clear_state()
         torch.save(params.model_path, model)
      end
      if step % epoch_size == 0 then
         run_valid()
         if epoch > params.max_epoch then
            params.lr = params.lr / params.decay
         end
      end
      if step % 33 == 0 then
         collectgarbage()
      end
   end
   run_test()
   print('training is over.')
end
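For comparison, here is the kind of restore I would have expected to behave identically: checkpoint only the flattened parameters plus the bookkeeping, and after setup() copy them back into the live network instead of swapping out the whole model table. This is just a rough, untested sketch; it assumes setup() defines the globals paramx/paramdx via core_network:getParameters() as in the original main.lua, and the checkpoint field names are made up.

-- Rough sketch (untested): save only the flattened parameters and the
-- bookkeeping, then copy them back into the network built by setup().
-- Assumes paramx is the flattened parameter tensor returned by
-- core_network:getParameters() inside setup(); field names are hypothetical.

-- saving (e.g. where torch.save(params.model_path, model) is called above):
local checkpoint = {
   paramx = paramx:float(),           -- flattened weights, moved to CPU
   step = step,
   epoch = epoch,
   lr = params.lr,
   state_train_pos = state_train.pos,
}
torch.save(params.model_path, checkpoint)

-- restoring (placed right after setup(), so paramx points at the live storage):
local f = io.open(params.model_path, "rb")
if f then
   f:close()
   local checkpoint = torch.load(params.model_path)
   paramx:copy(checkpoint.paramx)     -- overwrite the fresh weights in place
   params.lr = checkpoint.lr
   state_train.pos = checkpoint.state_train_pos
   -- step, epoch and total_cases would be restored the same way
end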

When training from scratch, it works fine: tmpdir--17_3_6_15_27_50. When training from the previously saved model, it goes wrong: tmpdir--17_3_6_15_29_26.

Why? Does anyone know the reason?

BigNewbiePlus · Mar 06 '17