Person-reID-triplet-loss
Validation loss curve
Is there a way to plot a validation loss curve alongside the training loss during training?
Here is my train method:
def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
    # relies on globals from the surrounding script: dataloaders, opt, use_gpu,
    # start_epoch, version, fp16, dataset_sizes, y_loss, y_err,
    # save_network and draw_curve
    since = time.time()
    best_model_wts = model.state_dict()
    best_acc = 0.0
    last_margin = 0.0

    for epoch in range(num_epochs - start_epoch):
        epoch = epoch + start_epoch
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and a validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                scheduler.step()
                model.train(True)   # set model to training mode
            else:
                model.train(False)  # set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0.0
            running_margin = 0.0
            running_reg = 0.0

            # Iterate over data.
            for data in dataloaders[phase]:
                # get the inputs: anchor images, their labels, and 4 positives each
                inputs, labels, pos, pos_labels = data
                now_batch_size, c, h, w = inputs.shape
                if now_batch_size < opt.batchsize:  # skip the last incomplete batch
                    continue
                pos = pos.view(4 * opt.batchsize, c, h, w)
                # repeat pos_labels 4 times so they line up with the flattened positives
                pos_labels = pos_labels.repeat(4).reshape(4, opt.batchsize)
                pos_labels = pos_labels.transpose(0, 1).reshape(4 * opt.batchsize)

                # wrap them in Variable
                if use_gpu:
                    inputs = Variable(inputs.cuda())
                    pos = Variable(pos.cuda())
                    labels = Variable(labels.cuda())
                else:
                    inputs, labels = Variable(inputs), Variable(labels)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                if phase == 'val':
                    with torch.no_grad():
                        outputs, f = model(inputs)
                        _, pf = model(pos)
                else:
                    # model_eval = copy.deepcopy(model)
                    # model_eval = model_eval.eval()
                    outputs, f = model(inputs)
                    _, pf = model(pos)
                    # pf = Variable(pf, requires_grad=True)

                neg_labels = pos_labels

                # hard-negative mining
                # ----------------------------------
                nf_data = pf  # 128*512
                # 128 candidates are too many; sample a pool of opt.poolsize (e.g. 64)
                rand = np.random.permutation(4 * opt.batchsize)[0:opt.poolsize]
                nf_data = nf_data[rand, :]
                neg_labels = neg_labels[rand]
                nf_t = nf_data.transpose(0, 1)                     # 512*128
                score = torch.mm(f.data, nf_t)                     # cosine similarity, 32*128
                score, rank = score.sort(dim=1, descending=True)   # high score == hard
                labels_cpu = labels.cpu()
                nf_hard = torch.zeros(f.shape).cuda()
                for k in range(now_batch_size):
                    hard = rank[k, :]
                    for kk in hard:
                        now_label = neg_labels[kk]
                        anchor_label = labels_cpu[k]
                        if now_label != anchor_label:
                            nf_hard[k, :] = nf_data[kk, :]
                            break

                # hard-positive mining
                # ----------------------------------
                pf_hard = torch.zeros(f.shape).cuda()  # 32*512
                for k in range(now_batch_size):
                    pf_data = pf[4 * k:4 * k + 4, :]
                    pf_t = pf_data.transpose(0, 1)      # 512*4
                    ff = f.data[k, :].reshape(1, -1)    # 1*512
                    score = torch.mm(ff, pf_t)          # cosine similarity
                    score, rank = score.sort(dim=1, descending=False)  # low score == hard
                    pf_hard[k, :] = pf_data[rank[0][0], :]

                # loss
                # ---------------------------------
                criterion_triplet = nn.MarginRankingLoss(margin=opt.margin)
                pscore = torch.sum(f * pf_hard, dim=1)
                nscore = torch.sum(f * nf_hard, dim=1)
                y = torch.ones(now_batch_size)
                y = Variable(y.cuda())  # kept for the MarginRankingLoss variant below

                if not opt.PCB:
                    _, preds = torch.max(outputs.data, 1)
                    # loss = criterion(outputs, labels)
                    # loss_triplet = criterion_triplet(f, pf, nf)
                    reg = torch.sum((1 + nscore) ** 2) + torch.sum((-1 + pscore) ** 2)
                    loss = torch.sum(torch.nn.functional.relu(nscore + opt.margin - pscore))  # sum, not mean
                    loss_triplet = loss + opt.alpha * reg
                else:
                    part = {}
                    sm = nn.Softmax(dim=1)
                    num_part = 6
                    for i in range(num_part):
                        part[i] = outputs[i]
                    score = sm(part[0]) + sm(part[1]) + sm(part[2]) \
                          + sm(part[3]) + sm(part[4]) + sm(part[5])
                    _, preds = torch.max(score.data, 1)
                    loss = criterion(part[0], labels)
                    for i in range(num_part - 1):
                        loss += criterion(part[i + 1], labels)
                    loss_triplet = loss   # PCB path has no triplet/reg term;
                    reg = torch.zeros(1)  # define both so the statistics below work

                # backward + optimize only if in training phase
                if phase == 'train':
                    if fp16:  # with apex fp16, backward through the scaled loss
                        with amp.scale_loss(loss_triplet, optimizer) as scaled_loss:
                            scaled_loss.backward()
                    else:
                        loss_triplet.backward()
                    optimizer.step()

                # statistics
                if int(version[0]) > 0 or int(version[2]) > 3:  # new versions, e.g. 0.4.x and later
                    running_loss += loss_triplet.item()  # * opt.batchsize
                else:  # old versions, e.g. 0.3.0 and 0.3.1
                    running_loss += loss_triplet.data[0]  # * opt.batchsize
                running_corrects += float(torch.sum(pscore > nscore + opt.margin))
                running_margin += float(torch.sum(pscore - nscore))
                running_reg += float(reg)

            datasize = dataset_sizes[phase] // opt.batchsize * opt.batchsize
            epoch_loss = running_loss / datasize
            epoch_reg = opt.alpha * running_reg / datasize
            epoch_acc = running_corrects / datasize
            epoch_margin = running_margin / datasize

            # if epoch_acc > 0.75:
            #     opt.margin = min(opt.margin + 0.02, 1.0)
            print('now_margin: %.4f' % opt.margin)
            print('{} Loss: {:.4f} Reg: {:.4f} Acc: {:.4f} MeanMargin: {:.4f}'.format(
                phase, epoch_loss, epoch_reg, epoch_acc, epoch_margin))

            y_loss[phase].append(epoch_loss)
            y_err[phase].append(1.0 - epoch_acc)

            # deep copy the model
            if epoch_margin > last_margin:
                last_margin = epoch_margin
                last_model_wts = model.state_dict()
            if phase == 'val':
                last_model_wts = model.state_dict()
                if epoch % 10 == 9:
                    save_network(model, epoch)
                draw_curve(epoch)

        time_elapsed = time.time() - since
        print('Time elapsed so far: {:.0f}m {:.0f}s'.format(
            time_elapsed // 60, time_elapsed % 60))
        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    # print('Best val Acc: {:4f}'.format(best_acc))
    # load the last model weights
    model.load_state_dict(last_model_wts)
    save_network(model, 'last')
    return model
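For the plotting itself: the loop above already appends the per-epoch numbers to y_loss[phase] and y_err[phase] for both phases, so all that is needed is a draw_curve that plots both. Here is a minimal sketch in the style of the baseline scripts; the figure layout and the save path are assumptions, so adjust them to your setup:

import matplotlib
matplotlib.use('agg')  # render off-screen, works on headless servers
import matplotlib.pyplot as plt

x_epoch = []
fig = plt.figure()
ax0 = fig.add_subplot(121, title='loss')
ax1 = fig.add_subplot(122, title='top1-err')

def draw_curve(current_epoch):
    # plot train and val side by side from the y_loss / y_err globals
    x_epoch.append(current_epoch)
    ax0.plot(x_epoch, y_loss['train'], 'bo-', label='train')
    ax0.plot(x_epoch, y_loss['val'], 'ro-', label='val')
    ax1.plot(x_epoch, y_err['train'], 'bo-', label='train')
    ax1.plot(x_epoch, y_err['val'], 'ro-', label='val')
    if current_epoch == 0:
        ax0.legend()
        ax1.legend()
    fig.savefig('./model/train.jpg')  # assumed path; pick any writable location

Because train_model calls draw_curve only after the 'val' phase, x_epoch, y_loss['train'] and y_loss['val'] all grow by one entry per epoch, so the list lengths always match.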
I tried to add a 'val' phase to your train method ("for phase in ['train', 'val']"), but I get the following error while looping over the val dataloader (at the line "for data in dataloaders[phase]"), probably caused by dataloaders['val']:
Traceback (most recent call last):
File "train_siamese.py", line 593, in <module>
num_epochs=150)
File "train_siamese.py", line 323, in train_model
for data in dataloaders[phase]:
File "/home/rajat/MyPC/DFKI/MasterThesis/Vehicle_Reidentification/veri/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 582, in __next__
return self._process_next_batch(batch)
File "/home/rajat/MyPC/DFKI/MasterThesis/Vehicle_Reidentification/veri/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 608, in _process_next_batch
raise batch.exc_type(batch.exc_msg)
ZeroDivisionError: Traceback (most recent call last):
File "/home/rajat/MyPC/DFKI/MasterThesis/Vehicle_Reidentification/veri/lib/python3.6/site-packages/torch/utils/data/_utils/worker.py", line 99, in _worker_loop
samples = collate_fn([dataset[i] for i in batch_indices])
File "/home/rajat/MyPC/DFKI/MasterThesis/Vehicle_Reidentification/veri/lib/python3.6/site-packages/torch/utils/data/_utils/worker.py", line 99, in <listcomp>
samples = collate_fn([dataset[i] for i in batch_indices])
File "/home/rajat/MyPC/DFKI/MasterThesis/Vehicle_Reidentification/tripletfolder.py", line 47, in __getitem__
pos_path = self._get_pos_sample(target, index)
File "/home/rajat/MyPC/DFKI/MasterThesis/Vehicle_Reidentification/tripletfolder.py", line 32, in _get_pos_sample
t = i%len(rand)
ZeroDivisionError: integer division or modulo by zero
Of course this happens: in the 'val' set there is only one snapshot per identity, so _get_pos_sample finds no other image of the same identity and rand is empty. The ZeroDivisionError comes from t = i % len(rand) with len(rand) == 0.
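If you still want the 'val' loop to run, one workaround is to make _get_pos_sample fall back to the anchor image whenever an identity has no other snapshot, so len(rand) is never zero. A sketch, assuming the dataset keeps ImageFolder-style self.targets and self.samples lists (match the names to your tripletfolder.py):

import numpy as np

def _get_pos_sample(self, target, index):
    # indices of all images sharing the anchor's identity
    pos_index = np.argwhere(np.asarray(self.targets) == target).flatten()
    pos_index = np.setdiff1d(pos_index, index)  # drop the anchor itself
    if len(pos_index) == 0:
        # only one snapshot for this identity (the val case):
        # reuse the anchor as its own positive instead of dividing by zero
        pos_index = np.asarray([index])
    rand = np.random.permutation(len(pos_index))
    result_path = []
    for i in range(4):  # train_model expects 4 positives per anchor
        t = i % len(rand)  # len(rand) >= 1 now, no ZeroDivisionError
        tmp_index = pos_index[rand[t]]
        result_path.append(self.samples[tmp_index][0])
    return result_path

With this fallback the validation hard positive is sometimes the anchor itself, so the val triplet loss becomes an optimistic estimate; the cleaner alternative is to build the val split so that every identity keeps at least two images.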