Person-reID-triplet-loss
Validation loss curve
Is there a way to plot a validation loss curve alongside the training loss during training?
Here is my train method:
def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
    # relies on globals from the surrounding script: dataloaders, opt, use_gpu,
    # start_epoch, version, fp16, dataset_sizes, y_loss, y_err,
    # save_network and draw_curve
    since = time.time()
    best_model_wts = model.state_dict()
    best_acc = 0.0
    last_margin = 0.0

    for epoch in range(num_epochs - start_epoch):
        epoch = epoch + start_epoch
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and a validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                scheduler.step()
                model.train(True)   # set model to training mode
            else:
                model.train(False)  # set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0.0
            running_margin = 0.0
            running_reg = 0.0

            # Iterate over data.
            for data in dataloaders[phase]:
                # get the inputs: anchor images, their labels, and 4 positives each
                inputs, labels, pos, pos_labels = data
                now_batch_size, c, h, w = inputs.shape
                if now_batch_size < opt.batchsize:  # skip the last incomplete batch
                    continue
                pos = pos.view(4 * opt.batchsize, c, h, w)
                # repeat pos_labels 4 times so they line up with the flattened positives
                pos_labels = pos_labels.repeat(4).reshape(4, opt.batchsize)
                pos_labels = pos_labels.transpose(0, 1).reshape(4 * opt.batchsize)

                # wrap them in Variable
                if use_gpu:
                    inputs = Variable(inputs.cuda())
                    pos = Variable(pos.cuda())
                    labels = Variable(labels.cuda())
                else:
                    inputs, labels = Variable(inputs), Variable(labels)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                if phase == 'val':
                    with torch.no_grad():
                        outputs, f = model(inputs)
                        _, pf = model(pos)
                else:
                    # model_eval = copy.deepcopy(model)
                    # model_eval = model_eval.eval()
                    outputs, f = model(inputs)
                    _, pf = model(pos)
                    # pf = Variable(pf, requires_grad=True)

                neg_labels = pos_labels

                # hard-negative mining
                # ----------------------------------
                nf_data = pf  # 128*512
                # 128 candidates are too many; sample a pool of opt.poolsize (e.g. 64)
                rand = np.random.permutation(4 * opt.batchsize)[0:opt.poolsize]
                nf_data = nf_data[rand, :]
                neg_labels = neg_labels[rand]
                nf_t = nf_data.transpose(0, 1)                     # 512*128
                score = torch.mm(f.data, nf_t)                     # cosine similarity, 32*128
                score, rank = score.sort(dim=1, descending=True)   # high score == hard
                labels_cpu = labels.cpu()
                nf_hard = torch.zeros(f.shape).cuda()
                for k in range(now_batch_size):
                    hard = rank[k, :]
                    for kk in hard:
                        now_label = neg_labels[kk]
                        anchor_label = labels_cpu[k]
                        if now_label != anchor_label:
                            nf_hard[k, :] = nf_data[kk, :]
                            break

                # hard-positive mining
                # ----------------------------------
                pf_hard = torch.zeros(f.shape).cuda()  # 32*512
                for k in range(now_batch_size):
                    pf_data = pf[4 * k:4 * k + 4, :]
                    pf_t = pf_data.transpose(0, 1)      # 512*4
                    ff = f.data[k, :].reshape(1, -1)    # 1*512
                    score = torch.mm(ff, pf_t)          # cosine similarity
                    score, rank = score.sort(dim=1, descending=False)  # low score == hard
                    pf_hard[k, :] = pf_data[rank[0][0], :]

                # loss
                # ---------------------------------
                criterion_triplet = nn.MarginRankingLoss(margin=opt.margin)
                pscore = torch.sum(f * pf_hard, dim=1)
                nscore = torch.sum(f * nf_hard, dim=1)
                y = torch.ones(now_batch_size)
                y = Variable(y.cuda())  # kept for the MarginRankingLoss variant below

                if not opt.PCB:
                    _, preds = torch.max(outputs.data, 1)
                    # loss = criterion(outputs, labels)
                    # loss_triplet = criterion_triplet(f, pf, nf)
                    reg = torch.sum((1 + nscore) ** 2) + torch.sum((-1 + pscore) ** 2)
                    loss = torch.sum(torch.nn.functional.relu(nscore + opt.margin - pscore))  # sum, not mean
                    loss_triplet = loss + opt.alpha * reg
                else:
                    part = {}
                    sm = nn.Softmax(dim=1)
                    num_part = 6
                    for i in range(num_part):
                        part[i] = outputs[i]
                    score = sm(part[0]) + sm(part[1]) + sm(part[2]) \
                          + sm(part[3]) + sm(part[4]) + sm(part[5])
                    _, preds = torch.max(score.data, 1)
                    loss = criterion(part[0], labels)
                    for i in range(num_part - 1):
                        loss += criterion(part[i + 1], labels)
                    loss_triplet = loss   # PCB path has no triplet/reg term;
                    reg = torch.zeros(1)  # define both so the statistics below work

                # backward + optimize only if in training phase
                if phase == 'train':
                    if fp16:  # with apex fp16, backward through the scaled loss
                        with amp.scale_loss(loss_triplet, optimizer) as scaled_loss:
                            scaled_loss.backward()
                    else:
                        loss_triplet.backward()
                    optimizer.step()

                # statistics
                if int(version[0]) > 0 or int(version[2]) > 3:  # new versions, e.g. 0.4.x and later
                    running_loss += loss_triplet.item()  # * opt.batchsize
                else:  # old versions, e.g. 0.3.0 and 0.3.1
                    running_loss += loss_triplet.data[0]  # * opt.batchsize
                running_corrects += float(torch.sum(pscore > nscore + opt.margin))
                running_margin += float(torch.sum(pscore - nscore))
                running_reg += float(reg)

            datasize = dataset_sizes[phase] // opt.batchsize * opt.batchsize
            epoch_loss = running_loss / datasize
            epoch_reg = opt.alpha * running_reg / datasize
            epoch_acc = running_corrects / datasize
            epoch_margin = running_margin / datasize

            # if epoch_acc > 0.75:
            #     opt.margin = min(opt.margin + 0.02, 1.0)
            print('now_margin: %.4f' % opt.margin)
            print('{} Loss: {:.4f} Reg: {:.4f} Acc: {:.4f} MeanMargin: {:.4f}'.format(
                phase, epoch_loss, epoch_reg, epoch_acc, epoch_margin))

            y_loss[phase].append(epoch_loss)
            y_err[phase].append(1.0 - epoch_acc)

            # deep copy the model
            if epoch_margin > last_margin:
                last_margin = epoch_margin
                last_model_wts = model.state_dict()
            if phase == 'val':
                last_model_wts = model.state_dict()
                if epoch % 10 == 9:
                    save_network(model, epoch)
                draw_curve(epoch)

        time_elapsed = time.time() - since
        print('Time elapsed so far: {:.0f}m {:.0f}s'.format(
            time_elapsed // 60, time_elapsed % 60))
        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    # print('Best val Acc: {:4f}'.format(best_acc))
    # load the last model weights
    model.load_state_dict(last_model_wts)
    save_network(model, 'last')
    return model
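For the plotting itself: the loop above already appends the per-epoch numbers to y_loss[phase] and y_err[phase] for both phases, so all that is needed is a draw_curve that plots both. Here is a minimal sketch in the style of the baseline scripts; the figure layout and the save path are assumptions, so adjust them to your setup:

import matplotlib
matplotlib.use('agg')  # render off-screen, works on headless servers
import matplotlib.pyplot as plt

x_epoch = []
fig = plt.figure()
ax0 = fig.add_subplot(121, title='loss')
ax1 = fig.add_subplot(122, title='top1-err')

def draw_curve(current_epoch):
    # plot train and val side by side from the y_loss / y_err globals
    x_epoch.append(current_epoch)
    ax0.plot(x_epoch, y_loss['train'], 'bo-', label='train')
    ax0.plot(x_epoch, y_loss['val'], 'ro-', label='val')
    ax1.plot(x_epoch, y_err['train'], 'bo-', label='train')
    ax1.plot(x_epoch, y_err['val'], 'ro-', label='val')
    if current_epoch == 0:
        ax0.legend()
        ax1.legend()
    fig.savefig('./model/train.jpg')  # assumed path; pick any writable location

Because train_model calls draw_curve only after the 'val' phase, x_epoch, y_loss['train'] and y_loss['val'] all grow by one entry per epoch, so the list lengths always match.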
I tried to add a 'val' phase to your train method ("for phase in ['train', 'val']"), but I get the following error while looping over the val dataloader (at the line "for data in dataloaders[phase]"), probably caused by dataloaders['val']:
Traceback (most recent call last):
File "train_siamese.py", line 593, in <module>
num_epochs=150)
File "train_siamese.py", line 323, in train_model
for data in dataloaders[phase]:
File "/home/rajat/MyPC/DFKI/MasterThesis/Vehicle_Reidentification/veri/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 582, in __next__
return self._process_next_batch(batch)
File "/home/rajat/MyPC/DFKI/MasterThesis/Vehicle_Reidentification/veri/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 608, in _process_next_batch
raise batch.exc_type(batch.exc_msg)
ZeroDivisionError: Traceback (most recent call last):
File "/home/rajat/MyPC/DFKI/MasterThesis/Vehicle_Reidentification/veri/lib/python3.6/site-packages/torch/utils/data/_utils/worker.py", line 99, in _worker_loop
samples = collate_fn([dataset[i] for i in batch_indices])
File "/home/rajat/MyPC/DFKI/MasterThesis/Vehicle_Reidentification/veri/lib/python3.6/site-packages/torch/utils/data/_utils/worker.py", line 99, in <listcomp>
samples = collate_fn([dataset[i] for i in batch_indices])
File "/home/rajat/MyPC/DFKI/MasterThesis/Vehicle_Reidentification/tripletfolder.py", line 47, in __getitem__
pos_path = self._get_pos_sample(target, index)
File "/home/rajat/MyPC/DFKI/MasterThesis/Vehicle_Reidentification/tripletfolder.py", line 32, in _get_pos_sample
t = i%len(rand)
ZeroDivisionError: integer division or modulo by zero
Of course this happens: in the 'val' set there is only one snapshot per identity, so _get_pos_sample finds no other image of the same identity and rand is empty. The ZeroDivisionError comes from t = i % len(rand) with len(rand) == 0.
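If you still want the 'val' loop to run, one workaround is to make _get_pos_sample fall back to the anchor image whenever an identity has no other snapshot, so len(rand) is never zero. A sketch, assuming the dataset keeps ImageFolder-style self.targets and self.samples lists (match the names to your tripletfolder.py):

import numpy as np

def _get_pos_sample(self, target, index):
    # indices of all images sharing the anchor's identity
    pos_index = np.argwhere(np.asarray(self.targets) == target).flatten()
    pos_index = np.setdiff1d(pos_index, index)  # drop the anchor itself
    if len(pos_index) == 0:
        # only one snapshot for this identity (the val case):
        # reuse the anchor as its own positive instead of dividing by zero
        pos_index = np.asarray([index])
    rand = np.random.permutation(len(pos_index))
    result_path = []
    for i in range(4):  # train_model expects 4 positives per anchor
        t = i % len(rand)  # len(rand) >= 1 now, no ZeroDivisionError
        tmp_index = pos_index[rand[t]]
        result_path.append(self.samples[tmp_index][0])
    return result_path

With this fallback the validation hard positive is sometimes the anchor itself, so the val triplet loss becomes an optimistic estimate; the cleaner alternative is to build the val split so that every identity keeps at least two images.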