
Refactor code in train.py

Open pvti opened this issue 4 years ago • 5 comments

Hi @ZQPei , I'm currently working on a tracking-by-detection project that is closely related to DeepSORT, and I've spent quite some effort getting your repo to work for me. Would you mind rearranging the code in train.py? Here is my rewrite of train.py, with the related blocks grouped into functions as follows:

import os, time
import numpy as np
import argparse
import torch
import torchvision
import torch.backends.cudnn as cudnn
from model import Net

def get_parser():
    parser = argparse.ArgumentParser(description='Train feature extractor for DeepSort')
    parser.add_argument('--data-dir', default='./Mars', type=str, help='Path to data directory, e.g. ./Mars or ./Market1501')
    parser.add_argument('--no-cuda', action='store_true')
    parser.add_argument("--gpu-id", default=0, type=int)
    parser.add_argument('--learning-rate', default=0.1, type=float, help='learning rate')
    parser.add_argument('--interval', '-i', default=20, type=int)
    parser.add_argument('--resume', '-r', action='store_true')
    return parser

def setup_device(gpu_id, no_cuda):
    device = "cuda:{}".format(gpu_id) if torch.cuda.is_available() and not no_cuda else "cpu"
    if torch.cuda.is_available() and not no_cuda:
        cudnn.benchmark = True
    return device

def load_data(data_dir):
    train_dir = os.path.join(data_dir, 'train')
    test_dir = os.path.join(data_dir, 'test')
    transform_train = torchvision.transforms.Compose([
        torchvision.transforms.RandomCrop((128, 64), padding=4),
        torchvision.transforms.RandomHorizontalFlip(),
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ])
    # no random augmentation at test time, only resize to the network input size
    transform_test = torchvision.transforms.Compose([
        torchvision.transforms.Resize((128, 64)),
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ])
    trainloader = torch.utils.data.DataLoader(
        torchvision.datasets.ImageFolder(train_dir, transform=transform_train),
        batch_size=64, shuffle=True)
    testloader = torch.utils.data.DataLoader(
        torchvision.datasets.ImageFolder(test_dir, transform=transform_test),
        batch_size=64, shuffle=True)
    num_classes = len(trainloader.dataset.classes)
    return trainloader, testloader, num_classes

def define_net(num_classes, resume):
    start_epoch = 0
    best_acc = 0.
    net = Net(num_classes=num_classes)
    if resume:
        assert os.path.isfile('./checkpoint/ckpt.t7'), 'Error: no checkpoint file found!'
        print('Loading from ./checkpoint/ckpt.t7')
        checkpoint = torch.load('./checkpoint/ckpt.t7')
        net_dict = checkpoint['net_dict']
        net.load_state_dict(net_dict)
        best_acc = checkpoint['acc']
        start_epoch = checkpoint['epoch']
    return net, best_acc, start_epoch

def setup_loss_optimizer(net, learning_rate):
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(net.parameters(), learning_rate, momentum=0.9, weight_decay=5e-4)
    return criterion, optimizer

def train(interval, epoch, net, trainloader, device, criterion, optimizer):
    print('\nEpoch: %d'%(epoch+1))
    net.train()
    training_loss = 0.
    train_loss = 0.
    correct = 0
    total = 0
    start = time.time()
    for idx, (inputs, labels) in enumerate(trainloader):
        #forward
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        
        # backward
        optimizer.zero_grad()
        loss.backward()  # must be called, otherwise no gradients are computed
        optimizer.step()

        # accumulate statistics
        training_loss += loss.item()  # running loss for the current logging interval
        train_loss += loss.item()     # total loss over the whole epoch
        correct += outputs.max(dim=1)[1].eq(labels).sum().item()
        total += labels.size(0)
        
        # print statistics every `interval` batches
        if (idx+1) % interval == 0:
            end = time.time()
            print("[progress:{:.1f}%]time:{:.2f}s Loss:{:.5f} Correct:{}/{} Acc:{:.3f}%".format(
                100.*(idx+1)/len(trainloader), end-start, training_loss/interval,
                correct, total, 100.*correct/total))
            training_loss = 0.
            start = time.time()
    return train_loss/len(trainloader), 1. - correct/total, net

def test(epoch, net, testloader, device, criterion, best_acc):
    net.eval()
    test_loss = 0.
    correct = 0
    total = 0
    start = time.time()
    with torch.no_grad():
        for idx, (inputs, labels) in enumerate(testloader):
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            
            test_loss += loss.item()
            correct += outputs.max(dim=1)[1].eq(labels).sum().item()
            total += labels.size(0)
        
        print('Testing ...')
        end = time.time()
        print("[progress:{:.1f}%]time:{:.2f}s Loss:{:.5f} Correct:{}/{} Acc:{:.3f}%".format(100.*(idx+1)/len(testloader), end-start, test_loss/len(testloader), correct, total, 100.*correct/total))
        
    #save ckpt
    acc = 100.*correct/total
    if acc > best_acc:
        best_acc = acc
        print('Saving parameters to ./checkpoint/ckpt.t7')
        checkpoint = {
            'net_dict': net.state_dict(),
            'acc': acc,
            'epoch': epoch,
        }
        if not os.path.isdir('./checkpoint'):
            os.mkdir('checkpoint')
        torch.save(checkpoint, './checkpoint/ckpt.t7')
    return test_loss/len(testloader), 1. - correct/total, best_acc, net

def lr_decay(optimizer, factor=0.1):
    # decay the learning rate of every parameter group by the given factor
    for params in optimizer.param_groups:
        params['lr'] *= factor
        lr = params['lr']
        print('Learning rate adjusted to {}'.format(lr))
    return lr

def main():
    args = get_parser().parse_args()
    device = setup_device(args.gpu_id, args.no_cuda)
    trainloader, testloader, num_classes = load_data(args.data_dir)
    net, best_acc, start_epoch = define_net(num_classes, args.resume)
    net.to(device)
    criterion, optimizer = setup_loss_optimizer(net, args.learning_rate)
    for epoch in range(start_epoch, start_epoch+40):
        train_loss, train_err, net = train(args.interval, epoch, net, trainloader, device, criterion, optimizer)
        test_loss, test_err, best_acc, net = test(epoch, net, testloader, device, criterion, best_acc)
        if (epoch+1) % 20 == 0:
            lr_decay(optimizer)
    return

if __name__ == '__main__':
    main()

I also think we should write the training logs to a log directory so that the training progress can be visualized with TensorBoard. Any suggestions? Thanks for reading!
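A minimal sketch of what that could look like inside main(), assuming torch.utils.tensorboard is available; it reuses the names already defined there, and the ./logs directory and scalar tags are my own suggestion, not something already in train.py:

from torch.utils.tensorboard import SummaryWriter

# hypothetical log directory; could come from a new --log-dir argument
writer = SummaryWriter(log_dir='./logs')

for epoch in range(start_epoch, start_epoch + 40):
    train_loss, train_err, net = train(args.interval, epoch, net, trainloader, device, criterion, optimizer)
    test_loss, test_err, best_acc, net = test(epoch, net, testloader, device, criterion, best_acc)

    # scalars can then be inspected with: tensorboard --logdir ./logs
    writer.add_scalar('loss/train', train_loss, epoch)
    writer.add_scalar('loss/test', test_loss, epoch)
    writer.add_scalar('error/train', train_err, epoch)
    writer.add_scalar('error/test', test_err, epoch)

writer.close()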

pvti avatar Apr 18 '20 11:04 pvti

Hi @pvtien96, you can open a pull request and I will merge it into our code. Thank you for your contribution to this repo!

ZQPei avatar Apr 18 '20 11:04 ZQPei

I have three questions:

  1. Why train the model without using YOLOv3 for detection?
  2. I tried to train a model on the MARS-v160809 dataset; training runs fine, but the test accuracy is very low. What is the problem?
  3. How should the model be trained on the MARS-v160809 dataset?

394781865 avatar Apr 26 '20 08:04 394781865

@394781865 I'm not sure whether I have fully understood your questions.

  1. You can combine DeepSORT with any detection model, such as YOLO and its variants (YOLOv2, YOLOv3, YOLOv4, ...), Faster R-CNN, SSD, and so on (see the sketch after this list).
  2. I have not managed to test the model successfully either.
  3. I used to ask the same kind of questions. You should read through all the issues in this repo and work it out for yourself. Hope this helps.
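For illustration, here is a rough sketch of how a detector can be decoupled from the tracker. The DeepSort import path, checkpoint path, constructor arguments and the update() signature below are assumptions that should be checked against deep_sort/deep_sort.py in this repo, and my_detector is a hypothetical stand-in for whatever detection model you use:

import cv2
import numpy as np
from deep_sort import DeepSort  # assumption: adjust the import to this repo's layout

# assumption: the re-ID checkpoint trained with train.py; check the exact constructor
deepsort = DeepSort("deep_sort/deep/checkpoint/ckpt.t7")

def my_detector(frame):
    # hypothetical detector: replace with YOLOv3, Faster R-CNN, SSD, ...
    # expected to return boxes as (center_x, center_y, w, h) plus one confidence per box
    return np.empty((0, 4)), np.empty((0,))

cap = cv2.VideoCapture("video.avi")
while True:
    ret, frame = cap.read()
    if not ret:
        break
    bbox_xcycwh, conf = my_detector(frame)
    if len(bbox_xcycwh) > 0:
        # assumption: update() takes boxes, confidences and the original image,
        # and returns one row per confirmed track, e.g. [x1, y1, x2, y2, track_id]
        tracks = deepsort.update(bbox_xcycwh, conf, frame)
cap.release()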

pvti avatar Apr 26 '20 09:04 pvti

I have three questions:

  1. Why train the model without using YOLOv3 for detection?
  2. I tried to train a model on the MARS-v160809 dataset; training runs fine, but the test accuracy is very low. What is the problem?
  3. How should the model be trained on the MARS-v160809 dataset?

Hi, I encountered the same problem. I am training on the MARS dataset and also get very low accuracy, and the training speed is extremely slow: I have trained for 3 days and only completed 9 epochs. Did you figure out the problem?

china56321 avatar Aug 28 '20 02:08 china56321

Hi @pvtien96

I'm curious that you refactored the code above but did not mention or change the fact that torchvision.datasets.ImageFolder(train_dir, transform=transform_train) simply does not work with the Market1501 dataset structure (Market1501 keeps all crops in one flat folder, with the person ID encoded in the first four characters of the file name, rather than one subfolder per class).

I created this custom Dataset loader:

import natsort
import os
from PIL import Image

from torch.utils.data import Dataset

class Market1501DataSet(Dataset):
    def __init__(self, main_dir, transform):
        self.main_dir = main_dir
        self.transform = transform
        all_imgs = os.listdir(main_dir)
        self.total_imgs = natsort.natsorted(all_imgs)
        # __getitem__ must return an index into the list "classes";
        # the person ID is the first four characters of the file name
        image_names = [x[0:4] for x in self.total_imgs]
        self.classes_dict = dict.fromkeys(image_names)  # de-duplicate while preserving order
        for i, key in enumerate(self.classes_dict):
            self.classes_dict[key] = i
        self.classes = list(self.classes_dict)

    def __len__(self):
        return len(self.total_imgs)

    ## Returns: Tuple (image, target) where target is the index of the target category.
    def __getitem__(self, idx):
        img_loc = os.path.join(self.main_dir, self.total_imgs[idx])
        image = Image.open(img_loc).convert("RGB")
        tensor_image = self.transform(image)

        label = self.total_imgs[idx][0:4]
        index_of_label = self.classes_dict[label]

        return (tensor_image, index_of_label)

and then in train.py

# train_dir = os.path.join(root,"bounding_box_train") # EDITed this change out after rearranging the Market1501 dataset structure
# test_dir = os.path.join(root,"bounding_box_test") # EDITed this change out after rearranging the Market1501 dataset structure
train_dir = os.path.join(root,"train")
test_dir = os.path.join(root,"test")

market_train = Market1501DataSet(train_dir, transform=transform_train)
market_test = Market1501DataSet(test_dir, transform=transform_test)

trainloader = torch.utils.data.DataLoader(
    market_train,
    batch_size=64,shuffle=True
)
testloader = torch.utils.data.DataLoader(
    market_test,
    batch_size=64,shuffle=True
)

What did you do?

(I'd like to discuss this with you and the author, because I'm not sure whether this is strictly correct: we would be training on 751 classes but then testing on 750 completely different classes that reuse the same index IDs, so the network should be confused.)

EDIT: I rearranged the Market1501 dataset so that two new folders, train and test, each contain half the images of every person; that way we train on 1500 classes and test on the same 1500 classes (a rough sketch of such a split script is below).
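For reference, a rough sketch of how such a per-identity split could be scripted; the folder names and the 50/50 split are assumptions based on the description above, not something that exists in the repo:

import os
import shutil
from collections import defaultdict

# assumed layout: all Market1501 crops in one flat folder, where the person ID is
# the first four characters of the file name (e.g. 0002_c1s1_000451_03.jpg)
src_dir = "Market1501/bounding_box_train"
dst_root = "Market1501"

images_by_id = defaultdict(list)
for name in sorted(os.listdir(src_dir)):
    if name.endswith(".jpg"):
        images_by_id[name[:4]].append(name)

for split in ("train", "test"):
    os.makedirs(os.path.join(dst_root, split), exist_ok=True)

# put half of each identity's images in train/ and the other half in test/,
# so both splits cover the same set of identities
for pid, names in images_by_id.items():
    half = len(names) // 2
    for name in names[:half]:
        shutil.copy(os.path.join(src_dir, name), os.path.join(dst_root, "train", name))
    for name in names[half:]:
        shutil.copy(os.path.join(src_dir, name), os.path.join(dst_root, "test", name))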

WurmD avatar Sep 15 '20 21:09 WurmD