DataLoaders_DALI
Compared the two ways (DALI and the PyTorch DataLoader): the training time is almost the same???
@tanglang96 thanks for your summary. I compared the two ways (DALI and the PyTorch DataLoader), and the training time is almost the same??? The code is as follows:
1) PyTorch DataLoader version:
import torch
import torchvision.transforms as transforms
from torchvision.datasets import CIFAR10

CROP_SIZE = 32
CIFAR_MEAN = [0.49139968, 0.48215827, 0.44653124]
CIFAR_STD = [0.24703233, 0.24348505, 0.26158768]
CIFAR_IMAGES_NUM_TRAIN = 50000
CIFAR_IMAGES_NUM_TEST = 10000
IMG_DIR = './data'
TRAIN_BS = 128
TEST_BS = 100
NUM_WORKERS = 2

transform_train = transforms.Compose([
    transforms.RandomCrop(CROP_SIZE, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(CIFAR_MEAN, CIFAR_STD),
])
train_dst = CIFAR10(root=IMG_DIR, train=True, download=False, transform=transform_train)
trainloader = torch.utils.data.DataLoader(train_dst, batch_size=TRAIN_BS, shuffle=True,
                                          pin_memory=True, num_workers=NUM_WORKERS)

for epoch in range(start_epoch, start_epoch + 200):
    print('\nEpoch: %d' % epoch)
    net.train()
    train_loss = 0
    correct = 0
    total = 0
    for batch_idx, (inputs, targets) in enumerate(trainloader):
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        _, predicted = outputs.max(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()
        progress_bar(batch_idx, len(trainloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)'
                     % (train_loss / (batch_idx + 1), 100. * correct / total, correct, total))
and here is the corresponding training output (first screenshot).
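Since the question is whether DALI speeds anything up, it may help to time the data pipeline in isolation. This is a minimal sketch I did not run as part of the comparison above; it assumes the trainloader defined above and a CUDA device, and it drains the loader with no forward/backward pass, so the measured time is roughly decode/augment/collate plus the host-to-device copy:

import time
import torch

def loader_epoch_time(loader, device='cuda'):
    # One pass over the loader with no model involved:
    # measures data loading plus the copy to the GPU only.
    torch.cuda.synchronize()
    start = time.time()
    for inputs, targets in loader:
        inputs = inputs.to(device, non_blocking=True)
        targets = targets.to(device, non_blocking=True)
    torch.cuda.synchronize()
    return time.time() - start

print('PyTorch DataLoader, data only: %.2f s / epoch' % loader_epoch_time(trainloader))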
2) DALI version:
import argparse
import torch
# DALIDataloader / HybridTrainPipe_CIFAR are the loader wrapper and CIFAR pipeline
# from this repo (base.py / cifar10.py in my checkout; adjust to your file layout)
from base import DALIDataloader
from cifar10 import HybridTrainPipe_CIFAR

parser = argparse.ArgumentParser(description='Train CIFAR-10 with DALI data loading on ResNet-18')
parser.add_argument('--lr', default=0.1, type=float, help='learning rate')
parser.add_argument('--TRAIN_BS', default=128, type=int, help='training batch size')
parser.add_argument('--TEST_BS', default=100, type=int, help='test batch size')
parser.add_argument('--NUM_WORKERS', default=2, type=int)
parser.add_argument('--IMG_DIR', default='./data', type=str, help='data path')
parser.add_argument('--CROP_SIZE', default=32, type=int)
parser.add_argument('--CIFAR_IMAGES_NUM_TRAIN', default=50000, type=int)
parser.add_argument('--CIFAR_IMAGES_NUM_TEST', default=10000, type=int)
parser.add_argument('--resume', '-r', action='store_true', help='resume from checkpoint')
args = parser.parse_args()

pip_train = HybridTrainPipe_CIFAR(batch_size=args.TRAIN_BS,
                                  num_threads=args.NUM_WORKERS,
                                  device_id=0,
                                  data_dir=args.IMG_DIR,
                                  crop=args.CROP_SIZE,
                                  world_size=1,
                                  local_rank=0,
                                  cutout=0)
trainloader = DALIDataloader(pipeline=pip_train,
                             size=args.CIFAR_IMAGES_NUM_TRAIN,
                             batch_size=args.TRAIN_BS,
                             onehot_label=True)

for epoch in range(start_epoch, start_epoch + 200):
    print('\nEpoch: %d' % epoch)
    net.train()
    train_loss = 0
    correct = 0
    total = 0
    for batch_idx, (inputs, targets) in enumerate(trainloader):
        inputs = inputs.cuda(non_blocking=True)
        targets = targets.cuda(non_blocking=True)
        # inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        _, predicted = outputs.max(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()
        progress_bar(batch_idx, len(trainloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)'
                     % (train_loss / (batch_idx + 1), 100. * correct / total, correct, total))
    trainloader.reset()
and here is the corresponding training output (second screenshot).
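The same data-only measurement can be done for the DALI pipeline. Sketch only, under the assumption that the DALIDataloader above already yields GPU tensors (so no copy is needed) and that it wants a reset() after a full pass, exactly as in the training loop:

import time
import torch

torch.cuda.synchronize()
start = time.time()
for inputs, targets in trainloader:   # DALI loader: batches are produced on the GPU
    pass                              # no model, just drain the pipeline
torch.cuda.synchronize()
print('DALI loader, data only: %.2f s / epoch' % (time.time() - start))
trainloader.reset()                   # allow the next pass to start cleanly

If both data-only numbers turn out to be small compared with the ~18 s epoch, the epoch time is dominated by the ResNet-18 forward/backward on 32x32 images rather than by data loading, which would explain why swapping loaders barely changes the total time.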
From the two screenshots we can see that both versions take almost 18 s per epoch, and the DALI run also keeps printing
WARNING:root:DALI iterator does not support resetting while epoch is not finished. Ignoring...
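About that warning: the underlying DALI iterator prints it when reset() is called before its internal sample counter reaches the size it was given (my guess is that the wrapper stops after size // batch_size full batches, i.e. 390 * 128 = 49920 < 50000, so the epoch is never "finished" from DALI's point of view). One way to sidestep it, bypassing the repo's DALIDataloader wrapper entirely and using DALI's own PyTorch iterator, is sketched below; the exact constructor arguments may differ across DALI versions:

from nvidia.dali.plugin.pytorch import DALIGenericIterator

pip_train.build()
train_iter = DALIGenericIterator([pip_train], ['data', 'label'],
                                 size=args.CIFAR_IMAGES_NUM_TRAIN,
                                 auto_reset=True)   # iterator resets itself after StopIteration

for epoch in range(start_epoch, start_epoch + 200):
    for data in train_iter:                       # stops when the declared epoch is done
        inputs = data[0]['data']                  # typically already a GPU tensor for a hybrid pipeline
        targets = data[0]['label'].squeeze().long().cuda()
        # ... usual forward / backward / optimizer step ...
    # no manual reset() here, so the "resetting while epoch is not finished" warning should not appear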
Wow! Is that the case? Don't we get any speedup with this?