tutorials icon indicating copy to clipboard operation
tutorials copied to clipboard

[BUG] - Hyperparameter Tuning with Ray Tune doesn't work for multi GPU

Open f2010126 opened this issue 1 year ago • 3 comments

Add Link

The tutorial linked below, which combines PyTorch `nn.DataParallel` (DP) with Ray Tune, doesn't work when `gpus_per_trial > 1`. Link: https://pytorch.org/tutorials/beginner/hyperparameter_tuning_tutorial.html

Describe the bug

Using the code from the tutorial without any changes

from functools import partial
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import random_split
import torchvision
import torchvision.transforms as transforms
from ray import tune
from ray.air import Checkpoint, session
from ray.tune.schedulers import ASHAScheduler

def load_data(data_dir="./data"):
    """Load the CIFAR-10 training and test datasets.

    Both datasets are requested with ``download=True`` and share a single
    transform: conversion to a tensor followed by per-channel normalization
    with mean 0.5 and standard deviation 0.5.

    Args:
        data_dir: Root directory handed to ``torchvision.datasets.CIFAR10``.

    Returns:
        A ``(trainset, testset)`` tuple.
    """
    transform = transforms.Compose(
        [transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]
    )

    trainset = torchvision.datasets.CIFAR10(
        root=data_dir, train=True, download=True, transform=transform
    )

    testset = torchvision.datasets.CIFAR10(
        root=data_dir, train=False, download=True, transform=transform
    )

    return trainset, testset

class Net(nn.Module):
    """Small convolutional classifier with two tunable hidden layer widths.

    The network applies two convolution + ReLU + max-pool stages, flattens the
    result, and feeds it through three fully connected layers that end in 10
    output logits.

    Args:
        l1: Output width of the first fully connected layer.
        l2: Output width of the second fully connected layer.
    """

    def __init__(self, l1=120, l2=84):
        super().__init__()
        # Convolutional feature extractor; the same pooling layer is reused
        # after each convolution.
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=6, kernel_size=5)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5)
        # Classifier head; fc1 expects 16 channels of 5x5 features per sample.
        flattened_features = 16 * 5 * 5
        self.fc1 = nn.Linear(flattened_features, l1)
        self.fc2 = nn.Linear(l1, l2)
        self.fc3 = nn.Linear(l2, 10)

    def forward(self, x):
        """Return the 10 class logits for a batch of images ``x``."""
        for conv in (self.conv1, self.conv2):
            x = self.pool(F.relu(conv(x)))
        # Collapse every dimension except the batch dimension.
        x = torch.flatten(x, start_dim=1)
        for hidden in (self.fc1, self.fc2):
            x = F.relu(hidden(x))
        return self.fc3(x)

def train_cifar(config, data_dir=None):
    """Train ``Net`` on CIFAR-10 for one Ray Tune trial and report metrics.

    After every epoch the function computes validation loss and accuracy on a
    held-out 20% split of the training set, then reports both values together
    with a checkpoint (epoch, model state, optimizer state) via
    ``session.report``.

    Args:
        config: Trial hyperparameters; reads the keys ``"l1"``, ``"l2"``,
            ``"lr"`` and ``"batch_size"``.
        data_dir: Directory forwarded to ``load_data``.
    """
    net = Net(config["l1"], config["l2"])

    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
        if torch.cuda.device_count() > 1:
            net = nn.DataParallel(net)
    # Parameters must live on the same device the batches are moved to below.
    net.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=config["lr"], momentum=0.9)

    checkpoint = session.get_checkpoint()

    if checkpoint:
        # Resume from the state saved by an earlier run of this trial.
        checkpoint_state = checkpoint.to_dict()
        start_epoch = checkpoint_state["epoch"]
        net.load_state_dict(checkpoint_state["net_state_dict"])
        optimizer.load_state_dict(checkpoint_state["optimizer_state_dict"])
    else:
        start_epoch = 0

    trainset, testset = load_data(data_dir)

    # 80/20 split of the training set into train and validation subsets.
    test_abs = int(len(trainset) * 0.8)
    train_subset, val_subset = random_split(
        trainset, [test_abs, len(trainset) - test_abs]
    )

    trainloader = torch.utils.data.DataLoader(
        train_subset, batch_size=int(config["batch_size"]), shuffle=True, num_workers=8
    )
    valloader = torch.utils.data.DataLoader(
        val_subset, batch_size=int(config["batch_size"]), shuffle=True, num_workers=8
    )
    print('Starting training')
    # NOTE: the upper bound of 1 means at most one epoch runs per call.
    for epoch in range(start_epoch, 1):  # loop over the dataset multiple times
        running_loss = 0.0
        epoch_steps = 0
        for i, data in enumerate(trainloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            epoch_steps += 1
            if i % 2000 == 1999:  # print every 2000 mini-batches
                print(
                    "[%d, %5d] loss: %.3f"
                    % (epoch + 1, i + 1, running_loss / epoch_steps)
                )
                running_loss = 0.0

        # Validation loss
        val_loss = 0.0
        val_steps = 0
        total = 0
        correct = 0
        for i, data in enumerate(valloader, 0):
            with torch.no_grad():
                inputs, labels = data
                inputs, labels = inputs.to(device), labels.to(device)

                outputs = net(inputs)
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

                loss = criterion(outputs, labels)
                val_loss += loss.cpu().numpy()
                val_steps += 1

        # Bundle everything needed to resume this trial or evaluate it later.
        checkpoint_data = {
            "epoch": epoch,
            "net_state_dict": net.state_dict(),
            "optimizer_state_dict": optimizer.state_dict(),
        }
        checkpoint = Checkpoint.from_dict(checkpoint_data)

        session.report(
            {"loss": val_loss / val_steps, "accuracy": correct / total},
            checkpoint=checkpoint,
        )
    print("Finished Training")

def test_accuracy(net, device="cpu"):
    """Return the fraction of CIFAR-10 test images that ``net`` classifies correctly.

    Args:
        net: Model to evaluate; called on batches of test images.
        device: Device the image and label batches are moved to before the
            forward pass.

    Returns:
        ``correct / total`` over the whole test set, as a float.
    """
    trainset, testset = load_data()

    testloader = torch.utils.data.DataLoader(
        testset, batch_size=4, shuffle=False, num_workers=2
    )

    correct = 0
    total = 0
    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for data in testloader:
            images, labels = data
            images, labels = images.to(device), labels.to(device)
            outputs = net(images)
            # The predicted class is the index of the largest logit.
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    return correct / total

def main(num_samples=10, max_num_epochs=10, gpus_per_trial=2):
    """Run the Ray Tune hyperparameter search and evaluate the best trial.

    Args:
        num_samples: Number of hyperparameter configurations to sample.
        max_num_epochs: Passed to ``ASHAScheduler`` as ``max_t``.
        gpus_per_trial: GPUs requested for each trial; when greater than 1 and
            CUDA is available, the best model is wrapped in ``nn.DataParallel``
            for the final evaluation.
    """
    data_dir = os.path.abspath("./data")
    # Fetch the dataset once up front, before any trial calls load_data itself.
    load_data(data_dir)
    config = {
        "l1": tune.choice([2**i for i in range(9)]),
        "l2": tune.choice([2**i for i in range(9)]),
        "lr": tune.loguniform(1e-4, 1e-1),
        "batch_size": tune.choice([2, 4, 8, 16]),
    }
    scheduler = ASHAScheduler(
        metric="loss",
        mode="min",
        max_t=max_num_epochs,
        grace_period=1,
        reduction_factor=2,
    )
    result = tune.run(
        partial(train_cifar, data_dir=data_dir),
        resources_per_trial={"cpu": 2, "gpu": gpus_per_trial},
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
    )

    best_trial = result.get_best_trial("loss", "min", "last")
    print(f"Best trial config: {best_trial.config}")
    print(f"Best trial final validation loss: {best_trial.last_result['loss']}")
    print(f"Best trial final validation accuracy: {best_trial.last_result['accuracy']}")

    best_trained_model = Net(best_trial.config["l1"], best_trial.config["l2"])
    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
        if gpus_per_trial > 1:
            best_trained_model = nn.DataParallel(best_trained_model)
    best_trained_model.to(device)

    best_checkpoint = best_trial.checkpoint.to_air_checkpoint()
    best_checkpoint_data = best_checkpoint.to_dict()

    # Without loading the checkpoint the evaluation would score random weights.
    best_trained_model.load_state_dict(best_checkpoint_data["net_state_dict"])

    test_acc = test_accuracy(best_trained_model, device)
    print("Best trial test set accuracy: {}".format(test_acc))

if __name__ == "__main__":
    # Script entry point. Adjust "gpus_per_trial" to change the number of GPUs
    # requested for each trial.
    run_kwargs = {"num_samples": 3, "max_num_epochs": 10, "gpus_per_trial": 2}
    main(**run_kwargs)

Expected: The run works using 2 GPUs per trial.

Observed: The code starts and the data loaders are created, but when the distributed process group is created, everything hangs.

Describe your environment

Env: OS: Ubuntu 22.04 on a SLURM cluster; Python: 3.10; torch==2.0.1; ray==2.6.3

NVCC -V output:

Cuda compilation tools, release 11.7, V11.7.99
Build cuda_11.7.r11.7/compiler.31442593_0

Nvidia-smi output

NVIDIA-SMI 530.30.02              
Driver Version: 530.30.02    
CUDA Version: 12.1

cc @sekyondaMeta @svekars @carljparker @NicolasHug @kit1980 @subramen

f2010126 avatar Aug 25 '23 08:08 f2010126

Codecov Report

Merging #114 (32358df) into main (7fd1ece) will decrease coverage by 6.10%. The diff coverage is 26.78%.

:exclamation: Current head 32358df differs from pull request most recent head f282366. Consider uploading reports for the commit f282366 to get more accurate results

@@            Coverage Diff             @@
##             main     #114      +/-   ##
- Coverage   91.82%   85.71%   -6.11%     
  Files          31       32       +1     
  Lines        2557     2612      +55     
- Hits         2348     2239     -109     
- Misses        209      373     +164     
Impacted Files Coverage Δ
torchsnapshot/storage_plugins/gcs.py 0.00% <ø> (-78.90%) :arrow_down:
torchsnapshot/storage_plugin.py 44.73% <25.00%> (-23.84%) :arrow_down:
torchsnapshot/storage_plugins/fsspec.py 26.92% <26.92%> (ø)
torchsnapshot/storage_plugins/s3.py 26.82% <0.00%> (-63.42%) :arrow_down:
torchsnapshot/memoryview_stream.py 63.33% <0.00%> (-6.67%) :arrow_down:

:mega: We’re building smart automated test selection to slash your CI/CD build times. Learn more

codecov[bot] avatar Oct 21 '22 16:10 codecov[bot]

Hi @shicheng0829!

Thank you for your pull request and welcome to our community.

Action Required

In order to merge any pull request (code, docs, etc.), we require contributors to sign our Contributor License Agreement, and we don't seem to have one on file for you.


In order for us to review and merge your suggested changes, please sign at https://code.facebook.com/cla. If you are contributing on behalf of someone else (eg your employer), the individual CLA may not be sufficient and your employer may need to sign the corporate CLA.

Once the CLA is signed, our tooling will perform checks and validations. Afterwards, the pull request will be tagged with CLA signed. The tagging process may take up to 1 hour after signing. Please give it that time before contacting us about it.

If you have received this in error or have any questions, please contact us at [email protected]. Thanks!

facebook-github-bot avatar Oct 24 '22 06:10 facebook-github-bot

Thank you for signing our Contributor License Agreement. We can now accept your code for this (and any) Meta Open Source project. Thanks!

facebook-github-bot avatar Oct 24 '22 08:10 facebook-github-bot

Fixed some bugs and added more unit tests. Please review the PR again when you have some free time. Thank you!

shicheng0829 avatar Oct 26 '22 12:10 shicheng0829