Does DeepSpeed automatically partition the model across multiple GPUs when stage=3 is specified?
I wrote a simple test program that cannot run on a single GPU because of OOM, but runs well with DeepSpeed on 4 GPUs. The network is built from 100 large FC layers. I chose DeepSpeed ZeRO stage 3, which I expect to automatically distribute the model's parameters (as well as the optimizer states and gradients) across multiple GPUs somehow.
My question is: does DeepSpeed automatically partition the model across multiple GPUs when stage=3 is specified, or do I need to assign each layer to a GPU manually? (In the code below I print the local_rank and the GPU index used for every layer, and they appear to be the same.) If the partitioning is automatic, how can I tell which layer is on which GPU?
(My real task is to fine-tune BLOOM, but things did not go as I expected. I can fine-tune BLOOM-3B on a 40 GB A100, or BLOOM-7B on an 80 GB A100. But with DeepSpeed on four 40 GB A100s, BLOOM-7B can only be fine-tuned with offloading, at a very low speed. I tried all stages.)
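To make the question concrete, here is a small helper I intend to call right after deepspeed.initialize to see what each rank actually holds. It is only a sketch based on my understanding that ZeRO-3 attaches per-parameter attributes such as ds_shape (the full shape) and ds_tensor (the local shard); please correct me if those names or their meaning are wrong.

def report_partitioning(engine):
    # Print, for every parameter, the full (logical) shape versus what this
    # rank physically stores. The getattr fallbacks keep the helper working
    # even for parameters that were not partitioned.
    for name, param in engine.module.named_parameters():
        full_shape = getattr(param, 'ds_shape', param.shape)
        local_shard = getattr(param, 'ds_tensor', param.data)
        print(f'rank {engine.local_rank} | {name}: full shape {tuple(full_shape)}, '
              f'local shard of {local_shard.numel()} elements on {local_shard.device}')

The full test program is below.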
import torch
import torch.nn as nn
import torch.nn.functional as F
from deepspeed.runtime.zero.stage3 import estimate_zero3_model_states_mem_needs_all_live
import deepspeed
import sys
# Grab the local rank passed on the command line by the deepspeed launcher
rank = ''
for a in sys.argv:
    if '--local_rank=' in a:
        rank = a

# Define the number of FC layers
n = 100  # You can change this value
dd = 6000
bs = 64
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc_layers = nn.ModuleList()  # Use a ModuleList to store the FC layers
        global dd
        for i in range(n):
            if i == 0:  # The first layer has input dimension 1000
                self.fc_layers.append(nn.Linear(1000, dd, dtype=torch.float16))
            elif i == n - 1:  # The last layer has output dimension 10 (assuming 10 classes)
                self.fc_layers.append(nn.Linear(dd, 10, dtype=torch.float16))
            else:  # The intermediate layers alternate their width between dd + 1000 and dd - 1000
                if i % 2 == 0:
                    dd1 = dd + 1000
                else:
                    dd1 = dd - 1000
                self.fc_layers.append(nn.Linear(dd, dd1, dtype=torch.float16))
                dd = dd1
    def forward(self, x):
        print('fwd', x.shape)
        for i in range(n):
            #print('ffffwwwwdddd', i, x.shape, x.dtype)
            x = self.fc_layers[i](x)  # Apply the linear transformation of each layer
            if i < n - 1:  # Apply ReLU activation to all layers except the last one
                x = F.relu(x)
            print('fwd', rank, i, ': gpu', self.fc_layers[i].weight.device.index, self.fc_layers[i].weight.shape)
        return x
print ('222')
model = Net()
print ('333')
estimate_zero3_model_states_mem_needs_all_live(model, num_gpus_per_node=4, num_nodes=1)
# Wrap the model with nn.DataParallel to use multiple GPUs
#model = nn.DataParallel(model)
print ('444')
# Define the loss function and the optimizer
criterion = nn.CrossEntropyLoss() # Use cross entropy loss for multi-class classification
print ('555')
print ('666')
# Train the model on some dummy data (you can use your own data loader here)
print ('begin')
ds_config = {
    "train_micro_batch_size_per_gpu": bs,
    "optimizer": {
        "type": "Adam",
        "params": {
            "lr": 1e-4
        }
    },
    "fp16": {
        "enabled": True
    },
    "zero_optimization": {
        "stage": 3
    },
    "flops_profiler": {
        "enabled": True,
        "profile_step": 1,
        "module_depth": -1,
        "top_modules": 10,
        "detailed": True,
        "output_file": "a.txt"
    }
}
model, _, _, _ = deepspeed.initialize(model=model,
                                      model_parameters=model.parameters(),
                                      config=ds_config)
for epoch in range(10):  # Train for 10 epochs
    for i in range(100):  # Iterate over 100 batches of data
        # Generate a random input batch (batch size bs, input dimension 1000)
        inputs = torch.randn(bs, 1000, dtype=torch.float16).cuda()
        # Gradient zeroing is handled internally by the DeepSpeed engine
        # Forward pass
        outputs = model(inputs)
        outputs = outputs.float()
        # Compute loss against random class indices in [0, 10)
        targets = torch.randint(0, 10, (bs,)).cuda()
        loss = criterion(outputs, targets)
        # Backward pass and optimizer step through the DeepSpeed engine
        model.backward(loss)
        model.step()
        print('loop')
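Related to the same question: my understanding is that deepspeed.zero.GatheredParameters temporarily all-gathers a partitioned parameter so its full tensor is visible on every rank, which would let me verify that a layer really is sharded (outside the context the weight should shrink back to a placeholder). A minimal sketch of what I mean, assuming that context manager behaves as I describe:

# Sketch only: compare the first FC layer's weight inside and outside the
# GatheredParameters context to confirm it is partitioned under ZeRO-3.
layer0_weight = model.module.fc_layers[0].weight
print('outside gather:', rank, layer0_weight.shape, layer0_weight.numel())
with deepspeed.zero.GatheredParameters(layer0_weight):
    # Inside the context the full fp16 weight should be materialized on this rank.
    print('inside gather:', rank, layer0_weight.shape, layer0_weight.numel())

Is this the intended way to check where (and whether) each layer's parameters are partitioned?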