FP8-Emulation-Toolkit
TypeError: zeros_like(): argument 'input' (position 1) must be Tensor, not NoneType
Hello, I integrated your FP8 emulator into my LeNet (2 conv layers, 3 FC layers) training process.
When I set list_exempt_layers = ["conv1"], everything works well. However, when I set list_exempt_layers = ["fc1"], i.e. no conv layer is exempted, the code reports the following error: TypeError: zeros_like(): argument 'input' (position 1) must be Tensor, not NoneType. It seems I must include at least one conv layer in list_exempt_layers for it to run correctly.
My environment is Python 3.9, torch=2.1, cuda=12.3
Code is here:
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from mpemu import mpt_emu
try:
    from apex import amp
except ImportError:
    raise ImportError("Please install apex from https://www.github.com/nvidia/apex to run this example.")
# ----------------------- 1. Load and normalize CIFAR10 ---------------------- #
transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
batch_size = 4
trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size,
                                          shuffle=True, num_workers=2)
testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                       download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size,
                                         shuffle=False, num_workers=2)
classes = ('plane', 'car', 'bird', 'cat',
           'deer', 'dog', 'frog', 'horse', 'ship', 'truck')
# ----------------- 2. Define a Convolutional Neural Network ----------------- #
class LeNet(nn.Module):
    def __init__(self):
        super(LeNet, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16*5*5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)  # 2 is the same as (2, 2)
        x = x.view(-1, self.num_flat_features(x))
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

    def num_flat_features(self, x):
        size = x.size()[1:]  # all dimensions except the batch dimension
        num_features = 1
        for s in size:
            num_features *= s
        return num_features
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Assuming that we are on a CUDA machine, this should print a CUDA device:
print(device)
model = LeNet()
model.to(device)
# ------------------ 3. Define a Loss function and optimizer ----------------- #
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
# --------------------------- 4. Train the network --------------------------- #
# layers exempt from FP8 conversion
list_exempt_layers = ["fc1"]
# fused layers will be exempt from converting the output tensor to FP8; the following layer will read from the FP32 buffer.
list_layers_output_fused = None
# training method; options: 'direct', 'hybrid'
model, optimizer = amp.initialize(model, optimizer,
                                  opt_level="O2",
                                  keep_batchnorm_fp32=True)
model, emulator = mpt_emu.initialize(model, optimizer, training_algo="hybrid",
                                     list_exempt_layers=list_exempt_layers,
                                     list_layers_output_fused=list_layers_output_fused,
                                     device="cuda", verbose=True)
for epoch in range(2):  # loop over the dataset multiple times
    emulator.update_global_steps(epoch*len(trainloader))
    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data[0].to(device), data[1].to(device)  # use GPU
        # zero the parameter gradients
        optimizer.zero_grad()
        # forward + backward + optimize
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
        emulator.optimizer_step(optimizer)
        # print statistics
        running_loss += loss.item()
        if i % 2000 == 1999:  # print every 2000 mini-batches
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}')
            running_loss = 0.0
print('Finished Training')
PATH = './cifar_net.pth'
torch.save(model.state_dict(), PATH)
# ------------------- 5. Test the network on the test data ------------------- #
correct = 0
total = 0
with torch.no_grad():
    for data in testloader:
        images, labels = data[0].to(device), data[1].to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
print('Accuracy of the network on the 10000 test images: %d %%' %
      (100*correct/total))
I have just read your arXiv'19 paper, "Mixed Precision Training With 8-bit Floating Point".
In Section 4 (Experiments and Results), there is a sentence that says:
"For these convolution networks, the first convolution and the last fully-connected (FC) layers are maintained at a higher precision (16-bit) to maintain the model accuracy."
Is that the reason for the NoneType error?
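Incidentally, if that precision scheme is the intended usage, the exempt list for my LeNet would presumably contain both the first conv and the last FC layer. A minimal sketch of what that would look like, using the layer names from the model above and the same mpt_emu.initialize call as in my script (I have not verified that this is the recommended configuration):

# Keep the first conv and the last FC layer out of FP8 emulation,
# following the precision scheme described in the paper.
# "conv1" and "fc3" are the attribute names from the LeNet defined above.
list_exempt_layers = ["conv1", "fc3"]
list_layers_output_fused = None

model, emulator = mpt_emu.initialize(model, optimizer, training_algo="hybrid",
                                     list_exempt_layers=list_exempt_layers,
                                     list_layers_output_fused=list_layers_output_fused,
                                     device="cuda", verbose=True)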
I met the same issue, and I found that the reason is that the first layer's grad_input is always None:
import torch
import torch.nn as nn
import torch.optim as optim
# Define a simple DNN
class SimpleDNN(nn.Module):
    def __init__(self):
        super(SimpleDNN, self).__init__()
        self.fc1 = nn.Linear(10, 20)
        self.fc2 = nn.Linear(20, 10)
        self.fc3 = nn.Linear(10, 1)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Function to check whether the gradient input is None
def check_gradients(module, grad_input, grad_output):
    grad_input_none = [gi is None for gi in grad_input]
    print(f"{module.__class__.__name__} grad_input contains None: {grad_input_none}")
# Initialize the model, loss function, and optimizer
model = SimpleDNN()
criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)
# Register full backward hooks for each Linear layer
for name, module in model.named_modules():
    if isinstance(module, nn.Linear):
        module.register_full_backward_hook(check_gradients)
# Dummy input and target
inputs = torch.randn(5, 10) # Batch of 5, input size 10
targets = torch.randn(5, 1) # Batch of 5, target size 1
# Forward pass
outputs = model(inputs)
loss = criterion(outputs, targets)
# Backward pass
optimizer.zero_grad()
loss.backward()
optimizer.step()
And the output is:
Linear grad_input contains None: [False]
Linear grad_input contains None: [False]
Linear grad_input contains None: [True]
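So whichever module ends up being the first non-exempt layer receives grad_input = (None,) in its backward hook, because no gradient with respect to the network input is computed, and a hook that calls torch.zeros_like() on that entry fails with the TypeError above. As a rough illustration only (not the toolkit's actual hook code; fp8_process is a made-up placeholder for whatever the emulator does with a gradient tensor), the hook needs a None guard along these lines:

def fp8_process(t):
    # Placeholder: stand-in for the emulator's gradient processing.
    return t.clone()

def guarded_backward_hook(module, grad_input, grad_output):
    for gi in grad_input:
        if gi is None:
            # First layer: there is no gradient w.r.t. the network input,
            # so skip it instead of calling torch.zeros_like(None).
            continue
        fp8_process(gi)

In the meantime, keeping the first layer ("conv1" in the LeNet above) in list_exempt_layers avoids the crash, presumably because no emulation hook is attached to that layer, so the None entry never reaches zeros_like().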