enable 3d grouped fwd for gfx1100
This PR enables the 3D grouped forward convolution solver on gfx1100. With the test below, Conv3d forward shows a roughly 100x uplift on gfx1100:
```python
import torch
import torch.nn as nn
import time

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

class CustomConv3DNet(nn.Module):
    def __init__(self):
        super(CustomConv3DNet, self).__init__()
        self.conv3d = nn.Conv3d(
            in_channels=3,            # Number of input channels is 3 (e.g., RGB)
            out_channels=1280,        # Number of output channels is 1280
            kernel_size=(2, 14, 14),  # Convolution kernel size is (2, 14, 14)
            stride=(2, 14, 14),       # Stride is (2, 14, 14)
            bias=False                # Do not use a bias term
        )

    def forward(self, x):
        return self.conv3d(x)

# Create a network instance and move it to the device
net = CustomConv3DNet().to(device)
net = net.half()

# Print the network structure
print(net)

# Generate a random input tensor
# Shape is (batch size, channels, depth, height, width)
input_tensor = torch.randn(12920, 3, 2, 14, 14).to(device)  # 12920 samples, 3 channels, depth 2, height 14, width 14
input_tensor = input_tensor.half()

# To measure GPU time accurately, perform a few forward passes first to warm up the GPU.
# This avoids first-run latency (e.g., kernel compilation) skewing the measurement.
for _ in range(2):
    _ = net.conv3d(input_tensor)

# Measure the execution time of the Conv3d layer
num_iterations = 10  # Number of iterations to measure
output = None        # Initialize output variable

if device.type == 'cuda':
    # Use CUDA events for precise timing
    torch.cuda.synchronize()  # Ensure all previous CUDA operations are complete
    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)
    start_event.record()
    for _ in range(num_iterations):
        output = net.conv3d(input_tensor)  # Only measure the Conv3d layer time
    end_event.record()
    torch.cuda.synchronize()  # Wait for all events to complete
    elapsed_time_ms = start_event.elapsed_time(end_event)  # In milliseconds
else:
    # Fall back to wall-clock timing on CPU
    start = time.perf_counter()
    for _ in range(num_iterations):
        output = net.conv3d(input_tensor)
    elapsed_time_ms = (time.perf_counter() - start) * 1000.0

avg_time_per_conv3d_ms = elapsed_time_ms / num_iterations
print(f"Average time per Conv3d execution: {avg_time_per_conv3d_ms:.6f} ms")

# Optional: print the output shape of the Conv3d layer
print(f"Conv3d layer output shape: {output.shape}")
```
Here are the results with this PR on gfx1100:
Command: `MIOPEN_DEBUG_3D_CONV_IMPLICIT_GEMM_HIP_FWD_XDLOPS=0 python test_conv.py`

Result:
```
Using device: cuda
CustomConv3DNet(
  (conv3d): Conv3d(3, 1280, kernel_size=(2, 14, 14), stride=(2, 14, 14), bias=False)
)
Average time per Conv3d execution: 249.599438 ms
Conv3d layer output shape: torch.Size([12920, 1280, 1, 1, 1])
```
Command: `MIOPEN_DEBUG_3D_CONV_IMPLICIT_GEMM_HIP_FWD_XDLOPS=1 python test_conv.py`

Result:
```
Using device: cuda
CustomConv3DNet(
  (conv3d): Conv3d(3, 1280, kernel_size=(2, 14, 14), stride=(2, 14, 14), bias=False)
)
Average time per Conv3d execution: 2.412976 ms
Conv3d layer output shape: torch.Size([12920, 1280, 1, 1, 1])
```
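The roughly 100x figure above comes directly from the two measurements; a quick sanity check of the arithmetic:

```python
# Speedup of the enabled solver (XDLOPS=1) over the fallback path (XDLOPS=0),
# using the measured averages reported above
baseline_ms = 249.599438  # MIOPEN_DEBUG_3D_CONV_IMPLICIT_GEMM_HIP_FWD_XDLOPS=0
enabled_ms = 2.412976     # MIOPEN_DEBUG_3D_CONV_IMPLICIT_GEMM_HIP_FWD_XDLOPS=1
print(f"Speedup: {baseline_ms / enabled_ms:.1f}x")  # Speedup: 103.4x
```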