enable 3d grouped fwd for gfx1100

Open jfactory07 opened this issue 9 months ago • 0 comments

Enable 3D grouped forward convolution for gfx1100. With the following test, there is a roughly 100x speedup for Conv3d forward on gfx1100:

import torch  
import torch.nn as nn  
import time  

# Run on the GPU when torch reports one as available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  

class CustomConv3DNet(nn.Module):
    """Minimal network consisting of one bias-free 3D convolution.

    The layer maps 3 input channels to 1280 output channels. Its kernel size
    and stride are both (2, 14, 14), so the convolution windows do not
    overlap.
    """

    def __init__(self):
        super().__init__()
        # Kernel size and stride share the same (depth, height, width) extent.
        window = (2, 14, 14)
        self.conv3d = nn.Conv3d(
            3,
            1280,
            kernel_size=window,
            stride=window,
            bias=False,
        )

    def forward(self, x):
        """Apply the single Conv3d layer to ``x`` and return the result."""
        return self.conv3d(x)
  
# Build the model on the selected device and cast its parameters to half precision
net = CustomConv3DNet().to(device).half()
# Show the module layout
print(net)

# Random half-precision input laid out as (batch size, channels, depth, height, width):
# 12920 samples, 3 channels, depth 2, height 14, width 14
input_tensor = torch.randn(12920, 3, 2, 14, 14).to(device).half()

# Warm-up: run a couple of untimed forward passes so that first-call latency
# does not end up in the measurement below
for _ in range(2):
    net.conv3d(input_tensor)
  
# Measure the execution time of the Conv3d layer
num_iterations = 10  # Number of timed forward passes to average over

output = None  # Holds the result of the last timed forward pass

if device.type == 'cuda':
    # Use CUDA events for precise timing
    torch.cuda.synchronize()  # Ensure all previous CUDA operations are complete

    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)

    start_event.record()
    for _ in range(num_iterations):
        output = net.conv3d(input_tensor)  # Only measure the Conv3d layer time
    end_event.record()

    torch.cuda.synchronize()  # Wait for all events to complete

    elapsed_time_ms = start_event.elapsed_time(end_event)  # In milliseconds
else:
    # CPU fallback: time the same loop with a host-side monotonic clock.
    # Previously this branch printed an average that was never computed
    # (NameError) and left `output` as None.
    start_time = time.perf_counter()
    for _ in range(num_iterations):
        output = net.conv3d(input_tensor)  # Only measure the Conv3d layer time
    elapsed_time_ms = (time.perf_counter() - start_time) * 1000.0  # Seconds -> milliseconds

avg_time_per_conv3d_ms = elapsed_time_ms / num_iterations
print(f"Average time per Conv3d execution: {avg_time_per_conv3d_ms:.6f} ms")

# Optional: Print the output shape of the Conv3d layer
print(f"Conv3d layer output shape: {output.shape}")

Here are the results with this PR on gfx1100:

Command:MIOPEN_DEBUG_3D_CONV_IMPLICIT_GEMM_HIP_FWD_XDLOPS=0 python test_conv.py Result: Using device: cuda CustomConv3DNet( (conv3d): Conv3d(3, 1280, kernel_size=(2, 14, 14), stride=(2, 14, 14), bias=False) ) Average time per Conv3d execution: 249.599438 ms Conv3d layer output shape: torch.Size([12920, 1280, 1, 1, 1])

Command:MIOPEN_DEBUG_3D_CONV_IMPLICIT_GEMM_HIP_FWD_XDLOPS=1 python test_conv.py Result: Using device: cuda CustomConv3DNet( (conv3d): Conv3d(3, 1280, kernel_size=(2, 14, 14), stride=(2, 14, 14), bias=False) ) Average time per Conv3d execution: 2.412976 ms Conv3d layer output shape: torch.Size([12920, 1280, 1, 1, 1])

Mar 28 '25 06:03 jfactory07