Higher Kernel Launch CPU Overhead

Open rajagond opened this issue 1 year ago • 1 comment

I have observed a higher CPU-side kernel launch overhead when using CuPy's matrix multiplication (`cp.matmul`) compared to PyTorch's (`torch.mm`). While the GPU computation time is nearly identical for both, the time the CPU spends launching each kernel is noticeably larger in CuPy.

To Reproduce

import cupy as cp
import time

def matmul_cpu_overhead_cupy():
    """Measure the per-call host-side launch cost of ``cp.matmul`` on GPU 1.

    Allocates float16 operands of shape ``(BL, inter_size)`` and
    ``(inter_size, hidden_size)``, runs ``warmup_iters`` untimed matmuls, then
    times ``active_iters`` matmuls two ways:

    * host time, read with ``time.perf_counter()`` around the loop and stopped
      *before* the GPU is synchronized, so it covers only the time spent
      issuing the calls;
    * device time, read from a pair of CUDA events recorded around the loop.

    Both per-iteration averages are printed in milliseconds. Nothing is
    returned.
    """
    BL = 32 * 2048
    hidden_size = 12288
    inter_size = (4 * hidden_size) // 2
    warmup_iters = 50
    active_iters = 250
    cp.cuda.Device(1).use()
    # Named ``activations`` rather than ``input`` so the builtin is not shadowed.
    activations = cp.random.rand(BL, inter_size).astype(cp.float16)
    weights = cp.random.rand(inter_size, hidden_size).astype(cp.float16)
    output = cp.zeros((BL, hidden_size), dtype=cp.float16)
    start_event = cp.cuda.Event(disable_timing=False)
    end_event = cp.cuda.Event(disable_timing=False)
    # Make sure operand initialization has finished before any matmul is issued.
    cp.cuda.Stream.null.synchronize()

    # Warm-up
    for _ in range(warmup_iters):
        cp.matmul(activations, weights, out=output)
    # Drain the warm-up work so the timed loop starts from an idle stream and
    # pending warm-up kernels cannot influence the measured region.
    cp.cuda.Stream.null.synchronize()

    # Measure the launch overhead.
    # perf_counter() is monotonic and high-resolution; time.time() is a
    # wall-clock reading that can jump and may be too coarse for ~10 us spans.
    start_time = time.perf_counter()
    start_event.record()
    for _ in range(active_iters):
        cp.matmul(activations, weights, out=output)
    end_event.record()
    # Stop the host timer before waiting on the GPU: only issue time is wanted.
    end_time = time.perf_counter()

    end_event.synchronize()
    gpu_time = cp.cuda.get_elapsed_time(start_event, end_event) / active_iters  # in milliseconds
    cpu_time = (end_time - start_time) / active_iters  # in seconds
    # NOTE: host issue time and device time overlap, so this sum is an upper
    # bound on per-iteration latency rather than an exact wall-clock figure.
    total_time = cpu_time * 1000.0 + gpu_time
    print(f"Cupy: Total time including GPU computation: {total_time} milliseconds")

    # CPU launch overhead time
    cpu_launch_overhead = cpu_time * 1000.0
    print(f"Cupy: CPU launch overhead: {cpu_launch_overhead} milliseconds")

def matmul_cpu_overhead_torch():
    """Measure the per-call host-side launch cost of ``torch.mm`` on ``cuda:1``.

    Mirrors the CuPy benchmark: allocates operands of shape
    ``(BL, inter_size)`` and ``(inter_size, hidden_size)``, runs
    ``warmup_iters`` untimed matmuls, then times ``active_iters`` matmuls with
    both a host timer (stopped before GPU synchronization) and a pair of CUDA
    events. Both per-iteration averages are printed in milliseconds. Nothing is
    returned.
    """
    import torch
    import time

    device = torch.device("cuda:1")
    torch.cuda.set_device(device)
    BL = 32 * 2048
    hidden_size = 12288
    inter_size = (4 * hidden_size) // 2
    warmup_iters = 50
    active_iters = 250
    # NOTE(review): this uses bfloat16 while the CuPy benchmark uses float16;
    # confirm the dtype difference is intended when comparing the two results.
    # Named ``activations`` rather than ``input`` so the builtin is not shadowed.
    activations = torch.randn(BL, inter_size, device=device, dtype=torch.bfloat16)
    weights = torch.randn(inter_size, hidden_size, device=device, dtype=torch.bfloat16)
    output = torch.zeros((BL, hidden_size), device=device, dtype=torch.bfloat16)
    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)

    # Warm-up
    for _ in range(warmup_iters):
        torch.mm(activations, weights, out=output)
    # Drain allocation and warm-up work so the timed loop starts from an idle
    # device and pending warm-up kernels cannot influence the measured region.
    torch.cuda.synchronize(device)

    # Measure the launch overhead.
    # perf_counter() is monotonic and high-resolution; time.time() is a
    # wall-clock reading that can jump and may be too coarse for ~10 us spans.
    start_time = time.perf_counter()
    start_event.record()
    for _ in range(active_iters):
        torch.mm(activations, weights, out=output)
    end_event.record()
    # Stop the host timer before waiting on the GPU: only issue time is wanted.
    end_time = time.perf_counter()
    end_event.synchronize()

    gpu_time = start_event.elapsed_time(end_event) / active_iters  # in milliseconds
    cpu_time = (end_time - start_time) / active_iters  # in seconds
    # NOTE: host issue time and device time overlap, so this sum is an upper
    # bound on per-iteration latency rather than an exact wall-clock figure.
    total_time = cpu_time * 1000.0 + gpu_time
    print(f"Pytorch: Total time including GPU computation: {total_time} milliseconds")

    # CPU launch overhead time
    cpu_launch_overhead = cpu_time * 1000.0
    print(f"Pytorch: CPU launch overhead: {cpu_launch_overhead} milliseconds")


def main():
    """Run the CuPy benchmark first, then the PyTorch benchmark."""
    for run_benchmark in (matmul_cpu_overhead_cupy, matmul_cpu_overhead_torch):
        run_benchmark()

# Run the benchmarks only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()

Results

Cupy: Total time including GPU computation: 176.1893156967163 milliseconds
Cupy: CPU launch overhead: 0.0694875717163086 milliseconds
Pytorch: Total time including GPU computation: 176.11363986968993 milliseconds
Pytorch: CPU launch overhead: 0.018952369689941406 milliseconds

Environment

OS                           : Linux-6.2.0-1014-azure-x86_64-with-glibc2.29
Python Version               : 3.8.10
CuPy Version                 : 12.3.0
CuPy Platform                : NVIDIA CUDA
NumPy Version                : 1.24.4
SciPy Version                : 1.10.1
Cython Build Version         : 0.29.36
Cython Runtime Version       : None
CUDA Root                    : /usr/local/cuda
nvcc PATH                    : /usr/local/cuda/bin/nvcc
CUDA Build Version           : 12020
CUDA Driver Version          : 12020
CUDA Runtime Version         : 12010
cuBLAS Version               : (available)
cuFFT Version                : 11002
cuRAND Version               : 10302
cuSOLVER Version             : (11, 4, 5)
cuSPARSE Version             : (available)
NVRTC Version                : (12, 1)
Thrust Version               : 200101
CUB Build Version            : 200101
Jitify Build Version         : <unknown>
cuDNN Build Version          : 8801
cuDNN Version                : 8902
NCCL Build Version           : 21602
NCCL Runtime Version         : 21903
cuTENSOR Version             : None
cuSPARSELt Build Version     : None
Device 0 Name                : NVIDIA A100 80GB PCIe
Device 0 Compute Capability  : 80
Device 0 PCI Bus ID          : 0001:00:00.0
Device 1 Name                : NVIDIA A100 80GB PCIe
Device 1 Compute Capability  : 80
Device 1 PCI Bus ID          : 0002:00:00.0
Device 2 Name                : NVIDIA A100 80GB PCIe
Device 2 Compute Capability  : 80
Device 2 PCI Bus ID          : 0003:00:00.0
Device 3 Name                : NVIDIA A100 80GB PCIe
Device 3 Compute Capability  : 80
Device 3 PCI Bus ID          : 0004:00:00.0

Feb 16 '24 07:02 rajagond