cupy
cupy copied to clipboard
Higher Kernel Launch CPU Overhead
I have observed higher CPU launch overhead when using CuPy's matrix multiplication than when using PyTorch's. While the GPU computation time is nearly identical for both, the CPU overhead of launching the kernel appears to be larger in CuPy.
To Reproduce
import cupy as cp
import time
def matmul_cpu_overhead_cupy():
    """Measure the per-call CPU launch overhead of ``cp.matmul`` in float16.

    Runs on CUDA device 1. After a warm-up phase, a batch of matmuls is
    enqueued back to back without synchronizing in between, so the host-side
    wall-clock time per call approximates the cost of launching the kernel,
    while a pair of CUDA events measures the GPU time of the same batch.

    Prints the combined per-iteration time and the per-iteration CPU launch
    overhead, both in milliseconds. Returns ``None``.
    """
    num_rows = 32 * 2048
    hidden_size = 12288
    inter_size = (4 * hidden_size) // 2
    warmup_iters = 50
    active_iters = 250

    cp.cuda.Device(1).use()

    # ``lhs`` rather than ``input`` so the builtin is not shadowed.
    lhs = cp.random.rand(num_rows, inter_size).astype(cp.float16)
    weights = cp.random.rand(inter_size, hidden_size).astype(cp.float16)
    output = cp.zeros((num_rows, hidden_size), dtype=cp.float16)

    start_event = cp.cuda.Event(disable_timing=False)
    end_event = cp.cuda.Event(disable_timing=False)
    cp.cuda.Stream.null.synchronize()

    # Warm-up
    for _ in range(warmup_iters):
        cp.matmul(lhs, weights, out=output)
    # Drain the warm-up work so the timed launches start from an idle stream
    # and are not delayed by kernels that are still queued.
    cp.cuda.Stream.null.synchronize()

    # Measure the launch overhead. perf_counter() is monotonic and has the
    # highest available resolution, which matters for sub-millisecond spans.
    start_time = time.perf_counter()
    start_event.record()
    for _ in range(active_iters):
        cp.matmul(lhs, weights, out=output)
    end_event.record()
    end_time = time.perf_counter()

    # Only block on the GPU after the host-side interval has been captured.
    end_event.synchronize()

    # get_elapsed_time() reports milliseconds; the host interval is seconds.
    gpu_time = cp.cuda.get_elapsed_time(start_event, end_event) / active_iters
    cpu_time = (end_time - start_time) / active_iters

    total_time = cpu_time * 1000.0 + gpu_time
    print(f"Cupy: Total time including GPU computation: {total_time} milliseconds")

    # CPU launch overhead time
    cpu_launch_overhead = cpu_time * 1000.0
    print(f"Cupy: CPU launch overhead: {cpu_launch_overhead} milliseconds")
def matmul_cpu_overhead_torch():
    """Measure the per-call CPU launch overhead of ``torch.mm`` in bfloat16.

    Runs on ``cuda:1``. After a warm-up phase, a batch of matmuls is enqueued
    back to back without synchronizing in between, so the host-side
    wall-clock time per call approximates the cost of launching the kernel,
    while a pair of CUDA events measures the GPU time of the same batch.

    Prints the combined per-iteration time and the per-iteration CPU launch
    overhead, both in milliseconds. Returns ``None``.
    """
    # Imported lazily so the rest of the script does not require PyTorch.
    import torch

    device = torch.device("cuda:1")
    torch.cuda.set_device(device)

    num_rows = 32 * 2048
    hidden_size = 12288
    inter_size = (4 * hidden_size) // 2
    warmup_iters = 50
    active_iters = 250

    # NOTE(review): the CuPy benchmark in this script uses float16 while this
    # one uses bfloat16, so the two runs are not strictly like-for-like.
    # ``lhs`` rather than ``input`` so the builtin is not shadowed.
    lhs = torch.randn(num_rows, inter_size, device=device, dtype=torch.bfloat16)
    weights = torch.randn(inter_size, hidden_size, device=device, dtype=torch.bfloat16)
    output = torch.zeros((num_rows, hidden_size), device=device, dtype=torch.bfloat16)

    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)

    # Warm-up
    for _ in range(warmup_iters):
        torch.mm(lhs, weights, out=output)
    # Drain the warm-up work so the timed launches start from an idle device
    # and are not delayed by kernels that are still queued.
    torch.cuda.synchronize(device)

    # Measure the launch overhead. perf_counter() is monotonic and has the
    # highest available resolution, which matters for sub-millisecond spans.
    start_time = time.perf_counter()
    start_event.record()
    for _ in range(active_iters):
        torch.mm(lhs, weights, out=output)
    end_event.record()
    end_time = time.perf_counter()

    # Only block on the GPU after the host-side interval has been captured.
    end_event.synchronize()

    # elapsed_time() reports milliseconds; the host interval is in seconds.
    gpu_time = start_event.elapsed_time(end_event) / active_iters
    cpu_time = (end_time - start_time) / active_iters

    total_time = cpu_time * 1000.0 + gpu_time
    print(f"Pytorch: Total time including GPU computation: {total_time} milliseconds")

    # CPU launch overhead time
    cpu_launch_overhead = cpu_time * 1000.0
    print(f"Pytorch: CPU launch overhead: {cpu_launch_overhead} milliseconds")
def main():
    """Run the CuPy benchmark, then the PyTorch benchmark."""
    benchmarks = (matmul_cpu_overhead_cupy, matmul_cpu_overhead_torch)
    for run_benchmark in benchmarks:
        run_benchmark()


# Execute the benchmarks only when this file is run as a script.
if __name__ == "__main__":
    main()
Results
Cupy: Total time including GPU computation: 176.1893156967163 milliseconds
Cupy: CPU launch overhead: 0.0694875717163086 milliseconds
Pytorch: Total time including GPU computation: 176.11363986968993 milliseconds
Pytorch: CPU launch overhead: 0.018952369689941406 milliseconds
Environment
OS : Linux-6.2.0-1014-azure-x86_64-with-glibc2.29
Python Version : 3.8.10
CuPy Version : 12.3.0
CuPy Platform : NVIDIA CUDA
NumPy Version : 1.24.4
SciPy Version : 1.10.1
Cython Build Version : 0.29.36
Cython Runtime Version : None
CUDA Root : /usr/local/cuda
nvcc PATH : /usr/local/cuda/bin/nvcc
CUDA Build Version : 12020
CUDA Driver Version : 12020
CUDA Runtime Version : 12010
cuBLAS Version : (available)
cuFFT Version : 11002
cuRAND Version : 10302
cuSOLVER Version : (11, 4, 5)
cuSPARSE Version : (available)
NVRTC Version : (12, 1)
Thrust Version : 200101
CUB Build Version : 200101
Jitify Build Version : <unknown>
cuDNN Build Version : 8801
cuDNN Version : 8902
NCCL Build Version : 21602
NCCL Runtime Version : 21903
cuTENSOR Version : None
cuSPARSELt Build Version : None
Device 0 Name : NVIDIA A100 80GB PCIe
Device 0 Compute Capability : 80
Device 0 PCI Bus ID : 0001:00:00.0
Device 1 Name : NVIDIA A100 80GB PCIe
Device 1 Compute Capability : 80
Device 1 PCI Bus ID : 0002:00:00.0
Device 2 Name : NVIDIA A100 80GB PCIe
Device 2 Compute Capability : 80
Device 2 PCI Bus ID : 0003:00:00.0
Device 3 Name : NVIDIA A100 80GB PCIe
Device 3 Compute Capability : 80
Device 3 PCI Bus ID : 0004:00:00.0