AITemplate
AITemplate copied to clipboard
Slow nn.Linear on MI250
Hi there,
I benchmarked the performance of nn.Linear
in AITemplate on an MI250 GPU and compared it with rocBLAS. I expected AITemplate to achieve much higher throughput, but the two turned out to be about the same. I'm attaching my benchmark script; could anyone check whether I did anything wrong? Thanks in advance.
By the way, because of #91, I'm using the v0.1 stable release to make it work.
import torch
from aitemplate.compiler import compile_model
from aitemplate.frontend import nn, Tensor
from aitemplate.testing import detect_target
from aitemplate.testing.benchmark_pt import benchmark_torch_function
from aitemplate.utils.graph_utils import sorted_graph_pseudo_code
# Backend switch read by run(): when true the AITemplate-compiled module is
# benchmarked, otherwise the plain torch.nn.Linear baseline is timed.
is_ait = True

# Lookup from the dtype name accepted by run() to the matching torch dtype.
# "float"/"half" are spelled with their canonical torch names here; they are
# the same objects as torch.float / torch.half.
pt_dtype_mappings = {
    "float": torch.float32,
    "half": torch.float16,
    "float16": torch.float16,
    "bfloat16": torch.bfloat16,
}
def map_pt_params(ait_model, pt_model):
    """Map PyTorch parameters onto underscore-separated AIT parameter names.

    Calls ``ait_model.name_parameter_tensor()`` first, then looks up every
    name yielded by ``ait_model.named_parameters()`` in
    ``pt_model.named_parameters()``.

    Args:
        ait_model: Object exposing ``name_parameter_tensor()`` and
            ``named_parameters()`` (yielding ``(name, value)`` pairs).
        pt_model: Object exposing ``named_parameters()`` (yielding
            ``(name, value)`` pairs).

    Returns:
        A dict keyed by the AIT parameter name with every ``"."`` replaced
        by ``"_"``, whose values are the same-named entries from
        ``pt_model``.

    Raises:
        KeyError: If any AIT parameter name has no counterpart in
            ``pt_model``. An explicit error is raised instead of a bare
            ``assert`` so the check survives ``python -O`` and reports
            which names are missing.
    """
    ait_model.name_parameter_tensor()
    pt_params = dict(pt_model.named_parameters())
    ait_names = [name for name, _ in ait_model.named_parameters()]

    missing = [name for name in ait_names if name not in pt_params]
    if missing:
        raise KeyError(f"parameters missing from the PyTorch model: {missing}")

    return {name.replace(".", "_"): pt_params[name] for name in ait_names}
def run(
    warmup=10,
    iterations=100,
    dtype="float16",
    gpu_idx=0,
    batch_size=1,
    sequence_size=2048,
    hidden_size=8192,
):
    """Benchmark a bias-free square linear layer and print its throughput.

    The input has shape ``(batch_size * sequence_size, hidden_size)`` and the
    layer maps ``hidden_size -> hidden_size``. Depending on the module-level
    ``is_ait`` flag, either an AITemplate-compiled module or the plain
    ``torch.nn.Linear`` baseline is timed.

    Args:
        warmup: Number of untimed warm-up runs.
        iterations: Number of timed runs; must be at least 1.
        dtype: Key into ``pt_dtype_mappings``; also passed to the AIT tensor.
        gpu_idx: Index of the GPU to run on.
        batch_size: Batch dimension of the logical input (default 1).
        sequence_size: Sequence dimension of the logical input (default 2048).
        hidden_size: Input and output feature size (default 8192).

    Raises:
        ValueError: If ``iterations`` is less than 1.
        KeyError: If ``dtype`` is not a key of ``pt_dtype_mappings``.
        AssertionError: If ``gpu_idx`` is not below the visible GPU count.
    """
    # Reject up front what would otherwise end in a division by zero.
    if iterations < 1:
        raise ValueError(f"iterations must be >= 1, got {iterations}")

    pt_dtype = pt_dtype_mappings[dtype]
    ngpu = torch.cuda.device_count()
    assert gpu_idx < ngpu, f"GPU index {gpu_idx} is out of range ({ngpu} GPUs available)"
    torch.cuda.set_device(gpu_idx)

    device = f"cuda:{gpu_idx}"
    num_rows = batch_size * sequence_size

    matmul = torch.nn.Linear(hidden_size, hidden_size, bias=False)
    matmul = matmul.to(device).to(pt_dtype).eval()
    a = torch.randn(num_rows, hidden_size, dtype=pt_dtype, device=device)

    if is_ait:
        # Build and compile the AIT graph: C = Linear(A).
        ait_matmul = nn.Linear(hidden_size, hidden_size, bias=False)
        A = Tensor(shape=[num_rows, hidden_size], name="A", dtype=dtype, is_input=True)
        C = ait_matmul(A)
        C._attrs["is_output"] = True
        C._attrs["name"] = "C"
        target = detect_target()
        module = compile_model(C, target, "./tmp", "matmul_demo")

        # Reuse the PyTorch weight so both backends multiply the same matrix.
        # NOTE(review): "tensor_0" is presumably the auto-generated name of
        # the weight constant in the compiled module - confirm.
        weights = map_pt_params(ait_matmul, matmul)
        module.set_constant_with_tensor("tensor_0", weights["weight"])

        inputs = {"A": a}
        outputs = {"C": torch.empty([num_rows, hidden_size], dtype=pt_dtype, device=device)}

        # Untimed warm-up pass, then the measured pass.
        module.benchmark_with_tensors(inputs, outputs, count=warmup, repeat=1, graph_mode=True)
        # NOTE(review): the first return value is assumed to be the mean time
        # per iteration in milliseconds, matching the else branch - confirm.
        elapsed_time, _, _ = module.benchmark_with_tensors(
            inputs, outputs, graph_mode=True, count=iterations
        )
    else:
        # Time forward-only inference: without no_grad() every call would also
        # record autograd state for the weight, which the AIT path never does.
        with torch.no_grad():
            for _ in range(warmup):
                matmul(a)
            torch.cuda.synchronize()

            start = torch.cuda.Event(enable_timing=True)
            end = torch.cuda.Event(enable_timing=True)
            start.record()
            for _ in range(iterations):
                matmul(a)
            end.record()
            torch.cuda.synchronize()
        # Event.elapsed_time() reports milliseconds; average over the runs.
        elapsed_time = start.elapsed_time(end) / iterations

    print(elapsed_time)

    # 2 * M * N * K floating-point operations for the GEMM; dividing a
    # per-millisecond rate by 1e9 gives TFLOPS.
    flops = 2 * batch_size * sequence_size * hidden_size**2
    tflops = flops / elapsed_time / 10**9
    print(
        f"The TFLOPS for computing matmul between {dtype} tensor "
        f"({batch_size}, {sequence_size}, {hidden_size}) and "
        f"({hidden_size}, {hidden_size}) on GPU {gpu_idx} is {tflops}"
    )
# Script entry point: run the benchmark with its default settings.
if __name__ == "__main__":
    run()
Running the script would result in the following:
The TFLOPS for computing matmul between float16 tensor (1, 2048, 8192) and (8192, 8192) on GPU 0 is 116.43586503213587
# With is_ait = False (PyTorch baseline):
The TFLOPS for computing matmul between float16 tensor (1, 2048, 8192) and (8192, 8192) on GPU 0 is 103.43426138169123
cc @fsx950223
I think that for individual GEMM kernels, AIT should perform similarly to rocBLAS. AIT's performance gains mostly come from operator fusion.