
Slow nn.Linear on MI250

Open · comaniac opened this issue on May 02 '23 · 2 comments

Hi there,

I tried to benchmark the performance of nn.Linear in AITemplate on an MI250 GPU and compared it with rocBLAS. I expected AITemplate to achieve much higher throughput, but it turns out to be only about the same. I'm attaching my benchmark script; could anyone check whether I did anything wrong? Thanks in advance.

BTW, due to #91, I'm using the v0.1 stable release to make it work.

import torch

from aitemplate.compiler import compile_model
from aitemplate.frontend import nn, Tensor
from aitemplate.testing import detect_target
from aitemplate.testing.benchmark_pt import benchmark_torch_function

is_ait = True  # True: benchmark AITemplate; False: benchmark PyTorch (rocBLAS)

pt_dtype_mappings = {
    "float": torch.float,
    "half": torch.half,
    "float16": torch.float16,
    "bfloat16": torch.bfloat16,
}

def map_pt_params(ait_model, pt_model):
    # Name the AIT parameter tensors, then map each AIT name ("." -> "_")
    # to the corresponding PyTorch parameter tensor.
    ait_model.name_parameter_tensor()
    pt_params = dict(pt_model.named_parameters())
    mapped_pt_params = {}

    for name, _ in ait_model.named_parameters():
        ait_name = name.replace(".", "_")
        assert name in pt_params
        mapped_pt_params[ait_name] = pt_params[name]
    return mapped_pt_params

def run(
    warmup=10, iterations=100, dtype="float16", gpu_idx=0
):
    batch_size = 1
    sequence_size = 2048
    hidden_size = 8192
    pt_dtype = pt_dtype_mappings[dtype]

    ngpu = torch.cuda.device_count()
    assert gpu_idx < ngpu, f"GPU index {gpu_idx} is out of range ({ngpu} GPUs available)"

    torch.cuda.set_device(gpu_idx)
    matmul = torch.nn.Linear(hidden_size, hidden_size, bias=False)
    matmul = matmul.to(f"cuda:{gpu_idx}").to(pt_dtype).eval()

    a = torch.randn(batch_size * sequence_size, hidden_size, dtype=pt_dtype, device=f"cuda:{gpu_idx}")

    if is_ait:
        # Build the AIT graph: a single bias-free Linear, i.e. one GEMM.
        ait_matmul = nn.Linear(hidden_size, hidden_size, bias=False)
        A = Tensor(shape=[batch_size * sequence_size, hidden_size], name="A", dtype=dtype, is_input=True)
        C = ait_matmul(A)
        C._attrs["is_output"] = True
        C._attrs["name"] = "C"

        target = detect_target()
        module = compile_model(C, target, "./tmp", "matmul_demo")
        weights = map_pt_params(ait_matmul, matmul)
        # name_parameter_tensor() only ran inside map_pt_params, after
        # compile_model, so the weight kept its auto-generated graph name
        # "tensor_0" (see the note after the script).
        module.set_constant_with_tensor("tensor_0", weights["weight"])

        inputs = {"A": a}
        outputs = {"C": torch.empty([batch_size * sequence_size, hidden_size], dtype=pt_dtype, device=f"cuda:{gpu_idx}")}

        # Warm up (count=warmup), then benchmark; benchmark_with_tensors
        # returns the mean latency per iteration in milliseconds.
        module.benchmark_with_tensors(inputs, outputs, count=warmup, repeat=1, graph_mode=True)

        elapsed_time, _, _ = module.benchmark_with_tensors(
            inputs, outputs, graph_mode=True, count=iterations
        )
    else:
        # Warm up, then time the PyTorch (rocBLAS-backed) Linear with CUDA
        # events; elapsed_time below is the mean latency per iteration in ms.
        for _ in range(warmup):
            matmul(a)

        torch.cuda.synchronize()
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)

        start.record()
        for _ in range(iterations):
            matmul(a)
        end.record()
        torch.cuda.synchronize()
        elapsed_time = start.elapsed_time(end) / iterations

    print(f"{elapsed_time:.3f} ms per iteration")
    # An (M, K) x (K, N) GEMM costs 2 * M * N * K FLOPs; elapsed_time is in
    # milliseconds, so dividing by 1e9 yields TFLOPS.
    tflops = (
        2
        * batch_size
        * sequence_size
        * hidden_size**2
        / elapsed_time
        / 10**9
    )
    print(
        f"The TFLOPS for computing matmul between {dtype} tensor "
        f"({batch_size}, {sequence_size}, {hidden_size}) and "
        f"({hidden_size}, {hidden_size}) on GPU {gpu_idx} is {tflops}"
    )

if __name__ == "__main__":
    run()
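
One quirk in the script above: map_pt_params() calls name_parameter_tensor() only after compile_model() has run, so the weight is baked into the graph under its auto-generated name, which is why set_constant_with_tensor has to target "tensor_0". A minimal sketch of an alternative, assuming the compiled Model exposes set_many_constants_with_tensors() (present in recent AITemplate releases, though possibly not in the pinned v0.1):

# Sketch: name the parameters *before* compiling so that "weight" (rather
# than "tensor_0") is baked into the graph, then bind all constants by name.
# Assumes Model.set_many_constants_with_tensors() is available.
weights = map_pt_params(ait_matmul, matmul)  # runs name_parameter_tensor()
module = compile_model(C, target, "./tmp", "matmul_demo")
module.set_many_constants_with_tensors(weights)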

Running the script gives the following output:

The TFLOPS for computing matmul between float16 tensor (1, 2048, 8192) and (8192, 8192) on GPU 0 is 116.43586503213587
# set is_ait=False
The TFLOPS for computing matmul between float16 tensor (1, 2048, 8192) and (8192, 8192) on GPU 0 is 103.43426138169123
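
As a quick sanity check on the gap: each call performs 2 × 2048 × 8192² ≈ 274.9 GFLOPs, so 116.4 TFLOPS works out to about 2.36 ms per iteration for AIT versus about 2.66 ms for the rocBLAS path, a difference of roughly 13%.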

comaniac, May 02 '23

cc @fsx950223

ipiszy, May 03 '23

I think for individual GEMM kernels, AIT should perform similarly to rocBLAS. AIT's perf gains mostly come from operator fusion.

ipiszy, May 03 '23
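
To illustrate the fusion point: a minimal sketch, reusing the frontend API from the script above, of a Linear followed by an elementwise activation that AITemplate can fuse into the GEMM epilogue (assuming ops.elementwise and FuncEnum are available, as in recent AITemplate releases):

from aitemplate.compiler import compile_model, ops
from aitemplate.compiler.ops.common.epilogue import FuncEnum
from aitemplate.frontend import nn, Tensor
from aitemplate.testing import detect_target

hidden_size = 8192
fc = nn.Linear(hidden_size, hidden_size, bias=False)
A = Tensor(shape=[2048, hidden_size], name="A", dtype="float16", is_input=True)
# The ReLU is an elementwise op that AIT's fusion passes can fold into the
# GEMM epilogue, so the (2048, 8192) intermediate never round-trips to HBM.
C = ops.elementwise(FuncEnum.RELU)(fc(A))
C._attrs["is_output"] = True
C._attrs["name"] = "C"
module = compile_model(C, detect_target(), "./tmp", "fused_matmul_demo")

Measured in isolation, the bare GEMM should land near rocBLAS either way; the win shows up in graphs like this one, where the fused path issues one kernel while the eager path issues two.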