
Conv2d result differs from the PyTorch result when running on GPU

zhaoxudong01 opened this issue 1 year ago • 3 comments

Description

Environment

TensorRT Version: 9.2

GPU Type: A100

NVIDIA Driver Version: 525.105.17

CUDA Version: 12.2

Operating System:

Python Version: 3.10

PyTorch Version: 2.1.2

Baremetal or Container (if so, version): nvcr.io/pytorch:23.05-py3

zhaoxudong01 commented Jan 26 '24

How big is the diff? We don't guarantee bit-wise alignment between TRT and other frameworks.
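For reference, a common way to sanity-check such differences is a tolerance-based comparison rather than exact equality; a minimal sketch, where the rtol/atol values are illustrative choices for fp32 and not an official guarantee:

import torch

def within_tolerance(y_ref: torch.Tensor, y_out: torch.Tensor,
                     rtol: float = 1e-3, atol: float = 1e-5) -> bool:
    # True if every element of y_out is within rtol/atol of y_ref.
    return torch.allclose(y_ref, y_out, rtol=rtol, atol=atol)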

zerollzeng commented Jan 28 '24

We compared the results of matrix multiplication between TensorRT-LLM and PyTorch. Here is the code for the accuracy test:

import random
import torch
from collections import OrderedDict
from torch import nn
import tensorrt as trt
import tensorrt_llm
from tensorrt_llm.builder import Builder
from tensorrt_llm.network import net_guard
from tensorrt_llm.runtime import Session
from tensorrt_llm.layers import Conv2d
from tensorrt_llm.module import Module
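# Make cuDNN deterministic so the PyTorch reference is reproducible.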
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
tensorrt_llm.logger.set_level("warning")
torch.set_printoptions(precision=12, sci_mode=True)


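# Reference MLP built from standard PyTorch linear layers.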
class TorchMLP(nn.Module):

    def __init__(self, hidden_size, ffn_hidden_size, bias=False):
        super().__init__()
        self.fc = nn.Linear(hidden_size, ffn_hidden_size, bias=bias)
        self.proj = nn.Linear(ffn_hidden_size, hidden_size, bias=bias)

    def forward(self, hidden_states):
        inter = self.fc(hidden_states)
        inter = nn.functional.relu(inter)
        output = self.proj(inter)
        return output


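# The same MLP expressed with TensorRT-LLM layers (single GPU, so tp_size=1).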
class MLP(Module):

    def __init__(self,
                 hidden_size,
                 ffn_hidden_size,
                 bias=False,
                 tp_group=None,
                 tp_size=1):
        super().__init__()
        self.fc = tensorrt_llm.layers.ColumnLinear(hidden_size,
                                                   ffn_hidden_size,
                                                   bias=bias,
                                                   tp_group=tp_group,
                                                   tp_size=tp_size,
                                                   gather_output=False)
        self.proj = tensorrt_llm.layers.RowLinear(ffn_hidden_size,
                                                  hidden_size,
                                                  bias=bias,
                                                  tp_group=tp_group,
                                                  tp_size=tp_size)

    def forward(self, hidden_states):
        inter = self.fc(hidden_states)
        inter = tensorrt_llm.functional.relu(inter)
        self.register_network_output('inter', inter)
        output = self.proj(inter)
        return output

if __name__ == "__main__":
    hidden_size = 2048
    ffn_hidden_size = 4096
    seq_len = 19
    bsz = 2
    random.seed(2345)
    torch.manual_seed(2345)
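    # A constant input; the seeds fix the TorchMLP weight initialization.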
    x = torch.ones([bsz, seq_len, hidden_size], device="cuda", dtype=torch.float32)
    pymlp = TorchMLP(
        hidden_size, ffn_hidden_size,
        bias=False).to(torch.float32).to('cuda').eval()
    y1 = pymlp(x)


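    # Build a float32 TensorRT engine for the TensorRT-LLM MLP.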
    builder = Builder()
    builder_config = builder.create_builder_config(
        name="MLP",
        precision="float32",
        opt_level=0,
        strongly_typed=True,
    )

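    # Copy the PyTorch weights into the TRT-LLM module so both models compute the same function.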
    mymlp = MLP(hidden_size, ffn_hidden_size, bias=False)
    mymlp.fc.weight.value = pymlp.fc.weight
    mymlp.proj.weight.value = pymlp.proj.weight

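    # Trace the module into a TensorRT network with 'x' as input and 'y' as output.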
    network = builder.create_network()

    with net_guard(network):
        trt_x = tensorrt_llm.Tensor(
            name='x',
            dtype=trt.float32,
            shape=[bsz, seq_len, hidden_size],
            dim_range=OrderedDict([
                ('x0', [[bsz, bsz, bsz]]),
                ('x1', [[seq_len, seq_len , seq_len]]),
                ('x2', [[hidden_size, hidden_size, hidden_size]]),
            ]))
        output = mymlp(hidden_states=trt_x)
        output.mark_output("y", trt.float32)
        network.set_named_parameters(mymlp.named_parameters())

    tensorrt_llm.graph_rewriting.optimize(network)
    engine = builder.build_engine(network, builder_config)
    assert engine is not None, 'Failed to build engine.'

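    # Run the engine on the same input and compare with the PyTorch output.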
    session = Session.from_serialized_engine(engine)
    inputs = { "x" : x }
    outputs = { "y" : torch.empty([bsz, seq_len, hidden_size], dtype=torch.float32, device='cuda') }
    stream = torch.cuda.current_stream().cuda_stream
    ok = session.run(inputs, outputs, stream)
    assert ok
    y2 = outputs["y"]
    print(y1.shape, y2.shape)
    diff = (y1 - y2).abs()
    print(diff.shape)
    diff_idx = torch.nonzero(diff)
    diff_percent = 0
    max_idx = None
    for idx in diff_idx:
        rel = diff[idx[0], idx[1], idx[2]].item() / y1.abs()[idx[0], idx[1], idx[2]].item()
        if diff_percent < rel:
            diff_percent = rel
            max_idx = idx
    print("diff num : ", diff_idx.shape[0])
    print("diff max : ", diff.max().item())
    print("diff mean: ", diff.sum().item() / diff_idx.shape[0])
    if max_idx is not None:
        print("diff : ", diff_percent * 100, "%", y1[max_idx[0], max_idx[1], max_idx[2]].item(), y2[max_idx[0], max_idx[1], max_idx[2]].item())

The result is:

diff num :  77786
diff max :  0.0002981722354888916
diff mean:  6.754354270417382e-05
diff :  474.25495262704567 % -2.162531018257141e-05 8.093379437923431e-05
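
Note that the largest relative difference above occurs where the reference value is itself near zero (about -2.2e-05), so the 474% figure is dominated by the tiny denominator rather than by a large absolute error. A minimal sketch of a relative-error metric that guards against near-zero denominators (the eps value is an illustrative choice):

import torch

def max_relative_error(y_ref: torch.Tensor, y_out: torch.Tensor,
                       eps: float = 1e-4) -> float:
    # Clamp the denominator so elements where the reference is ~0
    # do not dominate the relative error.
    denom = y_ref.abs().clamp_min(eps)
    return ((y_ref - y_out).abs() / denom).max().item()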

Conv2d has the same problem.

zhaoxudong01 commented Jan 29 '24

It's expected.

zerollzeng commented Feb 01 '24

Closing since there has been no activity for more than 3 weeks. Please reopen if you still have questions. Thanks all!

ttyio commented Mar 05 '24