TensorRT
TensorRT copied to clipboard
Conv2d result differs from the PyTorch result when running on GPU
Description
Environment
TensorRT Version: 9.2
NVIDIA GPU: A100
NVIDIA Driver Version: 525.105.17
CUDA Version: 12.2
Operating System:
Python Version: 3.10
PyTorch Version: 2.1.2
Baremetal or Container (if so, version): nvcr.io/pytorch:23.05-py3
How big is the diff? We don't guarantee bit-wise alignment between TRT and other frameworks.
We compared the results of matrix multiplication between TensorRT-LLM and PyTorch. Here is the code for the accuracy test:
import random
import torch
from collections import OrderedDict
from torch import nn
import tensorrt as trt
import tensorrt_llm
from tensorrt_llm.builder import Builder
from tensorrt_llm.network import net_guard
from tensorrt_llm.runtime import Session
from tensorrt_llm.layers import Conv2d
from tensorrt_llm.module import Module
# Ask cuDNN for deterministic algorithms and turn off its autotuner.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
# Only show TensorRT-LLM log messages at "warning" level.
tensorrt_llm.logger.set_level("warning")
# Print tensors in scientific notation with 12 digits so small diffs are visible.
torch.set_printoptions(precision=12, sci_mode=True)
class TorchMLP(nn.Module):
    """PyTorch reference MLP: ``fc`` linear layer, ReLU, then ``proj`` linear layer."""

    def __init__(self, hidden_size, ffn_hidden_size, bias=False):
        """Create the two linear layers.

        Args:
            hidden_size: Input width of ``fc`` and output width of ``proj``.
            ffn_hidden_size: Output width of ``fc`` and input width of ``proj``.
            bias: Passed through to both ``nn.Linear`` layers.
        """
        super().__init__()
        # Creation order (fc, then proj) is kept so seeded initialization
        # produces the same weights.
        self.fc = nn.Linear(hidden_size, ffn_hidden_size, bias=bias)
        self.proj = nn.Linear(ffn_hidden_size, hidden_size, bias=bias)

    def forward(self, hidden_states):
        """Return ``proj(relu(fc(hidden_states)))``."""
        activated = nn.functional.relu(self.fc(hidden_states))
        return self.proj(activated)
class MLP(Module):
    """TensorRT-LLM MLP built from a ``ColumnLinear`` followed by ReLU and a ``RowLinear``.

    Mirrors the layer sequence of ``TorchMLP`` so both outputs can be compared.
    """

    def __init__(self,
                 hidden_size,
                 ffn_hidden_size,
                 bias=False,
                 tp_group=None,
                 tp_size=1):
        """Create the ``fc`` and ``proj`` layers.

        Args:
            hidden_size: Input width of ``fc`` and output width of ``proj``.
            ffn_hidden_size: Output width of ``fc`` and input width of ``proj``.
            bias: Forwarded to both layers.
            tp_group: Forwarded to both layers.
            tp_size: Forwarded to both layers.
        """
        super().__init__()
        # Keyword arguments that both layers receive unchanged.
        shared_kwargs = dict(bias=bias, tp_group=tp_group, tp_size=tp_size)
        self.fc = tensorrt_llm.layers.ColumnLinear(
            hidden_size,
            ffn_hidden_size,
            gather_output=False,
            **shared_kwargs,
        )
        self.proj = tensorrt_llm.layers.RowLinear(
            ffn_hidden_size,
            hidden_size,
            **shared_kwargs,
        )

    def forward(self, hidden_states):
        """Apply ``fc``, ReLU and ``proj``; also register the post-ReLU tensor as 'inter'."""
        activated = tensorrt_llm.functional.relu(self.fc(hidden_states))
        # Register the intermediate activation as a network output named 'inter'.
        self.register_network_output('inter', activated)
        return self.proj(activated)
def summarize_diff(expected, actual):
    """Summarize the element-wise absolute and relative difference of two tensors.

    All statistics are computed with vectorized tensor operations rather than
    a Python loop over every differing element.

    Args:
        expected: Reference tensor (the PyTorch result in this script).
        actual: Tensor under test; must have the same shape as ``expected``.

    Returns:
        A dict with:
            "shape": shape of the absolute-difference tensor.
            "num": number of elements whose absolute difference is non-zero.
            "max": largest absolute difference (0.0 for empty tensors).
            "mean": sum of absolute differences divided by "num"
                (0.0 when nothing differs).
            "worst_ratio": largest ``|expected - actual| / |expected|`` over
                elements that differ and whose reference value is non-zero
                (0.0 when there is no such element).
            "worst_index": index tuple of that element, or ``None``.

    Raises:
        ValueError: If the two tensors do not have the same shape.
    """
    if expected.shape != actual.shape:
        raise ValueError(
            f"shape mismatch: {tuple(expected.shape)} vs {tuple(actual.shape)}")

    expected = expected.detach()
    actual = actual.detach()
    diff = (expected - actual).abs()
    num = int(torch.count_nonzero(diff).item())

    stats = {
        "shape": diff.shape,
        "num": num,
        "max": diff.max().item() if diff.numel() else 0.0,
        # Guard against division by zero when both results match exactly.
        "mean": diff.sum().item() / num if num else 0.0,
        "worst_ratio": 0.0,
        "worst_index": None,
    }

    ref_abs = expected.abs()
    # A relative error is undefined where the reference value is exactly zero,
    # so those elements are left out of the relative ranking (they still count
    # towards "num", "max" and "mean").
    comparable = (diff > 0) & (ref_abs > 0)
    if comparable.any():
        ratio = torch.where(comparable, diff / ref_abs, torch.zeros_like(diff))
        flat = int(torch.argmax(ratio).item())
        # Convert the flat argmax position back into a per-dimension index.
        position = []
        for size in reversed(ratio.shape):
            flat, offset = divmod(flat, size)
            position.append(offset)
        worst = tuple(reversed(position))
        stats["worst_index"] = worst
        # Divide as Python floats so the reported ratio keeps double precision.
        stats["worst_ratio"] = diff[worst].item() / ref_abs[worst].item()
    return stats


if __name__ == "__main__":
    hidden_size = 2048
    ffn_hidden_size = 4096
    seq_len = 19
    bsz = 2
    random.seed(2345)
    torch.manual_seed(2345)

    # Reference result from PyTorch.
    x = torch.ones([bsz, seq_len, hidden_size], device="cuda", dtype=torch.float32)
    pymlp = TorchMLP(
        hidden_size, ffn_hidden_size,
        bias=False).to(torch.float32).to('cuda').eval()
    with torch.no_grad():  # inference only; no autograd graph is needed
        y1 = pymlp(x)

    # NOTE(review): TensorRT is documented to allow TF32 for FP32 layers by
    # default on Ampere-class GPUs, while PyTorch keeps TF32 matmul off by
    # default; this is a likely source of small diffs -- confirm for this setup.
    builder = Builder()
    builder_config = builder.create_builder_config(
        name="MLP",
        precision="float32",
        opt_level=0,
        strongly_typed=True,
    )

    # Build the TensorRT-LLM module with the same weights as the reference.
    mymlp = MLP(hidden_size, ffn_hidden_size, bias=False)
    mymlp.fc.weight.value = pymlp.fc.weight
    mymlp.proj.weight.value = pymlp.proj.weight

    network = builder.create_network()
    # NOTE(review): the original indentation was lost; the extent of this
    # ``with`` block is reconstructed -- confirm against the original script.
    with net_guard(network):
        trt_x = tensorrt_llm.Tensor(
            name='x',
            dtype=trt.float32,
            shape=[bsz, seq_len, hidden_size],
            dim_range=OrderedDict([
                ('x0', [[bsz, bsz, bsz]]),
                ('x1', [[seq_len, seq_len, seq_len]]),
                ('x2', [[hidden_size, hidden_size, hidden_size]]),
            ]))
        output = mymlp(hidden_states=trt_x)
        output.mark_output("y", trt.float32)
        network.set_named_parameters(mymlp.named_parameters())
    tensorrt_llm.graph_rewriting.optimize(network)

    engine = builder.build_engine(network, builder_config)
    assert engine is not None, 'Failed to build engine.'

    # Run the engine on the same input.
    session = Session.from_serialized_engine(engine)
    inputs = {"x": x}
    outputs = {
        "y": torch.empty([bsz, seq_len, hidden_size],
                         dtype=torch.float32,
                         device='cuda')
    }
    stream = torch.cuda.current_stream().cuda_stream
    ok = session.run(inputs, outputs, stream)
    assert ok
    y2 = outputs["y"]

    # Compare both results.
    print(y1.shape, y2.shape)
    stats = summarize_diff(y1, y2)
    print(stats["shape"])
    print("diff num : ", stats["num"])
    print("diff max : ", stats["max"])
    print("diff mean: ", stats["mean"])
    worst = stats["worst_index"]
    if worst is not None:
        print("diff : ", stats["worst_ratio"] * 100, "%", y1[worst].item(), y2[worst].item())
The result is:
diff num : 77786
diff max : 0.0002981722354888916
diff mean: 6.754354270417382e-05
diff : 474.25495262704567 % -2.162531018257141e-05 8.093379437923431e-05
Conv2d has the same problem.
It's expected.
Closing since there has been no activity for more than 3 weeks. Please reopen if you still have questions. Thanks, all!