CUDA memcpy failure with float16 in a TensorRT 10.3 custom plugin when running inference on 2x H100 NVL
Description
I am unable to perform an identity operation that copies a float16 tensor out 1:1. The same copy works fine with int32 and int64 tensors, but it is the basis of a plugin I am creating that manipulates float16 data, so if even the identity (copy-to-output) operation doesn't work, I'm not sure how to proceed.
self.debug_buffer["full_hidden_states"] tensor([[ 4.1797, -0.2051, -1.8369, ..., -2.8945, 1.3564, 0.3196], [ 2.1680, -2.5840, -2.0742, ..., -2.5312, -0.4636, -2.8867], [ 2.1934, -3.1797, 1.9854, ..., 0.7856, -0.0352, -1.5967], ..., [-0.6577, 2.3828, 7.0742, ..., 2.1074, -1.9043, -0.1153], [ 0.9321, -3.4199, 0.9727, ..., 0.4680, -3.3691, -1.2725], [-0.6484, 0.9282, 0.6196, ..., 5.9570, -4.6875, -0.6816]], device='cuda:0', dtype=torch.float16)
This is the tensor I am trying to copy out, and below is the result I'm getting with the TensorRT plugin I have created:
self.debug_buffer['output_hidden_states'] tensor([[[ 4.1797, -0.2051, -1.8369, ..., -2.8945, 1.3564, 0.3196], [ 2.1680, -2.5840, -2.0742, ..., -2.5312, -0.4636, -2.8867], [ 2.1934, -3.1797, 1.9854, ..., 0.7856, -0.0352, -1.5967], ..., [ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000], [ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000], [ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000]]], device='cuda:0', dtype=torch.float16)
This is bizarre; I'm hoping I have simply misconfigured some setting, because the result is clearly wrong. In the code below I am doing a straight copy-out operation. The model I'm running is Meta Llama 3 8B Instruct, but I believe this issue is independent of the model.
I am building the network with TensorRT-LLM. The copy that fails is this single call (the full plugin source is quoted further below):
`cudaMemcpyAsync(outputs[1], inputs[1], hidden_states_size_original, cudaMemcpyDeviceToDevice, stream);`
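For comparison, here is a minimal standalone sketch of the same kind of fp16 device-to-device copy, outside of any plugin; the function, parameter names, and error handling are illustrative assumptions, not code taken from the plugin:

```cpp
// Minimal sketch (assumed names and shapes): copy a [batch, seq_len, hidden]
// fp16 tensor 1:1 on the device, i.e. the identity operation described above.
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <cstdio>

void copyHiddenStates(const __half *src, __half *dst,
                      int batch_size, int seq_len, int hidden_size,
                      cudaStream_t stream)
{
    // The byte count must cover every element of the tensor being copied;
    // destination elements beyond it are left untouched.
    size_t num_bytes = static_cast<size_t>(batch_size) * seq_len * hidden_size * sizeof(__half);

    cudaError_t err = cudaMemcpyAsync(dst, src, num_bytes, cudaMemcpyDeviceToDevice, stream);
    if (err != cudaSuccess)
    {
        std::printf("cudaMemcpyAsync failed: %s\n", cudaGetErrorString(err));
    }
}
```

The equivalent copy for an int32 or int64 tensor only differs in the element size used to compute `num_bytes`.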
> ```cpp
> /*
> * SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
> * SPDX-License-Identifier: Apache-2.0
> *
> * Licensed under the Apache License, Version 2.0 (the "License");
> * you may not use this file except in compliance with the License.
> * You may obtain a copy of the License at
> *
> * http://www.apache.org/licenses/LICENSE-2.0
> *
> * Unless required by applicable law or agreed to in writing, software
> * distributed under the License is distributed on an "AS IS" BASIS,
> * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> * See the License for the specific language governing permissions and
> * limitations under the License.
> */
> #include "updateInferenceInputsPlugin.h"
> #include "tensorrt_llm/runtime/iBuffer.h"
> #include <iostream>
>
> using namespace nvinfer1;
> using tensorrt_llm::plugins::UpdateInferenceInputsPlugin;
> using tensorrt_llm::plugins::UpdateInferenceInputsPluginCreator;
>
> static char const *UPDATE_INFERENCE_INPUTS_PLUGIN_VERSION{"1"};
> static char const *UPDATE_INFERENCE_INPUTS_PLUGIN_NAME{"UpdateInferenceInputs"};
> PluginFieldCollection UpdateInferenceInputsPluginCreator::mFC{};
> std::vector<nvinfer1::PluginField> UpdateInferenceInputsPluginCreator::mPluginAttributes;
>
> UpdateInferenceInputsPlugin::UpdateInferenceInputsPlugin()
> {
> std::cout << "UpdateInferenceInputsPlugin constructor called" << std::endl;
> }
>
> UpdateInferenceInputsPlugin::UpdateInferenceInputsPlugin(void const *data, size_t length)
> {
> std::cout << "UpdateInferenceInputsPlugin parameterized constructor called" << std::endl;
> char const *d = reinterpret_cast<char const *>(data), *a = d;
> TLLM_CHECK_WITH_INFO(d == a + length,
> "Expected length (%d) != real length (%d). This is often "
> "caused by using different TensorRT-LLM version to build "
> "engine and run engine.",
> (int)length, (int)(d - a));
> }
>
> nvinfer1::IPluginV2DynamicExt *UpdateInferenceInputsPlugin::clone() const noexcept
> {
> std::cout << "UpdateInferenceInputsPlugin::clone called" << std::endl;
> auto *plugin = new UpdateInferenceInputsPlugin(*this);
> plugin->setPluginNamespace(mNamespace.c_str());
> return plugin;
> }
>
> nvinfer1::DimsExprs UpdateInferenceInputsPlugin::getOutputDimensions(
> int outputIndex, nvinfer1::DimsExprs const *inputs, int nbInputs, nvinfer1::IExprBuilder &exprBuilder) noexcept
> {
> // Assuming inputs[0] is the condition, inputs[1] is true_input_ids, inputs[2] is true_hidden_states,
> // inputs[3] is false_input_ids, inputs[4] is false_hidden_states
> if (outputIndex == 0)
> {
>
> nvinfer1::DimsExprs output;
> output.nbDims = 2;
>
> output.d[0] = inputs[0].d[0];
> output.d[1] = exprBuilder.operation(nvinfer1::DimensionOperation::kSUM, *inputs[0].d[1], *exprBuilder.constant(5));
>
> // Dimensions for input_ids (same for both true and false branches)
> return output;
> }
> else
> {
> nvinfer1::DimsExprs output;
> output.nbDims = 3;
>
> output.d[0] = inputs[1].d[0];
> output.d[1] = inputs[1].d[1];
> output.d[2] = inputs[1].d[2];
>
> return output;
> }
> }
>
> bool UpdateInferenceInputsPlugin::supportsFormatCombination(
> int pos, nvinfer1::PluginTensorDesc const *inOut, int nbInputs, int nbOutputs) noexcept
> {
>
> // NOTE: this early return accepts every type/format combination and makes
> // the checks below unreachable.
> return true;
> // Ensure we don't access out of bounds
> if (pos >= nbInputs + nbOutputs)
> return false;
>
> // Check input tensors
> if (pos < nbInputs)
> {
> if (pos == 0 || pos == 1 || pos == 4 || pos == 5)
> {
> return inOut[pos].type == DataType::kINT32 && inOut[pos].format == TensorFormat::kLINEAR;
> }
> else if (pos == 2 || pos == 3 || pos == 8)
> {
> return inOut[pos].type == DataType::kHALF && inOut[pos].format == TensorFormat::kLINEAR;
> }
> else if (pos == 6 || pos == 7)
> {
> return inOut[pos].type == DataType::kINT64 && inOut[pos].format == TensorFormat::kLINEAR;
> }
> else
> {
> return false; // Unknown input
> }
> }
> // Check output tensors
> else
> {
> int output_index = pos - nbInputs;
> if (output_index == 0)
> {
> return inOut[pos].type == DataType::kINT32 && inOut[pos].format == TensorFormat::kLINEAR;
> }
> else if (output_index == 1)
> {
> return inOut[pos].type == DataType::kHALF && inOut[pos].format == TensorFormat::kLINEAR;
> }
> else
> {
> return false; // Unknown output
> }
> }
> }
>
> void UpdateInferenceInputsPlugin::configurePlugin(nvinfer1::DynamicPluginTensorDesc const *in, int nbInputs,
> nvinfer1::DynamicPluginTensorDesc const *out, int nbOutputs) noexcept
> {
> std::cout << "UpdateInferenceInputsPlugin::configurePlugin called" << std::endl;
> }
>
> size_t UpdateInferenceInputsPlugin::getWorkspaceSize(nvinfer1::PluginTensorDesc const *inputs, int nbInputs,
> nvinfer1::PluginTensorDesc const *outputs, int nbOutputs) const noexcept
> {
> std::cout << "UpdateInferenceInputsPlugin::getWorkspaceSize called" << std::endl;
> return 0;
> }
>
> int UpdateInferenceInputsPlugin::enqueue(nvinfer1::PluginTensorDesc const *inputDesc,
> nvinfer1::PluginTensorDesc const *outputDesc, void const *const *inputs, void *const *outputs, void *workspace,
> cudaStream_t stream) noexcept
> {
>
> int64_t *input_ids = reinterpret_cast<int64_t *>(const_cast<void *>(inputs[0]));
>
> half *full_hidden_states = reinterpret_cast<half *>(const_cast<void *>(inputs[1]));
>
> half *lm_logits = reinterpret_cast<half *>(const_cast<void *>(inputs[2]));
>
> int64_t *retrieve_indices = reinterpret_cast<int64_t *>(const_cast<void *>(inputs[3]));
>
> int64_t *draft_tokens = reinterpret_cast<int64_t *>(const_cast<void *>(inputs[4]));
>
> int64_t *candidates = reinterpret_cast<int64_t *>(const_cast<void *>(inputs[5]));
> int64_t *accept_length = reinterpret_cast<int64_t *>(const_cast<void *>(inputs[6]));
>
> half *sample_p = reinterpret_cast<half *>(const_cast<void *>(inputs[7]));
>
> int64_t *best_candidate = reinterpret_cast<int64_t *>(const_cast<void *>(inputs[8]));
>
> int batch_size = inputDesc[0].dims.d[0];
> int sequence_length = inputDesc[0].dims.d[1];
> int max_candidate_length = inputDesc[5].dims.d[1];
> int candidate_length = 4;
> int vocab_size = inputDesc[7].dims.d[0];
> int hidden_size = inputDesc[1].dims.d[2];
> int num_candidates = inputDesc[3].dims.d[0];
> int retrieve_seq_length = inputDesc[3].dims.d[1];
>
> size_t input_ids_size = batch_size * sequence_length * sizeof(int64_t);
> size_t hidden_states_size = batch_size * (1 + candidate_length) * hidden_size * sizeof(half);
> size_t hidden_states_size_original = batch_size * sequence_length * hidden_size * sizeof(half);
>
> std::cout << "Simple Case" << std::endl;
> cudaMemcpyAsync(outputs[0], inputs[0], input_ids_size, cudaMemcpyDeviceToDevice, stream);
> cudaMemcpyAsync(outputs[1], inputs[1], hidden_states_size_original, cudaMemcpyDeviceToDevice, stream);
>
> return 0;
> }
>
> nvinfer1::DataType UpdateInferenceInputsPlugin::getOutputDataType(
> int index, nvinfer1::DataType const *inputTypes, int nbInputs) const noexcept
> {
> if (index == 0)
> {
> // For output_input_ids, return INT64
> return DataType::kINT64;
> }
> else
> {
> // For output_hidden_states, return HALF
> return DataType::kHALF;
> }
> }
>
> char const *UpdateInferenceInputsPlugin::getPluginType() const noexcept
> {
> std::cout << "UpdateInferenceInputsPlugin::getPluginType called" << std::endl;
> return UPDATE_INFERENCE_INPUTS_PLUGIN_NAME;
> }
>
> char const *UpdateInferenceInputsPlugin::getPluginVersion() const noexcept
> {
> std::cout << "UpdateInferenceInputsPlugin::getPluginVersion called" << std::endl;
> return UPDATE_INFERENCE_INPUTS_PLUGIN_VERSION;
> }
>
> int UpdateInferenceInputsPlugin::getNbOutputs() const noexcept
> {
> std::cout << "UpdateInferenceInputsPlugin::getNbOutputs called" << std::endl;
> return 2;
> }
>
> int UpdateInferenceInputsPlugin::initialize() noexcept
> {
> std::cout << "UpdateInferenceInputsPlugin::initialize called" << std::endl;
> return 0;
> }
>
> void UpdateInferenceInputsPlugin::terminate() noexcept
> {
> std::cout << "UpdateInferenceInputsPlugin::terminate called" << std::endl;
> }
>
> size_t UpdateInferenceInputsPlugin::getSerializationSize() const noexcept
> {
> std::cout << "UpdateInferenceInputsPlugin::getSerializationSize called" << std::endl;
> return 0;
> }
>
> void UpdateInferenceInputsPlugin::serialize(void *buffer) const noexcept
> {
> std::cout << "UpdateInferenceInputsPlugin::serialize called" << std::endl;
> }
>
> void UpdateInferenceInputsPlugin::destroy() noexcept
> {
> std::cout << "UpdateInferenceInputsPlugin::destroy called" << std::endl;
> delete this;
> }
>
> ///////////////
>
> UpdateInferenceInputsPluginCreator::UpdateInferenceInputsPluginCreator()
> {
> std::cout << "UpdateInferenceInputsPluginCreator constructor called" << std::endl;
> mPluginAttributes.clear();
> mFC.nbFields = mPluginAttributes.size();
> mFC.fields = mPluginAttributes.data();
> }
>
> char const *UpdateInferenceInputsPluginCreator::getPluginName() const noexcept
> {
> std::cout << "UpdateInferenceInputsPluginCreator::getPluginName called" << std::endl;
> return UPDATE_INFERENCE_INPUTS_PLUGIN_NAME;
> }
>
> char const *UpdateInferenceInputsPluginCreator::getPluginVersion() const noexcept
> {
> std::cout << "UpdateInferenceInputsPluginCreator::getPluginVersion called" << std::endl;
> return UPDATE_INFERENCE_INPUTS_PLUGIN_VERSION;
> }
>
> PluginFieldCollection const *UpdateInferenceInputsPluginCreator::getFieldNames() noexcept
> {
> std::cout << "UpdateInferenceInputsPluginCreator::getFieldNames called" << std::endl;
> return &mFC;
> }
>
> IPluginV2 *UpdateInferenceInputsPluginCreator::createPlugin(char const *name, PluginFieldCollection const *fc) noexcept
> {
> std::cout << "UpdateInferenceInputsPluginCreator::createPlugin called" << std::endl;
> try
> {
> auto *obj = new UpdateInferenceInputsPlugin();
> obj->setPluginNamespace(mNamespace.c_str());
> return obj;
> }
> catch (std::exception const &e)
> {
> caughtError(e);
> }
> return nullptr;
> }
>
> IPluginV2 *UpdateInferenceInputsPluginCreator::deserializePlugin(
> char const *name, void const *serialData, size_t serialLength) noexcept
> {
> // This object will be deleted when the network is destroyed, which will
> // call UpdateInferenceInputsPlugin::destroy()
>
> std::cout << "Plugin being desiarialized" << std::endl;
>
> try
> {
>
> std::cout << "suiccess" << std::endl;
>
> auto *obj = new UpdateInferenceInputsPlugin(serialData, serialLength);
> obj->setPluginNamespace(mNamespace.c_str());
> return obj;
> }
> catch (std::exception const &e)
> {
> std::cout << "fail" << std::endl;
>
> caughtError(e);
> }
> return nullptr;
> }
> ```
Environment
TensorRT Version: 10.3
NVIDIA GPU: 2x H100 NVL
NVIDIA Driver Version: 555.42.06
CUDA Version: 12.5
CUDNN Version:
Operating System:
Python Version (if applicable): 3.10
Tensorflow Version (if applicable):
PyTorch Version (if applicable):
Baremetal or Container (if so, version):
Relevant Files
Model link: https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct
Steps To Reproduce
- Build Llama 3 8B with TensorRT-LLM following the official instructions.
- Create a TensorRT-LLM plugin with the code above.
- Copy out any float16 tensor.
- Use self.debug_buffer in the Python runtime to log the float16 tensor and observe that it differs from the original (see the comparison sketch below).
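For step 4, a hedged sketch of a host-side check that reports where the copied tensor starts to diverge from the original; the pointer and size names are illustrative assumptions, not part of the actual debug script:

```cpp
// Sketch: pull the original fp16 tensor and the plugin's output back to the
// host and print the first element at which they differ.
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

void reportFirstMismatch(const __half *d_ref, const __half *d_out, size_t num_elems)
{
    std::vector<__half> ref(num_elems), out(num_elems);
    cudaMemcpy(ref.data(), d_ref, num_elems * sizeof(__half), cudaMemcpyDeviceToHost);
    cudaMemcpy(out.data(), d_out, num_elems * sizeof(__half), cudaMemcpyDeviceToHost);

    for (size_t i = 0; i < num_elems; ++i)
    {
        if (__half2float(ref[i]) != __half2float(out[i]))
        {
            std::printf("first mismatch at element %zu: %f vs %f\n",
                        i, __half2float(ref[i]), __half2float(out[i]));
            return;
        }
    }
    std::printf("tensors match (%zu elements)\n", num_elems);
}
```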
Commands or scripts:
Have you tried the latest release?:
Yes
Can this model run on other frameworks? For example run ONNX model with ONNXRuntime (`polygraphy run <model.onnx> --onnxrt`): Yes.