RT-DETR
RT-DETR copied to clipboard
pth2onnx 和 onnx2trt 都没有问题，但是 TRT 推理时 box 坐标出现特别大的值，标签和得分是正确的。
pth转换为onnx,用官方的代码转,并且测试是正确。但是onnx转换为trt,用的是tensorRT8.6.1,转换没有问题,但是推理出现box坐标是特别大的值,标签和得分是正确的。 D:\tool\TensorRT-8.6.1.6.Windows10.x86_64.cuda-11.8\TensorRT-8.6.1.6\bin\trtexec.exe --onnx=model.onnx --workspace=4096 --avgRuns=100 --shapes=images:1x3x640x640 --saveEngine=model.trt
我的C++推理代码如下
#include <iostream>
#include <fstream>
#include <sstream>
#include <string>
#include <vector>
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <ctime>

#include <cuda_runtime_api.h>
#include <NvInfer.h>
#include <opencv2/opencv.hpp>
#include <opencv2/core/utils/filesystem.hpp>

using namespace nvinfer1; using namespace std;
// Abort with a diagnostic when a CUDA runtime call fails.
// Fixes: a multi-line macro needs trailing backslashes (lost in the paste),
// and the predefined macros are __FILE__/__LINE__, not FILE/LINE.
// `call` is parenthesized so expressions with commas/operators expand safely.
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t status = (call);                                      \
        if (status != cudaSuccess) {                                      \
            fprintf(stderr, "CUDA Error in file '%s' in line %d: %s\n",   \
                    __FILE__, __LINE__, cudaGetErrorString(status));      \
            exit(EXIT_FAILURE);                                           \
        }                                                                 \
    } while (0)
class Logger : public ILogger { public: void log(Severity severity, const char* msg) noexcept override { if (severity != Severity::kINFO) std::cout << msg << std::endl; } } gLogger; class TensorRTInference { public: TensorRTInference(const std::string& enginePath); ~TensorRTInference(); void doInference(const std::vectorcv::String& image_paths);
private: IRuntime* runtime; ICudaEngine* engine_; IExecutionContext* context_; void* buffers_[5]; int inputIndex1_; int inputIndex2_; int outputIndex1_; int outputIndex2_; int outputIndex3_; int batchSize_; int inputSize1_; int inputSize2_; int outputSize1_; int outputSize2_; int outputSize3_;
void allocateBuffers();
void preprocess(const cv::Mat& image, float* buffer1, float* buffer2);
void postprocess(float* output1, float* output2, float* output3, cv::Mat& image);
};
int volume(const Dims& dims) { int vol = 1; for (int i = 0; i < dims.nbDims; ++i) { vol *= dims.d[i]; } return vol; } TensorRTInference::TensorRTInference(const std::string& enginePath) : engine_(nullptr), context_(nullptr) { std::ifstream engineFile(enginePath, std::ios::binary); if (!engineFile.good()) { std::cerr << "Error opening engine file: " << enginePath << std::endl; return; }
std::stringstream engineStream;
engineStream << engineFile.rdbuf();
engineFile.close();
runtime = createInferRuntime(gLogger);
if (!runtime) {
std::cerr << "Error creating InferRuntime" << std::endl;
return;
}
engine_ = runtime->deserializeCudaEngine(engineStream.str().data(), engineStream.str().size(), nullptr);
if (engine_ == nullptr) {
std::cerr << "Error deserializing the engine file: " << enginePath << std::endl;
return;
}
context_ = engine_->createExecutionContext();
if (!context_) {
std::cerr << "Error creating ExecutionContext" << std::endl;
return;
}
inputIndex1_ = engine_->getBindingIndex("images");
inputIndex2_ = engine_->getBindingIndex("orig_target_sizes");
outputIndex1_ = engine_->getBindingIndex("labels");
outputIndex2_ = engine_->getBindingIndex("boxes");
outputIndex3_ = engine_->getBindingIndex("scores");
// Get input sizes
const Dims& inputDims1 = engine_->getBindingDimensions(inputIndex1_);
const Dims& inputDims2 = engine_->getBindingDimensions(inputIndex2_);
for (int i = 0; i < inputDims2.nbDims; ++i) {
std::cout << "inputDims2[" << i << "]: " << inputDims2.d[i] << std::endl;
}
const Dims& outputDims1 = engine_->getBindingDimensions(outputIndex1_);
const Dims& outputDims2 = engine_->getBindingDimensions(outputIndex2_);
const Dims& outputDims3 = engine_->getBindingDimensions(outputIndex3_);
batchSize_ = 1;
inputSize1_ = volume(inputDims1) * batchSize_ * sizeof(float);
inputSize2_ = volume(inputDims2) * sizeof(float);
outputSize1_ = volume(outputDims1) * sizeof(float);
outputSize2_ = volume(outputDims2) * sizeof(float);
outputSize3_ = volume(outputDims3) * sizeof(float);
cout << inputSize2_ << endl;
allocateBuffers();
}
std::vector
if (image.channels() != 3) {
std::cerr << "Error: Image must have 3 channels" << std::endl;
return {};
}
image.convertTo(image, CV_32FC3, 1.0f / 255.0f);
// Convert image to tensor
std::vector<cv::Mat> channels(3);
cv::split(image, channels);
std::vector<float> tensor(image.total() * image.channels());
int index = 0;
for (int c = 0; c < 3; ++c) {
for (int i = 0; i < channels[c].rows; ++i) {
for (int j = 0; j < channels[c].cols; ++j) {
tensor[index++] = channels[c].at<float>(i, j);
}
}
}
return tensor;
}
std::vector
if (image.channels() != 3) {
std::cerr << "Error: Image must have 3 channels" << std::endl;
return {};
}
cv::resize(image, image, cv::Size(640.f, 640.f));
image.convertTo(image, CV_32FC3, 1.0f / 255.0f);
const float mean[3] = { 0.485f, 0.456f, 0.406f };
const float std[3] = { 0.229f, 0.224f, 0.225f };
std::vector<cv::Mat> channels(3);
cv::split(image, channels);
std::vector<float> input_tensor;
input_tensor.reserve(640 * 640 * 3);
for (int c = 0; c < 3; ++c) {
for (int i = 0; i < channels[c].rows; ++i) {
for (int j = 0; j < channels[c].cols; ++j) {
float pixel = (channels[c].at<float>(i, j) - mean[c]) / std[c];
input_tensor.push_back(pixel);
}
}
}
return input_tensor;
}
// Tear down TensorRT objects (destroy() is the TRT 8.x API) then free the
// device buffers. NOTE(review): if the constructor returned early, `runtime`
// and `buffers_` may never have been assigned — verify those members are
// null-initialized before relying on these null checks.
TensorRTInference::~TensorRTInference() { if (context_) { context_->destroy(); } if (engine_) { engine_->destroy(); } if (runtime) { runtime->destroy(); } for (int i = 0; i < 5; ++i) { if (buffers_[i]) cudaFree(buffers_[i]); } }
// Fill the two host input buffers: buffer1 gets the normalized CHW image
// tensor, buffer2 gets the ORIGINAL image size (w, h) that RT-DETR uses to
// rescale its normalized boxes back to pixels.
//
// ROOT-CAUSE FIX for the "huge box coordinates": the ONNX input
// "orig_target_sizes" is an integer tensor (int64 in the export; TensorRT
// 8.x parses it as kINT32). The original code memcpy'd FLOAT bit patterns
// into it, so the model multiplied its normalized boxes by garbage integers
// — labels and scores were unaffected, exactly the symptom reported.
void TensorRTInference::preprocess(const cv::Mat& image, float* buffer1, float* buffer2) {
    cv::Mat rgb_image;
    if (image.channels() == 1) {
        cv::cvtColor(image, rgb_image, cv::COLOR_GRAY2RGB);
    }
    else if (image.channels() == 4) {
        cv::cvtColor(image, rgb_image, cv::COLOR_BGRA2RGB);
    }
    else if (image.channels() == 3) {
        cv::cvtColor(image, rgb_image, cv::COLOR_BGR2RGB);
    }
    else {
        rgb_image = image;
    }
    std::vector<float> tensor1 = ToTensorAndNormalize(rgb_image);
    // Integer (w, h) of the ORIGINAL image, written with integer bit
    // patterns. int32 matches what TensorRT 8.x gives an ONNX int64 input;
    // sizeof(orig_target_sizes) == 8 bytes, the same as the old float[2].
    int32_t orig_target_sizes[2] = { image.cols, image.rows };
    std::cout << "Original target sizes: " << orig_target_sizes[0] << ", " << orig_target_sizes[1] << std::endl;
    std::cout << "Input tensor size: " << tensor1.size() << std::endl;
    std::memcpy(buffer1, tensor1.data(), inputSize1_);
    // Never copy more than the staging array holds, whatever the binding size.
    std::memcpy(buffer2, orig_target_sizes,
                std::min(static_cast<size_t>(inputSize2_), sizeof(orig_target_sizes)));
}
void TensorRTInference::postprocess(float* output1, float* output2, float* output3, cv::Mat& image) { float numDetections = outputSize2_ / (4 * sizeof(float)); float confThreshold = 0.5f;
std::cout << "Box " << (output2 + 1 * 4)[0] << std::endl;
for (int i = 0; i < numDetections; ++i) {
float* bbox = output2 + i * 4;
float labels = output1[i];
int x1 = static_cast<float>(bbox[0]);
int y1 = static_cast<float>(bbox[1]);
int x2 = static_cast<float>(bbox[2]);
int y2 = static_cast<float>(bbox[3]);
// Draw bounding box
cv::rectangle(image, cv::Point(x1, y1), cv::Point(x2, y2), cv::Scalar(0, 255, 0), 2);
std::string label = "label: " + std::to_string(labels);
cv::putText(image, label, cv::Point(x1, y1 - 5), cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 255, 0), 1);
}
float* additionalData = output3;
std::cout << "First values of output3:" << std::endl;
for (int i = 0; i < 50; ++i) {
std::cout << additionalData[i] << " ";
}
std::cout << std::endl;
}
// Return `path` with the FIRST occurrence of `oldFolder` replaced by
// `newFolder`; the path comes back unchanged when `oldFolder` is absent.
std::string replaceFolderName(const std::string& path, const std::string& oldFolder, const std::string& newFolder) {
    std::string result = path;
    const size_t pos = result.find(oldFolder);
    if (pos == std::string::npos) {
        return result;
    }
    result.replace(pos, oldFolder.length(), newFolder);
    return result;
}
void softmax(float* output, std::vector
float* outputBuffer1 = new float[outputSize1_ / sizeof(float)];
float* outputBuffer2 = new float[outputSize2_ / sizeof(float)];
float* outputBuffer3 = new float[outputSize3_ / sizeof(float)];
for (const auto& filename : image_paths) {
std::cout << "Processing image: " << filename << std::endl;
clock_t start = clock();
cv::Mat image = cv::imread(filename);
int height = image.rows;
int width = image.cols;
int channels = image.channels();
// Print the shape of the image
std::cout << "Image shape: (" << height << ", " << width << ", " << channels << ")" << std::endl;
if (image.empty()) {
std::cerr << "Error loading image: " << filename << std::endl;
continue;
}
preprocess(image, inputBuffer1, inputBuffer2);
clock_t gpuStart = clock();
CUDA_CHECK(cudaMemcpy(buffers_[inputIndex1_], inputBuffer1, inputSize1_, cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(buffers_[inputIndex2_], inputBuffer2, inputSize2_, cudaMemcpyHostToDevice));
bool success = context_->executeV2(buffers_);
if (!success) {
std::cerr << "TensorRT execution failed." << std::endl;
continue;
}
CUDA_CHECK(cudaMemcpy(outputBuffer1, buffers_[outputIndex1_], outputSize1_, cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaMemcpy(outputBuffer2, buffers_[outputIndex2_], outputSize2_, cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaMemcpy(outputBuffer3, buffers_[outputIndex3_], outputSize3_, cudaMemcpyDeviceToHost));
clock_t gpuEnd = clock();
std::cout << "GPU inference time: " << (gpuEnd - gpuStart) / (double)CLOCKS_PER_SEC << " seconds." << std::endl;
postprocess(outputBuffer1, outputBuffer2, outputBuffer3, image);
std::cout << "First values of outputBuffer1:" << std::endl;
for (int i = 0; i < 10; ++i) {
std::cout << outputBuffer2[i] << " ";
}
std::cout << std::endl;
std::string output_path = replaceFolderName(filename, "debug", "debug_out");
cv::imwrite(output_path, image);
}
delete[] inputBuffer1;
delete[] inputBuffer2;
delete[] outputBuffer1;
delete[] outputBuffer2;
delete[] outputBuffer3;
}
int main(int argc, char** argv) { try { cudaSetDevice(0); TensorRTInference inference("D:\tool\TensorRT-8.6.1.6.Windows10.x86_64.cuda-11.8\TensorRT-8.6.1.6\bin\cfg_model.trt"); std::string img_dir = "E:\YOLOv8-main\val2017\debug"; std::vectorcv::String image_paths; cv::utils::fs::glob(img_dir, "*.png", image_paths); inference.doInference(image_paths); } catch (const std::exception& e) { std::cerr << "Exception: " << e.what() << std::endl; return EXIT_FAILURE; } return EXIT_SUCCESS; } 能帮忙检查一下吗