tensorrtx
Real-ESRGAN X2 Model Conversion
Env
- GPU : RTX 3090
- OS : Ubuntu 20.04
- CUDA version : 11.4
- TensorRT version : 22
About this repo
- Branch : master
- Model : real-esrgan
Your problem
I have successfully converted the X4 model (outscale 4) to TensorRT, and I have seen good improvements in overall inference speed and memory footprint. But I want to convert RealESRGAN_x2plus to TensorRT with outscale 2. The x2plus model applies a pixel unshuffle before the first convolution, which seems very difficult for me to implement in C++.
Please let me know how I can convert the X2 model to TensorRT, or please add support for X2 conversion in your repository.
I am looking forward to hearing from you. Thanks.
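For reference, the operation that has to be reproduced in the TensorRT network is the pixel unshuffle that RealESRGAN_x2plus applies before conv_first: every 2x2 spatial block is folded into the channel dimension, so a (C, H, W) input becomes (C * 4, H / 2, W / 2) and the body of the network runs at half resolution. A minimal PyTorch sketch of the behavior (Real-ESRGAN uses its own helper, which should match this numerically):

import torch
import torch.nn.functional as F

# Illustrative input: 1 image, 3 channels, 4x4 pixels.
x = torch.arange(1 * 3 * 4 * 4, dtype=torch.float32).reshape(1, 3, 4, 4)

# Fold every 2x2 spatial block into the channel dimension:
# (N, C, H, W) -> (N, C * 4, H / 2, W / 2).
y = F.pixel_unshuffle(x, downscale_factor=2)
print(y.shape)   # torch.Size([1, 12, 2, 2])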
@yester31 Can you please suggest?
I added the code below (pixel unshuffle) before conv_first and used the RealESRGAN_x2plus weights:

if (OUT_SCALE == 2) {
    // Pixel unshuffle.
    int h = INPUT_H / OUT_SCALE;
    int w = INPUT_W / OUT_SCALE;
    auto attn_shuffle = network->addShuffle(*prep);
    Dims shape_dims;
    std::vector<int> reshape_dims = { INPUT_C, h, OUT_SCALE, w, OUT_SCALE };
    shape_dims.nbDims = (int)reshape_dims.size();
    memcpy(shape_dims.d, reshape_dims.data(), reshape_dims.size() * sizeof(int));
    attn_shuffle->setReshapeDimensions(shape_dims);
    std::vector<int> trans_dims{ 0, 2, 4, 1, 3 };
    Permutation f_trans_dims;
    memcpy(f_trans_dims.order, trans_dims.data(), trans_dims.size() * sizeof(int));
    attn_shuffle->setSecondTranspose(f_trans_dims);
    auto attn_shuffle2 = network->addShuffle(*attn_shuffle->getOutput(0));
    attn_shuffle2->setReshapeDimensions(Dims3{ INPUT_C * OUT_SCALE * OUT_SCALE, h, w });
    prep = attn_shuffle2->getOutput(0);
}
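For anyone checking the layer logic: the two shuffle layers above reshape (C, H, W) to (C, h, 2, w, 2), transpose with permutation (0, 2, 4, 1, 3), and reshape to (C * 4, h, w), which is exactly pixel unshuffle with factor 2. A quick NumPy sketch of the same index arithmetic (the sizes here are illustrative only):

import numpy as np

C, H, W, r = 3, 4, 4, 2      # illustrative sizes; r is the unshuffle factor
h, w = H // r, W // r

x = np.arange(C * H * W, dtype=np.float32).reshape(C, H, W)

# First shuffle layer: reshape to (C, h, r, w, r), then transpose (0, 2, 4, 1, 3).
t = x.reshape(C, h, r, w, r).transpose(0, 2, 4, 1, 3)

# Second shuffle layer: collapse the leading axes into C * r * r channels.
y = t.reshape(C * r * r, h, w)

print(y.shape)                               # (12, 2, 2)
# Each output channel holds one offset of the original 2x2 pixel grid:
print(np.allclose(y[0], x[0, 0::2, 0::2]))   # True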
This solved the issue. Thank you so much.
Hello, I have been successful with the x2 conversion. But I want to run inference with the engine in Python, and the results the engine gives are very odd.
This is the script I am using for inference.
import os
import time
import cv2
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
import tensorrt as trt
from PIL import Image


class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()


def load_engine(trt_runtime, engine_path):
    with open(engine_path, "rb") as f:
        engine_data = f.read()
    engine = trt_runtime.deserialize_cuda_engine(engine_data)
    return engine


def allocate_buffers(engine, batch_size=1):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream


def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    # Transfer input data to the GPU.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference.
    context.execute_async(
        batch_size=1, bindings=bindings, stream_handle=stream.handle
    )
    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Synchronize the stream
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]


def preprocess_image(input_image_path):
    image_raw = cv2.imread(input_image_path)
    return image_raw


def process_image(arr, w, h):
    image = Image.fromarray(np.uint8(arr))
    image_resized = image.resize(size=(w, h), resample=Image.BILINEAR)
    img_np = np.array(image_resized)
    # HWC -> CHW
    img_np = img_np.transpose((2, 0, 1))
    # Normalize to [0.0, 1.0] interval (expected by model)
    img_np = (1.0 / 255.0) * img_np
    print(img_np.shape)
    img_np = img_np.ravel()
    return img_np


def predict(image):
    img = preprocess_image(image)
    print(img.shape)
    np.copyto(inputs[0].host, img.ravel())
    inference_start_time = time.time()
    # Fetch output from the model
    output = do_inference(
        context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream
    )
    # Output inference time
    print(output)
    # And return results
    return output


# -------------- MODEL PARAMETERS FOR DETECTNET_V2 --------------------------------
model_h = 1536
model_w = 1536

import ctypes
PLUGIN_LIBRARY = "build/libmyplugins.so"
ctypes.CDLL(PLUGIN_LIBRARY)

# TensorRT logger singleton
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

trt_engine_path = 'build/real-esrgan_f32.engine'
trt_runtime = trt.Runtime(TRT_LOGGER)
trt_engine = load_engine(trt_runtime, trt_engine_path)
# This allocates memory for network inputs/outputs on both CPU and GPU
inputs, outputs, bindings, stream = allocate_buffers(trt_engine)
# Execution context is needed for inference
context = trt_engine.create_execution_context()

output = predict('image.jpg')[0]
output = output.reshape(3072, 3072, 3)
print(output.shape)
print(output.dtype)
cv2.imwrite('output.jpg', output)
But the results look very weird (sample attached below).
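One likely cause of the odd output is a pre/post-processing mismatch rather than the engine itself: predict() copies the raw uint8 BGR image from cv2.imread straight into the float32 input buffer (process_image is defined but never called), and the output buffer, which comes back as flat CHW floats, is reshaped as if it were an HWC image. A hedged sketch of how the two ends of the pipeline could look, reusing the buffers and do_inference helper from the script above; the exact channel order (BGR vs. RGB) and output scaling depend on how the engine's C++ preprocessing was defined:

import cv2
import numpy as np

# Assumed build-time dimensions; must match the INPUT_H / INPUT_W the engine
# was serialized with (1536 here, giving a 3072x3072 output at x2).
model_h, model_w = 1536, 1536
out_h, out_w = model_h * 2, model_w * 2

# --- Input: flatten a normalized float32 CHW tensor into the host buffer. ---
img = cv2.imread('image.jpg')                          # HWC, BGR, uint8
img = cv2.resize(img, (model_w, model_h))
img = img.astype(np.float32) / 255.0                   # [0, 255] -> [0, 1]
np.copyto(inputs[0].host, img.transpose(2, 0, 1).ravel())

output = do_inference(context, bindings=bindings, inputs=inputs,
                      outputs=outputs, stream=stream)[0]

# --- Output: interpret the flat buffer as CHW, not HWC. ---
out = output.reshape(3, out_h, out_w).transpose(1, 2, 0)    # CHW -> HWC
# If the engine already outputs values in [0, 255], drop the scaling here.
out = np.clip(out * 255.0, 0, 255).astype(np.uint8)
cv2.imwrite('output.jpg', out)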

@wang-xinyu @yester31 Can you please help
@yester31 @wang-xinyu Could you please help?
I met the same problem (the results are odd). Have you solved it?
Do we support X2 in this repo? I thought we only supported this one: https://github.com/xinntao/Real-ESRGAN/releases/download/v0.1.0/RealESRGAN_x4plus.pth
The implementation was officially made for the x4 model version. Later, on request, how to use the x2 model version was explained in a previous comment: https://github.com/wang-xinyu/tensorrtx/issues/1085#issuecomment-1229422132.
This issue has been automatically marked as stale because it has not had recent activity. It will be closed if no further activity occurs. Thank you for your contributions.