Inference time is slower than FP32 after quantization
```python
import time

import numpy as np
import torch
from torch.utils.data import DataLoader


def evaluate(self, session, samples, prefix='fp32'):
    data_set = CalibDataset()  # user-defined calibration dataset
    data_loader = DataLoader(dataset=data_set, batch_size=1)
    binding = session.io_binding()
    input_name = session.get_inputs()[0].name
    output_name = session.get_outputs()[0].name
    time_cnt = 0.0
    sample_cnt = 0
    for input_data, target_data in data_loader:
        input_data = input_data.contiguous().cuda()
        target_data = target_data.contiguous().cuda()
        # Pre-allocated GPU buffer that ONNX Runtime writes the output into.
        output0_tensor = torch.zeros_like(target_data).contiguous()
        binding.bind_input(
            name=input_name,
            device_type='cuda',
            device_id=0,
            element_type=np.float32,
            shape=tuple(input_data.shape),
            buffer_ptr=input_data.data_ptr(),
        )
        binding.bind_output(
            name=output_name,
            device_type='cuda',
            device_id=0,
            element_type=np.float32,
            shape=tuple(output0_tensor.shape),
            buffer_ptr=output0_tensor.data_ptr(),
        )
        begin_time = time.time()
        session.run_with_iobinding(binding)
        end_time = time.time()
        time_one = end_time - begin_time
        time_cnt += time_one
        sample_cnt += 1
        # print('%s: this sample consumed %.7f s...' % (prefix, time_one))
        if sample_cnt >= samples:  # was `>`, which timed samples + 1 runs
            break
    time_avg = time_cnt / sample_cnt  # divide by the number of runs actually timed
    print("%s: average time: %.7f" % (prefix, time_avg))
```
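One measurement caveat, independent of the quantization question: the first few runs of a session include one-time CUDA and ONNX Runtime initialization cost, which can inflate a per-sample average. Below is a minimal warm-up sketch, assuming `binding` has already been bound to valid input/output buffers as in the loop above, and that `WARMUP_RUNS` is a value you pick yourself:

```python
# Minimal warm-up sketch: run a few untimed inferences first so one-time
# CUDA/ORT initialization does not skew the measured average.
WARMUP_RUNS = 10  # assumed value; tune for your model

for _ in range(WARMUP_RUNS):
    session.run_with_iobinding(binding)
```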
The session here is `sim.session`, i.e. the session after quantization. Is the way I pass data into the quantized model correct? Looking forward very much to your reply.
Yes, that is the correct way to pass data in. To simulate quantization, AIMET adds quantization nodes to the model, and those extra nodes result in the slowdown you are seeing.
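If you want to confirm this on your side, you can inspect the op-type histogram of the quantized model's graph. This is a minimal sketch, assuming the simulation model has been saved to an ONNX file (the path below is hypothetical); the exact names of the inserted quantization ops (e.g. AIMET's `QcQuantizeOp`, or standard `QuantizeLinear`/`DequantizeLinear`) depend on the AIMET version and export path:

```python
import onnx
from collections import Counter

# Hypothetical path: wherever your quantization-simulation model was saved.
model = onnx.load("quantized_model.onnx")
op_counts = Counter(node.op_type for node in model.graph.node)

# Print the op-type histogram; the quantization nodes added on top of the
# original FP32 graph will show up here as extra entries.
for op_type, count in sorted(op_counts.items()):
    print(op_type, count)
```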