Yet-Another-EfficientDet-Pytorch
FPS
Hi, first of all, thanks for your excellent work. I am using d0 as my pretrained model on my own dataset and got an ideal detection outcome on my test set:

```
Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.717
Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.909
Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.833
Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.000
Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.721
Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.714
Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.654
Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.784
Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.784
Average Recall     (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.000
Average Recall     (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.790
Average Recall     (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.778
```
But one problem is that the FPS is only around 12 (30000+ test images in my test set), which is far from the 36.20 FPS you post in this repo. I count one successful inference as feeding one image and getting the output, including pre- and post-processing. I am using a 2080, and I can provide the script I am using for inference if you want.
2080 Ti, or just a 2080? There is still a performance gap between them. My test didn't include image reading, but it can still do 30 FPS on COCO eval. It also depends on the original image shape: if it's big enough, resizing may take longer. The CPU-tensor-to-GPU-tensor copy takes time too. Can you provide your environment details, like system version, Python version, torch version, and hardware?
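If that copy turns out to matter, pinned (page-locked) host memory usually reduces its cost; a minimal sketch, where `framed_img` is a hypothetical name for a preprocessed HxWxC NumPy image:

```python
import torch

# Pinning the host tensor enables an asynchronous DMA copy; non_blocking=True
# then overlaps the transfer with work already queued on the GPU.
x = torch.from_numpy(framed_img).unsqueeze(0).permute(0, 3, 1, 2).float()
x = x.pin_memory().cuda(non_blocking=True)
```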
It is a 2080 Ti. Output of the `lscpu` command:

```
Architecture:          x86_64
CPU op-mode(s):        32-bit, 64-bit
Byte Order:            Little Endian
CPU(s):                80
On-line CPU(s) list:   0-79
Thread(s) per core:    2
Core(s) per socket:    20
Socket(s):             2
NUMA node(s):          2
Vendor ID:             GenuineIntel
CPU family:            6
Model:                 85
Model name:            Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz
Stepping:              7
CPU MHz:               2400.000
CPU max MHz:           2501.0000
CPU min MHz:           1000.0000
BogoMIPS:              5000.00
Virtualization:        VT-x
L1d cache:             32K
L1i cache:             32K
L2 cache:              1024K
L3 cache:              28160K
NUMA node0 CPU(s):     0-19,40-59
NUMA node1 CPU(s):     20-39,60-79
```
python==3.6.8, torch==1.4.0, torchvision==0.5.0
And I think most of the images in my dataset are smaller than 512×512, which is the input size you use to test FPS.
@zylo117 And this is my inference script:

```python
def test(self, save_name='result.json'):
    self.model = self.get_model()
    assert self.test_file is not None and self.test_file != '', 'A test file must be provided!'
    result = []
    self.coco = COCO(os.path.join(self.annot_dir, self.test_file))
    self.image_ids = self.coco.getImgIds()
    for image_id in tqdm(self.image_ids):
        image_info = self.coco.loadImgs(image_id)[0]
        image_path = os.path.join(self.img_dir, image_info['file_name'])
        one_pic_result = self.inference(image_path, image_id)
        print(one_pic_result)
        self.log.info('{}\n'.format({image_path: one_pic_result}))
        result.extend(one_pic_result)
    if self.save_file:
        save_name = self.save_file
    # use a context manager so the file is flushed and closed before COCOeval reads it
    with open(save_name, 'w', encoding='utf-8') as f:
        json.dump(result, f)
    coco_eval = COCOeval(self.coco, self.coco.loadRes(save_name), 'bbox')
    coco_eval.params.imgIds = self.image_ids
    coco_eval.evaluate()
    coco_eval.accumulate()
    coco_eval.summarize()
    if not self.save_file:
        os.remove(save_name)
'''
def test_batch(self):
    input_sizes = [512, 640, 768, 896, 1024, 1280, 1280, 1536]
    test_params = {'batch_size': self.batch_size,
                   'shuffle': False,
                   'drop_last': True,
                   'collate_fn': collater,
                   'num_workers': self.num_workers}
    test_set = EfficientdetDataset(self.img_dir,
                                   self.annot_dir,
                                   filename=self.test_file,
                                   transform=transforms.Compose([
                                       Normalizer(mean=self.mean, std=self.std),
                                       Resizer(input_sizes[self.compound_coef])]))
    test_generator = DataLoader(test_set, **test_params)
    for batch in tqdm(test_generator):
        pass
'''
def get_model(self):
    model = EfficientDetBackbone(compound_coef=self.compound_coef,
                                 num_classes=len(self.cls_list),
                                 ratios=eval(self.anchors_ratios),
                                 scales=eval(self.anchors_scales))
    model.load_state_dict(torch.load(self.model_path, map_location=torch.device('cpu')))
    model.requires_grad_(False)
    model.eval()
    if self.gpu_id:
        model.cuda()
    if self.use_float16:
        model.half()
    return model
def imread_binary(self, img_buffer, color=True):
    img = np.asarray(bytearray(img_buffer), dtype='uint8')
    img = cv2.imdecode(img, cv2.IMREAD_COLOR)
    if img is None:
        return None
    if len(img.shape) == 2:
        img = img[:, :, np.newaxis]
        if color:
            img = np.tile(img, (1, 1, 3))
    elif len(img.shape) == 3 and img.shape[2] == 4:
        img = img[:, :, :3]
    return img
def _preprocess(self, image_path, max_size=512, mean=(0.406, 0.456, 0.485), std=(0.225, 0.224, 0.229)):
    if isinstance(image_path, str):
        # an image file path is passed
        ori_imgs = [cv2.imread(image_path)]
    elif isinstance(image_path, bytes):
        ori_imgs = [self.imread_binary(image_path)]
    elif isinstance(image_path, np.ndarray):
        ori_imgs = [image_path]
    else:
        self.log.info('Parameter passed to inference must be one of [str:img_path, bytes, np.ndarray:read by opencv]')
        return None
    normalized_imgs = [(img / 255 - mean) / std for img in ori_imgs]
    imgs_meta = [aspectaware_resize_padding(img[..., ::-1], max_size, max_size,
                                            means=None) for img in normalized_imgs]
    framed_imgs = [img_meta[0] for img_meta in imgs_meta]
    framed_metas = [img_meta[1:] for img_meta in imgs_meta]
    return ori_imgs, framed_imgs, framed_metas
```
```python
def inference(self, img_path, image_id=''):
    result = []
    input_sizes = [512, 640, 768, 896, 1024, 1280, 1280, 1536]
    ori_imgs, framed_imgs, framed_metas = self._preprocess(img_path, max_size=input_sizes[self.compound_coef])
    x = torch.from_numpy(framed_imgs[0])
    if self.gpu_id:
        x = x.cuda()
        if self.use_float16:
            x = x.half()
        else:
            x = x.float()
    else:
        x = x.float()
    x = x.unsqueeze(0).permute(0, 3, 1, 2)
    features, regression, classification, anchors = self.model(x)
    preds = postprocess(x,
                        anchors, regression, classification,
                        BBoxTransform(), ClipBoxes(),
                        self.det_threshold, self.nms_threshold)
    if not preds:
        # return the empty list rather than None, so callers can extend() safely
        return result
    preds = invert_affine(framed_metas, preds)[0]
    # retrieve the top-{self.topk} bbox information
    scores = preds['scores'][:self.topk]
    class_ids = preds['class_ids'][:self.topk]
    rois = preds['rois'][:self.topk]
    if rois.shape[0] > 0:
        # x1,y1,x2,y2 -> x1,y1,w,h
        rois[:, 2] -= rois[:, 0]
        rois[:, 3] -= rois[:, 1]
        bbox_score = scores
        for roi_id in range(rois.shape[0]):
            score = float(bbox_score[roi_id])
            label = int(class_ids[roi_id])
            box = rois[roi_id, :]
            if score < self.det_threshold:
                break
            image_result = {
                'image_id': image_id,
                'category_id': label + 1,
                'score': round(float(score), 8),
                'bbox': list(map(int, box.tolist())),
            }
            if isinstance(img_path, str):
                image_result.update({'image_path': os.path.basename(img_path)})
            result.append(image_result)
    return result
```
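The commented-out `test_batch` above suggests a batched path was planned, and batch size 1 usually leaves the GPU underutilized. A minimal sketch of how batched inference could look, assuming `postprocess` accepts a batch the way it does for single images (`infer_batch` is a hypothetical helper):

```python
def infer_batch(self, framed_imgs):
    # Stack the preprocessed images into one NCHW batch and run a single forward pass.
    x = torch.from_numpy(np.stack(framed_imgs)).permute(0, 3, 1, 2).float()
    if self.gpu_id:
        x = x.cuda()
        if self.use_float16:
            x = x.half()
    with torch.no_grad():
        features, regression, classification, anchors = self.model(x)
    return postprocess(x, anchors, regression, classification,
                       BBoxTransform(), ClipBoxes(),
                       self.det_threshold, self.nms_threshold)
```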
Can you time I/O, preprocessing, inference, and postprocessing individually and find out the slowest part?
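A minimal sketch of how the stages could be timed separately, reusing `_preprocess`, the model forward, and `postprocess` from the script above (the `torch.cuda.synchronize()` calls matter, since CUDA work is asynchronous and would otherwise be billed to the wrong stage):

```python
import time
import torch

def time_stages(self, img_path):
    t0 = time.perf_counter()
    # I/O + resize + normalization
    ori_imgs, framed_imgs, framed_metas = self._preprocess(img_path)
    t1 = time.perf_counter()
    x = torch.from_numpy(framed_imgs[0]).unsqueeze(0).permute(0, 3, 1, 2).float().cuda()
    torch.cuda.synchronize()
    t2 = time.perf_counter()
    with torch.no_grad():
        # backbone + BiFPN + heads
        features, regression, classification, anchors = self.model(x)
    torch.cuda.synchronize()
    t3 = time.perf_counter()
    preds = postprocess(x, anchors, regression, classification,
                        BBoxTransform(), ClipBoxes(),
                        self.det_threshold, self.nms_threshold)
    torch.cuda.synchronize()
    t4 = time.perf_counter()
    print(f'preprocess {t1 - t0:.3f}s | to GPU {t2 - t1:.3f}s | '
          f'forward {t3 - t2:.3f}s | postprocess {t4 - t3:.3f}s')
```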
The slowest part is the backbone, which extracts the features.
@bloom1123 Yes, and I've just got 35.5 FPS at batch size 1.
Did you change any code? I don't know why my code is slow. I'm testing on my own dataset.