models
models copied to clipboard
face detection耗时太多
face detection 里面的模块在 inference 时耗时太多,如下图(单位为秒)。有什么可以提升速度的建议吗?
@burness 想了解一下,您是如何预测的?使用的是 CPU 还是 GPU?使用的是如下代码吗?
https://github.com/PaddlePaddle/models/blob/82fa5276d552026ddb636fb5b3ea17ec6ef0070f/fluid/PaddleCV/face_detection/widerface_eval.py#L39-L55
这里提供的脚本face_detection/widerface_eval.py
采用的是multi-scale方式infer的,实际使用可以不用multi-scale,比如只输入原图即可。
如果用的不是上面的脚本,也麻烦说明一下是如何预测的。PyramidBox 这个模型的 backbone 是 VGG,再加上后面的 FPN、CMP 等模块,确实比较耗时。
是在这个脚本的基础上改的,我在 P40 上做 inference,代码如下:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import time
import numpy as np
import argparse
import functools
from PIL import Image
import paddle.fluid as fluid
import reader
from pyramidbox import PyramidBox
from visualize import draw_bboxes
from utility import add_arguments, print_arguments
import time
# Command-line interface. Every option is registered through ``add_arg``,
# which is the project helper ``add_arguments`` with ``argparser`` pre-bound
# to ``parser``; the positional arguments are (name, type, default, help).
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
import cv2
# yapf: disable
add_arg('use_gpu', bool, True, "Whether use GPU or not.")
add_arg('use_pyramidbox', bool, True, "Whether use PyramidBox model.")
add_arg('data_dir', str, 'data/WIDER_val/images/', "The validation dataset path.")
add_arg('model_dir', str, '', "The model path.")
add_arg('pred_dir', str, 'pred', "The path to save the evaluation results.")
# The help text used to be a copy of the one for data_dir; this option is the
# file list handed to reader.test(), not the image directory.
add_arg('file_list', str, 'data/wider_face_split/wider_face_val_bbx_gt.txt', "The validation file list path.")
add_arg('infer', bool, False, "Whether do infer or eval.")
add_arg('confs_threshold', float, 0.15, "Confidence threshold to draw bbox.")
# Options below were added for the video mode of infer().
add_arg('is_video', bool, False, "Whether is for video infer")
add_arg('video_path', str, '', "face detection video")
add_arg('result_dir', str, '', "detected face save dir")
# yapf: enable
def _detect_multi_scale(image):
    """Run every test-time pass on one PIL image and fuse the detections.

    Calls, in order, detect_face, flip_test, multi_scale_test and
    multi_scale_test_pyramid, stacks their outputs and merges them with
    bbox_vote.

    Args:
        image: PIL image; ``image.size`` is read as (width, height).

    Returns:
        The array produced by bbox_vote, one row per face with columns
        (xmin, ymin, xmax, ymax, score).
    """
    shrink, max_shrink = get_shrink(image.size[1], image.size[0])
    det0 = detect_face(image, shrink)
    det1 = flip_test(image, shrink)
    det2, det3 = multi_scale_test(image, max_shrink)
    det4 = multi_scale_test_pyramid(image, max_shrink)
    # np.vstack is the function np.row_stack is an alias of.
    det = np.vstack((det0, det1, det2, det3, det4))
    return bbox_vote(det)


def _infer_video(args):
    """Detect faces frame by frame in ``args.video_path`` and save the crops.

    Every detection whose score is at least ``args.confs_threshold`` is
    cropped, resized to 64x64 and written to
    ``./<args.result_dir>/<i>_<frame_num>.jpg``.

    Raises:
        ValueError: if the video cannot be opened.
    """
    cap = cv2.VideoCapture(args.video_path)
    if not cap.isOpened():
        raise ValueError("The video [%s] can not be opened." %
                         (args.video_path))

    out_dir = "./{0}/".format(args.result_dir)
    # Saving a crop fails if the target directory is missing.
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    frame_num = 0
    all_time_start = time.time()
    try:
        while True:
            # Read inside the loop only: an extra read before the loop used
            # to throw the first frame away.
            ret, frame = cap.read()
            if not ret:
                # End of stream (or read failure); frame is not usable.
                break
            print("processing {0}th frame".format(frame_num))
            each_process_start = time.time()
            frame_num += 1
            # OpenCV delivers BGR, PIL expects RGB.
            cv2_im = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            image = Image.fromarray(cv2_im)
            if image.mode == 'L':
                image = image.convert('RGB')
            dets = _detect_multi_scale(image)
            keep_index = np.where(dets[:, 4] >= args.confs_threshold)[0]
            dets = dets[keep_index, :]
            print("dets: {0}".format(dets))
            for i, det in enumerate(dets):
                crop_img = image.crop(det[0:4])
                # Image.LANCZOS is the filter formerly named ANTIALIAS.
                crop_img = crop_img.resize((64, 64), Image.LANCZOS)
                crop_path = os.path.join(
                    out_dir, str(i) + "_" + str(frame_num) + ".jpg")
                crop_img.save(crop_path)
                print("save to {0} successfully".format(crop_path))
            print("process {0}th img, take time: {1}".format(
                frame_num, time.time() - each_process_start))
    finally:
        cap.release()
    print("all time detection faces takes {0} seconds".format(
        time.time() - all_time_start))


def infer(args, config):
    """Run video inference or WIDER FACE evaluation, depending on ``args``.

    * ``args.infer`` and ``args.is_video``: process ``args.video_path`` and
      save face crops under ``args.result_dir``.
    * ``args.infer`` without ``args.is_video``: nothing is done.
    * otherwise: evaluate every image yielded by
      ``reader.test(config, args.file_list)`` and write the detections to
      ``args.pred_dir`` via save_widerface_bboxes.

    Args:
        args: parsed command-line arguments.
        config: reader settings object passed through to ``reader.test``.

    Raises:
        ValueError: if ``args.model_dir`` does not exist, or the video cannot
            be opened.
    """
    model_dir = args.model_dir
    pred_dir = args.pred_dir
    print("enter infer")
    if not os.path.exists(model_dir):
        raise ValueError("The model path [%s] does not exist." % (model_dir))

    if args.infer:
        if args.is_video:
            _infer_video(args)
        # NOTE: the single-image and image-directory inference modes were
        # commented out in this script, so non-video inference is a no-op.
        return

    test_reader = reader.test(config, args.file_list)
    for image, image_path in test_reader():
        dets = _detect_multi_scale(image)
        save_widerface_bboxes(image_path, dets, pred_dir)
    print("Finish evaluation.")
def save_widerface_bboxes(image_path, bboxes_scores, output_dir):
    """
    Save predicted results, including bbox and score into text file.

    The result is written to ``<output_dir>/<image_class>/<image_stem>.txt``
    where ``image_class`` and the file name are the last two '/'-separated
    components of ``image_path``. The file holds the line
    ``<image_class>/<image_name>``, then the number of boxes, then one
    ``xmin ymin width height score`` line per box.

    Args:
        image_path (string): file name.
        bboxes_scores (np.array|list): the predicted bboxed and scores, layout
            is (xmin, ymin, xmax, ymax, score)
        output_dir (string): output directory.
    """
    image_name = image_path.split('/')[-1]
    image_class = image_path.split('/')[-2]
    odir = os.path.join(output_dir, image_class)
    if not os.path.exists(odir):
        os.makedirs(odir)

    # splitext instead of [:-4] so extensions of any length are handled.
    image_stem = os.path.splitext(image_name)[0]
    ofname = os.path.join(odir, '%s.txt' % (image_stem))
    # The context manager closes the file even if a row fails to format.
    with open(ofname, 'w') as f:
        f.write('{:s}\n'.format(image_class + '/' + image_name))
        # len() works for both arrays and plain lists.
        f.write('{:d}\n'.format(len(bboxes_scores)))
        for xmin, ymin, xmax, ymax, score in bboxes_scores:
            f.write('{:.1f} {:.1f} {:.1f} {:.1f} {:.3f}\n'.format(
                xmin, ymin, (xmax - xmin + 1), (ymax - ymin + 1), score))
    print("The predicted result is saved as {}".format(ofname))
def detect_face(image, shrink):
    """Run the network once on ``image`` rescaled by ``shrink``.

    Relies on the module-level ``exe``, ``infer_program`` and ``fetches``
    that the ``__main__`` section creates.

    Args:
        image: PIL image; ``image.size`` is read as (width, height).
        shrink (float): resize factor applied to both sides before inference;
            1 means the image is fed unchanged.

    Returns:
        np.ndarray: one row per detection with columns
        (xmin, ymin, xmax, ymax, score), divided by ``shrink`` so the
        coordinates refer to the input image. A single all-zero row is
        returned when nothing is detected.
    """
    image_shape = [3, image.size[1], image.size[0]]
    if shrink != 1:
        h, w = int(image_shape[1] * shrink), int(image_shape[2] * shrink)
        # Image.LANCZOS is the same filter as the old Image.ANTIALIAS name,
        # which newer Pillow releases no longer provide.
        image = image.resize((w, h), Image.LANCZOS)
        image_shape = [3, h, w]

    img = np.array(image)
    img = reader.to_chw_bgr(img)
    # Per-channel offsets subtracted after to_chw_bgr (presumably the BGR
    # training means -- TODO confirm); scale is approximately 1 / 127.5.
    mean = [104., 117., 123.]
    scale = 0.007843
    img = img.astype('float32')
    img -= np.array(mean)[:, np.newaxis, np.newaxis].astype('float32')
    img = img * scale
    # Add the leading batch dimension.
    img = [img]
    img = np.array(img)
    detection, = exe.run(infer_program,
                         feed={'image': img},
                         fetch_list=fetches,
                         return_numpy=False)
    detection = np.array(detection)
    # layout: xmin, ymin, xmax. ymax, score
    if np.prod(detection.shape) == 1:
        print("No face detected")
        return np.array([[0, 0, 0, 0, 0]])
    # Column 1 is the score and columns 2..5 the box; column 0 is unused here
    # (presumably the class label -- TODO confirm). The box values are scaled
    # by the resized width/height, so they are presumably normalized.
    det_conf = detection[:, 1]
    det_xmin = image_shape[2] * detection[:, 2] / shrink
    det_ymin = image_shape[1] * detection[:, 3] / shrink
    det_xmax = image_shape[2] * detection[:, 4] / shrink
    det_ymax = image_shape[1] * detection[:, 5] / shrink

    det = np.column_stack((det_xmin, det_ymin, det_xmax, det_ymax, det_conf))
    return det
def bbox_vote(det):
    """Merge overlapping detections by score-weighted box voting.

    Boxes are processed in descending score order. The best remaining box and
    every box whose IoU with it is at least 0.3 form a cluster; the cluster is
    replaced by the score-weighted average of its coordinates and keeps the
    highest score. A cluster with a single member is discarded unless it is
    the last box left.

    Args:
        det (np.ndarray): array of shape (N, 5) with columns
            (xmin, ymin, xmax, ymax, score).

    Returns:
        np.ndarray: at most 750 merged rows in the same column layout. For an
        empty input the placeholder row ``[10, 10, 20, 20, 0.002]`` is
        returned.
    """
    order = det[:, 4].ravel().argsort()[::-1]
    det = det[order, :]
    if det.shape[0] == 0:
        return np.array([[10, 10, 20, 20, 0.002]])

    # Collected in a list instead of relying on a bare ``except`` around a
    # not-yet-assigned accumulator.
    voted = []
    while det.shape[0] > 0:
        # IOU of the best box (row 0) against every remaining box
        area = (det[:, 2] - det[:, 0] + 1) * (det[:, 3] - det[:, 1] + 1)
        xx1 = np.maximum(det[0, 0], det[:, 0])
        yy1 = np.maximum(det[0, 1], det[:, 1])
        xx2 = np.minimum(det[0, 2], det[:, 2])
        yy2 = np.minimum(det[0, 3], det[:, 3])
        w = np.maximum(0.0, xx2 - xx1 + 1)
        h = np.maximum(0.0, yy2 - yy1 + 1)
        inter = w * h
        o = inter / (area[0] + area[:] - inter)

        # nms
        merge_index = np.where(o >= 0.3)[0]
        if merge_index.shape[0] == 0:
            # A degenerate best box (xmax < xmin or ymax < ymin) does not even
            # overlap itself; consume it so the loop always makes progress.
            merge_index = np.array([0])
        det_accu = det[merge_index, :]
        det = np.delete(det, merge_index, 0)

        if merge_index.shape[0] <= 1:
            # Lone detections are dropped, except the very last one.
            if det.shape[0] == 0:
                voted.append(det_accu)
            continue

        scores = det_accu[:, -1:]
        score_sum = np.sum(scores)
        det_accu_sum = np.zeros((1, 5))
        if score_sum > 0:
            det_accu_sum[:, 0:4] = np.sum(det_accu[:, 0:4] * scores,
                                          axis=0) / score_sum
        else:
            # All scores are zero (e.g. stacked "no face" placeholder rows):
            # use the plain mean instead of producing 0/0 = NaN.
            det_accu_sum[:, 0:4] = np.mean(det_accu[:, 0:4], axis=0)
        det_accu_sum[:, 4] = np.max(det_accu[:, 4])
        voted.append(det_accu_sum)

    dets = np.vstack(voted)
    dets = dets[0:750, :]
    return dets
def flip_test(image, shrink):
    """Detect faces on the horizontally mirrored image.

    The mirrored image is passed to detect_face and the resulting boxes are
    mirrored back, so the returned coordinates refer to ``image`` itself.

    Args:
        image: PIL image; ``image.size`` is (width, height).
        shrink (float): resize factor forwarded to detect_face.

    Returns:
        np.ndarray: array with the shape of the detect_face result and
        columns (xmin, ymin, xmax, ymax, score).
    """
    mirrored = image.transpose(Image.FLIP_LEFT_RIGHT)
    flipped_dets = detect_face(mirrored, shrink)

    # image.size: [width, height]
    width = image.size[0]
    restored = np.zeros(flipped_dets.shape)
    # Mirroring swaps the roles of xmin and xmax.
    restored[:, [0, 2]] = width - flipped_dets[:, [2, 0]]
    # ymin, ymax and score are unaffected by a horizontal flip.
    restored[:, [1, 3, 4]] = flipped_dets[:, [1, 3, 4]]
    return restored
def multi_scale_test(image, max_shrink):
    """Detect faces on one shrunk and one or more enlarged copies of ``image``.

    Args:
        image: PIL image forwarded to detect_face.
        max_shrink (float): largest resize factor that may be used.

    Returns:
        tuple: ``(det_s, det_b)``. ``det_s`` holds the detections of the
        shrunk pass whose longer side exceeds 30; ``det_b`` holds the
        detections of the enlarged pass(es), filtered by size.
    """

    def longer_side(boxes):
        return np.maximum(boxes[:, 2] - boxes[:, 0] + 1,
                          boxes[:, 3] - boxes[:, 1] + 1)

    def shorter_side(boxes):
        return np.minimum(boxes[:, 2] - boxes[:, 0] + 1,
                          boxes[:, 3] - boxes[:, 1] + 1)

    # Shrink detecting is only used to detect big faces
    if max_shrink >= 0.75:
        st = 0.5
    else:
        st = 0.5 * max_shrink
    det_s = detect_face(image, st)
    det_s = det_s[np.where(longer_side(det_s) > 30)[0], :]

    # Enlarge one times
    if max_shrink > 1:
        bt = min(2, max_shrink)
    else:
        bt = (st + max_shrink) / 2
    enlarged = [detect_face(image, bt)]

    # Enlarge small image x times for small faces
    if max_shrink > 2:
        bt *= 2
        while bt < max_shrink:
            enlarged.append(detect_face(image, bt))
            bt *= 2
        enlarged.append(detect_face(image, max_shrink))
    det_b = enlarged[0] if len(enlarged) == 1 else np.vstack(enlarged)

    if bt > 1:
        # Enlarged images are only used to detect small faces.
        keep = np.where(shorter_side(det_b) < 100)[0]
    else:
        # Shrinked images are only used to detect big faces.
        keep = np.where(longer_side(det_b) > 30)[0]
    return det_s, det_b[keep, :]
def multi_scale_test_pyramid(image, max_shrink):
    """Detect faces over a fixed pyramid of resize factors.

    A pass at scale 0.25 is always run; the scales 0.75, 1.25, 1.5 and 1.75
    are run only when they do not exceed ``max_shrink``.

    Args:
        image: PIL image forwarded to detect_face.
        max_shrink (float): largest resize factor that may be used.

    Returns:
        np.ndarray: the size-filtered detections of all passes stacked
        row-wise.
    """

    def box_sides(boxes):
        return (boxes[:, 2] - boxes[:, 0] + 1, boxes[:, 3] - boxes[:, 1] + 1)

    # Use image pyramids to detect faces
    det_b = detect_face(image, 0.25)
    side_w, side_h = box_sides(det_b)
    det_b = det_b[np.where(np.maximum(side_w, side_h) > 30)[0], :]

    for scale in [0.75, 1.25, 1.5, 1.75]:
        if not (scale <= max_shrink):
            continue
        det_temp = detect_face(image, scale)
        side_w, side_h = box_sides(det_temp)
        if scale > 1:
            # Enlarged images are only used to detect small faces.
            keep = np.where(np.minimum(side_w, side_h) < 100)[0]
        else:
            # Shrinked images are only used to detect big faces.
            keep = np.where(np.maximum(side_w, side_h) > 30)[0]
        det_b = np.row_stack((det_b, det_temp[keep, :]))
    return det_b
def get_shrink(height, width):
    """
    Compute the resize factors used for multi-scale testing.

    Args:
        height (int): image height.
        width (int): image width.

    Returns:
        tuple: ``(shrink, max_shrink)``. ``max_shrink`` is the largest resize
        factor allowed for this image size after subtracting safety margins;
        ``shrink`` is ``max_shrink`` capped at 1.

    Raises:
        ValueError: if ``height`` or ``width`` is not positive.

    NOTE(review): for very large images the 0.3 margin makes ``max_shrink``
    (and therefore ``shrink``) zero or negative; callers are not protected
    against that here.
    """
    if height <= 0 or width <= 0:
        raise ValueError("Image size must be positive, got height=%s, "
                         "width=%s." % (height, width))

    # avoid out of memory
    max_shrink_v1 = (0x7fffffff / 577.0 / (height * width))**0.5
    max_shrink_v2 = ((678 * 1024 * 2.0 * 2.0) / (height * width))**0.5

    def get_round(x, loc):
        """Truncate x to ``loc`` decimals when it has three or more."""
        str_x = str(x)
        if 'e' in str_x or '.' not in str_x:
            # Exponent notation (very small or very large x): slicing the
            # text would drop the exponent, so truncate numerically instead.
            factor = 10.0**loc
            return int(x * factor) / factor
        str_before, str_after = str_x.split('.')
        if len(str_after) >= 3:
            return float(str_before + '.' + str_after[0:loc])
        return x

    max_shrink = get_round(min(max_shrink_v1, max_shrink_v2), 2) - 0.3
    # The larger the factor, the larger the extra margin that is subtracted.
    if max_shrink >= 1.5 and max_shrink < 2:
        max_shrink = max_shrink - 0.1
    elif max_shrink >= 2 and max_shrink < 3:
        max_shrink = max_shrink - 0.2
    elif max_shrink >= 3 and max_shrink < 4:
        max_shrink = max_shrink - 0.3
    elif max_shrink >= 4 and max_shrink < 5:
        max_shrink = max_shrink - 0.4
    elif max_shrink >= 5:
        max_shrink = max_shrink - 0.5

    shrink = max_shrink if max_shrink < 1 else 1
    return shrink, max_shrink
# Script entry point: parse the arguments, build the PyramidBox inference
# program, load the trained weights and hand over to infer().
if __name__ == '__main__':
    args = parser.parse_args()
    print_arguments(args)
    config = reader.Settings(data_dir=args.data_dir)

    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    # exe, infer_program and fetches stay module-level names on purpose:
    # detect_face() reads them as globals.
    exe = fluid.Executor(place)
    main_program = fluid.Program()
    startup_program = fluid.Program()
    # Shape handed to the network constructor; detect_face() uses its own
    # local image_shape for each image.
    image_shape = [3, 1024, 1024]
    with fluid.program_guard(main_program, startup_program):
        network = PyramidBox(
            data_shape=image_shape,
            sub_network=args.use_pyramidbox,
            is_infer=True)
        infer_program, nmsed_out = network.infer(main_program)
        fetches = [nmsed_out]
        # NOTE: the weights are loaded here, before infer() checks that
        # args.model_dir exists.
        fluid.io.load_persistables(
            exe, args.model_dir, main_program=infer_program)
        # save model and program
        #fluid.io.save_inference_model('pyramidbox_model',
        # ['image'], [nmsed_out], exe, main_program=infer_program,
        # model_filename='model', params_filename='params')
    infer(args, config)
multi-scale 这个要怎么关?只运行其中一个就可以吗?
下面这段代码包含了多种测试方式(原图、翻转、多尺度、图像金字塔)。要关闭 multi-scale,只保留第一个 det0 即可:
det0 = detect_face(image, shrink)
det1 = flip_test(image, shrink)
[det2, det3] = multi_scale_test(image, max_shrink)
det4 = multi_scale_test_pyramid(image, max_shrink)
det = np.row_stack((det0, det1, det2, det3, det4))
dets = bbox_vote(det)
@qingqing01 多谢
@burness Hello, I'm trying to add a video mode to the code, just like yours. Besides widerface_eval.py, which other files did you modify, and how? When I simply run the modified script, I get an error like this:
Traceback (most recent call last):
File "widerface_eval_video.py", line 334, in <module>
thank you so much!!!