detectvideo.py bbox issue on gpu
This issue was mentioned quite some time ago in https://github.com/hunglc007/tensorflow-yolov4-tflite/issues/356 and https://github.com/hunglc007/tensorflow-yolov4-tflite/issues/282, but there still isn't a proper solution. I am wondering if anyone has managed to solve it. Video detection works fine on CPU (although extremely slowly), but on GPU it only detects objects on the first frame of the video. Thank you!
I'm having the exact same issue: video prediction works fine on CPU, but on GPU it only works on the first frame. I tried the solution proposed in #346 and it works, but it is extremely slow. Has anyone found a solution for this yet?
I think I found a way to solve it. I was using a TensorFlow 2.4.2 Docker image; changing to the TensorFlow version specified in the requirements-gpu.txt file (2.3.0rc0-gpu) solved it: it ran on GPU, produced predictions for the entire video, and was as fast as usual. I also tested predicting with a model converted using the 2.4.2 version, and it didn't work even when predicting with 2.3.0rc0. I had to re-convert the model using TF 2.3.0rc0 and also predict with 2.3.0rc0.
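If it helps, a quick way to catch the version mismatch described above is to assert the running TensorFlow version at the top of both the conversion and prediction scripts. This is just a hedged sketch, not part of the repo:

import tensorflow as tf

# The fix above pins TF to 2.3.0rc0 for both model conversion and prediction;
# this assertion simply fails fast if a different version is active.
assert tf.__version__.startswith("2.3.0"), \
    "Expected TensorFlow 2.3.0rc0 (see requirements-gpu.txt), got " + tf.__version__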
@Pedrohgv how do you run and change the Docker image version? I installed tensorflow-gpu==2.3.0rc0 and uninstalled tensorflow==2.3.0rc0, but when I run the script it is not able to find the module 'tensorflow'.
@phykurox what do you mean? I'm using this Docker image, but I assume it should also work in an environment with TensorFlow 2.3.0rc0 installed.
import time
import tensorflow as tf
physical_devices = tf.config.experimental.list_physical_devices('GPU')
if len(physical_devices) > 0:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)
from absl import app, flags, logging
from absl.flags import FLAGS
import core.utils as utils
from core.yolov4 import filter_boxes, decode, YOLO
from tensorflow.python.saved_model import tag_constants
from PIL import Image
import cv2
import numpy as np
# from tensorflow.compat.v1 import ConfigProto
# from tensorflow.compat.v1 import InteractiveSession

flags.DEFINE_string('framework', 'tf', '(tf, tflite, trt')
# flags.DEFINE_string('weights', './checkpoints/yolov4-416/variables/variables',
#                     'path to weights file')
flags.DEFINE_string('weights', './checkpoints/yolov4-416/variables/variables',
                    'path to weights file')
flags.DEFINE_integer('size', 416, 'resize images to')
flags.DEFINE_boolean('tiny', False, 'yolo or yolo-tiny')
flags.DEFINE_string('model', 'yolov4', 'yolov3 or yolov4')
flags.DEFINE_string('video', './data/test2.mp4', 'path to input video')
flags.DEFINE_float('iou', 0.45, 'iou threshold')
flags.DEFINE_float('score', 0.25, 'score threshold')
flags.DEFINE_string('output', './data/result/vidio.mp4', 'path to output video')
flags.DEFINE_string('output_format', 'XVID', 'codec used in VideoWriter when saving video to file')
flags.DEFINE_boolean('dis_cv2_window', False, 'disable cv2 window during the process')  # this is good for the .ipynb

# @tf.function
def infer(batch_data, model):
    STRIDES, ANCHORS, NUM_CLASS, XYSCALE = utils.load_config(FLAGS)
    # batch_data = tf.constant(image_data)
    feature_maps = model(batch_data)
    bbox_tensors = []
    prob_tensors = []
    if FLAGS.tiny:
        for i, fm in enumerate(feature_maps):
            if i == 0:
                output_tensors = decode(fm, FLAGS.size // 16, NUM_CLASS, STRIDES, ANCHORS, i, XYSCALE,
                                        FLAGS.framework)
            else:
                output_tensors = decode(fm, FLAGS.size // 32, NUM_CLASS, STRIDES, ANCHORS, i, XYSCALE,
                                        FLAGS.framework)
            bbox_tensors.append(output_tensors[0])
            prob_tensors.append(output_tensors[1])
    else:
        for i, fm in enumerate(feature_maps):
            if i == 0:
                output_tensors = decode(fm, FLAGS.size // 8, NUM_CLASS, STRIDES, ANCHORS, i, XYSCALE,
                                        FLAGS.framework)
            elif i == 1:
                output_tensors = decode(fm, FLAGS.size // 16, NUM_CLASS, STRIDES, ANCHORS, i, XYSCALE,
                                        FLAGS.framework)
            else:
                output_tensors = decode(fm, FLAGS.size // 32, NUM_CLASS, STRIDES, ANCHORS, i, XYSCALE,
                                        FLAGS.framework)
            bbox_tensors.append(output_tensors[0])
            prob_tensors.append(output_tensors[1])
    pred_bbox = tf.concat(bbox_tensors, axis=1)
    pred_prob = tf.concat(prob_tensors, axis=1)
    if FLAGS.framework == 'tflite':
        pred_bbox = (pred_bbox, pred_prob)
    else:
        boxes, pred_conf = filter_boxes(pred_bbox, pred_prob, score_threshold=FLAGS.score,
                                        input_shape=tf.constant([FLAGS.size, FLAGS.size]))
        pred_bbox = tf.concat([boxes, pred_conf], axis=-1)
    boxes = pred_bbox[:, :, 0:4]
    pred_conf = pred_bbox[:, :, 4:]
    boxes, scores, classes, valid_detections = tf.image.combined_non_max_suppression(
        boxes=tf.reshape(boxes, (tf.shape(boxes)[0], -1, 1, 4)),
        scores=tf.reshape(
            pred_conf, (tf.shape(pred_conf)[0], -1, tf.shape(pred_conf)[-1])),
        max_output_size_per_class=50,
        max_total_size=50,
        iou_threshold=FLAGS.iou,
        score_threshold=FLAGS.score
    )
    return boxes, scores, classes, valid_detections

def main(_argv):
    # config = ConfigProto()
    # config.gpu_options.allow_growth = True
    # session = InteractiveSession(config=config)
    STRIDES, ANCHORS, NUM_CLASS, XYSCALE = utils.load_config(FLAGS)
    input_size = FLAGS.size
    video_path = FLAGS.video
    print("Video from: ", video_path)
    vid = cv2.VideoCapture(video_path)

    if FLAGS.framework == 'tflite':
        interpreter = tf.lite.Interpreter(model_path=FLAGS.weights)
        interpreter.allocate_tensors()
        input_details = interpreter.get_input_details()
        output_details = interpreter.get_output_details()
        print(input_details)
        print(output_details)
    else:
        # saved_model_loaded = tf.saved_model.load(FLAGS.weights, tags=[tag_constants.SERVING])
        # infer = saved_model_loaded.signatures[tf.saved_model.DEFAULT_SERVING_SIGNATURE_DEF_KEY]
        STRIDES, ANCHORS, NUM_CLASS, XYSCALE = utils.load_config(FLAGS)
        inputs = tf.keras.layers.Input([FLAGS.size, FLAGS.size, 3])
        outputs = YOLO(inputs, NUM_CLASS, FLAGS.model, FLAGS.tiny)
        model = tf.keras.Model(inputs, outputs)
        model.load_weights(FLAGS.weights)

    if FLAGS.output:
        # by default VideoCapture returns float instead of int
        width = int(vid.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(vid.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fps = int(vid.get(cv2.CAP_PROP_FPS))
        codec = cv2.VideoWriter_fourcc(*FLAGS.output_format)
        out = cv2.VideoWriter(FLAGS.output, codec, fps, (width, height))

    frame_id = 0
    while True:
        return_value, frame = vid.read()
        if return_value:
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            image = Image.fromarray(frame)
        else:
            if frame_id == vid.get(cv2.CAP_PROP_FRAME_COUNT):
                print("Video processing complete")
                break
            raise ValueError("No image! Try with another video format")
        frame_size = frame.shape[:2]
        image_data = cv2.resize(frame, (input_size, input_size))
        image_data = image_data / 255.
        image_data = image_data[np.newaxis, ...].astype(np.float32)
        prev_time = time.time()

        if FLAGS.framework == 'tflite':
            interpreter.set_tensor(input_details[0]['index'], image_data)
            interpreter.invoke()
            pred = [interpreter.get_tensor(output_details[i]['index']) for i in range(len(output_details))]
            if FLAGS.model == 'yolov3' and FLAGS.tiny == True:
                boxes, pred_conf = filter_boxes(pred[1], pred[0], score_threshold=0.25,
                                                input_shape=tf.constant([input_size, input_size]))
            else:
                boxes, pred_conf = filter_boxes(pred[0], pred[1], score_threshold=0.25,
                                                input_shape=tf.constant([input_size, input_size]))
        else:
            batch_data = tf.constant(image_data)
            boxes, scores, classes, valid_detections = infer(batch_data, model)

        pred_bbox = [boxes.numpy(), scores.numpy(), classes.numpy(), valid_detections.numpy()]
        image = utils.draw_bbox(frame, pred_bbox)
        curr_time = time.time()
        exec_time = curr_time - prev_time
        result = np.asarray(image)
        info = "time: %.2f ms" % (1000 * exec_time)
        print(info)
        result = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)

        if not FLAGS.dis_cv2_window:
            cv2.namedWindow("result", cv2.WINDOW_AUTOSIZE)
            cv2.imshow("result", result)
            if cv2.waitKey(1) & 0xFF == ord('q'): break

        if FLAGS.output:
            out.write(result)
        frame_id += 1

if __name__ == '__main__':
    try:
        app.run(main)
    except SystemExit:
        pass
Above is the detectvideo.py code. You cannot use AutoGraph (static graph) mode with it, only dynamic/eager mode! This appears to be a TensorFlow 2.4.x version problem.
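Concretely, "eager mode" here means leaving the @tf.function decorator on infer() commented out, as in the listing above. As a hedged aside (not something tested in this thread), TensorFlow also exposes a global switch that forces tf.function-decorated code to run eagerly, which can be useful when debugging graph-mode-only failures:

import tensorflow as tf

# Since TF 2.3, this forces code wrapped in tf.function to execute eagerly,
# op by op, instead of being traced into and run as a graph.
tf.config.run_functions_eagerly(True)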
Also ran into this issue lately... After much frustration, I finally resorted to doing the same (pinning TensorFlow to the 2.3.0rc0 GPU build), but via building a Docker image so I can port it anywhere (thanks NVIDIA for providing CUDA images).
Here is the Dockerfile I tested to be working:
FROM nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04
COPY . /app
WORKDIR /app
RUN apt-get update && apt-get install -y libgl1 libglib2.0-0 libsm6 libxrender1 libfontconfig1 libice6 python3 python3-pip
RUN python3 -m pip install -U pip
RUN pip3 install -r requirements-gpu.txt
ENTRYPOINT ["python3"]
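For anyone trying this: since the entrypoint is python3, the image would presumably be built with something like docker build -t yolov4-gpu . and then run with docker run --gpus all yolov4-gpu detectvideo.py plus the usual flags, with the NVIDIA Container Toolkit installed on the host (the image tag here is just an example, not from this thread).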
I will try to create a PR for this, but seeing that the last code change in this repo was well over a year ago, I don't think it is very likely to get merged...
Hi, the eager-mode detectvideo.py above works for me too, but why is it so slow?
It looks like eager mode can be slower than AutoGraph (graph) mode.
What is TensorFlow graph mode? TensorFlow uses graphs as the format for saved models when it exports them from Python. Graphs are also easily optimized, allowing the compiler to do transformations such as statically inferring the value of tensors by folding constant nodes in your computation ("constant folding").
What is TensorFlow eager mode? Eager execution is a powerful execution environment that evaluates operations immediately. It does not build graphs, and the operations return actual values instead of computational graphs to run later. With Eager execution, TensorFlow calculates the values of tensors as they occur in your code.
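To make the difference concrete, here is a minimal generic sketch (not tied to this repo) of the same computation in eager mode and wrapped in tf.function (AutoGraph):

import tensorflow as tf

def eager_double(x):
    # Eager mode: each op runs immediately and returns a concrete tensor value.
    return x * 2

@tf.function
def graph_double(x):
    # Graph mode: the function is traced into a graph on the first call,
    # then the compiled graph is reused on later calls.
    return x * 2

x = tf.constant([1.0, 2.0])
print(eager_double(x))  # evaluated right away
print(graph_double(x))  # traced once, then executed as a graph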
The eager-mode code above worked for me, but it is slow (12 FPS) on an RTX 3060.
That code runs for me as well, but it is so slow...
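For context, the original detectvideo.py loads the exported SavedModel and calls its serving signature (the lines left commented out in the script above), which is the fast path that reportedly works on TF 2.3.0rc0 but only detects on the first frame under TF 2.4.x on GPU. A rough sketch of that loading pattern, assuming a model exported to ./checkpoints/yolov4-416, looks like this:

import numpy as np
import tensorflow as tf
from tensorflow.python.saved_model import tag_constants

# Load the exported SavedModel and grab its default serving signature.
saved_model_loaded = tf.saved_model.load('./checkpoints/yolov4-416', tags=[tag_constants.SERVING])
infer = saved_model_loaded.signatures[tf.saved_model.DEFAULT_SERVING_SIGNATURE_DEF_KEY]

# Run one dummy 416x416 frame through the signature; the result is a dict
# of output tensors (concatenated boxes and class confidences).
batch_data = tf.constant(np.zeros((1, 416, 416, 3), dtype=np.float32))
pred = infer(batch_data)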