fairseq
fairseq copied to clipboard
VideoCLIP help on demo script
❓ Questions and Help
What is your question?
I wanted to make sure this is the correct way to use the VideoCLIP model. I have a basketball video and I want to find the one-second clips that contain the ball (for example). I also tried queries such as "slam dunk" and "ball", but the scores don't seem good enough to extract such clips. Is this code correct? Is the preprocessing correct?
Code
import torch
import cv2
from mmpt.models import MMPTModel
from scripts.video_feature_extractor.preprocessing import Preprocessing
from scripts.video_feature_extractor.videoreader import VideoLoader
import numpy as np
# Load the pretrained VideoCLIP model together with its text tokenizer and
# the aligner that formats token ids into the model's expected text input.
model, tokenizer, aligner = MMPTModel.from_pretrained(
    "projects/retri/videoclip/how2.yaml")
model.eval()

# Decode the video at 30 fps into 224x224 center-cropped frames.
video_loader = VideoLoader(video_dict={}, framerate=30, size=224,
                           centercrop=True, shards=0)
out_video = video_loader._decode("", "/mnt/data/exp/basket.mp4")['video']

# Trim to a whole number of seconds so frames divide evenly into
# 30-frame (one-second) clips.
out_video = out_video[:out_video.shape[0] // 30 * 30]
# (T, C, H, W) -> (T, H, W, C): channels-last layout for the video encoder.
out_video = out_video.permute(0, 2, 3, 1)

# Reshape to (bsz=1, num_seconds, fps, H, W, C) and scale pixels to [0, 1].
# NOTE(review): the imported Preprocessing class is never used; confirm
# whether VideoCLIP expects additional normalization beyond /255.
video_frames = out_video.view(1, -1, 30, 224, 224, 3) / 255.0

# Tokenize the text query without special tokens; _build_text_seq adds the
# sequence formatting the model expects, returning ids and an attention mask.
caps, cmasks = aligner._build_text_seq(
    tokenizer("ball", add_special_tokens=False)["input_ids"]
)
caps, cmasks = caps[None, :], cmasks[None, :]  # add batch dim, bsz=1

# Score every one-second clip against the query. BUG FIX: the original loop
# overwrote `output` each iteration and never kept the scores, so the
# per-second results were silently discarded. Collect them so the highest-
# scoring clips can actually be ranked/extracted. Also hoist torch.no_grad()
# outside the loop instead of re-entering the context per second.
scores = []
with torch.no_grad():
    for sec in range(video_frames.size(1)):
        output = model(video_frames[:, sec:sec + 1], caps, cmasks,
                       return_score=True)["score"]
        scores.append(output.item())

# Report clips ranked by similarity to the query, best first.
ranking = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)
for sec in ranking:
    print(f"second {sec}: score {scores[sec]:.4f}")