Does the Transformers inference code for InternVL3.0 not work with InternVL3.5?
https://huggingface.co/docs/transformers/v4.55.4/en/model_doc/internvl#video-input — if I swap the model in this example for an InternVL3.5-series checkpoint, the sample video input still fails with: shape '[8, 27, 13, 2048]' is invalid for input of size 5971968. How can this be fixed, and is this a bug?
Please make sure you are using a checkpoint with the -HF suffix, e.g. OpenGVLab/InternVL3_5-8B-HF.
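For reference, a minimal image-input sketch mirroring the linked Transformers docs for the -HF checkpoints (the checkpoint name and image URL below are only illustrative):

import torch
from transformers import AutoProcessor, AutoModelForImageTextToText

ckpt = "OpenGVLab/InternVL3_5-8B-HF"  # note the -HF suffix
processor = AutoProcessor.from_pretrained(ckpt)
model = AutoModelForImageTextToText.from_pretrained(
    ckpt, torch_dtype=torch.bfloat16, device_map="auto"
)

messages = [{
    "role": "user",
    "content": [
        # illustrative public image URL
        {"type": "image", "url": "http://images.cocodataset.org/val2017/000000039769.jpg"},
        {"type": "text", "text": "Describe this image."},
    ],
}]
inputs = processor.apply_chat_template(
    messages, add_generation_prompt=True, tokenize=True,
    return_dict=True, return_tensors="pt",
).to(model.device, dtype=torch.bfloat16)

output = model.generate(**inputs, max_new_tokens=128)
print(processor.decode(output[0, inputs["input_ids"].shape[1]:], skip_special_tokens=True))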
Hi @Lee-xeo , have you resolved the issue? Even using the -HF suffix doesn't help.
@Weiyun1025 Can you provide the solution?
@nimeshagrawal Hello, I got inference working with the official (rather involved) preprocessing code. It doesn't look elegant, but it works. (Note: you need to use the checkpoint without the -HF suffix.)
import argparse
import os
import json
import math
import numpy as np
import torch
import torchvision.transforms as T
from decord import VideoReader, cpu
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    best_ratio_diff = float('inf')
    best_ratio = (1, 1)
    area = width * height
    for ratio in target_ratios:
        target_aspect_ratio = ratio[0] / ratio[1]
        ratio_diff = abs(aspect_ratio - target_aspect_ratio)
        if ratio_diff < best_ratio_diff:
            best_ratio_diff = ratio_diff
            best_ratio = ratio
        elif ratio_diff == best_ratio_diff:
            if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
                best_ratio = ratio
    return best_ratio

def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
    orig_width, orig_height = image.size
    aspect_ratio = orig_width / orig_height
    # calculate the existing image aspect ratio
    target_ratios = set(
        (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
        i * j <= max_num and i * j >= min_num)
    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
    # find the closest aspect ratio to the target
    target_aspect_ratio = find_closest_aspect_ratio(
        aspect_ratio, target_ratios, orig_width, orig_height, image_size)
    # calculate the target width and height
    target_width = image_size * target_aspect_ratio[0]
    target_height = image_size * target_aspect_ratio[1]
    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
    # resize the image
    resized_img = image.resize((target_width, target_height))
    processed_images = []
    for i in range(blocks):
        box = (
            (i % (target_width // image_size)) * image_size,
            (i // (target_width // image_size)) * image_size,
            ((i % (target_width // image_size)) + 1) * image_size,
            ((i // (target_width // image_size)) + 1) * image_size
        )
        # split the image
        split_img = resized_img.crop(box)
        processed_images.append(split_img)
    assert len(processed_images) == blocks
    if use_thumbnail and len(processed_images) != 1:
        thumbnail_img = image.resize((image_size, image_size))
        processed_images.append(thumbnail_img)
    return processed_images

def get_index(bound, fps, max_frame, first_idx=0, num_segments=32):
    if bound:
        start, end = bound[0], bound[1]
    else:
        start, end = -100000, 100000
    start_idx = max(first_idx, round(start * fps))
    end_idx = min(round(end * fps), max_frame)
    seg_size = float(end_idx - start_idx) / num_segments
    frame_indices = np.array([
        int(start_idx + (seg_size / 2) + np.round(seg_size * idx))
        for idx in range(num_segments)
    ])
    return frame_indices

def load_video(video_path, bound=None, input_size=448, max_num=1, num_segments=32):
    vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
    max_frame = len(vr) - 1
    fps = float(vr.get_avg_fps())
    pixel_values_list, num_patches_list = [], []
    transform = build_transform(input_size=input_size)
    frame_indices = get_index(bound, fps, max_frame, first_idx=0, num_segments=num_segments)
    for frame_index in frame_indices:
        img = Image.fromarray(vr[frame_index].asnumpy()).convert('RGB')
        img = dynamic_preprocess(img, image_size=input_size, use_thumbnail=True, max_num=max_num)
        pixel_values = [transform(tile) for tile in img]
        pixel_values = torch.stack(pixel_values)
        num_patches_list.append(pixel_values.shape[0])
        pixel_values_list.append(pixel_values)
    pixel_values = torch.cat(pixel_values_list)
    return pixel_values, num_patches_list
class TestModel:
    def __init__(self,
                 model,
                 input_dir: str = './annotation/normal',
                 output_dir: str = './response',
                 thinking: bool = False
                 ):
        self.input_dir = input_dir
        self.video_meta_info_path = './annotation/video_meta_info.json'
        self.prompt_input_path = os.path.join(input_dir, 'prompt.json')
        self.model_name = model
        self.output_dir = output_dir
        self.thinking = thinking
        if self.thinking:
            self.response_output_path = os.path.join(output_dir, f'{self.model_name}_thinking_response.json')
        else:
            self.response_output_path = os.path.join(output_dir, f'{self.model_name}_nothinking_response.json')
        # Create the output directory
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        self.meta_prompt_file = "meta_prompt/test_vlm_meta_prompt.txt"
        # Load the InternVL model
        model_path = f"./models/InternVL/{model}"
        self.model = AutoModel.from_pretrained(
            model_path,
            torch_dtype=torch.bfloat16,
            load_in_8bit=False,
            low_cpu_mem_usage=True,
            trust_remote_code=True,
            device_map="auto"
        ).eval()
        if self.thinking:
            R1_SYSTEM_PROMPT = """
You are an AI assistant that rigorously follows this response protocol:
1. First, conduct a detailed analysis of the question. Consider different angles, potential solutions, and reason through the problem step-by-step. Enclose this entire thinking process within <think> and </think> tags.
2. After the thinking section, provide a clear, concise, and direct answer to the user's question. Separate the answer from the think section with a newline.
Ensure that the thinking process is thorough but remains focused on the query. The final answer should be standalone and not reference the thinking section.
""".strip()
            self.model.system_message = R1_SYSTEM_PROMPT
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_path,
            trust_remote_code=True,
            use_fast=False
        )
        print(f"Successfully loaded model: {model}")
        print(f"Detected {torch.cuda.device_count()} GPU(s)")
        # Generation config
        if self.thinking:
            self.temperature = 0.6
            self.max_new_tokens = 8192
        else:
            self.temperature = 0.1
            self.max_new_tokens = 2048
        self.generation_config = dict(
            max_new_tokens=self.max_new_tokens,
            do_sample=True,
            temperature=self.temperature,
            top_p=0.001,
            repetition_penalty=1.05
        )
        # Video processing config
        self.input_size = 448  # resize each tile to 448x448
        self.fps = 1.0         # sample frames at 1 fps
        self.max_num = 1       # maximum number of tiles per frame

    def process_video_with_internvl(self, video_path: str, meta_prompt: str, prompt: str):
        """
        Run video inference with InternVL.
        """
        if not os.path.exists(video_path):
            raise FileNotFoundError(f"Video file not found: {video_path}")
        # Open the video and read its frame info
        vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
        video_fps = float(vr.get_avg_fps())
        total_frames = len(vr) - 1
        # Compute the number of frames to sample from the target sampling rate (self.fps)
        duration = total_frames / video_fps
        num_segments = max(1, int(duration * self.fps))
        # Load the video frames
        pixel_values, num_patches_list = load_video(
            video_path,
            bound=None,
            input_size=self.input_size,
            max_num=self.max_num,
            num_segments=num_segments
        )
        pixel_values = pixel_values.to(torch.bfloat16).cuda()
        # Build the per-frame video prefix
        video_prefix = ''.join([f'Frame{i+1}: <image>\n' for i in range(len(num_patches_list))])
        # Build the full prompt
        full_prompt = f"{meta_prompt}\n{prompt}\n{video_prefix}"
        # Run model inference
        try:
            response, history = self.model.chat(
                self.tokenizer,
                pixel_values,
                full_prompt,
                self.generation_config,
                num_patches_list=num_patches_list,
                history=None,
                return_history=True
            )
            if self.thinking:
                # Extract the final answer after the thinking block
                if '</think>' in response:
                    response = response.split('</think>', 1)[-1].strip()
                else:
                    response = response.strip()
            return response
        except Exception as e:
            print(f"Error during model inference: {str(e)}")
            raise e
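A minimal driver for the class above might look like this (the model directory under ./models/InternVL/, the video path, and the prompts are placeholders):

if __name__ == '__main__':
    tester = TestModel(model='InternVL3_5-8B', thinking=False)    # placeholder directory name
    answer = tester.process_video_with_internvl(
        video_path='./videos/example.mp4',                        # placeholder video path
        meta_prompt='You are a careful video understanding assistant.',
        prompt='Describe what happens in this video.'
    )
    print(answer)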
@Lee-xeo Thanks for sharing. But is there a way to make it work with the -HF version too? Because, as per the documentation at https://huggingface.co/docs/transformers/v4.55.4/en/model_doc/internvl#video-input, it does not work.
I ran into the same problem. When running video inference with the -HF model weights, I get: RuntimeError: shape '[120, 27, 13, 2048]' is invalid for input of size 89579520
Change the "size" entry in 'video_preprocessor_config.json' to 448 and it will work.
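If you'd rather not hand-edit the file, an untested sketch of the same idea is to override the loaded video processor's size and save a patched copy (this assumes the processor exposes a video_processor attribute, which I haven't verified):

from transformers import AutoProcessor

ckpt = "OpenGVLab/InternVL3_5-8B-HF"
processor = AutoProcessor.from_pretrained(ckpt)
# Force each sampled frame to be resized to 448x448, matching the manual config edit above
processor.video_processor.size = {"height": 448, "width": 448}
# save_pretrained rewrites video_preprocessor_config.json with the new size
processor.save_pretrained("./InternVL3_5-8B-HF-patched-processor")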
@nguyen-khang-ntq There is another way to fix this bug. While processing the messages, we can use the following:
inputs = processor.apply_chat_template(
    messages,
    return_tensors="pt",
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    fps=fps,
    size={"height": height, "width": width}
).to(model.device, dtype=torch.bfloat16)
where height and width need to be 448. With this method, you can even run inference at an FPS of your choice.
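Putting it together, an end-to-end sketch with an -HF checkpoint and this workaround (the checkpoint name and video URL are placeholders; fps and size follow the values above):

import torch
from transformers import AutoProcessor, AutoModelForImageTextToText

ckpt = "OpenGVLab/InternVL3_5-8B-HF"
processor = AutoProcessor.from_pretrained(ckpt)
model = AutoModelForImageTextToText.from_pretrained(
    ckpt, torch_dtype=torch.bfloat16, device_map="auto"
)

messages = [{
    "role": "user",
    "content": [
        {"type": "video", "url": "https://example.com/sample.mp4"},  # placeholder URL
        {"type": "text", "text": "What happens in this video?"},
    ],
}]
inputs = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
    fps=1,                                 # sample the video at 1 frame per second
    size={"height": 448, "width": 448},    # works around the shape mismatch
).to(model.device, dtype=torch.bfloat16)

output = model.generate(**inputs, max_new_tokens=128)
print(processor.decode(output[0, inputs["input_ids"].shape[1]:], skip_special_tokens=True))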
@nimeshagrawal I did this and it worked.
Related issue: https://github.com/OpenGVLab/InternVL/issues/1222